Просмотр исходного кода

sync dom, prompts and schema with browser-use v0.1.41

alexchenzl 3 месяца назад
Родитель
Коммит
f4230721c4

+ 1 - 9
chrome-extension/src/background/agent/actions/builder.ts

@@ -185,14 +185,6 @@ export class ActionBuilder {
     }, goBackActionSchema);
     actions.push(goBack);
 
-    // # wait for x seconds
-    // @self.registry.action('Wait for x seconds default 3')
-    // async def wait(seconds: int = 3):
-    // 	msg = f'🕒  Waiting for {seconds} seconds'
-    // 	logger.info(msg)
-    // 	await asyncio.sleep(seconds)
-    // 	return ActionResult(extracted_content=msg, include_in_memory=True)
-
     const wait = new Action(async (input: z.infer<typeof waitActionSchema.schema>) => {
       const seconds = input.seconds || 3;
       const intent = input.intent || `Waiting for ${seconds} seconds`;
@@ -219,7 +211,7 @@ export class ActionBuilder {
         }
 
         // Check if element is a file uploader
-        if (await page.isFileUploader(elementNode)) {
+        if (page.isFileUploader(elementNode)) {
           const msg = `Index ${input.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files`;
           logger.info(msg);
           return new ActionResult({

+ 18 - 21
chrome-extension/src/background/agent/actions/json_schema.ts

@@ -88,27 +88,24 @@ export const jsonNavigatorOutputSchema = {
             type: 'object',
             nullable: true,
           },
-          // wait: {
-          //   properties: {
-          //     intent: {
-          //       title: 'Intent',
-          //       type: 'string',
-          //       description: 'purpose of this action'
-          //     },
-          //     seconds: {
-          //       title: 'Seconds',
-          //       type: 'integer',
-          //       default: 3
-          //     }
-          //   },
-          //   required: [
-          //     'intent',
-          //     'seconds'
-          //   ],
-          //   title: 'WaitAction',
-          //   type: 'object',
-          //   nullable: true
-          // },
+          wait: {
+            properties: {
+              intent: {
+                title: 'Intent',
+                type: 'string',
+                description: 'purpose of this action',
+              },
+              seconds: {
+                title: 'Seconds',
+                type: 'integer',
+                default: 3,
+              },
+            },
+            required: ['intent', 'seconds'],
+            title: 'WaitAction',
+            type: 'object',
+            nullable: true,
+          },
           click_element: {
             properties: {
               intent: {

+ 2 - 0
chrome-extension/src/background/agent/agents/navigator.ts

@@ -139,6 +139,8 @@ export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
 
       // call the model to get the actions to take
       const inputMessages = messageManager.getMessages();
+      // logger.info('Navigator input messages', inputMessages);
+
       const modelOutput = await this.invoke(inputMessages);
 
       // check if the task is paused or stopped

+ 15 - 13
chrome-extension/src/background/agent/prompts/base.ts

@@ -75,20 +75,22 @@ abstract class BasePrompt {
       }
     }
 
-    const stateDescription = `
-    [Task history memory ends]
-    [Current state starts here]
-    The following is one-time information - if you need to remember it write it to memory:
-    Current tab: {id: ${browserState.tabId}, url: ${browserState.url}, title: ${browserState.title}}
-    Other available tabs:
-    ${browserState.tabs
+    const currentTab = `{id: ${browserState.tabId}, url: ${browserState.url}, title: ${browserState.title}}`;
+    const otherTabs = browserState.tabs
       .filter(tab => tab.id !== browserState.tabId)
-      .map(tab => ` - {id: ${tab.id}, url: ${tab.url}, title: ${tab.title}}`)
-      .join('\n')}
-    Interactive elements from top layer of the current page inside the viewport:
-    ${formattedElementsText}
-    ${stepInfoDescription}
-    ${actionResultsDescription}`;
+      .map(tab => `- {id: ${tab.id}, url: ${tab.url}, title: ${tab.title}}`);
+    const stateDescription = `
+[Task history memory ends]
+[Current state starts here]
+The following is one-time information - if you need to remember it write it to memory:
+Current tab: ${currentTab}
+Other available tabs:
+  ${otherTabs.join('\n')}
+Interactive elements from top layer of the current page inside the viewport:
+${formattedElementsText}
+${stepInfoDescription}
+${actionResultsDescription}
+`;
 
     if (browserState.screenshot && context.options.useVision) {
       return new HumanMessage({

+ 32 - 9
chrome-extension/src/background/agent/prompts/templates/navigator.ts

@@ -5,7 +5,7 @@ You are an AI agent designed to automate browser tasks. Your goal is to accompli
 
 Task
 Previous steps
-Current URL
+Current Tab
 Open Tabs
 Interactive Elements
 [index]<type>text</type>
@@ -32,13 +32,13 @@ Interactive Elements
 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
 Common action sequences:
 
-- Form filling: [{"input_text": {"index": 1, "text": "username"}}, {"input_text": {"index": 2, "text": "password"}}, {"click_element": {"index": 3}}]
-- Navigation and extraction: [{"go_to_url": {"url": "https://example.com"}}, {"extract_content": {"goal": "extract the names"}}]
+- Form filling: [{"input_text": {"intent": "Fill title", "index": 1, "text": "username"}}, {"input_text": {"intent": "Fill title", "index": 2, "text": "password"}}, {"click_element": {"intent": "Click submit button", "index": 3}}]
+- Navigation: [{"go_to_url": {"intent": "Go to url", "url": "https://example.com"}}]
 - Actions are executed in the given order
-- If the page changes after an action, the sequence is interrupted and you get the new state.
-- Only provide the action sequence until an action which changes the page state significantly.
+- If the page changes after an action, the sequence will be interrupted
+- Only provide the action sequence until an action which changes the page state significantly
 - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
-- only use multiple actions if it makes sense.
+- only use multiple actions if it makes sense
 
 3. ELEMENT INTERACTION:
 
@@ -51,7 +51,7 @@ Common action sequences:
 - Handle popups/cookies by accepting or closing them
 - Use scroll to find elements you are looking for
 - If you want to research something, open a new tab instead of using the current tab
-- If captcha pops up, try to solve it - else try a different approach
+- If captcha pops up, try to solve it if a screenshot image is provided - else try a different approach
 - If the page is not fully loaded, use wait action
 
 5. TASK COMPLETION:
@@ -62,6 +62,7 @@ Common action sequences:
 - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
 - Don't hallucinate actions
 - Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
+- Include exact relevant urls if available, but do NOT make up any urls
 
 6. VISUAL CONTEXT:
 
@@ -79,6 +80,28 @@ Common action sequences:
 
 9. Extraction:
 
-- If your task is to find information - call extract_content on the specific pages to get and store the information.
-  Your responses must be always JSON with the specified format.
+- When searching for information or conducting research:
+  1. First analyze and extract relevant content from the current visible state
+  2. If the needed information is incomplete:
+     - Use cache_content action to cache the current findings
+     - Scroll down EXACTLY ONE PAGE at a time using scroll_page action
+     - NEVER scroll more than one page at once as this will cause loss of information
+     - Repeat the analyze-cache-scroll cycle until either:
+       * All required information is found, or
+       * Maximum 5 page scrolls have been performed
+  3. Before completing the task:
+     - Combine all cached content with the current state
+     - Verify all required information is collected
+     - Present the complete findings in the done action
+- Important extraction guidelines:
+  - Be thorough and specific when extracting information
+  - Always cache findings before scrolling to avoid losing information
+  - Always verify source information before caching
+  - Scroll down EXACTLY ONE PAGE at a time
+  - Stop after maximum 5 page scrolls
+
+10. Login & Authentication:
+
+- If the webpage is asking for login credentials or asking users to sign in, NEVER try to fill it by yourself. Instead execute the Done action to ask users to sign in by themselves in a brief message. 
+- Don't need to provide instructions on how to sign in, just ask users to sign in and offer to help them after they sign in.
 `;

+ 1 - 0
chrome-extension/src/background/agent/types.ts

@@ -41,6 +41,7 @@ export const DEFAULT_AGENT_OPTIONS: AgentOptions = {
     'value',
     'alt',
     'aria-expanded',
+    'data-date-format',
   ],
   planningInterval: 3,
 };

+ 2 - 2
chrome-extension/src/background/browser/page.ts

@@ -800,7 +800,7 @@ export default class Page {
 
     try {
       // Highlight before typing
-      // if (elementNode.highlightIndex !== undefined && elementNode.highlightIndex !== null) {
+      // if (elementNode.highlightIndex != null) {
       //   await this._updateState(useVision, elementNode.highlightIndex);
       // }
 
@@ -983,7 +983,7 @@ export default class Page {
 
     try {
       // Highlight before clicking
-      // if (elementNode.highlightIndex !== undefined && elementNode.highlightIndex !== null) {
+      // if (elementNode.highlightIndex !== null) {
       //   await this._updateState(useVision, elementNode.highlightIndex);
       // }
 

+ 3 - 1
chrome-extension/src/background/dom/service.ts

@@ -82,6 +82,7 @@ export async function getClickableElements(
   highlightElements = true,
   focusElement = -1,
   viewportExpansion = 0,
+  debugMode = false,
 ): Promise<DOMState> {
   const [elementTree, selectorMap] = await _buildDomTree(
     tabId,
@@ -89,6 +90,7 @@ export async function getClickableElements(
     highlightElements,
     focusElement,
     viewportExpansion,
+    debugMode,
   );
   return { elementTree, selectorMap };
 }
@@ -243,7 +245,7 @@ export function _parse_node(nodeData: RawDomTreeNode): [DOMBaseNode | null, stri
     isInteractive: elementData.isInteractive ?? false,
     isTopElement: elementData.isTopElement ?? false,
     isInViewport: elementData.isInViewport ?? false,
-    highlightIndex: elementData.highlightIndex,
+    highlightIndex: elementData.highlightIndex ?? null,
     shadowRoot: elementData.shadowRoot ?? false,
     parent: null,
     viewportInfo: viewportInfo,

+ 2 - 2
chrome-extension/src/background/dom/views.ts

@@ -24,7 +24,7 @@ export class DOMTextNode extends DOMBaseNode {
   hasParentWithHighlightIndex(): boolean {
     let current = this.parent;
     while (current != null) {
-      if (current.highlightIndex !== undefined) {
+      if (current.highlightIndex !== null) {
         return true;
       }
       current = current.parent;
@@ -172,7 +172,7 @@ export class DOMElementNode extends DOMBaseNode {
       }
 
       // Skip this branch if we hit a highlighted element (except for the current node)
-      if (node instanceof DOMElementNode && node !== this && node.highlightIndex != null) {
+      if (node instanceof DOMElementNode && node !== this && node.highlightIndex !== null) {
         return;
       }
 

+ 11 - 11
packages/schema-utils/lib/json_schema.ts

@@ -56,17 +56,17 @@ export const jsonNavigatorOutputSchema = {
           ],
           description: 'Go back to previous page',
         },
-        // wait: {
-        //   anyOf: [
-        //     {
-        //       $ref: '#/$defs/WaitAction',
-        //     },
-        //     {
-        //       type: 'null',
-        //     },
-        //   ],
-        //   description: 'Wait for x seconds default 3',
-        // },
+        wait: {
+          anyOf: [
+            {
+              $ref: '#/$defs/WaitAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Wait for x seconds default 3',
+        },
         click_element: {
           anyOf: [
             {