Преглед на файлове

1. Add a package to flatten OpenAI-compatible JSON schemas and convert them to Gemini-compatible schemas
2. Gemini does not allow sending messages with empty content, so placeholder content is sent instead
3. Ignore any input generated by the LLM if the action's schema is an empty object
4. Occasionally the validator falls into a failure loop; fix it by re-planning after a validation failure
5. Update model settings to add Gemini models and Claude 3.7

alexchenzl преди 6 месеца
родител
ревизия
e4072339f5

+ 21 - 8
chrome-extension/src/background/agent/actions/builder.ts

@@ -36,19 +36,30 @@ export class InvalidInputError extends Error {
  */
 export class Action {
   constructor(
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
     private readonly handler: (input: any) => Promise<ActionResult>,
     public readonly schema: ActionSchema,
   ) {}
 
   async call(input: unknown): Promise<ActionResult> {
     // Validate input before calling the handler
-    const result = this.schema.schema.safeParse(input);
-    if (!result.success) {
-      const errorMessage = result.error.message;
-      logger.error('Invalid input', errorMessage);
+    const schema = this.schema.schema;
+
+    // check if the schema is schema: z.object({}), if so, ignore the input
+    const isEmptySchema =
+      schema instanceof z.ZodObject &&
+      Object.keys((schema as z.ZodObject<Record<string, z.ZodTypeAny>>).shape || {}).length === 0;
+
+    if (isEmptySchema) {
+      return await this.handler({});
+    }
+
+    const parsedArgs = this.schema.schema.safeParse(input);
+    if (!parsedArgs.success) {
+      const errorMessage = parsedArgs.error.message;
       throw new InvalidInputError(errorMessage);
     }
-    return await this.handler(result.data);
+    return await this.handler(parsedArgs.data);
   }
 
   name() {
@@ -60,6 +71,7 @@ export class Action {
    * @returns {string} The prompt for the action
    */
   prompt() {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
     const schemaShape = (this.schema.schema as z.ZodObject<any>).shape || {};
     const schemaProperties = Object.entries(schemaShape).map(([key, value]) => {
       const zodValue = value as z.ZodTypeAny;
@@ -77,13 +89,14 @@ export class Action {
 export function buildDynamicActionSchema(actions: Action[]): z.ZodType {
   let schema = z.object({});
   for (const action of actions) {
-    // create a schema for the action, it could be action.schema.schema or null, and default to null
-    const actionSchema = action.schema.schema.nullable().default(null);
+    // create a schema for the action, it could be action.schema.schema or null
+    // but don't use default: null as it causes issues with Google Generative AI
+    const actionSchema = action.schema.schema.nullable();
     schema = schema.extend({
       [action.name()]: actionSchema,
     });
   }
-  return schema.partial();
+  return schema.partial().nullable();
 }
 
 export class ActionBuilder {

+ 221 - 0
chrome-extension/src/background/agent/actions/json_gemini.ts

@@ -0,0 +1,221 @@
+// TODO: don't know why zod can not generate the same schema, need to fix it
+export const geminiNavigatorOutputSchema = {
+  type: 'object',
+  properties: {
+    current_state: {
+      type: 'object',
+      description: 'Current state of the agent',
+      properties: {
+        page_summary: {
+          type: 'string',
+        },
+        evaluation_previous_goal: {
+          type: 'string',
+        },
+        memory: {
+          type: 'string',
+        },
+        next_goal: {
+          type: 'string',
+        },
+      },
+      required: ['page_summary', 'evaluation_previous_goal', 'memory', 'next_goal'],
+    },
+    action: {
+      type: 'array',
+      items: {
+        type: 'object',
+        properties: {
+          done: {
+            type: 'object',
+            properties: {
+              text: {
+                type: 'string',
+              },
+            },
+            required: ['text'],
+            nullable: true,
+          },
+          search_google: {
+            type: 'object',
+            properties: {
+              query: {
+                type: 'string',
+              },
+            },
+            required: ['query'],
+            nullable: true,
+          },
+          go_to_url: {
+            type: 'object',
+            properties: {
+              url: {
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            nullable: true,
+          },
+          go_back: {
+            type: 'string',
+            nullable: true,
+            description:
+              'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
+          },
+          click_element: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                type: 'integer',
+              },
+              xpath: {
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index'],
+            nullable: true,
+          },
+          input_text: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                type: 'integer',
+              },
+              text: {
+                type: 'string',
+              },
+              xpath: {
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index', 'text'],
+            nullable: true,
+          },
+          switch_tab: {
+            type: 'object',
+            properties: {
+              tab_id: {
+                type: 'integer',
+              },
+            },
+            required: ['tab_id'],
+            nullable: true,
+          },
+          open_tab: {
+            type: 'object',
+            properties: {
+              url: {
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            nullable: true,
+          },
+          cache_content: {
+            type: 'object',
+            properties: {
+              content: {
+                type: 'string',
+              },
+            },
+            required: ['content'],
+            nullable: true,
+          },
+          scroll_down: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            nullable: true,
+          },
+          scroll_up: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            nullable: true,
+          },
+          send_keys: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              keys: {
+                type: 'string',
+              },
+            },
+            required: ['desc', 'keys'],
+            nullable: true,
+          },
+          scroll_to_text: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              text: {
+                type: 'string',
+              },
+            },
+            required: ['desc', 'text'],
+            nullable: true,
+          },
+          get_dropdown_options: {
+            type: 'object',
+            properties: {
+              index: {
+                type: 'integer',
+              },
+            },
+            required: ['index'],
+            nullable: true,
+          },
+          select_dropdown_option: {
+            type: 'object',
+            properties: {
+              index: {
+                type: 'integer',
+              },
+              text: {
+                type: 'string',
+              },
+            },
+            required: ['index', 'text'],
+            nullable: true,
+          },
+        },
+        required: [],
+      },
+    },
+  },
+  required: ['current_state', 'action'],
+};

+ 233 - 412
chrome-extension/src/background/agent/actions/json_schema.ts

@@ -1,197 +1,8 @@
 // This is the json schema exported from browser-use, change page_id to tab_id
 // TODO: don't know why zod can not generate the same schema, need to fix it
 export const jsonNavigatorOutputSchema = {
-  $defs: {
-    ActionModel: {
-      properties: {
-        done: {
-          anyOf: [
-            {
-              $ref: '#/$defs/DoneAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Complete task',
-        },
-        search_google: {
-          anyOf: [
-            {
-              $ref: '#/$defs/SearchGoogleAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description:
-            'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
-        },
-        go_to_url: {
-          anyOf: [
-            {
-              $ref: '#/$defs/GoToUrlAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Navigate to URL in the current tab',
-        },
-        go_back: {
-          anyOf: [
-            {
-              $ref: '#/$defs/NoParamsAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Go back',
-        },
-        click_element: {
-          anyOf: [
-            {
-              $ref: '#/$defs/ClickElementAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Click element',
-        },
-        input_text: {
-          anyOf: [
-            {
-              $ref: '#/$defs/InputTextAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Input text into a input interactive element',
-        },
-        switch_tab: {
-          anyOf: [
-            {
-              $ref: '#/$defs/SwitchTabAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Switch tab',
-        },
-        open_tab: {
-          anyOf: [
-            {
-              $ref: '#/$defs/OpenTabAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Open url in new tab',
-        },
-        cache_content: {
-          anyOf: [
-            {
-              $ref: '#/$defs/cache_content_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Cache what you have found so far from the current page so that it can be used in future steps',
-        },
-        scroll_down: {
-          anyOf: [
-            {
-              $ref: '#/$defs/ScrollAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
-        },
-        scroll_up: {
-          anyOf: [
-            {
-              $ref: '#/$defs/ScrollAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
-        },
-        send_keys: {
-          anyOf: [
-            {
-              $ref: '#/$defs/SendKeysAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description:
-            'Send strings of special keys like Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. Be aware of different operating systems and their shortcuts',
-        },
-        scroll_to_text: {
-          anyOf: [
-            {
-              $ref: '#/$defs/scroll_to_text_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'If you dont find something which you want to interact with, scroll to it',
-        },
-        get_dropdown_options: {
-          anyOf: [
-            {
-              $ref: '#/$defs/get_dropdown_options_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Get all options from a native dropdown',
-        },
-        select_dropdown_option: {
-          anyOf: [
-            {
-              $ref: '#/$defs/select_dropdown_option_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description:
-            'Select dropdown option for interactive element index by the text of the option you want to select',
-        },
-      },
-      title: 'ActionModel',
-      type: 'object',
-    },
-    AgentBrain: {
+  properties: {
+    current_state: {
       description: 'Current state of the agent',
       properties: {
         page_summary: {
@@ -215,229 +26,239 @@ export const jsonNavigatorOutputSchema = {
       title: 'AgentBrain',
       type: 'object',
     },
-    ClickElementAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-        xpath: {
-          anyOf: [
-            {
-              type: 'string',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          title: 'Xpath',
-        },
-      },
-      required: ['index'],
-      title: 'ClickElementAction',
-      type: 'object',
-    },
-    DoneAction: {
-      properties: {
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-      },
-      required: ['text'],
-      title: 'DoneAction',
-      type: 'object',
-    },
-    GoToUrlAction: {
-      properties: {
-        url: {
-          title: 'Url',
-          type: 'string',
-        },
-      },
-      required: ['url'],
-      title: 'GoToUrlAction',
-      type: 'object',
-    },
-    InputTextAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-        xpath: {
-          anyOf: [
-            {
-              type: 'string',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          title: 'Xpath',
-        },
-      },
-      required: ['index', 'text'],
-      title: 'InputTextAction',
-      type: 'object',
-    },
-    NoParamsAction: {
-      additionalProperties: true,
-      description:
-        'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
-      properties: {},
-      title: 'NoParamsAction',
-      type: 'object',
-    },
-    OpenTabAction: {
-      properties: {
-        url: {
-          title: 'Url',
-          type: 'string',
-        },
-      },
-      required: ['url'],
-      title: 'OpenTabAction',
-      type: 'object',
-    },
-    ScrollAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        amount: {
-          anyOf: [
-            {
-              type: 'integer',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          title: 'Amount',
-        },
-      },
-      title: 'ScrollAction',
-      type: 'object',
-    },
-    SearchGoogleAction: {
-      properties: {
-        query: {
-          title: 'Query',
-          type: 'string',
-        },
-      },
-      required: ['query'],
-      title: 'SearchGoogleAction',
-      type: 'object',
-    },
-    SendKeysAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        keys: {
-          title: 'Keys',
-          type: 'string',
-        },
-      },
-      required: ['keys'],
-      title: 'SendKeysAction',
-      type: 'object',
-    },
-    SwitchTabAction: {
-      properties: {
-        tab_id: {
-          title: 'Page Id',
-          type: 'integer',
-        },
-      },
-      required: ['tab_id'],
-      title: 'SwitchTabAction',
-      type: 'object',
-    },
-    cache_content_parameters: {
-      properties: {
-        content: {
-          title: 'Content',
-          type: 'string',
-        },
-      },
-      required: ['content'],
-      title: 'cache_content_parameters',
-      type: 'object',
-    },
-    get_dropdown_options_parameters: {
-      properties: {
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-      },
-      required: ['index'],
-      title: 'get_dropdown_options_parameters',
-      type: 'object',
-    },
-    scroll_to_text_parameters: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-      },
-      required: ['text'],
-      title: 'scroll_to_text_parameters',
-      type: 'object',
-    },
-    select_dropdown_option_parameters: {
-      properties: {
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-      },
-      required: ['index', 'text'],
-      title: 'select_dropdown_option_parameters',
-      type: 'object',
-    },
-  },
-  properties: {
-    current_state: {
-      $ref: '#/$defs/AgentBrain',
-    },
     action: {
       items: {
-        $ref: '#/$defs/ActionModel',
+        properties: {
+          done: {
+            properties: {
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+            },
+            required: ['text'],
+            title: 'DoneAction',
+            type: 'object',
+            nullable: true,
+          },
+          search_google: {
+            properties: {
+              query: {
+                title: 'Query',
+                type: 'string',
+              },
+            },
+            required: ['query'],
+            title: 'SearchGoogleAction',
+            type: 'object',
+            nullable: true,
+          },
+          go_to_url: {
+            properties: {
+              url: {
+                title: 'Url',
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            title: 'GoToUrlAction',
+            type: 'object',
+            nullable: true,
+          },
+          go_back: {
+            additionalProperties: true,
+            description:
+              'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
+            properties: {},
+            title: 'NoParamsAction',
+            type: 'object',
+            nullable: true,
+          },
+          click_element: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+              xpath: {
+                title: 'XPath',
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index'],
+            title: 'ClickElementAction',
+            type: 'object',
+            nullable: true,
+          },
+          input_text: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+              xpath: {
+                title: 'XPath',
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index', 'text'],
+            title: 'InputTextAction',
+            type: 'object',
+            nullable: true,
+          },
+          switch_tab: {
+            properties: {
+              tab_id: {
+                title: 'Page Id',
+                type: 'integer',
+              },
+            },
+            required: ['tab_id'],
+            title: 'SwitchTabAction',
+            type: 'object',
+            nullable: true,
+          },
+          open_tab: {
+            properties: {
+              url: {
+                title: 'Url',
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            title: 'OpenTabAction',
+            type: 'object',
+            nullable: true,
+          },
+          cache_content: {
+            properties: {
+              content: {
+                title: 'Content',
+                type: 'string',
+              },
+            },
+            required: ['content'],
+            title: 'cache_content_parameters',
+            type: 'object',
+            nullable: true,
+          },
+          scroll_down: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                title: 'Amount',
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            title: 'ScrollAction',
+            type: 'object',
+            nullable: true,
+          },
+          scroll_up: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                title: 'Amount',
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            title: 'ScrollAction',
+            type: 'object',
+            nullable: true,
+          },
+          send_keys: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              keys: {
+                title: 'Keys',
+                type: 'string',
+              },
+            },
+            required: ['desc', 'keys'],
+            title: 'SendKeysAction',
+            type: 'object',
+            nullable: true,
+          },
+          scroll_to_text: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+            },
+            required: ['desc', 'text'],
+            title: 'scroll_to_text_parameters',
+            type: 'object',
+            nullable: true,
+          },
+          get_dropdown_options: {
+            properties: {
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+            },
+            required: ['index'],
+            title: 'get_dropdown_options_parameters',
+            type: 'object',
+            nullable: true,
+          },
+          select_dropdown_option: {
+            properties: {
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+            },
+            required: ['index', 'text'],
+            title: 'select_dropdown_option_parameters',
+            type: 'object',
+            nullable: true,
+          },
+        },
+        title: 'ActionModel',
+        type: 'object',
       },
       title: 'Action',
       type: 'array',

+ 2 - 2
chrome-extension/src/background/agent/agents/base.ts

@@ -161,13 +161,13 @@ export abstract class BaseAgent<T extends z.ZodType, M = unknown> {
     ];
 
     const toolCallMessage = new AIMessage({
-      content: '',
+      content: 'tool call',
       tool_calls: toolCalls,
     });
     messageManager.addMessageWithTokens(toolCallMessage);
 
     const toolMessage = new ToolMessage({
-      content: '',
+      content: 'tool call response placeholder',
       tool_call_id: toolCallId,
     });
     messageManager.addMessageWithTokens(toolMessage);

+ 21 - 5
chrome-extension/src/background/agent/agents/navigator.ts

@@ -6,11 +6,11 @@ import type { Action } from '../actions/builder';
 import { buildDynamicActionSchema } from '../actions/builder';
 import { agentBrainSchema } from '../types';
 import { type BaseMessage, HumanMessage } from '@langchain/core/messages';
-import { jsonNavigatorOutputSchema } from '../actions/json_schema';
 import { Actors, ExecutionState } from '../event/types';
 import { isAuthenticationError } from '@src/background/utils';
 import { ChatModelAuthError } from './errors';
-
+import { jsonNavigatorOutputSchema } from '../actions/json_schema';
+import { geminiNavigatorOutputSchema } from '../actions/json_gemini';
 const logger = createLogger('NavigatorAgent');
 
 export class NavigatorActionRegistry {
@@ -63,14 +63,21 @@ export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
   async invoke(inputMessages: BaseMessage[]): Promise<this['ModelOutput']> {
     // Use structured output
     if (this.withStructuredOutput) {
+      // For Google Generative AI, we need to use the modelOutputSchema directly
+      // but make sure it doesn't have any 'default' properties that cause issues
+
+      const schema =
+        this.chatModelLibrary === 'ChatGoogleGenerativeAI' ? geminiNavigatorOutputSchema : jsonNavigatorOutputSchema;
+
       // TODO: don't know why zod can not generate the same schema. Use the json schema exported from browser-use as a workaround for now, need to fix it
-      const structuredLlm = this.chatLLM.withStructuredOutput(jsonNavigatorOutputSchema, {
+      const structuredLlm = this.chatLLM.withStructuredOutput(schema, {
         includeRaw: true,
       });
 
       const response = await structuredLlm.invoke(inputMessages, {
         ...this.callOptions,
       });
+
       if (response.parsed) {
         return response.parsed;
       }
@@ -218,11 +225,17 @@ export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
   private async doMultiAction(response: this['ModelOutput']): Promise<ActionResult[]> {
     const results: ActionResult[] = [];
     let errCount = 0;
+
+    logger.info('Actions', response.action);
     // sometimes response.action is a string, but not an array as expected, so we need to parse it as an array
     let actions: Record<string, unknown>[] = [];
     if (Array.isArray(response.action)) {
-      actions = response.action;
-    } else {
+      // if the item is null, skip it
+      actions = response.action.filter((item: unknown) => item !== null);
+      if (actions.length === 0) {
+        logger.warning('No valid actions found', response.action);
+      }
+    } else if (typeof response.action === 'string') {
       try {
         logger.warning('Unexpected action format', response.action);
         // try to parse the action as an JSON object
@@ -231,6 +244,9 @@ export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
         logger.error('Invalid action format', response.action);
         throw new Error('Invalid action output format');
       }
+    } else {
+      // if the action is neither an array nor a string, it should be an object
+      actions = [response.action];
     }
 
     for (const action of actions) {

+ 7 - 3
chrome-extension/src/background/agent/executor.ts

@@ -120,6 +120,7 @@ export class Executor {
 
       let done = false;
       let step = 0;
+      let validatorFailed = false;
 
       for (step = 0; step < allowedMaxSteps; step++) {
         context.stepInfo = {
@@ -133,7 +134,8 @@ export class Executor {
         }
 
         // Run planner if configured
-        if (this.planner && context.nSteps % context.options.planningInterval === 0) {
+        if (this.planner && (context.nSteps % context.options.planningInterval === 0 || validatorFailed)) {
+          validatorFailed = false;
           // The first planning step is special, we don't want to add the browser state message to memory
           if (this.tasks.length > 1 || step > 0) {
             await this.navigator.addStateMessageToMemory();
@@ -153,9 +155,10 @@ export class Executor {
             } else {
               // task is not complete, let's navigate
               this.validator.setPlan(null);
+              done = false;
             }
-            if (!planOutput.result.web_task) {
-              done = true;
+
+            if (!planOutput.result.web_task && planOutput.result.done) {
               break;
             }
           }
@@ -173,6 +176,7 @@ export class Executor {
             logger.info('✅ Task completed successfully');
             break;
           }
+          validatorFailed = true;
         }
       }
 

+ 1 - 1
chrome-extension/src/background/agent/messages/service.ts

@@ -81,7 +81,7 @@ export default class MessageManager {
     ];
 
     const exampleToolCall = new AIMessage({
-      content: '',
+      content: 'example tool call',
       tool_calls: toolCalls,
     });
     this.addMessageWithTokens(exampleToolCall);

+ 2 - 0
packages/schema-utils/.eslintignore

@@ -0,0 +1,2 @@
+dist
+node_modules 

+ 25 - 0
packages/schema-utils/README.md

@@ -0,0 +1,25 @@
+# Schema Utils
+
+This package contains JSON schema definitions and related helpers for tools used across the extension.
+
+## Contents
+
+- JSON schema definitions for navigator output
+- Utility functions for schema flattening, conversion and formatting
+
+## Examples
+
+The `examples/` directory contains runnable examples that demonstrate the package's functionality:
+
+1. **flatten.ts** - Demonstrates how to flatten a JSON schema by dereferencing all `$ref` fields
+2. **convert.ts** - Shows how to convert an OpenAI-compatible schema to Gemini format
+
+To run these examples:
+
+```bash
+# Run the schema flattening example
+pnpm --filter @extension/schema-utils example:flatten
+
+# Run the schema conversion example
+pnpm --filter @extension/schema-utils example:convert
+```

+ 15 - 0
packages/schema-utils/build.mjs

@@ -0,0 +1,15 @@
+import esbuild from 'esbuild';
+
+// Sources to transpile: the package entry point, the library code and the runnable examples
+const sourceGlobs = ['./index.ts', './lib/**/*.ts', './lib/**/*.tsx', './examples/**/*.ts'];
+
+// Transpile each source file on its own (no bundling) into ./dist, emitting source maps
+await esbuild.build(
+  /** @type { import('esbuild').BuildOptions } */ ({
+    entryPoints: sourceGlobs,
+    outdir: './dist',
+    tsconfig: './tsconfig.json',
+    target: 'es6',
+    bundle: false,
+    sourcemap: true,
+  }),
+);

+ 9 - 0
packages/schema-utils/examples/convert.ts

@@ -0,0 +1,9 @@
+import { convertOpenAISchemaToGemini } from '../lib/helper.js';
+import { jsonNavigatorOutputSchema } from '../lib/json_schema.js';
+
+// Announce the conversion, run it, then print the resulting schema as indented JSON
+console.log('Converting jsonNavigatorOutputSchema to Gemini format...');
+const converted = convertOpenAISchemaToGemini(jsonNavigatorOutputSchema);
+const prettyJson = JSON.stringify(converted, null, 2);
+console.log(prettyJson);

+ 28 - 0
packages/schema-utils/examples/flatten.ts

@@ -0,0 +1,28 @@
+import { dereferenceJsonSchema } from '../lib/helper.js';
+import { jsonNavigatorOutputSchema } from '../lib/json_schema.js';
+
+/**
+ * Example: inline every $ref of jsonNavigatorOutputSchema (dropping the $defs section)
+ * and report how the serialized size of the schema changes as a result.
+ */
+
+// Length of the compact JSON serialization of a value
+const sizeOf = (value: unknown): number => JSON.stringify(value).length;
+
+console.log('Flattening jsonNavigatorOutputSchema...');
+const inlined = dereferenceJsonSchema(jsonNavigatorOutputSchema);
+
+// Show the dereferenced schema as indented JSON
+console.log('Flattened Schema:');
+console.log(JSON.stringify(inlined, null, 2));
+
+// Compare the serialized lengths of the original and the flattened schema
+const originalSize = sizeOf(jsonNavigatorOutputSchema);
+const flattenedSize = sizeOf(inlined);
+const delta = flattenedSize - originalSize;
+const percentOfOriginal = ((flattenedSize / originalSize) * 100).toFixed(2);
+
+console.log('\nSize comparison:');
+console.log(`Original schema size: ${originalSize} bytes`);
+console.log(`Flattened schema size: ${flattenedSize} bytes`);
+console.log(`Difference: ${delta} bytes (${percentOfOriginal}% of original)`);
+
+// Note: flattening usually grows the schema, because each reference is replaced by a full copy of its definition

+ 4 - 0
packages/schema-utils/index.ts

@@ -0,0 +1,4 @@
+export * from './lib/json_schema';
+export * from './lib/json_gemini';
+export * from './lib/helpers';
+export * from './lib/helper';

+ 242 - 0
packages/schema-utils/lib/helper.ts

@@ -0,0 +1,242 @@
+/**
+ * Structural type for the subset of JSON Schema handled by the helpers in this module.
+ * Keywords that are not listed explicitly remain accessible through the index signature.
+ */
+export interface JsonSchemaObject {
+  /** Reference to another schema; the helpers in this module resolve the `#/$defs/<name>` form */
+  $ref?: string;
+  /** Named schemas that `$ref` entries can point to */
+  $defs?: Record<string, JsonSchemaObject>;
+  type?: string;
+  /** Schemas of the object's properties, keyed by property name */
+  properties?: Record<string, JsonSchemaObject>;
+  /** Schema of the array elements */
+  items?: JsonSchemaObject;
+  /** Alternative schemas; an alternative with `type: 'null'` is treated as "nullable" by the helpers */
+  anyOf?: JsonSchemaObject[];
+  title?: string;
+  description?: string;
+  /** Names of the properties that must be present */
+  required?: string[];
+  default?: unknown;
+  additionalProperties?: boolean;
+  /** Set to `true` by the helpers in this module when a nullable `anyOf` is collapsed into one schema */
+  nullable?: boolean;
+  [key: string]: unknown;
+}
+
+/**
+ * Produces a self-contained copy of a JSON schema in which `$ref` entries pointing into the
+ * schema's own `$defs` section are replaced by the definitions they refer to.
+ *
+ * @param schema The JSON schema to flatten; the function works on a copy of it
+ * @returns A new schema with references inlined and the top-level `$defs` section removed
+ */
+export function dereferenceJsonSchema(schema: JsonSchemaObject): JsonSchemaObject {
+  // A JSON round-trip gives a deep copy, so the caller's object is left untouched
+  const workingCopy = JSON.parse(JSON.stringify(schema));
+
+  // Inline the references, using the copy's own $defs as the lookup table
+  const resolved = processSchemaNode(workingCopy, workingCopy.$defs || {});
+
+  // Rebuild the result key by key, leaving out the now-redundant $defs section
+  return Object.entries(resolved).reduce<JsonSchemaObject>((flattened, [key, value]) => {
+    if (key !== '$defs') {
+      flattened[key] = value;
+    }
+    return flattened;
+  }, {});
+}
+
+/**
+ * Returns a copy of `resolved` on which the annotation keywords (`title`, `description`, `default`)
+ * that are present on `wrapper` override the ones of `resolved`.
+ */
+function withWrapperAnnotations(resolved: JsonSchemaObject, wrapper: JsonSchemaObject): JsonSchemaObject {
+  const merged: JsonSchemaObject = { ...resolved };
+  for (const key of ['title', 'description', 'default']) {
+    if (wrapper[key] !== undefined) {
+      merged[key] = wrapper[key];
+    }
+  }
+  return merged;
+}
+
+/**
+ * Recursively resolves `#/$defs/...` references inside a single schema node.
+ *
+ * - A `$ref` that names a known definition is replaced by the processed definition.
+ * - An `anyOf` made of exactly one non-null alternative plus a `null` alternative is collapsed into
+ *   that alternative with `nullable: true`; any other `anyOf` is kept with processed alternatives.
+ * - `properties` and `items` are processed recursively; all other keywords are copied unchanged.
+ *
+ * When a reference or a nullable `anyOf` wrapper is replaced, the wrapper's own `title`,
+ * `description` and `default` are carried over so that per-property documentation is not lost.
+ *
+ * @param node The schema node to process
+ * @param definitions The `$defs` table used to resolve references
+ * @param resolving Names of the definitions currently being expanded; used to detect cycles
+ * @returns A new schema node in which resolvable references have been inlined
+ * @throws Error when a definition refers back to itself, directly or indirectly
+ */
+function processSchemaNode(
+  node: JsonSchemaObject,
+  definitions: Record<string, JsonSchemaObject>,
+  resolving: ReadonlySet<string> = new Set<string>(),
+): JsonSchemaObject {
+  // If it's not an object or is null, return as is
+  if (typeof node !== 'object' || node === null) {
+    return node;
+  }
+
+  // If it's a reference, resolve it
+  if (node.$ref) {
+    const refName = node.$ref.replace('#/$defs/', '');
+    const definition = definitions[refName];
+    if (definition) {
+      // A definition that is already being expanded would otherwise recurse until the stack overflows
+      if (resolving.has(refName)) {
+        throw new Error(`Circular $ref detected while dereferencing schema: ${node.$ref}`);
+      }
+      const resolved = processSchemaNode(definition, definitions, new Set(resolving).add(refName));
+      return withWrapperAnnotations(resolved, node);
+    }
+    // Unknown definition: fall through; the dangling $ref key is dropped by the copy loop below
+  }
+
+  // Handle anyOf for references
+  if (node.anyOf) {
+    const processedAnyOf = node.anyOf.map(item => processSchemaNode(item, definitions, resolving));
+
+    // "X or null" is expressed as X with nullable: true
+    const nonNullTypes = processedAnyOf.filter(item => item.type !== 'null');
+    const hasNullType = processedAnyOf.some(item => item.type === 'null');
+    const [onlyAlternative] = nonNullTypes;
+
+    if (onlyAlternative && nonNullTypes.length === 1 && hasNullType) {
+      const merged = withWrapperAnnotations(onlyAlternative, node);
+      merged.nullable = true;
+      return merged;
+    }
+
+    // Otherwise, keep the anyOf structure but with processed items
+    return {
+      ...node,
+      anyOf: processedAnyOf,
+    };
+  }
+
+  // Create a new node with processed properties
+  const result: JsonSchemaObject = {};
+
+  for (const [key, value] of Object.entries(node)) {
+    if (key === '$ref') {
+      continue;
+    }
+    if (key === 'properties' && typeof value === 'object' && value !== null) {
+      // Process every property schema
+      result.properties = {};
+      for (const [propKey, propValue] of Object.entries(value)) {
+        result.properties[propKey] = processSchemaNode(propValue as JsonSchemaObject, definitions, resolving);
+      }
+    } else if (key === 'items' && typeof value === 'object' && value !== null) {
+      // Process the element schema of arrays
+      result.items = processSchemaNode(value as JsonSchemaObject, definitions, resolving);
+    } else {
+      // Copy other keywords as is
+      result[key] = value;
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Converts an OpenAI format JSON schema to a Google Gemini compatible schema
+ *
+ * What the conversion does:
+ * 1. `$ref` entries pointing into `$defs` are replaced by inline definitions
+ * 2. "X or null" `anyOf` constructs are expressed through `nullable: true`
+ * 3. Only the top-level `type`, `properties` and `required` are carried over to the result
+ *
+ * @param openaiSchema The OpenAI format JSON schema to convert
+ * @returns A Google Gemini compatible JSON schema
+ */
+export function convertOpenAISchemaToGemini(openaiSchema: JsonSchemaObject): JsonSchemaObject {
+  const { type, properties, required, $defs } = openaiSchema;
+
+  return {
+    type,
+    // Without properties the result still carries an empty properties object
+    properties: properties ? processProperties(properties, $defs || {}) : {},
+    required: required || [],
+  };
+}
+
+/**
+ * Converts every entry of a `properties` map to Gemini format, resolving references on the way.
+ * Entries whose value is not an object are left out of the result.
+ */
+function processProperties(
+  properties: Record<string, JsonSchemaObject>,
+  definitions: Record<string, JsonSchemaObject>,
+): Record<string, JsonSchemaObject> {
+  const converted: Record<string, JsonSchemaObject> = {};
+
+  Object.keys(properties).forEach(name => {
+    const schema = properties[name];
+    const isSchemaObject = typeof schema === 'object' && schema !== null;
+    if (isSchemaObject) {
+      converted[name] = processProperty(schema, definitions);
+    }
+  });
+
+  return converted;
+}
+
+/**
+ * Converts a single property schema into its Gemini compatible form.
+ *
+ * Resolution order:
+ * 1. A direct `$ref` to a known definition is replaced by the converted definition.
+ * 2. An `anyOf` is collapsed to its referenced definition (preferred) or to its first inline
+ *    non-null alternative, with `nullable: true` when a `null` alternative is present.
+ * 3. Otherwise only `type`, `description`, nested `properties` (with `required`) and `items` are kept.
+ *
+ * A `description` written on the property itself wins over the description of the definition or
+ * alternative it resolves to, so the documentation of each property survives the conversion.
+ *
+ * NOTE: self-referencing definitions are not supported here; they would recurse without end.
+ *
+ * @param property The property schema to convert
+ * @param definitions The `$defs` table used to resolve references
+ * @returns A new, reference-free property schema
+ */
+function processProperty(property: JsonSchemaObject, definitions: Record<string, JsonSchemaObject>): JsonSchemaObject {
+  // Re-applies the property's own description to a schema obtained from a definition or alternative
+  const keepOwnDescription = (converted: JsonSchemaObject): JsonSchemaObject => {
+    if (property.description) {
+      converted.description = property.description;
+    }
+    return converted;
+  };
+
+  // If it's a reference, resolve it
+  if (property.$ref) {
+    const refPath = property.$ref.replace('#/$defs/', '');
+    const definition = definitions[refPath];
+    if (definition) {
+      return keepOwnDescription(processProperty(definition, definitions));
+    }
+  }
+
+  // Handle anyOf for nullable properties
+  if (property.anyOf) {
+    const nonNullType = property.anyOf.find(item => item.type !== 'null' && !item.$ref);
+    const refType = property.anyOf.find(item => item.$ref);
+    const isNullable = property.anyOf.some(item => item.type === 'null');
+
+    if (refType?.$ref) {
+      const refPath = refType.$ref.replace('#/$defs/', '');
+      const definition = definitions[refPath];
+
+      if (definition) {
+        const processed = processProperty(definition, definitions);
+        if (isNullable) {
+          processed.nullable = true;
+        }
+        return keepOwnDescription(processed);
+      }
+    }
+
+    if (nonNullType) {
+      const processed = processProperty(nonNullType, definitions);
+      if (isNullable) {
+        processed.nullable = true;
+      }
+      return keepOwnDescription(processed);
+    }
+  }
+
+  // Special case: NoParamsAction is an object in OpenAI but a nullable string in Gemini
+  if (property.additionalProperties === true && property.title === 'NoParamsAction' && property.description) {
+    return {
+      type: 'string',
+      nullable: true,
+      description: property.description,
+    };
+  }
+
+  // Create a new property object
+  const result: JsonSchemaObject = {
+    type: property.type,
+  };
+
+  // Copy description if it exists
+  if (property.description) {
+    result.description = property.description;
+  }
+
+  // Process nested properties
+  if (property.properties) {
+    result.properties = processProperties(property.properties, definitions);
+
+    // Nested objects always carry a required list, empty when none was given
+    result.required = property.required ? property.required : [];
+  }
+
+  // Handle arrays
+  if (property.items) {
+    result.items = processProperty(property.items, definitions);
+  }
+
+  return result;
+}

+ 432 - 0
packages/schema-utils/lib/json_schema.ts

@@ -0,0 +1,432 @@
+// JSON schema of the navigator output, exported from browser-use with `page_id` renamed to `tab_id`.
+// TODO: find out why zod cannot generate the same schema and fix it
+export const jsonNavigatorOutputSchema = {
+  $defs: {
+    ActionModel: {
+      properties: {
+        done: {
+          anyOf: [
+            {
+              $ref: '#/$defs/DoneAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Complete task',
+        },
+        search_google: {
+          anyOf: [
+            {
+              $ref: '#/$defs/SearchGoogleAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description:
+            'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
+        },
+        go_to_url: {
+          anyOf: [
+            {
+              $ref: '#/$defs/GoToUrlAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Navigate to URL in the current tab',
+        },
+        go_back: {
+          anyOf: [
+            {
+              $ref: '#/$defs/NoParamsAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Go back',
+        },
+        click_element: {
+          anyOf: [
+            {
+              $ref: '#/$defs/ClickElementAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Click element',
+        },
+        input_text: {
+          anyOf: [
+            {
+              $ref: '#/$defs/InputTextAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Input text into a input interactive element',
+        },
+        switch_tab: {
+          anyOf: [
+            {
+              $ref: '#/$defs/SwitchTabAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Switch tab',
+        },
+        open_tab: {
+          anyOf: [
+            {
+              $ref: '#/$defs/OpenTabAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Open url in new tab',
+        },
+        cache_content: {
+          anyOf: [
+            {
+              $ref: '#/$defs/cache_content_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Cache what you have found so far from the current page so that it can be used in future steps',
+        },
+        scroll_down: {
+          anyOf: [
+            {
+              $ref: '#/$defs/ScrollAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
+        },
+        scroll_up: {
+          anyOf: [
+            {
+              $ref: '#/$defs/ScrollAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
+        },
+        send_keys: {
+          anyOf: [
+            {
+              $ref: '#/$defs/SendKeysAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description:
+            'Send strings of special keys like Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. Be aware of different operating systems and their shortcuts',
+        },
+        scroll_to_text: {
+          anyOf: [
+            {
+              $ref: '#/$defs/scroll_to_text_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'If you dont find something which you want to interact with, scroll to it',
+        },
+        get_dropdown_options: {
+          anyOf: [
+            {
+              $ref: '#/$defs/get_dropdown_options_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Get all options from a native dropdown',
+        },
+        select_dropdown_option: {
+          anyOf: [
+            {
+              $ref: '#/$defs/select_dropdown_option_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description:
+            'Select dropdown option for interactive element index by the text of the option you want to select',
+        },
+      },
+      title: 'ActionModel',
+      type: 'object',
+    },
+    AgentBrain: {
+      description: 'Current state of the agent',
+      properties: {
+        page_summary: {
+          title: 'Page Summary',
+          type: 'string',
+        },
+        evaluation_previous_goal: {
+          title: 'Evaluation Previous Goal',
+          type: 'string',
+        },
+        memory: {
+          title: 'Memory',
+          type: 'string',
+        },
+        next_goal: {
+          title: 'Next Goal',
+          type: 'string',
+        },
+      },
+      required: ['page_summary', 'evaluation_previous_goal', 'memory', 'next_goal'],
+      title: 'AgentBrain',
+      type: 'object',
+    },
+    ClickElementAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+        xpath: {
+          anyOf: [
+            {
+              type: 'string',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          title: 'Xpath',
+        },
+      },
+      required: ['desc', 'index'],
+      title: 'ClickElementAction',
+      type: 'object',
+    },
+    DoneAction: {
+      properties: {
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+      },
+      required: ['text'],
+      title: 'DoneAction',
+      type: 'object',
+    },
+    GoToUrlAction: {
+      properties: {
+        url: {
+          title: 'Url',
+          type: 'string',
+        },
+      },
+      required: ['url'],
+      title: 'GoToUrlAction',
+      type: 'object',
+    },
+    InputTextAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+        xpath: {
+          anyOf: [
+            {
+              type: 'string',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          title: 'Xpath',
+        },
+      },
+      required: ['desc', 'index', 'text'],
+      title: 'InputTextAction',
+      type: 'object',
+    },
+    NoParamsAction: {
+      additionalProperties: true,
+      description:
+        'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
+      properties: {},
+      title: 'NoParamsAction',
+      type: 'object',
+    },
+    OpenTabAction: {
+      properties: {
+        url: {
+          title: 'Url',
+          type: 'string',
+        },
+      },
+      required: ['url'],
+      title: 'OpenTabAction',
+      type: 'object',
+    },
+    ScrollAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        amount: {
+          anyOf: [
+            {
+              type: 'integer',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          title: 'Amount',
+        },
+      },
+      required: ['desc'],
+      title: 'ScrollAction',
+      type: 'object',
+    },
+    SearchGoogleAction: {
+      properties: {
+        query: {
+          title: 'Query',
+          type: 'string',
+        },
+      },
+      required: ['query'],
+      title: 'SearchGoogleAction',
+      type: 'object',
+    },
+    SendKeysAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        keys: {
+          title: 'Keys',
+          type: 'string',
+        },
+      },
+      required: ['desc', 'keys'],
+      title: 'SendKeysAction',
+      type: 'object',
+    },
+    SwitchTabAction: {
+      properties: {
+        tab_id: {
+          title: 'Page Id',
+          type: 'integer',
+        },
+      },
+      required: ['tab_id'],
+      title: 'SwitchTabAction',
+      type: 'object',
+    },
+    cache_content_parameters: {
+      properties: {
+        content: {
+          title: 'Content',
+          type: 'string',
+        },
+      },
+      required: ['content'],
+      title: 'cache_content_parameters',
+      type: 'object',
+    },
+    get_dropdown_options_parameters: {
+      properties: {
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+      },
+      required: ['index'],
+      title: 'get_dropdown_options_parameters',
+      type: 'object',
+    },
+    scroll_to_text_parameters: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+      },
+      required: ['desc', 'text'],
+      title: 'scroll_to_text_parameters',
+      type: 'object',
+    },
+    select_dropdown_option_parameters: {
+      properties: {
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+      },
+      required: ['index', 'text'],
+      title: 'select_dropdown_option_parameters',
+      type: 'object',
+    },
+  },
+  properties: {
+    current_state: {
+      $ref: '#/$defs/AgentBrain',
+    },
+    action: {
+      items: {
+        $ref: '#/$defs/ActionModel',
+      },
+      title: 'Action',
+      type: 'array',
+    },
+  },
+  required: ['current_state', 'action'],
+  title: 'AgentOutput',
+  type: 'object',
+};

+ 29 - 0
packages/schema-utils/package.json

@@ -0,0 +1,29 @@
+{
+  "name": "@extension/schema-utils",
+  "version": "0.1.0",
+  "description": "JSON schema and related helpers for tools",
+  "private": true,
+  "type": "module",
+  "sideEffects": false,
+  "files": [
+    "dist/**"
+  ],
+  "types": "index.ts",
+  "main": "./dist/index.js",
+  "scripts": {
+    "clean:bundle": "rimraf dist",
+    "clean:node_modules": "pnpx rimraf node_modules",
+    "clean:turbo": "rimraf .turbo",
+    "clean": "pnpm clean:bundle && pnpm clean:node_modules && pnpm clean:turbo",
+    "ready": "node build.mjs",
+    "lint": "eslint . --ext .ts,.tsx",
+    "lint:fix": "pnpm lint --fix",
+    "prettier": "prettier . --write --ignore-path ../../.prettierignore",
+    "type-check": "tsc --noEmit",
+    "example:convert": "pnpm run ready && node dist/examples/convert.js",
+    "example:flatten": "pnpm run ready && node dist/examples/flatten.js"
+  },
+  "devDependencies": {
+    "@extension/tsconfig": "workspace:*"
+  }
+}

+ 174 - 0
packages/schema-utils/tests/convert.test.ts

@@ -0,0 +1,174 @@
+import { convertOpenAISchemaToGemini } from '../lib/helper';
+// JsonSchemaObject is declared in lib/helper; lib/json_schema only exports jsonNavigatorOutputSchema
+import type { JsonSchemaObject } from '../lib/helper';
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+
+// Minimal stand-in for a test framework's describe(), since neither Jest nor Mocha is installed:
+// prints a banner with the suite name and runs the suite body right away
+function describe(name: string, fn: () => void) {
+  const banner = ['', `--- ${name} ---`].join('\n');
+  console.log(banner);
+  fn();
+}
+
+/**
+ * Runs a single test case and reports the outcome on the console.
+ *
+ * A failing case does not abort the remaining cases, but it marks the whole process as failed
+ * (non-zero exit code) so that whoever spawned this file can detect the failure.
+ *
+ * @param name Human readable name of the test case
+ * @param fn The test body; throwing means the case failed
+ */
+function it(name: string, fn: () => void) {
+  console.log(`\n  Test: ${name}`);
+  try {
+    fn();
+    console.log('  ✅ PASSED');
+  } catch (error) {
+    console.error('  ❌ FAILED:', error);
+    // Without this the process would still exit with code 0 after a failed assertion
+    process.exitCode = 1;
+  }
+}
+
+/**
+ * Tiny replacement for a test framework's expect().
+ *
+ * - `toEqual` compares the JSON serializations of both values
+ * - `toBeTruthy` checks plain JavaScript truthiness
+ *
+ * Both matchers throw an Error describing the mismatch.
+ */
+function expect(actual: unknown) {
+  const toEqual = (expected: unknown) => {
+    const [got, wanted] = [actual, expected].map(value => JSON.stringify(value));
+    if (got !== wanted) {
+      throw new Error(`Expected ${wanted} but got ${got}`);
+    }
+  };
+
+  const toBeTruthy = () => {
+    if (!actual) {
+      throw new Error(`Expected truthy value but got ${actual}`);
+    }
+  };
+
+  return { toEqual, toBeTruthy };
+}
+
+describe('convertOpenAISchemaToGemini', () => {
+  it('should convert OpenAI schema to Gemini format', () => {
+    // Sample OpenAI schema with references and nullable properties
+    const openaiSchema: JsonSchemaObject = {
+      type: 'object',
+      properties: {
+        name: {
+          type: 'string',
+          description: 'The name of the user',
+        },
+        age: {
+          type: 'number',
+          description: 'The age of the user',
+        },
+        address: {
+          $ref: '#/$defs/Address',
+        },
+        email: {
+          anyOf: [{ type: 'string', description: 'Email address' }, { type: 'null' }],
+        },
+        tags: {
+          type: 'array',
+          items: {
+            type: 'string',
+          },
+        },
+        profile: {
+          $ref: '#/$defs/Profile',
+        },
+      },
+      required: ['name', 'age'],
+      $defs: {
+        Address: {
+          type: 'object',
+          properties: {
+            street: { type: 'string' },
+            city: { type: 'string' },
+            zipCode: { type: 'string' },
+          },
+          required: ['street', 'city'],
+        },
+        Profile: {
+          type: 'object',
+          properties: {
+            bio: { type: 'string' },
+            website: {
+              anyOf: [{ type: 'string' }, { type: 'null' }],
+            },
+          },
+        },
+      },
+    };
+
+    // Convert the schema
+    const geminiSchema = convertOpenAISchemaToGemini(openaiSchema);
+
+    // Expected Gemini schema
+    const expectedGeminiSchema: JsonSchemaObject = {
+      type: 'object',
+      properties: {
+        name: {
+          type: 'string',
+          description: 'The name of the user',
+        },
+        age: {
+          type: 'number',
+          description: 'The age of the user',
+        },
+        address: {
+          type: 'object',
+          properties: {
+            street: { type: 'string' },
+            city: { type: 'string' },
+            zipCode: { type: 'string' },
+          },
+          required: ['street', 'city'],
+        },
+        email: {
+          type: 'string',
+          description: 'Email address',
+          nullable: true,
+        },
+        tags: {
+          type: 'array',
+          items: {
+            type: 'string',
+          },
+        },
+        profile: {
+          type: 'object',
+          properties: {
+            bio: { type: 'string' },
+            website: {
+              type: 'string',
+              nullable: true,
+            },
+          },
+          required: [],
+        },
+      },
+      required: ['name', 'age'],
+    };
+
+    // Verify the conversion
+    expect(geminiSchema).toEqual(expectedGeminiSchema);
+
+    // Write the schemas to files for manual inspection
+    const testDir = path.join(__dirname, 'output');
+    if (!fs.existsSync(testDir)) {
+      fs.mkdirSync(testDir, { recursive: true });
+    }
+
+    fs.writeFileSync(path.join(testDir, 'openai.json'), JSON.stringify(openaiSchema, null, 2));
+
+    fs.writeFileSync(path.join(testDir, 'gemini.json'), JSON.stringify(geminiSchema, null, 2));
+  });
+
+  it('should convert the actual json_schema.ts to gemini.json', () => {
+    // Import the actual schema from json_schema.ts
+    // eslint-disable-next-line @typescript-eslint/no-var-requires
+    const { jsonNavigatorOutputSchema } = require('../lib/json_schema');
+
+    // Convert the schema
+    const geminiSchema = convertOpenAISchemaToGemini(jsonNavigatorOutputSchema);
+
+    // Write the converted schema to a file
+    const outputDir = path.join(__dirname, '../');
+    fs.writeFileSync(path.join(outputDir, 'gemini.json'), JSON.stringify(geminiSchema, null, 2));
+
+    // Verify the conversion was successful
+    expect(geminiSchema).toBeTruthy();
+    expect(geminiSchema.properties).toBeTruthy();
+  });
+});

+ 19 - 0
packages/schema-utils/tests/run-test.js

@@ -0,0 +1,19 @@
+#!/usr/bin/env node
+
+// Simple script to run our TypeScript test file.
+// This package declares "type": "module", so this .js file is loaded as an ES module:
+// `require` and `__dirname` do not exist here, hence the imports and import.meta.url below.
+import { execSync } from 'node:child_process';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+// Package root = parent of the directory that contains this script
+const packageRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
+
+try {
+  // Run the test file through ts-node from the package root
+  console.log('Running convert.test.ts...');
+  execSync('npx ts-node tests/convert.test.ts', {
+    cwd: packageRoot,
+    stdio: 'inherit',
+  });
+
+  console.log('\nTest completed successfully!');
+} catch (error) {
+  // execSync throws when the command exits with a non-zero status
+  console.error('\nTest failed:', error instanceof Error ? error.message : error);
+  process.exit(1);
+}

+ 5 - 0
packages/schema-utils/tsconfig.json

@@ -0,0 +1,5 @@
+{
+  "extends": "@extension/tsconfig/base.json",
+  "include": ["./**/*.ts", "./**/*.tsx"],
+  "exclude": ["node_modules", "dist"]
+}

+ 2 - 2
packages/storage/lib/settings/types.ts

@@ -13,12 +13,12 @@ export enum LLMProviderEnum {
 
 export const llmProviderModelNames = {
   [LLMProviderEnum.OpenAI]: ['gpt-4o', 'gpt-4o-mini', 'o1', 'o1-mini', 'o3-mini'],
-  [LLMProviderEnum.Anthropic]: ['claude-3-5-sonnet-latest', 'claude-3-5-haiku-latest'],
+  [LLMProviderEnum.Anthropic]: ['claude-3-7-sonnet-latest', 'claude-3-5-haiku-latest'],
   [LLMProviderEnum.Gemini]: [
     'gemini-2.0-flash',
     'gemini-2.0-flash-lite',
     'gemini-2.0-pro-exp-02-05',
-    'gemini-2.0-flash-thinking-exp-01-21',
+    // 'gemini-2.0-flash-thinking-exp-01-21', // TODO: not support function calling for now
   ],
 };
 

+ 1 - 1
pages/options/src/Options.tsx

@@ -5,7 +5,7 @@ import { withErrorBoundary, withSuspense } from '@extension/shared';
 import { GeneralSettings } from './components/GeneralSettings';
 import { ModelSettings } from './components/ModelSettings';
 const Options = () => {
-  const [activeTab, setActiveTab] = useState('general');
+  const [activeTab, setActiveTab] = useState('models');
 
   const renderTabContent = () => {
     switch (activeTab) {

+ 0 - 26
pages/options/src/components/ModelSettings.tsx

@@ -276,19 +276,6 @@ export const ModelSettings = () => {
                 onChange={e => handleApiKeyChange(LLMProviderEnum.Anthropic, e.target.value)}
                 className="w-full p-2 rounded-md bg-gray-50 border border-gray-200 focus:border-blue-400 focus:ring-2 focus:ring-blue-200 outline-none"
               />
-              <input
-                type="text"
-                placeholder="Custom Base URL (Optional)"
-                value={apiKeys[LLMProviderEnum.Anthropic]?.baseUrl || ''}
-                onChange={e =>
-                  handleApiKeyChange(
-                    LLMProviderEnum.Anthropic,
-                    apiKeys[LLMProviderEnum.Anthropic]?.apiKey || '',
-                    e.target.value,
-                  )
-                }
-                className="w-full p-2 rounded-md bg-gray-50 border border-gray-200 focus:border-blue-400 focus:ring-2 focus:ring-blue-200 outline-none"
-              />
             </div>
           </div>
 
@@ -316,19 +303,6 @@ export const ModelSettings = () => {
                 onChange={e => handleApiKeyChange(LLMProviderEnum.Gemini, e.target.value)}
                 className="w-full p-2 rounded-md bg-gray-50 border border-gray-200 focus:border-blue-400 focus:ring-2 focus:ring-blue-200 outline-none"
               />
-              <input
-                type="text"
-                placeholder="Custom Base URL (Optional)"
-                value={apiKeys[LLMProviderEnum.Gemini]?.baseUrl || ''}
-                onChange={e =>
-                  handleApiKeyChange(
-                    LLMProviderEnum.Gemini,
-                    apiKeys[LLMProviderEnum.Gemini]?.apiKey || '',
-                    e.target.value,
-                  )
-                }
-                className="w-full p-2 rounded-md bg-gray-50 border border-gray-200 focus:border-blue-400 focus:ring-2 focus:ring-blue-200 outline-none"
-              />
             </div>
           </div>
         </div>

+ 6 - 0
pnpm-lock.yaml

@@ -209,6 +209,12 @@ importers:
         specifier: workspace:*
         version: link:../tsconfig
 
+  packages/schema-utils:
+    devDependencies:
+      '@extension/tsconfig':
+        specifier: workspace:*
+        version: link:../tsconfig
+
   packages/shared:
     devDependencies:
       '@extension/storage':