Переглянути джерело

Merge pull request #15 from nanobrowser/gemini

Ashu 5 місяців тому
батько
коміт
0f2e96d9a7

+ 1 - 0
chrome-extension/package.json

@@ -20,6 +20,7 @@
     "@extension/storage": "workspace:*",
     "@langchain/anthropic": "^0.3.12",
     "@langchain/core": "^0.3.37",
+    "@langchain/google-genai": "0.1.10",
     "@langchain/openai": "^0.4.2",
     "puppeteer-core": "24.1.1",
     "webextension-polyfill": "^0.12.0",

+ 21 - 8
chrome-extension/src/background/agent/actions/builder.ts

@@ -36,19 +36,30 @@ export class InvalidInputError extends Error {
  */
 export class Action {
   constructor(
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
     private readonly handler: (input: any) => Promise<ActionResult>,
     public readonly schema: ActionSchema,
   ) {}
 
   async call(input: unknown): Promise<ActionResult> {
     // Validate input before calling the handler
-    const result = this.schema.schema.safeParse(input);
-    if (!result.success) {
-      const errorMessage = result.error.message;
-      logger.error('Invalid input', errorMessage);
+    const schema = this.schema.schema;
+
+    // If the schema is an empty object, i.e. z.object({}), ignore the input and call the handler with {}
+    const isEmptySchema =
+      schema instanceof z.ZodObject &&
+      Object.keys((schema as z.ZodObject<Record<string, z.ZodTypeAny>>).shape || {}).length === 0;
+
+    if (isEmptySchema) {
+      return await this.handler({});
+    }
+
+    const parsedArgs = this.schema.schema.safeParse(input);
+    if (!parsedArgs.success) {
+      const errorMessage = parsedArgs.error.message;
       throw new InvalidInputError(errorMessage);
     }
-    return await this.handler(result.data);
+    return await this.handler(parsedArgs.data);
   }
 
   name() {
@@ -60,6 +71,7 @@ export class Action {
    * @returns {string} The prompt for the action
    */
   prompt() {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
     const schemaShape = (this.schema.schema as z.ZodObject<any>).shape || {};
     const schemaProperties = Object.entries(schemaShape).map(([key, value]) => {
       const zodValue = value as z.ZodTypeAny;
@@ -77,13 +89,14 @@ export class Action {
 export function buildDynamicActionSchema(actions: Action[]): z.ZodType {
   let schema = z.object({});
   for (const action of actions) {
-    // create a schema for the action, it could be action.schema.schema or null, and default to null
-    const actionSchema = action.schema.schema.nullable().default(null);
+    // Each action's entry is either its own schema (action.schema.schema) or null.
+    // Do not add .default(null) here: it causes issues with Google Generative AI.
+    const actionSchema = action.schema.schema.nullable();
     schema = schema.extend({
       [action.name()]: actionSchema,
     });
   }
-  return schema.partial();
+  return schema.partial().nullable();
 }
 
 export class ActionBuilder {

+ 221 - 0
chrome-extension/src/background/agent/actions/json_gemini.ts

@@ -0,0 +1,221 @@
+// TODO: zod cannot currently generate this schema (cause unknown), so it is written out by hand; needs a fix
+export const geminiNavigatorOutputSchema = {
+  type: 'object',
+  properties: {
+    current_state: {
+      type: 'object',
+      description: 'Current state of the agent',
+      properties: {
+        page_summary: {
+          type: 'string',
+        },
+        evaluation_previous_goal: {
+          type: 'string',
+        },
+        memory: {
+          type: 'string',
+        },
+        next_goal: {
+          type: 'string',
+        },
+      },
+      required: ['page_summary', 'evaluation_previous_goal', 'memory', 'next_goal'],
+    },
+    action: {
+      type: 'array',
+      items: {
+        type: 'object',
+        properties: {
+          done: {
+            type: 'object',
+            properties: {
+              text: {
+                type: 'string',
+              },
+            },
+            required: ['text'],
+            nullable: true,
+          },
+          search_google: {
+            type: 'object',
+            properties: {
+              query: {
+                type: 'string',
+              },
+            },
+            required: ['query'],
+            nullable: true,
+          },
+          go_to_url: {
+            type: 'object',
+            properties: {
+              url: {
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            nullable: true,
+          },
+          go_back: {
+            type: 'string',
+            nullable: true,
+            description:
+              'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
+          },
+          click_element: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                type: 'integer',
+              },
+              xpath: {
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index'],
+            nullable: true,
+          },
+          input_text: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                type: 'integer',
+              },
+              text: {
+                type: 'string',
+              },
+              xpath: {
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index', 'text'],
+            nullable: true,
+          },
+          switch_tab: {
+            type: 'object',
+            properties: {
+              tab_id: {
+                type: 'integer',
+              },
+            },
+            required: ['tab_id'],
+            nullable: true,
+          },
+          open_tab: {
+            type: 'object',
+            properties: {
+              url: {
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            nullable: true,
+          },
+          cache_content: {
+            type: 'object',
+            properties: {
+              content: {
+                type: 'string',
+              },
+            },
+            required: ['content'],
+            nullable: true,
+          },
+          scroll_down: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            nullable: true,
+          },
+          scroll_up: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            nullable: true,
+          },
+          send_keys: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              keys: {
+                type: 'string',
+              },
+            },
+            required: ['desc', 'keys'],
+            nullable: true,
+          },
+          scroll_to_text: {
+            type: 'object',
+            properties: {
+              desc: {
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              text: {
+                type: 'string',
+              },
+            },
+            required: ['desc', 'text'],
+            nullable: true,
+          },
+          get_dropdown_options: {
+            type: 'object',
+            properties: {
+              index: {
+                type: 'integer',
+              },
+            },
+            required: ['index'],
+            nullable: true,
+          },
+          select_dropdown_option: {
+            type: 'object',
+            properties: {
+              index: {
+                type: 'integer',
+              },
+              text: {
+                type: 'string',
+              },
+            },
+            required: ['index', 'text'],
+            nullable: true,
+          },
+        },
+        required: [],
+      },
+    },
+  },
+  required: ['current_state', 'action'],
+};

+ 233 - 412
chrome-extension/src/background/agent/actions/json_schema.ts

@@ -1,197 +1,8 @@
 // This is the json schema exported from browser-use, change page_id to tab_id
 // TODO: don't know why zod can not generate the same schema, need to fix it
 export const jsonNavigatorOutputSchema = {
-  $defs: {
-    ActionModel: {
-      properties: {
-        done: {
-          anyOf: [
-            {
-              $ref: '#/$defs/DoneAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Complete task',
-        },
-        search_google: {
-          anyOf: [
-            {
-              $ref: '#/$defs/SearchGoogleAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description:
-            'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
-        },
-        go_to_url: {
-          anyOf: [
-            {
-              $ref: '#/$defs/GoToUrlAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Navigate to URL in the current tab',
-        },
-        go_back: {
-          anyOf: [
-            {
-              $ref: '#/$defs/NoParamsAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Go back',
-        },
-        click_element: {
-          anyOf: [
-            {
-              $ref: '#/$defs/ClickElementAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Click element',
-        },
-        input_text: {
-          anyOf: [
-            {
-              $ref: '#/$defs/InputTextAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Input text into a input interactive element',
-        },
-        switch_tab: {
-          anyOf: [
-            {
-              $ref: '#/$defs/SwitchTabAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Switch tab',
-        },
-        open_tab: {
-          anyOf: [
-            {
-              $ref: '#/$defs/OpenTabAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Open url in new tab',
-        },
-        cache_content: {
-          anyOf: [
-            {
-              $ref: '#/$defs/cache_content_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Cache what you have found so far from the current page so that it can be used in future steps',
-        },
-        scroll_down: {
-          anyOf: [
-            {
-              $ref: '#/$defs/ScrollAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
-        },
-        scroll_up: {
-          anyOf: [
-            {
-              $ref: '#/$defs/ScrollAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
-        },
-        send_keys: {
-          anyOf: [
-            {
-              $ref: '#/$defs/SendKeysAction',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description:
-            'Send strings of special keys like Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. Be aware of different operating systems and their shortcuts',
-        },
-        scroll_to_text: {
-          anyOf: [
-            {
-              $ref: '#/$defs/scroll_to_text_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'If you dont find something which you want to interact with, scroll to it',
-        },
-        get_dropdown_options: {
-          anyOf: [
-            {
-              $ref: '#/$defs/get_dropdown_options_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description: 'Get all options from a native dropdown',
-        },
-        select_dropdown_option: {
-          anyOf: [
-            {
-              $ref: '#/$defs/select_dropdown_option_parameters',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          description:
-            'Select dropdown option for interactive element index by the text of the option you want to select',
-        },
-      },
-      title: 'ActionModel',
-      type: 'object',
-    },
-    AgentBrain: {
+  properties: {
+    current_state: {
       description: 'Current state of the agent',
       properties: {
         page_summary: {
@@ -215,229 +26,239 @@ export const jsonNavigatorOutputSchema = {
       title: 'AgentBrain',
       type: 'object',
     },
-    ClickElementAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-        xpath: {
-          anyOf: [
-            {
-              type: 'string',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          title: 'Xpath',
-        },
-      },
-      required: ['index'],
-      title: 'ClickElementAction',
-      type: 'object',
-    },
-    DoneAction: {
-      properties: {
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-      },
-      required: ['text'],
-      title: 'DoneAction',
-      type: 'object',
-    },
-    GoToUrlAction: {
-      properties: {
-        url: {
-          title: 'Url',
-          type: 'string',
-        },
-      },
-      required: ['url'],
-      title: 'GoToUrlAction',
-      type: 'object',
-    },
-    InputTextAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-        xpath: {
-          anyOf: [
-            {
-              type: 'string',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          title: 'Xpath',
-        },
-      },
-      required: ['index', 'text'],
-      title: 'InputTextAction',
-      type: 'object',
-    },
-    NoParamsAction: {
-      additionalProperties: true,
-      description:
-        'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
-      properties: {},
-      title: 'NoParamsAction',
-      type: 'object',
-    },
-    OpenTabAction: {
-      properties: {
-        url: {
-          title: 'Url',
-          type: 'string',
-        },
-      },
-      required: ['url'],
-      title: 'OpenTabAction',
-      type: 'object',
-    },
-    ScrollAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        amount: {
-          anyOf: [
-            {
-              type: 'integer',
-            },
-            {
-              type: 'null',
-            },
-          ],
-          default: null,
-          title: 'Amount',
-        },
-      },
-      title: 'ScrollAction',
-      type: 'object',
-    },
-    SearchGoogleAction: {
-      properties: {
-        query: {
-          title: 'Query',
-          type: 'string',
-        },
-      },
-      required: ['query'],
-      title: 'SearchGoogleAction',
-      type: 'object',
-    },
-    SendKeysAction: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        keys: {
-          title: 'Keys',
-          type: 'string',
-        },
-      },
-      required: ['keys'],
-      title: 'SendKeysAction',
-      type: 'object',
-    },
-    SwitchTabAction: {
-      properties: {
-        tab_id: {
-          title: 'Page Id',
-          type: 'integer',
-        },
-      },
-      required: ['tab_id'],
-      title: 'SwitchTabAction',
-      type: 'object',
-    },
-    cache_content_parameters: {
-      properties: {
-        content: {
-          title: 'Content',
-          type: 'string',
-        },
-      },
-      required: ['content'],
-      title: 'cache_content_parameters',
-      type: 'object',
-    },
-    get_dropdown_options_parameters: {
-      properties: {
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-      },
-      required: ['index'],
-      title: 'get_dropdown_options_parameters',
-      type: 'object',
-    },
-    scroll_to_text_parameters: {
-      properties: {
-        desc: {
-          title: 'Description',
-          type: 'string',
-          description: 'Description of the purpose of calling this action',
-        },
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-      },
-      required: ['text'],
-      title: 'scroll_to_text_parameters',
-      type: 'object',
-    },
-    select_dropdown_option_parameters: {
-      properties: {
-        index: {
-          title: 'Index',
-          type: 'integer',
-        },
-        text: {
-          title: 'Text',
-          type: 'string',
-        },
-      },
-      required: ['index', 'text'],
-      title: 'select_dropdown_option_parameters',
-      type: 'object',
-    },
-  },
-  properties: {
-    current_state: {
-      $ref: '#/$defs/AgentBrain',
-    },
     action: {
       items: {
-        $ref: '#/$defs/ActionModel',
+        properties: {
+          done: {
+            properties: {
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+            },
+            required: ['text'],
+            title: 'DoneAction',
+            type: 'object',
+            nullable: true,
+          },
+          search_google: {
+            properties: {
+              query: {
+                title: 'Query',
+                type: 'string',
+              },
+            },
+            required: ['query'],
+            title: 'SearchGoogleAction',
+            type: 'object',
+            nullable: true,
+          },
+          go_to_url: {
+            properties: {
+              url: {
+                title: 'Url',
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            title: 'GoToUrlAction',
+            type: 'object',
+            nullable: true,
+          },
+          go_back: {
+            additionalProperties: true,
+            description:
+              'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
+            properties: {},
+            title: 'NoParamsAction',
+            type: 'object',
+            nullable: true,
+          },
+          click_element: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+              xpath: {
+                title: 'XPath',
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index'],
+            title: 'ClickElementAction',
+            type: 'object',
+            nullable: true,
+          },
+          input_text: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+              xpath: {
+                title: 'XPath',
+                type: 'string',
+                nullable: true,
+              },
+            },
+            required: ['desc', 'index', 'text'],
+            title: 'InputTextAction',
+            type: 'object',
+            nullable: true,
+          },
+          switch_tab: {
+            properties: {
+              tab_id: {
+                title: 'Page Id',
+                type: 'integer',
+              },
+            },
+            required: ['tab_id'],
+            title: 'SwitchTabAction',
+            type: 'object',
+            nullable: true,
+          },
+          open_tab: {
+            properties: {
+              url: {
+                title: 'Url',
+                type: 'string',
+              },
+            },
+            required: ['url'],
+            title: 'OpenTabAction',
+            type: 'object',
+            nullable: true,
+          },
+          cache_content: {
+            properties: {
+              content: {
+                title: 'Content',
+                type: 'string',
+              },
+            },
+            required: ['content'],
+            title: 'cache_content_parameters',
+            type: 'object',
+            nullable: true,
+          },
+          scroll_down: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                title: 'Amount',
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            title: 'ScrollAction',
+            type: 'object',
+            nullable: true,
+          },
+          scroll_up: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              amount: {
+                title: 'Amount',
+                type: 'integer',
+                nullable: true,
+              },
+            },
+            required: ['desc'],
+            title: 'ScrollAction',
+            type: 'object',
+            nullable: true,
+          },
+          send_keys: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              keys: {
+                title: 'Keys',
+                type: 'string',
+              },
+            },
+            required: ['desc', 'keys'],
+            title: 'SendKeysAction',
+            type: 'object',
+            nullable: true,
+          },
+          scroll_to_text: {
+            properties: {
+              desc: {
+                title: 'Intent',
+                type: 'string',
+                description: 'Very short explanation of the intent or purpose for calling this action',
+              },
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+            },
+            required: ['desc', 'text'],
+            title: 'scroll_to_text_parameters',
+            type: 'object',
+            nullable: true,
+          },
+          get_dropdown_options: {
+            properties: {
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+            },
+            required: ['index'],
+            title: 'get_dropdown_options_parameters',
+            type: 'object',
+            nullable: true,
+          },
+          select_dropdown_option: {
+            properties: {
+              index: {
+                title: 'Index',
+                type: 'integer',
+              },
+              text: {
+                title: 'Text',
+                type: 'string',
+              },
+            },
+            required: ['index', 'text'],
+            title: 'select_dropdown_option_parameters',
+            type: 'object',
+            nullable: true,
+          },
+        },
+        title: 'ActionModel',
+        type: 'object',
       },
       title: 'Action',
       type: 'array',

+ 2 - 2
chrome-extension/src/background/agent/agents/base.ts

@@ -161,13 +161,13 @@ export abstract class BaseAgent<T extends z.ZodType, M = unknown> {
     ];
 
     const toolCallMessage = new AIMessage({
-      content: '',
+      content: 'tool call',
       tool_calls: toolCalls,
     });
     messageManager.addMessageWithTokens(toolCallMessage);
 
     const toolMessage = new ToolMessage({
-      content: '',
+      content: 'tool call response placeholder',
       tool_call_id: toolCallId,
     });
     messageManager.addMessageWithTokens(toolMessage);

+ 21 - 5
chrome-extension/src/background/agent/agents/navigator.ts

@@ -6,11 +6,11 @@ import type { Action } from '../actions/builder';
 import { buildDynamicActionSchema } from '../actions/builder';
 import { agentBrainSchema } from '../types';
 import { type BaseMessage, HumanMessage } from '@langchain/core/messages';
-import { jsonNavigatorOutputSchema } from '../actions/json_schema';
 import { Actors, ExecutionState } from '../event/types';
 import { isAuthenticationError } from '@src/background/utils';
 import { ChatModelAuthError } from './errors';
-
+import { jsonNavigatorOutputSchema } from '../actions/json_schema';
+import { geminiNavigatorOutputSchema } from '../actions/json_gemini';
 const logger = createLogger('NavigatorAgent');
 
 export class NavigatorActionRegistry {
@@ -63,14 +63,21 @@ export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
   async invoke(inputMessages: BaseMessage[]): Promise<this['ModelOutput']> {
     // Use structured output
     if (this.withStructuredOutput) {
+      // For Google Generative AI, use the dedicated Gemini JSON schema, which has no
+      // 'default' properties (they cause issues there); otherwise use the generic JSON schema
+
+      const schema =
+        this.chatModelLibrary === 'ChatGoogleGenerativeAI' ? geminiNavigatorOutputSchema : jsonNavigatorOutputSchema;
+
       // TODO: don't know why zod can not generate the same schema. Use the json schema exported from browser-use as a workaround for now, need to fix it
-      const structuredLlm = this.chatLLM.withStructuredOutput(jsonNavigatorOutputSchema, {
+      const structuredLlm = this.chatLLM.withStructuredOutput(schema, {
         includeRaw: true,
       });
 
       const response = await structuredLlm.invoke(inputMessages, {
         ...this.callOptions,
       });
+
       if (response.parsed) {
         return response.parsed;
       }
@@ -218,11 +225,17 @@ export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
   private async doMultiAction(response: this['ModelOutput']): Promise<ActionResult[]> {
     const results: ActionResult[] = [];
     let errCount = 0;
+
+    logger.info('Actions', response.action);
     // sometimes response.action is a string, but not an array as expected, so we need to parse it as an array
     let actions: Record<string, unknown>[] = [];
     if (Array.isArray(response.action)) {
-      actions = response.action;
-    } else {
+      // if the item is null, skip it
+      actions = response.action.filter((item: unknown) => item !== null);
+      if (actions.length === 0) {
+        logger.warning('No valid actions found', response.action);
+      }
+    } else if (typeof response.action === 'string') {
       try {
         logger.warning('Unexpected action format', response.action);
         // try to parse the action as an JSON object
@@ -231,6 +244,9 @@ export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
         logger.error('Invalid action format', response.action);
         throw new Error('Invalid action output format');
       }
+    } else {
+      // if the action is neither an array nor a string, it should be an object
+      actions = [response.action];
     }
 
     for (const action of actions) {

+ 7 - 3
chrome-extension/src/background/agent/executor.ts

@@ -120,6 +120,7 @@ export class Executor {
 
       let done = false;
       let step = 0;
+      let validatorFailed = false;
 
       for (step = 0; step < allowedMaxSteps; step++) {
         context.stepInfo = {
@@ -133,7 +134,8 @@ export class Executor {
         }
 
         // Run planner if configured
-        if (this.planner && context.nSteps % context.options.planningInterval === 0) {
+        if (this.planner && (context.nSteps % context.options.planningInterval === 0 || validatorFailed)) {
+          validatorFailed = false;
           // The first planning step is special, we don't want to add the browser state message to memory
           if (this.tasks.length > 1 || step > 0) {
             await this.navigator.addStateMessageToMemory();
@@ -153,9 +155,10 @@ export class Executor {
             } else {
               // task is not complete, let's navigate
               this.validator.setPlan(null);
+              done = false;
             }
-            if (!planOutput.result.web_task) {
-              done = true;
+
+            if (!planOutput.result.web_task && planOutput.result.done) {
               break;
             }
           }
@@ -173,6 +176,7 @@ export class Executor {
             logger.info('✅ Task completed successfully');
             break;
           }
+          validatorFailed = true;
         }
       }
 

+ 12 - 0
chrome-extension/src/background/agent/helper.ts

@@ -1,6 +1,7 @@
 import { type ProviderConfig, LLMProviderEnum, AgentNameEnum } from '@extension/storage';
 import { ChatOpenAI } from '@langchain/openai';
 import { ChatAnthropic } from '@langchain/anthropic';
+import { ChatGoogleGenerativeAI } from '@langchain/google-genai';
 import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
 
 // create a chat model based on the agent name, the model name and provider
@@ -60,6 +61,17 @@ export function createChatModel(
       }
       return new ChatAnthropic(args);
     }
+    case LLMProviderEnum.Gemini: {
+      temperature = 0.5;
+      topP = 0.8;
+      const args = {
+        model: modelName,
+        apiKey: providerConfig.apiKey,
+        temperature,
+        topP,
+      };
+      return new ChatGoogleGenerativeAI(args);
+    }
     default: {
       throw new Error(`Provider ${providerName} not supported yet`);
     }

+ 1 - 1
chrome-extension/src/background/agent/messages/service.ts

@@ -81,7 +81,7 @@ export default class MessageManager {
     ];
 
     const exampleToolCall = new AIMessage({
-      content: '',
+      content: 'example tool call',
       tool_calls: toolCalls,
     });
     this.addMessageWithTokens(exampleToolCall);

+ 2 - 0
packages/schema-utils/.eslintignore

@@ -0,0 +1,2 @@
+dist
+node_modules 

+ 25 - 0
packages/schema-utils/README.md

@@ -0,0 +1,25 @@
+# Schema Utils
+
+This package contains JSON schema definitions and related helpers for tools used across the extension.
+
+## Contents
+
+- JSON schema definitions for navigator output
+- Utility functions for schema flattening, conversion and formatting
+
+## Examples
+
+The `examples/` directory contains runnable examples that demonstrate the package's functionality:
+
+1. **flatten.ts** - Demonstrates how to flatten a JSON schema by dereferencing all `$ref` fields
+2. **convert.ts** - Shows how to convert an OpenAI-compatible schema to Gemini format
+
+To run these examples:
+
+```bash
+# Run the schema flattening example
+pnpm --filter @extension/schema-utils example:flatten
+
+# Run the schema conversion example
+pnpm --filter @extension/schema-utils example:convert
+```

+ 15 - 0
packages/schema-utils/build.mjs

@@ -0,0 +1,15 @@
+import esbuild from 'esbuild';
+
+/**
+ * esbuild options for this package: every entry point is transpiled to its own
+ * file under ./dist (bundling is disabled) and source maps are emitted.
+ *
+ * @type { import('esbuild').BuildOptions }
+ */
+const buildOptions = {
+  entryPoints: ['./index.ts', './lib/**/*.ts', './lib/**/*.tsx', './examples/**/*.ts'],
+  tsconfig: './tsconfig.json',
+  bundle: false,
+  target: 'es6',
+  outdir: './dist',
+  sourcemap: true,
+};
+
+await esbuild.build(buildOptions); 

+ 9 - 0
packages/schema-utils/examples/convert.ts

@@ -0,0 +1,9 @@
+import { convertOpenAISchemaToGemini } from '../lib/helper.js';
+import { jsonNavigatorOutputSchema } from '../lib/json_schema.js';
+
+// Convert the navigator output schema to the Gemini format
+console.log('Converting jsonNavigatorOutputSchema to Gemini format...');
+const geminiSchema = convertOpenAISchemaToGemini(jsonNavigatorOutputSchema);
+
+// Pretty-print the converted schema as JSON with 2-space indentation
+console.log(JSON.stringify(geminiSchema, null, 2));

+ 28 - 0
packages/schema-utils/examples/flatten.ts

@@ -0,0 +1,28 @@
+import { dereferenceJsonSchema } from '../lib/helper.js';
+import { jsonNavigatorOutputSchema } from '../lib/json_schema.js';
+
+/**
+ * This example demonstrates how to flatten the jsonNavigatorOutputSchema
+ * by dereferencing all $ref fields and removing the $defs section.
+ */
+
+// Flatten the schema by dereferencing all references
+console.log('Flattening jsonNavigatorOutputSchema...');
+const flattenedSchema = dereferenceJsonSchema(jsonNavigatorOutputSchema);
+
+// Pretty print the flattened schema
+console.log('Flattened Schema:');
+console.log(JSON.stringify(flattenedSchema, null, 2));
+
+// Compare the lengths of the two serialized schemas (JSON string length, printed as "bytes")
+const originalSize = JSON.stringify(jsonNavigatorOutputSchema).length;
+const flattenedSize = JSON.stringify(flattenedSchema).length;
+
+console.log('\nSize comparison:');
+console.log(`Original schema size: ${originalSize} bytes`);
+console.log(`Flattened schema size: ${flattenedSize} bytes`);
+console.log(
+  `Difference: ${flattenedSize - originalSize} bytes (${((flattenedSize / originalSize) * 100).toFixed(2)}% of original)`,
+);
+
+// Note: The flattened schema is typically larger because references are replaced with their full definitions

+ 4 - 0
packages/schema-utils/index.ts

@@ -0,0 +1,4 @@
+export * from './lib/json_schema';
+export * from './lib/json_gemini';
+export * from './lib/helpers';
+export * from './lib/helper';

+ 242 - 0
packages/schema-utils/lib/helper.ts

@@ -0,0 +1,242 @@
+/**
+ * Type definition for a JSON Schema object
+ *
+ * Frequently used keywords are declared explicitly; any other keyword is still
+ * accepted through the index signature and is typed as `unknown`.
+ */
+export interface JsonSchemaObject {
+  $ref?: string;
+  $defs?: Record<string, JsonSchemaObject>;
+  type?: string;
+  properties?: Record<string, JsonSchemaObject>;
+  items?: JsonSchemaObject;
+  anyOf?: JsonSchemaObject[];
+  title?: string;
+  description?: string;
+  required?: string[];
+  default?: unknown;
+  additionalProperties?: boolean;
+  [key: string]: unknown;
+}
+
+/**
+ * Dereferences all $ref fields in a JSON schema by replacing them with the actual referenced schema
+ *
+ * The input is deep-copied through a JSON round trip before processing, so the caller's
+ * object is not modified. References are looked up in the top-level `$defs` map, and the
+ * `$defs` key itself is left out of the returned schema.
+ *
+ * @param schema The JSON schema to dereference
+ * @returns A new JSON schema with all references resolved
+ */
+export function dereferenceJsonSchema(schema: JsonSchemaObject): JsonSchemaObject {
+  // Create a deep copy of the schema to avoid modifying the original
+  const clonedSchema = JSON.parse(JSON.stringify(schema));
+
+  // Extract definitions to use for resolving references
+  const definitions = clonedSchema.$defs || {};
+
+  // Process the schema
+  const result = processSchemaNode(clonedSchema, definitions);
+
+  // Create a new object without $defs
+  const resultWithoutDefs: JsonSchemaObject = {};
+
+  // Copy all properties except $defs
+  for (const [key, value] of Object.entries(result)) {
+    if (key !== '$defs') {
+      resultWithoutDefs[key] = value;
+    }
+  }
+
+  return resultWithoutDefs;
+}
+
+/**
+ * Process a schema node, resolving all references
+ *
+ * A `$ref` node is replaced by its recursively processed definition, and an `anyOf`
+ * made of exactly one non-null schema plus `{ type: 'null' }` is collapsed into that
+ * schema with `nullable: true`. Annotations written on the referencing node itself
+ * (`title`, `description`, `default`) are kept and take precedence over the ones of
+ * the schema it is replaced with, so per-property descriptions are not lost.
+ *
+ * @param node The schema node to process
+ * @param definitions The `$defs` map used to look up `#/$defs/<name>` references
+ * @param resolving Names of the definitions currently being expanded on this path (cycle detection)
+ * @returns The processed node
+ * @throws Error when a definition references itself, directly or indirectly
+ */
+function processSchemaNode(
+  node: JsonSchemaObject,
+  definitions: Record<string, JsonSchemaObject>,
+  resolving: ReadonlySet<string> = new Set<string>(),
+): JsonSchemaObject {
+  // If it's not an object or is null, return as is
+  if (typeof node !== 'object' || node === null) {
+    return node;
+  }
+
+  // Annotations declared on this node; they must survive when the node is replaced
+  // by a resolved definition or by the collapsed member of a nullable anyOf
+  const ownAnnotations: JsonSchemaObject = {};
+  for (const key of ['title', 'description', 'default']) {
+    if (node[key] !== undefined) {
+      ownAnnotations[key] = node[key];
+    }
+  }
+
+  // If it's a reference, resolve it
+  if (node.$ref) {
+    const refPath = node.$ref.replace('#/$defs/', '');
+    const definition = definitions[refPath];
+    if (definition) {
+      // A definition that (indirectly) references itself cannot be inlined;
+      // fail with a clear message instead of overflowing the call stack
+      if (resolving.has(refPath)) {
+        throw new Error(`Circular $ref detected while dereferencing schema: ${node.$ref}`);
+      }
+      // Process the definition to resolve any nested references
+      const resolved = processSchemaNode(definition, definitions, new Set(resolving).add(refPath));
+      return { ...resolved, ...ownAnnotations };
+    }
+  }
+
+  // Handle anyOf for references
+  if (node.anyOf) {
+    // Process each item in anyOf
+    const processedAnyOf = node.anyOf.map(item => processSchemaNode(item, definitions, resolving));
+
+    // If anyOf contains exactly one non-null schema and a null type, merge them
+    const nonNullTypes = processedAnyOf.filter(item => item.type !== 'null');
+    const hasNullType = processedAnyOf.some(item => item.type === 'null');
+
+    if (nonNullTypes.length === 1 && hasNullType) {
+      return { ...nonNullTypes[0], ...ownAnnotations, nullable: true };
+    }
+
+    // Otherwise, keep the anyOf structure but with processed items
+    return {
+      ...node,
+      anyOf: processedAnyOf,
+    };
+  }
+
+  // Create a new node with processed properties
+  const result: JsonSchemaObject = {};
+
+  // Copy all properties except $ref
+  for (const [key, value] of Object.entries(node)) {
+    if (key === '$ref') {
+      continue;
+    }
+    if (key === 'properties' && typeof value === 'object' && value !== null) {
+      // Process properties
+      result.properties = {};
+      for (const [propKey, propValue] of Object.entries(value)) {
+        result.properties[propKey] = processSchemaNode(propValue as JsonSchemaObject, definitions, resolving);
+      }
+    } else if (key === 'items' && typeof value === 'object' && value !== null) {
+      // Process items for arrays
+      result.items = processSchemaNode(value as JsonSchemaObject, definitions, resolving);
+    } else {
+      // Copy other properties as is
+      result[key] = value;
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Converts an OpenAI format JSON schema to a Google Gemini compatible schema
+ *
+ * Key differences handled:
+ * 1. OpenAI uses $defs and $ref for references, Gemini uses inline definitions
+ * 2. Different structure for nullable properties
+ * 3. Gemini has a flatter structure for defining properties
+ *
+ * Only `type`, `required` (defaulting to an empty array) and the converted `properties`
+ * of the top-level schema are carried over; other top-level keywords are not copied.
+ *
+ * @param openaiSchema The OpenAI format JSON schema to convert
+ * @returns A Google Gemini compatible JSON schema
+ */
+export function convertOpenAISchemaToGemini(openaiSchema: JsonSchemaObject): JsonSchemaObject {
+  // Create a new schema object
+  const geminiSchema: JsonSchemaObject = {
+    type: openaiSchema.type,
+    properties: {},
+    required: openaiSchema.required || [],
+  };
+
+  // Process definitions to use for resolving references
+  const definitions = openaiSchema.$defs || {};
+
+  // Process properties
+  if (openaiSchema.properties) {
+    geminiSchema.properties = processProperties(openaiSchema.properties, definitions);
+  }
+
+  return geminiSchema;
+}
+
+/**
+ * Process properties recursively, resolving references and converting to Gemini format
+ *
+ * @param properties Map of property name to schema; entries that are not objects are skipped
+ * @param definitions The `$defs` map used to resolve `#/$defs/<name>` references
+ * @param resolving Names of the definitions currently being expanded on this path (cycle detection)
+ * @returns A new map with every property converted
+ */
+function processProperties(
+  properties: Record<string, JsonSchemaObject>,
+  definitions: Record<string, JsonSchemaObject>,
+  resolving: ReadonlySet<string> = new Set<string>(),
+): Record<string, JsonSchemaObject> {
+  const result: Record<string, JsonSchemaObject> = {};
+
+  for (const [key, value] of Object.entries(properties)) {
+    if (typeof value !== 'object' || value === null) continue;
+
+    result[key] = processProperty(value, definitions, resolving);
+  }
+
+  return result;
+}
+
+/**
+ * Resolve a `#/$defs/<name>` reference and convert the referenced definition
+ *
+ * @returns The converted definition, or undefined when the name is not in `definitions`
+ * @throws Error when the definition is already being expanded on the current path
+ */
+function resolveReferencedProperty(
+  ref: string,
+  definitions: Record<string, JsonSchemaObject>,
+  resolving: ReadonlySet<string>,
+): JsonSchemaObject | undefined {
+  const refPath = ref.replace('#/$defs/', '');
+  const definition = definitions[refPath];
+  if (!definition) {
+    return undefined;
+  }
+  // A self-referencing definition cannot be inlined; fail with a clear message
+  // instead of overflowing the call stack
+  if (resolving.has(refPath)) {
+    throw new Error(`Circular $ref detected while converting schema: ${ref}`);
+  }
+  return processProperty(definition, definitions, new Set(resolving).add(refPath));
+}
+
+/**
+ * Process a single property, resolving references and converting to Gemini format
+ *
+ * A `description` written on the property itself (next to its `$ref` or `anyOf`) is kept
+ * and takes precedence over the description of the schema it resolves to, so that
+ * per-property descriptions are not replaced by the generic text of a shared definition.
+ *
+ * @param property The property schema to convert
+ * @param definitions The `$defs` map used to resolve `#/$defs/<name>` references
+ * @param resolving Names of the definitions currently being expanded on this path (cycle detection)
+ * @returns The converted property
+ * @throws Error when a definition references itself, directly or indirectly
+ */
+function processProperty(
+  property: JsonSchemaObject,
+  definitions: Record<string, JsonSchemaObject>,
+  resolving: ReadonlySet<string> = new Set<string>(),
+): JsonSchemaObject {
+  // If it's a reference, resolve it
+  if (property.$ref) {
+    const resolved = resolveReferencedProperty(property.$ref, definitions, resolving);
+    if (resolved) {
+      if (property.description) {
+        resolved.description = property.description;
+      }
+      return resolved;
+    }
+  }
+
+  // Handle anyOf for nullable properties
+  if (property.anyOf) {
+    const nonNullType = property.anyOf.find(item => item.type !== 'null' && !item.$ref);
+
+    const refType = property.anyOf.find(item => item.$ref);
+
+    const isNullable = property.anyOf.some(item => item.type === 'null');
+
+    // Prefer the referenced definition; fall back to the inline non-null member
+    let processed: JsonSchemaObject | undefined;
+    if (refType?.$ref) {
+      processed = resolveReferencedProperty(refType.$ref, definitions, resolving);
+    }
+    if (!processed && nonNullType) {
+      processed = processProperty(nonNullType, definitions, resolving);
+    }
+
+    if (processed) {
+      // The description on the anyOf wrapper describes this particular property
+      if (property.description) {
+        processed.description = property.description;
+      }
+      if (isNullable) {
+        processed.nullable = true;
+      }
+      return processed;
+    }
+  }
+
+  // Handle special case for NoParamsAction which is an object in OpenAI but a string in Gemini
+  if (property.additionalProperties === true && property.title === 'NoParamsAction' && property.description) {
+    return {
+      type: 'string',
+      nullable: true,
+      description: property.description,
+    };
+  }
+
+  // Create a new property object
+  const result: JsonSchemaObject = {
+    type: property.type,
+  };
+
+  // Copy description if it exists
+  if (property.description) {
+    result.description = property.description;
+  }
+
+  // Process nested properties
+  if (property.properties) {
+    result.properties = processProperties(property.properties, definitions, resolving);
+
+    // Copy required fields, defaulting to an empty list
+    result.required = property.required ? property.required : [];
+  }
+
+  // Handle arrays
+  if (property.items) {
+    result.items = processProperty(property.items, definitions, resolving);
+  }
+
+  return result;
+}

+ 432 - 0
packages/schema-utils/lib/json_schema.ts

@@ -0,0 +1,432 @@
+// JSON schema exported from browser-use, with `page_id` renamed to `tab_id`.
+// TODO: zod does not generate an equivalent schema; find out why and generate this from zod instead
+export const jsonNavigatorOutputSchema = {
+  $defs: {
+    ActionModel: {
+      properties: {
+        done: {
+          anyOf: [
+            {
+              $ref: '#/$defs/DoneAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Complete task',
+        },
+        search_google: {
+          anyOf: [
+            {
+              $ref: '#/$defs/SearchGoogleAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description:
+            'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
+        },
+        go_to_url: {
+          anyOf: [
+            {
+              $ref: '#/$defs/GoToUrlAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Navigate to URL in the current tab',
+        },
+        go_back: {
+          anyOf: [
+            {
+              $ref: '#/$defs/NoParamsAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Go back',
+        },
+        click_element: {
+          anyOf: [
+            {
+              $ref: '#/$defs/ClickElementAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Click element',
+        },
+        input_text: {
+          anyOf: [
+            {
+              $ref: '#/$defs/InputTextAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Input text into a input interactive element',
+        },
+        switch_tab: {
+          anyOf: [
+            {
+              $ref: '#/$defs/SwitchTabAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Switch tab',
+        },
+        open_tab: {
+          anyOf: [
+            {
+              $ref: '#/$defs/OpenTabAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Open url in new tab',
+        },
+        cache_content: {
+          anyOf: [
+            {
+              $ref: '#/$defs/cache_content_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Cache what you have found so far from the current page so that it can be used in future steps',
+        },
+        scroll_down: {
+          anyOf: [
+            {
+              $ref: '#/$defs/ScrollAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
+        },
+        scroll_up: {
+          anyOf: [
+            {
+              $ref: '#/$defs/ScrollAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
+        },
+        send_keys: {
+          anyOf: [
+            {
+              $ref: '#/$defs/SendKeysAction',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description:
+            'Send strings of special keys like Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. Be aware of different operating systems and their shortcuts',
+        },
+        scroll_to_text: {
+          anyOf: [
+            {
+              $ref: '#/$defs/scroll_to_text_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'If you dont find something which you want to interact with, scroll to it',
+        },
+        get_dropdown_options: {
+          anyOf: [
+            {
+              $ref: '#/$defs/get_dropdown_options_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description: 'Get all options from a native dropdown',
+        },
+        select_dropdown_option: {
+          anyOf: [
+            {
+              $ref: '#/$defs/select_dropdown_option_parameters',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          description:
+            'Select dropdown option for interactive element index by the text of the option you want to select',
+        },
+      },
+      title: 'ActionModel',
+      type: 'object',
+    },
+    AgentBrain: {
+      description: 'Current state of the agent',
+      properties: {
+        page_summary: {
+          title: 'Page Summary',
+          type: 'string',
+        },
+        evaluation_previous_goal: {
+          title: 'Evaluation Previous Goal',
+          type: 'string',
+        },
+        memory: {
+          title: 'Memory',
+          type: 'string',
+        },
+        next_goal: {
+          title: 'Next Goal',
+          type: 'string',
+        },
+      },
+      required: ['page_summary', 'evaluation_previous_goal', 'memory', 'next_goal'],
+      title: 'AgentBrain',
+      type: 'object',
+    },
+    ClickElementAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+        xpath: {
+          anyOf: [
+            {
+              type: 'string',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          title: 'Xpath',
+        },
+      },
+      required: ['desc', 'index'],
+      title: 'ClickElementAction',
+      type: 'object',
+    },
+    DoneAction: {
+      properties: {
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+      },
+      required: ['text'],
+      title: 'DoneAction',
+      type: 'object',
+    },
+    GoToUrlAction: {
+      properties: {
+        url: {
+          title: 'Url',
+          type: 'string',
+        },
+      },
+      required: ['url'],
+      title: 'GoToUrlAction',
+      type: 'object',
+    },
+    InputTextAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+        xpath: {
+          anyOf: [
+            {
+              type: 'string',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          title: 'Xpath',
+        },
+      },
+      required: ['desc', 'index', 'text'],
+      title: 'InputTextAction',
+      type: 'object',
+    },
+    NoParamsAction: {
+      additionalProperties: true,
+      description:
+        'Accepts absolutely anything in the incoming data\nand discards it, so the final parsed model is empty.',
+      properties: {},
+      title: 'NoParamsAction',
+      type: 'object',
+    },
+    OpenTabAction: {
+      properties: {
+        url: {
+          title: 'Url',
+          type: 'string',
+        },
+      },
+      required: ['url'],
+      title: 'OpenTabAction',
+      type: 'object',
+    },
+    ScrollAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        amount: {
+          anyOf: [
+            {
+              type: 'integer',
+            },
+            {
+              type: 'null',
+            },
+          ],
+          title: 'Amount',
+        },
+      },
+      required: ['desc'],
+      title: 'ScrollAction',
+      type: 'object',
+    },
+    SearchGoogleAction: {
+      properties: {
+        query: {
+          title: 'Query',
+          type: 'string',
+        },
+      },
+      required: ['query'],
+      title: 'SearchGoogleAction',
+      type: 'object',
+    },
+    SendKeysAction: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        keys: {
+          title: 'Keys',
+          type: 'string',
+        },
+      },
+      required: ['desc', 'keys'],
+      title: 'SendKeysAction',
+      type: 'object',
+    },
+    SwitchTabAction: {
+      properties: {
+        tab_id: {
+          title: 'Page Id',
+          type: 'integer',
+        },
+      },
+      required: ['tab_id'],
+      title: 'SwitchTabAction',
+      type: 'object',
+    },
+    cache_content_parameters: {
+      properties: {
+        content: {
+          title: 'Content',
+          type: 'string',
+        },
+      },
+      required: ['content'],
+      title: 'cache_content_parameters',
+      type: 'object',
+    },
+    get_dropdown_options_parameters: {
+      properties: {
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+      },
+      required: ['index'],
+      title: 'get_dropdown_options_parameters',
+      type: 'object',
+    },
+    scroll_to_text_parameters: {
+      properties: {
+        desc: {
+          title: 'Intent',
+          type: 'string',
+          description: 'Very short explanation of the intent or purpose for calling this action',
+        },
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+      },
+      required: ['desc', 'text'],
+      title: 'scroll_to_text_parameters',
+      type: 'object',
+    },
+    select_dropdown_option_parameters: {
+      properties: {
+        index: {
+          title: 'Index',
+          type: 'integer',
+        },
+        text: {
+          title: 'Text',
+          type: 'string',
+        },
+      },
+      required: ['index', 'text'],
+      title: 'select_dropdown_option_parameters',
+      type: 'object',
+    },
+  },
+  properties: {
+    current_state: {
+      $ref: '#/$defs/AgentBrain',
+    },
+    action: {
+      items: {
+        $ref: '#/$defs/ActionModel',
+      },
+      title: 'Action',
+      type: 'array',
+    },
+  },
+  required: ['current_state', 'action'],
+  title: 'AgentOutput',
+  type: 'object',
+};

+ 29 - 0
packages/schema-utils/package.json

@@ -0,0 +1,29 @@
+{
+  "name": "@extension/schema-utils",
+  "version": "0.1.0",
+  "description": "JSON schema and related helpers for tools",
+  "private": true,
+  "type": "module",
+  "sideEffects": false,
+  "files": [
+    "dist/**"
+  ],
+  "types": "index.ts",
+  "main": "./dist/index.js",
+  "scripts": {
+    "clean:bundle": "rimraf dist",
+    "clean:node_modules": "pnpx rimraf node_modules",
+    "clean:turbo": "rimraf .turbo",
+    "clean": "pnpm clean:bundle && pnpm clean:node_modules && pnpm clean:turbo",
+    "ready": "node build.mjs",
+    "lint": "eslint . --ext .ts,.tsx",
+    "lint:fix": "pnpm lint --fix",
+    "prettier": "prettier . --write --ignore-path ../../.prettierignore",
+    "type-check": "tsc --noEmit",
+    "example:convert": "pnpm run ready && node dist/examples/convert.js",
+    "example:flatten": "pnpm run ready && node dist/examples/flatten.js"
+  },
+  "devDependencies": {
+    "@extension/tsconfig": "workspace:*"
+  }
+}

+ 174 - 0
packages/schema-utils/tests/convert.test.ts

@@ -0,0 +1,174 @@
+import { convertOpenAISchemaToGemini } from '../lib/helper';
+// JsonSchemaObject is declared in lib/helper; lib/json_schema only exports the schema constant
+import type { JsonSchemaObject } from '../lib/helper';
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+
+// Minimal test runner (no Jest or Mocha installed): prints the suite name, then runs its body synchronously
+function describe(name: string, fn: () => void) {
+  console.log(`\n--- ${name} ---`);
+  fn();
+}
+
+// Runs a single test case and reports PASSED/FAILED on the console.
+// A failure does not stop the remaining tests, but it is recorded in the process
+// exit code so that callers (e.g. run-test.js) can detect it.
+function it(name: string, fn: () => void) {
+  console.log(`\n  Test: ${name}`);
+  try {
+    fn();
+    console.log('  ✅ PASSED');
+  } catch (error) {
+    console.error('  ❌ FAILED:', error);
+    // Without this the process would still exit with code 0 after a failed test
+    process.exitCode = 1;
+  }
+}
+
+// Minimal stand-in for an assertion library's expect(); supports only toEqual and toBeTruthy
+function expect(actual: unknown) {
+  return {
+    // Compares the JSON.stringify output of both values, so key order matters
+    toEqual: (expected: unknown) => {
+      const actualStr = JSON.stringify(actual);
+      const expectedStr = JSON.stringify(expected);
+      if (actualStr !== expectedStr) {
+        throw new Error(`Expected ${expectedStr} but got ${actualStr}`);
+      }
+    },
+    toBeTruthy: () => {
+      if (!actual) {
+        throw new Error(`Expected truthy value but got ${actual}`);
+      }
+    },
+  };
+}
+
+describe('convertOpenAISchemaToGemini', () => {
+  it('should convert OpenAI schema to Gemini format', () => {
+    // Sample OpenAI schema with references and nullable properties
+    const openaiSchema: JsonSchemaObject = {
+      type: 'object',
+      properties: {
+        name: {
+          type: 'string',
+          description: 'The name of the user',
+        },
+        age: {
+          type: 'number',
+          description: 'The age of the user',
+        },
+        address: {
+          $ref: '#/$defs/Address',
+        },
+        email: {
+          anyOf: [{ type: 'string', description: 'Email address' }, { type: 'null' }],
+        },
+        tags: {
+          type: 'array',
+          items: {
+            type: 'string',
+          },
+        },
+        profile: {
+          $ref: '#/$defs/Profile',
+        },
+      },
+      required: ['name', 'age'],
+      $defs: {
+        Address: {
+          type: 'object',
+          properties: {
+            street: { type: 'string' },
+            city: { type: 'string' },
+            zipCode: { type: 'string' },
+          },
+          required: ['street', 'city'],
+        },
+        Profile: {
+          type: 'object',
+          properties: {
+            bio: { type: 'string' },
+            website: {
+              anyOf: [{ type: 'string' }, { type: 'null' }],
+            },
+          },
+        },
+      },
+    };
+
+    // Convert the schema
+    const geminiSchema = convertOpenAISchemaToGemini(openaiSchema);
+
+    // Expected Gemini schema
+    const expectedGeminiSchema: JsonSchemaObject = {
+      type: 'object',
+      properties: {
+        name: {
+          type: 'string',
+          description: 'The name of the user',
+        },
+        age: {
+          type: 'number',
+          description: 'The age of the user',
+        },
+        address: {
+          type: 'object',
+          properties: {
+            street: { type: 'string' },
+            city: { type: 'string' },
+            zipCode: { type: 'string' },
+          },
+          required: ['street', 'city'],
+        },
+        email: {
+          type: 'string',
+          description: 'Email address',
+          nullable: true,
+        },
+        tags: {
+          type: 'array',
+          items: {
+            type: 'string',
+          },
+        },
+        profile: {
+          type: 'object',
+          properties: {
+            bio: { type: 'string' },
+            website: {
+              type: 'string',
+              nullable: true,
+            },
+          },
+          required: [],
+        },
+      },
+      required: ['name', 'age'],
+    };
+
+    // Verify the conversion
+    expect(geminiSchema).toEqual(expectedGeminiSchema);
+
+    // Write the schemas to files for manual inspection
+    const testDir = path.join(__dirname, 'output');
+    if (!fs.existsSync(testDir)) {
+      fs.mkdirSync(testDir, { recursive: true });
+    }
+
+    fs.writeFileSync(path.join(testDir, 'openai.json'), JSON.stringify(openaiSchema, null, 2));
+
+    fs.writeFileSync(path.join(testDir, 'gemini.json'), JSON.stringify(geminiSchema, null, 2));
+  });
+
+  it('should convert the actual json_schema.ts to gemini.json', () => {
+    // Import the actual schema from json_schema.ts
+    // eslint-disable-next-line @typescript-eslint/no-var-requires
+    const { jsonNavigatorOutputSchema } = require('../lib/json_schema');
+
+    // Convert the schema
+    const geminiSchema = convertOpenAISchemaToGemini(jsonNavigatorOutputSchema);
+
+    // Write the converted schema to a file
+    const outputDir = path.join(__dirname, '../');
+    fs.writeFileSync(path.join(outputDir, 'gemini.json'), JSON.stringify(geminiSchema, null, 2));
+
+    // Verify the conversion was successful
+    expect(geminiSchema).toBeTruthy();
+    expect(geminiSchema.properties).toBeTruthy();
+  });
+});

+ 19 - 0
packages/schema-utils/tests/run-test.js

@@ -0,0 +1,19 @@
+#!/usr/bin/env node
+
+// Simple script to run our TypeScript test file
+const { execSync } = require('node:child_process');
+const path = require('node:path');
+
+try {
+  // Compile the test file with ts-node
+  console.log('Running convert.test.ts...');
+  execSync('npx ts-node tests/convert.test.ts', {
+    cwd: path.resolve(__dirname, '..'),
+    stdio: 'inherit',
+  });
+
+  console.log('\nTest completed successfully!');
+} catch (error) {
+  console.error('\nTest failed:', error.message);
+  process.exit(1);
+}

+ 5 - 0
packages/schema-utils/tsconfig.json

@@ -0,0 +1,5 @@
+{
+  "extends": "@extension/tsconfig/base.json",
+  "include": ["./**/*.ts", "./**/*.tsx"],
+  "exclude": ["node_modules", "dist"]
+}

+ 7 - 4
packages/storage/lib/settings/types.ts

@@ -9,14 +9,17 @@ export enum LLMProviderEnum {
   OpenAI = 'openai',
   Anthropic = 'anthropic',
   Gemini = 'gemini',
-  Deepseek = 'deepseek',
 }
 
 export const llmProviderModelNames = {
   [LLMProviderEnum.OpenAI]: ['gpt-4o', 'gpt-4o-mini', 'o1', 'o1-mini', 'o3-mini'],
-  [LLMProviderEnum.Anthropic]: ['claude-3-5-sonnet-latest', 'claude-3-5-haiku-latest'],
-  [LLMProviderEnum.Gemini]: ['gemini-2.0-flash', 'gemini-2.0-flash-thinking-exp-01-21', 'gemini-2.0-pro-exp-02-05'],
-  [LLMProviderEnum.Deepseek]: ['deepseek-v3', 'deepseek-reasoner'],
+  [LLMProviderEnum.Anthropic]: ['claude-3-7-sonnet-latest', 'claude-3-5-haiku-latest'],
+  [LLMProviderEnum.Gemini]: [
+    'gemini-2.0-flash',
+    'gemini-2.0-flash-lite',
+    'gemini-2.0-pro-exp-02-05',
+    // 'gemini-2.0-flash-thinking-exp-01-21', // TODO: not support function calling for now
+  ],
 };
 
 /**

+ 1 - 1
pages/options/src/Options.tsx

@@ -5,7 +5,7 @@ import { withErrorBoundary, withSuspense } from '@extension/shared';
 import { GeneralSettings } from './components/GeneralSettings';
 import { ModelSettings } from './components/ModelSettings';
 const Options = () => {
-  const [activeTab, setActiveTab] = useState('general');
+  const [activeTab, setActiveTab] = useState('models');
 
   const renderTabContent = () => {
     switch (activeTab) {

+ 24 - 10
pages/options/src/components/ModelSettings.tsx

@@ -276,17 +276,31 @@ export const ModelSettings = () => {
                 onChange={e => handleApiKeyChange(LLMProviderEnum.Anthropic, e.target.value)}
                 className="w-full p-2 rounded-md bg-gray-50 border border-gray-200 focus:border-blue-400 focus:ring-2 focus:ring-blue-200 outline-none"
               />
-              <input
-                type="text"
-                placeholder="Custom Base URL (Optional)"
-                value={apiKeys[LLMProviderEnum.Anthropic]?.baseUrl || ''}
-                onChange={e =>
-                  handleApiKeyChange(
-                    LLMProviderEnum.Anthropic,
-                    apiKeys[LLMProviderEnum.Anthropic]?.apiKey || '',
-                    e.target.value,
-                  )
+            </div>
+          </div>
+
+          <div className="border-t border-gray-200" />
+
+          {/* Gemini Section */}
+          <div className="space-y-4">
+            <div className="flex items-center justify-between">
+              <h3 className="text-lg font-medium text-gray-700">Gemini</h3>
+              <Button
+                {...getButtonProps(LLMProviderEnum.Gemini)}
+                size="sm"
+                onClick={() =>
+                  apiKeys[LLMProviderEnum.Gemini]?.apiKey && !modifiedProviders.has(LLMProviderEnum.Gemini)
+                    ? handleDelete(LLMProviderEnum.Gemini)
+                    : handleSave(LLMProviderEnum.Gemini)
                 }
+              />
+            </div>
+            <div className="space-y-3">
+              <input
+                type="password"
+                placeholder="Gemini API key"
+                value={apiKeys[LLMProviderEnum.Gemini]?.apiKey || ''}
+                onChange={e => handleApiKeyChange(LLMProviderEnum.Gemini, e.target.value)}
                 className="w-full p-2 rounded-md bg-gray-50 border border-gray-200 focus:border-blue-400 focus:ring-2 focus:ring-blue-200 outline-none"
               />
             </div>

+ 29 - 0
pnpm-lock.yaml

@@ -117,6 +117,9 @@ importers:
       '@langchain/core':
         specifier: ^0.3.37
         version: 0.3.37(openai@4.82.0(ws@8.18.0)(zod@3.24.1))
+      '@langchain/google-genai':
+        specifier: 0.1.10
+        version: 0.1.10(@langchain/core@0.3.37(openai@4.82.0(ws@8.18.0)(zod@3.24.1)))(zod@3.24.1)
       '@langchain/openai':
         specifier: ^0.4.2
         version: 0.4.2(@langchain/core@0.3.37(openai@4.82.0(ws@8.18.0)(zod@3.24.1)))(ws@8.18.0)
@@ -206,6 +209,12 @@ importers:
         specifier: workspace:*
         version: link:../tsconfig
 
+  packages/schema-utils:
+    devDependencies:
+      '@extension/tsconfig':
+        specifier: workspace:*
+        version: link:../tsconfig
+
   packages/shared:
     devDependencies:
       '@extension/storage':
@@ -680,6 +689,10 @@ packages:
     resolution: {integrity: sha512-Ys+3g2TaW7gADOJzPt83SJtCDhMjndcDMFVQ/Tj9iA1BfJzFKD9mAUXT3OenpuPHbI6P/myECxRJrofUsDx/5g==}
     engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
 
+  '@google/generative-ai@0.21.0':
+    resolution: {integrity: sha512-7XhUbtnlkSEZK15kN3t+tzIMxsbKm/dSkKBFalj+20NvPKe1kBY7mR2P7vuijEn+f06z5+A8bVGKO0v39cr6Wg==}
+    engines: {node: '>=18.0.0'}
+
   '@humanwhocodes/config-array@0.11.14':
     resolution: {integrity: sha512-3T8LkOmg45BV5FICb15QQMsyUSWrQ8AygVfC7ZG32zOalnqrilm018ZVCw0eapXux8FtA33q8PSRSstjee3jSg==}
     engines: {node: '>=10.10.0'}
@@ -731,6 +744,12 @@ packages:
     resolution: {integrity: sha512-LFk9GqHxcyCFx0oXvCBP7vDZIOUHYzzNU7JR+2ofIMnfkBLzcCKzBLySQDfPtd13PrpGHkaeOeLq8H1Tqi9lSw==}
     engines: {node: '>=18'}
 
+  '@langchain/google-genai@0.1.10':
+    resolution: {integrity: sha512-+0xFWvauNDNp8Nvhy5F5g8RbB5g4WWQSIxoPI4IQIUICBBT/kS/Omf1VJI6Loc0IH93m9ZSwYxRVCRu3qx51TQ==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      '@langchain/core': '>=0.3.17 <0.4.0'
+
   '@langchain/openai@0.4.2':
     resolution: {integrity: sha512-Cuj7qbVcycALTP0aqZuPpEc7As8cwiGaU21MhXRyZFs+dnWxKYxZ1Q1z4kcx6cYkq/I+CNwwmk+sP+YruU73Aw==}
     engines: {node: '>=18'}
@@ -3555,6 +3574,8 @@ snapshots:
 
   '@eslint/js@8.57.0': {}
 
+  '@google/generative-ai@0.21.0': {}
+
   '@humanwhocodes/config-array@0.11.14':
     dependencies:
       '@humanwhocodes/object-schema': 2.0.3
@@ -3630,6 +3651,14 @@ snapshots:
     transitivePeerDependencies:
       - openai
 
+  '@langchain/google-genai@0.1.10(@langchain/core@0.3.37(openai@4.82.0(ws@8.18.0)(zod@3.24.1)))(zod@3.24.1)':
+    dependencies:
+      '@google/generative-ai': 0.21.0
+      '@langchain/core': 0.3.37(openai@4.82.0(ws@8.18.0)(zod@3.24.1))
+      zod-to-json-schema: 3.24.1(zod@3.24.1)
+    transitivePeerDependencies:
+      - zod
+
   '@langchain/openai@0.4.2(@langchain/core@0.3.37(openai@4.82.0(ws@8.18.0)(zod@3.24.1)))(ws@8.18.0)':
     dependencies:
       '@langchain/core': 0.3.37(openai@4.82.0(ws@8.18.0)(zod@3.24.1))