Bläddra i källkod

Fix missing descriptions of actions in generated flattened schemas

alexchenzl 3 månader sedan
förälder
incheckning
2912cea937

+ 37 - 18
chrome-extension/src/background/agent/actions/json_gemini.ts

@@ -25,6 +25,8 @@ export const geminiNavigatorOutputSchema = {
         properties: {
           done: {
             type: 'object',
+            description: 'Complete task',
+            nullable: true,
             properties: {
               text: {
                 type: 'string',
@@ -34,10 +36,12 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['text', 'success'],
-            nullable: true,
           },
           search_google: {
             type: 'object',
+            description:
+              'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -48,10 +52,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'query'],
-            nullable: true,
           },
           go_to_url: {
             type: 'object',
+            description: 'Navigate to URL in the current tab',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -62,10 +67,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'url'],
-            nullable: true,
           },
           go_back: {
             type: 'object',
+            description: 'Go back to previous page',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -73,10 +79,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent'],
-            nullable: true,
           },
           wait: {
             type: 'object',
+            description: 'Wait for x seconds default 3',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -87,10 +94,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'seconds'],
-            nullable: true,
           },
           click_element: {
             type: 'object',
+            description: 'Click element by index',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -105,10 +113,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'index'],
-            nullable: true,
           },
           input_text: {
             type: 'object',
+            description: 'Input text into an interactive input element',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -126,10 +135,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'index', 'text'],
-            nullable: true,
           },
           switch_tab: {
             type: 'object',
+            description: 'Switch tab',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -140,10 +150,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'tab_id'],
-            nullable: true,
           },
           open_tab: {
             type: 'object',
+            description: 'Open url in new tab',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -154,10 +165,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'url'],
-            nullable: true,
           },
           close_tab: {
             type: 'object',
+            description: 'Close tab by tab_id',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -168,10 +180,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'tab_id'],
-            nullable: true,
           },
           cache_content: {
             type: 'object',
+            description: 'Cache what you have found so far from the current page for future use',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -182,10 +195,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'content'],
-            nullable: true,
           },
           scroll_down: {
             type: 'object',
+            description: 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -197,10 +211,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'amount'],
-            nullable: true,
           },
           scroll_up: {
             type: 'object',
+            description: 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -212,10 +227,12 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'amount'],
-            nullable: true,
           },
           send_keys: {
             type: 'object',
+            description:
+              'Send strings of special keys like Escape, Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press.',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -226,10 +243,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'keys'],
-            nullable: true,
           },
           scroll_to_text: {
             type: 'object',
+            description: 'If you dont find something which you want to interact with, scroll to it',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -240,10 +258,11 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'text'],
-            nullable: true,
           },
           get_dropdown_options: {
             type: 'object',
+            description: 'Get all options from a native dropdown',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -254,10 +273,12 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'index'],
-            nullable: true,
           },
           select_dropdown_option: {
             type: 'object',
+            description:
+              'Select dropdown option for interactive element index by the text of the option you want to select',
+            nullable: true,
             properties: {
               intent: {
                 type: 'string',
@@ -271,10 +292,8 @@ export const geminiNavigatorOutputSchema = {
               },
             },
             required: ['intent', 'index', 'text'],
-            nullable: true,
           },
         },
-        required: [],
       },
     },
   },

+ 24 - 0
chrome-extension/src/background/agent/actions/json_schema.ts

@@ -26,6 +26,7 @@ export const jsonNavigatorOutputSchema = {
       items: {
         properties: {
           done: {
+            description: 'Complete task',
             properties: {
               text: {
                 title: 'Text',
@@ -42,6 +43,8 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           search_google: {
+            description:
+              'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
             properties: {
               intent: {
                 title: 'Intent',
@@ -59,6 +62,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           go_to_url: {
+            description: 'Navigate to URL in the current tab',
             properties: {
               intent: {
                 title: 'Intent',
@@ -76,6 +80,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           go_back: {
+            description: 'Go back to previous page',
             properties: {
               intent: {
                 title: 'Intent',
@@ -89,6 +94,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           wait: {
+            description: 'Wait for x seconds default 3',
             properties: {
               intent: {
                 title: 'Intent',
@@ -107,6 +113,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           click_element: {
+            description: 'Click element by index',
             properties: {
               intent: {
                 title: 'Intent',
@@ -118,6 +125,7 @@ export const jsonNavigatorOutputSchema = {
                 type: 'integer',
               },
               xpath: {
+                title: 'Xpath',
                 type: 'string',
                 nullable: true,
               },
@@ -128,6 +136,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           input_text: {
+            description: 'Input text into an interactive input element',
             properties: {
               intent: {
                 title: 'Intent',
@@ -143,6 +152,7 @@ export const jsonNavigatorOutputSchema = {
                 type: 'string',
               },
               xpath: {
+                title: 'Xpath',
                 type: 'string',
                 nullable: true,
               },
@@ -153,6 +163,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           switch_tab: {
+            description: 'Switch tab',
             properties: {
               intent: {
                 title: 'Intent',
@@ -170,6 +181,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           open_tab: {
+            description: 'Open url in new tab',
             properties: {
               intent: {
                 title: 'Intent',
@@ -187,6 +199,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           close_tab: {
+            description: 'Close tab by tab_id',
             properties: {
               intent: {
                 title: 'Intent',
@@ -204,6 +217,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           cache_content: {
+            description: 'Cache what you have found so far from the current page for future use',
             properties: {
               intent: {
                 title: 'Intent',
@@ -221,6 +235,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           scroll_down: {
+            description: 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
             properties: {
               intent: {
                 title: 'Intent',
@@ -228,6 +243,7 @@ export const jsonNavigatorOutputSchema = {
                 description: 'purpose of this action',
               },
               amount: {
+                title: 'Amount',
                 type: 'integer',
                 nullable: true,
               },
@@ -238,6 +254,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           scroll_up: {
+            description: 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
             properties: {
               intent: {
                 title: 'Intent',
@@ -245,6 +262,7 @@ export const jsonNavigatorOutputSchema = {
                 description: 'purpose of this action',
               },
               amount: {
+                title: 'Amount',
                 type: 'integer',
                 nullable: true,
               },
@@ -255,6 +273,8 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           send_keys: {
+            description:
+              'Send strings of special keys like Escape, Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press.',
             properties: {
               intent: {
                 title: 'Intent',
@@ -272,6 +292,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           scroll_to_text: {
+            description: 'If you dont find something which you want to interact with, scroll to it',
             properties: {
               intent: {
                 title: 'Intent',
@@ -289,6 +310,7 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           get_dropdown_options: {
+            description: 'Get all options from a native dropdown',
             properties: {
               intent: {
                 title: 'Intent',
@@ -306,6 +328,8 @@ export const jsonNavigatorOutputSchema = {
             nullable: true,
           },
           select_dropdown_option: {
+            description:
+              'Select dropdown option for interactive element index by the text of the option you want to select',
             properties: {
               intent: {
                 title: 'Intent',

+ 133 - 73
packages/schema-utils/lib/helper.ts

@@ -60,7 +60,27 @@ function processSchemaNode(node: JsonSchemaObject, definitions: Record<string, J
     const definition = definitions[refPath];
     if (definition) {
       // Process the definition to resolve any nested references
-      return processSchemaNode(definition, definitions);
+      const processedDefinition = processSchemaNode(definition, definitions);
+
+      // Create a new object that preserves properties from the original node (except $ref)
+      const result: JsonSchemaObject = {};
+
+      // First copy properties from the original node except $ref
+      for (const [key, value] of Object.entries(node)) {
+        if (key !== '$ref') {
+          result[key] = value;
+        }
+      }
+
+      // Then copy properties from the processed definition
+      // Don't override any existing properties in the original node
+      for (const [key, value] of Object.entries(processedDefinition)) {
+        if (result[key] === undefined) {
+          result[key] = value;
+        }
+      }
+
+      return result;
     }
   }
 
@@ -74,7 +94,24 @@ function processSchemaNode(node: JsonSchemaObject, definitions: Record<string, J
     const hasNullType = processedAnyOf.some(item => item.type === 'null');
 
     if (nonNullTypes.length === 1 && hasNullType) {
-      const result = { ...nonNullTypes[0] };
+      // Create a result that preserves all properties from the original node
+      const result: JsonSchemaObject = {};
+
+      // Copy all properties from original node except anyOf
+      for (const [key, value] of Object.entries(node)) {
+        if (key !== 'anyOf') {
+          result[key] = value;
+        }
+      }
+
+      // Merge in properties from the non-null type
+      for (const [key, value] of Object.entries(nonNullTypes[0])) {
+        // Don't override properties that were in the original node
+        if (result[key] === undefined) {
+          result[key] = value;
+        }
+      }
+
       result.nullable = true;
       return result;
     }
@@ -115,93 +152,95 @@ function processSchemaNode(node: JsonSchemaObject, definitions: Record<string, J
  * Converts an OpenAI format JSON schema to a Google Gemini compatible schema
  *
  * Key differences handled:
- * 1. OpenAI uses $defs and $ref for references, Gemini uses inline definitions
+ * 1. OpenAI accepts $defs and $ref for references, Gemini only accepts inline definitions
  * 2. Different structure for nullable properties
  * 3. Gemini has a flatter structure for defining properties
+ * Gemini Schema reference: https://ai.google.dev/api/caching#Schema
+ * Structured-output guide: https://ai.google.dev/gemini-api/docs/structured-output?lang=node#json-schemas
  *
  * @param openaiSchema The OpenAI format JSON schema to convert
+ * @param ensureOrder If true, adds the propertyOrdering field for consistent ordering
  * @returns A Google Gemini compatible JSON schema
  */
-export function convertOpenAISchemaToGemini(openaiSchema: JsonSchemaObject): JsonSchemaObject {
+export function convertOpenAISchemaToGemini(openaiSchema: JsonSchemaObject, ensureOrder = false): JsonSchemaObject {
+  // First flatten the schema with dereferenceJsonSchema
+  const flattenedSchema = dereferenceJsonSchema(openaiSchema);
+
   // Create a new schema object
   const geminiSchema: JsonSchemaObject = {
-    type: openaiSchema.type,
+    type: flattenedSchema.type,
     properties: {},
-    required: openaiSchema.required || [],
+    required: flattenedSchema.required || [],
   };
 
-  // Process definitions to use for resolving references
-  const definitions = openaiSchema.$defs || {};
-
   // Process properties
-  if (openaiSchema.properties) {
-    geminiSchema.properties = processProperties(openaiSchema.properties, definitions);
+  if (flattenedSchema.properties) {
+    geminiSchema.properties = processPropertiesForGemini(flattenedSchema.properties, ensureOrder);
+
+    // Add propertyOrdering for top-level properties if ensureOrder is true
+    if (ensureOrder && geminiSchema.properties) {
+      geminiSchema.propertyOrdering = Object.keys(flattenedSchema.properties);
+    }
+  }
+
+  // Copy other Gemini-compatible fields
+  if (flattenedSchema.description) {
+    geminiSchema.description = flattenedSchema.description;
+  }
+
+  if (flattenedSchema.format) {
+    geminiSchema.format = flattenedSchema.format;
+  }
+
+  if (flattenedSchema.enum) {
+    geminiSchema.enum = flattenedSchema.enum;
+  }
+
+  if (flattenedSchema.nullable) {
+    geminiSchema.nullable = flattenedSchema.nullable;
+  }
+
+  // Handle array items
+  if (flattenedSchema.type === 'array' && flattenedSchema.items) {
+    geminiSchema.items = processPropertyForGemini(flattenedSchema.items);
+
+    if (flattenedSchema.minItems !== undefined) {
+      geminiSchema.minItems = flattenedSchema.minItems;
+    }
+
+    if (flattenedSchema.maxItems !== undefined) {
+      geminiSchema.maxItems = flattenedSchema.maxItems;
+    }
   }
 
   return geminiSchema;
 }
 
 /**
- * Process properties recursively, resolving references and converting to Gemini format
+ * Process properties recursively, converting to Gemini format
  */
-function processProperties(
+function processPropertiesForGemini(
   properties: Record<string, JsonSchemaObject>,
-  definitions: Record<string, JsonSchemaObject>,
+  addPropertyOrdering: boolean = false,
 ): Record<string, JsonSchemaObject> {
   const result: Record<string, JsonSchemaObject> = {};
 
   for (const [key, value] of Object.entries(properties)) {
     if (typeof value !== 'object' || value === null) continue;
 
-    result[key] = processProperty(value, definitions);
+    result[key] = processPropertyForGemini(value, addPropertyOrdering);
   }
 
   return result;
 }
 
 /**
- * Process a single property, resolving references and converting to Gemini format
+ * Process a single property, converting to Gemini format
+ *
+ * @param property The property to process
+ * @param addPropertyOrdering Whether to add property ordering for object properties
  */
-function processProperty(property: JsonSchemaObject, definitions: Record<string, JsonSchemaObject>): JsonSchemaObject {
-  // If it's a reference, resolve it
-  if (property.$ref) {
-    const refPath = property.$ref.replace('#/$defs/', '');
-    const definition = definitions[refPath];
-    if (definition) {
-      return processProperty(definition, definitions);
-    }
-  }
-
-  // Handle anyOf for nullable properties
-  if (property.anyOf) {
-    const nonNullType = property.anyOf.find(item => item.type !== 'null' && !item.$ref);
-
-    const refType = property.anyOf.find(item => item.$ref);
-
-    const isNullable = property.anyOf.some(item => item.type === 'null');
-
-    if (refType?.$ref) {
-      const refPath = refType.$ref.replace('#/$defs/', '');
-      const definition = definitions[refPath];
-
-      if (definition) {
-        const processed = processProperty(definition, definitions);
-        if (isNullable) {
-          processed.nullable = true;
-        }
-        return processed;
-      }
-    }
-
-    if (nonNullType) {
-      const processed = processProperty(nonNullType, definitions);
-      if (isNullable) {
-        processed.nullable = true;
-      }
-      return processed;
-    }
-  }
-
+function processPropertyForGemini(property: JsonSchemaObject, addPropertyOrdering = false): JsonSchemaObject {
   // Create a new property object
   const result: JsonSchemaObject = {
     type: property.type,
@@ -212,30 +251,51 @@ function processProperty(property: JsonSchemaObject, definitions: Record<string,
     result.description = property.description;
   }
 
-  // Process nested properties
-  if (property.properties) {
-    result.properties = processProperties(property.properties, definitions);
+  // Copy format if it exists
+  if (property.format) {
+    result.format = property.format;
+  }
+
+  // Copy enum if it exists
+  if (property.enum) {
+    result.enum = property.enum;
+  }
+
+  // Copy nullable if it exists
+  if (property.nullable) {
+    result.nullable = property.nullable;
+  }
+
+  // Process nested properties for objects
+  if (property.type === 'object' && property.properties) {
+    result.properties = processPropertiesForGemini(property.properties, addPropertyOrdering);
 
     // Copy required fields
     if (property.required) {
       result.required = property.required;
-    } else {
-      result.required = [];
+    }
+
+    // Add propertyOrdering for nested object if needed
+    if (addPropertyOrdering && property.properties) {
+      result.propertyOrdering = Object.keys(property.properties);
+    }
+    // Copy propertyOrdering if it already exists
+    else if (property.propertyOrdering) {
+      result.propertyOrdering = property.propertyOrdering;
     }
   }
 
   // Handle arrays
-  if (property.items) {
-    result.items = processProperty(property.items, definitions);
-  }
+  if (property.type === 'array' && property.items) {
+    result.items = processPropertyForGemini(property.items, addPropertyOrdering);
 
-  // Handle special case for NoParamsAction which is an object in OpenAI but a string in Gemini
-  if (property.additionalProperties === true && property.title === 'NoParamsAction' && property.description) {
-    return {
-      type: 'string',
-      nullable: true,
-      description: property.description,
-    };
+    if (property.minItems !== undefined) {
+      result.minItems = property.minItems;
+    }
+
+    if (property.maxItems !== undefined) {
+      result.maxItems = property.maxItems;
+    }
   }
 
   return result;
@@ -251,7 +311,7 @@ export function stringifyCustom(value: JSONSchemaType, indent = '', baseIndent =
   switch (typeof value) {
     case 'string':
       // Escape single quotes within the string if necessary
-      return `'${value.replace(/'/g, "\\\\'")}'`;
+      return `'${(value as string).replace(/'/g, "\\\\'")}'`;
     case 'number':
     case 'boolean':
       return String(value);
@@ -270,7 +330,7 @@ export function stringifyCustom(value: JSONSchemaType, indent = '', baseIndent =
       const properties = keys.map(key => {
         // Assume keys are valid JS identifiers and don't need quotes
         const formattedKey = key;
-        const formattedValue = stringifyCustom(value[key], currentIndent, baseIndent);
+        const formattedValue = stringifyCustom(value[key] as JSONSchemaType, currentIndent, baseIndent);
         return `${currentIndent}${formattedKey}: ${formattedValue}`;
       });
       return `{\n${properties.join(',\n')}\n${indent}}`;