Ver Fonte

Merge pull request #10 from nanobrowser/bugfix

Bugfix
Ashu há 5 meses atrás
pai
commit
71093f2d83

+ 33 - 0
README.md

@@ -66,6 +66,39 @@ Looking for a powerful AI web agent without the $200/month price tag of OpenAI O
     *   Add your LLM API keys.
     *   Choose which model to use for different agents (Navigator, Planner, Validator)
 
+## 🛠️ Build from Source
+
+If you prefer to build Nanobrowser yourself, follow these steps:
+
+1. **Prerequisites**:
+   * [Node.js](https://nodejs.org/) (v22.12.0 or higher)
+   * [pnpm](https://pnpm.io/installation) (v9.15.1 or higher)
+
+2. **Clone the Repository**:
+   ```bash
+   git clone https://github.com/nanobrowser/nanobrowser.git
+   cd nanobrowser
+   ```
+
+3. **Install Dependencies**:
+   ```bash
+   pnpm install
+   ```
+
+4. **Build the Extension**:
+   ```bash
+   pnpm build
+   ```
+
+5. **Load the Extension**:
+   * The built extension will be in the `dist` directory.
+   * Follow the installation steps from the Quick Start section to load the extension into your browser.
+
+6. **Development Mode** (optional):
+   ```bash
+   pnpm dev
+   ```
+
 ## 💡 See It In Action
 
 Here are some powerful tasks you can accomplish with just a sentence:

+ 12 - 6
chrome-extension/src/background/agent/actions/builder.ts

@@ -156,7 +156,8 @@ export class ActionBuilder {
 
     // Element Interaction Actions
     const clickElement = new Action(async (input: z.infer<typeof clickElementActionSchema.schema>) => {
-      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, input.desc);
+      const todo = input.desc || `Click element with index ${input.index}`;
+      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, todo);
 
       const page = await this.context.browserContext.getCurrentPage();
       const state = await page.getState();
@@ -207,7 +208,8 @@ export class ActionBuilder {
     actions.push(clickElement);
 
     const inputText = new Action(async (input: z.infer<typeof inputTextActionSchema.schema>) => {
-      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, input.desc);
+      const todo = input.desc || `Input text into index ${input.index}`;
+      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, todo);
 
       const page = await this.context.browserContext.getCurrentPage();
       const state = await page.getState();
@@ -284,7 +286,8 @@ export class ActionBuilder {
     actions.push(cacheContent);
 
     const scrollDown = new Action(async (input: z.infer<typeof scrollDownActionSchema.schema>) => {
-      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, input.desc);
+      const todo = input.desc || 'Scroll down the page';
+      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, todo);
 
       const page = await this.context.browserContext.getCurrentPage();
       await page.scrollDown(input.amount);
@@ -296,7 +299,8 @@ export class ActionBuilder {
     actions.push(scrollDown);
 
     const scrollUp = new Action(async (input: z.infer<typeof scrollUpActionSchema.schema>) => {
-      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, input.desc);
+      const todo = input.desc || 'Scroll up the page';
+      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, todo);
 
       const page = await this.context.browserContext.getCurrentPage();
       await page.scrollUp(input.amount);
@@ -309,7 +313,8 @@ export class ActionBuilder {
 
     // Keyboard Actions
     const sendKeys = new Action(async (input: z.infer<typeof sendKeysActionSchema.schema>) => {
-      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, input.desc);
+      const todo = input.desc || `Send keys: ${input.keys}`;
+      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, todo);
 
       const page = await this.context.browserContext.getCurrentPage();
       await page.sendKeys(input.keys);
@@ -320,7 +325,8 @@ export class ActionBuilder {
     actions.push(sendKeys);
 
     const scrollToText = new Action(async (input: z.infer<typeof scrollToTextActionSchema.schema>) => {
-      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, input.desc);
+      const todo = input.desc || `Scroll to text: ${input.text}`;
+      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_START, todo);
 
       const page = await this.context.browserContext.getCurrentPage();
       try {

+ 6 - 6
chrome-extension/src/background/agent/actions/schemas.ts

@@ -41,7 +41,7 @@ export const clickElementActionSchema: ActionSchema = {
   name: 'click_element',
   description: 'Click element',
   schema: z.object({
-    desc: z.string(),
+    desc: z.string().optional(), // some small LLMs cannot generate a description, so make it optional (it is still marked as required in the JSON schema)
     index: z.number(),
     xpath: z.string().optional(),
   }),
@@ -51,7 +51,7 @@ export const inputTextActionSchema: ActionSchema = {
   name: 'input_text',
   description: 'Input text into an interactive input element',
   schema: z.object({
-    desc: z.string(),
+    desc: z.string().optional(),
     index: z.number(),
     text: z.string(),
     xpath: z.string().optional(),
@@ -98,7 +98,7 @@ export const scrollDownActionSchema: ActionSchema = {
   name: 'scroll_down',
   description: 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
   schema: z.object({
-    desc: z.string(),
+    desc: z.string().optional(),
     amount: z.number().optional(),
   }),
 };
@@ -107,7 +107,7 @@ export const scrollUpActionSchema: ActionSchema = {
   name: 'scroll_up',
   description: 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
   schema: z.object({
-    desc: z.string(),
+    desc: z.string().optional(),
     amount: z.number().optional(),
   }),
 };
@@ -117,7 +117,7 @@ export const sendKeysActionSchema: ActionSchema = {
   description:
     'Send strings of special keys like Backspace, Insert, PageDown, Delete, Enter. Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard press. Be aware of different operating systems and their shortcuts',
   schema: z.object({
-    desc: z.string(),
+    desc: z.string().optional(),
     keys: z.string(),
   }),
 };
@@ -126,7 +126,7 @@ export const scrollToTextActionSchema: ActionSchema = {
   name: 'scroll_to_text',
   description: 'If you dont find something which you want to interact with, scroll to it',
   schema: z.object({
-    desc: z.string(),
+    desc: z.string().optional(),
     text: z.string(),
   }),
 };

+ 4 - 4
chrome-extension/src/background/agent/prompts/navigator.ts

@@ -34,12 +34,12 @@ export class NavigatorPrompt extends BasePrompt {
 
    Common action sequences:
    - Form filling: [
-       {"input_text": {"index": 1, "text": "username"}},
-       {"input_text": {"index": 2, "text": "password"}},
-       {"click_element": {"index": 3}}
+       {"input_text": {"desc": "Fill title", "index": 1, "text": "example title"}},
+       {"input_text": {"desc": "Fill comment", "index": 2, "text": "example comment"}},
+       {"click_element": {"desc": "Click submit button", "index": 3}}
      ]
    - Navigation: [
-       {"open_tab": {}},
+       {"open_tab": {"url": "https://example.com"}},
        {"go_to_url": {"url": "https://example.com"}},
      ]
 

chrome-extension/public/content.css → pages/content/public/_content.css


+ 0 - 0
pages/options/public/_options.css


+ 2 - 2
pages/options/src/components/ModelSettings.tsx

@@ -75,8 +75,8 @@ export const ModelSettings = () => {
     setApiKeys(prev => ({
       ...prev,
       [provider]: {
-        apiKey,
-        baseUrl: baseUrl !== undefined ? baseUrl : prev[provider]?.baseUrl,
+        apiKey: apiKey.trim(),
+        baseUrl: baseUrl !== undefined ? baseUrl.trim() : prev[provider]?.baseUrl,
       },
     }));
   };