Selaa lähdekoodia

Merge pull request #1 from alexchenzl/initial

initial bulk upload
Ashu 7 kuukautta sitten
vanhempi
commit
9c42319c22
74 muutettua tiedostoa jossa 5581 lisäystä ja 2 poistoa
  1. 8 2
      .gitignore
  2. 153 0
      README.md
  3. 75 0
      config_example.yaml
  4. 44 0
      extension/README.md
  5. 124 0
      extension/src/background.js
  6. 31 0
      extension/src/content.js
  7. 49 0
      extension/src/history.js
  8. 3 0
      extension/src/icons/evaluator.svg
  9. BIN
      extension/src/icons/icon128.png
  10. BIN
      extension/src/icons/icon16.png
  11. BIN
      extension/src/icons/icon48.png
  12. 3 0
      extension/src/icons/manager.svg
  13. 3 0
      extension/src/icons/navigator.svg
  14. 3 0
      extension/src/icons/planner.svg
  15. 3 0
      extension/src/icons/system.svg
  16. 3 0
      extension/src/icons/user.svg
  17. 3 0
      extension/src/icons/validator.svg
  18. 43 0
      extension/src/manifest.json
  19. 188 0
      extension/src/sidebar.css
  20. 26 0
      extension/src/sidebar.html
  21. 352 0
      extension/src/sidebar.js
  22. 81 0
      install.sh
  23. 32 0
      pyproject.toml
  24. 0 0
      src/nanobrowser/__init__.py
  25. 3 0
      src/nanobrowser/cli/__init__.py
  26. 117 0
      src/nanobrowser/cli/main.py
  27. 0 0
      src/nanobrowser/lib/__init__.py
  28. 0 0
      src/nanobrowser/lib/agent/__init__.py
  29. 6 0
      src/nanobrowser/lib/agent/agents/__init__.py
  30. 109 0
      src/nanobrowser/lib/agent/agents/base.py
  31. 177 0
      src/nanobrowser/lib/agent/agents/navigator.py
  32. 146 0
      src/nanobrowser/lib/agent/agents/planner.py
  33. 55 0
      src/nanobrowser/lib/agent/context.py
  34. 13 0
      src/nanobrowser/lib/agent/event/__init__.py
  35. 85 0
      src/nanobrowser/lib/agent/event/base.py
  36. 25 0
      src/nanobrowser/lib/agent/event/logging_subscriber.py
  37. 26 0
      src/nanobrowser/lib/agent/event/manager.py
  38. 254 0
      src/nanobrowser/lib/agent/executor.py
  39. 11 0
      src/nanobrowser/lib/agent/memory/__init__.py
  40. 29 0
      src/nanobrowser/lib/agent/memory/base.py
  41. 33 0
      src/nanobrowser/lib/agent/memory/in_memory_history.py
  42. 9 0
      src/nanobrowser/lib/agent/prompts/__init__.py
  43. 32 0
      src/nanobrowser/lib/agent/prompts/base.py
  44. 47 0
      src/nanobrowser/lib/agent/prompts/navigator.py
  45. 102 0
      src/nanobrowser/lib/agent/prompts/planner.py
  46. 18 0
      src/nanobrowser/lib/agent/prompts/validator.py
  47. 8 0
      src/nanobrowser/lib/agent/tools/__init__.py
  48. 11 0
      src/nanobrowser/lib/agent/tools/base.py
  49. 242 0
      src/nanobrowser/lib/agent/tools/click_using_selector.py
  50. 140 0
      src/nanobrowser/lib/agent/tools/enter_text_and_click.py
  51. 251 0
      src/nanobrowser/lib/agent/tools/enter_text_using_selector.py
  52. 139 0
      src/nanobrowser/lib/agent/tools/get_dom_with_content_type.py
  53. 39 0
      src/nanobrowser/lib/agent/tools/get_url.py
  54. 113 0
      src/nanobrowser/lib/agent/tools/open_url.py
  55. 138 0
      src/nanobrowser/lib/agent/tools/pdf_text_extractor.py
  56. 138 0
      src/nanobrowser/lib/agent/tools/press_key_combination.py
  57. 0 0
      src/nanobrowser/lib/browser/__init__.py
  58. 163 0
      src/nanobrowser/lib/browser/context.py
  59. 0 0
      src/nanobrowser/lib/browser/dom/__init__.py
  60. 43 0
      src/nanobrowser/lib/browser/dom/dom_helper.py
  61. 98 0
      src/nanobrowser/lib/browser/dom/dom_mutation_observer.py
  62. 530 0
      src/nanobrowser/lib/browser/dom/get_detailed_accessibility_tree.py
  63. 176 0
      src/nanobrowser/lib/browser/launcher.py
  64. 257 0
      src/nanobrowser/lib/browser/manager.py
  65. 0 0
      src/nanobrowser/lib/config/__init__.py
  66. 127 0
      src/nanobrowser/lib/config/config.py
  67. 34 0
      src/nanobrowser/lib/config/logging_config.py
  68. 0 0
      src/nanobrowser/lib/utils/__init__.py
  69. 59 0
      src/nanobrowser/lib/utils/path_manager.py
  70. 35 0
      src/nanobrowser/lib/utils/time_utils.py
  71. 0 0
      src/nanobrowser/lib/websocket/__init__.py
  72. 63 0
      src/nanobrowser/lib/websocket/message.py
  73. 161 0
      src/nanobrowser/lib/websocket/server.py
  74. 92 0
      src/nanobrowser/lib/websocket/task.py

+ 8 - 2
.gitignore

@@ -14,7 +14,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
@@ -165,7 +164,14 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 # PyPI configuration file
 .pypirc
+
+
+.DS_Store
+.nanobrowser/
+config.yaml*
+config.yml*
+uv.lock

+ 153 - 0
README.md

@@ -0,0 +1,153 @@
+# NanoBrowser
+
+Open source multi-agent browser automation tool with built-in Chrome extension.
+
+
+## System Architecture
+
+The system architecture is visualized using a mermaid diagram, illustrating the interaction between various components such as the Chrome Extension, WebSocket Server, and Multi-Agent components.
+
+```mermaid
+graph TB
+subgraph "Nano Browser Agent"
+direction LR
+WS[WebSocket Server]
+subgraph "Multi-Agent components"
+direction TB
+N[Navigator Agent]
+P[Planner Agent]
+E[Evaluator Agent]
+V[Validator Agent]
+end
+end
+subgraph "Chrome Extension"
+direction TB
+UI[Chat Sidebar UI]
+CE[Extension Background]
+end
+
+%% External Communications
+UI <--> CE
+CE <-->|WebSocket| WS
+
+%% Agent interactions
+P -->|Next step| N
+N -->|Execution results| E
+E -->|Success/Failure + Feedback| P
+P -->|Final result| V
+V -->|Validation failure + Feedback| P
+
+%% Layout hints
+linkStyle 2,3,4,5 stroke-width:2
+```
+
+### Chrome Extension
+- **Chat Sidebar UI**: User interface for interacting with the multi-agent system
+- **Extension Background**: Handles WebSocket communication with the server
+
+### WebSocket Server
+- Manages bidirectional communication between the Chrome extension and the multi-agent system
+- Relays commands and status updates between components
+
+### Multi-Agent Components
+- **Planner Agent**: Breaks down web navigation tasks into manageable steps
+- **Navigator Agent**: Executes web navigation steps as planned
+- **Evaluator Agent**: Assesses the success of each navigation step and provides feedback for retries (TODO)
+- **Validator Agent**: Verifies overall task completion and results (TODO)
+
+
+## Usage
+
+### Prerequisites
+- Chrome browser installed on your machine
+- Use git to clone this repository or download the zip file and unzip it
+
+### Install the Chrome Extension
+
+To install the Chrome Extension:
+
+1. Open Chrome and navigate to `chrome://extensions/`.
+2. Enable "Developer mode" by toggling the switch in the top right corner.
+3. Click the "Load unpacked" button in the top left.
+4. Select the `nanobrowser/extension/src` directory.
+5. The extension should now appear in your Chrome toolbar.
+
+### Run the NanoBrowser Agent Server via script (MacOS)
+
+To run the NanoBrowser Agent Server via script (MacOS), follow these steps:
+
+1. Navigate to the `nanobrowser` directory:
+   ```bash
+   cd nanobrowser
+   ```
+2. Run the installation script:
+   ```bash
+   ./install.sh
+   ```
+
+### Run the NanoBrowser Agent Server Manually
+
+To run the NanoBrowser Agent Server manually on Linux/MacOS or Windows, follow these steps:
+
+1. **Install `uv`**:
+   - **Linux/macOS**: Open your terminal and run:
+     ```bash
+     curl -LsSf https://astral.sh/uv/install.sh | sh
+     ```
+   - **Windows**: For detailed installation instructions, please refer to the official documentation [here](https://docs.astral.sh/uv/getting-started/installation/).
+
+2. **Create a Virtual Environment**:
+   - Navigate to the `nanobrowser` directory:
+     ```bash
+     cd nanobrowser
+     ```
+   - Create a virtual environment:
+     ```bash
+     uv venv --python 3.10
+     ```
+
+3. **Install Dependencies**:
+   - Install the required dependencies:
+     ```bash
+     uv pip install .
+     ```
+
+4. **Set Up Configuration**:
+   - If a `config.yaml` file does not exist, copy the example configuration:
+     ```bash
+     cp config_example.yaml config.yaml
+     ```
+   - Edit `config.yaml` with your settings, including filling in the LLM API keys.
+
+5. **Run the Project**:
+   - Ensure you have Google Chrome installed and the Chrome extension loaded in developer mode.
+   - Finally, run the NanoBrowser Agent Server:
+     ```bash
+     uv run nanobrowser
+     ```
+
+## Contributing
+
+Contributions to the NanoBrowser project are welcome! To contribute:
+
+1. **Fork the repository**.
+2. **Clone your fork**.
+3. **Create a new branch** for your feature or fix.
+4. **Make your changes** and commit them.
+5. **Push to your fork** and create a pull request.
+
+Thank you for your interest in contributing to NanoBrowser!
+
+
+## Acknowledgments
+
+Special thanks to the open-source projects that inspired NanoBrowser:
+- [python-cdp](https://github.com/HMaker/python-cdp)
+- [Agent-E](https://github.com/EmergenceAI/Agent-E)
+- [fuji-web](https://github.com/normal-computing/fuji-web)
+
+Their foundational work helped shape this project. We appreciate the open-source community's collaborative spirit.
+
+## License
+
+This project is licensed under the [Apache License 2.0](https://github.com/alexchenzl/nanobrowser/blob/master/LICENSE) - see the [LICENSE](LICENSE) file for details.

+ 75 - 0
config_example.yaml

@@ -0,0 +1,75 @@
+# Log level INFO / DEBUG / WARNING / ERROR / CRITICAL
+log_level: "INFO"
+
+# Base workspace directory; an absolute path is recommended
+base_dir: "/Users/jason/.nanobrowser"
+
+# Whether to save chat history into the workspace/messages directory
+save_chat_history: true
+
+# Whether to log execution events
+log_events: true
+
+# Default max steps allowed for a single task
+max_steps: 100
+
+# Default max errors allowed for a single task
+max_errors: 20
+
+# Default max tool rounds allowed for a single task
+max_tool_rounds: 20
+
+# Browser configuration
+# Only needed if chrome_app_path cannot be detected automatically
+#
+# browser:
+#   chrome_app_path: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+#   cdp_port: 9222
+
+# Configurations for the agent planner and navigator
+# Note: 
+#     api_key is optional, can also be set via environment variables, example:
+#     export OPENAI_API_KEY=sk-...
+#     export ANTHROPIC_API_KEY=sk-...
+#
+# Agent planner configuration
+planner:
+  model: "gpt-4o"
+  model_provider: "openai"
+  api_key: "sk-..." # Optional
+  inference_config:
+    temperature: 0
+    top_p: 0.001
+
+# Agent navigator configuration
+navigator:
+  model: "gpt-4o"
+  model_provider: "openai"
+  api_key: "sk-..."
+  inference_config:
+    temperature: 0
+    top_p: 0.001
+
+# Alternative agent planner configuration (Anthropic)
+# planner:
+#   model: "claude-3-5-sonnet-20241022"
+#   model_provider: "anthropic"
+#   api_key: "sk-..." # Optional
+#   inference_config:
+#     temperature: 0.1
+#     top_p: 0.1
+
+# Alternative agent navigator configuration (Anthropic)
+# navigator:
+#   model: "claude-3-5-sonnet-20241022"
+#   model_provider: "anthropic"
+#   api_key: "sk-..."
+#   inference_config:
+#     temperature: 0.1
+#     top_p: 0.1
+
+# WebSocket server configuration
+# Do not change this unless you know what you are doing
+server:
+  host: "127.0.0.1" 
+  port: 6768

+ 44 - 0
extension/README.md

@@ -0,0 +1,44 @@
+# NanoBrowser Chrome Extension
+
+NanoBrowser Chrome Extension empowers users to automate web browser tasks through natural language conversations. Simply chat with the extension in the browser sidebar to perform complex web automation – from filling forms to scraping data to navigating websites. Under the hood, it connects to a local NanoBrowser agent server to execute these commands seamlessly.
+
+## Prerequisites
+
+- Chrome browser
+- NanoBrowser agent server running locally on your machine
+
+## Installation
+
+To install this extension in Chrome Developer Mode:
+
+1. Open Chrome and navigate to `chrome://extensions/`
+2. Enable "Developer mode" by toggling the switch in the top right corner
+3. Click the "Load unpacked" button in the top left
+4. Select the directory containing this extension's files
+5. The extension should now appear in your Chrome toolbar
+
+## Features
+
+- Convenient browser sidebar interface for easy access
+- Natural language interaction with NanoBrowser agent
+- Web browser automation capabilities
+
+## Usage
+
+1. Ensure the NanoBrowser agent server is running locally
+2. Click the extension icon in your Chrome toolbar to open the sidebar
+3. Start chatting with the agent to perform web automation tasks
+4. The agent will execute your requests and provide step-by-step feedback in the chat window.
+
+## Development
+
+For developers interested in modifying or extending the extension:
+
+1. Clone this repository
+2. Make your changes
+3. Test locally using Chrome Developer Mode
+4. Submit pull requests for improvements
+
+## License
+
+This project is licensed under the [Apache License 2.0](../LICENSE) - see the LICENSE file in the repository root for details.

+ 124 - 0
extension/src/background.js

@@ -0,0 +1,124 @@
+// Fixed URI for WebSocket server (same host/port as the
+// ws://localhost:6768/* host permission declared in manifest.json)
+const WS_URL = 'ws://localhost:6768/ws';
+// Interval between heartbeat messages, in milliseconds
+const TEN_SECONDS_MS = 10000;
+// Current WebSocket instance; stays null until connectWebSocket() assigns it
+let webSocket = null;
+
+// Setup side panel behavior: clicking the toolbar action opens the side panel.
+// A failure here is only logged.
+chrome.sidePanel
+  .setPanelBehavior({ openPanelOnActionClick: true })
+  .catch((error) => console.error(error));
+
+// WebSocket connection management
+function connectWebSocket() {
+  webSocket = new WebSocket(WS_URL);
+  
+  webSocket.onopen = () => {
+    console.log('WebSocket connected');
+    broadcastConnectionStatus(true);
+    keepAlive();
+  };
+  
+  webSocket.onclose = () => {
+    console.log('WebSocket disconnected');
+    broadcastConnectionStatus(false);
+    // Attempt to reconnect after 5 seconds
+    setTimeout(connectWebSocket, 5000);
+  };
+  
+  webSocket.onmessage = (event) => {
+    try {
+      const message = JSON.parse(event.data);
+      const kind = message.kind;
+      if (kind === 'state') {
+        // Broadcast task progress to sidebar
+        broadcastToSidebar({
+          type: 'state',
+          data: message.data
+        });
+      } else if (kind === 'ack') {
+        // console.log('ACK:', message);
+      }
+    } catch (error) {
+      console.error('Failed to parse WebSocket message:', error);
+    }
+  };
+}
+
+function keepAlive() {
+  const keepAliveIntervalId = setInterval(
+    () => {
+      if (webSocket && webSocket.readyState === WebSocket.OPEN) {
+        console.log('sending heartbeat');
+        const heartbeatMessage = {
+          kind: "hb",
+          data: {
+            timestamp: Math.floor(new Date().getTime() / 1000)
+          }
+        };
+        webSocket.send(JSON.stringify(heartbeatMessage));
+      } else {
+        clearInterval(keepAliveIntervalId);
+      }
+    },
+    TEN_SECONDS_MS
+  );
+}
+
+// Broadcast helpers
+function broadcastConnectionStatus(isConnected) {
+  broadcastToSidebar({
+    type: 'connection_status',
+    data: { isConnected }
+  });
+}
+
+function broadcastToSidebar(message) {
+  chrome.runtime.sendMessage(message).catch(err => {
+    console.log('Failed to send message to sidebar:', err);
+  });
+}
+
+function generateTaskId() {
+  const timestamp = Date.now();
+  const random = Math.floor(Math.random() * 900000) + 100000; // 6-digit random number
+  return `${timestamp}-${random}`;
+}
+
+// Message handling from sidebar
+chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
+  if (message.type === 'SEND_MESSAGE' && webSocket) {
+    const taskMessage = {
+      kind: "create",
+      data: {
+        task_id: generateTaskId(),
+        intent: message.text,
+        args: { tab_id: message.tabId }
+      }
+    };
+    webSocket.send(JSON.stringify(taskMessage));
+    sendResponse({ success: true });
+  }
+  return true;
+});
+
+// Initialize WebSocket connection
+connectWebSocket();
+
+chrome.tabs.onUpdated.addListener((tabId, changeInfo, tab) => {
+  if (changeInfo.status === 'complete' && tab.url?.startsWith('http')) {
+    chrome.scripting.executeScript({
+      target: { tabId: tabId },
+      files: ['content.js']
+    }).catch(err => console.error('Failed to inject content script:', err));
+  }
+});
+
+chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
+    if (message.type === 'GET_TAB_ID') {
+        if (sender.tab && sender.tab.id !== undefined) {
+            sendResponse({ tabId: sender.tab.id });
+        } else {
+            sendResponse({ tabId: null });
+        }
+    }
+}); 

+ 31 - 0
extension/src/content.js

@@ -0,0 +1,31 @@
+// Function to generate a fallback ID when tab ID is unavailable
+function generateFallbackId() {
+    return `fallback-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+}
+
+// Function to ensure ID is assigned
+function ensureTabId() {
+    if (!document.body.hasAttribute('data-nano-tab-id')) {
+        // Get tab ID from chrome runtime
+        chrome.runtime.sendMessage({ type: 'GET_TAB_ID' }, (response) => {
+            let uniqueId;
+            if (response?.tabId) {
+                uniqueId = `nano-tab-${response.tabId}`;
+            } else {
+                uniqueId = generateFallbackId();
+                console.warn('Using fallback ID: Tab ID was unavailable');
+            }
+            document.body.setAttribute('data-nano-tab-id', uniqueId);
+        });
+    }
+    return document.body.getAttribute('data-nano-tab-id');
+}
+
+// Run immediately when script loads
+ensureTabId();
+
+// Handle dynamic page changes
+const observer = new MutationObserver(ensureTabId);
+observer.observe(document.body, {
+    attributes: true
+});

+ 49 - 0
extension/src/history.js

@@ -0,0 +1,49 @@
+class MessageHistory {
+    constructor(maxSize = 512) {
+        this.maxSize = maxSize;
+        this.history = [];
+    }
+
+    // Add a new message to history
+    addMessage(message) {
+        // format timestamp to string
+        message.timestamp = message.timestamp.toISOString();
+        this.history.push(message);
+
+        // Prune old messages if we exceed maxSize
+        if (this.history.length > this.maxSize) {
+            this.history = this.history.slice(-this.maxSize);
+        }
+
+        // Save to chrome storage
+        this.saveToStorage();
+    }
+
+    // Load history from storage
+    async loadHistory() {
+        return new Promise((resolve) => {
+            chrome.storage.local.get(['messageHistory'], (result) => {
+                if (result.messageHistory) {
+                    this.history = result.messageHistory;
+                }
+                resolve(this.history);
+            });
+        });
+    }
+
+    // Save current history to storage
+    saveToStorage() {
+        chrome.storage.local.set({ messageHistory: this.history });
+    }
+
+    // Clear all history
+    clearHistory() {
+        this.history = [];
+        this.saveToStorage();
+    }
+
+    // Get all history
+    getHistory() {
+        return this.history;
+    }
+} 

+ 3 - 0
extension/src/icons/evaluator.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+  <path fill="rgba(255, 255, 255, 0.9)" d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-9 14l-5-5 1.41-1.41L10 14.17l7.59-7.59L19 8l-9 9z"/>
+</svg> 

BIN
extension/src/icons/icon128.png


BIN
extension/src/icons/icon16.png


BIN
extension/src/icons/icon48.png


+ 3 - 0
extension/src/icons/manager.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+  <path fill="rgba(255, 255, 255, 0.9)" d="M12 2L4 6v12l8 4 8-4V6L12 2zm5.5 14.5l-5.5 2.75-5.5-2.75v-1.5L12 17.25l5.5-2.75v2zm0-4l-5.5 2.75L6.5 12.5v-1.5L12 13.25l5.5-2.75v2zm-5.5-6l4.5 2.25L12 11l-4.5-2.25L12 6.5z"/>
+</svg> 

+ 3 - 0
extension/src/icons/navigator.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+  <path fill="rgba(255, 255, 255, 0.9)"  d="M12 2L4.5 20.29l.71.71L12 18l6.79 3 .71-.71L12 2zm0 15l-4.5 2 2.5-7h4l2.5 7-4.5-2z"/>
+</svg> 

+ 3 - 0
extension/src/icons/planner.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+  <path fill="rgba(255, 255, 255, 0.9)"  d="M19 3h-4.18C14.4 1.84 13.3 1 12 1c-1.3 0-2.4.84-2.82 2H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-7 0c.55 0 1 .45 1 1s-.45 1-1 1-1-.45-1-1 .45-1 1-1zm2 14H7v-2h7v2zm3-4H7v-2h10v2zm0-4H7V7h10v2z"/>
+</svg> 

+ 3 - 0
extension/src/icons/system.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+  <path fill="rgba(255, 255, 255, 0.9)" d="M12 3L1 9l4 2.18v6L12 21l7-3.82v-6l2-1.09V17h2V9L12 3zm6.82 6L12 12.72 5.18 9 12 5.28 18.82 9zM17 15.99l-5 2.73-5-2.73v-3.72L12 15l5-2.73v3.72z"/>
+</svg> 

+ 3 - 0
extension/src/icons/user.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+  <path fill="rgba(255, 255, 255, 0.9)" d="M12 12c2.21 0 4-1.79 4-4s-1.79-4-4-4-4 1.79-4 4 1.79 4 4 4zm0 2c-2.67 0-8 1.34-8 4v2h16v-2c0-2.66-5.33-4-8-4z"/>
+</svg> 

+ 3 - 0
extension/src/icons/validator.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+  <path fill="currentColor" d="M16.59 7.58L10 14.17l-3.59-3.58L5 12l5 5 8-8zM12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm0 18c-4.42 0-8-3.58-8-8s3.58-8 8-8 8 3.58 8 8-3.58 8-8 8z"/>
+</svg> 

+ 43 - 0
extension/src/manifest.json

@@ -0,0 +1,43 @@
+{
+  "manifest_version": 3,
+  "name": "Nano Browser",
+  "version": "1.0",
+  "description": "Nano Browser - AI Agent for browsing the web",
+  "permissions": [
+    "sidePanel",
+    "storage",
+    "scripting",
+    "activeTab",
+    "tabs"
+  ],
+  "host_permissions": [
+    "<all_urls>",
+    "ws://localhost:6768/*"
+  ],
+  "background": {
+    "service_worker": "background.js",
+    "type": "module"
+  },
+  "action": {
+    "default_title": "Nano Browser",
+    "default_icon": {
+      "16": "icons/icon16.png",
+      "48": "icons/icon48.png",
+      "128": "icons/icon128.png"
+    }
+  },
+  "icons": {
+    "16": "icons/icon16.png",
+    "48": "icons/icon48.png",
+    "128": "icons/icon128.png"
+  },
+  "side_panel": {
+    "default_path": "sidebar.html"
+  },
+  "content_scripts": [
+    {
+      "matches": ["<all_urls>"],
+      "js": ["content.js"]
+    }
+  ]
+} 

+ 188 - 0
extension/src/sidebar.css

@@ -0,0 +1,188 @@
+html, body {
+    margin: 0;
+    padding: 0;
+    height: 100%;
+    overflow: hidden;
+}
+
+.sidebar-container {
+    display: flex;
+    flex-direction: column;
+    height: 100vh;
+}
+
+#chat-container {
+    flex: 1;
+    overflow-y: auto;
+    padding: 16px 0;
+}
+
+.bottom-container {
+    border-top: 1px solid #e0e0e0;
+    background: white;
+    padding: 8px;
+}
+
+#messages-container {
+    height: 100%;
+    overflow-y: auto;
+}
+
+.input-container {
+    display: flex;
+    gap: 8px;
+    padding: 0;
+    align-items: center;
+}
+
+#chat-input {
+    flex: 1;
+    padding: 8px;
+    border: 1px solid #ced4da;
+    border-radius: 4px;
+    resize: vertical;
+    height: 4.5em;
+    min-height: 4.5em;
+    max-height: 100px;
+    font-size: 14px;
+}
+
+#send-button {
+    padding: 8px;
+    background: #007bff;
+    color: white;
+    border: none;
+    border-radius: 4px;
+    cursor: pointer;
+    height: 4.5em;
+}
+
+#send-button:hover {
+    background: #0056b3;
+}
+
+.connection-status {
+    padding: 6px;
+    text-align: center;
+    background: #dc3545;
+    color: white;
+    border-radius: 4px;
+    margin-bottom: 8px;
+    font-size: 12px;
+}
+
+.connection-status.connected {
+    background: #28a745;
+}
+
+.connection-status.disconnected {
+    background: #dc3545;
+}
+
+/* Message block styles */
+.message-block {
+    display: flex;
+    padding: 8px 16px;
+    gap: 12px;
+    align-items: flex-start;
+    width: 100%;
+    box-sizing: border-box;
+}
+
+.actor-icon {
+    width: 32px;
+    height: 32px;
+    flex-shrink: 0;
+    border-radius: 50%;
+}
+
+.actor-icon img {
+    width: 24px;
+    height: 24px;
+    padding: 4px;
+    border-radius: 50%;
+}
+
+.actor-name {
+    font-weight: 600;
+    font-size: 14px;
+    color: #2c2c2c;
+    margin-bottom: 4px;
+}
+
+.message-text {
+    font-size: 14px;
+    color: #4a4a4a;
+    line-height: 1.4;
+    word-wrap: break-word;
+    white-space: pre-wrap;
+}
+
+.message-time {
+    font-size: 12px;
+    color: #888;
+    white-space: nowrap;
+    margin-left: auto;
+    flex-shrink: 0;
+}
+
+/* For messages from the same actor */
+.message-block.same-actor {
+    padding-left: 60px;
+}
+
+/* Separator line above user messages that are not the first message */
+.message-block[data-actor="user"]:not(:first-child) {
+    border-top: 1px solid #e0e0e0;
+}
+
+.message-block.same-actor .actor-icon {
+    display: none;
+}
+
+.message-block + .message-block {
+    margin-top: 4px;
+}
+
+/* Message content styles */
+.message-content {
+    flex: 1;
+    display: flex;
+    flex-direction: column;
+    position: relative;
+}
+
+/* Progress indicator styles */
+.message-text.progress-message {
+    position: relative;
+    min-height: 4px;
+    width: 100%;
+    margin: 8px 0;
+    background: #f5f5f5;
+    overflow: hidden;
+}
+
+.progress-bar {
+    position: absolute;
+    bottom: 0;
+    left: 0;
+    height: 4px;
+    width: 100%;
+    background: linear-gradient(90deg,
+        transparent 0%,
+        #007bff 0%,
+        #00bcd4 30%,
+        transparent 30%
+    );
+    border-radius: 2px;
+    animation: progress-animation 1.5s infinite ease-in-out;
+}
+
+@keyframes progress-animation {
+    0% {
+        transform: translateX(-100%);
+    }
+    100% {
+        transform: translateX(100%);
+    }
+} 

+ 26 - 0
extension/src/sidebar.html

@@ -0,0 +1,26 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>AI Chat Sidebar</title>
+    <link rel="stylesheet" href="sidebar.css">
+</head>
+<body>
+    <div class="sidebar-container">
+        <div id="chat-container">
+            <div id="messages-container">
+                <!-- Messages will be inserted here dynamically -->
+            </div>
+        </div>
+        <div class="bottom-container">
+            <div id="connection-status" class="connection-status" style="display: none;">Not Connected</div>
+            <div class="input-container">
+                <textarea id="chat-input" placeholder="Type your message..."></textarea>
+                <button id="send-button">Send</button>
+            </div>
+        </div>
+    </div>
+    <!-- The websocket-client.js script tag was removed: no such file ships with the extension -->
+    <!-- history.js must load before sidebar.js, which instantiates MessageHistory -->
+    <script src="history.js"></script>
+    <script src="sidebar.js"></script>
+</body>
+</html> 

+ 352 - 0
extension/src/sidebar.js

@@ -0,0 +1,352 @@
+function setInputsEnabled(enabled) {
+    const chatInput = document.getElementById('chat-input');
+    const sendButton = document.getElementById('send-button');
+
+    chatInput.disabled = !enabled;
+    sendButton.disabled = !enabled;
+    
+    // Add visual styling for disabled state
+    if (enabled) {
+        chatInput.style.backgroundColor = '';
+        chatInput.style.color = '';
+        sendButton.style.opacity = '';
+    } else {
+        chatInput.style.backgroundColor = '#f5f5f5';
+        chatInput.style.color = '#999';
+        sendButton.style.opacity = '0.5';
+    }
+}
+
+document.addEventListener('DOMContentLoaded', async () => {
+    const chatInput = document.getElementById('chat-input');
+    const sendButton = document.getElementById('send-button');
+    const connectionStatus = document.getElementById('connection-status');
+
+    const messagesContainer = document.getElementById('messages-container');
+    const messageHistory = new MessageHistory();
+    // Load history messages efficiently
+    const history = await messageHistory.loadHistory();
+    if (history.length > 0) {
+        // Create fragment to batch DOM updates
+        const fragment = document.createDocumentFragment();
+        let previousMessage = null;
+        
+        for (const message of history) {
+            const messageElement = createMessageElement(message, previousMessage);
+            if (messageElement) {
+                messageElement.message = message;
+                fragment.appendChild(messageElement);
+                previousMessage = message;
+            }
+        }
+        
+        messagesContainer.appendChild(fragment);
+        messagesContainer.scrollTop = messagesContainer.scrollHeight;
+    }
+    window.messageHistory = messageHistory;
+
+    // Handle sending messages
+    function handleSendMessage() {
+        const text = chatInput.value.trim();
+        if (!text) return;
+
+        // Disable inputs when sending message
+        setInputsEnabled(false);
+
+        // Add user message to chat
+        addMessage({
+            actor: 'user',
+            content: text,
+            timestamp: new Date()
+        });
+        chatInput.value = '';
+
+        // Get current tab ID and send message
+        chrome.tabs.query({active: true, currentWindow: true}, (tabs) => {
+            const tabId = tabs[0]?.id ? `nano-tab-${tabs[0].id}` : generateFallbackId();
+            
+            // Send message to service worker
+            chrome.runtime.sendMessage({
+                type: 'SEND_MESSAGE',
+                text,
+                tabId
+            }).catch(err => {
+                addMessage({
+                    actor: 'system',
+                    content: 'Failed to send message',
+                    timestamp: new Date()
+                });
+            });
+        });
+    }
+
+    // Handle messages from service worker
+    chrome.runtime.onMessage.addListener((message) => {
+        if (message.type === 'connection_status') {
+            updateConnectionStatus(message.data.isConnected);
+        } else if (message.type === 'state') {
+            handleTaskState(message.data);
+        }
+    });
+
+    // Update connection status
+    function updateConnectionStatus(isConnected) {
+        connectionStatus.textContent = isConnected ? 'Connected' : 'Not Connected';
+        connectionStatus.style.display = 'block';
+        connectionStatus.className = `connection-status ${isConnected ? 'connected' : 'disconnected'}`;
+    }
+
+    // Event listeners
+    sendButton.addEventListener('click', handleSendMessage);
+    chatInput.addEventListener('keypress', (e) => {
+        if (e.key === 'Enter' && !e.shiftKey) {
+            e.preventDefault();
+            handleSendMessage();
+        }
+    });
+
+    // Add auto-grow functionality
+    function autoGrow() {
+        chatInput.style.height = 'auto'; // Reset height to recalculate
+        chatInput.style.height = `${Math.min(chatInput.scrollHeight, 100)}px`; // Set new height up to max
+    }
+    
+    // Listen for input events
+    chatInput.addEventListener('input', autoGrow);
+});
+
+// Helper function for generating fallback ID
+function generateFallbackId() {
+    return `fallback-${Date.now()}-${Math.random().toString(36).substring(2, 11)}`;
+}
+
+// Display metadata for each message actor, keyed by the actor id carried on a
+// message: the name shown above the message, the icon image path, and the
+// background color applied behind the icon (see createMessageElement).
+const ACTORS = {
+    user: {
+        name: 'User',
+        icon: 'icons/user.svg',
+        iconBackground: '#4CAF50'  // Stronger green
+    },
+    system: {
+        name: 'System',
+        icon: 'icons/system.svg',
+        iconBackground: '#2196F3'  // Stronger blue
+    },
+    manager: {
+        name: 'Manager',
+        icon: 'icons/manager.svg',
+        iconBackground: '#9C27B0'  // Stronger purple
+    },
+    planner: {
+        name: 'Planner',
+        icon: 'icons/planner.svg',
+        iconBackground: '#FF9800'  // Stronger orange
+    },
+    navigator: {
+        name: 'Navigator',
+        icon: 'icons/navigator.svg',
+        iconBackground: '#00BCD4'  // Stronger cyan
+    },
+    evaluator: {
+        name: 'Evaluator',
+        icon: 'icons/evaluator.svg',
+        iconBackground: '#FFC107'  // Stronger yellow
+    },
+    validator: {
+        name: 'Validator',
+        icon: 'icons/validator.svg',
+        iconBackground: '#F44336'  // Stronger red
+    }
+};
+
+
+
+
+function formatTime(date) {
+    // Convert string dates to Date objects if needed
+    const dateObj = date instanceof Date ? date : new Date(date);
+    return dateObj.toTimeString().split(' ')[0];
+}
+
+function createMessageElement(message, previousMessage) {
+    // Check if previous message was a progress bar
+    if (previousMessage?.content === 'Working on it...') {
+        // If current message is also a progress bar, skip creating new element
+        if (message.content === 'Working on it...') {
+            return null;
+        }
+        
+        // If current message is not a progress bar, remove the last progress message
+        const messagesContainer = document.getElementById('messages-container');
+        const lastMessage = messagesContainer.lastElementChild;
+        if (lastMessage) {
+            messagesContainer.removeChild(lastMessage);
+        }
+    }
+
+    const messageBlock = document.createElement('div');
+    messageBlock.className = 'message-block';
+    messageBlock.setAttribute('data-actor', message.actor);
+    
+    const isSameActor = previousMessage && previousMessage.actor === message.actor;
+    if (isSameActor) {
+        messageBlock.classList.add('same-actor');
+    }
+
+    // Actor icon
+    const actorIcon = document.createElement('div');
+    actorIcon.className = 'actor-icon';
+    if (!isSameActor && ACTORS[message.actor].icon) {
+        const iconImg = document.createElement('img');
+        iconImg.src = ACTORS[message.actor].icon;
+        iconImg.alt = ACTORS[message.actor].name;
+        actorIcon.appendChild(iconImg);
+        actorIcon.style.backgroundColor = ACTORS[message.actor].iconBackground;
+    }
+    messageBlock.appendChild(actorIcon);
+
+    // Message content container
+    const contentContainer = document.createElement('div');
+    contentContainer.className = 'message-content';
+
+    // Display actor name if it's not the same as the previous message
+    if (!isSameActor) {
+        const actorName = document.createElement('div');
+        actorName.className = 'actor-name';
+        actorName.textContent = ACTORS[message.actor].name;
+        contentContainer.appendChild(actorName);
+    }
+
+    // Create time element first
+    const timeElement = document.createElement('div');
+    timeElement.className = 'message-time';
+    timeElement.textContent = formatTime(message.timestamp);
+
+    // Message text
+    const messageText = document.createElement('div');
+    messageText.className = 'message-text';
+    
+    // Add progress indicator for "Working on it..." messages
+    if (message.content === 'Working on it...') {
+        messageText.classList.add('progress-message');
+        const progressBar = document.createElement('div');
+        progressBar.className = 'progress-bar';
+        messageText.appendChild(progressBar);
+        
+        // Hide the time element for progress messages
+        timeElement.style.display = 'none';
+    } else {
+        messageText.textContent = message.content;
+    }
+    
+    contentContainer.appendChild(messageText);
+    contentContainer.appendChild(timeElement);
+    messageBlock.appendChild(contentContainer);
+
+    return messageBlock;
+}
+
+function addMessage(message) {
+    const messagesContainer = document.getElementById('messages-container');
+    const previousMessage = messagesContainer.lastElementChild?.message;
+    
+    const messageElement = createMessageElement(message, previousMessage);
+    // Only append if messageElement is not null
+    if (messageElement) {
+        messageElement.message = message;
+        messagesContainer.appendChild(messageElement);
+        messagesContainer.scrollTop = messagesContainer.scrollHeight;
+
+        // Save message to history
+        if (message.content !== 'Working on it...') {
+            window.messageHistory.addMessage(message);
+        }
+    }
+}
+
+function handleTaskState(data) {
+
+    const state = data.state;
+    const actor = data.actor || 'system';
+    const timestamp = new Date(data.timestamp) || new Date();
+    const eventData = data.data;
+    let content = eventData?.details
+    let skip = false;
+    let display_progress = false;
+
+    if (actor === 'manager') {
+        if (state === 'task.start') {
+            content = 'Task received. We are working on it.';
+        } else if (state === 'task.error') {
+            content = `Task failed. \n\n ${content}`;
+            setInputsEnabled(true);
+        } else if (state === 'task.cancel') {
+            content = 'Task canceled.';
+            setInputsEnabled(true);
+        } else if (state === 'task.ok') {
+            setInputsEnabled(true);
+        }
+    } else if (actor === 'planner') {
+        if (state === 'step.start') {
+            skip = true;
+        } else if (state === 'step.ok') {
+            // if plan is not empty, display the plan first
+            if (eventData?.plan) {
+                if (eventData.step === 1) {
+                    plan = `I made a plan for this task: \n\n${eventData.plan}`;
+                } else {
+                    plan = `I updated the plan: \n\n${eventData.plan}`;
+                }
+                addMessage({
+                    actor,
+                    content: plan,
+                    timestamp
+                }); 
+            }
+            // skip to display the details: next step or final response
+            skip = true; 
+        } else if (state === 'step.error') {
+            content = `Step failed. \n\n ${content}`;
+        } else if (state === 'step.cancel') {
+            content = 'Step canceled.';
+        }
+    } else if (actor === 'navigator') {
+        // by default, display progress when navigating
+        display_progress = true;
+        if (state === 'step.start') {
+            // remove string like "[mmid='914']"
+            content = content.replace(/\[mmid='\d+'\]/g, '');
+        } else if (state === 'step.error') {
+            content = `Step failed. \n ${content}`;
+            display_progress = false;
+        } else if (state === 'step.cancel') {
+            content = 'Step canceled.';
+            display_progress = false;
+        } else if (state === 'step.ok') {
+            // display progress if it's not the final response
+            if (eventData?.final) {
+                display_progress = false;
+            }
+            skip = true;
+        } else{
+            // skip to display other messages, like tool calls
+            skip = true;
+        }
+    } 
+
+    if (!skip) {
+        addMessage({
+            actor,
+            content,
+            timestamp
+        });
+    }
+    
+    // display progress if needed
+    if (display_progress) {
+        addMessage({
+            actor: actor,
+            content: 'Working on it...',
+            timestamp
+        });
+    }
+} 

+ 81 - 0
install.sh

@@ -0,0 +1,81 @@
+#!/bin/bash
+#
+# One-shot project setup: makes sure uv is available, creates the virtual
+# environment, installs and builds the project, and creates config.yaml from
+# config_example.yaml when no config exists yet.
+
+# Change to the directory where the script is located
+cd "$(dirname "$0")" || exit 1
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+echo -e "${GREEN}Setting up the project...${NC}"
+
+# Check if uv is installed
+if ! command -v uv &> /dev/null; then
+    echo -e "${YELLOW}uv not found. Installing...${NC}"
+    # Install uv using the standalone installer
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    # The installer does not change the PATH of this running shell; add its
+    # usual install locations so the verification below can find uv.
+    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
+else
+    echo -e "${GREEN}uv is already installed${NC}"
+    # Optionally update uv
+    echo "Updating uv..."
+    uv self update
+fi
+
+# Verify uv installation
+if ! command -v uv &> /dev/null; then
+    echo -e "${RED}Failed to install uv. Please install it manually.${NC}"
+    exit 1
+fi
+
+# Initialize the project with uv (only if .venv doesn't exist)
+if [ ! -d ".venv" ]; then
+    echo -e "\n${GREEN}Creating virtual environment...${NC}"
+    if ! uv venv --python 3.10; then
+        echo -e "${RED}Failed to create the virtual environment.${NC}"
+        exit 1
+    fi
+else
+    echo -e "\n${GREEN}Virtual environment already exists${NC}"
+fi
+
+# Install dependencies from pyproject.toml
+echo -e "\n${GREEN}Installing dependencies...${NC}"
+if ! uv pip install .; then
+    echo -e "${RED}Failed to install dependencies.${NC}"
+    exit 1
+fi
+
+# Set up pycache directory before build.
+# The variable must be exported, otherwise child processes such as uv never see it.
+mkdir -p .cache/__pycache__
+export PYTHONPYCACHEPREFIX="$(pwd)/.cache/__pycache__"
+
+# Build the project
+echo -e "\n${GREEN}Building project...${NC}"
+if ! uv build; then
+    echo -e "${RED}Failed to build the project.${NC}"
+    exit 1
+fi
+
+# Configure config.yaml
+if [ ! -f "config.yaml" ] && [ -f "config_example.yaml" ]; then
+    echo -e "\n${YELLOW}Setting up configuration...${NC}"
+    cp config_example.yaml config.yaml
+
+    # Get current directory and update base_dir in config.yaml
+    CURRENT_DIR=$(pwd)
+    WORKSPACE_DIR="${CURRENT_DIR}/.nanobrowser"
+
+    # Create .nanobrowser directory if it doesn't exist
+    mkdir -p "${WORKSPACE_DIR}"
+
+    # Update base_dir in config.yaml by replacing the placeholder path from the
+    # example config (BSD sed on macOS needs an explicit empty backup suffix)
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        # macOS
+        sed -i '' "s|/Users/jason/.nanobrowser|${WORKSPACE_DIR}|" config.yaml
+    else
+        # Linux
+        sed -i "s|/Users/jason/.nanobrowser|${WORKSPACE_DIR}|" config.yaml
+    fi
+
+    echo "A default config.yaml file has been created in the current directory. Please configure it before running the application."
+fi
+
+echo -e "\n${GREEN}Setup completed!${NC}"
+echo "To run the project:"
+echo "1. Edit config.yaml with your settings, fill in the LLM api keys"
+echo "2. Make sure you have a Google Chrome browser installed"
+echo "3. Make sure you have the chrome extension installed via the developer mode in chrome"
+echo "4. Run: uv run nanobrowser"

+ 32 - 0
pyproject.toml

@@ -0,0 +1,32 @@
+[project]
+name = "nanobrowser"
+version = "0.1.0a1"
+description = "A multi-agent browser automation tool"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "click>=8.1.8",
+    "langchain>=0.3.12",
+    "langchain-anthropic>=0.3.1",
+    "langchain-openai>=0.2.14",
+    "pdfplumber>=0.11.4",
+    "playwright>=1.49.1",
+    "pydantic>=2.10.3",
+    "python-dotenv>=1.0.1",
+    "pytz>=2024.2",
+    "websockets>=14.1",
+]
+
+[project.scripts]
+nanobrowser = "nanobrowser.cli:cli"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/nanobrowser"]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+

+ 0 - 0
src/nanobrowser/__init__.py


+ 3 - 0
src/nanobrowser/cli/__init__.py

@@ -0,0 +1,3 @@
+from .main import cli
+
+__all__ = ['cli']

+ 117 - 0
src/nanobrowser/cli/main.py

@@ -0,0 +1,117 @@
+import click
+import asyncio
+from langchain.chat_models import init_chat_model
+from nanobrowser.lib.config.config import NanoConfig
+from nanobrowser.lib.agent.executor import Executor
+from nanobrowser.lib.websocket.server import start_server
+from nanobrowser.lib.utils.time_utils import generate_new_task_id
+from pathlib import Path
+
+async def _setup_executor(config: NanoConfig):
+    llm_planner = init_chat_model(
+        model=config.planner.model,
+        model_provider=config.planner.model_provider,
+        **config.planner.inference_config
+    )
+
+    llm_navigator = init_chat_model(
+        model=config.navigator.model,
+        model_provider=config.navigator.model_provider,
+        **config.navigator.inference_config
+    )
+
+    executor = Executor(
+        config.base_dir,
+        llm_planner,
+        llm_navigator,
+        chrome_app_path=config.browser.chrome_app_path,
+        chrome_cdp_port=config.browser.cdp_port,
+        max_steps=config.max_steps,
+        max_errors=config.max_errors,
+    )
+
+    try:
+        await executor.initialize()
+        return executor
+    except Exception as e:
+        print(f"Error: {e}")
+        print("Failed to initialize executor. Exiting.")
+        await executor.close()
+        return None
+    
+async def _run_command_loop(config: NanoConfig):
+    executor = await _setup_executor(config) 
+    if executor is None:
+        return
+    
+    tab_id = None
+    try:
+        print("Welcome to the interactive Multi-Agent web automation tool. Type 'quit' to exit.")
+
+        while True:
+            command = input("Enter command (type 'quit' to exit): ").strip().lower()
+            if command == "quit":
+                break
+            else:
+                task_id = generate_new_task_id()
+                await executor.run(command, task_id, tab_id=tab_id)
+    except Exception as e:
+        print(f"Error: {e}")
+    finally:
+        await executor.close()
+
+async def _run_websocket_server(config: NanoConfig):
+    executor = await _setup_executor(config)
+    if executor is None:
+        return
+    
+    try:
+        await start_server(
+            config.server.host,
+            config.server.port,
+            config.base_dir,
+            executor
+        )
+    except Exception as e:
+        print(f"Error: {e}")
+    finally:
+        await executor.close()
+
+@click.group(invoke_without_command=True)
+@click.option('-c', '--config', required=False, type=click.Path(exists=True), help='Path to config YAML file, defaults to config.yaml in current directory')
+@click.pass_context
+def cli(ctx, config):
+    """NanoBrowser CLI application"""
+    ctx.ensure_object(dict)
+    
+    # Try to find config.yaml if not specified
+    if config is None:
+        default_config = Path.cwd() / 'config.yaml'
+        if default_config.exists():
+            config = str(default_config)
+        else:
+            raise click.UsageError('No config file specified and no config.yaml found in current directory. '
+                                 'Please specify a config file using -c/--config option.')
+    
+    ctx.obj['config'] = config
+    
+    if ctx.invoked_subcommand is None:
+        ctx.invoke(serve)
+
+@cli.command()
+@click.pass_context
+def serve(ctx):
+    """Run the WebSocket server to work with chrome extension, this is the default command"""
+    # Load config from YAML
+    nano_config = NanoConfig.from_yaml(ctx.obj['config'])
+    asyncio.run(_run_websocket_server(nano_config))
+
+@cli.command()
+@click.pass_context
+def cmd(ctx):
+    """Run the interactive command loop with configuration"""
+    nano_config = NanoConfig.from_yaml(ctx.obj['config'])
+    asyncio.run(_run_command_loop(nano_config))
+
+if __name__ == '__main__':
+    cli()

+ 0 - 0
src/nanobrowser/lib/__init__.py


+ 0 - 0
src/nanobrowser/lib/agent/__init__.py


+ 6 - 0
src/nanobrowser/lib/agent/agents/__init__.py

@@ -0,0 +1,6 @@
+"""
+Collection of different agent implementations built on top of the base Agent class.
+
+This package contains various specialized agents that handle different aspects of
+the system's functionality, such as navigation and planning operations.
+"""

+ 109 - 0
src/nanobrowser/lib/agent/agents/base.py

@@ -0,0 +1,109 @@
+"""
+Base agent implementation that defines the core agent interface and common functionality.
+
+This module provides the abstract base class that all other agents should inherit from,
+along with common agent-related data structures and utilities.
+"""
+import json
+import logging
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Dict, Optional, Any, List
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.tools import BaseTool
+from pydantic import BaseModel
+from ..memory import BaseChatHistory, InMemoryChatHistory
+from ..prompts.base import BasePrompt
+from ..context import AgentContext
+from ..event.base import Event, EventData, ExecutionState
+
+logger = logging.getLogger(__name__)
+
+class AgentOutput(BaseModel):
+    """Model representing the standardized output format for all agents"""
+    # the request the agent was asked to handle
+    intent: str | BaseModel
+    # tool calls made while handling the request, if any
+    tool_calls: Optional[List[Dict[str, Any]]] = None
+    # agent-specific result; left as None when the request failed
+    result: Optional[str | BaseModel] = None
+    # error message; None on success
+    error: Optional[str] = None
+
+@dataclass
+class  BaseAgentOptions():
+    """Basic configuration options for initializing an Agent"""
+    # agent id; BaseAgent also uses it as the actor of emitted events and in
+    # the file name of saved chat histories
+    id: str
+    # Langchain chat model
+    chatLLM: BaseChatModel 
+    # agent context; when omitted, BaseAgent emits no events and saves no history
+    context: Optional[AgentContext] = None
+    # prompt builder; BaseAgent falls back to BasePrompt() when omitted
+    prompt: Optional[BasePrompt] = None
+    # message history; BaseAgent creates an InMemoryChatHistory when omitted
+    chat_history: Optional[BaseChatHistory] = None
+    
+class BaseAgent(ABC):
+    """Base class for all agents"""
+    def __init__(self, options: BaseAgentOptions):
+        self.id = options.id
+        self.chatLLM = options.chatLLM
+        self.prompt = options.prompt if options.prompt else BasePrompt()
+        self.context = options.context
+        self.tools = {}
+        # make sure message history is created
+        if options.chat_history is None:
+            self.message_history = InMemoryChatHistory()
+        else:
+            self.message_history = options.chat_history
+
+    def register_tool(self, tool_func: BaseTool):
+        logger.debug(f"Registering tool: {tool_func.name}")
+        self.tools[tool_func.name] = tool_func
+
+    def build_system_message(self) -> SystemMessage:
+        prompt = self.prompt.get_system_prompt()
+        return SystemMessage(content=prompt)
+    
+    def build_user_message(
+        self,
+        user_input: str,
+        url: Optional[str] = None,
+        title: Optional[str] = None,
+        follow_up: Optional[bool] = False,
+    ) -> HumanMessage:
+        prompt = self.prompt.build_user_prompt(user_input, url=url, title=title, follow_up=follow_up)
+        return HumanMessage(content=prompt)
+    
+    def reset(self):
+        if self.message_history:
+            self.message_history.clear()
+
+   
+    def save_chat_history(self):
+        if self.message_history and self.context:
+            messages = self.message_history.get_messages()
+            # convert to json
+            messages_json = [message.to_json() for message in messages]
+            # save to file
+            task_id = self.context.task_id
+            file_path = self.context.path_manager.messages / f"{task_id}-{self.id}.json"
+            with open(file_path, "w") as f:
+                json.dump(messages_json, f, indent=2)
+
+    def load_chat_history(self):
+        pass
+
+    async def emit_event(self, state: ExecutionState, data: EventData):
+        if self.context:
+            await self.context.event_manager.emit(Event.create(
+                state=state,
+                actor=self.id,
+                data=data
+            ))
+
+    @abstractmethod
+    async def process_request(
+        self,
+        user_input: str | BaseModel,
+        additional_params: Optional[Dict[str, Any]] = None
+    ) -> AgentOutput:
+        pass

+ 177 - 0
src/nanobrowser/lib/agent/agents/navigator.py

@@ -0,0 +1,177 @@
+"""
+Navigator agent executes the navigation plan step by step, using tools to interact with the browser
+"""
+import logging
+from copy import deepcopy
+from typing import Dict, Optional, Any
+from pydantic import BaseModel, Field
+from langchain_core.messages import ToolMessage
+from .base import BaseAgentOptions, BaseAgent, AgentOutput
+from ..prompts.navigator import NavigatorPrompt
+from ..tools import click, enter_text_and_click, entertext, bulk_enter_text, get_dom_with_content_type, openurl, press_key_combination, extract_text_from_pdf
+from ..event.base import ExecutionState, EventData
+
+logger = logging.getLogger(__name__)
+
+class NavigatorResult(BaseModel):
+    """
+    Result of the navigator agent
+    """
+    # NavigatorAgent fills this with the LLM's closing message after removing
+    # the "##TERMINATE TASK##" marker
+    final_response: str = Field(
+        description="The final response or conclusion after navigation"
+    )
+
+class NavigatorAgent(BaseAgent):
+    def __init__(self, options: BaseAgentOptions):
+        super().__init__(options)
+
+        # make sure prompt is set
+        if options.prompt is None:
+            self.prompt = NavigatorPrompt()
+        
+        # register default browser tools
+        self._register_default_tools()
+
+    def _register_default_tools(self):
+        self.register_tool(click)
+        self.register_tool(enter_text_and_click)
+        self.register_tool(entertext)
+        self.register_tool(bulk_enter_text)
+        self.register_tool(get_dom_with_content_type)
+        self.register_tool(openurl)
+        self.register_tool(press_key_combination)
+        self.register_tool(extract_text_from_pdf)
+
+    async def process_request(
+        self,
+        user_input: str,
+        additional_params: Optional[Dict[str, Any]] = None
+    ) -> AgentOutput:
+        self.context.step += 1
+        self.context.tool_round = 0
+
+        # clear message history at the beginning of every request
+        self.message_history.clear()
+        self.message_history.add_message(self.build_system_message())
+
+        # emit step started event
+        await self.emit_event(ExecutionState.STEP_START, EventData(
+                task_id=self.context.task_id,
+                step=self.context.step, 
+                details=user_input
+            )
+        )
+
+        # get current page info
+        page_info = await self.context.browser_context.get_current_page_info()
+        user_message = self.build_user_message(user_input, url=page_info.url, title=page_info.title)
+        logger.debug(f"Navigator user message: {user_message}")
+
+        self.message_history.add_message(user_message)
+
+        tools_to_use = []
+        for _, tool_func in self.tools.items():
+            tools_to_use.append(tool_func)
+
+        allowed_tool_rounds = self.context.max_tool_rounds
+        event_data = EventData(
+                task_id=self.context.task_id,
+                step=self.context.step,
+            )
+        
+        try:
+            tool_calls = []
+            final_message = ''
+
+            while allowed_tool_rounds > 0: 
+                llm_with_tools = self.chatLLM.bind_tools(tools_to_use)
+                ai_response = await llm_with_tools.ainvoke(self.message_history.get_messages())    
+                self.message_history.add_message(ai_response)
+
+                if ai_response.tool_calls:
+                    self.context.tool_round += 1
+                    event_data.tool_round = self.context.tool_round
+
+                    # execute tool calls and return tool messages back to the LLM
+                    for tool_call in ai_response.tool_calls:
+                        tool_name = tool_call["name"].lower()
+                        tool_args = tool_call["args"]
+                        selected_tool = self.tools[tool_name]
+                        logger.debug(f"Invoking tool: {selected_tool.name} with args: {tool_args}")
+
+                        # inject context into tool call
+                        tool_call_copy = deepcopy(tool_call)
+                        tool_call_copy["args"]["context"] = self.context
+
+                        tool_response = await selected_tool.ainvoke(tool_call_copy)
+                        tool_msg = ToolMessage(
+                            tool_call_id=tool_call["id"],
+                            content=tool_response,
+                        )
+                        # return the tool message to the LLM
+                        self.message_history.add_message(tool_msg)
+                        # record the tool call
+                        tool_calls.append(tool_call_copy)
+
+                    # Also send current page info to the LLM
+                    page_info = await self.context.browser_context.get_current_page_info()
+                    user_message = self.build_user_message(
+                        """
+                        Please analyze the results of the above tool calls and current web page info, check if the sub-task is complete.
+                        - If yes, return the final response.
+                        - If no, return the next tool call.
+                        """,
+                        url=page_info.url, 
+                        title=page_info.title
+                    )
+                    self.message_history.add_message(user_message)
+
+                else:
+                    # remove the termination message from the response to avoid confusion
+                    final_message = ai_response.content.replace("##TERMINATE TASK##", "")
+                    event_data.final = True
+                    break
+
+                allowed_tool_rounds -= 1
+
+            if allowed_tool_rounds == 0:
+                # emit event
+                self.context.error += 1
+                error_msg = "too many rounds of tool calls in subtask"
+                event_data.details = error_msg
+                await self.emit_event(ExecutionState.STEP_FAIL, event_data)
+                return AgentOutput(
+                    intent=user_input,
+                    tool_calls=tool_calls,
+                    result= None,
+                    error=error_msg
+                )
+            
+            # emit event
+            event_data.details = final_message
+            await self.emit_event(ExecutionState.STEP_OK, event_data)
+
+            return AgentOutput(
+                intent=user_input,
+                tool_calls=tool_calls,
+                result=NavigatorResult(final_response=final_message),
+                error=None
+            )
+        
+        except Exception as e:
+            self.context.error += 1
+            error_msg = str(e)
+            # emit event
+            event_data.details = error_msg
+            await self.emit_event(ExecutionState.STEP_FAIL, event_data)
+
+            logger.error(f"Error parsing navigator response: {e}")
+            return AgentOutput(
+                intent=user_input,
+                result=None,
+                error=error_msg
+            )
+        
+
+
+

+ 146 - 0
src/nanobrowser/lib/agent/agents/planner.py

@@ -0,0 +1,146 @@
+"""
+Planner agent generates a navigation plan for the user's request
+"""
+
+import logging
+from typing import Dict, Any, Optional
+from pydantic import BaseModel, Field
+from langchain_core.messages import AIMessage
+from .base import BaseAgentOptions, BaseAgent, AgentOutput
+from ..prompts.planner import PlannerPrompt
+from ..event.base import ExecutionState, EventData
+
+logger = logging.getLogger(__name__)
+
+class PlannerResult(BaseModel):
+    """
+    Result of the planner agent
+    """
+    # NOTE: PlannerAgent passes this model to with_structured_output(), so the
+    # docstring and the Field descriptions are part of the schema handed to
+    # the LLM. Treat them as runtime text, not as ordinary comments.
+
+    # Required flag; when it is set the planner reports final_response
+    # instead of a next step.
+    terminated: bool = Field(
+        description="Flag indicating whether the planner should terminate"
+    )
+    # Optional: the overall plan, as free text.
+    plan: Optional[str] = Field(
+        default=None,
+        description="Planned steps/actions to be executed"
+    )
+    # Optional: the single step to execute next.
+    next_step: Optional[str] = Field(
+        default=None,
+        description="The next immediate step/action to be executed"
+    )
+    # Optional: the closing answer once the plan has been carried out.
+    final_response: Optional[str] = Field(
+        default=None,
+        description="The final response or conclusion after plan execution"
+    )
+
+
+class PlannerAgent(BaseAgent):
+    """
+    Agent that asks the LLM for a PlannerResult: a plan, the next step to
+    execute, or the final response when the task is finished.
+    """
+
+    # How many times the LLM is queried before giving up on a structured answer.
+    MAX_STRUCTURED_OUTPUT_ATTEMPTS = 3
+
+    def __init__(self, options: BaseAgentOptions):
+        super().__init__(options)
+
+        # make sure prompt is set
+        if options.prompt is None:
+            self.prompt = PlannerPrompt()
+
+    async def process_request(
+        self,
+        user_input: str,
+        additional_params: Optional[Dict[str, Any]] = None
+    ) -> AgentOutput:
+        """
+        Run one planning step for ``user_input``.
+
+        The step counter of the shared context is advanced, the user message
+        (with the current page url/title) is appended to the message history
+        and the LLM is asked for a structured PlannerResult. STEP_START is
+        emitted first, then STEP_OK or STEP_FAIL depending on the outcome.
+
+        Args:
+            user_input: The task, or the feedback from the previous step.
+            additional_params: Not used by this implementation.
+
+        Returns:
+            AgentOutput with ``result`` set to the PlannerResult on success.
+            On any failure ``result`` is None and ``error`` holds the message;
+            exceptions are caught here and never propagate to the caller.
+        """
+        self.context.step += 1
+        self.context.tool_round = 0
+        follow_up = self.context.step > 1
+
+        # emit step started event
+        await self.emit_event(ExecutionState.STEP_START, EventData(
+            task_id=self.context.task_id,
+            step=self.context.step,
+            details=user_input
+        ))
+
+        try:
+            # new task arrival, add system message
+            if self.message_history.length() == 0:
+                self.message_history.add_message(self.build_system_message())
+
+            # get current page info
+            page_info = await self.context.browser_context.get_current_page_info()
+            user_message = self.build_user_message(user_input, url=page_info.url, title=page_info.title, follow_up=follow_up)
+            logger.debug(f"Planner user message: {user_message}")
+
+            self.message_history.add_message(user_message)
+
+            # The wrapper does not depend on the attempt, so build it once.
+            structured_llm = self.chatLLM.with_structured_output(PlannerResult, include_raw=True)
+
+            # sometimes LLM doesn't return the structured output, so we need to retry
+            result: Optional[PlannerResult] = None
+            for retry in range(self.MAX_STRUCTURED_OUTPUT_ATTEMPTS):
+                # ainvoke keeps the event loop free while waiting for the LLM;
+                # a plain invoke() would block every other coroutine.
+                response: dict[str, Any] = await structured_llm.ainvoke(self.message_history.get_messages())
+
+                result = response["parsed"]
+                logger.debug(f"Planner result: {result} retry: {retry}")
+                if result is not None:
+                    break
+
+            if result is None:
+                # Fail with an explicit message instead of an AttributeError
+                # on None further down; the handler below reports it.
+                raise ValueError(
+                    f"Planner did not return a structured result after {self.MAX_STRUCTURED_OUTPUT_ATTEMPTS} attempts"
+                )
+
+            result_str = result.model_dump_json(exclude_none=True)
+            self.message_history.add_message(AIMessage(content=result_str))
+
+            # emit event
+            event_data = EventData(
+                task_id=self.context.task_id,
+                step=self.context.step,
+            )
+
+            if result.terminated:
+                # final_response is optional in the schema; keep details a string
+                event_data.details = result.final_response or ""
+                event_data.final = True
+            else:
+                if result.plan is not None:
+                    event_data.plan = result.plan
+
+                # not terminated, but no next step provided
+                if result.next_step is None:
+                    self.context.error += 1
+                    error_msg = "Only plan provided, but no next step provided"
+
+                    event_data.details = error_msg
+                    await self.emit_event(ExecutionState.STEP_FAIL, event_data)
+
+                    return AgentOutput(
+                        intent=user_input,
+                        result=None,
+                        error=error_msg
+                    )
+
+                event_data.details = result.next_step
+
+            # emit event
+            await self.emit_event(ExecutionState.STEP_OK, event_data)
+
+            return AgentOutput(
+                intent=user_input,
+                result=result,
+                error=None
+            )
+        except Exception as e:
+            self.context.error += 1
+            error_msg = str(e)
+            # emit event
+            await self.emit_event(ExecutionState.STEP_FAIL, EventData(
+                task_id=self.context.task_id,
+                step=self.context.step,
+                details=error_msg
+            ))
+
+            # log detailed error
+            logger.error(f"Error parsing planner response: {e}")
+            return AgentOutput(
+                intent=user_input,
+                result=None,
+                error=error_msg
+            )
+        
+
+
+

+ 55 - 0
src/nanobrowser/lib/agent/context.py

@@ -0,0 +1,55 @@
+"""
+Commonly used types across the sub-packages of the agent package
+"""
+from dataclasses import dataclass
+from typing import Optional
+from .event.manager import EventManager
+from ..browser.context import BrowserContext
+from ..utils.path_manager import PathManager
+
+# Default limits used by AgentContext when the caller does not override them.
+# Maximum number of steps in one task execution.
+DEFAULT_MAX_STEPS = 50
+# Maximum number of errors tolerated.
+DEFAULT_MAX_ERRORS = 20
+# Maximum number of tool-call rounds within a single step.
+DEFAULT_MAX_TOOL_ROUNDS = 20
+
+@dataclass
+class AgentContext:
+    """
+    Shared, mutable state handed to the agents while they work on a task.
+
+    Intended uses:
+
+     - injected into tool actions so they can reach the browser, the file system, etc.
+     - source of the current task id, step, tool round, error count and the
+       configured limits (max steps, max tool rounds, max errors).
+    """
+    # --- collaborators: required, must be passed by the creator ---
+    path_manager: PathManager        # path manager
+    browser_context: BrowserContext  # browser context
+    event_manager: EventManager      # event manager
+
+    # --- progress of the task currently being executed ---
+    task_id: Optional[str] = None    # current task id
+    step: int = 0                    # current step in task execution
+    tool_round: int = 0              # current round of tool calls in step execution
+    error: int = 0                   # current error count
+
+    # --- limits ---
+    max_steps: int = DEFAULT_MAX_STEPS              # max steps allowed in task execution
+    max_tool_rounds: int = DEFAULT_MAX_TOOL_ROUNDS  # max rounds of tool calls allowed in one step
+    max_errors: int = DEFAULT_MAX_ERRORS            # max errors allowed
+
+
+class Actors:
+    """
+    Actors in the agent system.
+
+    Plain string constants; the executor uses them as agent ids and as the
+    ``actor`` value of the events it emits.
+    """
+    MANAGER = "manager"     # Manager is a virtual actor that represents the agent service
+    PLANNER = "planner"     # Planner is the agent that plans the task
+    NAVIGATOR = "navigator" # Navigator is the agent that navigates the browser
+    EVALUATOR = "evaluator" # Evaluator is the agent that evaluates the step result
+    VALIDATOR = "validator" # Validator is the agent that validates the final result
+    USER = "user"           # User is the actor that interacts with the agent
+

+ 13 - 0
src/nanobrowser/lib/agent/event/__init__.py

@@ -0,0 +1,13 @@
+from .base import ExecutionState, EventType, Event, EventData, EventCallback
+from .manager import EventManager
+from .logging_subscriber import TaskEventLogger
+
+# Names re-exported as the public API of the event package.
+__all__ = [
+    'ExecutionState',
+    'EventType',
+    'Event',
+    'EventData',
+    'EventManager',
+    'TaskEventLogger',
+    'EventCallback'
+] 

+ 85 - 0
src/nanobrowser/lib/agent/event/base.py

@@ -0,0 +1,85 @@
+from pydantic import BaseModel
+from enum import Enum
+from typing import Optional, Callable, Coroutine
+from ...utils.time_utils import get_current_timestamp_str
+
+class EventType(Enum):
+    """
+    Type of events that can be subscribed to.
+
+    For now, only execution events are supported.
+    """
+    # Default type given to events built with Event.create.
+    EXECUTION = "execution"
+
+class ExecutionState(Enum):
+    """States representing different phases in the execution lifecycle.
+    
+    Format: <SCOPE>.<STATUS>
+    Scopes: task, step, act
+    Statuses: start, ok, fail, cancel
+    
+    Examples:
+        TASK_OK = "task.ok"  # Task completed successfully
+        STEP_FAIL = "step.fail"  # Step failed
+        ACT_START = "act.start"  # Action started
+    """
+    # Task level states
+    TASK_START = "task.start"
+    TASK_OK = "task.ok"
+    TASK_FAIL = "task.fail"
+    TASK_CANCEL = "task.cancel"
+
+    # Step level states
+    STEP_START = "step.start"
+    STEP_OK = "step.ok"
+    STEP_FAIL = "step.fail"
+    STEP_CANCEL = "step.cancel"
+
+    # Action/Tool level states (no cancel state is defined at this level)
+    ACT_START = "act.start"
+    ACT_OK = "act.ok"
+    ACT_FAIL = "act.fail"
+
+class EventData(BaseModel):
+    """Data associated with an event"""
+    # task_id is the id of the task the event belongs to
+    task_id: str
+    # step is the step number of the task where the event occurred
+    step: int
+    # tool_round is the round of the tool call used to execute the step
+    tool_round: int = 0
+    # details is the content of the event; producers may assign it after
+    # construction, so consumers should not rely on it always being a str
+    details: str = ""
+    # final is True if the event is the final response from the actor
+    final: Optional[bool] = None 
+    # plan is present if planner made/revised a plan for the task at the step
+    plan: Optional[str] = None
+    # tool is the tool name used to execute the action step
+    tool: Optional[str] = None
+
+
+class Event(BaseModel):
+    """
+    Represents a state change event in the task execution system.
+    Each event has a type, a specific state that changed,
+    the actor that triggered the change, and associated data.
+    """
+    type: EventType
+    state: ExecutionState
+    actor: str
+    data: EventData
+    # creation time, already formatted as a string
+    timestamp: str
+
+    @classmethod
+    def create(cls, state: ExecutionState, actor: str, data: EventData,
+               timestamp: Optional[str] = None, type: EventType = EventType.EXECUTION) -> 'Event':
+        """
+        Build an event, stamping it with the current time when none is given.
+
+        Args:
+            state: The execution state being reported.
+            actor: Name of the actor that triggered the change.
+            data: Payload of the event.
+            timestamp: Timestamp string to use; when omitted (or empty) the
+                value of get_current_timestamp_str() is used instead.
+            type: Event type, EventType.EXECUTION by default.
+
+        Returns:
+            The newly created Event.
+        """
+        ts_str = timestamp or get_current_timestamp_str()
+        return cls(
+            type=type,
+            state=state,
+            actor=actor,
+            data=data,
+            timestamp=ts_str
+        )
+
+# The type of callback for event subscribers: it is called with the Event
+# and must return a coroutine, which the emitter then awaits.
+EventCallback = Callable[[Event], Coroutine]

+ 25 - 0
src/nanobrowser/lib/agent/event/logging_subscriber.py

@@ -0,0 +1,25 @@
+import logging
+from .base import Event
+
+logger = logging.getLogger(__name__)
+
+class TaskEventLogger:
+    """Event subscriber that writes each event to this module's logger."""
+
+    @staticmethod
+    async def handle_event(event: Event):
+        """
+        Format ``event`` as a single line and log it at INFO level.
+
+        The line contains the task id, step, tool round, state and actor,
+        followed by the details and, when set, the final flag, the plan and
+        the tool name, separated by " | ".
+
+        Args:
+            event: The event to log.
+        """
+        data = event.data
+        # Build log message parts
+        base_msg = f"TASK[{data.task_id}]: {data.step}:{data.tool_round}: {event.state.value} from {event.actor}"
+
+        # details can be assigned after the model was built (pydantic does not
+        # validate attribute assignment by default), so it may be None or a
+        # non-string here; str.join() would raise TypeError on such a value.
+        raw_details = data.details
+        details = ["" if raw_details is None else str(raw_details)]
+
+        # Add optional components only if they have values
+        # if it's final
+        final = getattr(data, 'final', None)
+        if final:
+            details.append(f"final: {final}")
+        # if it has a plan
+        plan = getattr(data, 'plan', None)
+        if plan:
+            details.append(f"plan: {plan}")
+        # if it has a tool
+        tool = getattr(data, 'tool', None)
+        if tool:
+            details.append(f"tool: {tool}")
+
+        log_msg = f"{base_msg} - {' | '.join(details)}"
+        logger.info(log_msg)

+ 26 - 0
src/nanobrowser/lib/agent/event/manager.py

@@ -0,0 +1,26 @@
+import asyncio
+from typing import Dict, List
+from .base import EventType, Event, EventCallback
+
+class EventManager:
+    """Keeps the subscribers of each event type and fans emitted events out to them."""
+
+    def __init__(self):
+        # event type -> callbacks, in subscription order
+        self._subscribers: Dict[EventType, List[EventCallback]] = {}
+
+    def subscribe(self, event_type: EventType, callback: EventCallback):
+        """Register ``callback`` for ``event_type`` unless that very object is already registered."""
+        registered = self._subscribers.setdefault(event_type, [])
+        for existing in registered:
+            # identity, not equality, decides whether it is a duplicate
+            if existing is callback:
+                return
+        registered.append(callback)
+
+    def unsubscribe(self, event_type: EventType, callback: EventCallback):
+        """Remove ``callback`` (matched by identity) from the subscribers of ``event_type``."""
+        registered = self._subscribers.get(event_type)
+        if registered is None:
+            return
+        # Use identity comparison instead of equality
+        self._subscribers[event_type] = [cb for cb in registered if cb is not callback]
+
+    async def emit(self, event: Event):
+        """Call every subscriber of ``event.type`` with ``event`` and await them together."""
+        if event.type not in self._subscribers:
+            return
+        pending = [notify(event) for notify in self._subscribers[event.type]]
+        await asyncio.gather(*pending)

+ 254 - 0
src/nanobrowser/lib/agent/executor.py

@@ -0,0 +1,254 @@
+from dataclasses import dataclass
+from typing import Optional, Union
+from pathlib import Path
+from asyncio import Lock
+import logging
+from pydantic import BaseModel
+from langchain_core.language_models.chat_models import BaseChatModel
+from .agents.planner import PlannerAgent, PlannerResult
+from .agents.navigator import NavigatorAgent, NavigatorResult
+from .agents.base import BaseAgentOptions
+from ..browser.manager import PlaywrightManager, PlaywrightOptions
+from .context import AgentContext, Actors
+from .event import EventManager, EventType, Event, TaskEventLogger, ExecutionState, EventData, EventCallback
+from ..utils.path_manager import PathManager
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class StepState:
+    """Outcome of one planner or navigator call, as consumed by the executor loop."""
+    # step counter value at the time of the call
+    steps: int
+    # text that was given to the agent
+    input: str
+    # what the agent produced: plain text, a result model, or nothing
+    output: Optional[Union[str, BaseModel]] = None
+    # error message when the call failed
+    error: Optional[str] = None
+    # True when the planner declared the task finished
+    terminated: bool = False
+
+class Executor:
+    """
+    Drives a task by alternating between the planner and the navigator agent.
+
+    The class is a singleton: ``__new__`` always returns the same instance and
+    ``__init__`` only configures it the first time. ``initialize()`` must be
+    awaited before ``run()``. Only one task can run at a time.
+    """
+    _instance = None
+    # guards initialization and the "current task" bookkeeping
+    _lock = Lock()
+
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            cls._instance = super(Executor, cls).__new__(cls)
+        return cls._instance
+
+    def __init__(self,
+                 base_dir: Path,
+                 llmPlanner:BaseChatModel,
+                 llmNavigator:BaseChatModel,
+                 save_chat_history: bool=True,
+                 chrome_app_path: Optional[Path]=None,
+                 chrome_cdp_port: Optional[int]=9222,
+                 log_events: Optional[bool]=True,
+                 max_steps: Optional[int]=100,
+                 max_errors: Optional[int]=20,
+                 max_tool_rounds: Optional[int]=20,
+                 ):
+        """
+        Configure the executor; nothing is started until ``initialize()``.
+
+        Args:
+            base_dir: Base directory handed to the PathManager.
+            llmPlanner: Chat model used by the planner agent.
+            llmNavigator: Chat model used by the navigator agent.
+            save_chat_history: Whether the agents' chat history is saved when a task ends.
+            chrome_app_path: Passed to PlaywrightOptions as ``chrome_app_path``.
+            chrome_cdp_port: Passed to PlaywrightOptions as ``cdp_port``.
+            log_events: When true, TaskEventLogger is subscribed to execution events.
+            max_steps: Step limit stored in the agent context (``run()`` sets its own per task).
+            max_errors: Error limit stored in the agent context.
+            max_tool_rounds: Tool-round limit stored in the agent context.
+        """
+        # Only initialize if not already initialized
+        if not hasattr(self, '_initialized'):
+            self._path_manager = PathManager(base_dir)
+            self._save_chat_history = save_chat_history
+            self._planner_options = BaseAgentOptions(id=Actors.PLANNER, chatLLM=llmPlanner)
+            self._navigator_options = BaseAgentOptions(id=Actors.NAVIGATOR, chatLLM=llmNavigator)
+
+            self._browser_context = None
+            self._agent_context = None
+            self._initialized = False
+            self._current_task_id = None
+
+            # limits forwarded to the AgentContext in initialize()
+            self._max_steps = max_steps
+            self._max_errors = max_errors
+            self._max_tool_rounds = max_tool_rounds
+
+            # create browser manager but not initialize it
+            playwright_options = PlaywrightOptions(
+                chrome_app_path=chrome_app_path,
+                cdp_port=chrome_cdp_port,
+                screenshots_dir=self._path_manager.screenshots,
+            )
+            self._browser_manager = PlaywrightManager(playwright_options)
+
+            # set up event manager
+            self._event_manager = EventManager()
+            if log_events:
+                self._event_manager.subscribe(EventType.EXECUTION, TaskEventLogger.handle_event)
+
+    async def initialize(self):
+        """Start the browser and create the agent context and both agents (runs once)."""
+        # Use lock to prevent multiple simultaneous initializations
+        async with self._lock:
+            if self._initialized:
+                return
+
+            # initialize browser
+            await self._browser_manager.async_initialize()
+            self._browser_context = await self._browser_manager.get_browser_context()
+
+            self._agent_context = AgentContext(
+                browser_context=self._browser_context,
+                event_manager=self._event_manager,
+                path_manager=self._path_manager,
+                max_steps=self._max_steps,
+                max_errors=self._max_errors,
+                max_tool_rounds=self._max_tool_rounds
+            )
+
+            # set up planner and navigator
+            self._planner_options.context = self._agent_context
+            self._navigator_options.context = self._agent_context
+            self._planner = PlannerAgent(self._planner_options)
+            self._navigator = NavigatorAgent(self._navigator_options)
+
+            self._initialized = True
+
+    async def close(self):
+        """Close the browser manager and drop the references to it and to the browser context."""
+        # NOTE(review): _initialized stays True after close(), so a later run()
+        # is not rejected up front - confirm whether reuse after close is expected.
+        if self._browser_manager is not None:
+            await self._browser_manager.close()
+            self._browser_context = None
+            self._browser_manager = None
+
+    async def run(self, task: str, task_id: str, max_steps: Optional[int] = 100, tab_id: Optional[str] = None):
+        """
+        Run a task
+        Args:
+            task (str): The task to execute
+            task_id (str): The ID of the task
+            max_steps (int): The maximum number of steps to execute; it replaces the limit stored in the agent context for this task
+            tab_id (str): The ID of the chrome tab to execute the task in. If not provided, a new tab will be opened. It's a hack to communicate with the chrome extension.
+        Raises:
+            Exception: If the executor is not initialized, or another task is still running.
+        """
+        if not self._initialized:
+            raise Exception("Executor is not initialized. Call initialize() before executing tasks.")
+
+        # Check if there's already a task running
+        async with self._lock:
+            if self._current_task_id:
+                error_message = f"Another task is currently running. Please wait for it to complete. Task ID: {self._current_task_id}"
+                # emit task failed event
+                await self._emit_event(ExecutionState.TASK_FAIL, EventData(
+                        task_id=task_id,
+                        step=0,
+                        details=error_message
+                ))
+                raise Exception(error_message)
+
+            # Initialize new task
+            self._agent_context.task_id = task_id
+            self._current_task_id = task_id
+
+        try:
+            # The task is awaited here; run() returns only once it has finished
+            await self._execute_task(task, max_steps, tab_id)
+        finally:
+            # Clear current task when done
+            async with self._lock:
+                self._current_task_id = None
+
+    async def _execute_task(self, task: str, max_steps: int, tab_id: Optional[str]):
+        """
+        Internal method to handle task execution.
+
+        Loops plan -> navigate until the planner terminates the task or the
+        step/error limit is reached. The outcome is reported through TASK_*
+        events; exceptions are caught, logged and reported as TASK_FAIL.
+        """
+        # reset agent context
+        self._agent_context.step = 0
+        self._agent_context.tool_round = 0
+        self._agent_context.error = 0
+        self._agent_context.max_steps = max_steps
+
+        # reset planner and navigator agents
+        self._planner.reset()
+        self._navigator.reset()
+
+        try:
+            if tab_id:
+                await self._browser_context.set_current_page(tab_id)
+
+            # emit task started event
+            await self._emit_event(ExecutionState.TASK_START, EventData(
+                task_id=self._agent_context.task_id,
+                step=self._agent_context.step,
+                details=task
+            ))
+
+            # execute the task
+            next_step = task
+            while True:
+                event_data = EventData(
+                    task_id=self._agent_context.task_id,
+                    step=self._agent_context.step,
+                )
+                # check if the task has reached the maximum number of steps
+                if self._agent_context.step >= self._agent_context.max_steps:
+                    event_data.details = f"Task failed with max steps reached: {self._agent_context.step}"
+                    await self._emit_event(ExecutionState.TASK_FAIL, event_data)
+                    break
+
+                if self._agent_context.error >= self._agent_context.max_errors:
+                    event_data.details = f"Task failed with max errors encountered: {self._agent_context.error}"
+                    await self._emit_event(ExecutionState.TASK_FAIL, event_data)
+                    break
+
+                # Planner makes a plan and decides the next step to be executed by Navigator
+                step_state = await self._plan(self._agent_context.step, next_step)
+                if step_state.terminated:
+                    # the planner's final response is optional; keep details a
+                    # string so subscribers that join/format it do not fail
+                    event_data.details = step_state.output or ""
+                    event_data.final = True
+                    await self._emit_event(ExecutionState.TASK_OK, event_data)
+                    break
+                elif step_state.error:
+                    # feed the error back to the planner as the next input
+                    next_step = step_state.error
+                    continue
+
+                # Extract the next step from the PlannerResult
+                next_step = step_state.output.next_step
+
+                # Navigator executes the next step
+                step_state = await self._navigate(self._agent_context.step, next_step)
+                if step_state.error:
+                    next_step = step_state.error
+                else:
+                    next_step = step_state.output
+        except Exception as e:
+            logger.error(f"Task failed with error: {e}")
+            await self._emit_event(ExecutionState.TASK_FAIL, EventData(
+                task_id=self._agent_context.task_id,
+                step=self._agent_context.step,
+                details=str(e)
+            ))
+        finally:
+            # save chat history, unless the caller opted out in the constructor
+            if self._save_chat_history:
+                self._planner.save_chat_history()
+                self._navigator.save_chat_history()
+
+    async def _emit_event(self, state: ExecutionState, data: EventData):
+        """Emit an execution event on behalf of the manager actor (no-op before initialize())."""
+        if self._agent_context:
+            await self._agent_context.event_manager.emit(Event.create(
+                state=state,
+                actor=Actors.MANAGER,
+                data=data
+            ))
+
+    async def _plan(self, steps: int, input_text: str)-> StepState:
+        """Ask the planner for the next step and wrap its answer in a StepState."""
+        logger.debug(f"Step {steps+1}: planning - {input_text}")
+        plan_response = await self._planner.process_request(input_text)
+        if plan_response.result and isinstance(plan_response.result, PlannerResult):
+            if plan_response.result.terminated:
+                return StepState(steps=steps, input=input_text, output=plan_response.result.final_response, terminated=True)
+            else:
+                next_step = plan_response.result.next_step
+                if next_step is None:
+                    return StepState(steps=steps, input=input_text, error="Planner agent did not return a next step, terminating task")
+                else:
+                    return StepState(steps=steps, input=input_text, output=plan_response.result)
+        else:
+            return StepState(steps=steps, input=input_text, error=plan_response.error)
+
+    async def _navigate(self, steps: int, task: str) -> StepState:
+        """Let the navigator execute ``task`` and wrap its answer in a StepState."""
+        logger.debug(f"Step {steps+1}: navigating - {task}")
+        navigate_response = await self._navigator.process_request(task)
+        if navigate_response.result and isinstance(navigate_response.result, NavigatorResult):
+            # the final response is passed on unchanged; nothing is stripped here
+            final_response = navigate_response.result.final_response
+            return StepState(steps=steps, input=task, output=final_response)
+        else:
+            return StepState(steps=steps, input=task, error=navigate_response.error)
+
+    async def subscribe_execution_state(self, callback: EventCallback):
+        """Subscribe to execution state changes during task execution. In the callback, you can check the execution state and take appropriate actions."""
+        self._event_manager.subscribe(EventType.EXECUTION, callback)

+ 11 - 0
src/nanobrowser/lib/agent/memory/__init__.py

@@ -0,0 +1,11 @@
+"""
+Memory module for the agent.
+"""
+from .base import BaseChatHistory
+from .in_memory_history import InMemoryChatHistory
+
+
+# Names re-exported as the public API of the memory package.
+__all__ = [
+    "BaseChatHistory",
+    "InMemoryChatHistory"
+]

+ 29 - 0
src/nanobrowser/lib/agent/memory/base.py

@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from typing import List, Callable
+from langchain_core.messages import BaseMessage
+
+class BaseChatHistory(ABC):
+    """Interface every chat-history store has to implement."""
+
+    @abstractmethod
+    def length(self) -> int:
+        """Return the number of stored messages."""
+
+    @abstractmethod
+    def clear(self):
+        """Remove all stored messages."""
+
+    @abstractmethod
+    def add_message(self, message: BaseMessage):
+        """Append ``message`` to the history."""
+
+    @abstractmethod
+    def remove_message(self, message: BaseMessage):
+        """Remove ``message`` from the history."""
+
+    @abstractmethod
+    def get_messages(self) -> List[BaseMessage]:
+        """Return the stored messages."""
+
+    @abstractmethod
+    def trim_messages(self, max_tokens: int, token_counter: Callable[[list[BaseMessage]], int]):
+        """Shrink the history to at most ``max_tokens`` as measured by ``token_counter``."""

+ 33 - 0
src/nanobrowser/lib/agent/memory/in_memory_history.py

@@ -0,0 +1,33 @@
+from typing import List, Callable
+from langchain_core.messages import BaseMessage
+from langchain_core.messages.utils import trim_messages
+from .base import BaseChatHistory
+
+class InMemoryChatHistory(BaseChatHistory):
+    """Chat history kept in a plain list owned by the instance."""
+
+    messages: List[BaseMessage]
+
+    def __init__(self) -> None:
+        # One list per instance. A class-level ``messages = []`` default is a
+        # single list shared by all instances, so a message added to one
+        # history would show up in every other one until clear() was called.
+        self.messages = []
+
+    def length(self):
+        """Return the number of stored messages."""
+        return len(self.messages)
+
+    def clear(self):
+        """Drop all messages by replacing the list with a new empty one."""
+        self.messages = []
+
+    def add_message(self, message: BaseMessage):
+        """Append ``message`` to the end of the history."""
+        self.messages.append(message)
+
+    def remove_message(self, message: BaseMessage):
+        """Remove the first occurrence of ``message``; raises ValueError if it is not stored."""
+        self.messages.remove(message)
+
+    def get_messages(self) -> List[BaseMessage]:
+        """Return the internal message list itself (not a copy)."""
+        return self.messages
+
+    def trim_messages(self, max_tokens: int, token_counter: Callable[[list[BaseMessage]], int]):
+        """
+        Replace the history with the result of langchain's ``trim_messages``.
+
+        Args:
+            max_tokens: Token budget for the trimmed history.
+            token_counter: Callable that counts the tokens of a list of messages.
+        """
+        self.messages = trim_messages(
+            self.messages,
+            max_tokens=max_tokens,
+            token_counter=token_counter,
+            # keep the most recent messages, plus the system message
+            strategy="last",
+            include_system=True,
+        )
+    
+ 

+ 9 - 0
src/nanobrowser/lib/agent/prompts/__init__.py

@@ -0,0 +1,9 @@
+from .base import BasePrompt
+from .planner import PlannerPrompt
+from .navigator import NavigatorPrompt
+
+# Names re-exported as the public API of the prompts package.
+__all__ = [
+    "BasePrompt",
+    "PlannerPrompt",
+    "NavigatorPrompt"
+]

+ 32 - 0
src/nanobrowser/lib/agent/prompts/base.py

@@ -0,0 +1,32 @@
+from typing import Optional
+from ...utils.time_utils import get_current_timestamp_str
+
+class BasePrompt:
+    """Generic prompt builder with helpers for date/time and current-page info."""
+
+    def get_system_prompt(self)->str:
+        """Return the generic system prompt, ending with the current date and time."""
+        datetime_info = self._current_datetime_info()
+        return f"you are a helpful assistant. {datetime_info}"
+
+    def build_user_prompt(
+            self,
+            user_input: str,
+            url: Optional[str] = None,
+            title: Optional[str] = None,
+            follow_up: Optional[bool] = False)->str:
+        """
+        Return ``user_input``, followed by the current-page section when a url is known.
+
+        ``follow_up`` is accepted but not used by this implementation.
+        """
+        page_section = self._current_page_info(url, title)
+        if not page_section:
+            return user_input
+        return f"{user_input} \n {page_section}"
+
+    def _current_datetime_info(self, timezone: Optional[str] = None)->str:
+        """Return a sentence stating the current timestamp for ``timezone``."""
+        timestamp = get_current_timestamp_str(timezone)
+        return f"Current date and time: {timestamp}"
+
+    def _current_page_info(self, url: Optional[str] = None, title: Optional[str] = None)->str:
+        """Describe the current page, or return "" when ``url`` is None; a falsy title is left out."""
+        if url is None:
+            return ""
+
+        title_lines = [f"- Title: {title}"] if title else []
+        return "Current page:\n" + "\n".join([f"- URL: {url}", *title_lines])

+ 47 - 0
src/nanobrowser/lib/agent/prompts/navigator.py

@@ -0,0 +1,47 @@
+from typing import Optional
+from .base import BasePrompt
+
+class NavigatorPrompt(BasePrompt):
+    """Prompt builder for the navigator agent."""
+
+    def __init__(self):
+        # System prompt text; {datetime_info} is filled in by get_system_prompt().
+        self.system_template = """
+You will perform web navigation tasks, which may include logging into websites and interacting with any web content using the functions made available to you.
+Use the provided DOM representation for element location or text summarization.
+Interact with pages using only the "mmid" attribute in DOM elements.
+You must extract mmid value from the fetched DOM, do not conjure it up.
+Execute function sequentially to avoid navigation timing issues. Once a task is completed, confirm completion with ##TERMINATE TASK##.
+The given actions are NOT parallelizable. They are intended for sequential execution.
+If you need to call multiple functions in a task step, call one function at a time. Wait for the function's response before invoking the next function. This is important to avoid collision.
+Strictly for search fields, submit the field by pressing Enter key. For other forms, click on the submit button.
+
+Unless otherwise specified, the task must be performed on the current page. Use openurl only when explicitly instructed to navigate to a new page with a url specified. If you do not know the URL ask for it.
+You will NOT provide any URLs of links on webpage. If user asks for URLs, you will instead provide the text of the hyperlink on the page and offer to click on it. This is very very important.
+When inputing information, remember to follow the format of the input field. For example, if the input field is a date field, you will enter the date in the correct format (e.g. YYYY-MM-DD), you may get clues from the placeholder text in the input field.
+if the task is ambigous or there are multiple options to choose from, you will ask the user for clarification. You will not make any assumptions.
+Individual function will reply with action success and if any changes were observed as a consequence. Adjust your approach based on this feedback.
+Once the task is completed or cannot be completed, return a short summary of the actions you performed to accomplish the task, and what worked and what did not. This should be followed by ##TERMINATE TASK##. Your reply will not contain any other information.
+Additionally, If task requires an answer, you will also provide a short and precise answer followed by ##TERMINATE TASK##.
+Ensure that user questions are answered from the DOM and not from memory or assumptions. To answer a question about textual information on the page, prefer to use text_only DOM type. To answer a question about interactive elements, use all_fields DOM type.
+Do not provide any mmid values in your response.
+
+Important: 
+- If you encounter an issues or is unsure how to proceed, simply ##TERMINATE TASK## and provide a detailed summary of the exact issue encountered.
+- Do not repeat the same action multiple times if it fails. Instead, if something did not work after a few attempts, terminate the task.
+- {datetime_info}
+
+"""
+
+    def get_system_prompt(self)->str:
+        """Return the system template with the current date and time filled in."""
+        datetime_info = self._current_datetime_info()
+        return self.system_template.format(datetime_info=datetime_info)
+
+    def build_user_prompt(self,
+                          user_input: str,
+                          url: Optional[str] = None,
+                          title: Optional[str] = None,
+                          follow_up: Optional[bool] = False)->str:
+        """
+        Return ``user_input``, followed by a blank line and the current-page
+        section when a url is known. ``follow_up`` is accepted but not used.
+        """
+        page_section = self._current_page_info(url, title)
+        if not page_section:
+            return user_input
+        return f"{user_input}\n\n{page_section}"
+ 

+ 102 - 0
src/nanobrowser/lib/agent/prompts/planner.py

@@ -0,0 +1,102 @@
+from typing import Optional
+from .base import BasePrompt
+
+class PlannerPrompt(BasePrompt):
+    def __init__(self):
+        self.system_prompt = """You are a web automation task planner. You will receive tasks from the user and will work with a naive helper to accomplish it.
+You will think step by step and break down the tasks into sequence of simple subtasks. Subtasks will be delegated to the helper to execute.
+
+Capabilities and limitation of the helper:
+1. Helper have tools to navigate to urls, interact with page elements, input text in text fields or answer any question you may have about the current page.
+2. Helper cannot perform complex planning, reasoning or analysis. You will not delegate any such tasks to helper, instead you will perform them based on information from the helper.
+3. Helper is stateless and treats each step as a new task. Helper will not remember previous pages or actions. So, you will provide all necessary information as part of each step.
+4. Very Important: Helper cannot go back to previous pages. If you need the helper to return to a previous page, you must explicitly add the URL of the previous page in the step (e.g. return to the search result page by navigating to the url https://www.google.com/search?q=Finland")
+
+Guidelines:
+1. If you know the direct URL, use it directly instead of searching for it (e.g. go to www.espn.com). Optimise the plan to avoid unnecessary steps.
+2. Do not assume any capability exists on the webpage. Ask questions to the helper to confirm the presence of features (e.g. is there a sort by price feature available on the page?). This will help you revise the plan as needed and also establish common ground with the helper.
+3. Do not combine multiple steps into one. A step should be strictly as simple as interacting with a single element or navigating to a page. If you need to interact with multiple elements or perform multiple actions, you will break it down into multiple steps.
+4. Important: You will NOT ask for any URLs of hyperlinks in the page from the helper, instead you will simply ask the helper to click on specific result. URL of the current page will be automatically provided to you with each helper response.
+5. Very Important: Add verification as part of the plan, after each step and specifically before terminating to ensure that the task is completed successfully. Ask simple questions to verify the step completion (e.g. Can you confirm that White Nothing Phone 2 with 16GB RAM is present in the cart?). Do not assume the helper has performed the task correctly.
+6. If the task requires multiple informations, all of them are equally important and should be gathered before terminating the task. You will strive to meet all the requirements of the task.
+7. If one plan fails, you MUST revise the plan and try a different approach. You will NOT terminate a task untill you are absolutely convinced that the task is impossible to accomplish.
+
+Complexities of web navigation:
+1. Many forms have mandatory fields that need to be filled up before they can be submitted. Ask the helper for what fields look mandatory.
+2. In many websites, there are multiple options to filter or sort results. Ask the helper to list any  elements on the page which will help the task (e.g. are there any links or interactive elements that may lead me to the support page?).
+3. Always keep in mind complexities such as filtering, advanced search, sorting, and other features that may be present on the website. Ask the helper whether these features are available on the page when relevant and use them when the task requires it.
+4. Very often list of items such as, search results, list of products, list of reviews, list of people etc. may be divided into multiple pages. If you need complete information, it is critical to explicitly ask the helper to go through all the pages.
+5. Sometimes search capabilities available on the page will not yield the optimal results. Revise the search query to either more specific or more generic.
+6. When a page refreshes or navigates to a new page, information entered in the previous page may be lost. Check that the information needs to be re-entered (e.g. what are the values in source and destination on the page?).
+7. Sometimes some elements may not be visible or be disabled until some other action is performed. Ask the helper to confirm if there are any other fields that may need to be interacted for elements to appear or be enabled.
+
+{datetime_info}
+
+<output_format>
+    <json_structure>
+        <attribute name="plan" optional="true">
+            High-level plan string. Required only at task start or when plan needs revision.
+        </attribute>
+        <attribute name="next_step" required="true">
+            Detailed next step string consistent with plan. Required in all responses except when terminating.
+        </attribute>
+        <attribute name="terminate" required="true">
+            Value: "yes"/"no"
+            Set to "yes" when the exact task is complete without any compromises or you are absolutely convinced that the task cannot be completed, "no" otherwise. This is mandatory for every response.
+        </attribute>
+        <attribute name="final_response" required="when-terminating">
+            Final answer string to user. Required only when terminate is "yes". In search tasks, unless explicitly stated, you will provide the single best suited result in the response instead of listing multiple options.
+            <formatting>
+                - Use pure text (no markdown/html/json unless requested)
+                - Use "\n" for section separation
+                - Prefix key findings with "- "
+                - Use numbered lists for sequential items
+            </formatting>
+        </attribute>
+    </json_structure>
+</output_format>
+
+<example>
+<task>
+Find the cheapest premium economy flights from Helsinki to Stockholm on 15 March on Skyscanner. Current page: www.google.com
+</task>
+<ideal_output>
+{{
+"plan":"1. Go to www.skyscanner.com.\\n 2. List the interaction options...",
+"next_step": "Go to https://www.skyscanner.com",
+"terminate":"no"
+}}
+</ideal_output>
+</example>
+
+In case of response to a completed task:
+<example>
+<ideal_output>
+{{
+ "terminate":"yes", 
+ "final_response": "The cheapest premium economy flight from Helsinki to Stockholm on 15 March 2025 is <flight details>."
+}}
+</ideal_output>
+</example>
+
+
+"""
+
+    def get_system_prompt(self)->str:
+        return self.system_prompt.format(datetime_info=self._current_datetime_info())
+    
+    def build_user_prompt(self, 
+                          user_input: str, 
+                          url: Optional[str] = None, 
+                          title: Optional[str] = None,
+                          follow_up: Optional[bool] = False)->str:
+        content = user_input
+        if follow_up:
+            content = f"Execute this task:\n<task>\n{user_input}\n</task>"
+
+        current_page = self._current_page_info(url, title)
+        if current_page:
+            return f"{content}\n\n{current_page}"
+        else:
+            return content
+    

+ 18 - 0
src/nanobrowser/lib/agent/prompts/validator.py

@@ -0,0 +1,18 @@
+from .base import BasePrompt
+
+class ValidatorPrompt(BasePrompt):
+    """
+    Validator prompt for the agent.
+
+    Not implemented yet.
+    """
+    def __init__(self):
+        self.system_template = """
+
+"""
+
+    def get_system_prompt(self)->str:
+        return self.system_template
+    
+    def build_user_prompt(self, user_input: str)->str:
+        return self.user_template.format(user_input=user_input)

+ 8 - 0
src/nanobrowser/lib/agent/tools/__init__.py

@@ -0,0 +1,8 @@
+from .click_using_selector import click
+from .enter_text_and_click import enter_text_and_click
+from .enter_text_using_selector import entertext, bulk_enter_text
+from .get_dom_with_content_type import get_dom_with_content_type
+from .get_url import geturl
+from .open_url import openurl
+from .press_key_combination import press_key_combination
+from .pdf_text_extractor import extract_text_from_pdf

+ 11 - 0
src/nanobrowser/lib/agent/tools/base.py

@@ -0,0 +1,11 @@
+from typing import Annotated
+from pydantic import BaseModel, Field
+from langchain_core.tools import InjectedToolArg
+from ..context import AgentContext
+
+# Base arguments schema for tools that require context, used to inject AgentContext into tool actions
+class BaseToolArgsWithContextSchema(BaseModel):
+    # Shared base for tool argument schemas: it declares the `context` argument once,
+    # so subclasses only need to add their tool-specific fields.
+    # The `InjectedToolArg` annotation marks `context` as an injected argument
+    # (presumably supplied by the caller at invocation time rather than produced by
+    # the model - confirm against the langchain_core documentation).
+    context: Annotated[AgentContext, InjectedToolArg] = Field(description="The context for the agent to interact with the browser, file system, etc.")
+
+    class Config:
+        # Pydantic setting that permits field types pydantic cannot validate natively.
+        arbitrary_types_allowed = True # AgentContext is a complex custom type

+ 242 - 0
src/nanobrowser/lib/agent/tools/click_using_selector.py

@@ -0,0 +1,242 @@
+import asyncio
+import inspect
+import traceback
+import logging
+from langchain_core.tools import tool
+from pydantic import Field
+from playwright.async_api import ElementHandle
+from playwright.async_api import Page
+from  .base import BaseToolArgsWithContextSchema
+from ...browser.dom.dom_helper import get_element_outer_html
+from ...browser.dom.dom_mutation_observer import subscribe  # type: ignore
+from ...browser.dom.dom_mutation_observer import unsubscribe  # type: ignore
+from ..context import AgentContext, Actors
+from ..event import Event, ExecutionState, EventData
+
+logger = logging.getLogger(__name__)
+
+class ClickArgsSchema(BaseToolArgsWithContextSchema):
+    # Argument schema for the `click` tool (passed to `@tool` as `args_schema`).
+    # The `context` field is inherited from BaseToolArgsWithContextSchema.
+
+    # Query selector of the element to click.
+    selector: str = Field(description="The properly formed query selector string to identify the element for the click action (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector.")
+    # Delay in seconds applied before the click logic runs; defaults to 0.0 (no wait).
+    wait_before_execution: float = Field(description="Optional wait time in seconds before executing the click event logic.", default=0.0)
+
+@tool(args_schema=ClickArgsSchema)
+async def click(context: AgentContext, selector: str, wait_before_execution: float = 0.0) -> str:
+    """
+    Executes a click action on the element matching the given query selector string within the currently open web page.
+    If there is no page open, it will raise a ValueError. An optional wait time can be specified before executing the click logic. Use this to wait for the page to load especially when the last action caused the DOM/Page to load.
+
+    Returns:
+    - Success if the click was successful, Appropropriate error message otherwise.
+    """
+    # logger.debug(f"Executing ClickElement with \"{selector}\" as the selector")
+    event_manager = context.event_manager
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_START,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="click",
+            details=f"Executing Click Element with selector: \"{selector}\""
+        )
+    ))
+
+    # Initialize PlaywrightManager and get the active browser page
+    browser_context = context.browser_context
+    page = await browser_context.get_current_page()
+
+    if page is None: # type: ignore
+        raise ValueError('No active page found. OpenURL command opens a new page.')
+
+    function_name = inspect.currentframe().f_code.co_name # type: ignore
+
+    await browser_context.take_screenshots(f"{function_name}_start", page)
+
+    await browser_context.highlight_element(selector, True)
+
+    dom_changes_detected=None
+    def detect_dom_changes(changes:str): # type: ignore
+        nonlocal dom_changes_detected
+        dom_changes_detected = changes # type: ignore
+
+    subscribe(detect_dom_changes)
+    result = await do_click(page, selector, wait_before_execution)
+    await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
+    unsubscribe(detect_dom_changes)
+    await browser_context.take_screenshots(f"{function_name}_end", page)
+    # await browser_context.notify_user(result["summary_message"], message_type=MessageType.ACTION)
+    # emit event
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_OK,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="click",
+            details=result["summary_message"]
+        )
+    ))
+
+    if dom_changes_detected:
+        return f"Success: {result['summary_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action to click {selector} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
+    return result["detailed_message"]
+
+
+async def do_click(page: Page, selector: str, wait_before_execution: float) -> dict[str, str]:
+    """
+    Executes the click action on the element with the given selector within the provided page.
+
+    Parameters:
+    - page: The Playwright page instance.
+    - selector: The query selector string to identify the element for the click action.
+    - wait_before_execution: Optional wait time in seconds before executing the click event logic.
+
+    Returns:
+    dict[str,str] - Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'.
+    """
+    logger.info(f"Executing ClickElement with \"{selector}\" as the selector. Wait time before execution: {wait_before_execution} seconds.")
+
+    # Wait before execution if specified
+    if wait_before_execution > 0:
+        await asyncio.sleep(wait_before_execution)
+
+    # Wait for the selector to be present and ensure it's attached and visible. If timeout, try javascript click
+    try:
+        logger.info(f"Executing ClickElement with \"{selector}\" as the selector. Waiting for the element to be attached and visible.")
+
+        element = await asyncio.wait_for(
+            page.wait_for_selector(selector, state="attached", timeout=2000),
+            timeout=2000
+        )
+        if element is None:
+            raise ValueError(f"Element with selector: \"{selector}\" not found")
+
+        logger.info(f"Element with selector: \"{selector}\" is attached. scrolling it into view if needed.")
+        try:
+            await element.scroll_into_view_if_needed(timeout=200)
+            logger.info(f"Element with selector: \"{selector}\" is attached and scrolled into view. Waiting for the element to be visible.")
+        except Exception:
+            # If scrollIntoView fails, just move on, not a big deal
+            pass
+
+        try:
+            await element.wait_for_element_state("visible", timeout=200)
+            logger.info(f"Executing ClickElement with \"{selector}\" as the selector. Element is attached and visibe. Clicking the element.")
+        except Exception:
+            # If the element is not visible, try to click it anyway
+            pass
+
+        element_tag_name = await element.evaluate("element => element.tagName.toLowerCase()")
+        element_outer_html = await get_element_outer_html(element, page, element_tag_name)
+
+
+        if element_tag_name == "option":
+            element_value = await element.get_attribute("value") # get the text that is in the value of the option
+            parent_element = await element.evaluate_handle("element => element.parentNode")
+            # await parent_element.evaluate(f"element => element.select_option(value=\"{element_value}\")")
+            await parent_element.select_option(value=element_value) # type: ignore
+
+            logger.info(f'Select menu option "{element_value}" selected')
+
+            return {"summary_message": f'Select menu option "{element_value}" selected',
+                    "detailed_message": f'Select menu option "{element_value}" selected. The select element\'s outer HTML is: {element_outer_html}.'}
+
+
+        #Playwright click seems to fail more often than not, disabling it for now and just going with JS click
+        #await perform_playwright_click(element, selector)
+        msg = await perform_javascript_click(page, selector)
+        return {"summary_message": msg, "detailed_message": f"{msg} The clicked element's outer HTML is: {element_outer_html}."} # type: ignore
+    except Exception as e:
+        logger.error(f"Unable to click element with selector: \"{selector}\". Error: {e}")
+        traceback.print_exc()
+        msg = f"Unable to click element with selector: \"{selector}\" since the selector is invalid. Proceed by retrieving DOM again."
+        return {"summary_message": msg, "detailed_message": f"{msg}. Error: {e}"}
+
+
+async def is_element_present(page: Page, selector: str) -> bool:
+    """
+    Checks if an element is present on the page.
+
+    Parameters:
+    - page: The Playwright page instance.
+    - selector: The query selector string to identify the element.
+
+    Returns:
+    - True if the element is present, False otherwise.
+    """
+    element = await page.query_selector(selector)
+    return element is not None
+
+
+async def perform_playwright_click(element: ElementHandle, selector: str):
+    """
+    Performs a click action on the element using Playwright's click method.
+
+    Parameters:
+    - element: The Playwright ElementHandle instance representing the element to be clicked.
+    - selector: The query selector string of the element.
+
+    Returns:
+    - None
+    """
+    logger.info(f"Performing first Step: Playwright Click on element with selector: {selector}")
+    await element.click(force=False, timeout=200)
+
+
+async def perform_javascript_click(page: Page, selector: str):
+    """
+    Performs a click action on the element using JavaScript.
+
+    Parameters:
+    - page: The Playwright page instance.
+    - selector: The query selector string of the element.
+
+    Returns:
+    - None
+    """
+    js_code = """(selector) => {
+        let element = document.querySelector(selector);
+
+        if (!element) {
+            console.log(`perform_javascript_click: Element with selector ${selector} not found`);
+            return `perform_javascript_click: Element with selector ${selector} not found`;
+        }
+
+        if (element.tagName.toLowerCase() === "option") {
+            let value = element.text;
+            let parent = element.parentElement;
+
+            parent.value = element.value; // Directly set the value if possible
+            // Trigger change event if necessary
+            let event = new Event('change', { bubbles: true });
+            parent.dispatchEvent(event);
+
+            console.log("Select menu option", value, "selected");
+            return "Select menu option: "+ value+ " selected";
+        }
+        else {
+            console.log("About to click selector", selector);
+            // If the element is a link, make it open in the same tab
+            if (element.tagName.toLowerCase() === "a") {
+                element.target = "_self";
+            }
+            let ariaExpandedBeforeClick = element.getAttribute('aria-expanded');
+            element.click();
+            let ariaExpandedAfterClick = element.getAttribute('aria-expanded');
+            if (ariaExpandedBeforeClick === 'false' && ariaExpandedAfterClick === 'true') {
+                return "Executed JavaScript Click on element with selector: "+selector +". Very important: As a consequence a menu has appeared where you may need to make further selction. Very important: Get all_fields DOM to complete the action.";
+            }
+            return "Executed JavaScript Click on element with selector: "+selector;
+        }
+    }"""
+    try:
+        logger.info(f"Executing JavaScript click on element with selector: {selector}")
+        result:str = await page.evaluate(js_code, selector)
+        logger.debug(f"Executed JavaScript Click on element with selector: {selector}")
+        return result
+    except Exception as e:
+        logger.error(f"Error executing JavaScript click on element with selector: {selector}. Error: {e}")
+        traceback.print_exc()
+

+ 140 - 0
src/nanobrowser/lib/agent/tools/enter_text_and_click.py

@@ -0,0 +1,140 @@
+import asyncio
+import inspect
+import logging
+from pydantic import Field
+from .click_using_selector import do_click
+from .enter_text_using_selector import do_entertext
+from .press_key_combination import do_press_key_combination
+from langchain_core.tools import tool
+from ..context import AgentContext, Actors
+from ..event import Event, ExecutionState, EventData
+from .base import BaseToolArgsWithContextSchema 
+
+logger = logging.getLogger(__name__)
+
+class EnterTextAndClickArgsSchema(BaseToolArgsWithContextSchema):
+    # Argument schema for the `enter_text_and_click` tool (passed to `@tool` as `args_schema`).
+    # The `context` field is inherited from BaseToolArgsWithContextSchema.
+
+    # Selector of the element that receives the text.
+    text_selector: str = Field(description="The properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use mmid attribute.")
+    # Text to type into the element matched by `text_selector`.
+    text_to_enter: str = Field(description="The text that will be entered into the element specified by text_selector.")
+    # Selector of the element to click once the text has been entered.
+    click_selector: str = Field(description="The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after text entry.")
+    # Delay in seconds before the click step; defaults to 0.0 (no wait).
+    wait_before_click_execution: float = Field(description="Optional wait time in seconds before executing the click.", default=0.0)
+
+
+@tool(args_schema=EnterTextAndClickArgsSchema)
+async def enter_text_and_click(
+    context: AgentContext,
+    text_selector: str,
+    text_to_enter: str,
+    click_selector: str,
+    wait_before_click_execution: float = 0.0
+) -> str:
+    """
+    Enters text into an element and then clicks on another element.
+
+    Returns:
+    - A message indicating the success or failure of the text entry and click.
+
+    Raises:
+    - ValueError: If no active page is found. The OpenURL command opens a new page.
+
+    Example usage:
+    ```
+    await enter_text_and_click("[mmid='1234']", "Hello, World!", "[mmid='5678']", wait_before_click_execution=1.5)
+    ```
+    """
+    # logger.info(f"Entering text '{text_to_enter}' into element with selector '{text_selector}' and then clicking element with selector '{click_selector}'.")
+    event_manager = context.event_manager
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_START,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="enter_text_and_click",
+            details=f"Entering text '{text_to_enter}' into element with selector '{text_selector}'."
+        )
+    ))
+
+    # Initialize PlaywrightManager and get the active browser page
+    browser_context = context.browser_context
+    page = await browser_context.get_current_page()
+    if page is None: # type: ignore
+        logger.error("No active page found")
+        raise ValueError('No active page found. OpenURL command opens a new page.')
+
+    await browser_context.highlight_element(text_selector, True)
+
+    function_name = inspect.currentframe().f_code.co_name # type: ignore
+    await browser_context.take_screenshots(f"{function_name}_start", page)
+
+    text_entry_result = await do_entertext(context, page, text_selector, text_to_enter, use_keyboard_fill=True)
+
+    #await browser_manager.notify_user(text_entry_result["summary_message"])
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_OK,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="enter_text_and_click",
+            details=text_entry_result["summary_message"]
+        )
+    ))
+    if not text_entry_result["summary_message"].startswith("Success"):
+        await browser_context.take_screenshots(f"{function_name}_end", page)
+        return(f"Failed to enter text '{text_to_enter}' into element with selector '{text_selector}'. Check that the selctor is valid.")
+
+    result = text_entry_result
+
+    # emit event
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_START,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="enter_text_and_click",
+            details=f"Clicking element: \"{click_selector}\""
+        )
+    ))
+
+    click_result = ""
+
+    #if the text_selector is the same as the click_selector, press the Enter key instead of clicking
+    if text_selector == click_selector:
+        do_press_key_combination_result = await do_press_key_combination(browser_context, page, "Enter")
+        if do_press_key_combination_result:
+            result["detailed_message"] += f" Instead of click, pressed the Enter key successfully on element: \"{click_selector}\"."
+            # await browser_manager.notify_user(f"Pressed the Enter key successfully on element: \"{click_selector}\".", message_type=MessageType.ACTION)
+            click_result = "Pressed the Enter key successfully on element: \"{click_selector}\""
+        else:
+            result["detailed_message"] += f" Clicking the same element after entering text in it, is of no value. Tried pressing the Enter key on element \"{click_selector}\" instead of click and failed."
+            # await browser_manager.notify_user("Failed to press the Enter key on element \"{click_selector}\".", message_type=MessageType.ACTION)
+            click_result = "Failed to press the Enter key on element \"{click_selector}\""
+    else:
+        await browser_context.highlight_element(click_selector, True)
+
+        do_click_result = await do_click(page, click_selector, wait_before_click_execution)
+        result["detailed_message"] += f' {do_click_result["detailed_message"]}'
+        #await browser_manager.notify_user(do_click_result["summary_message"])
+        click_result = do_click_result["summary_message"]
+
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_OK,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="enter_text_and_click",
+            details=click_result
+        )
+    ))
+    
+    await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
+
+    await browser_context.take_screenshots(f"{function_name}_end", page)
+
+    return result["detailed_message"]

+ 251 - 0
src/nanobrowser/lib/agent/tools/enter_text_using_selector.py

@@ -0,0 +1,251 @@
+import asyncio
+import inspect
+import traceback
+import logging
+from typing import List  # noqa: UP035
+from pydantic import Field
+from playwright.async_api import Page
+from langchain_core.tools import tool
+from .press_key_combination import execute_press_key_combination
+from ...browser.dom.dom_helper import get_element_outer_html
+from ...browser.dom.dom_mutation_observer import subscribe
+from ...browser.dom.dom_mutation_observer import unsubscribe
+from ..context import AgentContext, Actors
+from ..event import Event, ExecutionState, EventData
+from .base import BaseToolArgsWithContextSchema
+
+logger = logging.getLogger(__name__)
+
+async def custom_fill_element(page: Page, selector: str, text_to_enter: str):
+    """
+    Sets the value of a DOM element to a specified text without triggering keyboard input events.
+
+    This function directly sets the 'value' property of a DOM element identified by the given CSS selector,
+    effectively changing its current value to the specified text. This approach bypasses the need for
+    simulating keyboard typing, providing a more efficient and reliable way to fill in text fields,
+    especially in automated testing scenarios where speed and accuracy are paramount.
+
+    Args:
+        page (Page): The Playwright Page object representing the browser tab in which the operation will be performed.
+        selector (str): The CSS selector string used to locate the target DOM element. The function will apply the
+                        text change to the first element that matches this selector.
+        text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten.
+
+    Example:
+        await custom_fill_element(page, '#username', 'test_user')
+
+    Note:
+        This function does not trigger input-related events (like 'input' or 'change'). If application logic
+        relies on these events being fired, additional steps may be needed to simulate them.
+    """
+    selector = f"{selector}"  # Ensures the selector is treated as a string
+    try:
+        result = await page.evaluate(
+            """(inputParams) => {
+            const selector = inputParams.selector;
+            let text_to_enter = inputParams.text_to_enter;
+            text_to_enter = text_to_enter.trim();
+            const element = document.querySelector(selector);
+            if (!element) {
+                throw new Error(`Element not found: ${selector}`);
+            }
+            element.value = text_to_enter;
+            return `Value set for ${selector}`;
+        }""",
+            {"selector": selector, "text_to_enter": text_to_enter},
+        )
+        logger.debug(f"custom_fill_element result: {result}")
+    except Exception as e:
+        logger.error(f"Error in custom_fill_element, Selector: {selector}, Text: {text_to_enter}. Error: {str(e)}")
+        raise
+
+
+# Argument schema passed as `args_schema` to the `entertext` tool defined below.
+# It extends the shared base schema (presumably the part that carries `context` - confirm in .base)
+# with the two string arguments that `entertext` accepts.
+class EnterTextArgsSchema(BaseToolArgsWithContextSchema):
+    # Selector of the target element; the description steers callers toward mmid-based selectors.
+    query_selector: str = Field(description="The valid DOM selector query, for example [mmid='1234'], where the text will be entered. Use mmid attribute.")
+    # Text that is written into the element matched by query_selector.
+    text_to_enter: str = Field(description="The text that will be entered into the element specified by query_selector.")
+
+@tool(args_schema=EnterTextArgsSchema)
+async def entertext(context: AgentContext, query_selector: str, text_to_enter: str) -> str:
+    """
+    Enters text into a DOM element identified by a CSS selector.
+
+    This function enters the specified text into a DOM element identified by the given CSS selector.
+    It uses the Playwright library to interact with the browser and perform the text entry operation.
+    The function supports both direct setting of the 'value' property and simulating keyboard typing.
+
+    Returns:
+        str: Explanation of the outcome of this operation.
+
+    Example:
+        result = await entertext('#username', 'test_user')
+    """
+    # logger.info(f"Entering text: {text_to_enter} into element with selector: {query_selector}")
+
+    event_manager = context.event_manager
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_START,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="entertext",
+            details=f"Entering text: {text_to_enter} into element with selector: {query_selector}"
+        )
+    ))
+
+    # Create and use the PlaywrightManager
+    browser_context = context.browser_context
+    page = await browser_context.get_current_page()
+    if page is None: # type: ignore
+        return "Error: No active page found. OpenURL command opens a new page."
+
+    function_name = inspect.currentframe().f_code.co_name # type: ignore
+
+    await browser_context.take_screenshots(f"{function_name}_start", page)
+
+    await browser_context.highlight_element(query_selector, True)
+
+    dom_changes_detected=None
+    def detect_dom_changes(changes:str): # type: ignore
+        nonlocal dom_changes_detected
+        dom_changes_detected = changes # type: ignore
+
+    subscribe(detect_dom_changes)
+
+    await page.evaluate(
+        """
+        (selector) => {
+            const element = document.querySelector(selector);
+            if (element) {
+                element.value = '';
+            } else {
+                console.error('Element not found:', selector);
+            }
+        }
+        """,
+        query_selector,
+    )
+
+    result = await do_entertext(context, page, query_selector, text_to_enter)
+    await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
+    unsubscribe(detect_dom_changes)
+
+    await browser_context.take_screenshots(f"{function_name}_end", page)
+
+    # await browser_context.notify_user(result["summary_message"], message_type=MessageType.ACTION)
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_OK,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="entertext",
+            details=result["summary_message"]
+        )
+    ))
+    if dom_changes_detected:
+        return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
+    return result["detailed_message"]
+
+
+async def do_entertext(context: AgentContext, page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool=False):
+    """
+    Performs the text entry operation on a DOM element.
+
+    This function performs the text entry operation on a DOM element identified by the given CSS selector.
+    It applies a pulsating border effect to the element during the operation for visual feedback.
+    The function supports both direct setting of the 'value' property and simulating keyboard typing.
+
+    Args:
+        page (Page): The Playwright Page object representing the browser tab in which the operation will be performed.
+        selector (str): The CSS selector string used to locate the target DOM element.
+        text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten.
+        use_keyboard_fill (bool, optional): Determines whether to simulate keyboard typing or not.
+                                            Defaults to False.
+
+    Returns:
+        dict[str, str]: Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'.
+
+    Example:
+        result = await do_entertext(page, '#username', 'test_user')
+
+    Note:
+        - The 'use_keyboard_fill' parameter determines whether to simulate keyboard typing or not.
+        - If 'use_keyboard_fill' is set to True, the function uses the 'page.keyboard.type' method to enter the text.
+        - If 'use_keyboard_fill' is set to False, the function uses the 'custom_fill_element' method to enter the text.
+    """
+    try:
+
+        logger.debug(f"Looking for selector {selector} to enter text: {text_to_enter}")
+
+        elem = await page.query_selector(selector)
+
+        if elem is None:
+            error = f"Error: Selector {selector} not found. Unable to continue."
+            return {"summary_message": error, "detailed_message": error}
+
+        logger.info(f"Found selector {selector} to enter text")
+        element_outer_html = await get_element_outer_html(elem, page)
+
+        # TODO: remove this after testing
+        # use_keyboard_fill = False
+        if use_keyboard_fill:
+            await elem.focus()
+            await asyncio.sleep(0.1)
+            await execute_press_key_combination(context, "Control+A")
+            await asyncio.sleep(0.1)
+            await execute_press_key_combination(context, "Backspace")
+            await asyncio.sleep(0.1)
+            logger.debug(f"Focused element with selector {selector} to enter text")
+            # add a 100ms delay
+            await page.keyboard.type(text_to_enter, delay=1)
+        else:
+            await custom_fill_element(page, selector, text_to_enter)
+        await elem.focus()
+        logger.info(f"Success. Text \"{text_to_enter}\" set successfully in the element with selector {selector}")
+        success_msg = f"Success. Text \"{text_to_enter}\" set successfully in the element with selector {selector}"
+        return {"summary_message": success_msg, "detailed_message": f"{success_msg} and outer HTML: {element_outer_html}."}
+
+    except Exception as e:
+        traceback.print_exc()
+        error = f"Error entering text in selector {selector}."
+        return {"summary_message": error, "detailed_message": f"{error} Error: {e}"}
+
+
+
+# Argument schema passed as `args_schema` to the `bulk_enter_text` tool defined below.
+class BulkEnterTextArgsSchema(BaseToolArgsWithContextSchema):   
+    # Each entry is read by bulk_enter_text via the keys 'query_selector' and 'text'.
+    entries: List[dict[str, str]] = Field(description="List of entries, each containing 'query_selector' and 'text'.")
+
+@tool(args_schema=BulkEnterTextArgsSchema)
+async def bulk_enter_text(context: AgentContext, entries: List[dict[str, str]]) -> List[dict[str, str]]:
+    """
+    Enters text into multiple DOM elements using a bulk operation.
+
+    This function enters text into multiple DOM elements using a bulk operation.
+    It takes a list of dictionaries, where each dictionary contains a 'query_selector' and 'text' pair.
+    The function internally calls the 'entertext' function to perform the text entry operation for each entry.
+
+    Returns:
+        List of dictionaries, each containing 'query_selector' and the result of the operation.
+
+    Example:
+        entries = [
+            {"query_selector": "#username", "text": "test_user"},
+            {"query_selector": "#password", "text": "test_password"}
+        ]
+        results = await bulk_enter_text(entries)
+    """
+
+    results: List[dict[str, str]] = []  # noqa: UP006
+    logger.info("Executing bulk Enter Text Command")
+    for entry in entries:
+        query_selector = entry['query_selector']
+        text_to_enter = entry['text']
+        logger.info(f"Entering text: {text_to_enter} in element with selector: {query_selector}")
+        result = await entertext(context, query_selector, text_to_enter)
+
+        results.append({"query_selector": query_selector, "result": result})
+
+    return results

+ 139 - 0
src/nanobrowser/lib/agent/tools/get_dom_with_content_type.py

@@ -0,0 +1,139 @@
+import os
+import time
+import logging
+from typing import Any
+from pydantic import Field
+from playwright.async_api import Page
+from langchain_core.tools import tool
+from ...browser.dom.dom_helper import wait_for_non_loading_dom_state
+from ...browser.dom.get_detailed_accessibility_tree import do_get_accessibility_info
+from ..context import AgentContext, Actors
+from ..event import Event, ExecutionState, EventData
+from .base import BaseToolArgsWithContextSchema
+
+logger = logging.getLogger(__name__)
+
+# Argument schema passed as `args_schema` to the `get_dom_with_content_type` tool defined below.
+class GetDomWithContentTypeArgsSchema(BaseToolArgsWithContextSchema):
+    # Selects the extraction mode; the tool accepts exactly the three values listed in the
+    # description and raises ValueError for anything else.
+    content_type: str = Field(description="""The type of content to extract. Possible values are:
+                               - 'text_only': Extracts the innerText of the highest element in the document and responds with text,
+                               - 'input_fields': Extracts the text input and button elements in the dom,
+                               - 'all_fields': Extracts all the fields in the DOM and responds with a JSON object.""")
+
+
+@tool(args_schema=GetDomWithContentTypeArgsSchema)
+async def get_dom_with_content_type(context: AgentContext, content_type: str) -> dict[str, Any] | str | None:
+    """
+    Retrieves and processes the DOM of the active page in a browser instance based on the specified content type.
+    
+    Returns
+    -------
+    dict[str, Any] | str | None
+        The processed content based on the specified content type. This could be:
+        - A JSON object for 'input_fields' with just inputs.
+        - Plain text for 'text_only'.
+        - A minified DOM represented as a JSON object for 'all_fields'.
+
+    Raises
+    ------
+    ValueError
+        If an unsupported content_type is provided.
+    """
+    # NOTE(review): the docstring is presumably published by @tool as the tool description,
+    # so it is left byte-identical here. ValueError is also raised when no page is active.
+
+    # logger.info(f"Executing Get DOM Command based on content_type: {content_type}")
+    # Announce the start of the action to event subscribers.
+    event_manager = context.event_manager
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_START,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="get_dom_with_content_type",
+            details=f"Executing Get DOM Command based on content_type: {content_type}"
+        )
+    ))
+    
+    # Wall-clock start, used only for the elapsed-time log line further down.
+    start_time = time.time()
+    # Create and use the PlaywrightManager
+    browser_context = context.browser_context
+    page = await browser_context.get_current_page()
+    if page is None: # type: ignore
+        raise ValueError('No active page found. OpenURL command opens a new page.')
+
+    extracted_data = None
+    await wait_for_non_loading_dom_state(page, 2000) # wait for the DOM to be ready, non loading means external resources do not need to be loaded
+    user_success_message = ""
+    if content_type == 'all_fields':
+        user_success_message = "Fetched all the fields in the DOM"
+        extracted_data = await do_get_accessibility_info(page, only_input_fields=False, log_dir=context.path_manager.logs)
+    elif content_type == 'input_fields':
+        logger.debug('Fetching DOM for input_fields')
+        extracted_data = await do_get_accessibility_info(page, only_input_fields=True, log_dir=context.path_manager.logs)
+        # Early exit with a hint string; note that no ACT_OK event is emitted on this path.
+        if extracted_data is None:
+            return "Could not fetch input fields. Please consider trying with content_type all_fields."
+        user_success_message = "Fetched only input fields in the DOM"
+    elif content_type == 'text_only':
+        # Extract text from the body or the highest-level element
+        logger.debug('Fetching DOM for text_only')
+        text_content = await get_filtered_text_content(page)
+
+        # Keep a copy of the extracted text in the logs directory (overwritten on each call).
+        with open(os.path.join(context.path_manager.logs, 'text_only_dom.txt'), 'w',  encoding='utf-8') as f:
+            f.write(text_content)
+        extracted_data = text_content
+        user_success_message = "Fetched the text content of the DOM"
+    else:
+        raise ValueError(f"Unsupported content_type: {content_type}")
+
+    elapsed_time = time.time() - start_time
+    logger.info(f"Get DOM Command executed in {elapsed_time} seconds")
+    # await browser_manager.notify_user(user_success_message, message_type=MessageType.ACTION)
+    # Report the branch-specific success message to event subscribers.
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_OK,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="get_dom_with_content_type",
+            details=user_success_message
+        )
+    ))
+
+    return extracted_data # type: ignore
+
+
+async def get_filtered_text_content(page: Page) -> str:
+    text_content = await page.evaluate("""
+        () => {
+            // Array of query selectors to filter out
+            const selectorsToFilter = ['#agente-overlay'];
+
+            // Store the original visibility values to revert later
+            const originalStyles = [];
+
+            // Hide the elements matching the query selectors
+            selectorsToFilter.forEach(selector => {
+                const elements = document.querySelectorAll(selector);
+                elements.forEach(element => {
+                    originalStyles.push({ element: element, originalStyle: element.style.visibility });
+                    element.style.visibility = 'hidden';
+                });
+            });
+
+            // Get the text content of the page
+            let textContent = document?.body?.innerText || document?.documentElement?.innerText || "";
+
+            // Get all the alt text from images on the page
+            let altTexts = Array.from(document.querySelectorAll('img')).map(img => img.alt);
+            altTexts="Other Alt Texts in the page: " + altTexts.join(' ');
+
+            // Revert the visibility changes
+            originalStyles.forEach(entry => {
+                entry.element.style.visibility = entry.originalStyle;
+            });
+            textContent=textContent+" "+altTexts;
+            return textContent;
+        }
+    """)
+    return text_content
+

+ 39 - 0
src/nanobrowser/lib/agent/tools/get_url.py

@@ -0,0 +1,39 @@
+from langchain_core.tools import tool
+from ..context import AgentContext
+from .base import BaseToolArgsWithContextSchema
+
+
+@tool(args_schema=BaseToolArgsWithContextSchema)
+async def geturl(context: AgentContext) -> str:
+    """
+    Returns the full URL of the current page
+
+    Returns:
+    - Full URL the browser's active page.
+    """
+
+
+    try:
+        # Create and use the PlaywrightManager
+        browser_context = context.browser_context
+        page = await browser_context.get_current_page()
+
+        if not page:
+            raise ValueError('No active page found. OpenURL command opens a new page.')
+
+        await page.wait_for_load_state("domcontentloaded")
+
+        # Get the URL of the current page
+        try:
+            title = await page.title()
+            current_url = page.url
+            if len(current_url) >250:
+                current_url = current_url[:250] + "..."
+            return f"Current Page: {current_url}, Title: {title}" # type: ignore
+        except:  # noqa: E722
+            current_url = page.url
+            return f"Current Page: {current_url}"
+
+    except Exception as e:
+        raise ValueError('No active page found. OpenURL command opens a new page.') from e
+

+ 113 - 0
src/nanobrowser/lib/agent/tools/open_url.py

@@ -0,0 +1,113 @@
+import inspect
+import logging
+from pydantic import Field
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from langchain_core.tools import tool
+from ..context import AgentContext, Actors
+from ..event import Event, ExecutionState, EventData
+from .base import BaseToolArgsWithContextSchema
+
+logger = logging.getLogger(__name__)
+
+# Argument schema passed as `args_schema` to the `openurl` tool defined below.
+class OpenUrlArgsSchema(BaseToolArgsWithContextSchema):
+    # Target address; openurl still runs it through ensure_protocol before navigating.
+    url: str = Field(description="The URL to navigate to. Value must include the protocol (http:// or https://).")
+    # NOTE(review): openurl multiplies this by 1000 and passes it as the `timeout` of page.goto,
+    # i.e. it bounds the navigation itself - the description's "additional wait" wording may be stale.
+    timeout: int = Field(description="Additional wait time in seconds after initial load. Default is 3 seconds.", default=3)
+
+@tool(args_schema=OpenUrlArgsSchema)
+async def openurl(context: AgentContext, url: str, timeout: int = 3) -> str:
+    """
+    Opens a specified URL in the active browser instance. Waits for an initial load event, then waits for either
+    the 'domcontentloaded' event or a configurable timeout, whichever comes first.
+
+    Returns:
+    - URL of the new page.
+    """
+    # logger.info(f"Opening URL: {url}")
+
+    # emit event
+    event_manager = context.event_manager
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_START,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="openurl",
+            details=f"Opening URL: {url}"
+        )
+    ))
+
+    # browser_manager = PlaywrightManager(PlaywrightOptions())
+    # await browser_manager.get_browser_context()
+    browser_context = context.browser_context
+    page = await browser_context.get_current_page()
+    try:
+        url = ensure_protocol(url)
+        if page.url == url:
+            logger.info(f"Current page URL is the same as the new URL: {url}. No need to refresh.")
+            title = await page.title()
+            
+            msg = f"Page already loaded: {url}, Title: {title}"
+            # emit event
+            await event_manager.emit(Event.create(
+                state=ExecutionState.ACT_OK,
+                actor=Actors.NAVIGATOR,
+                data=EventData(
+                    task_id=context.task_id,
+                    step=context.step,
+                    tool_round=context.tool_round,
+                    tool="openurl",
+                    details=msg
+                )
+            ))
+            return msg # type: ignore
+
+        # Navigate to the URL with a short timeout to ensure the initial load starts
+        function_name = inspect.currentframe().f_code.co_name # type: ignore
+        
+        await browser_context.take_screenshots(f"{function_name}_start", page)
+
+        await page.goto(url, timeout=timeout*1000) # type: ignore
+    except PlaywrightTimeoutError as pte:
+        logger.warn(f"Initial navigation to {url} failed: {pte}. Will try to continue anyway.") # happens more often than not, but does not seem to be a problem
+    except Exception as e:
+        logger.error(f"An error occurred while opening the URL: {url}. Error: {e}")
+        import traceback
+        traceback.print_exc()
+
+    await browser_context.take_screenshots(f"{function_name}_end", page)
+
+    # await browser_context.notify_user(f"Opened URL: {url}", message_type=MessageType.ACTION)
+        # Get the page title
+    title = await page.title()
+    msg = f"Page loaded: {page.url}, Title: {title}"
+    # emit event
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_OK,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="openurl",
+            details=msg
+        )
+    ))
+    return msg # type: ignore
+
+def ensure_protocol(url: str) -> str:
+    """
+    Ensures that a URL has a protocol (http:// or https://). If it doesn't have one,
+    https:// is added by default.
+
+    Parameters:
+    - url: The URL to check and modify if necessary.
+
+    Returns:
+    - A URL string with a protocol.
+    """
+    if not url.startswith(('http://', 'https://')):
+        url = 'https://' + url  # Default to http if no protocol is specified
+        logger.info(f"Added 'https://' protocol to URL because it was missing. New URL is: {url}")
+    return url

+ 138 - 0
src/nanobrowser/lib/agent/tools/pdf_text_extractor.py

@@ -0,0 +1,138 @@
+import os
+import logging
+from pydantic import Field
+from langchain_core.tools import tool
+import httpx
+import pdfplumber
+from ..context import AgentContext, Actors
+from ..event import Event, ExecutionState, EventData
+from .base import BaseToolArgsWithContextSchema
+
+logger = logging.getLogger(__name__)
+
+# Argument schema passed as `args_schema` to the `extract_text_from_pdf` tool defined below.
+class ExtractTextFromPdfArgsSchema(BaseToolArgsWithContextSchema):
+    # Address the PDF is downloaded from before its text is extracted.
+    pdf_url: str = Field(description="The URL of the PDF file to extract text from.")   
+
+@tool(args_schema=ExtractTextFromPdfArgsSchema)
+async def extract_text_from_pdf(context: AgentContext, pdf_url: str) -> str:
+    """
+    Extract text from a PDF file.
+
+    Returns:
+    - All the text found in the PDF.
+    """
+    file_path = os.path.join(context.path_manager.temp, "downloaded_file.pdf")  # fixed file path for downloading the PDF
+
+    try:
+        event_manager = context.event_manager
+        await event_manager.emit(Event.create(
+            state=ExecutionState.ACT_START,
+            actor=Actors.NAVIGATOR,
+            data=EventData(
+                task_id=context.task_id,
+                step=context.step,
+                tool_round=context.tool_round,
+                tool="extract_text_from_pdf",
+                details=f"Extracting text from PDF: {pdf_url}"
+            )
+        ))
+
+        # Download the PDF
+        download_result = await download_pdf(pdf_url, file_path)
+        if not os.path.exists(download_result):
+            return download_result  # Return error message if download failed
+
+        # Open the PDF using pdfplumber and extract text
+        text = ""
+        with pdfplumber.open(download_result) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+        extracted_text = text.strip()
+        word_count = len(extracted_text.split())
+        # await browser_manager.notify_user(f"Extracted text from the PDF successfully. Found {word_count} words.", message_type=MessageType.ACTION)
+        await event_manager.emit(Event.create(
+            state=ExecutionState.ACT_OK,
+            actor=Actors.NAVIGATOR,
+            data=EventData(
+                task_id=context.task_id,
+                step=context.step,
+                tool_round=context.tool_round,
+                tool="extract_text_from_pdf",
+                details=f"Extracted text from the PDF successfully. Found {word_count} words."
+            )
+        ))
+        return "Text found in the PDF:\n" + extracted_text
+    except httpx.HTTPStatusError as e:
+        logger.error(f"An error occurred while downloading the PDF from {pdf_url}: {str(e)}")
+        error_message = f"An error occurred while downloading the PDF: {str(e)}"
+        await event_manager.emit(Event.create(
+            state=ExecutionState.ACT_FAIL,
+            actor=Actors.NAVIGATOR,
+            data=EventData(
+                task_id=context.task_id,
+                step=context.step,
+                tool_round=context.tool_round,
+                tool="extract_text_from_pdf",
+                details=error_message
+            )
+        ))
+        return error_message
+    except Exception as e:
+        logger.error(f"An error occurred while extracting text from the PDF that was downloaded from {pdf_url}: {str(e)}")
+        error_message = f"An error occurred while extracting text: {str(e)}"
+        await event_manager.emit(Event.create(
+            state=ExecutionState.ACT_FAIL,
+            actor=Actors.NAVIGATOR,
+            data=EventData(
+                task_id=context.task_id,
+                step=context.step,
+                tool_round=context.tool_round,
+                tool="extract_text_from_pdf",
+                details=error_message
+            )
+        ))
+        return error_message
+    finally:
+        # Cleanup: Ensure the downloaded file is removed
+        cleanup_temp_files(file_path)
+
+def cleanup_temp_files(*file_paths: str) -> None:
+    """
+    Remove the specified temporary files.
+
+    *file_paths: str - One or more file paths to be removed.
+    """
+    for file_path in file_paths:
+        if os.path.exists(file_path):
+            try:
+                os.remove(file_path)
+                logger.debug(f"Cleaned file from the filesystem: {file_path}")
+            except Exception as e:
+                logger.error(f"Failed to remove {file_path}: {str(e)}")
+        else:
+            logger.debug(f"File not found. Unable to clean it from the filesystem: {file_path}")
+
+async def download_pdf(pdf_url: str, file_path: str) -> str:
+    """
+    Download the PDF file from the given URL and save it to the specified path.
+
+    pdf_url: str - The URL of the PDF file to download.
+    file_path: str - The local path to save the downloaded PDF.
+
+    returns: str - The file path of the downloaded PDF if successful, otherwise an error message.
+    raises: Exception - If an error occurs during the download process.
+    """
+    try:
+        logger.info(f"Downloading PDF from: {pdf_url} to: {file_path}")
+        async with httpx.AsyncClient() as client:
+            response = await client.get(pdf_url)
+            response.raise_for_status()  # Ensure the request was successful
+        with open(file_path, 'wb') as pdf_file:
+            pdf_file.write(response.content)
+        return file_path
+    # except httpx.HTTPStatusError as e:
+    #     raise e
+    except Exception as e:
+        raise e

+ 138 - 0
src/nanobrowser/lib/agent/tools/press_key_combination.py

@@ -0,0 +1,138 @@
+import asyncio
+import inspect
+import logging
+from pydantic import Field
+from langchain_core.tools import tool
+from playwright.async_api import Page  # type: ignore
+from ...browser.dom.dom_mutation_observer import subscribe  # type: ignore
+from ...browser.dom.dom_mutation_observer import unsubscribe  # type: ignore
+from ..context import AgentContext, Actors
+from ...browser.context import BrowserContext
+from ..event import Event, ExecutionState, EventData
+from .base import BaseToolArgsWithContextSchema
+
+logger = logging.getLogger(__name__)
+
+# Argument schema for the `press_key_combination` tool: whatever BaseToolArgsWithContextSchema
+# declares, plus the key combination to press.
+# NOTE: documented with `#` comments rather than a class docstring so that the schema
+# generated from this model is left untouched.
+class PressKeyCombinationArgsSchema(BaseToolArgsWithContextSchema):
+    key_combination: str = Field(description="The key combination to press, use '+' as a separator for combinations. e.g., 'Control+C'.")
+
+# NOTE: langchain's @tool takes the tool description from the function docstring, so the
+# docstring below is effectively runtime text and is intentionally left unchanged.
+# NOTE(review): the docstring mentions PlaywrightManager, while the implementation obtains
+# the page from context.browser_context — confirm whether the wording should be refreshed.
+@tool(args_schema=PressKeyCombinationArgsSchema)
+async def press_key_combination(context: AgentContext, key_combination: str) -> str:
+    """
+    Presses a key combination on the current active page managed by PlaywrightManager.
+
+    This function simulates the pressing of a key or a combination of keys on the current active web page.
+    The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination.
+    For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows.
+
+    Returns:
+    str: status of the operation expressed as a string
+    """
+    # Thin tool wrapper: all the work is done by execute_press_key_combination.
+    return await execute_press_key_combination(context, key_combination)
+
+async def execute_press_key_combination(context: AgentContext, key_combination: str) -> str:
+   
+    # logger.info(f"Executing press_key_combination with key combo: {key_combination}")
+    event_manager = context.event_manager
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_START,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="press_key_combination",
+            details=f"Executing press_key_combination with key combo: {key_combination}"
+        )
+    ))
+    # Create and use the PlaywrightManager
+    browser_context = context.browser_context
+    page = await browser_context.get_current_page()
+
+    if page is None: # type: ignore
+        raise ValueError('No active page found. OpenURL command opens a new page.')
+
+    # Split the key combination if it's a combination of keys
+    keys = key_combination.split('+')
+
+    dom_changes_detected=None
+    def detect_dom_changes(changes:str): # type: ignore
+        nonlocal dom_changes_detected
+        dom_changes_detected = changes # type: ignore
+
+    subscribe(detect_dom_changes)
+    # If it's a combination, hold down the modifier keys
+    for key in keys[:-1]:  # All keys except the last one are considered modifier keys
+        await page.keyboard.down(key)
+
+    # Press the last key in the combination
+    await page.keyboard.press(keys[-1])
+
+    # Release the modifier keys
+    for key in keys[:-1]:
+        await page.keyboard.up(key)
+    await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
+    unsubscribe(detect_dom_changes)
+
+    await event_manager.emit(Event.create(
+        state=ExecutionState.ACT_OK,
+        actor=Actors.NAVIGATOR,
+        data=EventData(
+            task_id=context.task_id,
+            step=context.step,
+            tool_round=context.tool_round,
+            tool="press_key_combination",
+            details=f"Key {key_combination} executed successfully."
+        )
+    ))
+
+    if dom_changes_detected:
+        return f"Key {key_combination} executed successfully.\n As a consequence of this action, new elements have appeared in view:{dom_changes_detected}. This means that the action is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
+
+    # await browser_manager.notify_user(f"Key {key_combination} executed successfully", message_type=MessageType.ACTION)
+    return f"Key {key_combination} executed successfully"
+
+
+async def do_press_key_combination(browser_context: BrowserContext, page: Page, key_combination: str) -> bool:
+    """
+    Presses a key combination on the provided page.
+
+    This function simulates the pressing of a key or a combination of keys on a web page.
+    The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination.
+    For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows.
+
+    Parameters:
+    - browser_manager (PlaywrightManager): The PlaywrightManager instance.
+    - page (Page): The Playwright page instance.
+    - key_combination (str): The key combination to press, represented as a string. For combinations, use '+' as a separator.
+
+    Returns:
+    bool: True if success and False if failed
+    """
+
+    logger.info(f"Executing press_key_combination with key combo: {key_combination}")
+    try:
+        function_name = inspect.currentframe().f_code.co_name # type: ignore
+        await browser_context.take_screenshots(f"{function_name}_start", page)
+        # Split the key combination if it's a combination of keys
+        keys = key_combination.split('+')
+
+        # If it's a combination, hold down the modifier keys
+        for key in keys[:-1]:  # All keys except the last one are considered modifier keys
+            await page.keyboard.down(key)
+
+        # Press the last key in the combination
+        await page.keyboard.press(keys[-1])
+
+        # Release the modifier keys
+        for key in keys[:-1]:
+            await page.keyboard.up(key)
+
+    except Exception as e:
+        logger.error(f"Error executing press_key_combination \"{key_combination}\": {e}")
+        return False
+
+    await browser_context.take_screenshots(f"{function_name}_end", page)
+
+    return True
+

+ 0 - 0
src/nanobrowser/lib/browser/__init__.py


+ 163 - 0
src/nanobrowser/lib/browser/context.py

@@ -0,0 +1,163 @@
+import time
+import logging
+from dataclasses import dataclass
+from typing import Optional
+from playwright.async_api import BrowserContext as PlaywrightBrowserContext, Page
+from .dom.dom_mutation_observer import handle_navigation_for_mutation_observer, dom_mutation_change_detected
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class PageInfo:
+    """Basic information describing the current page."""
+    url: str  # URL of the current page
+    title: str  # title of the current page
+    # Screenshot of the current page; defaults to None when none is attached.
+    # NOTE(review): the string's format (file path vs. encoded image) is not visible here — confirm.
+    screenshot: Optional[str] = None
+
+@dataclass
+class BrowserContextOptions:
+    """Settings consumed by BrowserContext when it is constructed."""
+    home_page: str  # URL a newly created page is navigated to
+    screenshots_dir: str  # directory used when building screenshot file paths
+    screenshot_capture_enabled: bool  # when False, take_screenshots() returns without doing anything
+
+class BrowserContext:
+    def __init__(self, context: PlaywrightBrowserContext, options: BrowserContextOptions):
+        self._context = context
+        # the current page that is being operated on
+        self._current_page = None
+        self._home_page = options.home_page
+        self._screenshots_dir = options.screenshots_dir
+        self._screenshot_capture_enabled = options.screenshot_capture_enabled
+
+    async def set_current_page(self, nano_tab_id: str) -> Page:
+        # if there is a specific nano_tab_id, try to find the page with that id
+        if nano_tab_id is not None:
+            for page in self._context.pages:
+                if not page.is_closed() and not page.url.startswith(("chrome-extension://", "chrome://", "edge://")):
+                    id = await page.evaluate("document.body.getAttribute('data-nano-tab-id')")
+                    logger.debug(f"\tPage ID: {id}, URL: {page.url}")
+                    if id == nano_tab_id:
+                        await self._setup_handlers(page)
+                        self._current_page = page
+                        return page
+            self._current_page = None
+            logger.warning(f"Page with nano_tab_id {nano_tab_id} not found")
+
+        # if there is a current page, return it
+        if self._current_page is not None:
+            return self._current_page
+        
+        # if there is no current page, create a new one and goto home page
+        page = await self._context.new_page()
+        logger.debug(f"Creating new page: {page.url}")
+        await page.goto(self._home_page)
+        await page.bring_to_front()
+
+        await self._setup_handlers(page)
+        self._current_page = page
+        return page
+
+    async def get_current_page(self):
+        if self._current_page is None:
+            self._current_page = await self.set_current_page(None)
+        return self._current_page
+
+    async def get_current_page_info(self) -> PageInfo:
+        page = await self.get_current_page()
+        title = await page.title()
+        url = page.url
+        return PageInfo(url=url, title=title)
+    
+    async def _setup_handlers(self, page: Page):
+        # Check if handler already exists using a custom attribute
+        handler_exists = getattr(page, '_navigation_handler_added', False)
+        if not handler_exists:
+            # Add new handler only if it doesn't exist
+            logger.debug(f"Adding navigation handler on page: {page.url}")
+            page.on("domcontentloaded", handle_navigation_for_mutation_observer)
+            # Mark that we've added the handler
+            setattr(page, '_navigation_handler_added', True)
+        else:
+            logger.debug("Navigation handler already exists, skipping addition")
+
+        # Only expose the function if it hasn't been exposed yet
+        try:
+            await page.expose_function("dom_mutation_change_detected", dom_mutation_change_detected)
+        except Exception as e:
+            # Ignore errors if function is already exposed
+            if "already registered" not in str(e):
+                # only log error for now
+                logger.error(f"Error exposing function: {e}")
+
+        logger.debug(f"Navigation handler setup complete for page: {page.url}")
+    
+    async def get_current_url(self):
+        page = await self.get_current_page()
+        return page.url
+    
+    async def highlight_element(self, selector: str, add_highlight: bool):
+        try:
+            page: Page = await self.get_current_page()
+            if add_highlight:
+                # Add the 'agente-ui-automation-highlight' class to the element. This class is used to apply the fading border.
+                await page.eval_on_selector(selector, '''e => {
+                            let originalBorderStyle = e.style.border;
+                            e.classList.add('agente-ui-automation-highlight');
+                            e.addEventListener('animationend', () => {
+                                e.classList.remove('agente-ui-automation-highlight')
+                            });}''')
+                logger.debug(f"Applied pulsating border to element with selector {selector} to indicate text entry operation")
+            else:
+                # Remove the 'agente-ui-automation-highlight' class from the element.
+                await page.eval_on_selector(selector, "e => e.classList.remove('agente-ui-automation-highlight')")
+                logger.debug(f"Removed pulsating border from element with selector {selector} after text entry operation")
+        except Exception:
+            # This is not significant enough to fail the operation
+            pass
+
+    async def take_screenshots(self, name: str, page: Page|None, full_page: bool = True, include_timestamp: bool = True,
+                               load_state: str = 'domcontentloaded', take_snapshot_timeout: int = 5*1000):
+        if not self._screenshot_capture_enabled:
+            return
+        if page is None:
+            page = await self.get_current_page()
+
+        screenshot_name = name
+
+        if include_timestamp:
+            screenshot_name = f"{int(time.time_ns())}_{screenshot_name}"
+        screenshot_name += ".png"
+        screenshot_path = f"{self.get_screenshots_dir()}/{screenshot_name}"
+        try:
+            await page.wait_for_load_state(state=load_state, timeout=take_snapshot_timeout) # type: ignore
+            await page.screenshot(path=screenshot_path, full_page=full_page, timeout=take_snapshot_timeout, caret="initial", scale="device")
+            logger.debug(f"Screen shot saved to: {screenshot_path}")
+        except Exception as e:
+            logger.error(f"Failed to take screenshot and save to \"{screenshot_path}\". Error: {e}")
+
+    async def close(self):
+        try:
+            if self._current_page is not None:
+                # Wait for any pending operations to complete
+                try:
+                    await self._current_page.wait_for_load_state('load', timeout=5000)
+                except Exception:
+                    # Ignore timeout or other errors during wait
+                    pass
+                
+                current_page = self._current_page
+                # Clear reference first
+                self._current_page = None
+                # Then close the page
+                await current_page.close()
+            
+            # Handle context cleanup separately
+            if self._context is not None:
+                context = self._context
+                self._context = None
+                await context.close()
+                
+        except Exception as e:
+            logger.error(f"Error while closing browser context: {e}")
+            raise
+

+ 0 - 0
src/nanobrowser/lib/browser/dom/__init__.py


+ 43 - 0
src/nanobrowser/lib/browser/dom/dom_helper.py

@@ -0,0 +1,43 @@
+import asyncio
+import logging
+from playwright.async_api import ElementHandle, Page
+
+logger = logging.getLogger(__name__)
+
+async def wait_for_non_loading_dom_state(page: Page, max_wait_millis: int):
+    max_wait_seconds = max_wait_millis / 1000
+    end_time = asyncio.get_event_loop().time() + max_wait_seconds
+    while asyncio.get_event_loop().time() < end_time:
+        dom_state = await page.evaluate("document.readyState")
+        if dom_state != "loading":
+            logger.debug(f"DOM state is not 'loading': {dom_state}")
+            break  # Exit the loop if the DOM state is not 'loading'
+
+        await asyncio.sleep(0.05)
+
+
+async def get_element_outer_html(element: ElementHandle, page: Page, element_tag_name: str|None = None) -> str:
+    """
+    Constructs the opening tag of an HTML element along with its attributes.
+
+    Args:
+        element (ElementHandle): The element to retrieve the opening tag for.
+        page (Page): The page object associated with the element.
+        element_tag_name (str, optional): The tag name of the element. Defaults to None. If not passed, it will be retrieved from the element.
+
+    Returns:
+        str: The opening tag of the HTML element, including a select set of attributes.
+    """
+    tag_name: str = element_tag_name if element_tag_name else await page.evaluate("element => element.tagName.toLowerCase()", element)
+
+    attributes_of_interest: list[str] = ['id', 'name', 'aria-label', 'placeholder', 'href', 'src', 'aria-autocomplete', 'role', 'type',
+                                         'data-testid', 'value', 'selected', 'aria-labelledby', 'aria-describedby', 'aria-haspopup']
+    opening_tag: str = f'<{tag_name}'
+
+    for attr in attributes_of_interest:
+        value: str = await element.get_attribute(attr) # type: ignore
+        if value:
+            opening_tag += f' {attr}="{value}"'
+    opening_tag += '>'
+
+    return opening_tag

+ 98 - 0
src/nanobrowser/lib/browser/dom/dom_mutation_observer.py

@@ -0,0 +1,98 @@
+
+import asyncio
+import json
+from typing import Callable  # noqa: UP035
+
+from playwright.async_api import Page
+
+# Obtain the event loop at import time.
+# NOTE(review): `loop` is not referenced anywhere else in this module, and calling
+# asyncio.get_event_loop() without a running loop is deprecated in recent Python
+# versions — confirm nothing imports it before removing.
+loop = asyncio.get_event_loop()
+
+# Registry of callbacks that dom_mutation_change_detected() invokes when the page reports DOM changes.
+DOM_change_callback: list[Callable[[str], None]] = []
+
+def subscribe(callback: Callable[[str], None]) -> None:
+    """Register a callback to be invoked whenever DOM changes are reported by the page."""
+    DOM_change_callback.append(callback)
+
+def unsubscribe(callback: Callable[[str], None]) -> None:
+    DOM_change_callback.remove(callback)
+
+
+async def add_mutation_observer(page:Page):
+    """
+    Adds a mutation observer to the page to detect changes in the DOM.
+    When changes are detected, the observer calls the dom_mutation_change_detected function in the browser context.
+    This changes can be detected by subscribing to the dom_mutation_change_detected function by individual skills.
+
+    Current implementation only detects when a new node is added to the DOM.
+    However, in many cases, the change could be a change in the style or class of an existing node (e.g. toggle visibility of a hidden node).
+
+    a) New Elements (childList):
+    // Processes newly added DOM nodes
+    // Filters out SCRIPT, NOSCRIPT, STYLE tags
+    // Collects text content from visible elements
+
+    b) Text Changes (characterData):
+    // Monitors changes to text content
+    // Checks parent node visibility
+    // Avoids duplicate content
+    """
+
+    await page.evaluate("""
+        console.log('Adding a mutation observer for DOM changes');
+        new MutationObserver((mutationsList, observer) => {
+            let changes_detected = [];
+            for(let mutation of mutationsList) {
+                if (mutation.type === 'childList') {
+                    let allAddedNodes=mutation.addedNodes;
+                    for(let node of allAddedNodes) {
+                        if(node.tagName && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.tagName) && !node.closest('#agentDriveAutoOverlay')) {
+                            let visibility=true;
+                            let content = node.innerText.trim();
+                            if(visibility && node.innerText.trim()){
+                                if(content) {
+                                    changes_detected.push({tag: node.tagName, content: content});
+                                }
+                            }
+                        }
+                    }
+                } else if (mutation.type === 'characterData') {
+                    let node = mutation.target;
+                    if(node.parentNode && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.parentNode.tagName) && !node.parentNode.closest('#agentDriveAutoOverlay')) {
+                        let visibility=true;
+                        let content = node.data.trim();
+                        if(visibility && content && window.getComputedStyle(node.parentNode).display !== 'none'){
+                            if(content && !changes_detected.some(change => change.content.includes(content))) {
+                                changes_detected.push({tag: node.parentNode.tagName, content: content});
+                            }
+                        }
+                    }
+                }
+            }
+            if(changes_detected.length > 0) {
+                window.dom_mutation_change_detected(JSON.stringify(changes_detected));
+            }
+        }).observe(document, {subtree: true, childList: true, characterData: true});
+        """)
+
+
+async def handle_navigation_for_mutation_observer(page:Page):
+    """Install the DOM mutation observer on the given page; intended for use as a page event handler."""
+    await add_mutation_observer(page)
+
+async def dom_mutation_change_detected(changes_detected: str):
+    """
+    Detects changes in the DOM (new nodes added) and emits the event to all subscribed callbacks.
+    The changes_detected is a string in JSON formatt containing the tag and content of the new nodes added to the DOM.
+
+    e.g.  The following will be detected when autocomplete recommendations show up when one types Nelson Mandela on google search
+    [{'tag': 'SPAN', 'content': 'nelson mandela wikipedia'}, {'tag': 'SPAN', 'content': 'nelson mandela movies'}]
+    """
+    changes_detected = json.loads(changes_detected.replace('\t', '').replace('\n', ''))
+    if len(changes_detected) > 0:
+        # Emit the event to all subscribed callbacks
+        for callback in DOM_change_callback:
+            # If the callback is a coroutine function
+            if asyncio.iscoroutinefunction(callback):
+                await callback(changes_detected)
+            # If the callback is a regular function
+            else:
+                callback(changes_detected)

+ 530 - 0
src/nanobrowser/lib/browser/dom/get_detailed_accessibility_tree.py

@@ -0,0 +1,530 @@
+import json
+import os
+import re
+import logging
+import traceback
+from typing import Annotated, Optional, Any
+from pathlib import Path
+from playwright.async_api import Page
+from ..manager import PlaywrightManager, PlaywrightOptions
+
+logger = logging.getLogger(__name__)
+
+# Matches a non-empty string made up solely of digits and space characters.
+space_delimited_mmid = re.compile(r'^[\d ]+$')
+
+def is_space_delimited_mmid(s: str) -> bool:
+    """
+    Check if the given string matches the the mmid pattern of number space repeated.
+
+    Parameters:
+    - s (str): The string to check against the pattern.
+
+    Returns:
+    - bool: True if the string matches the pattern, False otherwise.
+    """
+    # Use fullmatch() to ensure the entire string matches the pattern
+    return bool(space_delimited_mmid.fullmatch(s))
+
+
+async def __inject_attributes(page: Page):
+    """
+    Injects 'mmid' and 'aria-keyshortcuts' into all DOM elements. If an element already has an 'aria-keyshortcuts',
+    it renames it to 'orig-aria-keyshortcuts' before injecting the new 'aria-keyshortcuts'
+    This will be captured in the accessibility tree and thus make it easier to reconcile the tree with the DOM.
+    'aria-keyshortcuts' is choosen because it is not widely used aria attribute.
+    """
+
+    last_mmid = await page.evaluate("""() => {
+        const allElements = document.querySelectorAll('*');
+        let id = 0;
+        allElements.forEach(element => {
+            const origAriaAttribute = element.getAttribute('aria-keyshortcuts');
+            const mmid = `${++id}`;
+            element.setAttribute('mmid', mmid);
+            element.setAttribute('aria-keyshortcuts', mmid);
+            //console.log(`Injected 'mmid'into element with tag: ${element.tagName} and mmid: ${mmid}`);
+            if (origAriaAttribute) {
+                element.setAttribute('orig-aria-keyshortcuts', origAriaAttribute);
+            }
+        });
+        return id;
+    }""")
+    logger.debug(f"Added MMID into {last_mmid} elements")
+
+
+async def __fetch_dom_info(page: Page, accessibility_tree: dict[str, Any], only_input_fields: bool):
+    """
+    Iterates over the accessibility tree, fetching additional information from the DOM based on 'mmid',
+    and constructs a new JSON structure with detailed information.
+
+    Args:
+        page (Page): The page object representing the web page.
+        accessibility_tree (dict[str, Any]): The accessibility tree JSON structure.
+        only_input_fields (bool): Flag indicating whether to include only input fields in the new JSON structure.
+
+    Returns:
+        dict[str, Any]: The pruned tree with detailed information from the DOM.
+    """
+
+    logger.debug("Reconciling the Accessibility Tree with the DOM")
+    # Define the attributes to fetch for each element
+    attributes = ['name', 'aria-label', 'placeholder', 'mmid', "id", "for", "data-testid"]
+    backup_attributes = [] #if the attributes are not found, then try to get these attributes
+    tags_to_ignore = ['head','style', 'script', 'link', 'meta', 'noscript', 'template', 'iframe', 'g', 'main', 'c-wiz','svg', 'path']
+    attributes_to_delete = ["level", "multiline", "haspopup", "id", "for"]
+    ids_to_ignore = ['agentDriveAutoOverlay']
+
+    # Recursive function to process each node in the accessibility tree
+    async def process_node(node: dict[str, Any]):
+        if 'children' in node:
+            for child in node['children']:
+                await process_node(child)
+
+        # Use 'name' attribute from the accessibility node as 'mmid'
+        mmid_temp: str = node.get('keyshortcuts') # type: ignore
+
+        # If the name has multiple mmids, take the last one
+        if(mmid_temp and is_space_delimited_mmid(mmid_temp)):
+            #TODO: consider if we should grab each of the mmids and process them separately as seperate nodes copying this node's attributes
+            mmid_temp = mmid_temp.split(' ')[-1]
+
+        #focusing on nodes with mmid, which is the attribute we inject
+        try:
+            mmid = int(mmid_temp)
+        except (ValueError, TypeError):
+            #logger.error(f"'name attribute contains \"{node.get('name')}\", which is not a valid numeric mmid. Adding node as is: {node}")
+            return node.get('name')
+
+        if node['role'] == 'menuitem':
+            return node.get('name')
+
+        if node.get('role') == 'dialog' and node.get('modal') == True:  # noqa: E712
+            node["important information"] = "This is a modal dialog. Please interact with this dialog and close it to be able to interact with the full page (e.g. by pressing the close button or selecting an option)."
+
+        if mmid:
+            # Determine if we need to fetch 'innerText' based on the absence of 'children' in the accessibility node
+            should_fetch_inner_text = 'children' not in node
+
+            js_code = """
+            (input_params) => {
+                const should_fetch_inner_text = input_params.should_fetch_inner_text;
+                const mmid = input_params.mmid;
+                const attributes = input_params.attributes;
+                const tags_to_ignore = input_params.tags_to_ignore;
+                const ids_to_ignore = input_params.ids_to_ignore;
+
+                const element = document.querySelector(`[mmid="${mmid}"]`);
+
+                if (!element) {
+                    console.log(`No element found with mmid: ${mmid}`);
+                    return null;
+                }
+
+                if (ids_to_ignore.includes(element.id)) {
+                    console.log(`Ignoring element with id: ${element.id}`, element);
+                    return null;
+                }
+                //Ignore "option" because it would have been processed with the select element
+                if (tags_to_ignore.includes(element.tagName.toLowerCase()) || element.tagName.toLowerCase() === "option") return null;
+
+                let attributes_to_values = {
+                    'tag': element.tagName.toLowerCase() // Always include the tag name
+                };
+
+                // If the element is an input, include its type as well
+                if (element.tagName.toLowerCase() === 'input') {
+                    attributes_to_values['tag_type'] = element.type; // This will capture 'checkbox', 'radio', etc.
+                }
+                else if (element.tagName.toLowerCase() === 'select') {
+                    attributes_to_values["mmid"] = element.getAttribute('mmid');
+                    attributes_to_values["role"] = "combobox";
+                    attributes_to_values["options"] = [];
+
+                    for (const option of element.options) {
+                        let option_attributes_to_values = {
+                            "mmid": option.getAttribute('mmid'),
+                            "text": option.text,
+                            "value": option.value,
+                            "selected": option.selected
+                        };
+                        attributes_to_values["options"].push(option_attributes_to_values);
+                    }
+                    return attributes_to_values;
+                }
+
+                for (const attribute of attributes) {
+                    let value = element.getAttribute(attribute);
+
+                    if(value){
+                        /*
+                        if(attribute === 'href'){
+                            value = value.split('?')[0]
+                        }
+                        */
+                        attributes_to_values[attribute] = value;
+                    }
+                }
+
+                if (should_fetch_inner_text && element.innerText) {
+                    attributes_to_values['description'] = element.innerText;
+                }
+
+                let role = element.getAttribute('role');
+                if(role==='listbox' || element.tagName.toLowerCase()=== 'ul'){
+                    let children=element.children;
+                    let filtered_children = Array.from(children).filter(child => child.getAttribute('role') === 'option');
+                    console.log("Listbox or ul found: ", filtered_children);
+                    let attributes_to_include = ['mmid', 'role', 'aria-label','value'];
+                    attributes_to_values["additional_info"]=[]
+                    for (const child of children) {
+                        let children_attributes_to_values = {};
+
+                        for (let attr of child.attributes) {
+                            // If the attribute is not in the predefined list, add it to children_attributes_to_values
+                            if (attributes_to_include.includes(attr.name)) {
+                                children_attributes_to_values[attr.name] = attr.value;
+                            }
+                        }
+
+                        attributes_to_values["additional_info"].push(children_attributes_to_values);
+                    }
+                }
+                // Check if attributes_to_values contains more than just 'name', 'role', and 'mmid'
+                const keys = Object.keys(attributes_to_values);
+                const minimalKeys = ['tag', 'mmid'];
+                const hasMoreThanMinimalKeys = keys.length > minimalKeys.length || keys.some(key => !minimalKeys.includes(key));
+
+                if (!hasMoreThanMinimalKeys) {
+                    //If there were no attributes found, then try to get the backup attributes
+                    for (const backupAttribute of input_params.backup_attributes) {
+                        let value = element.getAttribute(backupAttribute);
+                        if(value){
+                            attributes_to_values[backupAttribute] = value;
+                        }
+                    }
+
+                    //if even the backup attributes are not found, then return null, which will cause this element to be skipped
+                    if(Object.keys(attributes_to_values).length <= minimalKeys.length) {
+                        if (element.tagName.toLowerCase() === 'button') {
+                                attributes_to_values["mmid"] = element.getAttribute('mmid');
+                                attributes_to_values["role"] = "button";
+                                attributes_to_values["additional_info"] = [];
+                                let children=element.children;
+                                let attributes_to_exclude = ['width', 'height', 'path', 'class', 'viewBox', 'mmid']
+
+                                // Check if the button has no text and no attributes
+                                if (element.innerText.trim() === '') {
+
+                                    for (const child of children) {
+                                        let children_attributes_to_values = {};
+
+                                        for (let attr of child.attributes) {
+                                            // If the attribute is not in the predefined list, add it to children_attributes_to_values
+                                            if (!attributes_to_exclude.includes(attr.name)) {
+                                                children_attributes_to_values[attr.name] = attr.value;
+                                            }
+                                        }
+
+                                        attributes_to_values["additional_info"].push(children_attributes_to_values);
+                                    }
+                                    console.log("Button with no text and no attributes: ", attributes_to_values);
+                                    return attributes_to_values;
+                                }
+                        }
+
+                        return null; // Return null if only minimal keys are present
+                    }
+                }
+                return attributes_to_values;
+            }
+            """
+
+            # Fetch attributes and possibly 'innerText' from the DOM element by 'mmid'
+            element_attributes = await page.evaluate(js_code,
+                                                     {"mmid": mmid, "attributes": attributes, "backup_attributes": backup_attributes,
+                                                      "should_fetch_inner_text": should_fetch_inner_text,
+                                                      "tags_to_ignore": tags_to_ignore,
+                                                      "ids_to_ignore": ids_to_ignore})
+
+            if 'keyshortcuts' in node:
+                    del node['keyshortcuts'] #remove keyshortcuts since it is not needed
+
+            node["mmid"]=mmid
+
+            # Update the node with fetched information
+            if element_attributes:
+                node.update(element_attributes)
+
+                # check if 'name' and 'mmid' are the same
+                if node.get('name') == node.get('mmid') and node.get('role') != "textbox":
+                    del node['name']  # Remove 'name' from the node
+
+                if 'name' in node and 'description' in node and (node['name'] == node['description'] or node['name'] == node['description'].replace('\n', ' ') or node['description'].replace('\n', '') in node['name']):
+                    del node['description'] #if the name is same as description, then remove the description to avoid duplication
+
+                if 'name' in node and 'aria-label' in node and  node['aria-label'] in node['name']:
+                    del node['aria-label'] #if the name is same as the aria-label, then remove the aria-label to avoid duplication
+
+                if 'name' in node and 'text' in node and node['name'] == node['text']:
+                    del node['text'] #if the name is same as the text, then remove the text to avoid duplication
+
+                if node.get('tag') == "select": #children are not needed for select menus since "options" attriburte is already added
+                    node.pop("children", None)
+                    node.pop("role", None)
+                    node.pop("description", None)
+
+                #role and tag can have the same info. Get rid of role if it is the same as tag
+                if node.get('role') == node.get('tag'):
+                    del node['role']
+
+                # avoid duplicate aria-label
+                if node.get("aria-label") and node.get("placeholder") and node.get("aria-label") == node.get("placeholder"):
+                    del node["aria-label"]
+
+                if node.get("role") == "link":
+                    del node["role"]
+                    if node.get("description"):
+                        node["text"] = node["description"]
+                        del node["description"]
+
+                #textbox just means a text input and that is expressed well enough with the rest of the attributes returned
+                #if node.get('role') == "textbox":
+                #    del node['role']
+
+                if node.get('role') == "textbox":
+                    #get the id attribute of this field from the DOM
+                    if "id" in element_attributes and element_attributes["id"]:
+                        #find if there is an element in the DOM that has this id in aria-labelledby.
+                        js_code = """
+                        (inputParams) => {
+                            let referencingElements = [];
+                            const referencedElement = document.querySelector(`[aria-labelledby="${inputParams.aria_labelled_by_query_value}"]`);
+                            if(referencedElement) {
+                                const mmid = referencedElement.getAttribute('mmid');
+                                if (mmid) {
+                                    return {"mmid": mmid, "tag": referencedElement.tagName.toLowerCase()};
+                                }
+                            }
+                            return null;
+                        }
+                        """
+                    #textbox just means a text input and that is expressed well enough with the rest of the attributes returned
+                    #del node['role']
+
+            #remove attributes that are not needed once processing of a node is complete
+            for attribute_to_delete in attributes_to_delete:
+                if attribute_to_delete in node:
+                    node.pop(attribute_to_delete, None)
+        else:
+            logger.debug(f"No element found with mmid: {mmid}, deleting node: {node}")
+            node["marked_for_deletion_by_mm"] = True
+
+
+    # Process each node in the tree starting from the root
+    await process_node(accessibility_tree)
+
+    pruned_tree = __prune_tree(accessibility_tree, only_input_fields)
+
+    logger.debug("Reconciliation complete")
+    return pruned_tree
+
+
+async def __cleanup_dom(page: Page) -> None:
+    """
+    Undo the temporary 'aria-keyshortcuts' rewrite on every element that carries an 'mmid' attribute.
+
+    For each such element the current 'aria-keyshortcuts' attribute is removed. If the element also has a
+    non-empty 'orig-aria-keyshortcuts' attribute, its value is copied back into 'aria-keyshortcuts' and the
+    backup attribute is removed. The 'mmid' attribute itself is left in place.
+
+    Args:
+        page (Page): The page whose DOM is cleaned up.
+    """
+    logger.debug("Cleaning up the DOM's previous injections")
+    # NOTE(review): an empty-string backup is falsy in JS, so it is neither restored nor removed here.
+    await page.evaluate("""() => {
+        const allElements = document.querySelectorAll('*[mmid]');
+        allElements.forEach(element => {
+            element.removeAttribute('aria-keyshortcuts');
+            const origAriaLabel = element.getAttribute('orig-aria-keyshortcuts');
+            if (origAriaLabel) {
+                element.setAttribute('aria-keyshortcuts', origAriaLabel);
+                element.removeAttribute('orig-aria-keyshortcuts');
+            }
+        });
+    }""")
+    logger.debug("DOM cleanup complete")
+
+
+def __prune_tree(node: dict[str, Any], only_input_fields: bool) -> dict[str, Any] | None:
+    """
+    Recursively prunes a tree starting from `node`, based on pruning conditions and handling of 'unraveling'.
+
+    The function has two main jobs:
+    1. Pruning: Remove nodes that don't meet certain conditions, like being marked for deletion.
+    2. Unraveling: For nodes marked with 'marked_for_unravel_children', we replace them with their children,
+       effectively removing the node and lifting its children up a level in the tree.
+
+    This happens in place, meaning we modify the tree as we go, which is efficient but means you should
+    be cautious about modifying the tree outside this function during a prune operation.
+
+    Args:
+    - node (Dict[str, Any]): The node we're currently looking at. We'll check this node, its children,
+      and so on, recursively down the tree.
+    - only_input_fields (bool): If True, we're only interested in pruning input-related nodes (like form fields).
+      This lets you narrow the focus if, for example, you're only interested in cleaning up form-related parts
+      of a larger tree.
+
+    Returns:
+    - dict[str, Any] | None: The pruned version of `node`, or None if `node` was pruned away. When we 'unravel'
+      a node, we directly replace it with its children in the parent's list of children, so the return value
+      will be the parent, updated in place.
+
+    Notes:
+    - 'marked_for_deletion_by_mm' is our flag for nodes that should definitely be removed.
+    - Unraveling is neat for flattening the tree when a node is just a wrapper without semantic meaning.
+    - We use a while loop with manual index management to safely modify the list of children as we iterate over it.
+    """
+    if "marked_for_deletion_by_mm" in node:
+        return None
+
+    if 'children' in node:
+        i = 0
+        while i < len(node['children']):
+            child = node['children'][i]
+            if 'marked_for_unravel_children' in child:
+                # Replace the current child with its children
+                if 'children' in child:
+                    node['children'] = node['children'][:i] + child['children'] + node['children'][i+1:]
+                    i += len(child['children']) - 1  # Adjust the index for the new children
+                else:
+                    # If the node marked for unraveling has no children, remove it
+                    node['children'].pop(i)
+                    i -= 1  # Adjust the index since we removed an element
+            else:
+                # Recursively prune the child if it's not marked for unraveling
+                pruned_child = __prune_tree(child, only_input_fields)
+                if pruned_child is None:
+                    # If the child is pruned, remove it from the children list
+                    node['children'].pop(i)
+                    i -= 1  # Adjust the index since we removed an element
+                else:
+                    # Update the child with the pruned version
+                    node['children'][i] = pruned_child
+            i += 1  # Move to the next child
+
+        # After processing all children, if the children array is empty, remove it
+        if not node['children']:
+            del node['children']
+
+    # Apply existing conditions to decide if the current node should be pruned
+    return None if __should_prune_node(node, only_input_fields) else node
+
+
+def __should_prune_node(node: dict[str, Any], only_input_fields: bool):
+    """
+    Determines if a node should be pruned based on its 'role' and 'element_attributes'.
+
+    Args:
+        node (dict[str, Any]): The node to be evaluated.
+        only_input_fields (bool): Flag indicating whether only input fields should be considered.
+
+    Returns:
+        bool: True if the node should be pruned, False otherwise.
+    """
+    #If the request is for only input fields and this is not an input field, then mark the node for prunning
+    if node.get("role") != "WebArea" and only_input_fields and not (node.get("tag") in ("input", "button", "textarea") or node.get("role") == "button"):
+        return True
+
+    if node.get('role') == 'generic' and 'children' not in node and not ('name' in node and node.get('name')):  # The presence of 'children' is checked after potentially deleting it above
+        return True
+
+    if node.get('role') in ['separator', 'LineBreak']:
+        return True
+    processed_name = ""
+    if 'name' in node:
+        processed_name:str =node.get('name') # type: ignore
+        processed_name = processed_name.replace(',', '')
+        processed_name = processed_name.replace(':', '')
+        processed_name = processed_name.replace('\n', '')
+        processed_name = processed_name.strip()
+        if len(processed_name) <3:
+            processed_name = ""
+
+    #check if the node only have name and role, then delete that node
+    if len(node) == 2 and 'name' in node and 'role' in node and not (node.get('role') == "text" and processed_name != ""):
+        return True
+    return False
+
+async def get_node_dom_element(page: Page, mmid: str):
+    return await page.evaluate("""
+        (mmid) => {
+            return document.querySelector(`[mmid="${mmid}"]`);
+        }
+    """, mmid)
+
+
+async def get_element_attributes(page: Page, mmid: str, attributes: list[str]):
+    return await page.evaluate("""
+        (inputParams) => {
+            const mmid = inputParams.mmid;
+            const attributes = inputParams.attributes;
+            const element = document.querySelector(`[mmid="${mmid}"]`);
+            if (!element) return null;  // Return null if element is not found
+
+            let attrs = {};
+            for (let attr of attributes) {
+                attrs[attr] = element.getAttribute(attr);
+            }
+            return attrs;
+        }
+    """, {"mmid": mmid, "attributes": attributes})
+
+
+# NOTE(review): the Annotated description and the docstring below read like tool-description text that may be
+# consumed at runtime by an agent framework - confirm before rewording either of them.
+async def get_dom_with_accessibility_info() -> Annotated[dict[str, Any] | None, "A minified representation of the HTML DOM for the current webpage"]:
+    """
+    Retrieves, processes, and minifies the Accessibility tree of the active page in a browser instance.
+    Strictly follow the name and role tag for any interaction with the nodes.
+
+    Returns:
+    - The minified JSON content of the browser's active page.
+    """
+    logger.debug("Executing Get Accessibility Tree Command")
+    # Create and use the PlaywrightManager (constructed here with default PlaywrightOptions)
+    browser_manager = PlaywrightManager(PlaywrightOptions())
+    # NOTE(review): relies on the manager exposing `get_current_page()` - verify it exists on the imported class.
+    page = await browser_manager.get_current_page()
+    if page is None: # type: ignore
+        # Without a page there is nothing to snapshot, so fail loudly
+        raise ValueError('No active page found')
+
+    # Delegate with the defaults: all fields, no log directory
+    return await do_get_accessibility_info(page)
+
+
+async def do_get_accessibility_info(page: Page, only_input_fields: bool = False, log_dir: Optional[Path] = None):
+    """
+    Retrieves the accessibility information of a web page and saves it as JSON files.
+
+    Args:
+        page (Page): The page object representing the web page.
+        only_input_fields (bool, optional): If True, only retrieves accessibility information for input fields.
+            Defaults to False.
+
+    Returns:
+        dict[str, Any] or None: The enhanced accessibility tree as a dictionary, or None if an error occurred.
+    """
+    await __inject_attributes(page)
+    accessibility_tree: dict[str, Any] = await page.accessibility.snapshot(interesting_only=True)  # type: ignore
+    
+    if log_dir is not None:
+        with open(os.path.join(log_dir, 'json_accessibility_dom.json'), 'w',  encoding='utf-8') as f:
+            f.write(json.dumps(accessibility_tree, indent=2))
+            logger.debug("json_accessibility_dom.json saved")
+
+    await __cleanup_dom(page)
+    try:
+        enhanced_tree = await __fetch_dom_info(page, accessibility_tree, only_input_fields)
+
+        logger.debug("Enhanced Accessibility Tree ready")
+
+        if log_dir is not None:
+            with open(os.path.join(log_dir, 'json_accessibility_dom_enriched.json'), 'w',  encoding='utf-8') as f:
+                f.write(json.dumps(enhanced_tree, indent=2))
+                logger.debug("json_accessibility_dom_enriched.json saved")
+
+        return enhanced_tree
+    except Exception as e:
+        logger.error(f"Error while fetching DOM info: {e}")
+        traceback.print_exc()
+        return None

+ 176 - 0
src/nanobrowser/lib/browser/launcher.py

@@ -0,0 +1,176 @@
+# original code from 
+# https://github.com/HMaker/python-cdp/blob/master/pycdp/browser.py
+import warnings
+import os
+import signal
+import shutil
+import tempfile
+import subprocess
+import typing as t
+import asyncio
+from io import TextIOWrapper
+import logging
+
+logger = logging.getLogger(__name__)
+
+class BrowserLauncher():
+
+    def __init__(
+        self,
+        *,
+        binary: str,
+        profile: str=None,
+        keep_profile: bool=True,
+        headless: bool=False,
+        locale: str=None,
+        timezone: str=None,
+        proxy: str=None,
+        window_width: int=None,
+        window_height: int=None,
+        initial_url: str=None,
+        extensions: t.List[str]=[],
+        args: t.List[str]=None,
+        log: bool=False,
+    ):
+        self._binary = binary
+        self._headless = headless
+        self._locale = locale
+        self._timezone = timezone
+        self._proxy = proxy
+        self._window_width = window_width
+        self._window_height = window_height
+        self._extensions = extensions
+        self._initial_url = initial_url
+        self._args = args
+        self._log = log
+        self._process: subprocess.Popen = None
+        if profile is None:
+            self._keep_profile = False
+            self._profile = None
+        else:
+            self._profile = profile
+            self._keep_profile = keep_profile
+        self._logfile: TextIOWrapper = None
+
+    @property
+    def pid(self) -> int:
+        return self._process.pid
+
+    @property
+    def locale(self):
+        return self._locale
+
+    @property
+    def timezone(self):
+        return self._timezone
+
+    async def alaunch(self):
+        await asyncio.get_running_loop().run_in_executor(None, self.launch)
+
+    def launch(self):
+        if self._process is not None: raise RuntimeError('already launched')
+        if self._log:
+            self._logfile = open(f'{self.__class__.__name__.lower()}.log', 'a')
+            stdout = stderr = self._logfile
+            logger.debug('redirecting output to %s.log', self.__class__.__name__.lower())
+        else:
+            stdout = stderr = subprocess.DEVNULL
+            logger.debug('redirecting output to subprocess.DEVNULL')
+        if self._profile is None:
+            self._profile = tempfile.mkdtemp()
+            self._configure_profile()
+        cmd = self._build_launch_cmdline()
+        logger.debug('launching %s', cmd)
+        self._process = subprocess.Popen(
+            cmd,
+            env=self._build_launch_env(),
+            stdin=subprocess.PIPE,
+            stdout=stdout,
+            stderr=stderr,
+            text=True,
+            close_fds=True,
+            preexec_fn=os.setsid if os.name == 'posix' else None,
+            creationflags=subprocess.CREATE_NEW_PROCESS_GROUP if os.name == 'nt' else 0
+        )
+        try:
+            logger.debug('waiting launch finish...')
+            returncode = self._process.wait(1)
+        except subprocess.TimeoutExpired:
+            logger.debug('launch finished')
+
+    async def akill(self, timeout: float=3.0):
+        await asyncio.get_running_loop().run_in_executor(None, self.kill, timeout)
+
+    def kill(self, timeout: float=3.0):
+        if self._process is not None:
+            try:
+                if os.name == 'posix':
+                    try:
+                        os.killpg(os.getpgid(self._process.pid), signal.SIGTERM)
+                    except ProcessLookupError:
+                        logger.debug('Process already terminated')
+                else:
+                    self._process.terminate()
+                
+                try:
+                    self._process.wait(timeout)
+                except subprocess.TimeoutExpired:
+                    if os.name == 'posix':
+                        try:
+                            os.killpg(os.getpgid(self._process.pid), signal.SIGKILL)
+                        except ProcessLookupError:
+                            logger.debug('Process already terminated')
+                    else:
+                        self._process.kill()
+                    
+            finally:
+                self._process = None
+                if self._logfile is not None and not self._logfile.closed:
+                    self._logfile.close()
+                if not self._keep_profile:
+                    shutil.rmtree(self._profile, ignore_errors=True)
+
+    def _build_launch_cmdline(self) -> t.List[str]:
+        raise NotImplementedError
+
+    def _build_launch_env(self):
+        env = os.environ.copy()
+        if os.name == 'posix':
+            if self._timezone is not None:
+                env['TZ'] = self._timezone
+            if self._locale is not None:
+                env['LANGUAGE'] = self._locale
+        return env
+
+    def _configure_profile(self):
+        pass
+
+    def __del__(self):
+        if self._process is not None:
+            warnings.warn('A BrowserLauncher instance has not closed with .kill(), it will leak')
+
+
+class ChromeLauncher(BrowserLauncher):
+
+    def _build_launch_cmdline(self) -> t.List[str]:
+        cmd = [
+            self._binary,
+            f'--window-size={self._window_width},{self._window_height}' if self._window_width is not None and self._window_height is not None else '--start-maximized'
+        ]
+        if os.name == 'posix':
+            cmd.append('--enable-logging')
+            cmd.append('--v=2')
+        if self._headless:
+            cmd.append('--headless')
+            cmd.append('--disable-gpu')
+        if self._proxy is not None:
+            cmd.append(f'--proxy-server={self._proxy}')
+        if len(self._extensions) > 0:
+            cmd.append(f"--load-extension={','.join(str(path) for path in self._extensions)}")
+        if os.name == 'nt' and self._locale is not None:
+            cmd.append(f'--lang={self._locale}')
+        if self._args is not None:
+            cmd.extend(self._args)
+        if self._initial_url is not None:
+            cmd.append(self._initial_url)
+        return cmd

+ 257 - 0
src/nanobrowser/lib/browser/manager.py

@@ -0,0 +1,257 @@
+import requests
+import logging
+from dataclasses import dataclass
+from typing import Optional
+from playwright.async_api import async_playwright
+from playwright.async_api import Playwright
+from asyncio import Lock
+from .launcher import ChromeLauncher
+from .context import BrowserContext, BrowserContextOptions
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class PlaywrightOptions:
+    """Settings read by PlaywrightManager when it is first constructed."""
+    # passed through as the headless flag when playwright launches the browser itself
+    headless: bool = False
+    # directory handed to the browser context for screenshots
+    screenshots_dir: str = ""
+    # use screenshot capture to record the browser actions
+    screenshot_capture_enabled: bool = False
+    # use chrome app path to launch chrome in subprocess
+    chrome_app_path: Optional[str] = None
+    # use cdp port to connect to chrome over cdp
+    cdp_port: Optional[int] = 9222
+
+class PlaywrightManager:
+    """
+    A singleton class to manage Playwright instances and browsers.
+
+    Only Chrome and Chromium browser are supported for now.
+    """
+    _instance = None
+   
+
+    def __new__(cls, *args, **kwargs): # type: ignore
+        """
+        Ensures that only one instance of PlaywrightManager is created (singleton pattern).
+        """
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+
+    def __init__(self, options: PlaywrightOptions):
+        """
+        Initializes the PlaywrightManager with the specified browser type and headless mode.
+        Initialization occurs only once due to the singleton pattern.
+
+        Args:
+            options (PlaywrightOptions): The options for the PlaywrightManager.
+        """
+        # Only initialize if these attributes don't exist yet
+        if not hasattr(self, '_homepage'):
+            self._homepage = "https://www.google.com"
+
+            self._playwright = None # type: ignore
+            self._browser = None # type: ignore
+            self._browser_context: BrowserContext | None = None
+            self.__async_initialize_done = False
+            self.__init_lock = Lock()
+
+            self._chrome_app_path = options.chrome_app_path
+            self._cdp_port = options.cdp_port
+            self._headless = options.headless
+            self._screenshot_capture_enabled = options.screenshot_capture_enabled
+            self._screenshots_dir = options.screenshots_dir
+            
+            # use chrome launcher to launch chrome in subprocess
+            self._chrome_launcher: ChromeLauncher | None = None
+            self._user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+            # https://peter.sh/experiments/chromium-command-line-switches/
+            self._default_chrome_args = [ 
+                '--disable-infobars',
+                '--no-pings',
+                '--disable-breakpad',
+                '--disable-component-update',
+                '--disable-background-timer-throttling',
+                '--disable-popup-blocking',
+                '--disable-backgrounding-occluded-windows',
+                '--disable-renderer-backgrounding',
+                '--no-first-run',
+                '--no-default-browser-check',
+                '--disable-dev-shm-usage'
+            ]
+
+    async def async_initialize(self):
+        """
+        Asynchronously initialize necessary components and handlers for the browser context.
+        Thread-safe initialization using an asyncio Lock.
+        """
+        async with self.__init_lock:
+            if self.__async_initialize_done:
+                return
+
+            if not self._playwright:
+                self._playwright: Playwright = await async_playwright().start()
+                logger.debug("Playwright instance created..")
+            
+            await self._connect_to_browser()
+            await self.get_browser_context()
+
+            # browser is launched by playwright, navigate to homepage
+            if self._chrome_app_path is None and self._browser_context is not None:
+                await self._browser_context.set_current_page(None)
+
+            self.__async_initialize_done = True
+
+
+    async def _connect_to_browser(self):
+        """
+        Connects to a browser instance with remote debugging enabled.
+        1. if chrome app path is not provided, launch a chromium browser instance
+        2. if chrome app path is provided, try to connect to an existing browser instance with remote debugging enabled.
+        3. if not successful, try to launch a new browser instance with remote debugging enabled.
+        4. if not successful, raise an error.
+        """
+        try:
+            if self._chrome_app_path is None:
+                await self._launch_chrome_by_playwright()
+                return True
+            else:
+                cdp_enabled = await self._is_chrome_cdp_enabled()
+                if not cdp_enabled:
+                    await self._launch_chrome_in_subprocess()
+                     
+            # finally try to connect over cdp
+            self._browser = await self._playwright.chromium.connect_over_cdp(
+                f"http://localhost:{self._cdp_port}",
+            )
+            if self._browser is None:
+                raise RuntimeError("Failed to connect to chrome over CDP. Please close any existing chrome instances and try again.")
+            return True
+        except Exception:
+            raise
+
+    async def _is_chrome_cdp_enabled(self, timeout: int = 3):
+        """
+        Checks if a Chrome instance with remote debugging enabled is running.
+
+        Args:
+            timeout (int, optional): The timeout for the request in seconds. Defaults to 3.
+        """
+        try:
+            response = requests.get(f"http://localhost:{self._cdp_port}/json/version", timeout=timeout)
+            return response.status_code == 200
+        except Exception:
+            return False
+
+    async def _launch_chrome_in_subprocess(self):
+        """
+        Launches the Chrome application with remote debugging enabled in a subprocess.
+        """
+        try:
+            args = [f'--remote-debugging-port={self._cdp_port}']
+            args.extend(self._default_chrome_args)
+
+            self._chrome_launcher = ChromeLauncher(binary= self._chrome_app_path, args=args)
+            await self._chrome_launcher.alaunch()
+
+            # If a chrome instance is running before launching, the remote debugging would not be enabled.
+            cdp_enabled = await self._is_chrome_cdp_enabled()
+            if not cdp_enabled:
+                raise RuntimeError("Chrome is launched in subprocess, but remote debugging not enabled. Please close any existing chrome instances and try again.")
+        except Exception:
+            raise
+    
+    async def _launch_chrome_by_playwright(self):
+        """
+        Launches the Google Chrome browser by playwright.
+        """
+        try:
+            args= self._default_chrome_args.copy()
+            args.append('--no-sandbox')
+            args.append('--disable-blink-features=AutomationControlled')
+            if self._headless:
+                args.append('--disable-gpu')
+
+            self._browser = await self._playwright.chromium.launch(
+                channel= "chrome",
+                headless=self._headless,
+                args=args
+            )
+        except Exception as e:
+            logger.error(f"Failed to launch Chrome browser by playwright: {e}")
+            raise
+
+
+    async def create_browser_context(self):
+        try:
+            if self._browser is None:
+                raise ValueError("Browser is not initialized")
+
+            context = None
+            if len(self._browser.contexts) > 0:
+                # pretty print the browser contexts
+                logger.debug(f"Browser context already exists. Reusing it. {self._browser.contexts}")
+
+                context = self._browser.contexts[0]
+                logger.debug("Browser context already exists. Reusing it.")
+            else:
+                context = await self._browser.new_context(
+                    no_viewport=True,
+                    user_agent=self._user_agent,
+                    java_script_enabled=True
+                )
+                logger.debug("Created new browser context")
+
+            self._browser_context = BrowserContext(context, BrowserContextOptions(
+                home_page=self._homepage,
+                screenshots_dir=self._screenshots_dir,
+                screenshot_capture_enabled=self._screenshot_capture_enabled
+            ))
+            return self._browser_context
+        except Exception as e:
+            logger.error(f"Failed to create browser context: {e}")
+            raise e
+
+
+    async def get_browser_context(self):
+        """
+        Returns the existing browser context, or creates a new one if it doesn't exist.
+        """
+        if self._browser_context is None:
+            await self.create_browser_context()
+        return self._browser_context
+
+
+    async def close(self):
+        """
+        Closes and cleans up all Playwright resources.
+        This includes closing browser contexts, browser instances, and stopping the Playwright instance.
+        """
+        try:
+            if self._browser_context is not None:
+                await self._browser_context.close()
+
+            if self._browser is not None:
+                await self._browser.close()
+            
+            if self._playwright is not None:
+                await self._playwright.stop()
+                self._playwright = None
+
+            if self._chrome_launcher is not None:
+                await self._chrome_launcher.akill()
+                self._chrome_launcher = None
+
+            logger.info("Successfully closed all Playwright resources")
+        except Exception as e:
+            logger.error(f"Error while closing Playwright resources: {e}")
+            raise
+        finally:
+            self._playwright = None
+            self._browser = None
+            self._browser_context = None
+            self._chrome_launcher = None
+            self._playwright = None
+            self.__async_initialize_done = False
+

+ 0 - 0
src/nanobrowser/lib/config/__init__.py


+ 127 - 0
src/nanobrowser/lib/config/config.py

@@ -0,0 +1,127 @@
+import os
+import platform
+import yaml
+from dotenv import load_dotenv
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional, Dict, Any
+from .logging_config import configure_logging
+
+load_dotenv()
+
+@dataclass
+class AgentConfig:
+    model: str
+    model_provider: str
+    api_key: Optional[str] = None
+    inference_config: Dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self):
+        # Set api_key to None if it starts with "sk-..."
+        if isinstance(self.api_key, str) and self.api_key.startswith("sk-"):
+            self.api_key = None
+
+        # Move any unknown parameters to inference_config during initialization
+        known_fields = {'model', 'model_provider', 'api_key', 'inference_config'}
+        for k, v in list(self.__dict__.items()):
+            if k not in known_fields and not k.startswith('_'):
+                self.inference_config[k] = v
+                delattr(self, k)
+
+@dataclass
+class BrowserConfig:
+    """Browser-related settings."""
+    # Path to the Chrome executable; None when no path is configured.
+    chrome_app_path: Optional[str] = None
+    # NOTE(review): presumably the Chrome DevTools Protocol (remote debugging)
+    # port - confirm against the code that launches Chrome.
+    cdp_port: int = 9222
+
+@dataclass
+class ServerConfig:
+    """Host and port settings for the server (defaults to the loopback interface)."""
+    # Address to bind to; the default only accepts local connections.
+    host: str = "127.0.0.1"
+    # TCP port to listen on.
+    port: int = 6768
+
+@dataclass
+class NanoConfig:
+    base_dir: Path
+    save_chat_history: bool = True
+    log_events: bool = True
+    max_steps: int = 100
+    max_errors: int = 20
+    max_tool_rounds: int = 20
+    planner: AgentConfig = None
+    navigator: AgentConfig = None
+    browser: BrowserConfig = None
+    server: ServerConfig = None
+    log_level: str = "INFO"
+
+    @staticmethod
+    def try_to_find_chrome_path() -> Optional[str]:
+        """
+        Tries to find the Chrome executable path based on the operating system.
+        Returns None if Chrome is not found.
+        """
+        system = platform.system()
+        
+        if system == "Darwin":  # macOS
+            chrome_paths = [
+                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+                "~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+            ]
+            for path in chrome_paths:
+                path = os.path.expanduser(path)
+                if os.path.exists(path):
+                    return path
+                    
+        elif system == "Windows":
+            chrome_paths = [
+                os.path.join(os.environ.get("PROGRAMFILES", ""), "Google/Chrome/Application/chrome.exe"),
+                os.path.join(os.environ.get("PROGRAMFILES(X86)", ""), "Google/Chrome/Application/chrome.exe"),
+                os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google/Chrome/Application/chrome.exe")
+            ]
+            for path in chrome_paths:
+                if os.path.exists(path):
+                    return path
+                    
+        return None
+
+    @classmethod
+    def from_yaml(cls, yaml_path: str | Path) -> 'NanoConfig':
+        with open(yaml_path, 'r') as f:
+            config_dict = yaml.safe_load(f)
+        
+        # Configure logging first
+        configure_logging(level=config_dict.get('log_level', 'INFO'))
+        
+        base_dir = config_dict.get('base_dir', Path.cwd()/".nanobrowser")
+        config_dict['base_dir'] = Path(base_dir)
+        
+        # Create LLM configs
+        if 'planner' in config_dict:
+            config_dict['planner'] = AgentConfig(**config_dict['planner'])
+        if 'navigator' in config_dict:
+            config_dict['navigator'] = AgentConfig(**config_dict['navigator'])
+            
+        if 'browser' in config_dict:
+            chrome_path = None
+            if config_dict['browser']['chrome_app_path']:
+                chrome_path = config_dict['browser']['chrome_app_path'].strip()
+                if not os.path.exists(chrome_path):
+                    chrome_path = None
+            if not chrome_path:
+                chrome_path = cls.try_to_find_chrome_path()
+
+            if chrome_path:
+                config_dict['browser']['chrome_app_path'] = chrome_path
+
+            config_dict['browser'] = BrowserConfig(**config_dict['browser'])
+        else:
+            # try to find chrome path
+            chrome_path = cls.try_to_find_chrome_path()
+            if chrome_path:
+                config_dict['browser'] = BrowserConfig(chrome_app_path=chrome_path)
+            else:
+                config_dict['browser'] = None
+            
+        # Add WebSocket server config handling
+        if 'server' in config_dict:
+            config_dict['server'] = ServerConfig(**config_dict['server'])
+        
+        return cls(**config_dict) 

+ 34 - 0
src/nanobrowser/lib/config/logging_config.py

@@ -0,0 +1,34 @@
+import logging
+
+def configure_logging(
+    level: str = "INFO",
+    format: str = '[%(asctime)s] %(levelname)s {%(filename)s:%(lineno)d} - %(message)s',
+    date_format: str = '%Y-%m-%d %H:%M:%S'
+) -> None:
+    """Configure root logger with common settings"""
+    # Configure root logger
+    logging.basicConfig(
+        level=level.upper(),
+        format=format,
+        datefmt=date_format
+    )
+    
+    # Suppress noisy modules
+    suppress_modules = [
+        "httpcore",
+        "httpx",
+        "playwright",
+        "urllib3",
+        "asyncio",
+        "websockets",
+        "langchain",
+        "openai",
+        "anthropic",
+        "langchain_openai",
+        "langchain_anthropic",
+    ]
+    
+    for module in suppress_modules:
+        module_logger = logging.getLogger(module)
+        module_logger.setLevel(logging.WARNING)
+        module_logger.propagate = False

+ 0 - 0
src/nanobrowser/lib/utils/__init__.py


+ 59 - 0
src/nanobrowser/lib/utils/path_manager.py

@@ -0,0 +1,59 @@
+from pathlib import Path
+from typing import Dict
+
+class PathManager:
+    """
+    A singleton class to manage paths for the agents.
+    """
+    _instance = None
+
+    def __new__(cls, *args, **kwargs): # type: ignore
+        if cls._instance is None:
+            cls._instance = super(PathManager, cls).__new__(cls)
+        return cls._instance
+
+    def __init__(self, base_dir: Path | str = None):
+        # Convert string to Path if necessary
+        if isinstance(base_dir, str):
+            base_dir = Path(base_dir)
+        
+        self.base = base_dir or Path.home() / ".r2"
+        
+        # Define subdirectories
+        self.logs = self.base / "logs"
+        self.screenshots = self.base / "screenshots"
+        self.messages = self.base / "messages"
+        self.tasks = self.base / "tasks"
+        self.outputs = self.base / "outputs"
+        self.temp = self.base / "temp"
+        
+        # Create directories if they don't exist
+        self._create_directories()
+    
+    def _create_directories(self):
+        """Create all required directories if they don't exist."""
+        directories = [
+            self.base,
+            self.logs,
+            self.screenshots,
+            self.messages,
+            self.tasks,
+            self.outputs,
+            self.temp
+        ]
+        
+        for directory in directories:
+            directory.mkdir(parents=True, exist_ok=True)
+    
+    @property
+    def paths(self) -> Dict[str, Path]:
+        """Return a dictionary of all available paths."""
+        return {
+            "logs": self.logs,
+            "screenshots": self.screenshots,
+            "messages": self.messages,
+            "tasks": self.tasks,
+            "outputs": self.outputs,
+            "temp": self.temp,
+            "base": self.base
+        } 

+ 35 - 0
src/nanobrowser/lib/utils/time_utils.py

@@ -0,0 +1,35 @@
+from datetime import datetime
+from typing import Optional
+import pytz
+import time
+import random
+import logging
+
+logger = logging.getLogger(__name__)
+
+def generate_new_task_id():
+    """
+    Generate a new task id based on the current timestamp and a random number.
+    """
+    return f"{int(time.time() * 1000)}-{random.randint(100000, 999999)}"
+
+def get_current_timestamp_str(timezone: Optional[str] = None) -> str:
+    """
+    Get the current timestamp as a string in the format YYYY-MM-DD HH:MM:SS Z.
+    
+    Args:
+        timezone (str): Timezone name (e.g. 'US/Pacific', 'UTC', 'Asia/Tokyo')
+        
+    Returns:
+        str: Formatted datetime string in the specified timezone or local time
+    """
+    format = "%Y-%m-%d %H:%M:%S %Z"
+    if timezone:
+        try:
+            tz = pytz.timezone(timezone)
+            return datetime.now(tz).strftime(format)
+        except pytz.exceptions.UnknownTimeZoneError:
+            logger.error(f"Unknown timezone: {timezone}")
+            return datetime.now().strftime(format)
+    else:
+        return datetime.now().strftime(format)

+ 0 - 0
src/nanobrowser/lib/websocket/__init__.py


+ 63 - 0
src/nanobrowser/lib/websocket/message.py

@@ -0,0 +1,63 @@
+from pydantic import BaseModel
+from enum import Enum
+from typing import Optional, Dict, Any
+from ..agent.event.base import Event, ExecutionState, EventData
+
+"""
+WebSocket message types
+"""
+
+class WebSocketMessageKind(Enum):
+    """Kinds of WebSocket messages; each member's value is its short string code.
+
+    create: Create a new task
+    cancel: Cancel a running task
+    state: Task state update message
+    hb: Application-level heartbeat
+    ack: Heartbeat acknowledgment
+    error: Error message
+    """
+    HEARTBEAT = "hb"       # application heartbeat
+    ACK = "ack"            # heartbeat acknowledgment
+    CREATE = "create"      # create new task
+    CANCEL = "cancel"      # cancel task
+    TASK_STATE = "state"   # task state update
+    ERROR = "error"        # error message
+
+class WebSocketMessage(BaseModel):
+    """Envelope for every WebSocket message: a kind plus an optional payload."""
+    # Discriminates how `data` should be interpreted.
+    kind: WebSocketMessageKind
+    # Kind-specific payload as a plain dictionary; may be omitted.
+    data: Optional[Dict[str, Any]] = None 
+
+class CreateTaskMessage(BaseModel):
+    """Message to create a new task"""
+    # Identifier of the task to create (required).
+    task_id: str
+    # Text describing what the task should do (required).
+    intent: str
+    # Optional extra arguments for the task.
+    args: Optional[Dict[str, Any]] = None
+
+class CancelTaskMessage(BaseModel):
+    """Message to cancel a running task"""
+    # Identifier of the task to cancel.
+    task_id: str
+
+class TaskStateMessage(BaseModel):
+    """Message of task state update"""
+    task_id: str
+    state: ExecutionState
+    actor: str
+    data: EventData
+    timestamp: str
+
+    @classmethod
+    def from_event(cls, event: Event) -> 'TaskStateMessage':
+        return cls(
+            task_id=event.data.task_id,
+            state=event.state,
+            actor=event.actor,
+            data=event.data,
+            timestamp=event.timestamp
+        )
+
+class ErrorMessage(BaseModel):
+    """Message to indicate an error"""
+    # Identifier of the task the error relates to.
+    task_id: str
+    # Human-readable error description.
+    message: str
+    # Time of the error as a string. Required: the field has no default, so
+    # constructing the model without it fails validation.
+    timestamp: str
+

+ 161 - 0
src/nanobrowser/lib/websocket/server.py

@@ -0,0 +1,161 @@
+import asyncio
+import json
+import websockets
+import logging
+from typing import Set
+from pathlib import Path
+from websockets.server import WebSocketServerProtocol
+from ..agent.executor import Executor
+from ..agent.event.base import Event
+from .message import (
+    CreateTaskMessage, TaskStateMessage, WebSocketMessage, WebSocketMessageKind, ErrorMessage
+)
+from .task import TaskManager, Task
+from ..utils.path_manager import PathManager
+
+logger = logging.getLogger(__name__)
+
+# Only allow one connection for now
+MAX_CONNECTIONS = 1
+
+class WebSocketServer:
+    def __init__(self, base_dir: Path, executor: Executor):
+        self._path_manager = PathManager(base_dir)
+        self._active_connections: Set[WebSocketServerProtocol] = set()
+        self._executor = executor
+        self._task_manager = TaskManager(self._path_manager.tasks)
+        self._connection_lock = asyncio.Lock()  # Add lock for thread safety
+        # Subscribe to task execution state changes
+        asyncio.create_task(self._subscribe_to_execution_state())
+
+    async def _subscribe_to_execution_state(self):
+        """Subscribe to task execution state changes and broadcast them to clients"""
+        async def handle_state_event(event: Event):
+            for websocket in self._active_connections:
+                try:
+                    self._task_manager.update_task_execution_state(event)
+                    await self._send_task_state(websocket, event)
+                except Exception as e:
+                    logger.error(f"Failed to send agent event to client: {e}")
+
+        await self._executor.subscribe_execution_state(handle_state_event)
+
+    async def _register(self, websocket: WebSocketServerProtocol):
+        client_info = f"{websocket.remote_address[0]}:{websocket.remote_address[1]}"
+        
+        async with self._connection_lock:  # Ensure thread-safe check and add
+            if len(self._active_connections) >= MAX_CONNECTIONS:
+                logger.warning(f"Rejected connection from {client_info}: Maximum connection limit reached")
+                await websocket.close(1013, "Maximum connection limit reached")
+                return
+            self._active_connections.add(websocket)
+            logger.info(f"New client connected from {client_info}. Total active connections: {len(self._active_connections)}")
+
+        try:
+            await self._handle_connection(websocket)
+        finally:
+            async with self._connection_lock:  # Ensure thread-safe removal
+                self._active_connections.remove(websocket)
+                logger.info(f"Client disconnected from {client_info}. Remaining active connections: {len(self._active_connections)}")
+
+    async def _handle_connection(self, websocket: WebSocketServerProtocol):
+        try:
+            async for message in websocket:
+                try:
+                    data = json.loads(message)
+                    message = WebSocketMessage.model_validate(data)
+
+                    if message.kind == WebSocketMessageKind.CREATE:
+                        logger.debug(f"Received create_task message: {data}")
+                        await self._handle_create_task(message.data, websocket)
+                    elif message.kind == WebSocketMessageKind.CANCEL:
+                        logger.debug(f"Received cancel_task message: {data}")
+                        await self._handle_cancel_task(message.data, websocket)
+                    elif message.kind == WebSocketMessageKind.HEARTBEAT:
+                        ack_message = WebSocketMessage(
+                            kind=WebSocketMessageKind.ACK,
+                            data=message.data
+                        )
+                        await websocket.send(json.dumps(ack_message.model_dump(mode='json')))
+                    else:
+                        logger.error(f"Unknown message kind: {message.kind}")
+
+                except json.JSONDecodeError:
+                    logger.error("Failed to decode JSON message")
+                except Exception as e:
+                    logger.error(f"Unexpected error occurred: {str(e)}", exc_info=True)
+
+        except websockets.exceptions.ConnectionClosed as e:
+            logger.error(f"WebSocket connection closed: {e}")
+        except Exception as e:
+            logger.error(f"Unexpected error occurred: {str(e)}", exc_info=True)
+
+    async def _process_task(self, task: Task, websocket: WebSocketServerProtocol):
+        """Run task"""
+        try:
+            # Run task using executor
+            await self._executor.run(
+                task=task.intent,
+                task_id=task.id,
+                **task.args
+            )
+        except Exception as e:
+            error_message = str(e)
+            logger.error(f"Task failed: {error_message}", exc_info=True)
+            await self._send_error_message(websocket, task.id, error_message)
+        finally:
+            self._task_manager.close_task()
+
+    async def _handle_create_task(self, message_data: dict, websocket: WebSocketServerProtocol):
+        """Handle create_task message"""
+        try:
+            create_msg = CreateTaskMessage.model_validate(message_data)
+            task = self._task_manager.create_task(
+                create_msg.task_id,
+                create_msg.intent, 
+                create_msg.args 
+            )
+            # Start task processing in the background
+            asyncio.create_task(self._process_task(task, websocket))
+        except Exception as e:
+            logger.error(f"Error creating task: {str(e)}", exc_info=True)
+            task_id = task.id if task else "unknown"
+            await self._send_error_message(websocket, task_id, str(e))
+            
+
+    async def _handle_cancel_task(self, message_data: dict, websocket: WebSocketServerProtocol):
+        """Handle cancel_task message"""
+        raise NotImplementedError("Cancel task is not implemented")
+
+    async def _send_task_state(self, websocket: WebSocketServerProtocol, event: Event):
+        """Send task state update to client"""
+        state = TaskStateMessage.from_event(event)
+        message = WebSocketMessage(
+            kind=WebSocketMessageKind.TASK_STATE,
+            data=state.model_dump()
+        )
+        await websocket.send(json.dumps(message.model_dump(mode='json')))
+
+    async def _send_error_message(self, websocket: WebSocketServerProtocol, task_id: str, error_message: str):
+        """Send error message to client"""
+        message = WebSocketMessage(
+            kind=WebSocketMessageKind.ERROR,
+            data=ErrorMessage(
+                task_id=task_id,
+                message=error_message
+            ).model_dump()
+        )
+        await websocket.send(json.dumps(message.model_dump(mode='json')))
+
+
+async def start_server(host: str, port: int, base_dir: Path, executor: Executor):
+    server = WebSocketServer(base_dir, executor)
+    async with websockets.serve(
+        server._register, 
+        host, 
+        port,
+        ping_interval=20,
+        ping_timeout=20
+    ):
+        logger.info(f"WebSocket server started on ws://{host}:{port}")
+        await asyncio.Future()

+ 92 - 0
src/nanobrowser/lib/websocket/task.py

@@ -0,0 +1,92 @@
+import json
+import os
+from pathlib import Path
+from threading import Lock
+from datetime import datetime
+from typing import Optional, Dict, Any, List
+from pydantic import BaseModel
+from ..agent.event.base import ExecutionState, Event
+
+class Task(BaseModel):
+    """Task model"""
+    # Identifier of the task.
+    id: str
+    # Text describing what the task should do.
+    intent: str
+    # Optional extra arguments for the task.
+    args: Optional[Dict[str, Any]] = None
+    # Events recorded for this task, in the order they were appended.
+    steps: List[Event] = [] 
+    # Creation time and time of the last update.
+    created_at: datetime
+    updated_at: datetime
+
+class TaskManager:
+    """
+    A singleton class to manage tasks.
+    Only one task can be running at a time.
+    """
+    _instance = None
+
+    def __new__(cls, tasks_dir: Path):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._initialize(tasks_dir)
+        return cls._instance
+
+    def _initialize(self, tasks_dir: Path):
+        """
+        Initialize the singleton instance only once
+
+        Args:
+            tasks_dir: The directory to store task execution event files
+        """
+        self.tasks_dir = tasks_dir
+        self.current_task: Optional[Task] = None
+        os.makedirs(tasks_dir, exist_ok=True)
+        self._task_lock = Lock()  # Instance lock for task operations
+
+    def create_task(self, id: str, intent: str, args: Optional[Dict[str, Any]] = None) -> Task:
+        # id and intent are mandatory
+        if not id or not id.strip() or not intent or not intent.strip():
+            raise ValueError("Task id and intent cannot be empty")
+        
+        with self._task_lock:
+            if self.current_task:
+                raise ValueError("Another task is currently running, please wait for it to complete.")
+            
+            task = Task(
+                id=id,
+                intent=intent.strip(),
+                args=args or {},
+                created_at=datetime.now(),
+                updated_at=datetime.now(),
+                steps=[]
+            )
+            
+            self.current_task = task
+            return task
+
+    def close_task(self):
+        if self.current_task:
+            self._save_task(self.current_task)
+            self.current_task = None
+
+    def update_task_execution_state(self, event: Event) -> bool:
+        if not self.current_task or self.current_task.id != event.data.task_id:
+            raise ValueError(f"Task {event.data.task_id} not found")
+
+        # Add event to task steps
+        self.current_task.steps.append(event)
+        self.current_task.updated_at = datetime.now()
+
+        # If the event indicates task completion, save and clear current task
+        if event.state in [ExecutionState.TASK_OK, ExecutionState.TASK_FAIL, ExecutionState.TASK_CANCEL]:
+            self.close_task()
+
+        return True
+
+    def _save_task(self, task: Task):
+        """
+        Save the task to a json file.
+        """
+        filename = f"{task.id}.json"
+        filepath = os.path.join(self.tasks_dir, filename)
+        
+        with open(filepath, 'w') as f:
+            json.dump(task.model_dump(), f, indent=2, default=str)