import { z } from 'zod';
import { BaseAgent, type BaseAgentOptions, type ExtraAgentOptions } from './base';
import { createLogger } from '@src/background/log';
import { ActionResult, type AgentOutput } from '../types';
import type { Action } from '../actions/builder';
import { buildDynamicActionSchema } from '../actions/builder';
import { agentBrainSchema } from '../types';
import { type BaseMessage, HumanMessage } from '@langchain/core/messages';
import { Actors, ExecutionState } from '../event/types';
import {
  ChatModelAuthError,
  ChatModelForbiddenError,
  isAuthenticationError,
  isForbiddenError,
  LLM_FORBIDDEN_ERROR_MESSAGE,
} from './errors';
import { jsonNavigatorOutputSchema } from '../actions/json_schema';
import { geminiNavigatorOutputSchema } from '../actions/json_gemini';
import { calcBranchPathHashSet } from '@src/background/dom/views';

const logger = createLogger('NavigatorAgent');

/**
 * Name-keyed registry of the actions the navigator may execute.
 */
export class NavigatorActionRegistry {
  private actions: Record<string, Action> = {};

  /**
   * @param actions - Actions to register up front; each is stored under `action.name()`.
   */
  constructor(actions: Action[]) {
    for (const action of actions) {
      this.registerAction(action);
    }
  }

  /** Stores `action` under `action.name()`, replacing any entry already using that name. */
  registerAction(action: Action): void {
    this.actions[action.name()] = action;
  }

  /** Removes the action registered under `name`, if any. */
  unregisterAction(name: string): void {
    delete this.actions[name];
  }

  /** Returns the action registered under `name`, or `undefined` when there is none. */
  getAction(name: string): Action | undefined {
    return this.actions[name];
  }

  /**
   * Builds the zod schema for the model output: an object holding `current_state`
   * (the agent brain) and `action`, an array validated against the schema generated
   * from the currently registered actions.
   */
  setupModelOutputSchema(): z.ZodType {
    const actionSchema = buildDynamicActionSchema(Object.values(this.actions));
    return z.object({
      current_state: agentBrainSchema,
      action: z.array(actionSchema),
    });
  }
}

/** Result payload of one navigator step. */
export interface NavigatorResult {
  /** True when the last executed action reported `isDone`. */
  done: boolean;
}

/**
 * Agent that asks the chat model for the next browser actions and executes them
 * through the actions held in a {@link NavigatorActionRegistry}.
 */
export class NavigatorAgent extends BaseAgent<z.ZodType, NavigatorResult> {
  private actionRegistry: NavigatorActionRegistry;
  private jsonSchema: Record<string, unknown>;

  /**
   * @param actionRegistry - Registry used both to build the output schema and to look up actions.
   * @param options - Base agent options, forwarded to `BaseAgent`.
   * @param extraOptions - Optional overrides; `id` is always forced to `'navigator'`.
   */
  constructor(
    actionRegistry: NavigatorActionRegistry,
    options: BaseAgentOptions,
    extraOptions?: Partial<ExtraAgentOptions>,
  ) {
    super(actionRegistry.setupModelOutputSchema(), options, { ...extraOptions, id: 'navigator' });

    this.actionRegistry = actionRegistry;

    // Models whose name starts with "gemini" get the Gemini-specific JSON schema.
    this.jsonSchema = this.modelName.startsWith('gemini') ? geminiNavigatorOutputSchema : jsonNavigatorOutputSchema;

    // logger.info('Navigator zod schema', JSON.stringify(zodToJsonSchema(this.modelOutputSchema), null, 2));
  }

  /**
   * Calls the chat model with structured output and returns the parsed model output.
   *
   * When the response has no parsed payload but the raw message carries tool calls,
   * the arguments of the first tool call are used instead.
   *
   * @param inputMessages - Conversation to send to the model.
   * @returns The model output (`current_state` plus `action`).
   * @throws Error when structured output is not enabled, when the model call fails,
   *   or when neither a parsed payload nor a tool call is available.
   */
  async invoke(inputMessages: BaseMessage[]): Promise<this['ModelOutput']> {
    // Use structured output
    if (this.withStructuredOutput) {
      const structuredLlm = this.chatLLM.withStructuredOutput(this.jsonSchema, {
        includeRaw: true,
        name: this.modelOutputToolName,
      });

      let response = undefined;
      try {
        response = await structuredLlm.invoke(inputMessages, {
          ...this.callOptions,
        });

        if (response.parsed) {
          return response.parsed;
        }
      } catch (error) {
        const errorMessage = `Failed to invoke ${this.modelName} with structured output: ${error}`;
        throw new Error(errorMessage);
      }

      // Use type assertion to access the properties
      const rawResponse = response.raw as BaseMessage & {
        tool_calls?: Array<{
          args: {
            // The output schema names this field `current_state`; `currentState` is
            // still accepted so payloads using the camelCase key keep working.
            current_state?: typeof agentBrainSchema._type;
            currentState?: typeof agentBrainSchema._type;
            action: z.infer<ReturnType<typeof buildDynamicActionSchema>>;
          };
        }>;
      };

      // sometimes LLM returns an empty content, but with one or more tool calls, so we need to check the tool calls
      if (rawResponse.tool_calls && rawResponse.tool_calls.length > 0) {
        logger.info('Navigator structuredLlm tool call with empty content', rawResponse.tool_calls);
        // only use the first tool call
        const { args } = rawResponse.tool_calls[0];
        return {
          current_state: args.current_state ?? args.currentState,
          // Copy real arrays only: spreading a string would split it into characters,
          // and doMultiAction already knows how to handle string/object payloads.
          action: Array.isArray(args.action) ? [...args.action] : args.action,
        };
      }
      throw new Error('Could not parse response');
    }
    throw new Error('Navigator needs to work with LLM that supports tool calling');
  }

  /**
   * Runs one navigation step: adds the browser state to memory, asks the model for
   * actions, executes them and stores the results on the context.
   *
   * The paused/stopped flags are checked after each stage; when set, the step is
   * cancelled, the state message is removed from memory and `STEP_CANCEL` is emitted.
   *
   * @returns The agent output. `result.done` is true when the last action result
   *   reported `isDone`; `error` is set when the step failed.
   * @throws ChatModelAuthError when the failure is recognised as an authentication error.
   * @throws ChatModelForbiddenError when the failure is recognised as a forbidden error.
   */
  async execute(): Promise<AgentOutput<NavigatorResult>> {
    const agentOutput: AgentOutput<NavigatorResult> = {
      id: this.id,
    };

    let cancelled = false;

    try {
      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.STEP_START, 'Navigating...');

      const messageManager = this.context.messageManager;
      // add the browser state message
      await this.addStateMessageToMemory();
      // check if the task is paused or stopped
      if (this.context.paused || this.context.stopped) {
        cancelled = true;
        return agentOutput;
      }

      // call the model to get the actions to take
      const inputMessages = messageManager.getMessages();
      const modelOutput = await this.invoke(inputMessages);

      // check if the task is paused or stopped
      if (this.context.paused || this.context.stopped) {
        cancelled = true;
        return agentOutput;
      }
      // remove the last state message from memory before adding the model output
      // (awaited so a failure here is handled by the catch below instead of
      // surfacing as an unhandled rejection)
      await this.removeLastStateMessageFromMemory();
      await this.addModelOutputToMemory(modelOutput);

      // take the actions
      const actionResults = await this.doMultiAction(modelOutput);
      this.context.actionResults = actionResults;

      // check if the task is paused or stopped
      if (this.context.paused || this.context.stopped) {
        cancelled = true;
        return agentOutput;
      }
      // emit event
      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.STEP_OK, 'Navigation done');
      let done = false;
      if (actionResults.length > 0 && actionResults[actionResults.length - 1].isDone) {
        done = true;
      }
      agentOutput.result = { done };
      return agentOutput;
    } catch (error) {
      await this.removeLastStateMessageFromMemory();
      // Check if this is an authentication error
      if (isAuthenticationError(error)) {
        throw new ChatModelAuthError('Navigator API Authentication failed. Please verify your API key', error);
      }
      if (isForbiddenError(error)) {
        throw new ChatModelForbiddenError(LLM_FORBIDDEN_ERROR_MESSAGE, error);
      }

      const errorMessage = error instanceof Error ? error.message : String(error);
      const errorString = `Navigation failed: ${errorMessage}`;
      logger.error(errorString);
      this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.STEP_FAIL, errorString);
      agentOutput.error = errorMessage;
      return agentOutput;
    } finally {
      // if the task is cancelled, remove the last state message from memory and emit event
      if (cancelled) {
        await this.removeLastStateMessageFromMemory();
        this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.STEP_CANCEL, 'Navigation cancelled');
      }
    }
  }

  /**
   * Add the state message to the memory.
   *
   * Does nothing when a state message is already present. Otherwise the previous
   * action results flagged `includeInMemory` are first added as human messages
   * (extracted content and/or the last line of the error), then the current
   * browser state message is added.
   */
  public async addStateMessageToMemory() {
    if (this.context.stateMessageAdded) {
      return;
    }

    const messageManager = this.context.messageManager;
    // Handle results that should be included in memory
    for (const [index, r] of this.context.actionResults.entries()) {
      if (!r.includeInMemory) {
        continue;
      }
      if (r.extractedContent) {
        const msg = new HumanMessage(`Action result: ${r.extractedContent}`);
        // logger.info('Adding action result to memory', msg.content);
        messageManager.addMessageWithTokens(msg);
      }
      if (r.error) {
        // Get error text and convert to string
        const errorText = r.error.toString().trim();
        // Get only the last line of the error
        const lastLine = errorText.split('\n').pop() || '';
        const msg = new HumanMessage(`Action error: ${lastLine}`);
        logger.info('Adding action error to memory', msg.content);
        messageManager.addMessageWithTokens(msg);
      }
      // reset this action result to empty, we dont want to add it again in the state message
      // NOTE: in python version, all action results are reset to empty, but in ts version, only those included in memory are reset to empty
      this.context.actionResults[index] = new ActionResult();
    }

    const state = await this.prompt.getUserMessage(this.context);
    messageManager.addStateMessage(state);
    this.context.stateMessageAdded = true;
  }

  /**
   * Remove the last state message from the memory.
   *
   * Does nothing when no state message is currently recorded as added.
   */
  protected async removeLastStateMessageFromMemory() {
    if (!this.context.stateMessageAdded) return;
    const messageManager = this.context.messageManager;
    messageManager.removeLastStateMessage();
    this.context.stateMessageAdded = false;
  }

  /** Appends the model output to the message manager's history. */
  private async addModelOutputToMemory(modelOutput: this['ModelOutput']) {
    const messageManager = this.context.messageManager;
    messageManager.addModelOutput(modelOutput);
  }

  /**
   * Converts the `action` field of a model output into a list of action objects.
   *
   * Accepts an array, a JSON string (encoding either an array or a single action),
   * or a single action object. Null/undefined entries are dropped.
   *
   * @throws Error('Invalid action output format') when a string value is not valid JSON.
   */
  private normalizeActions(rawActions: unknown): Record<string, unknown>[] {
    let candidate: unknown = rawActions;

    if (typeof candidate === 'string') {
      // sometimes the action is a string, but not an array as expected, so we need to parse it
      try {
        logger.warning('Unexpected action format', rawActions);
        candidate = JSON.parse(candidate);
      } catch {
        logger.error('Invalid action format', rawActions);
        throw new Error('Invalid action output format');
      }
    }

    // A lone object (direct or parsed from JSON) is treated as a one-element list.
    const items: unknown[] = Array.isArray(candidate) ? candidate : [candidate];
    // Entries are not validated beyond being non-nullish; unknown shapes are rejected
    // later when the action name cannot be resolved in the registry.
    const actions = items.filter(item => item != null) as Record<string, unknown>[];

    if (actions.length === 0) {
      logger.warning('No valid actions found', rawActions);
    }
    return actions;
  }

  /**
   * Executes the actions of a model output one after another.
   *
   * Before any action after the first that takes an element index, the page state is
   * re-read; if it contains branch path hashes that were not present before the first
   * action, execution stops and a note is recorded for the model. A failing action is
   * recorded as an error result and execution continues with the next one.
   *
   * @param response - Model output whose `action` field lists the actions to run.
   * @returns The results collected so far (possibly partial when paused/stopped).
   * @throws Error('Too many errors in actions') once more than three actions have failed.
   * @throws Error('Invalid action output format') when `action` is an unparsable string.
   */
  private async doMultiAction(response: this['ModelOutput']): Promise<ActionResult[]> {
    const results: ActionResult[] = [];
    let errCount = 0;

    logger.info('Actions', response.action);
    const actions = this.normalizeActions(response.action);

    const browserContext = this.context.browserContext;
    const browserState = await browserContext.getState();
    const cachedPathHashes = await calcBranchPathHashSet(browserState);

    await browserContext.removeHighlight();

    for (const [i, action] of actions.entries()) {
      // Each action object is keyed by the action name; only the first key is used.
      const actionName = Object.keys(action)[0];
      const actionArgs = action[actionName];
      try {
        // check if the task is paused or stopped
        if (this.context.paused || this.context.stopped) {
          return results;
        }

        const actionInstance = this.actionRegistry.getAction(actionName);
        if (actionInstance === undefined) {
          throw new Error(`Action ${actionName} not exists`);
        }

        const indexArg = actionInstance.getIndexArg(actionArgs);
        if (i > 0 && indexArg !== null) {
          const newState = await browserContext.getState();
          const newPathHashes = await calcBranchPathHashSet(newState);
          // next action requires index but there are new elements on the page
          if (!newPathHashes.isSubsetOf(cachedPathHashes)) {
            const msg = `Something new appeared after action ${i} / ${actions.length}`;
            logger.info(msg);
            results.push(
              new ActionResult({
                extractedContent: msg,
                includeInMemory: true,
              }),
            );
            break;
          }
        }

        const result = await actionInstance.call(actionArgs);
        if (result === undefined) {
          throw new Error(`Action ${actionName} returned undefined`);
        }
        results.push(result);
        // check if the task is paused or stopped
        if (this.context.paused || this.context.stopped) {
          return results;
        }
        // TODO: wait for 1 second for now, need to optimize this to avoid unnecessary waiting
        await new Promise(resolve => setTimeout(resolve, 1000));
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        logger.error('doAction error', actionName, actionArgs, errorMessage);
        // unexpected error, emit event
        this.context.emitEvent(Actors.NAVIGATOR, ExecutionState.ACT_FAIL, errorMessage);
        errCount++;
        if (errCount > 3) {
          throw new Error('Too many errors in actions');
        }
        results.push(
          new ActionResult({
            error: errorMessage,
            isDone: false,
            includeInMemory: true,
          }),
        );
      }
    }
    return results;
  }
}