diff --git a/apps/cli/ai/agent.ts b/apps/cli/ai/agent.ts index c0cc5e56d9..a2f41edc33 100644 --- a/apps/cli/ai/agent.ts +++ b/apps/cli/ai/agent.ts @@ -81,6 +81,11 @@ export function startAiAgent( config: AiAgentConfig ): Query { : createStudioTools( { enablePreviewSteering: isForkedByDesktop } ), }; + // The remote-session controller sets STUDIO_REMOTE_SESSION=1 when it spawns + // `studio code --json` so the agent knows it's driving Telegram and should + // favor screenshot replies. + const remoteSession = resolvedEnv.STUDIO_REMOTE_SESSION === '1'; + // Build site-aware system prompt const systemPromptOptions = isRemoteSite ? { @@ -89,8 +94,9 @@ export function startAiAgent( config: AiAgentConfig ): Query { url: activeSite.url ?? '', id: activeSite.wpcomSiteId!, }, + remoteSession, } - : { previewSteering: isForkedByDesktop }; + : { previewSteering: isForkedByDesktop, remoteSession }; if ( ! fs.existsSync( STUDIO_SITES_ROOT ) ) { fs.mkdirSync( STUDIO_SITES_ROOT, { recursive: true } ); diff --git a/apps/cli/ai/system-prompt.ts b/apps/cli/ai/system-prompt.ts index d8087e9073..31fb01efe0 100644 --- a/apps/cli/ai/system-prompt.ts +++ b/apps/cli/ai/system-prompt.ts @@ -12,15 +12,21 @@ export interface BuildSystemPromptOptions { // tools to the agent. When false, the "Keep the preview in sync" section // is omitted so we don't document tools the agent can't actually call. previewSteering?: boolean; + // True when the agent is being driven by the Telegram remote-session bridge. + // Adds guidance about delivering screenshots via `share_screenshot` and + // offering a preview-site follow-up. + remoteSession?: boolean; } export function buildSystemPrompt( options?: BuildSystemPromptOptions ): string { + const remoteSessionAddendum = options?.remoteSession ? `\n\n${ REMOTE_SESSION_GUIDANCE }` : ''; + if ( options?.remoteSite ) { return `${ buildRemoteIntro( options.remoteSite ) } ${ REMOTE_CONTENT_GUIDELINES } -${ REMOTE_DESIGN_GUIDELINES } +${ REMOTE_DESIGN_GUIDELINES }${ remoteSessionAddendum } `; } @@ -28,7 +34,7 @@ ${ REMOTE_DESIGN_GUIDELINES } ${ LOCAL_CONTENT_GUIDELINES } -${ LOCAL_DESIGN_GUIDELINES } +${ LOCAL_DESIGN_GUIDELINES }${ remoteSessionAddendum } `; } @@ -194,6 +200,20 @@ One \`Write\` or \`Edit\` per turn (read-only \`site_info\`, \`site_list\`, \`wp - All animations and transitions must respect \`prefers-reduced-motion\`. Add a \`@media (prefers-reduced-motion: reduce)\` block that disables or simplifies animations (e.g. \`animation: none; transition: none; scroll-behavior: auto;\`).`; } +const REMOTE_SESSION_GUIDANCE = `## Telegram remote session + +You are running over Telegram. The user iterates turn-by-turn; keep replies short and image-driven. + +After ANY visible change to a site, call \`share_screenshot\` before ending the turn — no preamble, no permission-asking. It is fire-and-forget: the image goes to the user but is NOT returned to you. Do not analyze or describe what you sent. Follow up with at most one short sentence (e.g. "Heading is now red." or "Want me to publish this as a preview?"). + +Defaults to a 16:9 above-the-fold view. Pass \`fullPage: true\` only when the user explicitly asks for the whole page. Captions describe what the user is looking at; never mention "full page", "viewport", or other capture-mode wording. + +\`take_screenshot\` is separate — use it only when YOU need to inspect a render before continuing. Don't pair it with \`share_screenshot\` for the same URL. + +For non-visual changes (data, logs, listings), reply with a concise text summary; no screenshot needed. + +Never claim to have stored, saved, or remembered anything beyond what your tools actually did. There is no gist storage, no preview-link memory, no session summary. Do not invent epilogues like "gist stored" or "preview link saved".`; + const REMOTE_CONTENT_GUIDELINES = `## Block content guidelines - Use only core WordPress blocks. No custom HTML blocks except for inline SVGs. diff --git a/apps/cli/ai/tools.ts b/apps/cli/ai/tools.ts index 2db191742c..be21fe5c2f 100644 --- a/apps/cli/ai/tools.ts +++ b/apps/cli/ai/tools.ts @@ -665,15 +665,98 @@ const validateBlocksTool = tool( // --- Screenshot tool --- +// Tall portrait viewport used by `take_screenshot` for full-page captures +// where the agent wants to inspect the whole scrolled page at once. const VIEWPORTS = { desktop: { width: 1040, height: 1248 }, mobile: { width: 390, height: 844 }, } as const; +// 16:9 viewport used by `share_screenshot` to capture "as it would look on a +// screen" — an above-the-fold view of the rendered page. The user can ask for +// the full page explicitly by setting `fullPage: true`. +const SHARE_VIEWPORTS = { + desktop: { width: 1280, height: 720 }, + mobile: { width: 390, height: 844 }, +} as const; + +async function captureScreenshotPng( + url: string, + viewport: { width: number; height: number }, + options: { fullPage: boolean } +): Promise< string > { + const browser = await getSharedBrowser(); + const page = await browser.newPage( { viewport } ); + + try { + await page.emulateMedia( { reducedMotion: 'reduce' } ); + await page.goto( url, { waitUntil: 'domcontentloaded', timeout: 30000 } ); + await page.waitForLoadState( 'networkidle', { timeout: 10000 } ).catch( () => {} ); + + // For full-page captures, scroll through the entire document so lazy-loaded + // images can begin loading. For viewport captures we keep the page where + // it is and only wait on images that intersect the first viewport, so + // above-the-fold shots stay quick on long pages. + await page.evaluate( async ( fullPage ) => { + const delay = ( ms: number ) => + new Promise< void >( ( resolve ) => setTimeout( resolve, ms ) ); + + if ( fullPage ) { + const scrollHeight = document.body.scrollHeight; + const viewportHeight = window.innerHeight; + for ( let y = 0; y < scrollHeight; y += viewportHeight ) { + window.scrollTo( 0, y ); + await delay( 100 ); + } + window.scrollTo( 0, 0 ); + } + + const pendingImages = Array.from( document.images ).filter( ( img ) => { + if ( img.complete ) { + return false; + } + if ( fullPage ) { + return true; + } + const rect = img.getBoundingClientRect(); + return rect.bottom > 0 && rect.top < window.innerHeight; + } ); + const timeout = new Promise< void >( ( resolve ) => setTimeout( resolve, 5000 ) ); + const allImages = Promise.all( + pendingImages.map( + ( img ) => + new Promise< void >( ( resolve ) => { + img.addEventListener( 'load', () => resolve(), { once: true } ); + img.addEventListener( 'error', () => resolve(), { once: true } ); + } ) + ) + ); + await Promise.race( [ allImages, timeout ] ); + }, options.fullPage ); + + // Hide WordPress admin bar and scrollbars for cleaner screenshots + await page.addStyleTag( { + content: ` + #wpadminbar { display: none !important; } + html { margin-top: 0 !important; } + ::-webkit-scrollbar { display: none !important; } + html, body { scrollbar-width: none !important; } + `, + } ); + + const buffer = await page.screenshot( { fullPage: options.fullPage, type: 'png' } ); + return buffer.toString( 'base64' ); + } finally { + await page.close(); + } +} + const takeScreenshotTool = tool( 'take_screenshot', 'Takes a full-page screenshot of a URL. Returns the screenshot as an image that you can analyze visually. ' + - 'Supports desktop and mobile viewports. Use this to verify the site looks correct after building it.', + 'Supports desktop and mobile viewports. Use this to verify the site looks correct after building it. ' + + 'Note: this image is for your own visual reasoning only — the user does not see it. ' + + 'Use `share_screenshot` instead when you want to deliver the rendered page to the user.', { url: z.string().describe( 'The URL to screenshot' ), viewport: z @@ -686,79 +769,80 @@ const takeScreenshotTool = tool( async ( args ) => { try { const viewportType = args.viewport ?? 'desktop'; - const viewport = VIEWPORTS[ viewportType ]; - emitProgress( `Taking ${ viewportType } screenshot of ${ args.url }…` ); + const base64 = await captureScreenshotPng( args.url, VIEWPORTS[ viewportType ], { + fullPage: true, + } ); + emitProgress( `Screenshot captured (${ viewportType })` ); + return { + content: [ + { + type: 'image' as const, + data: base64, + mimeType: 'image/png', + }, + ], + }; + } catch ( error ) { + return errorResult( + `Screenshot failed: ${ error instanceof Error ? error.message : String( error ) }` + ); + } + } +); - const browser = await getSharedBrowser(); - const page = await browser.newPage( { viewport } ); - - try { - // Reduce motion to avoid capturing mid-animation states - await page.emulateMedia( { reducedMotion: 'reduce' } ); - - await page.goto( args.url, { waitUntil: 'domcontentloaded', timeout: 30000 } ); - await page.waitForLoadState( 'networkidle', { timeout: 10000 } ).catch( () => {} ); - - // Scroll through the page to trigger lazy-loaded images, then wait - // for all images to finish loading (with a timeout so we don't hang - // on images that never settle). - await page.evaluate( async () => { - const delay = ( ms: number ) => - new Promise< void >( ( resolve ) => setTimeout( resolve, ms ) ); - const scrollHeight = document.body.scrollHeight; - const viewportHeight = window.innerHeight; - for ( let y = 0; y < scrollHeight; y += viewportHeight ) { - window.scrollTo( 0, y ); - await delay( 100 ); - } - window.scrollTo( 0, 0 ); - - const timeout = new Promise< void >( ( resolve ) => setTimeout( resolve, 5000 ) ); - const allImages = Promise.all( - Array.from( document.images ) - .filter( ( img ) => ! img.complete ) - .map( - ( img ) => - new Promise< void >( ( resolve ) => { - img.addEventListener( 'load', () => resolve() ); - img.addEventListener( 'error', () => resolve() ); - } ) - ) - ); - await Promise.race( [ allImages, timeout ] ); - } ); - - // Hide WordPress admin bar and scrollbars for cleaner screenshots - await page.addStyleTag( { - content: ` - #wpadminbar { display: none !important; } - html { margin-top: 0 !important; } - ::-webkit-scrollbar { display: none !important; } - html, body { scrollbar-width: none !important; } - `, - } ); - - const buffer = await page.screenshot( { fullPage: true, type: 'png' } ); - const base64 = buffer.toString( 'base64' ); +const shareScreenshotTool = tool( + 'share_screenshot', + 'Fire-and-forget primitive that captures a URL and delivers the image to the user. ' + + 'Call after ANY visible change to a site so the user sees the new state. ' + + 'Returns a confirmation string only — the image is NOT returned to you. The user already has the picture; do not analyze or describe what was sent in your reply. After calling this, write at most one short follow-up sentence and end the turn. ' + + 'Defaults to a 16:9 above-the-fold view. Set `fullPage: true` only when the user explicitly asks for the whole scroll length. ' + + 'Distinct from `take_screenshot`, which is for your own visual reasoning before continuing work.', + { + url: z.string().describe( 'The URL to screenshot and send to the user' ), + viewport: z + .enum( [ 'desktop', 'mobile' ] ) + .optional() + .describe( + 'Viewport size: "desktop" (1280x720, 16:9) or "mobile" (390x844). Defaults to desktop.' + ), + fullPage: z + .boolean() + .optional() + .describe( + 'When true, capture the entire scrolled page instead of just the viewport. Defaults to false; only set this when the user has explicitly asked for the full page.' + ), + caption: z + .string() + .optional() + .describe( + 'Short caption sent with the image. Describe what the user is looking at; do NOT mention "full page", "viewport", or other capture-mode wording. Keep it under ~1024 characters.' + ), + }, + async ( args ) => { + try { + const viewportType = args.viewport ?? 'desktop'; + const base64 = await captureScreenshotPng( args.url, SHARE_VIEWPORTS[ viewportType ], { + fullPage: args.fullPage ?? false, + } ); - emitProgress( `Screenshot captured (${ viewportType })` ); + emitEvent( { + type: 'media.share', + timestamp: new Date().toISOString(), + mediaType: 'image', + mimeType: 'image/png', + dataBase64: base64, + caption: args.caption, + } ); - return { - content: [ - { - type: 'image' as const, - data: base64, - mimeType: 'image/png', - }, - ], - }; - } finally { - await page.close(); - } + return textResult( + `Screenshot delivered to the user (${ viewportType }${ + args.fullPage ? ', full page' : '' + }). The user is viewing it now; do not describe what was sent.` + ); } catch ( error ) { return errorResult( - `Screenshot failed: ${ error instanceof Error ? error.message : String( error ) }` + `Share screenshot failed: ${ error instanceof Error ? error.message : String( error ) }` ); } } @@ -1169,6 +1253,7 @@ export const studioToolDefinitions = [ runWpCliTool, validateBlocksTool, takeScreenshotTool, + shareScreenshotTool, installTaxonomyScriptsTool, auditPerformanceTool, auditSeoTool, @@ -1217,6 +1302,6 @@ export function createRemoteSiteTools( token: string, siteId: number ) { return createSdkMcpServer( { name: 'studio', version: '1.0.0', - tools: [ ...wpcomTools, takeScreenshotTool, createSiteTool, pullSiteTool ], + tools: [ ...wpcomTools, takeScreenshotTool, shareScreenshotTool, createSiteTool, pullSiteTool ], } ); } diff --git a/apps/cli/remote-session/media-streamer.ts b/apps/cli/remote-session/media-streamer.ts new file mode 100644 index 0000000000..d7c87172f2 --- /dev/null +++ b/apps/cli/remote-session/media-streamer.ts @@ -0,0 +1,86 @@ +import type { JsonEvent } from '@studio/common/ai/json-events'; +import type { RemoteSessionConfig } from 'cli/remote-session/config'; +import type { RemoteSessionLogger } from 'cli/remote-session/logger'; +import type { respondMessage } from 'cli/remote-session/telegram-client'; + +export interface MediaTarget { + chatId: number; + bot?: string; +} + +export interface MediaStreamerDeps { + respond: typeof respondMessage; + logger: RemoteSessionLogger; +} + +export interface MediaStreamerOptions { + config: RemoteSessionConfig; + target: MediaTarget; + deps: MediaStreamerDeps; +} + +/** + * Forwards `media.share` NDJSON events from `studio code --json` to Telegram as + * photos in real time. Each photo POST kicks off as soon as the event arrives, + * so the image appears in the chat while the model is still finishing its + * follow-up text. POSTs are serialized through a single promise chain so that + * multiple shares from one turn arrive in emit order. Failures are logged and + * dropped — `respondMessage` already retries 5xx internally, so a final failure + * here means the photo isn't recoverable for this turn. + */ +export class MediaStreamer { + private readonly config: RemoteSessionConfig; + private readonly target: MediaTarget; + private readonly deps: MediaStreamerDeps; + + private queue: Promise< void > = Promise.resolve(); + private posted = 0; + private failed = 0; + + constructor( options: MediaStreamerOptions ) { + this.config = options.config; + this.target = options.target; + this.deps = options.deps; + } + + readonly onEvent = ( event: JsonEvent ): void => { + if ( event.type !== 'media.share' ) { + return; + } + const { dataBase64, mimeType, caption } = event; + this.deps.logger.debug( 'media.share received; queueing photo post', { + chat_id: this.target.chatId, + mime_type: mimeType, + base64_chars: dataBase64.length, + caption_length: caption?.length ?? 0, + } ); + this.queue = this.queue.then( async () => { + try { + await this.deps.respond( + this.config, + { + chatId: this.target.chatId, + bot: this.target.bot, + photo: dataBase64, + photoMimeType: mimeType, + caption, + }, + { logger: this.deps.logger } + ); + this.posted++; + } catch ( error ) { + this.failed++; + this.deps.logger.warn( 'Real-time media post failed; photo dropped', { + chat_id: this.target.chatId, + error: ( error as Error ).message, + } ); + } + } ); + }; + + /** Wait for any in-flight photo POSTs to complete. Resolves once the queue drains. */ + async drain(): Promise< { posted: number; failed: number } > { + await this.queue; + return { posted: this.posted, failed: this.failed }; + } +} diff --git a/apps/cli/remote-session/poll-loop.ts b/apps/cli/remote-session/poll-loop.ts index 01dae1e9f6..b546dd11e3 100644 --- a/apps/cli/remote-session/poll-loop.ts +++ b/apps/cli/remote-session/poll-loop.ts @@ -1,5 +1,7 @@ +import { type JsonEvent } from '@studio/common/ai/json-events'; import { type RemoteSessionConfig } from 'cli/remote-session/config'; import { RemoteSessionLogger } from 'cli/remote-session/logger'; +import { MediaStreamer } from 'cli/remote-session/media-streamer'; import { ProgressStreamer } from 'cli/remote-session/progress-streamer'; import { chunkReply, extractReply } from 'cli/remote-session/reply-formatter'; import { clearSessionId, readStateForChat, writeSessionId } from 'cli/remote-session/state'; @@ -111,11 +113,20 @@ async function handleTurn( const started = Date.now(); const logContext = { chat_id: target.chatId }; - const streamer = new ProgressStreamer( { + const progressStreamer = new ProgressStreamer( { config, target, deps: { respond: deps.respond, logger: deps.logger }, } ); + const mediaStreamer = new MediaStreamer( { + config, + target, + deps: { respond: deps.respond, logger: deps.logger }, + } ); + const onEvent = ( event: JsonEvent ) => { + progressStreamer.onEvent( event ); + mediaStreamer.onEvent( event ); + }; let outcome: TurnOutcome; try { @@ -126,7 +137,7 @@ async function handleTurn( signal, logger: deps.logger, logContext, - onEvent: streamer.onEvent, + onEvent, } ); if ( ! signal.aborted && outcome.staleSession && sessionId ) { @@ -144,13 +155,18 @@ async function handleTurn( signal, logger: deps.logger, logContext, - onEvent: streamer.onEvent, + onEvent, } ); } } finally { - streamer.stop(); + progressStreamer.stop(); } + // Wait for any in-flight photos to finish posting so a text reply that + // follows them lands in chat order, even if the photo POST is still + // running when the turn ends. + const mediaSummary = await mediaStreamer.drain(); + if ( outcome.sessionId && outcome.sessionId !== sessionId ) { await deps.writeSession( target.chatId, outcome.sessionId ); } @@ -164,6 +180,8 @@ async function handleTurn( chars_out: outcome.replyText?.length ?? 0, session_id: outcome.sessionId, aborted: signal.aborted, + media_posted: mediaSummary.posted, + media_failed: mediaSummary.failed, } ); // Detach was requested mid-turn. Skip posting any reply — the detach flow @@ -202,12 +220,16 @@ async function handleTurn( isError: outcome.isError, } ); - if ( reply === null ) { + const deliveredMedia = mediaSummary.posted > 0; + + if ( reply === null && ! deliveredMedia ) { await postBestEffort( deps, config, target, '⚠️ Local agent did not return a result.' ); return; } - await postChunks( deps, config, target, reply ); + if ( reply !== null ) { + await postChunks( deps, config, target, reply ); + } } /** diff --git a/apps/cli/remote-session/telegram-client.ts b/apps/cli/remote-session/telegram-client.ts index 3aa9c5803b..cabab76451 100644 --- a/apps/cli/remote-session/telegram-client.ts +++ b/apps/cli/remote-session/telegram-client.ts @@ -201,24 +201,65 @@ function extractMessages( payload: unknown ): PolledMessage[] { return out; } +export interface RespondParams { + chatId: number; + bot?: string; + /** Plain text reply. Required when no `photo` is provided. */ + text?: string; + /** + * Base64-encoded image bytes (PNG or JPEG). When set, the request goes out as + * `multipart/form-data` so the server forwards it to Telegram via `sendPhoto`. + */ + photo?: string; + /** MIME type of the photo bytes. Defaults to `image/png`. */ + photoMimeType?: 'image/png' | 'image/jpeg'; + /** Caption to send alongside `photo`. The server demotes long captions to a follow-up message. */ + caption?: string; +} + +interface RespondResponseBody { + success?: boolean; + photo_sent?: boolean; + text_sent?: boolean; + chunks_sent?: number; + error?: string; +} + /** * POST a message back to Telegram. Retries up to 3 times on 5xx with exponential backoff. * 4xx responses are surfaced as TelegramBadRequestError and should be logged but not retried. + * + * Transports: + * - Text-only: `application/json` body — `{ chat_id, bot, text }`. + * - Photo (with optional caption / follow-up text): `multipart/form-data` with a + * binary `photo` file part plus text fields. The server validates the photo + * bytes (size + magic bytes) before forwarding to Telegram. + * + * The server always answers with HTTP 200 and a JSON body indicating partial outcomes + * (`success`, `photo_sent`, `text_sent`, `error`). We log a warning when `success` is + * false but do not throw — the caller has already committed to best-effort delivery. */ export async function respondMessage( config: RemoteSessionConfig, - params: { chatId: number; text: string; bot?: string }, + params: RespondParams, options: { signal?: AbortSignal; maxRetries?: number; logger?: RemoteSessionLogger } = {} ): Promise< void > { + if ( ! params.text && ! params.photo ) { + throw new Error( 'respondMessage requires `text`, `photo`, or both' ); + } + const url = buildUrl( config.base_url, 'local-agent-respond' ); const allowedHost = new URL( config.base_url ).host; assertSameHost( url, allowedHost ); const bot = params.bot ?? config.bot; - const body = JSON.stringify( { - chat_id: params.chatId, - text: params.text, + const { body, contentType } = buildRespondBody( { + chatId: params.chatId, bot, + text: params.text, + photo: params.photo, + photoMimeType: params.photoMimeType, + caption: params.caption, } ); const maxRetries = options.maxRetries ?? 3; @@ -228,19 +269,30 @@ export async function respondMessage( logger?.debug( 'Respond start', { chat_id: params.chatId, bot, - text_length: params.text.length, - text_preview: params.text.slice( 0, 120 ), + text_length: params.text?.length ?? 0, + text_preview: params.text?.slice( 0, 120 ), + has_photo: params.photo !== undefined, + photo_base64_chars: params.photo?.length ?? 0, + photo_mime_type: params.photoMimeType, + caption_length: params.caption?.length ?? 0, + transport: contentType === undefined ? 'multipart' : 'json', } ); while ( attempt <= maxRetries ) { let response: Response; try { + // Note: when `body` is a FormData the runtime sets the multipart + // Content-Type with the proper boundary. Setting it manually here + // would corrupt the boundary token, so we omit it for that path. + const headers: Record< string, string > = { + Authorization: `Bearer ${ config.token }`, + }; + if ( contentType ) { + headers[ 'Content-Type' ] = contentType; + } response = await fetch( url, { method: 'POST', - headers: { - Authorization: `Bearer ${ config.token }`, - 'Content-Type': 'application/json', - }, + headers, body, redirect: 'manual', signal: options.signal, @@ -296,11 +348,25 @@ export async function respondMessage( response.status ); } - logger?.debug( 'Respond ok', { - status: response.status, - chat_id: params.chatId, - attempt, - } ); + + const outcome = await readRespondOutcome( response ); + if ( outcome && outcome.success === false ) { + logger?.warn( 'Respond reported partial failure', { + chat_id: params.chatId, + photo_sent: outcome.photo_sent, + text_sent: outcome.text_sent, + error: outcome.error, + } ); + } else { + logger?.debug( 'Respond ok', { + status: response.status, + chat_id: params.chatId, + attempt, + photo_sent: outcome?.photo_sent, + text_sent: outcome?.text_sent, + chunks_sent: outcome?.chunks_sent, + } ); + } return; } @@ -314,6 +380,77 @@ export async function respondMessage( throw new TelegramTransientError( 'Respond failed after retries' ); } +interface BuildBodyParams { + chatId: number; + bot?: string; + text?: string; + photo?: string; + photoMimeType?: 'image/png' | 'image/jpeg'; + caption?: string; +} + +// Telegram caps captions at 1024 characters and the wpcom endpoint rejects +// anything longer with HTTP 400. Truncate at the client so a slightly +// over-cap caption from the agent doesn't drop the whole photo. +const CAPTION_MAX_CHARS = 1024; + +function clampCaption( caption: string | undefined ): string | undefined { + if ( ! caption ) { + return undefined; + } + if ( caption.length <= CAPTION_MAX_CHARS ) { + return caption; + } + return `${ caption.slice( 0, CAPTION_MAX_CHARS - 1 ) }…`; +} + +function buildRespondBody( params: BuildBodyParams ): { + body: string | FormData; + /** Set for the JSON path; `undefined` for multipart so fetch fills the boundary in. */ + contentType?: string; +} { + if ( params.photo ) { + const fd = new FormData(); + fd.append( 'chat_id', String( params.chatId ) ); + if ( params.bot ) { + fd.append( 'bot', params.bot ); + } + if ( params.text ) { + fd.append( 'text', params.text ); + } + const caption = clampCaption( params.caption ); + if ( caption ) { + fd.append( 'caption', caption ); + } + const mime = params.photoMimeType ?? 'image/png'; + const filename = mime === 'image/jpeg' ? 'screenshot.jpg' : 'screenshot.png'; + const bytes = Buffer.from( params.photo, 'base64' ); + fd.append( 'photo', new Blob( [ new Uint8Array( bytes ) ], { type: mime } ), filename ); + return { body: fd }; + } + + const json: Record< string, unknown > = { chat_id: params.chatId }; + if ( params.bot ) { + json.bot = params.bot; + } + if ( params.text ) { + json.text = params.text; + } + return { body: JSON.stringify( json ), contentType: 'application/json' }; +} + +async function readRespondOutcome( response: Response ): Promise< RespondResponseBody | null > { + const raw = await safeReadText( response ); + if ( ! raw.trim() ) { + return null; + } + try { + return JSON.parse( raw ) as RespondResponseBody; + } catch { + return null; + } +} + async function safeReadText( response: Response ): Promise< string > { try { return await response.text(); diff --git a/apps/cli/remote-session/tests/fixtures/mock-studio-code.mjs b/apps/cli/remote-session/tests/fixtures/mock-studio-code.mjs index 824c79497c..cb617f460e 100644 --- a/apps/cli/remote-session/tests/fixtures/mock-studio-code.mjs +++ b/apps/cli/remote-session/tests/fixtures/mock-studio-code.mjs @@ -5,6 +5,7 @@ // error — emits a result with is_error + turn.completed error // stale-resume — writes a stale-session-looking line to stderr + exits non-zero // hang — never exits (tests timeout path) +// media-share — emits a media.share + result + turn.completed success const scenario = process.env.SCENARIO ?? 'success'; const sessionId = process.env.SESSION_ID ?? 'sess-new'; @@ -91,12 +92,54 @@ function runHang() { setInterval( () => {}, 1000 ); } +function runMediaShare() { + emit( { type: 'turn.started', timestamp: ts() } ); + emit( { + type: 'media.share', + timestamp: ts(), + mediaType: 'image', + mimeType: 'image/png', + dataBase64: 'AAAA', + caption: 'Site preview', + } ); + emit( { + type: 'media.share', + timestamp: ts(), + mediaType: 'image', + mimeType: 'image/png', + dataBase64: 'BBBB', + } ); + emit( { + type: 'message', + timestamp: ts(), + message: { + type: 'result', + subtype: 'success', + is_error: false, + result: 'Want me to publish this as a preview site?', + session_id: sessionId, + duration_ms: 10, + duration_api_ms: 5, + num_turns: 1, + stop_reason: 'end_turn', + total_cost_usd: 0, + usage: {}, + modelUsage: {}, + permission_denials: [], + uuid: 'u', + }, + } ); + emit( { type: 'turn.completed', timestamp: ts(), sessionId, status: 'success' } ); + process.exit( 0 ); +} + const handlers = { success: runSuccess, paused: runPaused, error: runError, 'stale-resume': runStaleResume, hang: runHang, + 'media-share': runMediaShare, }; const handler = handlers[ scenario ]; diff --git a/apps/cli/remote-session/tests/media-streamer.test.ts b/apps/cli/remote-session/tests/media-streamer.test.ts new file mode 100644 index 0000000000..569af01809 --- /dev/null +++ b/apps/cli/remote-session/tests/media-streamer.test.ts @@ -0,0 +1,123 @@ +import { describe, expect, it, vi } from 'vitest'; +import { RemoteSessionLogger } from 'cli/remote-session/logger'; +import { MediaStreamer } from 'cli/remote-session/media-streamer'; +import type { RemoteSessionConfig } from 'cli/remote-session/config'; + +const config: RemoteSessionConfig = { + base_url: 'https://api.example.test/telegram-bot', + token: 't', + bot: 'b', + chat_id: 42, + poll_interval_seconds: 1, + long_poll_timeout_seconds: 5, + max_message_chars: 3800, + turn_timeout_seconds: 60, +}; + +describe( 'MediaStreamer', () => { + it( 'posts a photo as soon as a media.share event arrives', async () => { + const respond = vi.fn().mockResolvedValue( undefined ); + const streamer = new MediaStreamer( { + config, + target: { chatId: 42, bot: 'b' }, + deps: { respond, logger: new RemoteSessionLogger( '/dev/null' ) }, + } ); + + streamer.onEvent( { + type: 'media.share', + timestamp: 'now', + mediaType: 'image', + mimeType: 'image/png', + dataBase64: 'AAAA', + caption: 'Site preview', + } ); + + const summary = await streamer.drain(); + expect( summary.posted ).toBe( 1 ); + expect( summary.failed ).toBe( 0 ); + expect( respond ).toHaveBeenCalledOnce(); + expect( respond.mock.calls[ 0 ][ 1 ] ).toEqual( { + chatId: 42, + bot: 'b', + photo: 'AAAA', + photoMimeType: 'image/png', + caption: 'Site preview', + } ); + } ); + + it( 'serializes posts so multiple shares arrive in emit order', async () => { + const order: string[] = []; + const respond = vi.fn().mockImplementation( async ( _config, params ) => { + // Yield the event loop so a second post would race past us if we + // were not serializing. + await new Promise( ( r ) => setTimeout( r, 5 ) ); + order.push( params.photo ); + } ); + const streamer = new MediaStreamer( { + config, + target: { chatId: 42 }, + deps: { respond, logger: new RemoteSessionLogger( '/dev/null' ) }, + } ); + + streamer.onEvent( { + type: 'media.share', + timestamp: 'now', + mediaType: 'image', + mimeType: 'image/png', + dataBase64: 'first', + } ); + streamer.onEvent( { + type: 'media.share', + timestamp: 'now', + mediaType: 'image', + mimeType: 'image/png', + dataBase64: 'second', + } ); + + await streamer.drain(); + expect( order ).toEqual( [ 'first', 'second' ] ); + } ); + + it( 'tracks failures without throwing so the turn keeps running', async () => { + const respond = vi + .fn() + .mockResolvedValueOnce( undefined ) + .mockRejectedValueOnce( new Error( 'network blip' ) ) + .mockResolvedValueOnce( undefined ); + const streamer = new MediaStreamer( { + config, + target: { chatId: 42 }, + deps: { respond, logger: new RemoteSessionLogger( '/dev/null' ) }, + } ); + + for ( const data of [ 'a', 'b', 'c' ] ) { + streamer.onEvent( { + type: 'media.share', + timestamp: 'now', + mediaType: 'image', + mimeType: 'image/png', + dataBase64: data, + } ); + } + + const summary = await streamer.drain(); + expect( summary.posted ).toBe( 2 ); + expect( summary.failed ).toBe( 1 ); + } ); + + it( 'ignores non-media events', async () => { + const respond = vi.fn().mockResolvedValue( undefined ); + const streamer = new MediaStreamer( { + config, + target: { chatId: 42 }, + deps: { respond, logger: new RemoteSessionLogger( '/dev/null' ) }, + } ); + + streamer.onEvent( { type: 'progress', timestamp: 'now', message: 'doing things' } ); + streamer.onEvent( { type: 'turn.started', timestamp: 'now' } ); + + const summary = await streamer.drain(); + expect( summary.posted ).toBe( 0 ); + expect( respond ).not.toHaveBeenCalled(); + } ); +} ); diff --git a/apps/cli/remote-session/tests/poll-loop.test.ts b/apps/cli/remote-session/tests/poll-loop.test.ts index 99a4785325..fabf432488 100644 --- a/apps/cli/remote-session/tests/poll-loop.test.ts +++ b/apps/cli/remote-session/tests/poll-loop.test.ts @@ -207,6 +207,91 @@ describe( 'runPollLoop', () => { expect( bodies ).toContain( '⚠️ Local agent did not return a result.' ); } ); + it( 'posts media shares in real time before the text reply when both are present', async () => { + const scripted = makeScriptedPoll( [ { chat_id: 42, text: 'show me' } ] ); + const deps = makeDeps( { scriptedPoll: scripted } ); + // Simulate the child emitting a media.share event mid-turn, then completing. + ( deps.runTurn as ReturnType< typeof vi.fn > ).mockImplementation( + async ( opts: { onEvent?: ( event: unknown ) => void } ) => { + opts.onEvent?.( { + type: 'media.share', + timestamp: 'now', + mediaType: 'image', + mimeType: 'image/png', + dataBase64: 'AAAA', + caption: 'Site preview', + } ); + return { + status: 'success', + sessionId: 'sess-1', + replyText: 'Want me to publish this as a preview site?', + isError: false, + stderrTail: '', + exitCode: 0, + staleSession: false, + } satisfies TurnOutcome; + } + ); + + const handle = await runPollLoop( { config: baseConfig, deps } ); + await scripted.done; + await handle.detach(); + await handle.done; + + const respond = deps.respond as ReturnType< typeof vi.fn >; + const calls = respond.mock.calls.map( ( [ , params ] ) => params ); + const photoIdx = calls.findIndex( ( p ) => p.photo === 'AAAA' ); + const textIdx = calls.findIndex( + ( p ) => p.text === 'Want me to publish this as a preview site?' + ); + expect( photoIdx ).toBeGreaterThan( -1 ); + expect( textIdx ).toBeGreaterThan( -1 ); + expect( photoIdx ).toBeLessThan( textIdx ); + expect( calls[ photoIdx ] ).toEqual( + expect.objectContaining( { + chatId: 42, + bot: 'b', + photo: 'AAAA', + photoMimeType: 'image/png', + caption: 'Site preview', + } ) + ); + } ); + + it( 'posts media even when there is no text reply (no fallback warning)', async () => { + const scripted = makeScriptedPoll( [ { chat_id: 42, text: 'just the screenshot' } ] ); + const deps = makeDeps( { scriptedPoll: scripted } ); + ( deps.runTurn as ReturnType< typeof vi.fn > ).mockImplementation( + async ( opts: { onEvent?: ( event: unknown ) => void } ) => { + opts.onEvent?.( { + type: 'media.share', + timestamp: 'now', + mediaType: 'image', + mimeType: 'image/png', + dataBase64: 'IMG', + } ); + return { + status: 'success', + sessionId: 'sess-1', + isError: false, + stderrTail: '', + exitCode: 0, + staleSession: false, + } satisfies TurnOutcome; + } + ); + + const handle = await runPollLoop( { config: baseConfig, deps } ); + await scripted.done; + await handle.detach(); + await handle.done; + + const respond = deps.respond as ReturnType< typeof vi.fn >; + const params = respond.mock.calls.map( ( [ , p ] ) => p ); + expect( params.some( ( p ) => p.photo === 'IMG' ) ).toBe( true ); + expect( params.some( ( p ) => /did not return a result/.test( p.text ?? '' ) ) ).toBe( false ); + } ); + it( 'aborts an in-flight turn when detach is called and skips posting a reply', async () => { const scripted = makeScriptedPoll( [ { chat_id: 42, text: 'long task' } ] ); const deps = makeDeps( { scriptedPoll: scripted } ); diff --git a/apps/cli/remote-session/tests/telegram-client.test.ts b/apps/cli/remote-session/tests/telegram-client.test.ts index 8eeda85a39..73daba221d 100644 --- a/apps/cli/remote-session/tests/telegram-client.test.ts +++ b/apps/cli/remote-session/tests/telegram-client.test.ts @@ -151,4 +151,113 @@ describe( 'respondMessage', () => { ); expect( fetchMock ).toHaveBeenCalledTimes( 1 ); } ); + + it( 'POSTs photo + caption as multipart/form-data with raw image bytes', async () => { + fetchMock.mockResolvedValueOnce( + new Response( JSON.stringify( { success: true, photo_sent: true } ), { + status: 200, + headers: { 'content-type': 'application/json' }, + } ) + ); + // Tiny 1x1 PNG — base64 of the standard "transparent pixel" header. + const photoBase64 = + 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkAAIAAAoAAv/lxKUAAAAASUVORK5CYII='; + await respondMessage( baseConfig, { + chatId: 42, + photo: photoBase64, + caption: 'Hello world', + } ); + const [ , init ] = fetchMock.mock.calls[ 0 ]; + expect( init.body ).toBeInstanceOf( FormData ); + // fetch sets Content-Type with the boundary; we must NOT set it ourselves. + expect( init.headers ).not.toHaveProperty( 'Content-Type' ); + expect( init.headers.Authorization ).toBe( 'Bearer abc' ); + const fd = init.body as FormData; + expect( fd.get( 'chat_id' ) ).toBe( '42' ); + expect( fd.get( 'bot' ) ).toBe( 'my_bot' ); + expect( fd.get( 'caption' ) ).toBe( 'Hello world' ); + expect( fd.get( 'text' ) ).toBeNull(); + const photo = fd.get( 'photo' ) as Blob; + expect( photo ).toBeInstanceOf( Blob ); + expect( photo.type ).toBe( 'image/png' ); + expect( photo.size ).toBe( Buffer.from( photoBase64, 'base64' ).length ); + } ); + + it( 'POSTs photo + text together via multipart with both fields', async () => { + fetchMock.mockResolvedValueOnce( + new Response( JSON.stringify( { success: true, photo_sent: true, text_sent: true } ), { + status: 200, + headers: { 'content-type': 'application/json' }, + } ) + ); + await respondMessage( baseConfig, { + chatId: 42, + photo: 'BASE64DATA', + text: 'Follow-up', + } ); + const [ , init ] = fetchMock.mock.calls[ 0 ]; + const fd = init.body as FormData; + expect( fd.get( 'text' ) ).toBe( 'Follow-up' ); + expect( fd.get( 'caption' ) ).toBeNull(); + const photo = fd.get( 'photo' ) as Blob; + expect( photo ).toBeInstanceOf( Blob ); + expect( photo.type ).toBe( 'image/png' ); + } ); + + it( 'uses the requested mime type for the photo file part', async () => { + fetchMock.mockResolvedValueOnce( new Response( '', { status: 200 } ) ); + await respondMessage( baseConfig, { + chatId: 1, + photo: 'BASE64DATA', + photoMimeType: 'image/jpeg', + } ); + const [ , init ] = fetchMock.mock.calls[ 0 ]; + const fd = init.body as FormData; + const photo = fd.get( 'photo' ) as Blob; + expect( photo.type ).toBe( 'image/jpeg' ); + } ); + + it( 'omits caption from the multipart body when it is undefined', async () => { + fetchMock.mockResolvedValueOnce( new Response( '', { status: 200 } ) ); + await respondMessage( baseConfig, { chatId: 1, photo: 'BASE64DATA' } ); + const [ , init ] = fetchMock.mock.calls[ 0 ]; + const fd = init.body as FormData; + expect( fd.get( 'caption' ) ).toBeNull(); + } ); + + it( 'logs a warning but does not throw when the server reports a partial failure', async () => { + fetchMock.mockResolvedValueOnce( + new Response( + JSON.stringify( { + success: false, + photo_sent: true, + text_sent: false, + error: 'Telegram returned 502 on text follow-up', + } ), + { status: 200, headers: { 'content-type': 'application/json' } } + ) + ); + // Should resolve, not throw. + await respondMessage( baseConfig, { chatId: 1, photo: 'BASE64DATA', text: 'follow' } ); + } ); + + it( 'rejects calls with neither text nor photo', async () => { + await expect( respondMessage( baseConfig, { chatId: 1 } ) ).rejects.toThrow( /text.*photo/i ); + expect( fetchMock ).not.toHaveBeenCalled(); + } ); + + it( 'truncates captions over 1024 chars before sending', async () => { + fetchMock.mockResolvedValueOnce( new Response( '', { status: 200 } ) ); + const longCaption = 'x'.repeat( 1500 ); + await respondMessage( baseConfig, { + chatId: 1, + photo: 'BASE64DATA', + caption: longCaption, + } ); + const [ , init ] = fetchMock.mock.calls[ 0 ]; + const fd = init.body as FormData; + const sent = fd.get( 'caption' ) as string; + expect( sent.length ).toBe( 1024 ); + expect( sent.endsWith( '…' ) ).toBe( true ); + } ); } ); diff --git a/apps/cli/remote-session/tests/turn-runner.test.ts b/apps/cli/remote-session/tests/turn-runner.test.ts index 5b5d40a054..118c21e0aa 100644 --- a/apps/cli/remote-session/tests/turn-runner.test.ts +++ b/apps/cli/remote-session/tests/turn-runner.test.ts @@ -61,4 +61,24 @@ describe( 'runTurn', () => { const outcome = await run( 'hang', undefined, 400 ); expect( outcome.status ).toBe( 'timeout' ); }, 10_000 ); + + it( 'forwards media.share events to the onEvent callback for in-flight delivery', async () => { + const seen: string[] = []; + const outcome = await runTurn( { + text: 'ignored', + timeoutMs: 5000, + cliEntry: mockCli, + env: { ...process.env, SCENARIO: 'media-share', SESSION_ID: 'captured-sess' }, + onEvent: ( event ) => { + if ( event.type === 'media.share' ) { + seen.push( event.dataBase64 ); + } + }, + } ); + expect( outcome.status ).toBe( 'success' ); + expect( outcome.replyText ).toBe( 'Want me to publish this as a preview site?' ); + // Events are forwarded in emit order; the streamer (not the runner) is + // responsible for actually posting them, so the outcome itself stays clean. + expect( seen ).toEqual( [ 'AAAA', 'BBBB' ] ); + } ); } ); diff --git a/apps/cli/remote-session/turn-runner.ts b/apps/cli/remote-session/turn-runner.ts index f36320c6b0..1a2300f72a 100644 --- a/apps/cli/remote-session/turn-runner.ts +++ b/apps/cli/remote-session/turn-runner.ts @@ -153,11 +153,18 @@ export async function runTurn( options: TurnRunOptions ): Promise< TurnOutcome > timeout_ms: options.timeoutMs, } ); + // Tell the spawned `studio code --json` it's running in a remote session so the + // system prompt can lean on `share_screenshot` and the preview-site follow-up. + const childEnv: NodeJS.ProcessEnv = { + ...( options.env ?? process.env ), + STUDIO_REMOTE_SESSION: '1', + }; + let child: ChildProcess; try { child = spawn( execPath, args, { stdio: [ 'pipe', 'pipe', 'pipe' ], - env: options.env ?? process.env, + env: childEnv, // Explicitly never use a shell — text is attacker-controlled. shell: false, } ); diff --git a/tools/common/ai/json-events.ts b/tools/common/ai/json-events.ts index 231d946624..10a6916930 100644 --- a/tools/common/ai/json-events.ts +++ b/tools/common/ai/json-events.ts @@ -5,6 +5,18 @@ export type TurnCompletedStatus = 'success' | 'error' | 'paused' | 'max_turns'; // JSONL — if no UI is listening (e.g. plain CLI runs) they are no-ops. export type PreviewCommand = { kind: 'navigate'; path: string } | { kind: 'reload' }; +// User-facing media payload emitted by tools like `share_screenshot`. The remote +// session controller forwards these to Telegram as photos; other consumers +// (desktop renderer, plain CLI) can ignore them. +export interface MediaShareEvent { + type: 'media.share'; + timestamp: string; + mediaType: 'image'; + mimeType: 'image/png' | 'image/jpeg'; + dataBase64: string; + caption?: string; +} + export type JsonEvent = | { type: 'message'; timestamp: string; message: unknown } | { type: 'progress'; timestamp: string; message: string } @@ -26,4 +38,5 @@ export type JsonEvent = status: TurnCompletedStatus; usage?: { numTurns: number; costUsd?: number }; } - | ( { type: 'preview.command'; timestamp: string } & PreviewCommand ); + | ( { type: 'preview.command'; timestamp: string } & PreviewCommand ) + | MediaShareEvent; diff --git a/tools/common/ai/tools.ts b/tools/common/ai/tools.ts index 785a4f64c7..1a93ef52d1 100644 --- a/tools/common/ai/tools.ts +++ b/tools/common/ai/tools.ts @@ -20,6 +20,7 @@ export function getToolDisplayName( name: string ): string { mcp__studio__wp_cli: __( 'Run WP-CLI' ), mcp__studio__validate_blocks: __( 'Validate blocks' ), mcp__studio__take_screenshot: __( 'Take screenshot' ), + mcp__studio__share_screenshot: __( 'Share screenshot' ), mcp__studio__preview_navigate: __( 'Navigate preview' ), mcp__studio__preview_reload: __( 'Reload preview' ), Read: __( 'Read' ), @@ -68,6 +69,7 @@ export function getToolDetail( name: string, input?: Record< string, unknown > ) } return __( 'inline content' ); case 'mcp__studio__take_screenshot': + case 'mcp__studio__share_screenshot': return typeof input.url === 'string' ? input.url : ''; case 'mcp__studio__preview_navigate': return typeof input.path === 'string' ? input.path : '';