claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
222 lines • 8.72 kB
JavaScript
/**
* GAIA Tool: image_describe — ADR-133-PR5
*
* Describes an image at a given URL or local file path using Anthropic's
* claude-haiku-4-5 model with vision (image content blocks). Covers the
* subset of GAIA Level-1 questions that provide image attachments (graphs,
* screenshots, photos, diagrams).
*
* ============================================================
* DESIGN NOTES
* ============================================================
* - Uses the Anthropic Messages API directly via `fetch` (same approach as
* gaia-agent.ts / gaia-judge.ts) — no SDK dependency.
* - API key resolution order mirrors gaia-agent.ts:
* 1. `options.apiKey` (caller-supplied)
* 2. ANTHROPIC_API_KEY env var
* 3. gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY
* - Model: claude-haiku-4-5 (cheapest vision-capable model, ~$0.001/call).
* - URL images: sent as { type: 'url', url } — Anthropic fetches the image.
* - Local files: read as Buffer, base64-encoded, MIME detected from extension.
* - execute() NEVER throws — returns a structured error string so the agent
* loop can forward it to Claude rather than crashing.
*
* ============================================================
* SUPPORTED IMAGE FORMATS
* ============================================================
* Anthropic vision accepts: JPEG, PNG, GIF, WebP.
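* Local files are labelled by magic bytes first, then by file extension; a
* file that matches neither is sent as image/png. An unsupported format is
* therefore expected to surface as the API's own error message, which is
* returned as a string that Claude can relay.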
*
* Refs: ADR-133, #2156
*/
import * as fs from 'node:fs';
import * as path from 'node:path';
import { execSync } from 'node:child_process';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Vision-capable Anthropic model that every description request is sent to. */
const VISION_MODEL = 'claude-haiku-4-5';
/** Value sent in the `anthropic-version` request header. */
const ANTHROPIC_API_VERSION = '2023-06-01';
/** Anthropic Messages API endpoint. */
const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages';
/** Value sent as `max_tokens` with each description request. */
const MAX_TOKENS = 512;
/** Instruction used when the caller does not supply a prompt of their own. */
const DEFAULT_PROMPT = [
    'Describe this image in detail. Note any text, charts, diagrams, tables, ',
    'numerical data, labels, axes, legends, or other content that would be ',
    'useful to answer a factual question about what the image shows.',
].join('');
/** Lower-cased file extension (with leading dot) → image MIME type. */
const EXT_TO_MIME = Object.fromEntries([
    ['.jpg', 'image/jpeg'],
    ['.jpeg', 'image/jpeg'],
    ['.png', 'image/png'],
    ['.gif', 'image/gif'],
    ['.webp', 'image/webp'],
]);
/**
 * Decide which image MIME type to declare for a local file.
 *
 * The file's leading bytes are checked against the JPEG, PNG, GIF and WebP
 * signatures first. If none match, the (case-insensitive) file extension is
 * looked up in EXT_TO_MIME, and 'image/png' is returned as a last resort.
 *
 * @param {string} filePath - Path of the image; only its extension is used.
 * @param {Buffer|Uint8Array} buf - Contents of the file.
 * @returns {string} One of 'image/jpeg', 'image/png', 'image/gif', 'image/webp'.
 */
function detectMime(filePath, buf) {
    // True when `signature` occurs in `buf` starting at `offset`.
    const hasSignature = (signature, offset = 0) =>
        buf.length >= offset + signature.length &&
        signature.every((byte, i) => buf[offset + i] === byte);

    // JPEG: FF D8
    if (hasSignature([0xff, 0xd8]))
        return 'image/jpeg';
    // PNG: 89 'P' 'N' 'G'
    if (hasSignature([0x89, 0x50, 0x4e, 0x47]))
        return 'image/png';
    // GIF: 'G' 'I' 'F' '8' (covers GIF87a and GIF89a)
    if (hasSignature([0x47, 0x49, 0x46, 0x38]))
        return 'image/gif';
    // WebP: 'RIFF' <4-byte size> 'WEBP'. RIFF alone is a generic container tag
    // (WAV and AVI files start with it too), so the form type at offset 8 must
    // be checked as well before calling the file WebP.
    if (hasSignature([0x52, 0x49, 0x46, 0x46]) &&
        hasSignature([0x57, 0x45, 0x42, 0x50], 8))
        return 'image/webp';

    // No signature matched: fall back to the extension, then to PNG.
    const ext = path.extname(filePath).toLowerCase();
    return EXT_TO_MIME[ext] ?? 'image/png';
}
// ---------------------------------------------------------------------------
// API key resolution (mirrors gaia-agent.ts)
// ---------------------------------------------------------------------------
/**
 * Resolve the Anthropic API key to use for a request.
 *
 * Sources are tried in order and the first non-blank value wins:
 *   1. `suppliedKey`
 *   2. the ANTHROPIC_API_KEY environment variable
 *   3. the latest version of the ANTHROPIC_API_KEY secret, read via the
 *      `gcloud` CLI (best effort; any failure is ignored)
 *
 * @param {string} [suppliedKey] - Key passed in by the caller, if any.
 * @returns {string} The key with surrounding whitespace removed.
 * @throws {Error} When none of the sources yields a key.
 */
function resolveAnthropicApiKey(suppliedKey) {
    if (suppliedKey && suppliedKey.trim())
        return suppliedKey.trim();

    const envKey = process.env.ANTHROPIC_API_KEY;
    if (envKey && envKey.trim())
        return envKey.trim();

    try {
        // stderr is discarded through the `stdio` option instead of a shell
        // `2>/dev/null` redirect: the redirect is POSIX-only and makes the
        // command itself fail under cmd.exe, which disabled this fallback on
        // Windows.
        const secret = execSync('gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY', {
            encoding: 'utf-8',
            timeout: 10_000,
            stdio: ['pipe', 'pipe', 'ignore'],
        }).trim();
        if (secret)
            return secret;
    }
    catch {
        // gcloud missing, not authenticated, timed out, or secret absent:
        // deliberately fall through to the error below.
    }

    throw new Error('ANTHROPIC_API_KEY not found. Set the env var or store it in GCP Secret Manager ' +
        'under "ANTHROPIC_API_KEY".');
}
/**
 * Send one image plus a text prompt to the Anthropic Messages API and return
 * the text of the first `text` content block in the reply.
 *
 * @param {object} imageBlock - Image content block placed first in the user message.
 * @param {string} prompt - Instruction placed after the image.
 * @param {string} apiKey - Value for the `x-api-key` header.
 * @param {number} [timeoutMs=60000] - Abort the HTTP request after this many
 *   milliseconds so a stalled connection cannot hang the caller forever.
 * @returns {Promise<string>} The description, or '(no description returned)'
 *   when the reply contains no text block.
 * @throws {Error} On a non-2xx response or when the response body carries an
 *   `error` object. Network failures and timeouts reject with the error raised
 *   by `fetch` (a `TimeoutError` in the timeout case).
 */
async function callVisionApi(imageBlock, prompt, apiKey, timeoutMs = 60_000) {
    const body = JSON.stringify({
        model: VISION_MODEL,
        max_tokens: MAX_TOKENS,
        messages: [
            {
                role: 'user',
                content: [imageBlock, { type: 'text', text: prompt }],
            },
        ],
    });
    const resp = await fetch(ANTHROPIC_API_URL, {
        method: 'POST',
        headers: {
            'x-api-key': apiKey,
            'anthropic-version': ANTHROPIC_API_VERSION,
            'content-type': 'application/json',
        },
        body,
        // Bound the whole request; without a signal fetch waits indefinitely.
        signal: AbortSignal.timeout(timeoutMs),
    });
    if (!resp.ok) {
        // The body may itself be unreadable; never let that mask the status.
        const errText = await resp.text().catch(() => '(no body)');
        throw new Error(`Anthropic API error ${resp.status}: ${errText}`);
    }
    const json = await resp.json();
    if (json.error)
        throw new Error(`Anthropic API error: ${json.error.message}`);
    const textBlock = json.content?.find((b) => b.type === 'text');
    return textBlock?.text ?? '(no description returned)';
}
// ---------------------------------------------------------------------------
// GaiaTool implementation
// ---------------------------------------------------------------------------
/**
 * GAIA tool that turns an image (remote URL or local file) into a text
 * description by sending it to the vision model.
 */
export class ImageDescribeTool {
    /** Tool name; matches `definition.name`. */
    name = 'image_describe';
    /** Caller-supplied API key; when blank the key is resolved at call time. */
    apiKey;
    /** Tool definition (name, description, JSON input schema). */
    definition = {
        name: 'image_describe',
        description: 'Describe an image at a given URL or local absolute file path. ' +
            'Returns a detailed text description suitable as input to answer ' +
            'factual questions about the image content (charts, graphs, photos, ' +
            'screenshots, diagrams, text in images). ' +
            `Uses ${VISION_MODEL} for cost-efficient vision (~$0.001/call). ` +
            'Supported formats: JPEG, PNG, GIF, WebP.',
        input_schema: {
            type: 'object',
            properties: {
                source: {
                    type: 'string',
                    description: 'URL (http/https) or absolute local file path of the image to describe.',
                },
                prompt: {
                    type: 'string',
                    description: 'Optional instruction to guide the description ' +
                        '(default: describe all visible content with focus on factual details).',
                },
            },
            required: ['source'],
        },
    };
    /**
     * @param {string} [apiKey] - Anthropic API key to prefer over the
     *   environment / secret-manager lookup.
     */
    constructor(apiKey) {
        this.apiKey = apiKey;
    }
    /**
     * Describe the image named by `input.source`.
     *
     * Every failure (missing input, missing API key, unreadable file, API
     * error) is reported as a string starting with `[image_describe error]`
     * rather than thrown, so the calling agent loop can forward it to the model.
     *
     * @param {{ source?: string, prompt?: string }} input - Tool input.
     * @returns {Promise<string>} `[image_describe: <model>]\n<description>` on
     *   success, or an `[image_describe error] …` string on failure.
     */
    async execute(input) {
        const source = String(input?.['source'] ?? '').trim();
        if (!source) {
            // Returned, not thrown: a thrown error here would break the
            // never-throws contract the other failure paths follow.
            return '[image_describe error] `source` input is required and must be non-empty.';
        }
        // A blank prompt would be sent as an empty text block, so treat it the
        // same as an absent one.
        const requestedPrompt = input['prompt'] != null ? String(input['prompt']).trim() : '';
        const prompt = requestedPrompt || DEFAULT_PROMPT;

        // Resolve the API key; a failure is reported back as an error string.
        let apiKey;
        try {
            apiKey = resolveAnthropicApiKey(this.apiKey);
        }
        catch (e) {
            return `[image_describe error] ${String(e)}`;
        }

        let imageBlock;
        // URL schemes are case-insensitive, so accept e.g. "HTTPS://" as well.
        if (/^https?:\/\//i.test(source)) {
            // URL image — Anthropic fetches it directly.
            imageBlock = {
                type: 'image',
                source: { type: 'url', url: source },
            };
        }
        else {
            // Local file — validate, read, base64-encode.
            if (!path.isAbsolute(source)) {
                return (`[image_describe error] Local file paths must be absolute. ` +
                    `Got: "${source}".`);
            }
            let buf;
            try {
                buf = fs.readFileSync(source);
            }
            catch (e) {
                if (e?.code === 'ENOENT') {
                    return `[image_describe error] File not found: ${source}`;
                }
                return `[image_describe error] Cannot read file "${source}": ${String(e)}`;
            }
            imageBlock = {
                type: 'image',
                source: {
                    type: 'base64',
                    media_type: detectMime(source, buf),
                    data: buf.toString('base64'),
                },
            };
        }

        try {
            const description = await callVisionApi(imageBlock, prompt, apiKey);
            return `[image_describe: ${VISION_MODEL}]\n${description}`;
        }
        catch (e) {
            // Return error as string — never throw so the agent loop stays alive.
            return `[image_describe error] ${String(e)}`;
        }
    }
}
/**
 * Factory for ImageDescribeTool.
 *
 * @param {{ apiKey?: string }} [opts] - Optional settings; only `apiKey` is read.
 * @returns {ImageDescribeTool} A new tool instance constructed with `opts.apiKey`
 *   (undefined when `opts` is omitted).
 */
export function createImageDescribeTool(opts) {
    const { apiKey } = opts ?? {};
    return new ImageDescribeTool(apiKey);
}
//# sourceMappingURL=image_describe.js.map