claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
226 lines • 7.94 kB
JavaScript
/**
* GAIA Tool: pdf_read — ADR-138 iter 54
*
* Extracts text content from a PDF file using a Python subprocess.
* GAIA L1 has ~20-30% of questions with PDF attachments; without this,
* ruflo is functionally blind on those questions.
*
* Extraction chain (tries in order, returns first success):
* 1. pdfminer.six — best quality, handles most text PDFs
* 2. pdfplumber — good alternative, especially for tables
* 3. PyPDF2/pypdf — fallback, lower quality but widely available
* 4. pdftotext — CLI tool (poppler-utils), if installed
* 5. Stub — returns a note that extraction failed
*
* For image-only (scanned) PDFs, all text extractors return empty.
* In that case we return a note so the agent can try describe_image.
*
* Refs: ADR-138, #2156, iter 54
*/
import { execFileSync } from 'node:child_process';
import * as path from 'node:path';
import * as fs from 'node:fs';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
const EXEC_TIMEOUT_MS = 30_000;
const MAX_OUTPUT_CHARS = 8_000;
// ---------------------------------------------------------------------------
// Extraction helpers
// ---------------------------------------------------------------------------
/**
* Try to extract PDF text using Python pdfminer.six (best quality).
*/
function extractViaPdfminer(filePath) {
const script = `
import sys
try:
from pdfminer.high_level import extract_text
text = extract_text(sys.argv[1])
print(text[:12000] if text else '')
except ImportError:
print('[pdfminer_not_installed]')
except Exception as e:
print('[pdfminer_error:' + str(e) + ']')
`.trim();
try {
const out = execFileSync('python3', ['-', filePath], {
input: script,
encoding: 'utf-8',
timeout: EXEC_TIMEOUT_MS,
}).trim();
if (out && !out.startsWith('[pdfminer_not_installed]') && !out.startsWith('[pdfminer_error:')) {
return out;
}
}
catch { /* fall through */ }
return '';
}
/**
* Try to extract PDF text using pdfplumber (good for tables).
*/
function extractViaPdfplumber(filePath) {
const script = `
import sys
try:
import pdfplumber
with pdfplumber.open(sys.argv[1]) as pdf:
texts = []
for page in pdf.pages:
t = page.extract_text()
if t:
texts.append(t)
print('\\n\\n'.join(texts)[:12000])
except ImportError:
print('[pdfplumber_not_installed]')
except Exception as e:
print('[pdfplumber_error:' + str(e) + ']')
`.trim();
try {
const out = execFileSync('python3', ['-', filePath], {
input: script,
encoding: 'utf-8',
timeout: EXEC_TIMEOUT_MS,
}).trim();
if (out && !out.startsWith('[pdfplumber_not_installed]') && !out.startsWith('[pdfplumber_error:')) {
return out;
}
}
catch { /* fall through */ }
return '';
}
/**
* Try to extract PDF text using pypdf (widely available, lower quality).
*/
function extractViaPypdf(filePath) {
const script = `
import sys
try:
try:
import pypdf as pdf_lib
except ImportError:
import PyPDF2 as pdf_lib
with open(sys.argv[1], 'rb') as f:
reader = pdf_lib.PdfReader(f)
texts = []
for page in reader.pages:
t = page.extract_text()
if t:
texts.append(t)
print('\\n\\n'.join(texts)[:12000])
except ImportError:
print('[pypdf_not_installed]')
except Exception as e:
print('[pypdf_error:' + str(e) + ']')
`.trim();
try {
const out = execFileSync('python3', ['-', filePath], {
input: script,
encoding: 'utf-8',
timeout: EXEC_TIMEOUT_MS,
}).trim();
if (out && !out.startsWith('[pypdf_not_installed]') && !out.startsWith('[pypdf_error:')) {
return out;
}
}
catch { /* fall through */ }
return '';
}
/**
* Try to extract PDF text using the `pdftotext` CLI tool (poppler-utils).
*/
function extractViaPdftotext(filePath) {
try {
const out = execFileSync('pdftotext', [filePath, '-'], {
encoding: 'utf-8',
timeout: EXEC_TIMEOUT_MS,
}).trim();
return out.slice(0, 12000);
}
catch { /* fall through */ }
return '';
}
// ---------------------------------------------------------------------------
// Path validation (mirrors file_read.ts)
// ---------------------------------------------------------------------------
function validatePdfPath(filePath) {
if (!filePath || typeof filePath !== 'string') {
throw new Error('pdf_read: `path` must be a non-empty string.');
}
if (!path.isAbsolute(filePath)) {
throw new Error(`pdf_read: path must be absolute. Got: "${filePath}".`);
}
if (filePath.includes('\0')) {
throw new Error('pdf_read: path contains null byte — rejected.');
}
const ext = path.extname(filePath).toLowerCase();
if (ext !== '.pdf') {
throw new Error(`pdf_read: expected a .pdf file, got extension "${ext}". Use file_read for other formats.`);
}
}
// ---------------------------------------------------------------------------
// GaiaTool implementation
// ---------------------------------------------------------------------------
export class PdfReadTool {
name = 'pdf_read';
definition = {
name: 'pdf_read',
description: 'Extract text content from a PDF file. ' +
'Path must be absolute. ' +
'Tries pdfminer.six, pdfplumber, pypdf, and pdftotext in order. ' +
'Returns extracted text or a note if the PDF is image-only (scanned). ' +
`Output truncated to ${MAX_OUTPUT_CHARS} characters.`,
input_schema: {
type: 'object',
properties: {
path: {
type: 'string',
description: 'Absolute path to the PDF file.',
},
},
required: ['path'],
},
};
async execute(input) {
const filePath = String(input['path'] ?? '').trim();
validatePdfPath(filePath);
// Check existence
try {
const stat = fs.statSync(filePath);
if (!stat.isFile())
throw new Error(`pdf_read: "${filePath}" is not a regular file.`);
}
catch (e) {
const err = e;
if (err.code === 'ENOENT')
throw new Error(`pdf_read: file not found: ${filePath}`);
throw e;
}
const filename = path.basename(filePath);
// Try extractors in order
let text = extractViaPdfminer(filePath);
if (!text)
text = extractViaPdfplumber(filePath);
if (!text)
text = extractViaPypdf(filePath);
if (!text)
text = extractViaPdftotext(filePath);
if (!text || text.length < 20) {
return (`[PDF: ${filename}]\n` +
`Text extraction returned no content. This may be a scanned/image-only PDF.\n` +
`If the PDF contains images, consider using describe_image on specific pages.\n` +
`File: ${filePath}`);
}
const truncated = text.length > MAX_OUTPUT_CHARS;
const output = text.slice(0, MAX_OUTPUT_CHARS);
return (`[PDF: ${filename}]\n\n${output}` +
(truncated ? `\n\n[... truncated at ${MAX_OUTPUT_CHARS} chars ...]` : ''));
}
}
// ---------------------------------------------------------------------------
// Convenience factory
// ---------------------------------------------------------------------------
export function createPdfReadTool() {
return new PdfReadTool();
}
//# sourceMappingURL=pdf_read.js.map