@nutrient-sdk/document-engine-mcp-server
Version:
MCP server for Nutrient Document Engine
146 lines (145 loc) • 6.62 kB
JavaScript
import { z } from 'zod';
import { getDocumentInfo, getDocumentPageText } from '../../api/DocumentLayerAbstraction.js';
import { DocumentFingerprintSchema } from '../schemas/DocumentFingerprintSchema.js';
import { formatBBox } from '../../utils/Common.js';
/**
* Schema for extract_text tool
*/
// Create a Zod schema for PageRange
const PageRangeSchema = z.object({
start: z.number().optional(),
end: z.number().optional(),
});
export const ExtractTextSchema = {
document_fingerprint: DocumentFingerprintSchema,
page_range: PageRangeSchema.optional().describe('Range of pages to include with start and end indices (0-based)'),
include_coordinates: z
.boolean()
.optional()
.default(false)
.describe('Whether to include text coordinates'),
ocr_enabled: z
.boolean()
.optional()
.default(false)
.describe('Whether to enable OCR for text extraction'),
};
export const ExtractTextInputSchema = z.object(ExtractTextSchema);
/**
* Extract text content from a document with OCR support and pagination
*/
export async function extractText(client, params) {
try {
// Validate input
const validatedParams = ExtractTextInputSchema.parse(params);
const { document_fingerprint, page_range, include_coordinates = false, ocr_enabled = false, } = validatedParams;
// Get document info to get total page count
const docInfo = await getDocumentInfo(client, document_fingerprint);
const pageCount = docInfo.pageCount || 0;
// Parse page range if provided, otherwise use all pages
let pageIndices = [];
if (page_range) {
const start = page_range.start !== undefined ? page_range.start : 0;
const end = page_range.end !== undefined ? page_range.end : pageCount - 1;
// Validate page range
if (start < 0 || start >= pageCount) {
throw new Error(`Page range start ${start} is out of bounds (document has ${pageCount} pages, valid indices are 0-${pageCount - 1})`);
}
if (end < 0 || end >= pageCount) {
throw new Error(`Page range end ${end} is out of bounds (document has ${pageCount} pages, valid indices are 0-${pageCount - 1})`);
}
if (start > end) {
throw new Error(`Invalid page range: start (${start}) must be less than or equal to end (${end})`);
}
// Generate array of page indices from start to end
pageIndices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
}
else {
pageIndices = Array.from({ length: pageCount }, (_, i) => i);
}
// Fetch text for each page
const pageTextPromises = pageIndices.map(async (pageIndex) => {
const response = await getDocumentPageText(client, document_fingerprint, pageIndex, ocr_enabled);
return {
pageIndex,
textLines: response.data.textLines,
};
});
const pageTexts = await Promise.all(pageTextPromises);
// Count total words
let totalWords = 0;
pageTexts.forEach(page => {
const textLines = page.textLines || [];
textLines.forEach((line) => {
// Count words in each line by splitting on whitespace
const words = line.contents.trim().split(/\s+/).filter(Boolean);
totalWords += words.length;
});
});
// Build the markdown content
let markdown = `# Text Extraction\n\n`;
markdown += `📄 **Document ID:** ${document_fingerprint.document_id} \n`;
if (document_fingerprint.layer) {
markdown += `🔖 **Layer:** ${document_fingerprint.layer} \n`;
}
markdown += `📄 **Total Pages:** ${pageCount} \n`;
markdown += `📝 **Total Words:** ${totalWords.toLocaleString()} \n`;
markdown += `🔍 **OCR Applied:** ${ocr_enabled ? 'Yes' : 'No'} \n`;
// Add page range information if provided
if (page_range) {
const rangeText = page_range.start !== undefined && page_range.end !== undefined
? `${page_range.start}-${page_range.end}`
: page_range.start !== undefined
? `${page_range.start}-end`
: page_range.end !== undefined
? `0-${page_range.end}`
: 'All pages';
markdown += `📖 **Pages Processed:** ${rangeText} \n`;
}
else {
markdown += `📖 **Pages Processed:** All pages \n`;
}
markdown += `\n---\n\n`;
// Add text content for each page
pageTexts.forEach(page => {
const pageNumber = page.pageIndex + 1;
// Count words on this page
let pageWordCount = 0;
const textLines = page.textLines || [];
textLines.forEach((line) => {
const words = line.contents.trim().split(/\s+/).filter(Boolean);
pageWordCount += words.length;
});
markdown += `## Page ${pageNumber} (${pageWordCount} words)\n`;
// Add coordinates if requested or if OCR
if (include_coordinates || ocr_enabled) {
markdown += `### Coordinates for Page ${pageNumber}\n`;
markdown += `\`\`\`json\n`;
markdown += JSON.stringify(textLines.map((line) => ({
contents: line.contents,
boundingBox: formatBBox([
line.left || 0,
line.top || 0,
line.width || 0,
line.height || 0,
]),
})), null, 2);
markdown += `\n\`\`\`\n\n`;
}
else {
// Combine all text lines for this page
const pageText = textLines.map((line) => line.contents).join(' ');
markdown += `${pageText}\n\n`;
}
});
markdown += `---\n\n`;
markdown += `💡 **Tip:** Use \`extract_form_data\` if this document contains fillable forms.\n`;
return { markdown };
}
catch (error) {
// Provide a more user-friendly error message
return {
markdown: `# Error Extracting Text\n\nAn error occurred while trying to extract text: ${error instanceof Error ? error.message : String(error)}\n\nPlease check your connection and try again.`,
};
}
}