document-extraction-service
Version:
A service for handling document extraction and processing
156 lines (134 loc) • 4 kB
JavaScript
const express = require('express');
const bodyParser = require('body-parser');
const axios = require('axios');
const { v4: uuidv4 } = require('uuid');
// TCP port the HTTP server binds to
const port = 5005;

// In-memory registry of extraction jobs that have been accepted but not yet finished
const extractionJobs = new Map();

// Express application; every route gets JSON request-body parsing
const app = express();
app.use(bodyParser.json());
/**
 * Built-in extraction strategy, used until a custom extractor is installed.
 *
 * Non-string content is serialized with JSON.stringify. The text is then split
 * on newlines, each line is trimmed, and blank lines are dropped. Every
 * remaining line becomes one chunk.
 *
 * @param {*} content - Document content; a string, or any JSON-serializable value.
 * @returns {Array<Object>} Chunk descriptors with `chunkId`, `chunkText`,
 *   `pageNumber`, `chunkType`, `extractionMethod` and `extractionStrategy`.
 *   Empty when there is no content to extract.
 */
const defaultExtractor = (content) => {
  // A request without content has nothing to extract; without this guard
  // JSON.stringify(undefined) yields undefined and the split below would throw.
  if (content === null || content === undefined) {
    return [];
  }

  const text = typeof content === 'string' ? content : JSON.stringify(content);

  // JSON.stringify also returns undefined for functions and symbols.
  if (typeof text !== 'string') {
    return [];
  }

  return text
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .map((line, index) => ({
      chunkId: `chunk-${index}`,
      chunkText: line,
      pageNumber: 1,
      chunkType: 'Text',
      extractionMethod: 'custom',
      extractionStrategy: 'default'
    }));
};
// Extractor currently in effect; starts out as the built-in default
let customExtractor = defaultExtractor;

/**
 * Install a caller-supplied extractor in place of the current one.
 *
 * @param {Function} extractorFn - Function to use for subsequent extractions.
 * @throws {Error} When `extractorFn` is not a function.
 */
const setCustomExtractor = (extractorFn) => {
  const isCallable = typeof extractorFn === 'function';
  if (isCallable) {
    customExtractor = extractorFn;
    return;
  }
  throw new Error('Extractor must be a function');
};
/**
 * Check that an extraction request carries every required header and that the
 * callback URL is usable.
 *
 * @param {Object} headers - Request headers keyed by lowercase header name.
 * @throws {Error} When any of `x-callback-url`, `x-document-id` or `x-trace-id`
 *   is absent or empty, or when `x-callback-url` is not an absolute http(s) URL.
 */
const validateHeaders = (headers) => {
  const required = ['x-callback-url', 'x-document-id', 'x-trace-id'];
  const missing = required.filter((name) => !headers[name]);
  if (missing.length > 0) {
    throw new Error(`Missing required headers: ${missing.join(', ')}`);
  }

  // Reject an unusable callback URL now, so the caller gets an error response
  // instead of an acceptance followed by a failure in the background job.
  const invalidUrlMessage = 'Invalid x-callback-url: must be an absolute http(s) URL';
  let callbackUrl;
  try {
    callbackUrl = new URL(headers['x-callback-url']);
  } catch (parseError) {
    throw new Error(invalidUrlMessage);
  }
  if (callbackUrl.protocol !== 'http:' && callbackUrl.protocol !== 'https:') {
    throw new Error(invalidUrlMessage);
  }
};
/**
 * Run extraction for a stored job and POST the resulting chunks to the job's
 * callback URL as a single batch.
 *
 * Failures are logged, not rethrown, and the job is removed from
 * `extractionJobs` whether or not processing succeeded.
 *
 * @param {string} jobId - Key of the job in `extractionJobs`.
 * @returns {Promise<void>} Resolves once the job has been handled; resolves
 *   immediately when no job with that id exists.
 */
const processDocument = async (jobId) => {
  const job = extractionJobs.get(jobId);
  if (!job) return;

  try {
    // Awaiting lets a user-supplied extractor be async; a plain return value
    // from a synchronous extractor passes through unchanged.
    const chunks = await customExtractor(job.payload.content);
    if (!Array.isArray(chunks)) {
      throw new TypeError('Extractor must return an array of chunks');
    }

    // One timestamp for the whole batch, so every chunk reports the same value.
    const createdOn = new Date().toISOString();

    const callbackData = {
      doc_id: job.docId,
      trace_id: job.traceId,
      // Everything is sent in one request, so this batch is also the last one.
      last_batch: true,
      chunk_data: chunks.map((chunk) => ({
        ...chunk,
        docId: job.docId,
        sourceId: `src-${uuidv4()}`,
        recordTitle: 'Extracted Document',
        recordUrl: '',
        searchIndexId: job.payload.streamId,
        sourceAcl: ['*'],
        createdOn,
        sourceUrl: '',
        sourceType: 'file',
        chunkMeta: {},
        chunkTitle: '',
        sourceName: 'Custom Extraction',
        fileType: 'txt'
      }))
    };

    const response = await axios.post(job.callbackUrl, callbackData, {
      headers: {
        'Content-Type': 'application/json'
      }
    });
    console.log(`Callback response for job ${jobId}:`, {
      status: response.status,
      data: response.data
    });
  } catch (error) {
    console.error(`Error processing job ${jobId}:`, error);
  } finally {
    // Single cleanup point for both the success and failure paths.
    extractionJobs.delete(jobId);
  }
};
// POST /extract: register the request as a job, acknowledge it with 202, and
// schedule the actual processing to run after the response has been sent.
app.post('/extract', async (req, res) => {
  try {
    validateHeaders(req.headers);

    const jobId = uuidv4();
    const {
      'x-document-id': docId,
      'x-trace-id': traceId,
      'x-callback-url': callbackUrl
    } = req.headers;

    // Record everything the background step needs
    const job = {
      docId,
      traceId,
      callbackUrl,
      payload: req.body,
      timestamp: Date.now()
    };
    extractionJobs.set(jobId, job);

    // Acknowledge before doing any extraction work
    const accepted = {
      message: 'Document enqueued for extraction',
      jobId,
      docId,
      traceId
    };
    res.status(202).json(accepted);

    // Deferred processing, one second later
    setTimeout(() => processDocument(jobId), 1000);
  } catch (error) {
    console.error('Extraction request error:', error);
    const failure = { error: error.message };
    res.status(400).json(failure);
  }
});
// GET /health: liveness probe reporting a fixed status and the current time
app.get('/health', (req, res) => {
  const report = {
    status: 'healthy',
    timestamp: new Date().toISOString()
  };
  res.status(200).json(report);
});
// Begin accepting connections and log once the server is listening
const server = app.listen(port, function onListening() {
  console.log(`Custom Extraction test server running on port ${port}`);
});
// Module exports: the Express app, the listening server handle, and the hook
// for installing a custom extractor
module.exports = { app, server, setCustomExtractor };