/*
 * document-extraction-service
 * Version:
 * A service for handling document extraction and processing
 * 232 lines (196 loc) • 6.46 kB
 * JavaScript
 */
const FormData = require('form-data');
const { v4: uuidv4 } = require('uuid');
/**
 * Builds multipart document-extraction requests for a configured endpoint and
 * matches responses back to the request that produced them.
 *
 * Every request returned by prepareRequest() is recorded in a private map
 * keyed by its trace id; handleResponse() looks the entry up and removes it.
 */
class CustomExtractionRequestValidator {
  #config;
  #requestQueue;

  /**
   * @param {Object} config - Configuration object
   * @param {string} config.endpoint - URL the prepared requests target
   * @param {Object} config.headers - Extra request headers; must contain `callback_url_pattern`
   * @param {Object} [config.requestBody] - Optional body; when present it must contain `doc_id`
   * @throws {Error} If config is missing or fails validation
   */
  constructor(config) {
    if (!config) {
      throw new Error('Configuration is required');
    }
    this.#validateConfig(config);

    // Set default strategy ID if not provided.
    // NOTE: the default is written onto the caller's own object, and the
    // `config` getter hands back that same object (no copy is made).
    if (config.requestBody && !config.requestBody.strategies_batch_id) {
      config.requestBody.strategies_batch_id = 'fstrat-test';
    }

    this.#config = config;
    this.#requestQueue = new Map();

    // Freezes the instance's own public properties; the private fields above
    // stay usable (the Map is still mutated by the methods below).
    Object.freeze(this);
  }

  /**
   * Get the current configuration
   * @returns {Object} The object that was passed to the constructor
   */
  get config() {
    return this.#config;
  }

  /**
   * Get pending requests count
   * @returns {number} Number of prepared requests not yet handled or cleared
   */
  get pendingRequestsCount() {
    return this.#requestQueue.size;
  }

  /**
   * Check that the configuration carries everything the instance needs.
   * @param {Object} config - Configuration object to check
   * @throws {Error} If `endpoint`, `headers.callback_url_pattern`, or (when a
   *   request body is given) `requestBody.doc_id` is missing
   */
  #validateConfig(config) {
    if (!config.endpoint) {
      throw new Error('Endpoint is required');
    }
    if (!config.headers?.callback_url_pattern) {
      throw new Error('Required header missing: callback_url_pattern');
    }
    if (config.requestBody) {
      const requiredFields = ['doc_id'];
      for (const field of requiredFields) {
        if (!(field in config.requestBody)) {
          throw new Error(`Required request body field missing: ${field}`);
        }
      }
    }
  }

  /**
   * Prepare request parameters for document processing
   *
   * Also registers the request in the pending map under a freshly generated
   * trace id, which is exposed to the caller as the `X-Trace-ID` header.
   *
   * @param {string} docId - Document identifier
   * @param {string|Object} content - Document content (Buffers are appended
   *   as-is, other objects are JSON-serialised, anything else is stringified)
   * @param {string} streamId - Stream identifier
   * @param {string} strategyBatchId - Strategy batch identifier
   * @returns {Object} Request parameters
   * @throws {Error} If docId, content or streamId is falsy
   */
  prepareRequest(docId, content, streamId, strategyBatchId = 'fstrat-test') {
    if (!docId || !content || !streamId) {
      throw new Error('Missing required parameters');
    }

    const traceId = `trace-${Date.now()}-${uuidv4().slice(0, 8)}`;

    // replaceAll fills in every occurrence of a placeholder, and the function
    // replacers insert the ids verbatim: with a plain string replacement,
    // sequences such as "$&" or "$'" inside an id would be expanded as
    // replacement patterns and corrupt the URL.
    const callbackUrl = this.#config.headers.callback_url_pattern
      .replaceAll('{{docId}}', () => docId)
      .replaceAll('{{streamId}}', () => streamId);

    let contentToAppend;
    if (Buffer.isBuffer(content)) {
      contentToAppend = content;
    } else if (typeof content === 'object') {
      contentToAppend = JSON.stringify(content);
    } else {
      contentToAppend = String(content);
    }

    const metadata = {
      traceId,
      timestamp: Date.now()
    };
    const serializedMetadata = JSON.stringify(metadata);

    const fields = [
      ['document_meta', contentToAppend],
      ['strategyBatchId', strategyBatchId],
      ['docId', docId],
      ['streamId', streamId],
      ['metadata', serializedMetadata]
    ];

    const formData = new FormData();
    for (const [name, value] of fields) {
      formData.append(name, value);
    }

    // Add entries method for testing
    formData.entries = function* () {
      yield* fields;
    };

    // Get the boundary from FormData if available, otherwise use a generated one
    const formDataBoundary =
      formData.getBoundary?.() ||
      `----FormBoundary${Date.now()}${Math.random().toString(36).slice(2)}`;

    const requestParams = {
      url: this.#config.endpoint,
      method: 'POST',
      timeout: 2 * 24 * 60 * 60 * 1000, // 2 days in milliseconds
      headers: {
        'X-Document-ID': docId,
        'X-Trace-ID': traceId,
        'X-Callback-URL': callbackUrl,
        'Content-Type': `multipart/form-data; boundary=${formDataBoundary}`,
        // Spread last: configured headers override the computed ones above,
        // and the raw callback_url_pattern entry is sent along as a header.
        ...this.#config.headers
      },
      data: formData,
      // NOTE(review): these look like axios-style size limits — confirm
      // against the HTTP client that consumes these parameters.
      maxBodyLength: Infinity,
      maxContentLength: Infinity
    };

    // If using node-fetch or other libraries that need manual boundary
    // handling: an exposed _boundary wins over any configured Content-Type.
    if (formData._boundary) {
      requestParams.headers['Content-Type'] = `multipart/form-data; boundary=${formData._boundary}`;
    }

    // Only what handleResponse() needs is kept. The document content is
    // deliberately not stored: nothing reads it back, and holding it would
    // pin potentially large payloads in memory until the response arrives.
    this.#requestQueue.set(traceId, {
      docId,
      timestamp: Date.now()
    });

    return requestParams;
  }

  /**
   * Handle API response
   *
   * The pending entry for the trace id is removed whether or not the response
   * turns out to be successful.
   *
   * @param {Object} response - API response (`status` and `data` are inspected)
   * @param {string} traceId - Trace identifier
   * @returns {Object} Processed response: `{ success, docId, traceId }` plus an
   *   `error` message when `success` is false
   * @throws {Error} If response/traceId are invalid or no pending request
   *   matches the trace id
   */
  handleResponse(response, traceId) {
    if (!response || !traceId || typeof traceId !== 'string') {
      throw new Error('Invalid response or traceId');
    }
    const requestData = this.#requestQueue.get(traceId);
    if (!requestData) {
      throw new Error('No matching request found for traceId');
    }

    const { docId } = requestData;
    try {
      // Validate response structure
      const { status, data } = response;
      // A usable status is a truthy number (0 and NaN are rejected).
      const hasValidStatus = typeof status === 'number' && Boolean(status);
      const hasValidData = Boolean(data) && typeof data === 'object';
      const isSuccess = hasValidStatus && status >= 200 && status < 300;

      const result = {
        success: isSuccess && hasValidData,
        docId,
        traceId
      };

      if (!result.success) {
        // Most fundamental problem first: malformed status, then
        // missing/invalid data, then a non-2xx status.
        if (!hasValidStatus) {
          result.error = 'Invalid response format';
        } else if (!hasValidData) {
          result.error = 'Missing or invalid response data';
        } else {
          result.error = data?.message || `Request failed with status ${status}`;
        }
      }
      return result;
    } catch (error) {
      return {
        success: false,
        docId,
        traceId,
        error: error.message
      };
    } finally {
      this.#requestQueue.delete(traceId);
    }
  }

  /**
   * Clear request queue
   * @returns {boolean} Always true
   */
  clearQueue() {
    this.#requestQueue.clear();
    return true;
  }

  /**
   * Validate the arguments of a document request.
   * @param {string} docId - Document identifier (non-empty string)
   * @param {*} content - Document content (any truthy value)
   * @param {string} streamId - Stream identifier (non-empty string)
   * @returns {boolean} true when all arguments are acceptable
   * @throws {Error} On the first invalid argument
   */
  static validateRequest(docId, content, streamId) {
    if (!docId || typeof docId !== 'string') {
      throw new Error('Invalid document ID');
    }
    if (!content) {
      throw new Error('Content is required');
    }
    if (!streamId || typeof streamId !== 'string') {
      throw new Error('Invalid stream ID');
    }
    return true;
  }

  /**
   * Validate request headers.
   * @param {Object} headers - Header map expected to carry `callback_url_pattern`
   * @returns {boolean} true when the callback URL pattern is present
   * @throws {Error} If headers is missing or has no `callback_url_pattern`
   */
  static validateRequestHeaders(headers) {
    // Optional chaining so that a missing headers object yields the same
    // validation error as a missing field, not a raw TypeError.
    if (!headers?.callback_url_pattern) {
      throw new Error('Callback URL pattern is required in headers');
    }
    return true;
  }
}
// CommonJS export: the class itself is assigned as the module's export.
module.exports = CustomExtractionRequestValidator;