UNPKG

@fin.cx/einvoice

Version:

A TypeScript module for creating, manipulating, and embedding XML data within PDF files specifically tailored for electronic invoice (einvoice) packages.

355 lines (317 loc) 10.8 kB
import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString, pako } from '../../../plugins.js';

/**
 * Base class for PDF XML extractors with common functionality.
 *
 * Concrete extractors implement {@link BaseXMLExtractor.extractXml}; this class
 * supplies the shared heuristics for locating, decoding and sanity-checking
 * invoice XML. None of the checks here parse the XML: they are substring and
 * regular-expression heuristics only.
 */
export abstract class BaseXMLExtractor {
  /**
   * Known XML file names for different invoice formats.
   * Not referenced by this base class itself.
   */
  protected readonly knownFileNames = [
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
    'xrechnung.xml',
    'ubl-invoice.xml',
    'invoice.xml',
    'metadata.xml'
  ];

  /**
   * Known XML format markers (element names, prefixed element names and
   * namespace fragments) used to recognise invoice content.
   */
  protected readonly knownFormats = [
    'CrossIndustryInvoice',
    'CrossIndustryDocument',
    'Invoice',
    'CreditNote',
    'ubl:Invoice',
    'ubl:CreditNote',
    'rsm:CrossIndustryInvoice',
    'rsm:CrossIndustryDocument',
    'ram:CrossIndustryDocument',
    'urn:un:unece:uncefact',
    'urn:ferd:CrossIndustryDocument',
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
    'ZUGFeRD',
    'FatturaElettronica'
  ];

  /**
   * Known XML end tags used to find where a document embedded in a larger
   * string stops.
   */
  protected readonly knownEndTags = [
    '</CrossIndustryInvoice>',
    '</CrossIndustryDocument>',
    '</Invoice>',
    '</CreditNote>',
    '</rsm:CrossIndustryInvoice>',
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
    '</ubl:CreditNote>',
    '</FatturaElettronica>'
  ];

  /**
   * Extract XML from a PDF buffer.
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;

  /**
   * Heuristically check whether a string looks like usable invoice XML.
   *
   * The string must be non-empty, contain an XML declaration or a known
   * opening element, mention a known format marker, be free of binary data,
   * be at least 100 characters long and show a basic tag structure.
   *
   * @param xmlString XML string to check
   * @returns True if every check passes; false otherwise (including when a check throws)
   */
  protected isValidXml(xmlString: string): boolean {
    try {
      // Runtime guard: callers outside the type system may pass non-strings.
      if (!xmlString || typeof xmlString !== 'string') {
        return false;
      }

      // Needs an XML declaration somewhere, or at least one known opening element.
      if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {
        return false;
      }

      if (!this.hasKnownFormat(xmlString)) {
        return false;
      }

      if (this.hasBinaryData(xmlString)) {
        return false;
      }

      // Anything shorter than 100 characters is treated as too short.
      if (xmlString.length < 100) {
        return false;
      }

      return this.hasProperXmlStructure(xmlString);
    } catch (error) {
      console.error('Error validating XML:', error);
      return false;
    }
  }

  /**
   * Check if the XML string contains the opening of a known element.
   * @param xmlString XML string to check
   * @returns True if `<` immediately followed by any known format marker occurs
   */
  protected hasKnownXmlElement(xmlString: string): boolean {
    return this.knownFormats.some(format => xmlString.includes(`<${format}`));
  }

  /**
   * Check if the XML string mentions a known format marker anywhere.
   * @param xmlString XML string to check
   * @returns True if any known format marker occurs as a substring
   */
  protected hasKnownFormat(xmlString: string): boolean {
    return this.knownFormats.some(format => xmlString.includes(format));
  }

  /**
   * Check if the XML string has a basic tag structure.
   * @param xmlString XML string to check
   * @returns True if a known element is both opened and closed, or the
   *          string has an XML declaration, or it has at least one tag and
   *          one closing tag
   */
  protected hasProperXmlStructure(xmlString: string): boolean {
    for (const endTag of this.knownEndTags) {
      // Compare against "<name" rather than "<name>": real root elements
      // usually carry namespace attributes, so "<name>" would never match.
      const openingPrefix = endTag.replace('/', '').replace(/>$/, '');
      if (xmlString.includes(openingPrefix) && xmlString.includes(endTag)) {
        return true;
      }
    }

    // Generic fallback when no known element pair was found.
    const hasDeclaration = xmlString.includes('<?xml') && xmlString.includes('?>');
    const hasTagPair = /<[^>]+>/.test(xmlString) && /<\/[^>]+>/.test(xmlString);
    return hasDeclaration || hasTagPair;
  }

  /**
   * Check if the XML string contains binary data.
   * @param xmlString XML string to check
   * @returns True if any of U+0000..U+0005 occurs, or more than 5% of the
   *          characters are control characters other than tab, LF and CR
   */
  protected hasBinaryData(xmlString: string): boolean {
    // A single one of these characters is enough to reject the string, so a
    // separate check for runs of NULs would be redundant.
    if (/[\u0000-\u0005]/.test(xmlString)) {
      return true;
    }

    const nonPrintableCount = (xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || []).length;
    return nonPrintableCount > xmlString.length * 0.05;
  }

  /**
   * Extract an XML document embedded in a larger string.
   *
   * The start is the first `<?xml` at or after `startIndex`, or, failing
   * that, the earliest opening of a known element. The end is the first
   * known end tag (in `knownEndTags` order) found after the start, or,
   * failing that, the last closing tag in the remaining text.
   *
   * @param text Text to extract XML from
   * @param startIndex Index to start extraction from
   * @returns XML content, or null if nothing was found or it failed validation
   */
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      let xmlStartIndex = text.indexOf('<?xml', startIndex);
      if (xmlStartIndex === -1) {
        xmlStartIndex = this.findKnownElementStart(text, startIndex);
        if (xmlStartIndex === -1) {
          return null;
        }
      }

      let xmlEndIndex = -1;
      for (const endTag of this.knownEndTags) {
        const endIndex = text.indexOf(endTag, xmlStartIndex);
        if (endIndex !== -1) {
          xmlEndIndex = endIndex + endTag.length;
          break;
        }
      }

      if (xmlEndIndex === -1) {
        // No known end tag: cut after the last closing tag of any name.
        xmlEndIndex = this.findLastClosingTagEnd(text, xmlStartIndex);
        if (xmlEndIndex === -1) {
          return null;
        }
      }

      const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);
      return this.isValidXml(xmlContent) ? xmlContent : null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
    }
  }

  /**
   * Find the earliest opening of a known element at or after `startIndex`.
   *
   * Each format marker is tried both as written (so namespace-prefixed roots
   * such as `<rsm:CrossIndustryInvoice` are found) and reduced to the part
   * after its last colon (so unprefixed roots are found too).
   *
   * @returns Index of the `<`, or -1 if no known element opens in the text
   */
  private findKnownElementStart(text: string, startIndex: number): number {
    let earliest = -1;
    for (const format of this.knownFormats) {
      const localName = format.slice(format.lastIndexOf(':') + 1);
      const names = localName === format ? [format] : [format, localName];
      for (const name of names) {
        const index = text.indexOf(`<${name}`, startIndex);
        if (index !== -1 && (earliest === -1 || index < earliest)) {
          earliest = index;
        }
      }
    }
    return earliest;
  }

  /**
   * Find the end of the last closing tag (`</name>`) located at or after `minIndex`.
   *
   * Scans backwards with plain string searches so that line breaks inside
   * the document do not matter and the cost stays linear.
   *
   * @returns Index just past the closing `>`, or -1 if there is no closing tag
   */
  private findLastClosingTagEnd(text: string, minIndex: number): number {
    let searchFrom = text.length;
    while (searchFrom >= minIndex) {
      const open = text.lastIndexOf('</', searchFrom);
      if (open === -1 || open < minIndex) {
        return -1;
      }
      const close = text.indexOf('>', open + 2);
      // Require at least one character between "</" and ">".
      if (close > open + 2) {
        return close + 1;
      }
      searchFrom = open - 1;
    }
    return -1;
  }

  /**
   * Decode, and if necessary decompress, XML content from a PDF stream.
   *
   * The raw stream bytes are tried first; if they do not yield valid XML the
   * bytes are inflated and tried again.
   *
   * @param stream PDF stream containing XML data
   * @param fileName Name of the file (for logging)
   * @returns XML content or null if not valid
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
      const rawBytes = stream.getContents();

      // First try without decompression (in case the content is not compressed).
      let xmlContent = this.tryDecodeBuffer(rawBytes);
      if (xmlContent && this.isValidXml(xmlContent)) {
        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
        return xmlContent;
      }

      try {
        const decompressedBytes = this.tryDecompress(rawBytes);
        if (decompressedBytes) {
          xmlContent = this.tryDecodeBuffer(decompressedBytes);
          if (xmlContent && this.isValidXml(xmlContent)) {
            console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
            return xmlContent;
          }
        }
      } catch (decompressError) {
        console.log(`Decompression failed for ${fileName}: ${decompressError}`);
      }

      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }

  /**
   * Try to decompress a buffer.
   * @param buffer Buffer to decompress
   * @returns Decompressed buffer or null if decompression failed
   */
  protected tryDecompress(buffer: Uint8Array): Uint8Array | null {
    try {
      // Try pako inflate (for deflate/zlib compression).
      return pako.inflate(buffer);
    } catch {
      console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');
      return null;
    }
  }

  /**
   * Try to decode a buffer to a string using different encodings.
   *
   * UTF-8 is decoded strictly first. If the bytes are not valid UTF-8 the
   * buffer is decoded as ISO-8859-1 instead, unless the document's XML
   * declaration explicitly names UTF-8, in which case the lossy UTF-8
   * decoding (malformed bytes become U+FFFD) is returned.
   *
   * @param buffer Buffer to decode
   * @returns Decoded string, or null if no decoding looks like XML
   */
  protected tryDecodeBuffer(buffer: Uint8Array): string | null {
    try {
      const strictUtf8 = this.decodeStrictUtf8(buffer);
      if (strictUtf8 !== null && this.isPlausibleXml(strictUtf8)) {
        return strictUtf8;
      }

      const latin1 = this.decodeLatin1(buffer);
      if (!this.isPlausibleXml(latin1)) {
        return null;
      }

      if (strictUtf8 === null && this.declaresUtf8(latin1)) {
        // The document says it is UTF-8 but contains malformed bytes: a
        // Latin-1 reading would garble every multi-byte character, so keep
        // the lossy UTF-8 reading instead.
        return new TextDecoder('utf-8').decode(buffer);
      }

      return latin1;
    } catch (error) {
      console.warn('Error decoding buffer:', error);
      return null;
    }
  }

  /**
   * Decode a buffer as UTF-8, rejecting malformed input.
   * @returns Decoded string, or null if the bytes are not valid UTF-8
   */
  private decodeStrictUtf8(buffer: Uint8Array): string | null {
    try {
      return new TextDecoder('utf-8', { fatal: true }).decode(buffer);
    } catch {
      return null;
    }
  }

  /**
   * Check whether the first XML declaration in the text names UTF-8 as its encoding.
   */
  private declaresUtf8(text: string): boolean {
    const declarationStart = text.indexOf('<?xml');
    if (declarationStart === -1) {
      return false;
    }
    // Only inspect a short window right after "<?xml".
    const declaration = text.slice(declarationStart, declarationStart + 200);
    return /^<\?xml[^>]*\bencoding\s*=\s*["']utf-?8["']/i.test(declaration);
  }

  /**
   * Decode a buffer using ISO-8859-1 (Latin1) encoding: every byte becomes
   * the character with the same code point.
   * @param buffer Buffer to decode
   * @returns Decoded string
   */
  protected decodeLatin1(buffer: Uint8Array): string {
    // Convert in chunks: one call per byte is slow, and one call for the
    // whole buffer could exceed the engine's argument-count limit.
    const chunkSize = 0x2000;
    const parts: string[] = [];
    for (let offset = 0; offset < buffer.length; offset += chunkSize) {
      parts.push(String.fromCharCode(...buffer.subarray(offset, offset + chunkSize)));
    }
    return parts.join('');
  }

  /**
   * Check if a string is plausibly XML (quick check before validation).
   * @param content String to check
   * @returns True if the string has both angle brackets and either an XML
   *          declaration or a known format marker
   */
  protected isPlausibleXml(content: string): boolean {
    if (!content.includes('<') || !content.includes('>')) {
      return false;
    }
    return content.includes('<?xml') || this.knownFormats.some(format => content.includes(format));
  }
}