UNPKG

@thasmorato/docx-parser

Version:

A modern JavaScript library for parsing and processing Microsoft Word DOCX documents with support for both buffer and stream operations. Features incremental parsing, checkbox detection, footnote support, and document validation.

1,092 lines (1,078 loc) 36.1 kB
'use strict';

const fs = require('fs');
const web = require('stream/web');
const JSZip = require('jszip');

function _interopDefault(e) { return e && e.__esModule ? e : { default: e }; }

const JSZip__default = /*#__PURE__*/_interopDefault(JSZip);

const __defProp = Object.defineProperty;
const __name = (target, value) => __defProp(target, "name", { value, configurable: true });

// src/domain/types.ts

/**
 * Error type thrown by every layer of the parser.
 * `position` and `code` are optional and stored as given.
 */
const DocxParseError = class extends Error {
  constructor(message, position, code) {
    super(message);
    this.position = position;
    this.code = code;
    this.name = "DocxParseError";
  }
  static {
    __name(this, "DocxParseError");
  }
};

/**
 * Builds a DocxParseError whose message is "<context>: <reason>", where the
 * reason is the message of `error` (or "Unknown error" for non-Error values).
 * The original value is kept on `cause` so the underlying stack is not lost.
 */
function wrapError(context, error) {
  const reason = error instanceof Error ? error.message : "Unknown error";
  const wrapped = new DocxParseError(`${context}: ${reason}`);
  if (error !== undefined) {
    wrapped.cause = error;
  }
  return wrapped;
}

/**
 * Returns a Uint8Array view over a stream chunk. Chunks that are not Buffers
 * (for example strings from a stream with an encoding set) go through
 * Buffer.from first.
 */
function toUint8Array(chunk) {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
  return new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
}

/**
 * Shared bridge from an event-based Node.js stream to a Web ReadableStream.
 * Conversion/enqueue failures are routed to the web stream instead of
 * escaping from the event handler as uncaught exceptions.
 */
function bridgeNodeStream(nodeStream) {
  const destroySource = () => {
    if (typeof nodeStream.destroy === "function") {
      nodeStream.destroy();
    }
  };
  return new web.ReadableStream({
    start(controller) {
      nodeStream.on("data", (chunk) => {
        try {
          controller.enqueue(toUint8Array(chunk));
        } catch (error) {
          controller.error(error);
          destroySource();
        }
      });
      nodeStream.on("end", () => {
        try {
          controller.close();
        } catch {
          // The web stream was already closed, errored or cancelled.
        }
      });
      nodeStream.on("error", (error) => {
        controller.error(error);
      });
    },
    cancel() {
      destroySource();
    }
  });
}

const StreamAdapter = class {
  static {
    __name(this, "StreamAdapter");
  }

  /**
   * Converts a Node.js ReadStream to web ReadableStream
   * @param readStream - Node.js read stream (fs.ReadStream)
   * @returns Web API ReadableStream
   */
  static toWebStream(readStream) {
    return bridgeNodeStream(readStream);
  }

  /**
   * Checks if it's a Web ReadableStream
   */
  static isWebReadableStream(stream) {
    return stream && typeof stream.getReader === "function";
  }

  /**
   * Checks if it's a Node.js Readable stream
   */
  static isNodeReadableStream(stream) {
    return stream && typeof stream.read === "function" && typeof stream.on === "function";
  }

  /**
   * Converts Node.js Readable stream to Buffer.
   * String chunks are converted to Buffers before concatenation; a chunk
   * that cannot be converted rejects the returned promise.
   */
  static async nodeStreamToBuffer(stream) {
    const chunks = [];
    return new Promise((resolve, reject) => {
      stream.on("data", (chunk) => {
        try {
          chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
        } catch (error) {
          reject(error);
        }
      });
      stream.on("end", () => {
        resolve(Buffer.concat(chunks));
      });
      stream.on("error", (error) => {
        reject(error);
      });
    });
  }

  /**
   * Converts Web ReadableStream to Buffer
   */
  static async webStreamToBuffer(stream) {
    const chunks = [];
    const reader = stream.getReader();
    try {
      let done = false;
      while (!done) {
        const result = await reader.read();
        done = result.done;
        if (result.value) {
          chunks.push(result.value);
        }
      }
    } finally {
      // Always release the lock, even when a read rejects.
      reader.releaseLock();
    }
    return Buffer.concat(chunks);
  }

  /**
   * Converts ReadableStream (Web or Node.js) to Buffer
   * Automatically detects stream type and uses appropriate conversion
   * @param stream - Web ReadableStream or Node.js Readable stream
   * @returns Promise that resolves to Buffer
   * @throws Error when the value is neither kind of stream
   */
  static async toBuffer(stream) {
    if (this.isWebReadableStream(stream)) {
      return this.webStreamToBuffer(stream);
    } else if (this.isNodeReadableStream(stream)) {
      return this.nodeStreamToBuffer(stream);
    } else {
      throw new Error("Stream type not supported. Expected Web ReadableStream or Node.js Readable stream.");
    }
  }

  /**
   * Creates a web ReadableStream from a Buffer
   * @param buffer - Buffer to be converted
   * @returns Web ReadableStream that emits the buffer in 64 KiB slices
   */
  static fromBuffer(buffer) {
    let position = 0;
    const chunkSize = 64 * 1024;
    return new web.ReadableStream({
      pull(controller) {
        if (position >= buffer.length) {
          controller.close();
          return;
        }
        const chunk = buffer.subarray(position, Math.min(position + chunkSize, buffer.length));
        const uint8Array = new Uint8Array(chunk.buffer, chunk.byteOffset, chunk.byteLength);
        controller.enqueue(uint8Array);
        position += chunk.length;
      }
    });
  }

  /**
   * Converts Node.js Readable stream to Web ReadableStream
   * Useful for HTTP requests (axios, fetch, etc)
   * @param nodeStream - Node.js Readable stream
   * @returns Web API ReadableStream
   */
  static nodeToWebStream(nodeStream) {
    return bridgeNodeStream(nodeStream);
  }
};

// src/infrastructure/adapters/xml-adapter.ts

/** The five predefined XML entities. */
const XML_ENTITIES = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", "\""],
  ["apos", "'"]
]);

/** `w:val` values that switch a toggle/underline property off. */
const OFF_TOGGLE_VALUES = new Set(["0", "false", "off", "none"]);

/**
 * Decodes predefined and numeric XML character references in a single pass
 * (so "&amp;lt;" becomes "&lt;", not "<"). References to code points above
 * U+10FFFF are left untouched.
 */
function decodeXmlEntities(text) {
  return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g, (whole, body) => {
    if (body[0] !== "#") {
      return XML_ENTITIES.get(body);
    }
    const codePoint = body[1] === "x"
      ? Number.parseInt(body.slice(2), 16)
      : Number.parseInt(body.slice(1), 10);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  });
}

/**
 * Yields the non-empty inner XML of every `<w:TAG ...>...</w:TAG>` element.
 * The `\b` keeps longer tag names (`w:pPr` for `p`, `w:tblPr` for `tbl`, ...)
 * from being mistaken for the element itself. Matching is regex based and
 * lazy, so nested elements of the same name are not supported.
 */
function* elementBodies(xml, tag) {
  const pattern = new RegExp(`<w:${tag}\\b[^>]*>(.*?)<\\/w:${tag}>`, "gs");
  for (const match of xml.matchAll(pattern)) {
    if (match[1]) {
      yield match[1];
    }
  }
}

/**
 * True when `xml` contains a `<w:TAG>` property that is switched on: either
 * it has no `w:val` attribute or its `w:val` is not one of the off values.
 */
function hasEnabledToggle(xml, tag) {
  const pattern = new RegExp(`<w:${tag}\\b([^>]*)>`, "g");
  for (const [, attributes] of xml.matchAll(pattern)) {
    const value = /\bw:val=["']([^"']*)["']/.exec(attributes);
    if (!value || !OFF_TOGGLE_VALUES.has(value[1].toLowerCase())) {
      return true;
    }
  }
  return false;
}

/**
 * True when the XML contains a PAGE field instruction. Surrounding
 * whitespace and trailing switches inside `w:instrText` are tolerated.
 */
function hasPageField(xml) {
  return /<w:instrText\b[^>]*>\s*PAGE\b[^<]*<\/w:instrText>/.test(xml);
}

const XmlAdapter = class {
  static {
    __name(this, "XmlAdapter");
  }

  /** Placeholder: always throws, real XML parsing is not implemented. */
  parseXml(xmlContent) {
    console.log("XML content length:", xmlContent.length);
    throw new DocxParseError("XML parsing not implemented - use a proper XML parser library");
  }

  /** Placeholder: always throws. */
  extractText(xmlDoc) {
    console.log("XML doc:", xmlDoc);
    throw new DocxParseError("XML text extraction not implemented");
  }

  /** Placeholder: always throws. */
  extractElements(xmlDoc, tagName) {
    console.log("XML doc:", xmlDoc, "Tag name:", tagName);
    throw new DocxParseError("XML element extraction not implemented");
  }

  /**
   * Simple regex-based text extraction for basic functionality.
   * Tags are replaced by spaces, character references are decoded, and
   * whitespace runs are collapsed.
   * @throws DocxParseError when the input cannot be processed
   */
  extractTextFromXml(xmlContent) {
    try {
      // Decode only after the tags are gone so a decoded "<" is never
      // mistaken for markup.
      const withoutTags = xmlContent.replace(/<[^>]*>/g, " ");
      return decodeXmlEntities(withoutTags).replace(/\s+/g, " ").trim();
    } catch (error) {
      throw wrapError("Failed to extract text from XML", error);
    }
  }

  /**
   * Extract paragraph content using regex.
   * @returns the text of every paragraph that is not blank
   */
  extractParagraphsFromXml(xmlContent) {
    try {
      const paragraphs = [];
      for (const paragraphXml of elementBodies(xmlContent, "p")) {
        const text = this.extractTextFromXml(paragraphXml);
        if (text.trim()) {
          paragraphs.push(text);
        }
      }
      return paragraphs;
    } catch (error) {
      throw wrapError("Failed to extract paragraphs", error);
    }
  }

  /**
   * Shared walker behind the two "paragraphs with ..." extractors.
   * @param withFormatting - also attach `formatting` when any flag is set
   * @returns `{ text, type, level?, formatting? }` for each non-blank paragraph
   */
  collectParagraphElements(xmlContent, withFormatting) {
    const elements = [];
    for (const paragraphXml of elementBodies(xmlContent, "p")) {
      const text = this.extractTextFromXml(paragraphXml);
      if (!text.trim()) {
        continue;
      }
      const styleInfo = this.analyzeStyleForHeader(paragraphXml);
      const element = {
        text: text.trim(),
        type: styleInfo.isHeader ? "header" : "paragraph"
      };
      if (styleInfo.level !== void 0) {
        element.level = styleInfo.level;
      }
      if (withFormatting) {
        const formattingInfo = this.extractFormattingInfo(paragraphXml);
        if (Object.keys(formattingInfo).length > 0) {
          element.formatting = formattingInfo;
        }
      }
      elements.push(element);
    }
    return elements;
  }

  /** Extract paragraphs with style information to detect headers. */
  extractParagraphsWithStyleFromXml(xmlContent) {
    try {
      return this.collectParagraphElements(xmlContent, false);
    } catch (error) {
      throw wrapError("Failed to extract paragraphs with style", error);
    }
  }

  /** Extract paragraphs with formatting information (including strike for checkboxes). */
  extractParagraphsWithFormattingFromXml(xmlContent) {
    try {
      return this.collectParagraphElements(xmlContent, true);
    } catch (error) {
      throw wrapError("Failed to extract paragraphs with formatting", error);
    }
  }

  /**
   * Extract formatting information from paragraph XML.
   * A flag is only set when the matching property is present and not
   * switched off through `w:val` ("0", "false", "off", "none").
   * @returns object with any of `bold`, `italic`, `underline`, `strike` set to true
   */
  extractFormattingInfo(paragraphXml) {
    const formatting = {};
    if (hasEnabledToggle(paragraphXml, "b")) {
      formatting.bold = true;
    }
    if (hasEnabledToggle(paragraphXml, "i")) {
      formatting.italic = true;
    }
    if (hasEnabledToggle(paragraphXml, "u")) {
      formatting.underline = true;
    }
    if (hasEnabledToggle(paragraphXml, "strike")) {
      formatting.strike = true;
    }
    return formatting;
  }

  /**
   * Analyze paragraph style to determine if it's a header.
   * Precedence: a style containing "heading" (level from its digits,
   * default 1), then "subtitle" (level 2), then "title" (level 1).
   * "subtitle" must be tested before "title" because every subtitle style
   * name also contains "title".
   * @returns `{ isHeader: true, level }` or `{ isHeader: false }`
   */
  analyzeStyleForHeader(paragraphXml) {
    const styleNames = Array.from(
      paragraphXml.matchAll(/<w:pStyle[^>]*w:val=["']([^"']*)["'][^>]*>/gi),
      (match) => match[1].toLowerCase()
    );
    const heading = styleNames.find((name) => name.includes("heading"));
    if (heading !== undefined) {
      const levelMatch = heading.match(/heading(\d+)|h(\d+)/);
      const level = levelMatch ? Number.parseInt(levelMatch[1] || levelMatch[2] || "1", 10) : 1;
      return { isHeader: true, level };
    }
    if (styleNames.some((name) => name.includes("subtitle"))) {
      return { isHeader: true, level: 2 };
    }
    if (styleNames.some((name) => name.includes("title"))) {
      return { isHeader: true, level: 1 };
    }
    return { isHeader: false };
  }

  /**
   * Extract table content using regex.
   * @returns `[{ rows: [{ cells: string[] }] }]`; rows without cells and
   *          tables without rows are dropped
   */
  extractTablesFromXml(xmlContent) {
    try {
      const tables = [];
      for (const tableXml of elementBodies(xmlContent, "tbl")) {
        const rows = [];
        for (const rowXml of elementBodies(tableXml, "tr")) {
          const cells = [];
          for (const cellXml of elementBodies(rowXml, "tc")) {
            cells.push(this.extractTextFromXml(cellXml));
          }
          if (cells.length > 0) {
            rows.push({ cells });
          }
        }
        if (rows.length > 0) {
          tables.push({ rows });
        }
      }
      return tables;
    } catch (error) {
      throw wrapError("Failed to extract tables", error);
    }
  }

  /**
   * Extract header content from XML.
   * @returns `{ text, hasPageNumber, watermark? }`, or null when the part
   *          has no text, no page field and no watermark
   */
  extractHeaderFromXml(xmlContent) {
    try {
      const textContent = this.extractTextFromXml(xmlContent);
      const hasPageNumber = hasPageField(xmlContent);
      const watermarkMatch = xmlContent.match(/string="([^"]*)"[^>]*>.*?<v:textpath/) || xmlContent.match(/string="([^"]*)".*?fitshape="t"/);
      const watermark = watermarkMatch ? watermarkMatch[1] : void 0;
      if (textContent.trim() || hasPageNumber || watermark) {
        const result = {
          text: textContent.trim(),
          hasPageNumber
        };
        if (watermark) {
          result.watermark = watermark;
        }
        return result;
      }
      return null;
    } catch (error) {
      throw wrapError("Failed to extract header", error);
    }
  }

  /**
   * Extract footer content from XML.
   * @returns `{ text, hasPageNumber }`, or null when the part has neither
   */
  extractFooterFromXml(xmlContent) {
    try {
      const textContent = this.extractTextFromXml(xmlContent);
      const hasPageNumber = hasPageField(xmlContent);
      if (textContent.trim() || hasPageNumber) {
        return {
          text: textContent.trim(),
          hasPageNumber
        };
      }
      return null;
    } catch (error) {
      throw wrapError("Failed to extract footer", error);
    }
  }

  /**
   * Extract footnotes from XML.
   * @returns `{ id, text }` for every footnote with a non-empty id and
   *          non-blank text
   */
  extractFootnotesFromXml(xmlContent) {
    try {
      const footnotes = [];
      const footnoteRegex = /<w:footnote\b[^>]*w:id="([^"]*)"[^>]*>(.*?)<\/w:footnote>/gs;
      for (const [, id, footnoteXml] of xmlContent.matchAll(footnoteRegex)) {
        if (!id || !footnoteXml) {
          continue;
        }
        const text = this.extractTextFromXml(footnoteXml);
        if (text.trim()) {
          footnotes.push({ id, text: text.trim() });
        }
      }
      return footnotes;
    } catch (error) {
      throw wrapError("Failed to extract footnotes", error);
    }
  }
};

const ZipAdapter = class {
  static {
    __name(this, "ZipAdapter");
  }

  /**
   * Reads one entry of the archive.
   * @returns Buffer with the entry content, or null when the entry is absent
   * @throws DocxParseError when the archive cannot be read
   */
  async extractFile(buffer, filename) {
    try {
      const zip = new JSZip__default.default();
      const zipData = await zip.loadAsync(buffer);
      const file = zipData.file(filename);
      if (!file) {
        return null;
      }
      const content = await file.async("uint8array");
      return Buffer.from(content);
    } catch (error) {
      throw wrapError(`Failed to extract file ${filename}`, error);
    }
  }

  /**
   * Reads every non-directory entry whose name matches `pattern`.
   * @returns Map of entry name to Buffer
   * @throws DocxParseError when the archive cannot be read
   */
  async extractFiles(buffer, pattern) {
    try {
      const zip = new JSZip__default.default();
      const zipData = await zip.loadAsync(buffer);
      const files = /* @__PURE__ */ new Map();
      for (const [filename, file] of Object.entries(zipData.files)) {
        if (file && !file.dir && pattern.test(filename)) {
          const content = await file.async("uint8array");
          files.set(filename, Buffer.from(content));
        }
      }
      return files;
    } catch (error) {
      throw wrapError(`Failed to extract files with pattern ${pattern}`, error);
    }
  }
};

// src/infrastructure/repositories/docx-repository.ts

/** Media file extension -> reported image format. */
const IMAGE_FORMATS = new Map([
  ["png", "png"],
  ["jpg", "jpg"],
  ["jpeg", "jpg"],
  ["gif", "gif"],
  ["svg", "svg"],
  ["wmf", "wmf"],
  ["emf", "emf"]
]);

const DocxRepository = class {
  static {
    __name(this, "DocxRepository");
  }
  zipAdapter;
  xmlAdapter;

  constructor() {
    this.zipAdapter = new ZipAdapter();
    this.xmlAdapter = new XmlAdapter();
  }

  /**
   * Parses a DOCX source and yields its elements in this order: metadata,
   * body content (paragraphs/headers, then tables), header parts, footer
   * parts, footnotes, images. Which groups are produced depends on `options`.
   * @param source - Buffer, Web ReadableStream or Node.js Readable stream
   * @param options - parse options; unset values fall back to the defaults below
   * @throws DocxParseError wrapping any failure
   */
  async *parse(source, options = {}) {
    try {
      const buffer = source instanceof Buffer ? source : await StreamAdapter.toBuffer(source);
      const opts = {
        includeMetadata: options.includeMetadata ?? true,
        includeImages: options.includeImages ?? true,
        includeTables: options.includeTables ?? true,
        includeHeaders: options.includeHeaders ?? false,
        includeFooters: options.includeFooters ?? false,
        imageFormat: options.imageFormat ?? "buffer",
        maxImageSize: options.maxImageSize ?? 10 * 1024 * 1024, // 10MB
        preserveFormatting: options.preserveFormatting ?? true,
        normalizeWhitespace: options.normalizeWhitespace ?? true,
        chunkSize: options.chunkSize ?? 64 * 1024, // 64KB
        concurrent: options.concurrent ?? false
      };
      let elementId = 0;
      const getNextId = /* @__PURE__ */ __name(() => `element_${++elementId}`, "getNextId");
      if (opts.includeMetadata) {
        yield* this.extractMetadata(buffer, getNextId);
      }
      yield* this.extractContent(buffer, opts, getNextId);
      if (opts.includeHeaders) {
        yield* this.extractHeaders(buffer, getNextId);
      }
      if (opts.includeFooters) {
        yield* this.extractFooters(buffer, getNextId);
      }
      yield* this.extractFootnotes(buffer, getNextId);
      if (opts.includeImages) {
        yield* this.extractImages(buffer, opts, getNextId);
      }
    } catch (error) {
      throw wrapError("Failed to parse DOCX document", error);
    }
  }

  /**
   * Yields exactly one "metadata" element built from docProps/core.xml.
   * Falls back to placeholder values when the part is missing and to an
   * empty content object when reading it fails (best effort, never throws).
   */
  async *extractMetadata(buffer, getNextId) {
    try {
      const corePropsBuffer = await this.zipAdapter.extractFile(buffer, "docProps/core.xml");
      if (corePropsBuffer) {
        const corePropsXml = corePropsBuffer.toString("utf-8");
        const title = this.extractMetadataValue(corePropsXml, "dc:title");
        const author = this.extractMetadataValue(corePropsXml, "dc:creator");
        const subject = this.extractMetadataValue(corePropsXml, "dc:subject");
        const created = this.extractMetadataDate(corePropsXml, "dcterms:created");
        const modified = this.extractMetadataDate(corePropsXml, "dcterms:modified");
        yield {
          type: "metadata",
          id: getNextId(),
          position: { page: 0, section: 0, order: 0 },
          content: {
            title: title || void 0,
            author: author || void 0,
            subject: subject || void 0,
            created: created || void 0,
            modified: modified || void 0
          }
        };
      } else {
        yield {
          type: "metadata",
          id: getNextId(),
          position: { page: 0, section: 0, order: 0 },
          content: {
            title: "Unknown Document",
            author: "Unknown"
          }
        };
      }
    } catch {
      yield {
        type: "metadata",
        id: getNextId(),
        position: { page: 0, section: 0, order: 0 },
        content: {}
      };
    }
  }

  /**
   * Yields the paragraphs and headers of word/document.xml, followed by its
   * tables when `options.includeTables` is set. Paragraphs that carry strike
   * formatting are additionally flagged as a checked checkbox.
   * @throws DocxParseError when the main document part is missing or unreadable
   */
  async *extractContent(buffer, options, getNextId) {
    try {
      const documentBuffer = await this.zipAdapter.extractFile(buffer, "word/document.xml");
      if (!documentBuffer) {
        throw new DocxParseError("Main document XML not found");
      }
      const documentXml = documentBuffer.toString("utf-8");
      const elements = this.xmlAdapter.extractParagraphsWithFormattingFromXml(documentXml);
      let order = 1;
      for (const element of elements) {
        const content = options.normalizeWhitespace
          ? element.text.replace(/\s+/g, " ").trim()
          : element.text;
        // Detected flags are layered over fixed default font values.
        const formatting = {
          fontFamily: "Calibri",
          fontSize: 12,
          ...element.formatting
        };
        if (element.type === "header") {
          yield {
            type: "header",
            id: getNextId(),
            position: { page: 1, section: 1, order: order++ },
            content,
            level: element.level || 1,
            formatting
          };
        } else {
          const documentElement = {
            type: "paragraph",
            id: getNextId(),
            position: { page: 1, section: 1, order: order++ },
            content,
            formatting
          };
          if (element.formatting?.strike) {
            documentElement.checkbox = { checked: true };
          }
          yield documentElement;
        }
      }
      if (options.includeTables) {
        yield* this.extractTables(documentXml, options, getNextId, order);
      }
    } catch (error) {
      throw wrapError("Failed to extract content", error);
    }
  }

  /**
   * Yields one "table" element per table found in the document XML.
   * Failures are logged with console.warn and otherwise ignored.
   * @param startOrder - first `position.order` value to use
   */
  async *extractTables(documentXml, options, getNextId, startOrder) {
    try {
      const tables = this.xmlAdapter.extractTablesFromXml(documentXml);
      let order = startOrder;
      for (const table of tables) {
        const tableRows = table.rows.map((row) => ({
          cells: row.cells.map((cellText) => ({
            content: options.normalizeWhitespace
              ? cellText.replace(/\s+/g, " ").trim()
              : cellText
          })),
          isHeader: false
        }));
        yield {
          type: "table",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content: tableRows
        };
      }
    } catch (error) {
      console.warn("Failed to extract tables:", error);
    }
  }

  /**
   * Yields one "image" element per file under word/media/ that has a known
   * extension and does not exceed `options.maxImageSize` bytes.
   * Failures are logged with console.warn and otherwise ignored.
   */
  async *extractImages(buffer, options, getNextId) {
    try {
      const mediaFiles = await this.zipAdapter.extractFiles(buffer, /^word\/media\//);
      let order = 1e3;
      for (const [filename, imageBuffer] of mediaFiles) {
        if (imageBuffer.length > options.maxImageSize) {
          continue;
        }
        const format = this.getImageFormat(filename);
        if (!format) continue;
        const filenameOnly = filename.split("/").pop() || "unknown.img";
        yield {
          type: "image",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content: imageBuffer,
          metadata: {
            filename: filenameOnly,
            format,
            width: 0, // TODO: Extract actual dimensions
            height: 0
          },
          positioning: {
            inline: true
          }
        };
      }
    } catch (error) {
      console.warn("Failed to extract images:", error);
    }
  }

  /**
   * Returns the trimmed, entity-decoded text of the first `<tagName>`
   * element, or null when it is absent or empty.
   */
  extractMetadataValue(xml, tagName) {
    const regex = new RegExp(`<${tagName}[^>]*>([^<]*)</${tagName}>`, "i");
    const raw = xml.match(regex)?.[1]?.trim();
    return raw ? decodeXmlEntities(raw) : null;
  }

  /**
   * Returns the element value as a Date, or null when the element is
   * absent, empty, or not parseable as a date.
   */
  extractMetadataDate(xml, tagName) {
    const dateStr = this.extractMetadataValue(xml, tagName);
    if (!dateStr) {
      return null;
    }
    const parsed = new Date(dateStr);
    return Number.isNaN(parsed.getTime()) ? null : parsed;
  }

  /**
   * Maps a file name to an image format by its (case-insensitive) extension.
   * @returns "png", "jpg", "gif", "svg", "wmf", "emf", or null when unknown
   */
  getImageFormat(filename) {
    const ext = filename.toLowerCase().split(".").pop();
    return IMAGE_FORMATS.get(ext) ?? null;
  }

  /**
   * Yields one "header" element (level 1) per word/header*.xml part that has
   * content. Failures are logged with console.warn and otherwise ignored.
   */
  async *extractHeaders(buffer, getNextId) {
    try {
      const headerFiles = await this.zipAdapter.extractFiles(buffer, /^word\/header\d*\.xml$/);
      let order = 2e3;
      for (const [, headerBuffer] of headerFiles) {
        const headerXml = headerBuffer.toString("utf-8");
        const headerInfo = this.xmlAdapter.extractHeaderFromXml(headerXml);
        if (headerInfo) {
          const element = {
            type: "header",
            id: getNextId(),
            position: { page: 1, section: 1, order: order++ },
            content: headerInfo.text,
            level: 1
          };
          if (headerInfo.hasPageNumber) {
            element.hasPageNumber = true;
          }
          if (headerInfo.watermark) {
            element.watermark = headerInfo.watermark;
          }
          yield element;
        }
      }
    } catch (error) {
      console.warn("Failed to extract headers:", error);
    }
  }

  /**
   * Yields one "footer" element per word/footer*.xml part that has content.
   * Failures are logged with console.warn and otherwise ignored.
   */
  async *extractFooters(buffer, getNextId) {
    try {
      const footerFiles = await this.zipAdapter.extractFiles(buffer, /^word\/footer\d*\.xml$/);
      let order = 3e3;
      for (const [, footerBuffer] of footerFiles) {
        const footerXml = footerBuffer.toString("utf-8");
        const footerInfo = this.xmlAdapter.extractFooterFromXml(footerXml);
        if (footerInfo) {
          const element = {
            type: "footer",
            id: getNextId(),
            position: { page: 1, section: 1, order: order++ },
            content: footerInfo.text
          };
          if (footerInfo.hasPageNumber) {
            element.hasPageNumber = true;
          }
          yield element;
        }
      }
    } catch (error) {
      console.warn("Failed to extract footers:", error);
    }
  }

  /**
   * Yields the footnotes of word/footnotes.xml as "paragraph" elements
   * marked with `isFootnote` and `footnoteId`.
   * Failures are logged with console.warn and otherwise ignored.
   */
  async *extractFootnotes(buffer, getNextId) {
    try {
      const footnotesBuffer = await this.zipAdapter.extractFile(buffer, "word/footnotes.xml");
      if (footnotesBuffer) {
        const footnotesXml = footnotesBuffer.toString("utf-8");
        const footnotes = this.xmlAdapter.extractFootnotesFromXml(footnotesXml);
        let order = 4e3;
        for (const footnote of footnotes) {
          const element = {
            type: "paragraph",
            id: getNextId(),
            position: { page: 1, section: 1, order: order++ },
            content: footnote.text,
            formatting: {
              fontFamily: "Calibri",
              fontSize: 10
            }
          };
          element.footnoteId = footnote.id;
          element.isFootnote = true;
          yield element;
        }
      }
    } catch (error) {
      console.warn("Failed to extract footnotes:", error);
    }
  }
};

// src/application/use-cases/parse-document.use-case.ts

const ParseDocumentUseCaseImpl = class {
  static {
    __name(this, "ParseDocumentUseCaseImpl");
  }
  docxRepository;

  /** @param docxRepository - optional repository; a DocxRepository is created when omitted */
  constructor(docxRepository) {
    this.docxRepository = docxRepository || new DocxRepository();
  }

  /**
   * Streams the elements of a document.
   * @throws DocxParseError - repository errors are rethrown unchanged,
   *         anything else is wrapped
   */
  async *execute(source, options = {}) {
    try {
      if (!source) {
        throw new DocxParseError("Source cannot be null or undefined");
      }
      yield* this.docxRepository.parse(source, options);
    } catch (error) {
      if (error instanceof DocxParseError) {
        throw error;
      }
      throw wrapError("Failed to parse document", error);
    }
  }
};

// src/application/use-cases/extract-content.use-case.ts

const ExtractContentUseCaseImpl = class {
  static {
    __name(this, "ExtractContentUseCaseImpl");
  }
  parseDocumentUseCase;

  constructor() {
    this.parseDocumentUseCase = new ParseDocumentUseCaseImpl();
  }

  /**
   * Returns the text of all paragraph, header and table elements joined by
   * newlines. Table cells are joined by spaces, table rows by newlines.
   * @throws DocxParseError wrapping any failure
   */
  async extractText(source, options = {}) {
    try {
      const textParts = [];
      const parseOptions = {
        includeMetadata: false,
        includeImages: false,
        includeTables: true,
        preserveFormatting: options?.preserveFormatting ?? false
      };
      for await (const element of this.parseDocumentUseCase.execute(source, parseOptions)) {
        if (element.type === "paragraph" || element.type === "header") {
          textParts.push(element.content);
        } else if (element.type === "table") {
          const tableText = element.content
            .map((row) => row.cells.map((cell) => cell.content).join(" "))
            .join("\n");
          textParts.push(tableText);
        }
      }
      return textParts.join("\n");
    } catch (error) {
      throw wrapError("Failed to extract text", error);
    }
  }

  /**
   * Yields only the "image" elements of the document.
   * @throws DocxParseError wrapping any failure
   */
  async *extractImages(source) {
    try {
      const parseOptions = {
        includeMetadata: false,
        includeImages: true,
        includeTables: false,
        includeHeaders: false,
        includeFooters: false
      };
      for await (const element of this.parseDocumentUseCase.execute(source, parseOptions)) {
        if (element.type === "image") {
          yield element;
        }
      }
    } catch (error) {
      throw wrapError("Failed to extract images", error);
    }
  }

  /**
   * Returns the content of the first "metadata" element, or an empty object
   * when none is produced.
   * @throws DocxParseError wrapping any failure
   */
  async extractMetadata(source) {
    try {
      const parseOptions = {
        includeMetadata: true,
        includeImages: false,
        includeTables: false,
        includeHeaders: false,
        includeFooters: false
      };
      for await (const element of this.parseDocumentUseCase.execute(source, parseOptions)) {
        if (element.type === "metadata") {
          return element.content;
        }
      }
      return {};
    } catch (error) {
      throw wrapError("Failed to extract metadata", error);
    }
  }
};

// src/application/use-cases/validate-document.use-case.ts

const ValidateDocumentUseCaseImpl = class {
  static {
    __name(this, "ValidateDocumentUseCaseImpl");
  }

  /**
   * Runs cheap structural checks. Only Buffer sources are inspected
   * (emptiness, 100MB size warning, ZIP signature); other non-null sources
   * pass without checks.
   * @returns `{ isValid, errors }`; `isValid` is false when any entry has
   *          severity "error"
   */
  async validate(source) {
    try {
      const errors = [];
      if (!source) {
        errors.push({
          code: "INVALID_SOURCE",
          message: "Source cannot be null or undefined",
          severity: "error"
        });
      }
      if (source instanceof Buffer) {
        if (source.length === 0) {
          errors.push({
            code: "EMPTY_BUFFER",
            message: "Buffer is empty",
            severity: "error"
          });
        }
        if (source.length > 100 * 1024 * 1024) {
          errors.push({
            code: "LARGE_FILE",
            message: "File size exceeds 100MB limit",
            severity: "warning"
          });
        }
        if (!this.hasValidZipSignature(source)) {
          errors.push({
            code: "INVALID_ZIP_SIGNATURE",
            message: "File does not appear to be a valid ZIP archive",
            severity: "error"
          });
        }
      }
      return {
        isValid: errors.filter((e) => e.severity === "error").length === 0,
        errors
      };
    } catch (error) {
      throw wrapError("Validation failed", error);
    }
  }

  /** True when the buffer starts with the bytes "PK\x03\x04". */
  hasValidZipSignature(buffer) {
    if (buffer.length < 4) return false;
    return buffer[0] === 80 && // P
      buffer[1] === 75 && // K
      buffer[2] === 3 &&
      buffer[3] === 4;
  }
};

// src/application/use-cases/parse-document-to-array.use-case.ts

const ParseDocumentToArrayUseCaseImpl = class {
  static {
    __name(this, "ParseDocumentToArrayUseCaseImpl");
  }
  parseDocumentUseCase;

  constructor() {
    this.parseDocumentUseCase = new ParseDocumentUseCaseImpl();
  }

  /**
   * Collects every parsed element into an array.
   * @throws DocxParseError wrapping any failure
   */
  async execute(source, options) {
    try {
      const elements = [];
      for await (const element of this.parseDocumentUseCase.execute(source, options)) {
        elements.push(element);
      }
      return elements;
    } catch (error) {
      throw wrapError("Failed to parse document to array", error);
    }
  }
};

// src/interfaces/docx-parser.ts

/**
 * Normalizes the source accepted by the convenience functions.
 * Buffers and Web ReadableStreams pass through; objects with both
 * `readable` and `path` go through StreamAdapter.toWebStream; other
 * Node-style streams (`read` + `on`) go through StreamAdapter.nodeToWebStream.
 * @throws Error "Unsupported source type" for anything else, including
 *         null, undefined and primitives
 */
function toParsableSource(source) {
  if (source instanceof Buffer) {
    return source;
  }
  // The `in` operator below throws a TypeError on null and primitives.
  if (source === null || (typeof source !== "object" && typeof source !== "function")) {
    throw new Error("Unsupported source type");
  }
  if ("readable" in source && "path" in source) {
    return StreamAdapter.toWebStream(source);
  }
  if (typeof source.getReader === "function") {
    return source;
  }
  if (typeof source.read === "function" && typeof source.on === "function") {
    return StreamAdapter.nodeToWebStream(source);
  }
  throw new Error("Unsupported source type");
}

/** Streams the elements of a DOCX held in a Buffer. */
async function* parseDocx(buffer, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  yield* useCase.execute(buffer, options);
}
__name(parseDocx, "parseDocx");

/** Streams the elements of a DOCX read from a Node.js fs.ReadStream. */
async function* parseDocxStream(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  const webStream = StreamAdapter.toWebStream(stream);
  yield* useCase.execute(webStream, options);
}
__name(parseDocxStream, "parseDocxStream");

/** Streams the elements of a DOCX read from a Node.js Readable (e.g. an HTTP response body). */
async function* parseDocxHttpStream(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  const webStream = StreamAdapter.nodeToWebStream(stream);
  yield* useCase.execute(webStream, options);
}
__name(parseDocxHttpStream, "parseDocxHttpStream");

/** Streams the elements of a DOCX read from a Web ReadableStream. */
async function* parseDocxWebStream(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  yield* useCase.execute(stream, options);
}
__name(parseDocxWebStream, "parseDocxWebStream");

/** Streams the elements of the DOCX file at `filePath`. */
async function* parseDocxFile(filePath, options) {
  const stream = fs.createReadStream(filePath);
  yield* parseDocxStream(stream, options);
}
__name(parseDocxFile, "parseDocxFile");

/**
 * Parses a Buffer or stream and resolves to the array of all elements.
 * @throws Error "Unsupported source type" for unrecognized sources
 */
async function parseDocxToArray(source, options) {
  const useCase = new ParseDocumentToArrayUseCaseImpl();
  return useCase.execute(toParsableSource(source), options);
}
__name(parseDocxToArray, "parseDocxToArray");

/**
 * Resolves to the plain text of a Buffer or stream source.
 * @throws Error "Unsupported source type" for unrecognized sources
 */
async function extractText(source, options) {
  const useCase = new ExtractContentUseCaseImpl();
  return useCase.extractText(toParsableSource(source), options);
}
__name(extractText, "extractText");

/**
 * Yields the image elements of a Buffer or stream source.
 * @throws Error "Unsupported source type" for unrecognized sources
 */
async function* extractImages(source) {
  const useCase = new ExtractContentUseCaseImpl();
  yield* useCase.extractImages(toParsableSource(source));
}
__name(extractImages, "extractImages");

/**
 * Resolves to the metadata content of a Buffer or stream source.
 * @throws Error "Unsupported source type" for unrecognized sources
 */
async function getMetadata(source) {
  const useCase = new ExtractContentUseCaseImpl();
  return useCase.extractMetadata(toParsableSource(source));
}
__name(getMetadata, "getMetadata");

/** Streams the elements of a DOCX read from any Node.js Readable stream. */
async function* parseDocxReadable(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  const webStream = StreamAdapter.nodeToWebStream(stream);
  yield* useCase.execute(webStream, options);
}
__name(parseDocxReadable, "parseDocxReadable");

// src/index.ts
const VERSION = "1.0.0";

exports.DocxParseError = DocxParseError;
exports.StreamAdapter = StreamAdapter;
exports.VERSION = VERSION;
exports.ValidateDocumentUseCaseImpl = ValidateDocumentUseCaseImpl;
exports.extractImages = extractImages;
exports.extractText = extractText;
exports.getMetadata = getMetadata;
exports.parseDocx = parseDocx;
exports.parseDocxFile = parseDocxFile;
exports.parseDocxHttpStream = parseDocxHttpStream;
exports.parseDocxReadable = parseDocxReadable;
exports.parseDocxStream = parseDocxStream;
exports.parseDocxToArray = parseDocxToArray;
exports.parseDocxWebStream = parseDocxWebStream;
//# sourceMappingURL=index.cjs.map
//# sourceMappingURL=index.cjs.map