UNPKG

@thasmorato/docx-parser

Version:

A modern JavaScript library for parsing and processing Microsoft Word DOCX documents with support for both buffer and stream operations. Features incremental parsing, checkbox detection, footnote support, and document validation.

github.com/ThaSMorato/docx-parser

ThaSMorato/docx-parser

1,092 lines (1,078 loc) • 36.1 kB

JavaScript

'use strict';

var fs = require('fs');
var web = require('stream/web');
var JSZip = require('jszip');

function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }

var JSZip__default = /*#__PURE__*/_interopDefault(JSZip);

var __defProp = Object.defineProperty;
var __name = (target, value) => __defProp(target, "name", { value, configurable: true });

// src/domain/types.ts

/**
 * Error type thrown by every layer of the parser.
 *
 * @param message - Human readable description.
 * @param position - Optional document position the error relates to.
 * @param code - Optional machine readable error code.
 * @param options - Optional `{ cause }` carrying the underlying error.
 */
var DocxParseError = class extends Error {
  constructor(message, position, code, options) {
    super(message, options);
    this.position = position;
    this.code = code;
    this.name = "DocxParseError";
  }
  static {
    __name(this, "DocxParseError");
  }
};

/** Returns the message of a thrown value, or "Unknown error" for non-Error values. */
function describeError(error) {
  return error instanceof Error ? error.message : "Unknown error";
}

/** Builds a DocxParseError of the form "<prefix>: <message>" that keeps the original error as `cause`. */
function wrapError(prefix, error) {
  return new DocxParseError(`${prefix}: ${describeError(error)}`, void 0, void 0, { cause: error });
}

/**
 * Normalises one stream chunk (Buffer, Uint8Array or string) to a Uint8Array view.
 * Strings are encoded via Buffer.from, so string-mode streams are not silently dropped.
 */
function chunkToUint8Array(chunk) {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
  return new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
}

var StreamAdapter = class {
  static {
    __name(this, "StreamAdapter");
  }

  /**
   * Converts a Node.js ReadStream to web ReadableStream
   * @param readStream - Node.js read stream (fs.ReadStream)
   * @returns Web API ReadableStream
   */
  static toWebStream(readStream) {
    return StreamAdapter.nodeToWebStream(readStream);
  }

  /**
   * Checks if it's a Web ReadableStream
   */
  static isWebReadableStream(stream) {
    return Boolean(stream) && typeof stream.getReader === "function";
  }

  /**
   * Checks if it's a Node.js Readable stream
   */
  static isNodeReadableStream(stream) {
    return Boolean(stream) && typeof stream.read === "function" && typeof stream.on === "function";
  }

  /**
   * Converts Node.js Readable stream to Buffer
   * @param stream - Node.js Readable stream
   * @returns Promise resolving to the concatenated stream content
   */
  static nodeStreamToBuffer(stream) {
    const chunks = [];
    return new Promise((resolve, reject) => {
      stream.on("data", (chunk) => {
        // Buffer.concat rejects strings, so normalise chunks from streams with an encoding set.
        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
      });
      stream.on("end", () => {
        resolve(Buffer.concat(chunks));
      });
      stream.on("error", reject);
    });
  }

  /**
   * Converts Web ReadableStream to Buffer
   * @param stream - Web ReadableStream
   * @returns Promise resolving to the concatenated stream content
   */
  static async webStreamToBuffer(stream) {
    const chunks = [];
    const reader = stream.getReader();
    try {
      for (;;) {
        const { done, value } = await reader.read();
        if (value) {
          chunks.push(value);
        }
        if (done) {
          break;
        }
      }
    } finally {
      reader.releaseLock();
    }
    return Buffer.concat(chunks);
  }

  /**
   * Converts ReadableStream (Web or Node.js) to Buffer
   * Automatically detects stream type and uses appropriate conversion
   * @param stream - Web ReadableStream or Node.js Readable stream
   * @returns Promise that resolves to Buffer
   * @throws Error when the value is neither kind of stream
   */
  static async toBuffer(stream) {
    if (this.isWebReadableStream(stream)) {
      return this.webStreamToBuffer(stream);
    }
    if (this.isNodeReadableStream(stream)) {
      return this.nodeStreamToBuffer(stream);
    }
    throw new Error("Stream type not supported. Expected Web ReadableStream or Node.js Readable stream.");
  }

  /**
   * Creates a web ReadableStream from a Buffer
   * @param buffer - Buffer to be converted
   * @returns Web ReadableStream emitting the buffer in 64KB chunks
   */
  static fromBuffer(buffer) {
    const chunkSize = 64 * 1024;
    let position = 0;
    return new web.ReadableStream({
      pull(controller) {
        if (position >= buffer.length) {
          controller.close();
          return;
        }
        const chunk = buffer.subarray(position, Math.min(position + chunkSize, buffer.length));
        controller.enqueue(new Uint8Array(chunk.buffer, chunk.byteOffset, chunk.byteLength));
        position += chunk.length;
      }
    });
  }

  /**
   * Converts Node.js Readable stream to Web ReadableStream
   * Useful for HTTP requests (axios, fetch, etc)
   * @param nodeStream - Node.js Readable stream
   * @returns Web API ReadableStream
   */
  static nodeToWebStream(nodeStream) {
    return new web.ReadableStream({
      start(controller) {
        nodeStream.on("data", (chunk) => {
          try {
            controller.enqueue(chunkToUint8Array(chunk));
          } catch (error) {
            // Surface conversion failures to the reader instead of throwing inside the emitter.
            controller.error(error);
          }
        });
        nodeStream.on("end", () => {
          controller.close();
        });
        nodeStream.on("error", (error) => {
          controller.error(error);
        });
      },
      cancel() {
        if (nodeStream.destroy) {
          nodeStream.destroy();
        }
      }
    });
  }
};

// src/infrastructure/adapters/xml-adapter.ts

/** The five predefined XML entities. */
var XML_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };

/**
 * Decodes predefined and numeric XML character references in a single pass,
 * so "&amp;lt;" becomes "&lt;" and is not decoded twice.
 */
function decodeXmlEntities(text) {
  return text.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (match, body) => {
    if (body[0] !== "#") {
      return XML_ENTITIES[body];
    }
    const codePoint = body[1] === "x" ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10);
    // Out-of-range references are left untouched rather than throwing.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}

/**
 * Builds a global regex capturing the inner XML of `<tagName ...>...</tagName>`.
 * The tag name must be followed by whitespace or `>`, so `w:p` does not match
 * `w:pPr`, and self-closing elements (`<w:p/>`) are not treated as opening tags.
 */
function elementPattern(tagName) {
  return new RegExp(`<${tagName}(?:\\s[^>]*)?(?<!/)>(.*?)</${tagName}>`, "gs");
}

/** `w:val` values that switch a toggle property off. */
var OFF_VALUES = new Set(["0", "false", "off", "none"]);

/** Opening-tag patterns for the run properties reported by extractFormattingInfo. */
var TOGGLE_PATTERNS = {
  bold: /<w:b(?=[\s/>])([^>]*)>/g,
  italic: /<w:i(?=[\s/>])([^>]*)>/g,
  underline: /<w:u(?=[\s/>])([^>]*)>/g,
  strike: /<w:strike(?=[\s/>])([^>]*)>/g
};

/**
 * Returns true when at least one occurrence of the property is switched on:
 * either it has no `w:val` attribute or its value is not one of OFF_VALUES.
 */
function hasEnabledProperty(xml, pattern) {
  for (const match of xml.matchAll(pattern)) {
    const val = /\bw:val=["']([^"']*)["']/.exec(match[1]);
    if (!val || !OFF_VALUES.has(val[1].toLowerCase())) {
      return true;
    }
  }
  return false;
}

/** Detects a PAGE field, written either as w:instrText (with optional padding/switches) or w:fldSimple. */
function hasPageNumberField(xml) {
  return /<w:instrText\b[^>]*>\s*PAGE\b[^<]*<\/w:instrText>/.test(xml) || /<w:fldSimple\b[^>]*\bw:instr=["']\s*PAGE\b/.test(xml);
}

var XmlAdapter = class {
  static {
    __name(this, "XmlAdapter");
  }

  /** Not implemented; always throws DocxParseError. */
  parseXml(xmlContent) {
    throw new DocxParseError("XML parsing not implemented - use a proper XML parser library");
  }

  /** Not implemented; always throws DocxParseError. */
  extractText(xmlDoc) {
    throw new DocxParseError("XML text extraction not implemented");
  }

  /** Not implemented; always throws DocxParseError. */
  extractElements(xmlDoc, tagName) {
    throw new DocxParseError("XML element extraction not implemented");
  }

  /**
   * Simple regex-based text extraction for basic functionality.
   * Paragraph ends, breaks and tabs become a space; every other tag is removed
   * without inserting a space so a word split across runs stays one word.
   * Entities are decoded and whitespace is collapsed.
   * @param xmlContent - XML fragment
   * @returns Plain text, trimmed
   * @throws DocxParseError when the input cannot be processed
   */
  extractTextFromXml(xmlContent) {
    try {
      const withoutTags = xmlContent.replace(/<\/w:p>|<w:(?:br|cr|tab)\b[^>]*>/g, " ").replace(/<[^>]*>/g, "");
      return decodeXmlEntities(withoutTags).replace(/\s+/g, " ").trim();
    } catch (error) {
      throw wrapError("Failed to extract text from XML", error);
    }
  }

  /**
   * Extract paragraph content using regex.
   * @returns Text of every non-empty `w:p` element, in document order
   */
  extractParagraphsFromXml(xmlContent) {
    try {
      const paragraphs = [];
      for (const [, paragraphXml] of xmlContent.matchAll(elementPattern("w:p"))) {
        const text = paragraphXml ? this.extractTextFromXml(paragraphXml) : "";
        if (text) {
          paragraphs.push(text);
        }
      }
      return paragraphs;
    } catch (error) {
      throw wrapError("Failed to extract paragraphs", error);
    }
  }

  /**
   * Extract paragraphs with style information to detect headers.
   * @returns Array of `{ text, type, level? }` where type is "header" or "paragraph"
   */
  extractParagraphsWithStyleFromXml(xmlContent) {
    try {
      return this.collectParagraphs(xmlContent, false);
    } catch (error) {
      throw wrapError("Failed to extract paragraphs with style", error);
    }
  }

  /**
   * Extract paragraphs with formatting information (including strike for checkboxes).
   * @returns Array of `{ text, type, level?, formatting? }`
   */
  extractParagraphsWithFormattingFromXml(xmlContent) {
    try {
      return this.collectParagraphs(xmlContent, true);
    } catch (error) {
      throw wrapError("Failed to extract paragraphs with formatting", error);
    }
  }

  /** Shared walker behind the two styled-paragraph extractors. */
  collectParagraphs(xmlContent, withFormatting) {
    const elements = [];
    for (const [, paragraphXml] of xmlContent.matchAll(elementPattern("w:p"))) {
      const text = paragraphXml ? this.extractTextFromXml(paragraphXml) : "";
      if (!text) {
        continue;
      }
      const styleInfo = this.analyzeStyleForHeader(paragraphXml);
      const element = { text, type: styleInfo.isHeader ? "header" : "paragraph" };
      if (styleInfo.level !== void 0) {
        element.level = styleInfo.level;
      }
      if (withFormatting) {
        const formattingInfo = this.extractFormattingInfo(paragraphXml);
        if (Object.keys(formattingInfo).length > 0) {
          element.formatting = formattingInfo;
        }
      }
      elements.push(element);
    }
    return elements;
  }

  /**
   * Extract formatting information from paragraph XML.
   * A flag is set when any run in the paragraph enables the property; explicit
   * off values such as `w:val="0"` or `w:val="none"` do not count.
   * @returns Object with any of `bold`, `italic`, `underline`, `strike` set to true
   */
  extractFormattingInfo(paragraphXml) {
    const formatting = {};
    for (const [name, pattern] of Object.entries(TOGGLE_PATTERNS)) {
      if (hasEnabledProperty(paragraphXml, pattern)) {
        formatting[name] = true;
      }
    }
    return formatting;
  }

  /**
   * Analyze paragraph style to determine if it's a header.
   * Heading styles use the number in the style name (default 1), "subtitle" maps
   * to level 2 and "title" to level 1.
   * @returns `{ isHeader, level? }`
   */
  analyzeStyleForHeader(paragraphXml) {
    const styles = Array.from(
      paragraphXml.matchAll(/<w:pStyle\b[^>]*?\bw:val=["']([^"']*)["']/g),
      (match) => match[1].toLowerCase()
    );
    const headingStyle = styles.find((style) => style.includes("heading"));
    if (headingStyle) {
      const levelMatch = headingStyle.match(/heading\s*(\d+)|h(\d+)/);
      const level = levelMatch ? Number.parseInt(levelMatch[1] || levelMatch[2], 10) : 1;
      return { isHeader: true, level };
    }
    // "subtitle" contains "title", so it has to be tested first.
    if (styles.some((style) => style.includes("subtitle"))) {
      return { isHeader: true, level: 2 };
    }
    if (styles.some((style) => style.includes("title"))) {
      return { isHeader: true, level: 1 };
    }
    return { isHeader: false };
  }

  /**
   * Extract table content using regex.
   * @returns Array of `{ rows: [{ cells: string[] }] }`; empty rows and tables are skipped
   */
  extractTablesFromXml(xmlContent) {
    try {
      const tables = [];
      for (const [, tableXml] of xmlContent.matchAll(elementPattern("w:tbl"))) {
        if (!tableXml) continue;
        const rows = [];
        for (const [, rowXml] of tableXml.matchAll(elementPattern("w:tr"))) {
          if (!rowXml) continue;
          const cells = [];
          for (const [, cellXml] of rowXml.matchAll(elementPattern("w:tc"))) {
            if (cellXml) {
              cells.push(this.extractTextFromXml(cellXml));
            }
          }
          if (cells.length > 0) {
            rows.push({ cells });
          }
        }
        if (rows.length > 0) {
          tables.push({ rows });
        }
      }
      return tables;
    } catch (error) {
      throw wrapError("Failed to extract tables", error);
    }
  }

  /**
   * Extract header content from XML.
   * @returns `{ text, hasPageNumber, watermark? }`, or null when the part has
   *          no text, no page number field and no watermark
   */
  extractHeaderFromXml(xmlContent) {
    try {
      const text = this.extractTextFromXml(xmlContent);
      const hasPageNumber = hasPageNumberField(xmlContent);
      const watermarkMatch = xmlContent.match(/string="([^"]*)"[^>]*>.*?<v:textpath/) || xmlContent.match(/string="([^"]*)".*?fitshape="t"/);
      const watermark = watermarkMatch ? decodeXmlEntities(watermarkMatch[1]) : void 0;
      if (!text && !hasPageNumber && !watermark) {
        return null;
      }
      const result = { text, hasPageNumber };
      if (watermark) {
        result.watermark = watermark;
      }
      return result;
    } catch (error) {
      throw wrapError("Failed to extract header", error);
    }
  }

  /**
   * Extract footer content from XML.
   * @returns `{ text, hasPageNumber }`, or null when the part has neither
   */
  extractFooterFromXml(xmlContent) {
    try {
      const text = this.extractTextFromXml(xmlContent);
      const hasPageNumber = hasPageNumberField(xmlContent);
      return text || hasPageNumber ? { text, hasPageNumber } : null;
    } catch (error) {
      throw wrapError("Failed to extract footer", error);
    }
  }

  /**
   * Extract footnotes from XML.
   * @returns Array of `{ id, text }` for every footnote that has an id and non-empty text
   */
  extractFootnotesFromXml(xmlContent) {
    try {
      const footnotes = [];
      const footnoteRegex = /<w:footnote\b[^>]*\bw:id="([^"]*)"[^>]*(?<!\/)>(.*?)<\/w:footnote>/gs;
      for (const [, id, footnoteXml] of xmlContent.matchAll(footnoteRegex)) {
        if (!id || !footnoteXml) continue;
        const text = this.extractTextFromXml(footnoteXml);
        if (text) {
          footnotes.push({ id, text });
        }
      }
      return footnotes;
    } catch (error) {
      throw wrapError("Failed to extract footnotes", error);
    }
  }
};

var ZipAdapter = class {
  static {
    __name(this, "ZipAdapter");
  }

  // Loaded archives keyed by source buffer, so one document is unzipped once
  // even though several extract* calls read from it.
  #archives = new WeakMap();

  /** Loads (or reuses) the JSZip archive for a buffer. */
  #open(buffer) {
    const cacheable = typeof buffer === "object" && buffer !== null;
    const cached = cacheable ? this.#archives.get(buffer) : void 0;
    if (cached) {
      return cached;
    }
    const pending = new JSZip__default.default().loadAsync(buffer);
    if (cacheable) {
      this.#archives.set(buffer, pending);
      // Do not keep a failed load around; a later call should retry.
      pending.catch(() => this.#archives.delete(buffer));
    }
    return pending;
  }

  /**
   * Reads a single entry from the archive.
   * @param buffer - ZIP archive content
   * @param filename - Entry path inside the archive
   * @returns Buffer with the entry content, or null when the entry does not exist
   * @throws DocxParseError when the archive cannot be read
   */
  async extractFile(buffer, filename) {
    try {
      const zipData = await this.#open(buffer);
      const file = zipData.file(filename);
      if (!file) {
        return null;
      }
      return Buffer.from(await file.async("uint8array"));
    } catch (error) {
      throw wrapError(`Failed to extract file ${filename}`, error);
    }
  }

  /**
   * Reads every non-directory entry whose path matches the pattern.
   * @param buffer - ZIP archive content
   * @param pattern - RegExp tested against each entry path
   * @returns Map of entry path to Buffer
   * @throws DocxParseError when the archive cannot be read
   */
  async extractFiles(buffer, pattern) {
    try {
      const zipData = await this.#open(buffer);
      const files = /* @__PURE__ */ new Map();
      for (const [filename, file] of Object.entries(zipData.files)) {
        if (!file || file.dir) continue;
        // Reset state so a /g or /y pattern is tested from the start for each name.
        pattern.lastIndex = 0;
        if (!pattern.test(filename)) continue;
        files.set(filename, Buffer.from(await file.async("uint8array")));
      }
      return files;
    } catch (error) {
      throw wrapError(`Failed to extract files with pattern ${pattern}`, error);
    }
  }
};

// src/infrastructure/repositories/docx-repository.ts

/** File extension to reported image format. */
var IMAGE_FORMATS = new Map([
  ["png", "png"],
  ["jpg", "jpg"],
  ["jpeg", "jpg"],
  ["gif", "gif"],
  ["svg", "svg"],
  ["wmf", "wmf"],
  ["emf", "emf"]
]);

var DocxRepository = class {
  static {
    __name(this, "DocxRepository");
  }
  zipAdapter;
  xmlAdapter;
  constructor() {
    this.zipAdapter = new ZipAdapter();
    this.xmlAdapter = new XmlAdapter();
  }

  /**
   * Parses a DOCX document and yields its elements in this order: metadata,
   * body content (and tables), headers, footers, footnotes, images.
   * @param source - Buffer, Uint8Array, ArrayBuffer, Web ReadableStream or Node.js Readable
   * @param options - Parse options; missing values fall back to the defaults below
   * @throws DocxParseError when the document cannot be parsed
   */
  async *parse(source, options = {}) {
    try {
      const buffer = await this.resolveBuffer(source);
      const given = options ?? {};
      const opts = {
        includeMetadata: given.includeMetadata ?? true,
        includeImages: given.includeImages ?? true,
        includeTables: given.includeTables ?? true,
        includeHeaders: given.includeHeaders ?? false,
        includeFooters: given.includeFooters ?? false,
        imageFormat: given.imageFormat ?? "buffer",
        maxImageSize: given.maxImageSize ?? 10 * 1024 * 1024,
        // 10MB
        preserveFormatting: given.preserveFormatting ?? true,
        normalizeWhitespace: given.normalizeWhitespace ?? true,
        chunkSize: given.chunkSize ?? 64 * 1024,
        // 64KB
        concurrent: given.concurrent ?? false
      };
      let elementId = 0;
      const getNextId = /* @__PURE__ */ __name(() => `element_${++elementId}`, "getNextId");
      if (opts.includeMetadata) {
        yield* this.extractMetadata(buffer, getNextId);
      }
      yield* this.extractContent(buffer, opts, getNextId);
      if (opts.includeHeaders) {
        yield* this.extractHeaders(buffer, getNextId);
      }
      if (opts.includeFooters) {
        yield* this.extractFooters(buffer, getNextId);
      }
      yield* this.extractFootnotes(buffer, getNextId);
      if (opts.includeImages) {
        yield* this.extractImages(buffer, opts, getNextId);
      }
    } catch (error) {
      throw wrapError("Failed to parse DOCX document", error);
    }
  }

  /** Turns any supported source into a Buffer; streams are read to the end. */
  async resolveBuffer(source) {
    if (Buffer.isBuffer(source)) {
      return source;
    }
    if (source instanceof Uint8Array) {
      return Buffer.from(source.buffer, source.byteOffset, source.byteLength);
    }
    if (source instanceof ArrayBuffer) {
      return Buffer.from(source);
    }
    return StreamAdapter.toBuffer(source);
  }

  /**
   * Yields exactly one "metadata" element built from docProps/core.xml.
   * Placeholder values are used when the part is missing, and empty content
   * when it cannot be read.
   */
  async *extractMetadata(buffer, getNextId) {
    let content;
    try {
      const corePropsBuffer = await this.zipAdapter.extractFile(buffer, "docProps/core.xml");
      if (corePropsBuffer) {
        const corePropsXml = corePropsBuffer.toString("utf-8");
        content = {
          title: this.extractMetadataValue(corePropsXml, "dc:title") || void 0,
          author: this.extractMetadataValue(corePropsXml, "dc:creator") || void 0,
          subject: this.extractMetadataValue(corePropsXml, "dc:subject") || void 0,
          created: this.extractMetadataDate(corePropsXml, "dcterms:created") || void 0,
          modified: this.extractMetadataDate(corePropsXml, "dcterms:modified") || void 0
        };
      } else {
        content = { title: "Unknown Document", author: "Unknown" };
      }
    } catch (error) {
      // Best effort, like the other optional parts: report and continue with empty metadata.
      console.warn("Failed to extract metadata:", error);
      content = {};
    }
    yield {
      type: "metadata",
      id: getNextId(),
      position: { page: 0, section: 0, order: 0 },
      content
    };
  }

  /**
   * Yields "header" and "paragraph" elements from word/document.xml, followed
   * by tables when `options.includeTables` is set. Paragraphs with strike
   * formatting are flagged as checked checkboxes.
   * @throws DocxParseError when the main document part is missing or unreadable
   */
  async *extractContent(buffer, options, getNextId) {
    try {
      const documentBuffer = await this.zipAdapter.extractFile(buffer, "word/document.xml");
      if (!documentBuffer) {
        throw new DocxParseError("Main document XML not found");
      }
      const documentXml = documentBuffer.toString("utf-8");
      const elements = this.xmlAdapter.extractParagraphsWithFormattingFromXml(documentXml);
      let order = 1;
      for (const element of elements) {
        const content = options.normalizeWhitespace ? element.text.replace(/\s+/g, " ").trim() : element.text;
        const formatting = { fontFamily: "Calibri", fontSize: 12, ...element.formatting };
        if (element.type === "header") {
          yield {
            type: "header",
            id: getNextId(),
            position: { page: 1, section: 1, order: order++ },
            content,
            level: element.level || 1,
            formatting
          };
          continue;
        }
        const documentElement = {
          type: "paragraph",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content,
          formatting
        };
        if (element.formatting?.strike) {
          documentElement.checkbox = { checked: true };
        }
        yield documentElement;
      }
      if (options.includeTables) {
        yield* this.extractTables(documentXml, options, getNextId, order);
      }
    } catch (error) {
      throw wrapError("Failed to extract content", error);
    }
  }

  /**
   * Yields one "table" element per table in the document XML, numbering them
   * from `startOrder`. Failures are logged and do not abort parsing.
   */
  async *extractTables(documentXml, options, getNextId, startOrder) {
    try {
      const tables = this.xmlAdapter.extractTablesFromXml(documentXml);
      let order = startOrder;
      for (const table of tables) {
        const tableRows = table.rows.map((row) => ({
          cells: row.cells.map((cellText) => ({
            content: options.normalizeWhitespace ? cellText.replace(/\s+/g, " ").trim() : cellText
          })),
          isHeader: false
        }));
        yield {
          type: "table",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content: tableRows
        };
      }
    } catch (error) {
      console.warn("Failed to extract tables:", error);
    }
  }

  /**
   * Yields one "image" element per file under word/media/ that has a known
   * extension and does not exceed `options.maxImageSize`. Failures are logged
   * and do not abort parsing.
   */
  async *extractImages(buffer, options, getNextId) {
    try {
      const mediaFiles = await this.zipAdapter.extractFiles(buffer, /^word\/media\//);
      let order = 1e3;
      for (const [filename, imageBuffer] of mediaFiles) {
        if (imageBuffer.length > options.maxImageSize) {
          continue;
        }
        const format = this.getImageFormat(filename);
        if (!format) continue;
        const filenameOnly = filename.split("/").pop() || "unknown.img";
        yield {
          type: "image",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content: imageBuffer,
          metadata: {
            filename: filenameOnly,
            format,
            width: 0,
            // TODO: Extract actual dimensions
            height: 0
          },
          positioning: { inline: true }
        };
      }
    } catch (error) {
      console.warn("Failed to extract images:", error);
    }
  }

  /**
   * Returns the trimmed, entity-decoded text of the first `<tagName>` element,
   * or null when it is missing or empty.
   */
  extractMetadataValue(xml, tagName) {
    const regex = new RegExp(`<${tagName}[^>]*>([^<]*)</${tagName}>`, "i");
    const raw = xml.match(regex)?.[1];
    return (raw ? decodeXmlEntities(raw).trim() : "") || null;
  }

  /** Returns the element content as a Date, or null when it is missing or not a parseable date. */
  extractMetadataDate(xml, tagName) {
    const dateStr = this.extractMetadataValue(xml, tagName);
    if (!dateStr) {
      return null;
    }
    const date = new Date(dateStr);
    return Number.isNaN(date.getTime()) ? null : date;
  }

  /** Maps a file name to its image format, or null for unknown extensions. */
  getImageFormat(filename) {
    const ext = filename.toLowerCase().split(".").pop();
    return IMAGE_FORMATS.get(ext) ?? null;
  }

  /**
   * Yields one "header" element per word/header*.xml part that has content.
   * Failures are logged and do not abort parsing.
   */
  async *extractHeaders(buffer, getNextId) {
    try {
      const headerFiles = await this.zipAdapter.extractFiles(buffer, /^word\/header\d*\.xml$/);
      let order = 2e3;
      for (const [, headerBuffer] of headerFiles) {
        const headerInfo = this.xmlAdapter.extractHeaderFromXml(headerBuffer.toString("utf-8"));
        if (!headerInfo) continue;
        const element = {
          type: "header",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content: headerInfo.text,
          level: 1
        };
        if (headerInfo.hasPageNumber) {
          element.hasPageNumber = true;
        }
        if (headerInfo.watermark) {
          element.watermark = headerInfo.watermark;
        }
        yield element;
      }
    } catch (error) {
      console.warn("Failed to extract headers:", error);
    }
  }

  /**
   * Yields one "footer" element per word/footer*.xml part that has content.
   * Failures are logged and do not abort parsing.
   */
  async *extractFooters(buffer, getNextId) {
    try {
      const footerFiles = await this.zipAdapter.extractFiles(buffer, /^word\/footer\d*\.xml$/);
      let order = 3e3;
      for (const [, footerBuffer] of footerFiles) {
        const footerInfo = this.xmlAdapter.extractFooterFromXml(footerBuffer.toString("utf-8"));
        if (!footerInfo) continue;
        const element = {
          type: "footer",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content: footerInfo.text
        };
        if (footerInfo.hasPageNumber) {
          element.hasPageNumber = true;
        }
        yield element;
      }
    } catch (error) {
      console.warn("Failed to extract footers:", error);
    }
  }

  /**
   * Yields footnotes from word/footnotes.xml as "paragraph" elements carrying
   * `footnoteId` and `isFootnote`. Failures are logged and do not abort parsing.
   */
  async *extractFootnotes(buffer, getNextId) {
    try {
      const footnotesBuffer = await this.zipAdapter.extractFile(buffer, "word/footnotes.xml");
      if (!footnotesBuffer) {
        return;
      }
      const footnotes = this.xmlAdapter.extractFootnotesFromXml(footnotesBuffer.toString("utf-8"));
      let order = 4e3;
      for (const footnote of footnotes) {
        const element = {
          type: "paragraph",
          id: getNextId(),
          position: { page: 1, section: 1, order: order++ },
          content: footnote.text,
          formatting: { fontFamily: "Calibri", fontSize: 10 }
        };
        element.footnoteId = footnote.id;
        element.isFootnote = true;
        yield element;
      }
    } catch (error) {
      console.warn("Failed to extract footnotes:", error);
    }
  }
};

// src/application/use-cases/parse-document.use-case.ts

var ParseDocumentUseCaseImpl = class {
  static {
    __name(this, "ParseDocumentUseCaseImpl");
  }
  docxRepository;

  /** @param docxRepository - Optional repository; a DocxRepository is created when omitted */
  constructor(docxRepository) {
    this.docxRepository = docxRepository || new DocxRepository();
  }

  /**
   * Streams the elements of a document.
   * @throws DocxParseError for a missing source or any parsing failure
   */
  async *execute(source, options = {}) {
    try {
      if (!source) {
        throw new DocxParseError("Source cannot be null or undefined");
      }
      yield* this.docxRepository.parse(source, options);
    } catch (error) {
      if (error instanceof DocxParseError) {
        throw error;
      }
      throw wrapError("Failed to parse document", error);
    }
  }
};

// src/application/use-cases/extract-content.use-case.ts

var ExtractContentUseCaseImpl = class {
  static {
    __name(this, "ExtractContentUseCaseImpl");
  }
  parseDocumentUseCase;
  constructor() {
    this.parseDocumentUseCase = new ParseDocumentUseCaseImpl();
  }

  /**
   * Collects paragraph, header and table text, joined with newlines.
   * Table rows are one line each with cells separated by a space.
   * @throws DocxParseError when parsing fails
   */
  async extractText(source, options = {}) {
    try {
      const textParts = [];
      const parseOptions = {
        includeMetadata: false,
        includeImages: false,
        includeTables: true,
        preserveFormatting: options?.preserveFormatting ?? false
      };
      for await (const element of this.parseDocumentUseCase.execute(source, parseOptions)) {
        if (element.type === "paragraph" || element.type === "header") {
          textParts.push(element.content);
        } else if (element.type === "table") {
          const tableText = element.content.map((row) => row.cells.map((cell) => cell.content).join(" ")).join("\n");
          textParts.push(tableText);
        }
      }
      return textParts.join("\n");
    } catch (error) {
      throw wrapError("Failed to extract text", error);
    }
  }

  /**
   * Yields only the "image" elements of a document.
   * @throws DocxParseError when parsing fails
   */
  async *extractImages(source) {
    try {
      const parseOptions = {
        includeMetadata: false,
        includeImages: true,
        includeTables: false,
        includeHeaders: false,
        includeFooters: false
      };
      for await (const element of this.parseDocumentUseCase.execute(source, parseOptions)) {
        if (element.type === "image") {
          yield element;
        }
      }
    } catch (error) {
      throw wrapError("Failed to extract images", error);
    }
  }

  /**
   * Returns the content of the first "metadata" element, or an empty object.
   * @throws DocxParseError when parsing fails
   */
  async extractMetadata(source) {
    try {
      const parseOptions = {
        includeMetadata: true,
        includeImages: false,
        includeTables: false,
        includeHeaders: false,
        includeFooters: false
      };
      for await (const element of this.parseDocumentUseCase.execute(source, parseOptions)) {
        if (element.type === "metadata") {
          return element.content;
        }
      }
      return {};
    } catch (error) {
      throw wrapError("Failed to extract metadata", error);
    }
  }
};

// src/application/use-cases/validate-document.use-case.ts

var ValidateDocumentUseCaseImpl = class {
  static {
    __name(this, "ValidateDocumentUseCaseImpl");
  }

  /**
   * Runs cheap structural checks on a source. Byte checks (empty, larger than
   * 100MB, ZIP signature) apply to Buffer / Uint8Array sources only.
   * @returns `{ isValid, errors }`; isValid is false when any entry has severity "error"
   */
  async validate(source) {
    try {
      const errors = [];
      if (!source) {
        errors.push({
          code: "INVALID_SOURCE",
          message: "Source cannot be null or undefined",
          severity: "error"
        });
      }
      if (source instanceof Uint8Array) {
        if (source.length === 0) {
          errors.push({
            code: "EMPTY_BUFFER",
            message: "Buffer is empty",
            severity: "error"
          });
        }
        if (source.length > 100 * 1024 * 1024) {
          errors.push({
            code: "LARGE_FILE",
            message: "File size exceeds 100MB limit",
            severity: "warning"
          });
        }
        if (!this.hasValidZipSignature(source)) {
          errors.push({
            code: "INVALID_ZIP_SIGNATURE",
            message: "File does not appear to be a valid ZIP archive",
            severity: "error"
          });
        }
      }
      return {
        isValid: !errors.some((entry) => entry.severity === "error"),
        errors
      };
    } catch (error) {
      throw wrapError("Validation failed", error);
    }
  }

  /** Checks for the local file header signature "PK\x03\x04" at the start of the data. */
  hasValidZipSignature(buffer) {
    if (buffer.length < 4) return false;
    return buffer[0] === 80 && // P
    buffer[1] === 75 && // K
    buffer[2] === 3 && buffer[3] === 4;
  }
};

// src/application/use-cases/parse-document-to-array.use-case.ts

var ParseDocumentToArrayUseCaseImpl = class {
  static {
    __name(this, "ParseDocumentToArrayUseCaseImpl");
  }
  parseDocumentUseCase;
  constructor() {
    this.parseDocumentUseCase = new ParseDocumentUseCaseImpl();
  }

  /**
   * Parses the whole document and returns its elements as an array.
   * @throws DocxParseError when parsing fails
   */
  async execute(source, options) {
    try {
      const elements = [];
      for await (const element of this.parseDocumentUseCase.execute(source, options)) {
        elements.push(element);
      }
      return elements;
    } catch (error) {
      throw wrapError("Failed to parse document to array", error);
    }
  }
};

// src/interfaces/docx-parser.ts

/**
 * Maps any accepted public source to what the use cases consume: byte sources
 * and Web streams pass through, Node.js streams are wrapped as Web streams.
 * @throws Error "Unsupported source type" for anything else (including strings and null)
 */
function toParsableSource(source) {
  if (source === null || typeof source !== "object") {
    throw new Error("Unsupported source type");
  }
  if (source instanceof Uint8Array || source instanceof ArrayBuffer) {
    return source;
  }
  if ("readable" in source && "path" in source) {
    return StreamAdapter.toWebStream(source);
  }
  if (typeof source.getReader === "function") {
    return source;
  }
  if (typeof source.read === "function" && typeof source.on === "function") {
    return StreamAdapter.nodeToWebStream(source);
  }
  throw new Error("Unsupported source type");
}
__name(toParsableSource, "toParsableSource");

/** Streams the elements of a DOCX held in a Buffer. */
async function* parseDocx(buffer, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  yield* useCase.execute(buffer, options);
}
__name(parseDocx, "parseDocx");

/** Streams the elements of a DOCX read from a Node.js fs.ReadStream. */
async function* parseDocxStream(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  yield* useCase.execute(StreamAdapter.toWebStream(stream), options);
}
__name(parseDocxStream, "parseDocxStream");

/** Streams the elements of a DOCX read from a Node.js Readable (e.g. an HTTP response). */
async function* parseDocxHttpStream(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  yield* useCase.execute(StreamAdapter.nodeToWebStream(stream), options);
}
__name(parseDocxHttpStream, "parseDocxHttpStream");

/** Streams the elements of a DOCX read from a Web ReadableStream. */
async function* parseDocxWebStream(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  yield* useCase.execute(stream, options);
}
__name(parseDocxWebStream, "parseDocxWebStream");

/** Streams the elements of the DOCX file at `filePath`. */
async function* parseDocxFile(filePath, options) {
  const stream = fs.createReadStream(filePath);
  yield* parseDocxStream(stream, options);
}
__name(parseDocxFile, "parseDocxFile");

/** Parses a DOCX from any supported source and resolves to the array of its elements. */
async function parseDocxToArray(source, options) {
  return new ParseDocumentToArrayUseCaseImpl().execute(toParsableSource(source), options);
}
__name(parseDocxToArray, "parseDocxToArray");

/** Resolves to the plain text of a DOCX from any supported source. */
async function extractText(source, options) {
  return new ExtractContentUseCaseImpl().extractText(toParsableSource(source), options);
}
__name(extractText, "extractText");

/** Yields the image elements of a DOCX from any supported source. */
async function* extractImages(source) {
  yield* new ExtractContentUseCaseImpl().extractImages(toParsableSource(source));
}
__name(extractImages, "extractImages");

/** Resolves to the metadata content of a DOCX from any supported source. */
async function getMetadata(source) {
  return new ExtractContentUseCaseImpl().extractMetadata(toParsableSource(source));
}
__name(getMetadata, "getMetadata");

/** Streams the elements of a DOCX read from a Node.js Readable. */
async function* parseDocxReadable(stream, options) {
  const useCase = new ParseDocumentUseCaseImpl();
  yield* useCase.execute(StreamAdapter.nodeToWebStream(stream), options);
}
__name(parseDocxReadable, "parseDocxReadable");

// src/index.ts
var VERSION = "1.0.0";

exports.DocxParseError = DocxParseError;
exports.StreamAdapter = StreamAdapter;
exports.VERSION = VERSION;
exports.ValidateDocumentUseCaseImpl = ValidateDocumentUseCaseImpl;
exports.extractImages = extractImages;
exports.extractText = extractText;
exports.getMetadata = getMetadata;
exports.parseDocx = parseDocx;
exports.parseDocxFile = parseDocxFile;
exports.parseDocxHttpStream = parseDocxHttpStream;
exports.parseDocxReadable = parseDocxReadable;
exports.parseDocxStream = parseDocxStream;
exports.parseDocxToArray = parseDocxToArray;
exports.parseDocxWebStream = parseDocxWebStream;
//# sourceMappingURL=index.cjs.map