UNPKG

@mazix/n8n-nodes-converter-documents

Version:

n8n node to convert various document formats (DOCX, DOC, XML, YML, XLSX, CSV, PDF, TXT, PPT, PPTX, HTML, JSON, ODT, ODP, ODS) to JSON or text format

299 lines 12.6 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.strategies = void 0; const fast_xml_parser_1 = require("fast-xml-parser"); const mammoth_1 = __importDefault(require("mammoth")); const node_1 = __importDefault(require("read-excel-file/node")); const node_html_parser_1 = require("node-html-parser"); const chardet_1 = __importDefault(require("chardet")); const papaparse_1 = __importDefault(require("papaparse")); const readline = __importStar(require("readline")); const stream_1 = require("stream"); const node_html_markdown_1 = require("node-html-markdown"); const helpers_1 = require("../helpers"); const errors_1 = require("../errors"); const columns_1 = require("../utils/columns"); const flatten_1 = require("../utils/flatten"); const yml_1 = require("../processors/yml"); // Константы const CSV_STREAM_ROW_LIMIT = 100000; const TXT_STREAM_SIZE_LIMIT = 10 * 1024 * 1024; // 10 MB const TXT_STREAM_CHAR_LIMIT = 1000000; // 1 млн символов // --- Вспомогательные функции --- async function streamTxtStrategy(buf) { return new Promise((resolve, reject) => { const rl = readline.createInterface({ input: stream_1.Readable.from(buf.toString("utf8")), crlfDelay: Infinity, }); let text = ""; let truncated = false; rl.on("line", (line) => { if (text.length < TXT_STREAM_CHAR_LIMIT) { text += line + "\n"; } else { truncated = true; } }); rl.on("close", () => { resolve({ text: truncated ? text.slice(0, TXT_STREAM_CHAR_LIMIT) : text, warning: truncated ? `Текст обрезан до ${TXT_STREAM_CHAR_LIMIT} символов` : undefined, }); }); rl.on("error", (err) => reject(err)); }); } async function streamCsvStrategy(data) { return new Promise((resolve, reject) => { const rows = []; let rowCount = 0; papaparse_1.default.parse(data, { header: true, skipEmptyLines: true, step: (result) => { if (rowCount < CSV_STREAM_ROW_LIMIT) { rows.push(result.data); rowCount++; } }, complete: () => { const warning = rowCount >= CSV_STREAM_ROW_LIMIT ? `CSV truncated to ${CSV_STREAM_ROW_LIMIT} rows` : undefined; resolve({ sheets: { Sheet1: rows }, warning, }); }, error: (err) => reject(err), }); }); } async function processHtml(buf) { try { const root = (0, node_html_parser_1.parse)(buf.toString("utf8")); const body = root.querySelector("body"); const cleanText = body ? body.textContent.replace(/\s+/g, " ").trim() : ""; return { text: cleanText }; } catch (error) { throw new errors_1.ProcessingError(`HTML processing error: ${error instanceof Error ? error.message : String(error)}`); } } /** * Общая стратегия для legacy CFB форматов (DOC, PPT) */ function cfbLegacyStrategy(format, modernFormat) { return async (buf) => { try { const signature = buf.slice(0, 8); const cfbSignature = Buffer.from([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1]); if (signature.equals(cfbSignature)) { throw new errors_1.UnsupportedFormatError(`Старые ${format.toUpperCase()} файлы не поддерживаются. ` + `Пожалуйста, сохраните файл в формате ${modernFormat.toUpperCase()} и попробуйте снова.`); } return { text: await (0, helpers_1.extractViaOfficeParser)(buf) }; } catch (error) { if (error instanceof errors_1.UnsupportedFormatError) { throw error; } if (error instanceof Error && error.message.includes('cfb files')) { throw new errors_1.UnsupportedFormatError(`Старые ${format.toUpperCase()} файлы не поддерживаются. ` + `Пожалуйста, сохраните файл в формате ${modernFormat.toUpperCase()} и попробуйте снова.`); } throw new errors_1.ProcessingError(`${format.toUpperCase()} processing error: ${error instanceof Error ? error.message : String(error)}`); } }; } /** * Общая стратегия для ODF форматов (ODT, ODP, ODS) */ function odfStrategy(format) { return async (buf) => { try { return { text: await (0, helpers_1.extractViaOfficeParser)(buf) }; } catch (error) { if (error instanceof errors_1.UnsupportedFormatError || error instanceof errors_1.ProcessingError) { throw error; } throw new errors_1.ProcessingError(`${format.toUpperCase()} processing error: ${error instanceof Error ? error.message : String(error)}`); } }; } // --- Стратегии --- exports.strategies = { doc: cfbLegacyStrategy('doc', 'docx'), docx: async (buf, _ext, options) => { const outputFormat = options?.outputFormat || 'text'; if (outputFormat === 'html' || outputFormat === 'markdown') { try { const result = await mammoth_1.default.convertToHtml({ buffer: buf }); if (result.value && result.value.trim().length > 0) { if (outputFormat === 'markdown') { return { text: node_html_markdown_1.NodeHtmlMarkdown.translate(result.value) }; } return { text: result.value }; } } catch { // Ошибка mammoth HTML - пробуем fallback } } // Попытка 1: officeparser try { const text = await (0, helpers_1.extractViaOfficeParser)(buf); if (text && text.trim().length > 0) { return { text }; } } catch { // Ошибка officeparser - пробуем дальше } // Попытка 2: mammoth (text) try { const result = await mammoth_1.default.extractRawText({ buffer: buf }); if (result.value && result.value.trim().length > 0) { return { text: result.value }; } } catch { // Ошибка mammoth } throw new errors_1.ProcessingError(`DOCX processing error: All parsers failed. ` + `This may be a corrupted, password-protected, or non-standard DOCX file.`); }, xml: async (buf) => { const parser = new fast_xml_parser_1.XMLParser({ ignoreAttributes: false }); const parsed = parser.parse(buf.toString("utf8")); return { text: JSON.stringify(parsed, null, 2) }; }, yml: async (buf) => { try { const xmlContent = buf.toString("utf8"); const parser = new fast_xml_parser_1.XMLParser({ ignoreAttributes: false }); const parsed = parser.parse(xmlContent); if (parsed.yml_catalog && parsed.yml_catalog.shop) { return (0, yml_1.processYandexMarketYml)(parsed); } return { text: JSON.stringify(parsed, null, 2) }; } catch (error) { throw new errors_1.ProcessingError(`YML processing error: ${error instanceof Error ? error.message : String(error)}`); } }, json: async (buf) => { try { const detected = chardet_1.default.detect(buf); const encoding = (detected || 'utf-8'); const jsonString = buf.toString(encoding); const parsed = JSON.parse(jsonString); if (typeof parsed === 'object' && parsed !== null) { const flattened = (0, flatten_1.flattenJsonObject)(parsed); return { text: JSON.stringify(flattened, null, 2), warning: Object.keys(flattened).length > Object.keys(parsed).length ? "Многоуровневая структура JSON была преобразована в плоский объект" : undefined }; } return { text: JSON.stringify(parsed, null, 2) }; } catch (error) { throw new errors_1.ProcessingError(`JSON parsing error: ${error instanceof Error ? error.message : String(error)}`); } }, odt: odfStrategy('odt'), odp: odfStrategy('odp'), ods: odfStrategy('ods'), xlsx: async (buf) => { const { readSheetNames } = await Promise.resolve().then(() => __importStar(require("read-excel-file/node"))); const sheetNames = await readSheetNames(buf); const sheets = {}; for (const sheetName of sheetNames) { const rows = await (0, node_1.default)(buf, { sheet: sheetName, dateFormat: 'YYYY-MM-DD' }); const jsonData = []; for (const row of rows) { const rowData = {}; row.forEach((cell, colIndex) => { if (cell !== null && cell !== undefined) { const columnLetter = (0, columns_1.numberToColumn)(colIndex + 1); rowData[columnLetter] = cell instanceof Date ? cell.toISOString() : cell; } }); if (Object.keys(rowData).length > 0) { jsonData.push(rowData); } } sheets[sheetName] = (0, helpers_1.limitExcelSheet)(jsonData, 0); } return { sheets }; }, csv: async (buf) => { const detected = chardet_1.default.detect(buf); const encoding = (detected || 'utf-8'); const decoded = buf.toString(encoding); return streamCsvStrategy(decoded); }, pdf: async (buf) => { try { return { text: await (0, helpers_1.extractViaOfficeParser)(buf) }; } catch (error) { throw new errors_1.ProcessingError(`PDF processing error: ${error instanceof Error ? error.message : String(error)}`); } }, txt: async (buf) => { if (buf.length > TXT_STREAM_SIZE_LIMIT) { return streamTxtStrategy(buf); } const detected = chardet_1.default.detect(buf); const encoding = (detected || 'utf-8'); return { text: buf.toString(encoding) }; }, ppt: cfbLegacyStrategy('ppt', 'pptx'), pptx: async (buf) => ({ text: await (0, helpers_1.extractViaOfficeParser)(buf), }), html: async (buf) => processHtml(buf), htm: async (buf) => processHtml(buf), }; //# sourceMappingURL=index.js.map