@mazix/n8n-nodes-converter-documents
Version:
n8n node to convert various document formats (DOCX, DOC, XML, YML, XLSX, CSV, PDF, TXT, PPT, PPTX, HTML, JSON, ODT, ODP, ODS) to JSON or text format
299 lines • 12.6 kB
JavaScript
;
// TypeScript interop helper: re-exports property `key` of `source` on `target`
// (optionally under the name `alias`). Reuses a pre-existing helper from `this`
// when one is present.
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy engines: plain value copy, no live binding.
        return function (target, source, key, alias) {
            if (alias === undefined) alias = key;
            target[alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            // An accessor is reused as-is only when it comes from an ES module.
            needsGetter = !source.__esModule;
        }
        else {
            // A data property that can still change needs a live getter.
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; },
            };
        }
        Object.defineProperty(target, alias, descriptor);
    };
})();
// TypeScript interop helper: stores `value` as the `default` member of the
// namespace object `target`. Reuses a pre-existing helper from `this` when present.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { value: value, enumerable: true });
        };
    }
    // Legacy engines: ordinary assignment.
    return function (target, value) {
        target["default"] = value;
    };
})();
// TypeScript interop helper: turns a CommonJS module into a namespace-like
// object. ES modules (flagged with `__esModule`) are returned unchanged;
// otherwise every own key except "default" is re-bound on a fresh object and
// the module itself becomes its `default`.
var __importStar = (this && this.__importStar) || (function () {
    // The key lister is chosen on first use and cached afterwards.
    var keyLister = null;
    function listOwnKeys(target) {
        if (keyLister === null) {
            keyLister = Object.getOwnPropertyNames || function (obj) {
                var names = [];
                for (var name in obj) {
                    if (Object.prototype.hasOwnProperty.call(obj, name)) {
                        names[names.length] = name;
                    }
                }
                return names;
            };
        }
        return keyLister(target);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = listOwnKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                if (names[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, names[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// TypeScript interop helper: ES modules (flagged with `__esModule`) are
// returned unchanged; anything else is wrapped as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.strategies = void 0;
const fast_xml_parser_1 = require("fast-xml-parser");
const mammoth_1 = __importDefault(require("mammoth"));
const node_1 = __importDefault(require("read-excel-file/node"));
const node_html_parser_1 = require("node-html-parser");
const chardet_1 = __importDefault(require("chardet"));
const papaparse_1 = __importDefault(require("papaparse"));
const readline = __importStar(require("readline"));
const stream_1 = require("stream");
const node_html_markdown_1 = require("node-html-markdown");
const helpers_1 = require("../helpers");
const errors_1 = require("../errors");
const columns_1 = require("../utils/columns");
const flatten_1 = require("../utils/flatten");
const yml_1 = require("../processors/yml");
// Constants
// Maximum number of CSV rows kept by streamCsvStrategy.
const CSV_STREAM_ROW_LIMIT = 100000;
// Byte size above which the txt strategy switches to streamTxtStrategy.
const TXT_STREAM_SIZE_LIMIT = 10 * 1024 * 1024; // 10 MB
// Maximum number of characters kept by streamTxtStrategy.
const TXT_STREAM_CHAR_LIMIT = 1000000; // 1 million characters
// --- Helper functions ---
/**
 * Reads a text buffer line by line and returns at most `charLimit` characters.
 *
 * The buffer is decoded as UTF-8. Every line is re-joined with "\n", so the
 * result always ends with a newline unless it was truncated.
 *
 * The limit is checked after each append, so a final (or single) line that
 * pushes the text over the limit is truncated and reported as well.
 *
 * @param {Buffer} buf - Raw file contents.
 * @param {number} [charLimit=TXT_STREAM_CHAR_LIMIT] - Maximum number of
 *   characters (UTF-16 code units) to keep.
 * @returns {Promise<{text: string, warning: (string|undefined)}>} The text and,
 *   when it was cut, a warning message.
 */
async function streamTxtStrategy(buf, charLimit = TXT_STREAM_CHAR_LIMIT) {
    return new Promise((resolve, reject) => {
        const rl = readline.createInterface({
            input: stream_1.Readable.from(buf.toString("utf8")),
            crlfDelay: Infinity,
        });
        let text = "";
        let truncated = false;
        rl.on("line", (line) => {
            if (truncated) {
                // Limit already exceeded: ignore the rest of the input.
                return;
            }
            text += line + "\n";
            if (text.length > charLimit) {
                truncated = true;
            }
        });
        rl.on("close", () => {
            resolve({
                text: truncated ? text.slice(0, charLimit) : text,
                warning: truncated ? `Текст обрезан до ${charLimit} символов` : undefined,
            });
        });
        rl.on("error", (err) => reject(err));
    });
}
/**
 * Parses CSV text with Papa Parse row by row, keeping at most `rowLimit` rows.
 *
 * The first line is used as the header, empty lines are skipped. Parsing is
 * aborted as soon as a row beyond the limit is seen, and the truncation
 * warning is only produced when at least one row was actually dropped.
 *
 * @param {string} data - Decoded CSV text.
 * @param {number} [rowLimit=CSV_STREAM_ROW_LIMIT] - Maximum number of rows to keep.
 * @returns {Promise<{sheets: {Sheet1: object[]}, warning: (string|undefined)}>}
 *   Parsed rows under the `Sheet1` key and an optional truncation warning.
 */
async function streamCsvStrategy(data, rowLimit = CSV_STREAM_ROW_LIMIT) {
    return new Promise((resolve, reject) => {
        const rows = [];
        let truncated = false;
        papaparse_1.default.parse(data, {
            header: true,
            skipEmptyLines: true,
            step: (result, parser) => {
                if (truncated) {
                    return;
                }
                if (rows.length < rowLimit) {
                    rows.push(result.data);
                    return;
                }
                // A row beyond the limit exists: remember that and stop parsing.
                truncated = true;
                parser.abort();
            },
            complete: () => {
                const warning = truncated
                    ? `CSV truncated to ${rowLimit} rows`
                    : undefined;
                resolve({
                    sheets: { Sheet1: rows },
                    warning,
                });
            },
            error: (err) => reject(err),
        });
    });
}
/**
 * Extracts whitespace-normalised plain text from an HTML buffer.
 *
 * The buffer is decoded as UTF-8 and parsed with node-html-parser. Text is
 * taken from the <body> element; when the markup has no explicit <body>
 * (an HTML fragment, or a document relying on the optional body tag), the
 * whole parsed tree is used instead of returning an empty string.
 *
 * @param {Buffer} buf - Raw HTML contents.
 * @returns {Promise<{text: string}>} Text with runs of whitespace collapsed to
 *   single spaces and trimmed.
 * @throws {ProcessingError} When parsing or text extraction fails.
 */
async function processHtml(buf) {
    try {
        const root = (0, node_html_parser_1.parse)(buf.toString("utf8"));
        const container = root.querySelector("body") ?? root;
        const cleanText = container.textContent.replace(/\s+/g, " ").trim();
        return { text: cleanText };
    }
    catch (error) {
        throw new errors_1.ProcessingError(`HTML processing error: ${error instanceof Error ? error.message : String(error)}`);
    }
}
/**
 * Shared strategy factory for legacy CFB formats (DOC, PPT).
 *
 * The returned strategy rejects buffers that start with the 8-byte CFB
 * signature with an UnsupportedFormatError, and otherwise extracts text via
 * extractViaOfficeParser. Extraction errors whose message mentions
 * "cfb files" are also reported as UnsupportedFormatError; any other failure
 * becomes a ProcessingError.
 *
 * @param {string} format - Legacy format name used in messages (e.g. 'doc').
 * @param {string} modernFormat - Format the user is asked to convert to (e.g. 'docx').
 * @returns {function(Buffer): Promise<{text: string}>} The strategy function.
 */
function cfbLegacyStrategy(format, modernFormat) {
    // Built lazily so the message is only assembled when it is actually thrown.
    const unsupportedLegacyFile = () => new errors_1.UnsupportedFormatError(`Старые ${format.toUpperCase()} файлы не поддерживаются. ` +
        `Пожалуйста, сохраните файл в формате ${modernFormat.toUpperCase()} и попробуйте снова.`);
    return async (buf) => {
        try {
            const header = buf.slice(0, 8);
            const cfbMagic = Buffer.from([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1]);
            if (header.equals(cfbMagic)) {
                throw unsupportedLegacyFile();
            }
            const text = await (0, helpers_1.extractViaOfficeParser)(buf);
            return { text };
        }
        catch (error) {
            if (error instanceof errors_1.UnsupportedFormatError) {
                throw error;
            }
            const mentionsCfb = error instanceof Error && error.message.includes('cfb files');
            if (mentionsCfb) {
                throw unsupportedLegacyFile();
            }
            throw new errors_1.ProcessingError(`${format.toUpperCase()} processing error: ${error instanceof Error ? error.message : String(error)}`);
        }
    };
}
/**
 * Shared strategy factory for ODF formats (ODT, ODP, ODS).
 *
 * The returned strategy extracts text via extractViaOfficeParser.
 * UnsupportedFormatError and ProcessingError are passed through untouched;
 * every other failure is wrapped in a ProcessingError labelled with the format.
 *
 * @param {string} format - Format name used in error messages (e.g. 'odt').
 * @returns {function(Buffer): Promise<{text: string}>} The strategy function.
 */
function odfStrategy(format) {
    const isKnownError = (error) => error instanceof errors_1.UnsupportedFormatError || error instanceof errors_1.ProcessingError;
    return async (buf) => {
        try {
            const text = await (0, helpers_1.extractViaOfficeParser)(buf);
            return { text };
        }
        catch (error) {
            if (isKnownError(error)) {
                throw error;
            }
            const label = format.toUpperCase();
            const reason = error instanceof Error ? error.message : String(error);
            throw new errors_1.ProcessingError(`${label} processing error: ${reason}`);
        }
    };
}
// --- Стратегии ---
exports.strategies = {
doc: cfbLegacyStrategy('doc', 'docx'),
docx: async (buf, _ext, options) => {
const outputFormat = options?.outputFormat || 'text';
if (outputFormat === 'html' || outputFormat === 'markdown') {
try {
const result = await mammoth_1.default.convertToHtml({ buffer: buf });
if (result.value && result.value.trim().length > 0) {
if (outputFormat === 'markdown') {
return { text: node_html_markdown_1.NodeHtmlMarkdown.translate(result.value) };
}
return { text: result.value };
}
}
catch {
// Ошибка mammoth HTML - пробуем fallback
}
}
// Попытка 1: officeparser
try {
const text = await (0, helpers_1.extractViaOfficeParser)(buf);
if (text && text.trim().length > 0) {
return { text };
}
}
catch {
// Ошибка officeparser - пробуем дальше
}
// Попытка 2: mammoth (text)
try {
const result = await mammoth_1.default.extractRawText({ buffer: buf });
if (result.value && result.value.trim().length > 0) {
return { text: result.value };
}
}
catch {
// Ошибка mammoth
}
throw new errors_1.ProcessingError(`DOCX processing error: All parsers failed. ` +
`This may be a corrupted, password-protected, or non-standard DOCX file.`);
},
xml: async (buf) => {
const parser = new fast_xml_parser_1.XMLParser({ ignoreAttributes: false });
const parsed = parser.parse(buf.toString("utf8"));
return { text: JSON.stringify(parsed, null, 2) };
},
yml: async (buf) => {
try {
const xmlContent = buf.toString("utf8");
const parser = new fast_xml_parser_1.XMLParser({ ignoreAttributes: false });
const parsed = parser.parse(xmlContent);
if (parsed.yml_catalog && parsed.yml_catalog.shop) {
return (0, yml_1.processYandexMarketYml)(parsed);
}
return { text: JSON.stringify(parsed, null, 2) };
}
catch (error) {
throw new errors_1.ProcessingError(`YML processing error: ${error instanceof Error ? error.message : String(error)}`);
}
},
json: async (buf) => {
try {
const detected = chardet_1.default.detect(buf);
const encoding = (detected || 'utf-8');
const jsonString = buf.toString(encoding);
const parsed = JSON.parse(jsonString);
if (typeof parsed === 'object' && parsed !== null) {
const flattened = (0, flatten_1.flattenJsonObject)(parsed);
return {
text: JSON.stringify(flattened, null, 2),
warning: Object.keys(flattened).length > Object.keys(parsed).length ?
"Многоуровневая структура JSON была преобразована в плоский объект" : undefined
};
}
return { text: JSON.stringify(parsed, null, 2) };
}
catch (error) {
throw new errors_1.ProcessingError(`JSON parsing error: ${error instanceof Error ? error.message : String(error)}`);
}
},
odt: odfStrategy('odt'),
odp: odfStrategy('odp'),
ods: odfStrategy('ods'),
xlsx: async (buf) => {
const { readSheetNames } = await Promise.resolve().then(() => __importStar(require("read-excel-file/node")));
const sheetNames = await readSheetNames(buf);
const sheets = {};
for (const sheetName of sheetNames) {
const rows = await (0, node_1.default)(buf, { sheet: sheetName, dateFormat: 'YYYY-MM-DD' });
const jsonData = [];
for (const row of rows) {
const rowData = {};
row.forEach((cell, colIndex) => {
if (cell !== null && cell !== undefined) {
const columnLetter = (0, columns_1.numberToColumn)(colIndex + 1);
rowData[columnLetter] = cell instanceof Date ? cell.toISOString() : cell;
}
});
if (Object.keys(rowData).length > 0) {
jsonData.push(rowData);
}
}
sheets[sheetName] = (0, helpers_1.limitExcelSheet)(jsonData, 0);
}
return { sheets };
},
csv: async (buf) => {
const detected = chardet_1.default.detect(buf);
const encoding = (detected || 'utf-8');
const decoded = buf.toString(encoding);
return streamCsvStrategy(decoded);
},
pdf: async (buf) => {
try {
return { text: await (0, helpers_1.extractViaOfficeParser)(buf) };
}
catch (error) {
throw new errors_1.ProcessingError(`PDF processing error: ${error instanceof Error ? error.message : String(error)}`);
}
},
txt: async (buf) => {
if (buf.length > TXT_STREAM_SIZE_LIMIT) {
return streamTxtStrategy(buf);
}
const detected = chardet_1.default.detect(buf);
const encoding = (detected || 'utf-8');
return { text: buf.toString(encoding) };
},
ppt: cfbLegacyStrategy('ppt', 'pptx'),
pptx: async (buf) => ({
text: await (0, helpers_1.extractViaOfficeParser)(buf),
}),
html: async (buf) => processHtml(buf),
htm: async (buf) => processHtml(buf),
};
//# sourceMappingURL=index.js.map