@mazix/n8n-nodes-converter-documents
Version:
n8n node to convert various document formats (DOCX, DOC, XML, YML, XLSX, CSV, PDF, TXT, PPT, PPTX, HTML, JSON, ODT, ODP, ODS) to JSON or text format
209 lines • 9.73 kB
JavaScript
;
/*
* Convert File to JSON v6
* ─────────────────────────────────────────────────────────
* Universal custom node for n8n.
* Supports: DOC, DOCX, XML, XLS, XLSX, CSV, PDF, TXT,
* PPT, PPTX, HTML / HTM, ODT, ODP, ODS, JSON.
* Output: { text: "..." } or { sheets: {...} } + metadata.
*/
/**
 * CommonJS/ES-module interop helper.
 *
 * Reuses an `__importDefault` already present on `this` when there is one;
 * otherwise returns the module untouched if it is flagged `__esModule`, and
 * wraps anything else as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    const isEsModule = mod && mod.__esModule;
    if (isEsModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.FileToJsonNode = void 0;
const path_1 = __importDefault(require("path"));
const file_type_1 = require("file-type");
const n8n_workflow_1 = require("n8n-workflow");
const errors_1 = require("./errors");
const utils_1 = require("./utils");
const strategies_1 = require("./strategies");
/**
 * Lower-case file extensions (without the leading dot) accepted by the node.
 * Checked against both the file-name extension and the sniffed content type.
 */
const SUPPORTED_FORMATS = [
    "doc",
    "docx",
    "xml",
    "yml",
    "xlsx",
    "csv",
    "pdf",
    "txt",
    "ppt",
    "pptx",
    "html",
    "htm",
    "odt",
    "odp",
    "ods",
    "json",
];
/**
 * Custom n8n node that converts the binary file attached to each input item
 * into JSON or text.
 *
 * The format is taken from the file-name extension. When the extension is
 * missing or not listed in SUPPORTED_FORMATS, the content is sniffed with
 * `file-type`. The conversion itself is delegated to the `strategies` table,
 * keyed by extension.
 */
class FileToJsonNode {
    constructor() {
        // Static node definition: labels, defaults and the parameter schema.
        this.description = {
            displayName: "Convert File to JSON",
            name: "convertFileToJson",
            icon: "file:icon.svg",
            group: ["transform"],
            version: 5,
            description: "DOCX / XML / YML / XLSX / CSV / PDF / TXT / PPTX / HTML → JSON|text",
            defaults: { name: "Convert File to JSON" },
            inputs: [n8n_workflow_1.NodeConnectionTypes.Main],
            outputs: [n8n_workflow_1.NodeConnectionTypes.Main],
            usableAsTool: true,
            properties: [
                {
                    displayName: "Binary Property",
                    name: "binaryPropertyName",
                    type: "string",
                    default: "data",
                    description: "Name of the binary property that contains the file",
                },
                {
                    displayName: "Max File Size (MB)",
                    name: "maxFileSize",
                    type: "number",
                    default: 50,
                    description: "Maximum file size in megabytes",
                    typeOptions: {
                        minValue: 1,
                        maxValue: 100
                    }
                },
                {
                    displayName: "Max Concurrency",
                    name: "maxConcurrency",
                    type: "number",
                    default: 4,
                    description: "Maximum number of files processed concurrently",
                    typeOptions: {
                        minValue: 1,
                        maxValue: 10
                    }
                },
                {
                    displayName: "Output Format (DOCX)",
                    name: "outputFormat",
                    type: "options",
                    options: [
                        {
                            name: "Plain Text",
                            value: "text",
                            description: "Extract text only (fastest, smallest output)",
                        },
                        {
                            name: "HTML",
                            value: "html",
                            description: "Convert to HTML (preserves tables, formatting, structure)",
                        },
                        {
                            name: "Markdown",
                            value: "markdown",
                            description: "Convert to Markdown with GFM tables (ideal for AI/LLM/RAG)",
                        },
                    ],
                    default: "text",
                    description: "Choose output format for DOCX files. Markdown and HTML preserve tables and formatting for AI/LLM processing.",
                },
            ],
        };
    }

    /**
     * Converts every input item and returns a single aggregated output item.
     *
     * Uses the following members of `this`: `getInputData()`,
     * `getNodeParameter()`, `helpers.getBinaryDataBuffer()` and, when
     * present, `logger`.
     *
     * @returns {Promise<Array<Array<{json: {files: object[], totalFiles: number, processedAt: string}}>>>}
     *   One item whose `files` array holds the per-file results (each with a
     *   `metadata` object attached).
     * @throws {FileTypeError} The item is not an object, has no binary data,
     *   lacks the configured binary property, or has no valid file name.
     * @throws {EmptyFileError} The buffer is empty, or the converter produced
     *   a `text` field that is empty / whitespace only.
     * @throws {FileTooLargeError} The buffer exceeds the "Max File Size" limit.
     * @throws {UnsupportedFormatError} Neither the extension nor the sniffed
     *   content type is supported, or no strategy exists for the format.
     * @throws {ProcessingError} The converter failed with a non-domain error
     *   or did not return an object.
     */
    async execute() {
        const items = this.getInputData();
        const maxFileSize = this.getNodeParameter('maxFileSize', 0, 50) * 1024 * 1024;
        const maxConcurrency = this.getNodeParameter('maxConcurrency', 0, 4);

        // Anything can be thrown in JavaScript, so never assume `.message` exists.
        const describeError = (err) => (err instanceof Error ? err.message : String(err));

        const processItem = async (item, i) => {
            const prop = this.getNodeParameter("binaryPropertyName", i, "data");

            // --- Input data validation ---
            if (!item || typeof item !== "object")
                throw new errors_1.FileTypeError(`Item #${i} is not an object`);
            const itemObj = item;
            if (!itemObj.binary || typeof itemObj.binary !== "object")
                throw new errors_1.FileTypeError(`Item #${i} does not contain binary data`);
            const binary = itemObj.binary;
            if (!binary[prop])
                throw new errors_1.FileTypeError(`Binary property "${prop}" is missing (item ${i})`);
            const binaryProp = binary[prop];
            if (!binaryProp.fileName || typeof binaryProp.fileName !== "string")
                throw new errors_1.FileTypeError(`File does not contain a valid name (item ${i})`);
            const buf = await this.helpers.getBinaryDataBuffer(i, prop);
            if (!Buffer.isBuffer(buf) || buf.length === 0)
                throw new errors_1.EmptyFileError("File is empty or contains no data");
            if (buf.length > maxFileSize)
                throw new errors_1.FileTooLargeError(`File is too large (maximum ${maxFileSize / 1024 / 1024} MB)`);
            // --- End of validation ---

            const name = (0, utils_1.sanitizeFileName)(binaryProp.fileName ?? "");
            let ext = path_1.default.extname(name).slice(1).toLowerCase();

            // Autodetect from content when the extension is missing or unknown.
            if (!ext || !SUPPORTED_FORMATS.includes(ext)) {
                let detected;
                try {
                    detected = await (0, file_type_1.fromBuffer)(buf);
                }
                catch (error) {
                    // Only a genuine sniffing failure is reported as such; the
                    // "unsupported" verdict below is not a detection failure.
                    this.logger?.warn('File type detection failed', {
                        fileName: name,
                        error: describeError(error)
                    });
                }
                if (!detected?.ext || !SUPPORTED_FORMATS.includes(detected.ext)) {
                    throw new errors_1.UnsupportedFormatError(`Unsupported file type: ${ext || "unknown"}`);
                }
                ext = detected.ext;
            }

            this.logger?.info("ConvertFileToJSON →", {
                file: name || "[no-name]",
                ext,
                size: buf.length,
            });

            let json = {};
            const startTime = performance.now();
            const outputFormat = this.getNodeParameter('outputFormat', i, 'text');
            try {
                if (!strategies_1.strategies[ext]) {
                    throw new errors_1.UnsupportedFormatError(`Format "${ext}" is not supported`);
                }
                // Only the DOCX strategy receives the output-format option.
                json = await strategies_1.strategies[ext](buf, ext, ext === 'docx' ? { outputFormat } : undefined);
            }
            catch (e) {
                // Domain errors already carry a precise message: pass them through.
                if (e instanceof errors_1.FileTypeError ||
                    e instanceof errors_1.FileTooLargeError ||
                    e instanceof errors_1.UnsupportedFormatError ||
                    e instanceof errors_1.EmptyFileError ||
                    e instanceof errors_1.ProcessingError) {
                    throw e;
                }
                throw new errors_1.ProcessingError(`${ext.toUpperCase()} processing error: ${describeError(e)}`);
            }

            // A converter that yields nothing usable must surface as a domain
            // error, not as a TypeError from the `in` operator further down.
            if (json === null || typeof json !== "object") {
                throw new errors_1.ProcessingError(`${ext.toUpperCase()} processing error: converter returned no result`);
            }

            const processingTime = performance.now() - startTime;
            this.logger?.info('Processing completed', {
                file: name,
                size: buf.length,
                time: `${processingTime.toFixed(2)}ms`,
                type: ext
            });

            if ("text" in json) {
                const { text } = json;
                // Guard `.trim()` so a non-string `text` cannot crash the check.
                const hasNoText = typeof text === "string" ? text.trim().length === 0 : !text;
                if (hasNoText) {
                    throw new errors_1.EmptyFileError(`File "${name}" (${ext.toUpperCase()}, ${(buf.length / 1024).toFixed(2)} KB) contains no extractable text. ` +
                        `Possible reasons: (1) File contains only images/graphics without text, ` +
                        `(2) File is password-protected or encrypted, ` +
                        `(3) File structure is corrupted, ` +
                        `(4) File was created with a non-standard application. ` +
                        `Try: Open file in original application and verify it contains text, then save it again.`);
                }
            }

            json.metadata = {
                fileName: (0, utils_1.sanitizeFileName)(name) || null,
                fileSize: buf.length,
                fileType: ext,
                processedAt: new Date().toISOString(),
            };
            return {
                json: json,
                pairedItem: { item: i },
            };
        };

        const results = await (0, utils_1.promisePool)(items, processItem, maxConcurrency);
        // All per-file results are folded into one output item.
        return [[{
                    json: {
                        files: results.map(result => result.json),
                        totalFiles: results.length,
                        processedAt: new Date().toISOString()
                    }
                }]];
    }
}
exports.FileToJsonNode = FileToJsonNode;
//# sourceMappingURL=FileToJsonNode.node.js.map