html-textify
Version:
Convert html to plain text
161 lines (155 loc) • 5.11 kB
JavaScript
// Bundler interop helpers: short aliases for the reflection primitives used
// below, plus the three functions that build the CommonJS export object.
var {
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames
} = Object;
var { hasOwnProperty: __hasOwnProp } = Object.prototype;

// Install one enumerable getter on `target` for every key of `all`; the
// value stored under each key in `all` is used directly as the getter.
var __export = (target, all) => {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { get: getter, enumerable: true });
  }
};

// Mirror the own properties of `from` onto `to` as live getters. Keys that
// `to` already owns, and the key equal to `except`, are left alone.
// Enumerability follows the source descriptor (true when none is found).
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

// Build a fresh object carrying a (non-enumerable) `__esModule: true` flag
// and copy the properties of `mod` onto it.
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
// src/index.ts
// Namespace object for the public API. __export installs an enumerable
// getter for each listed name, so `textify` is looked up lazily on access.
var index_exports = {};
__export(index_exports, {
textify: () => textify
});
// Publish the namespace as this module's CommonJS exports (with the
// __esModule flag added by __toCommonJS). `textify` is a function
// declaration further down the file, so the getter above can resolve it
// even though this assignment runs first.
module.exports = __toCommonJS(index_exports);
// src/utils/preserveFormat.ts
/**
 * Convert an HTML fragment to plain text while keeping a lightweight,
 * Markdown-like rendering of common formatting:
 *
 *   - `<br>`                  -> newline
 *   - `</p>`, `</h1>`..`</h6>` -> blank line
 *   - `<b>` / `<strong>`      -> `**text**`
 *   - `<i>` / `<em>`          -> `*text*`
 *   - `<a href="u">t</a>`     -> `t (u)`
 *   - `<ol>` / `<ul>` items   -> `1. item` / `- item`, one per line
 *   - `<blockquote>`          -> every line prefixed with `> `
 *   - `<table>`               -> one row per line, cells separated by tabs
 *
 * Remaining tags are stripped, the basic entities (`&nbsp;`, `&lt;`,
 * `&gt;`, `&amp;`) are decoded, and runs of 3+ newlines collapse to one
 * blank line.
 *
 * @param {Object} options
 * @param {string} options.html - HTML markup to convert. Falsy input yields "".
 * @param {string[]} [options.ignoreTags=[]] - Tag names whose markup is left
 *   untouched in the output. Matching is case-insensitive.
 * @returns {string} The trimmed plain-text rendering.
 */
function preserveFormat({
  html,
  ignoreTags = []
}) {
  if (!html) return "";

  // Normalise once: tag names captured from the markup are lower-cased
  // before lookup, so the ignore list has to be lower-cased as well or an
  // entry such as "B" would never match.
  const kept = new Set(ignoreTags.map((tag) => String(tag).toLowerCase()));
  const isKept = (tag) => kept.has(tag.toLowerCase());

  // Drop whitespace between adjacent tags so source indentation does not
  // leak into the text.
  let text = html.replace(/>\s+</g, "><");

  if (!kept.has("br")) {
    text = text.replace(/<br\s*\/?>/gi, "\n");
  }

  // Closing paragraph/heading tags become paragraph breaks.
  text = text.replace(
    /<\/(h[1-6]|p)>/gi,
    (match, tag) => (isKept(tag) ? match : "\n\n")
  );

  text = text.replace(
    /<(b|strong)>(.*?)<\/\1>/gi,
    (match, tag, content) => (isKept(tag) ? match : `**${content}**`)
  );
  text = text.replace(
    /<(i|em)>(.*?)<\/\1>/gi,
    (match, tag, content) => (isKept(tag) ? match : `*${content}*`)
  );

  if (!kept.has("a")) {
    text = text.replace(
      /<a\s+href="(.*?)".*?>(.*?)<\/a>/gi,
      (_m, href, label) => `${label} (${href})`
    );
  }

  // Ordered lists: numbering restarts at 1 for every <ol>.
  text = text.replace(/<ol>(.*?)<\/ol>/gis, (match, content) => {
    if (kept.has("ol")) return match;
    let counter = 0;
    return content.replace(
      /<li>(.*?)<\/li>/gi,
      (liMatch, liContent) => (kept.has("li") ? liMatch : `${++counter}. ${liContent}\n`)
    );
  });

  text = text.replace(/<ul>(.*?)<\/ul>/gis, (match, content) => {
    if (kept.has("ul")) return match;
    return content.replace(
      /<li>(.*?)<\/li>/gi,
      (liMatch, liContent) => (kept.has("li") ? liMatch : `- ${liContent}\n`)
    );
  });

  if (!kept.has("blockquote")) {
    text = text.replace(
      /<blockquote>(.*?)<\/blockquote>/gis,
      (_m, content) =>
        content
          .replace(/<br\s*\/?>/gi, "\n")
          .trim()
          .split("\n")
          .map((line) => `> ${line.trim()}`)
          .join("\n")
    );
  }

  // Tables: each <tr> becomes a line, each cell is followed by a tab and
  // the trailing separator is removed.
  text = text.replace(/<table>(.*?)<\/table>/gis, (match, tableContent) => {
    if (kept.has("table")) return match;
    const keepCells = kept.has("td") || kept.has("th");
    return tableContent
      .replace(/<tr>(.*?)<\/tr>/gi, (trMatch, rowContent) => {
        if (kept.has("tr")) return trMatch;
        const row = rowContent
          .replace(
            /<t[dh]>(.*?)<\/t[dh]>/gi,
            (cellMatch, cellContent) => (keepCells ? cellMatch : `${cellContent}\t`)
          )
          .trim()
          .replace(/\t$/, "");
        return `${row}\n`;
      })
      .trim();
  });

  // Strip whatever markup is left, sparing the tags the caller asked to keep.
  if (kept.size === 0) {
    text = text.replace(/<[^>]+>/g, "");
  } else {
    text = text.replace(
      /<\/?([a-z0-9]+)[^>]*>/gi,
      (match, tag) => (isKept(tag) ? match : "")
    );
  }

  // Decode entities only after tag stripping, so an escaped "&lt;b&gt;" is
  // not mistaken for markup. "&amp;" goes last so that "&amp;lt;" decodes
  // to the literal text "&lt;" rather than being decoded twice into "<".
  // A literal no-break space is folded to a plain space alongside "&nbsp;".
  text = text
    .replace(/&nbsp;|\u00a0/gi, " ")
    .replace(/&lt;/gi, "<")
    .replace(/&gt;/gi, ">")
    .replace(/&amp;/gi, "&");

  return text.replace(/\n{3,}/g, "\n\n").trim();
}
// src/utils/wrapByLength.ts
/**
 * Greedily re-flow `text` so that each output line holds as many
 * whitespace-separated words as fit within `length` characters. A single
 * word longer than `length` is placed on a line of its own, unbroken.
 *
 * @param {string} text - Text to wrap; existing whitespace runs are collapsed.
 * @param {number} length - Maximum line length in characters.
 * @returns {string} The wrapped text, lines joined with "\n".
 * @throws {Error} When `length` is less than or equal to 0.
 */
function wrapByLength(text, length) {
  if (length <= 0) {
    throw new Error("wrap length must be greater than 0");
  }

  const wrapped = [];
  let current = "";

  text.trim().split(/\s+/).forEach((word) => {
    // What the line would look like with this word appended.
    const candidate = current === "" ? word : `${current} ${word}`;
    if (candidate.length > length) {
      // Flush the line built so far and start a new one with this word.
      if (current !== "") wrapped.push(current);
      current = word;
    } else {
      current = candidate;
    }
  });

  if (current !== "") wrapped.push(current);
  return wrapped.join("\n");
}
// src/utils/wrapByWords.ts
/**
 * Re-flow `text` so that each output line carries at most `count`
 * whitespace-separated words.
 *
 * @param {string} text - Text to wrap; existing whitespace runs are collapsed.
 * @param {number} count - Words per line. Fractional values are rounded
 *   down, with a minimum of one word per line.
 * @returns {string} The wrapped text, lines joined with "\n".
 * @throws {Error} When `count` is not a number greater than 0.
 */
function wrapByWords(text, count) {
  // A zero or negative step would never advance the loop below (it would
  // spin forever, growing `lines`), and NaN would silently drop the text.
  if (!(count > 0)) {
    throw new Error("wrap word count must be greater than 0");
  }
  // Use a whole-number step so every line gets the same number of words.
  const perLine = Math.max(1, Math.floor(count));

  const words = text.trim().split(/\s+/);
  const lines = [];
  for (let i = 0; i < words.length; i += perLine) {
    lines.push(words.slice(i, i + perLine).join(" "));
  }
  return lines.join("\n");
}
// src/index.ts
/**
 * Convert HTML to plain text, optionally re-wrapping the result.
 *
 * @param {Object} options
 * @param {string} options.html - HTML markup to convert. Falsy input yields "".
 * @param {boolean} [options.preserveFormatting=true] - When true the markup
 *   is handed to `preserveFormat`; when false tags are simply stripped.
 * @param {string[]} [options.ignoreTags=[]] - Tag names whose markup is kept
 *   in the output.
 * @param {number} [options.wrapLength] - Wrap by character length via
 *   `wrapByLength`; only used when positive and `wrapWords` does not apply.
 * @param {number} [options.wrapWords] - Wrap by word count via
 *   `wrapByWords`; takes precedence over `wrapLength` when positive.
 * @returns {string} The plain-text result.
 */
function textify({
  html,
  preserveFormatting = true,
  ignoreTags = [],
  wrapLength,
  wrapWords
}) {
  if (!html) return "";

  let text;
  if (preserveFormatting) {
    text = preserveFormat({ html, ignoreTags });
  } else if (ignoreTags.length === 0) {
    // Nothing to keep: remove every tag.
    text = html.replace(/<[^>]+>/g, "").trim();
  } else {
    // Remove every tag except those named (case-insensitively) in ignoreTags.
    const keptTags = new Set(ignoreTags.map((name) => name.toLowerCase()));
    const stripUnlessKept = (whole, tagName) =>
      keptTags.has(tagName.toLowerCase()) ? whole : "";
    text = html.replace(/<\/?([a-z][a-z0-9-]*)\b[^>]*>/gi, stripUnlessKept).trim();
  }

  // Word-count wrapping wins over length wrapping when both are supplied.
  if (wrapWords && wrapWords > 0) {
    return wrapByWords(text, wrapWords);
  }
  if (wrapLength && wrapLength > 0) {
    return wrapByLength(text, wrapLength);
  }
  return text;
}
// Annotate the CommonJS export names for ESM import in node:
// The `0 &&` guard short-circuits, so this assignment never executes and the
// real exports set earlier in the file are left untouched; the statement is
// present only as the static annotation described above.
0 && (module.exports = {
textify
});
;