UNPKG

html-textify

Version:

Convert html to plain text

github.com/marjmandi/html-textify

marjmandi/html-textify

161 lines (155 loc) • 5.11 kB

JavaScript

"use strict"; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var index_exports = {}; __export(index_exports, { textify: () => textify }); module.exports = __toCommonJS(index_exports); // src/utils/preserveFormat.ts function preserveFormat({ html, ignoreTags = [] }) { if (!html) return ""; html = html.replace(/>\s+</g, "><"); html = !ignoreTags.includes("br") ? html.replace(/<br\s*\/?>/gi, "\n") : html; html = html.replace( /<\/(h[1-6]|p)>/gi, (match, tag) => ignoreTags.includes(tag.toLowerCase()) ? match : "\n\n" ); html = html.replace( /<(b|strong)>(.*?)<\/\1>/gi, (match, tag, content) => ignoreTags.includes(tag.toLowerCase()) ? match : `**${content}**` ); html = html.replace( /<(i|em)>(.*?)<\/\1>/gi, (match, tag, content) => ignoreTags.includes(tag.toLowerCase()) ? match : `*${content}*` ); html = !ignoreTags.includes("a") ? html.replace( /<a\s+href="(.*?)".*?>(.*?)<\/a>/gi, (_m, href, text) => `${text} (${href})` ) : html; html = html.replace(/<ol>(.*?)<\/ol>/gis, (match, content) => { if (ignoreTags.includes("ol")) return match; let counter = 0; return content.replace( /<li>(.*?)<\/li>/gi, (liMatch, liContent) => ignoreTags.includes("li") ? liMatch : `${++counter}. ${liContent} ` ); }); html = html.replace(/<ul>(.*?)<\/ul>/gis, (match, content) => { if (ignoreTags.includes("ul")) return match; return content.replace( /<li>(.*?)<\/li>/gi, (liMatch, liContent) => ignoreTags.includes("li") ? liMatch : `- ${liContent} ` ); }); html = !ignoreTags.includes("blockquote") ? html.replace( /<blockquote>(.*?)<\/blockquote>/gis, (_m, content) => content.replace(/<br\s*\/?>/gi, "\n").trim().split("\n").map((line) => `> ${line.trim()}`).join("\n") ) : html; html = html.replace( /<table>(.*?)<\/table>/gis, (match, tableContent) => { if (ignoreTags.includes("table")) return match; return tableContent.replace(/<tr>(.*?)<\/tr>/gi, (trMatch, rowContent) => { if (ignoreTags.includes("tr")) return trMatch; return rowContent.replace( /<t[dh]>(.*?)<\/t[dh]>/gi, (cellMatch, cellContent) => ignoreTags.includes("td") || ignoreTags.includes("th") ? cellMatch : `${cellContent} ` ).trim().replace(/\t$/, "") + "\n"; }).trim(); } ); if (ignoreTags.length === 0) { html = html.replace(/<[^>]+>/g, ""); } else { html = html.replace( /<\/?([a-z0-9]+)[^>]*>/gi, (match, tag) => ignoreTags.includes(tag.toLowerCase()) ? match : "" ); } html = html.replace(/ /gi, " ").replace(/&/gi, "&").replace(/</gi, "<").replace(/>/gi, ">"); html = html.replace(/\n{3,}/g, "\n\n").trim(); return html; } // src/utils/wrapByLength.ts function wrapByLength(text, length) { if (length <= 0) { throw new Error("wrap length must be greater than 0"); } const words = text.trim().split(/\s+/); const lines = []; let line = ""; for (const word of words) { if ((line + " " + word).trim().length > length) { if (line) lines.push(line.trim()); line = word; } else { line += " " + word; } } if (line) lines.push(line.trim()); return lines.join("\n"); } // src/utils/wrapByWords.ts function wrapByWords(text, count) { const words = text.trim().split(/\s+/); const lines = []; for (let i = 0; i < words.length; i += count) { lines.push(words.slice(i, i + count).join(" ")); } return lines.join("\n"); } // src/index.ts function textify({ html, preserveFormatting = true, ignoreTags = [], wrapLength, wrapWords }) { if (!html) return ""; if (preserveFormatting) { html = preserveFormat({ html, ignoreTags }); } else { if (ignoreTags.length === 0) { html = html.replace(/<[^>]+>/g, "").trim(); } else { const IG = new Set(ignoreTags.map((t) => t.toLowerCase())); html = html.replace( /<\/?([a-z][a-z0-9-]*)\b[^>]*>/gi, (match, tag) => IG.has(tag.toLowerCase()) ? match : "" ).trim(); } } if (wrapWords && wrapWords > 0) { html = wrapByWords(html, wrapWords); } else if (wrapLength && wrapLength > 0) { html = wrapByLength(html, wrapLength); } return html; } // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { textify });