UNPKG

pdf2html

Version:

PDF to HTML or text conversion using Apache Tika. Also generates PDF thumbnails using Apache PDFBox.

28 lines (22 loc) 667 B
// lib/HTMLParser.js
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const DOMPurify = require('dompurify');

// DOMPurify is bound to a window object; an empty JSDOM document supplies one.
const { window } = new JSDOM('');
const purify = DOMPurify(window);

/**
 * Splits an HTML document into the contents of its `.page` elements.
 */
class HTMLParser {
  /**
   * Collects the content of every element matching the `.page` selector,
   * in the order cheerio returns them.
   *
   * @param {string} htmlContent - HTML markup handed to `cheerio.load`.
   * @param {object} [options={}] - Extraction options.
   * @param {boolean} [options.text] - When truthy, each entry is the element's
   *   trimmed text; otherwise it is the element's inner HTML passed through
   *   `purify.sanitize`.
   * @returns {string[]} One entry per matched `.page` element (empty when
   *   nothing matches).
   */
  static extractPages(htmlContent, options = {}) {
    const $ = cheerio.load(htmlContent);

    return $('.page')
      .toArray()
      .map((pageElement) => {
        const $page = $(pageElement);

        if (options.text) {
          return $page.text().trim();
        }

        return purify.sanitize($page.html());
      });
  }
}

module.exports = HTMLParser;