UNPKG

@7-docs/cli

Version:

Command-line tool for 7-docs to ingest content

21 lines (20 loc) 860 B
import 'path2d-polyfill';
import { parser as textParser } from './text.js';

/**
 * Extracts the text of every page of a PDF and hands it to the plain-text parser.
 *
 * @param {Buffer|Uint8Array|*} content - PDF data. A Node `Buffer` is copied into a
 *   plain `Uint8Array` first; any other value is passed to `pdf.getDocument` as-is.
 * @param {*} maxLength - Forwarded unchanged to the text parser (its exact meaning
 *   is defined in `./text.js`, not here).
 * @returns {Promise<*>} Whatever the text parser returns for the extracted text.
 * @throws Propagates any error raised while loading the document or reading a page.
 */
export const parser = async (content, maxLength) => {
  // Loaded lazily so pdf.js is only pulled in when a PDF is actually parsed.
  const { default: pdf } = await import('pdfjs-dist/legacy/build/pdf.js');
  const data = Buffer.isBuffer(content) ? new Uint8Array(content) : content;
  const loadingTask = pdf.getDocument(data);

  let fullText = '';
  try {
    const pdfDocument = await loadingTask.promise;
    const { numPages } = pdfDocument;

    // pdf.js page numbers are 1-based.
    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
      const page = await pdfDocument.getPage(pageNum);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        // Only items carrying a `str` property contribute text.
        .filter((item) => 'str' in item)
        .map((item) => item.str)
        .join(' ');
      // Each page is prefixed with a single space (including the first one).
      fullText += ` ${pageText}`;
    }
  } finally {
    // Release the document, its worker and buffers even when loading or
    // extraction fails; otherwise every parsed PDF stays alive until exit.
    await loadingTask.destroy();
  }

  return textParser(fullText, maxLength);
};