UNPKG

pomljs

Version:

Prompt Orchestration Markup Language

github.com/microsoft/poml

234 lines (231 loc) • 9.34 kB

JavaScript

import * as React from 'react';
import * as fs from 'fs';
import * as mammoth from 'mammoth';
import * as cheerio from 'cheerio';
import { Text, Header, Paragraph, Newline, List, ListItem, Bold, Italic, Image } from '../essentials.js';
import { getNumPages, pdfParse } from '../util/pdf.js';
import { component, useWithCatch, expandRelative, BufferCollection } from '../base.js';
import { Table } from './table.js';
import { parsePythonStyleSlice } from './utils.js';

/**
 * Read a file into a Buffer, reusing the copy stored in `BufferCollection`
 * when the file's modification time has not changed since it was cached.
 *
 * @param {string} filePath - Path to the file; resolved through `expandRelative`.
 * @returns {Buffer} The file contents.
 * @throws If the file cannot be stat'ed or read (errors from `fs` propagate).
 */
function readBufferCached(filePath) {
  const abs = expandRelative(filePath);
  const key = `content://${abs}`;
  const stat = fs.statSync(abs);
  const cached = BufferCollection.get(key);
  // The mtime is stored with the buffer so an edited file invalidates the entry.
  if (cached && cached.mtime === stat.mtimeMs) {
    return cached.value;
  }
  const buf = fs.readFileSync(abs);
  BufferCollection.set(key, { value: buf, mtime: stat.mtimeMs });
  return buf;
}

/**
 * Extract the content of a page range from a PDF.
 *
 * This is a workaround for pdf-parse not supporting a range: the document is
 * parsed up to `endPage + 1`, parsed again up to `startPage`, and the length
 * of the second result is cut off the front of the first.
 *
 * NOTE(review): assumes the second argument of `pdfParse` is a page limit and
 * that its result is a string (or another value with `length`/`slice`) --
 * confirm against `../util/pdf.js`.
 *
 * @param {Buffer} dataBuffer - Raw PDF data.
 * @param {number} startPage - First page of the range.
 * @param {number} endPage - Last page of the range; `endPage + 1` is passed to `pdfParse`.
 * @returns {Promise<*>} The parsed content restricted to the requested range.
 */
async function parsePdfWithPageLimit(dataBuffer, startPage, endPage) {
  const data = await pdfParse(dataBuffer, endPage + 1);
  if (startPage <= 0) {
    // Nothing precedes the range, so there is no prefix to remove.
    return data;
  }
  const minusData = await pdfParse(dataBuffer, startPage);
  return data.slice(minusData.length);
}

/**
 * Render a PDF document as a whitespace-preserving `Text` element.
 *
 * @param {Buffer} dataBuffer - Raw PDF data.
 * @param {object} [options] - Reader options.
 * @param {string} [options.selectedPages] - Page selection handed to
 *   `parsePythonStyleSlice`; when absent the whole document is parsed.
 * @returns {Promise<React.ReactElement>} A `Text` element holding the extracted content.
 */
async function readPdf(dataBuffer, options) {
  const { selectedPages } = options || {};
  if (!selectedPages) {
    return React.createElement(Text, { whiteSpace: 'pre' }, await pdfParse(dataBuffer));
  }
  // The page count is only needed to resolve the slice, so it is computed
  // here rather than on every call.
  const numPages = await getNumPages(dataBuffer);
  const [start, end] = parsePythonStyleSlice(selectedPages, numPages);
  const result = await parsePdfWithPageLimit(dataBuffer, start, end);
  return React.createElement(Text, { whiteSpace: 'pre' }, result);
}

/**
 * Convert every child node of a cheerio element into keyed React fragments.
 *
 * Text nodes are emitted verbatim; all other nodes go through `htmlToPoml`.
 *
 * @param {object} element - Cheerio selection whose contents are converted.
 * @param {Function} $ - The cheerio instance the selection belongs to.
 * @param {object} [options] - Options forwarded to `htmlToPoml`.
 * @returns {React.ReactElement[]} One fragment per child node, keyed by position.
 */
function htmlContentsToPoml(element, $, options) {
  return element
    .contents()
    .toArray()
    .map((child, index) => {
      const content = child.type === 'text' ? child.data : htmlToPoml($(child), $, options);
      return React.createElement(React.Fragment, { key: index }, content);
    });
}

/**
 * Convert an HTML `<table>` into a `Table` element.
 *
 * The first row is used as the header. If a body row has more cells than the
 * header, the header is padded with `Unnamed Column <index>` names (0-based
 * column index); body rows shorter than the header are padded with empty
 * strings.
 *
 * @param {object} element - Cheerio selection of the table.
 * @param {Function} $ - The cheerio instance the selection belongs to.
 * @param {object} [options] - Currently unused; accepted for signature parity.
 * @returns {React.ReactElement} A `Table` element, or an empty fragment when
 *   the table has no header cells.
 */
function convertTableFromHtml(element, $, options) {
  const body = element
    .find('tr')
    .toArray()
    .map(tr => $(tr)
      .find('td, th')
      .toArray()
      .map(td => $(td).text()));
  const header = body.shift() || [];
  if (header.length === 0) {
    return React.createElement(React.Fragment, null);
  }
  const maxColumns = Math.max(...body.map(row => row.length), header.length);
  // `Array(n).map(...)` never invokes its callback (the slots are empty), so
  // the padding is generated with an explicit index loop instead.
  for (let index = header.length; index < maxColumns; index++) {
    header.push(`Unnamed Column ${index}`);
  }
  const rows = body.map(row => Object.fromEntries(
    header.map((column, index) => [column, index < row.length ? row[index] : ''])
  ));
  return React.createElement(Table, {
    records: rows,
    columns: header.map(column => ({ field: column, header: column }))
  });
}

/**
 * Convert a single HTML element (as a cheerio selection) into POML elements.
 *
 * `style`/`script` are dropped; headings, paragraphs/divs, line breaks, lists,
 * bold (`b`/`strong`), italic (`i`/`em`), base64 data-URI images and tables
 * are mapped to their POML counterparts; any other element is replaced by its
 * converted children.
 *
 * @param {object} element - Cheerio selection of the element to convert.
 * @param {Function} $ - The cheerio instance the selection belongs to.
 * @param {object} [options] - Conversion options.
 * @param {boolean} [options.multimedia] - When truthy or undefined, images are
 *   emitted with `syntax: "multimedia"`; otherwise without a syntax override.
 * @returns {React.ReactElement} The converted element.
 */
function htmlToPoml(element, $, options) {
  if (element.is('style') || element.is('script')) {
    return React.createElement(React.Fragment, null);
  }
  if (element.is('h1, h2, h3, h4, h5, h6')) {
    return React.createElement(Header, { whiteSpace: 'pre' }, htmlContentsToPoml(element, $, options));
  }
  if (element.is('p, div')) {
    return React.createElement(Paragraph, { whiteSpace: 'pre' }, htmlContentsToPoml(element, $, options));
  }
  if (element.is('br')) {
    return React.createElement(Newline, null);
  }
  if (element.is('ol')) {
    return React.createElement(List, { listStyle: 'decimal' }, htmlContentsToPoml(element, $, options));
  }
  if (element.is('ul')) {
    return React.createElement(List, null, htmlContentsToPoml(element, $, options));
  }
  if (element.is('li')) {
    return React.createElement(ListItem, null, htmlContentsToPoml(element, $, options));
  }
  // mammoth emits <strong>/<em> by default, so both spellings are accepted.
  if (element.is('b, strong')) {
    return React.createElement(Bold, null, htmlContentsToPoml(element, $, options));
  }
  if (element.is('i, em')) {
    return React.createElement(Italic, null, htmlContentsToPoml(element, $, options));
  }
  if (element.is('img')) {
    // src is in the format of data:image/png;base64, so we can't use it directly
    const src = element.attr('src');
    // `attr` yields undefined when the attribute is missing; treat that like
    // any other unsupported source instead of crashing on `startsWith`.
    if (typeof src === 'string' && src.startsWith('data:') && src.includes(';base64')) {
      const base64 = src.split(',')[1];
      const alt = element.attr('alt');
      if (options?.multimedia || options?.multimedia === undefined) {
        return React.createElement(Image, { syntax: "multimedia", base64: base64, alt: alt });
      }
      return React.createElement(Image, { base64: base64, alt: alt });
    }
    // TODO: Probably needs to fetch a file or URL
    return React.createElement(React.Fragment, null);
  }
  if (element.is('table')) {
    return convertTableFromHtml(element, $, options);
  }
  return React.createElement(React.Fragment, null, htmlContentsToPoml(element, $, options));
}

/**
 * Render a DOCX document: mammoth converts it to HTML, which is then mapped
 * to POML elements and wrapped in a markdown-syntax `Text`.
 *
 * @param {Buffer} dataBuffer - Raw DOCX data.
 * @param {object} [options] - Options forwarded to the HTML conversion.
 * @returns {Promise<React.ReactElement>} A `Text` element holding the converted body.
 */
async function readDocx(dataBuffer, options) {
  const result = await mammoth.convertToHtml({ buffer: dataBuffer });
  const $ = cheerio.load(result.value);
  return React.createElement(Text, { syntax: "markdown" }, htmlContentsToPoml($('body'), $, options));
}

/**
 * Render a plain-text document as a whitespace-preserving `Text` element.
 *
 * @param {Buffer} dataBuffer - Raw text data; decoded with `toString()`.
 * @param {object} [options] - Unused; accepted for signature parity with the other readers.
 * @returns {Promise<React.ReactElement>} A `Text` element holding the decoded text.
 */
async function readTxt(dataBuffer, options) {
  const text = dataBuffer.toString();
  return React.createElement(Text, { whiteSpace: 'pre' }, text);
}

/**
 * Pick a parser name from a file name's extension (case-insensitive).
 *
 * NOTE(review): `.doc` is routed to the DOCX reader; confirm that mammoth can
 * actually read legacy binary `.doc` files.
 *
 * @param {string} src - File name or path.
 * @returns {'docx'|'pdf'|'txt'} The parser name.
 * @throws {Error} If the extension is not recognised.
 */
function determineParser(src) {
  const lowered = src.toLowerCase();
  if (lowered.endsWith('.docx') || lowered.endsWith('.doc')) {
    return 'docx';
  }
  if (lowered.endsWith('.pdf')) {
    return 'pdf';
  }
  if (lowered.endsWith('.txt')) {
    return 'txt';
  }
  throw new Error(`Cannot determine parser for ${lowered}. Please manually specify a parser.`);
}

/**
 * Resolve the parser and the data buffer from the component props, then run
 * the matching reader.
 *
 * When `src` is given the file is always read from disk, replacing any
 * `buffer` that was passed in.
 *
 * @param {object} props - Component props; `parser`, `src` and `buffer` are
 *   read here and the whole object is forwarded to the PDF/DOCX readers.
 * @returns {Promise<React.ReactElement>} The rendered document.
 * @throws {Error} If the parser cannot be determined, if neither `src` nor
 *   `buffer` is available, or if the parser name is unsupported.
 */
async function autoParseDocument(props) {
  let { parser, src, buffer } = props;
  if (parser === 'auto' || parser === undefined) {
    if (!src) {
      throw new Error('Cannot determine parser without source file provided.');
    }
    parser = determineParser(src);
  }
  if (src) {
    buffer = readBufferCached(src);
  } else if (!buffer) {
    throw new Error('Either buffer or src must be provided');
  }
  switch (parser) {
    case 'pdf':
      return await readPdf(buffer, props);
    case 'docx':
      return await readDocx(buffer, props);
    case 'txt':
      return await readTxt(buffer);
    default:
      throw new Error('Unsupported parser: ' + parser);
  }
}

/**
 * Displaying an external document like PDF, TXT or DOCX.
 *
 * @param {string} src - The source file to read the data from. This must be provided if neither `buffer` nor `base64` is provided.
 * @param {Buffer|string} buffer - Document data buffer. Recommended to use `src` instead unless you want to use a string.
 * @param {string} base64 - Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.
 * @param {'auto'|'pdf'|'docx'|'txt'} parser - The parser to use for reading the data. If not provided, it will be inferred from the file extension.
 * @param {boolean} multimedia - If true, the multimedias will be displayed. If false, the alt strings will be displayed at best effort. Default is `true`.
 * @param {string} selectedPages - The pages to be selected. This is only available **for PDF documents**. If not provided, all pages will be selected.
 * You can use a string like `2` to specify a single page, or slice like `2:4` to specify a range of pages (2 inclusive, 4 exclusive).
 * The pages selected are **0-indexed**. Negative indexes like `-1` are not supported here.
 *
 * @see {@link Inline} for other props available.
 *
 * @example
 * To display a Word document without including the real multimedia:
 * ```xml
 * <Document src="sample.docx" multimedia="false"/>
 * ```
 */
// NOTE(review): `asynchorous` is kept exactly as spelled; the option key is
// consumed by `component` in '../base.js', which is not visible here.
component('Document', { aliases: ['doc'], asynchorous: true })((props) => {
  let { buffer, parser, base64, ...others } = props;
  let parsedBuffer;
  if (base64) {
    if (buffer !== undefined) {
      throw new Error('Either buffer or base64 should be provided, not both.');
    }
    parsedBuffer = Buffer.from(base64, 'base64');
  } else if (typeof buffer === 'string') {
    // A string buffer is inline text, so it defaults to the txt parser.
    parsedBuffer = Buffer.from(buffer, 'utf-8');
    if (parser === undefined || parser === 'auto') {
      parser = 'txt';
    }
  } else {
    parsedBuffer = buffer;
  }
  const document = useWithCatch(autoParseDocument({ buffer: parsedBuffer, parser, ...others }), others);
  return React.createElement(React.Fragment, null, document ?? null);
});

export { htmlToPoml, readDocx, readPdf, readTxt };
//# sourceMappingURL=document.js.map