pomljs
Version:
Prompt Orchestration Markup Language
234 lines (231 loc) • 9.34 kB
JavaScript
import * as React from 'react';
import * as fs from 'fs';
import * as mammoth from 'mammoth';
import * as cheerio from 'cheerio';
import { Text, Header, Paragraph, Newline, List, ListItem, Bold, Italic, Image } from '../essentials.js';
import { getNumPages, pdfParse } from '../util/pdf.js';
import { component, useWithCatch, expandRelative, BufferCollection } from '../base.js';
import { Table } from './table.js';
import { parsePythonStyleSlice } from './utils.js';
/**
 * Read a file into a Buffer, reusing the copy stored in `BufferCollection`
 * as long as the file's modification time matches the one recorded when the
 * copy was cached.
 *
 * @param {string} filePath - Path of the file; it is passed through `expandRelative` first.
 * @returns {Buffer} The file contents (the cached Buffer instance on a cache hit).
 */
function readBufferCached(filePath) {
    const resolvedPath = expandRelative(filePath);
    const cacheKey = `content://${resolvedPath}`;
    // Stat on every call so that an edited file invalidates its cache entry.
    const { mtimeMs } = fs.statSync(resolvedPath);
    const entry = BufferCollection.get(cacheKey);
    const isFresh = Boolean(entry) && entry.mtime === mtimeMs;
    if (!isFresh) {
        const contents = fs.readFileSync(resolvedPath);
        BufferCollection.set(cacheKey, { value: contents, mtime: mtimeMs });
        return contents;
    }
    return entry.value;
}
/**
 * Extract the text of a page range from a PDF.
 *
 * The underlying parser is called with a single numeric limit rather than a
 * range, so the range is emulated: parse up to the end of the range, parse up
 * to the start of the range, and drop the latter's length from the front of
 * the former.
 *
 * NOTE(review): the exact meaning of the second `pdfParse` argument (and thus
 * whether `endPage + 1` is the intended limit) is defined in `../util/pdf.js`
 * and is not visible here — confirm against that helper.
 *
 * @param dataBuffer - The PDF data handed to `pdfParse`.
 * @param {number} startPage - Start of the range; values `<= 0` skip the trimming step.
 * @param {number} endPage - End of the range.
 * @returns The result of the first parse, with the leading part sliced off when `startPage > 0`.
 */
async function parsePdfWithPageLimit(dataBuffer, startPage, endPage) {
    const throughEnd = await pdfParse(dataBuffer, endPage + 1);
    if (startPage <= 0) {
        // Nothing precedes the range, so there is nothing to cut away.
        return throughEnd;
    }
    const beforeStart = await pdfParse(dataBuffer, startPage);
    return throughEnd.slice(beforeStart.length);
}
/**
 * Render a PDF as a whitespace-preserving `Text` element.
 *
 * @param dataBuffer - The PDF data.
 * @param {object} [options] - Optional settings; only `selectedPages` is read here.
 *   When it is set, it is resolved with `parsePythonStyleSlice` against the
 *   document's page count and only that range is extracted; otherwise the
 *   whole document is parsed.
 * @returns A `Text` element with `whiteSpace: 'pre'` wrapping the extracted content.
 */
async function readPdf(dataBuffer, options) {
    const selectedPages = options?.selectedPages;
    const numPages = await getNumPages(dataBuffer);
    let content;
    if (selectedPages) {
        const [start, end] = parsePythonStyleSlice(selectedPages, numPages);
        content = await parsePdfWithPageLimit(dataBuffer, start, end);
    }
    else {
        content = await pdfParse(dataBuffer);
    }
    return React.createElement(Text, { whiteSpace: 'pre' }, content);
}
/**
 * Convert every child node of a cheerio selection into a keyed React fragment.
 *
 * Text nodes contribute their raw `data`; every other node is wrapped with
 * `$` and converted through `htmlToPoml`.
 *
 * @param element - Cheerio selection whose contents are converted.
 * @param $ - Cheerio instance used to wrap raw child nodes.
 * @param options - Conversion options forwarded unchanged to `htmlToPoml`.
 * @returns {Array} One fragment per child node, keyed by its position.
 */
function htmlContentsToPoml(element, $, options) {
    const nodes = element.contents().toArray();
    return nodes.map((node, position) => {
        const content = node.type === 'text' ? node.data : htmlToPoml($(node), $, options);
        return React.createElement(React.Fragment, { key: position }, content);
    });
}
/**
 * Convert an HTML table selection into a `Table` element.
 *
 * The first `<tr>` found supplies the column names; every remaining `<tr>`
 * becomes one record keyed by those names. Rows shorter than the header are
 * padded with empty strings. When a body row is wider than the header, the
 * header is extended with generated names of the form `Unnamed Column <index>`
 * (zero-based column index) so the extra cells are not lost.
 *
 * @param element - Cheerio selection wrapping the table.
 * @param $ - Cheerio instance used to wrap raw row and cell nodes.
 * @param options - Accepted for signature parity with the other converters; not used here.
 * @returns A `Table` element, or an empty fragment when the table has no rows
 *   or its first row has no cells.
 */
function convertTableFromHtml(element, $, options) {
    const body = element
        .find('tr')
        .toArray()
        .map(tr => $(tr)
        .find('td, th')
        .toArray()
        .map(cell => $(cell).text()));
    const header = body.shift() || [];
    if (header.length === 0) {
        return React.createElement(React.Fragment, null);
    }
    const maxColumns = body.reduce((widest, row) => Math.max(widest, row.length), header.length);
    // Generate names for columns that only appear in the body. This must not be
    // written as `Array(n).map(...)`: `map` skips the holes of a sparse array,
    // which would leave every extra column named `undefined`.
    for (let column = header.length; column < maxColumns; column++) {
        header.push(`Unnamed Column ${column}`);
    }
    const records = body.map(row => Object.fromEntries(header.map((column, index) => [column, index < row.length ? row[index] : ''])));
    const columns = header.map(column => ({ field: column, header: column }));
    return React.createElement(Table, { records, columns });
}
/**
 * Convert one cheerio-wrapped HTML element into the corresponding POML element.
 *
 * Mapping:
 * - `style`, `script`: dropped (empty fragment).
 * - `h1`..`h6`: `Header`; `p`, `div`: `Paragraph`; `br`: `Newline`.
 * - `ol`: `List` with `listStyle: 'decimal'`; `ul`: `List`; `li`: `ListItem`.
 * - `b`: `Bold`; `i`: `Italic`.
 * - `img`: `Image` when `src` is a base64 data URL, otherwise an empty fragment.
 * - `table`: delegated to `convertTableFromHtml`.
 * - anything else: a fragment holding its converted children.
 *
 * @param element - Cheerio selection for the element to convert.
 * @param $ - Cheerio instance used to wrap child nodes.
 * @param {object} [options] - Conversion options. `multimedia` selects the
 *   `multimedia` image syntax when it is truthy or left undefined.
 * @returns A React element.
 */
function htmlToPoml(element, $, options) {
    const isAny = (...tags) => tags.some(tag => element.is(tag));
    const convertChildren = () => htmlContentsToPoml(element, $, options);
    if (isAny('style', 'script')) {
        return React.createElement(React.Fragment, null);
    }
    if (isAny('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) {
        return React.createElement(Header, { whiteSpace: 'pre' }, convertChildren());
    }
    if (isAny('p', 'div')) {
        return React.createElement(Paragraph, { whiteSpace: 'pre' }, convertChildren());
    }
    if (element.is('br')) {
        return React.createElement(Newline, null);
    }
    if (element.is('ol')) {
        return React.createElement(List, { listStyle: 'decimal' }, convertChildren());
    }
    if (element.is('ul')) {
        return React.createElement(List, null, convertChildren());
    }
    if (element.is('li')) {
        return React.createElement(ListItem, null, convertChildren());
    }
    if (element.is('b')) {
        return React.createElement(Bold, null, convertChildren());
    }
    if (element.is('i')) {
        return React.createElement(Italic, null, convertChildren());
    }
    if (element.is('img')) {
        const src = element.attr('src');
        // `attr` yields undefined for an <img> without a src attribute; treat
        // that like any other image that is not an inline base64 data URL.
        const isBase64DataUrl = typeof src === 'string' && src.startsWith('data:') && src.includes(';base64');
        if (!isBase64DataUrl) {
            // TODO: Probably needs to fetch a file or URL
            return React.createElement(React.Fragment, null);
        }
        // The data URL looks like `data:image/png;base64,<payload>`; only the
        // payload after the comma is passed on.
        const base64 = src.split(',')[1];
        const alt = element.attr('alt');
        const multimedia = options?.multimedia;
        if (multimedia || multimedia === undefined) {
            return React.createElement(Image, { syntax: "multimedia", base64: base64, alt: alt });
        }
        return React.createElement(Image, { base64: base64, alt: alt });
    }
    if (element.is('table')) {
        return convertTableFromHtml(element, $, options);
    }
    return React.createElement(React.Fragment, null, convertChildren());
}
/**
 * Render a Word document as a `Text` element with markdown syntax.
 *
 * The document is first converted to HTML with mammoth, the HTML is loaded
 * into cheerio, and the children of `<body>` are converted to POML elements.
 *
 * @param dataBuffer - The document data, passed to mammoth as `buffer`.
 * @param options - Conversion options forwarded to `htmlContentsToPoml`.
 * @returns A `Text` element (`syntax: "markdown"`) wrapping the converted body.
 */
async function readDocx(dataBuffer, options) {
    const { value: html } = await mammoth.convertToHtml({ buffer: dataBuffer });
    const $ = cheerio.load(html);
    const bodyChildren = htmlContentsToPoml($('body'), $, options);
    return React.createElement(Text, { syntax: "markdown" }, bodyChildren);
}
/**
 * Render plain-text data as a whitespace-preserving `Text` element.
 *
 * @param dataBuffer - The text data; converted with its `toString()` method.
 * @param options - Unused; present so all readers share the same signature.
 * @returns A `Text` element with `whiteSpace: 'pre'` wrapping the decoded text.
 */
async function readTxt(dataBuffer, options) {
    const decoded = dataBuffer.toString();
    const textProps = { whiteSpace: 'pre' };
    return React.createElement(Text, textProps, decoded);
}
/**
 * Infer the parser name from a file name's extension (case-insensitive).
 *
 * `.docx` and `.doc` map to `'docx'`, `.pdf` to `'pdf'` and `.txt` to `'txt'`.
 *
 * @param {string} src - File name or path.
 * @returns {'docx'|'pdf'|'txt'} The parser to use.
 * @throws {Error} When the extension is not one of the known ones; the
 *   message contains the lower-cased path.
 */
function determineParser(src) {
    const lowered = src.toLowerCase();
    const parserByExtension = [
        ['.docx', 'docx'],
        ['.doc', 'docx'],
        ['.pdf', 'pdf'],
        ['.txt', 'txt'],
    ];
    for (const [extension, parser] of parserByExtension) {
        if (lowered.endsWith(extension)) {
            return parser;
        }
    }
    throw new Error(`Cannot determine parser for ${lowered}. Please manually specify a parser.`);
}
/**
 * Pick a parser and a data source from the component props and parse the document.
 *
 * - When `parser` is `'auto'` or undefined it is inferred from `src`, which
 *   must then be present.
 * - When `src` is set, the file is read (through the cache) and replaces any
 *   `buffer` that was passed in; otherwise `buffer` must be provided.
 *
 * @param {object} props - Props carrying `parser`, `src`, `buffer` plus any
 *   reader options; the whole object is forwarded to the PDF and DOCX readers.
 * @returns The element produced by the selected reader.
 * @throws {Error} When the parser cannot be inferred, when no data source is
 *   available, or when `parser` names an unsupported format.
 */
async function autoParseDocument(props) {
    const { src } = props;
    let { parser, buffer } = props;
    const mustInferParser = parser === undefined || parser === 'auto';
    if (mustInferParser) {
        if (!src) {
            throw new Error('Cannot determine parser without source file provided.');
        }
        parser = determineParser(src);
    }
    if (src) {
        buffer = readBufferCached(src);
    }
    else if (!buffer) {
        throw new Error('Either buffer or src must be provided');
    }
    if (parser === 'pdf') {
        return await readPdf(buffer, props);
    }
    if (parser === 'docx') {
        return await readDocx(buffer, props);
    }
    if (parser === 'txt') {
        return await readTxt(buffer);
    }
    throw new Error('Unsupported parser: ' + parser);
}
/**
* Displaying an external document like PDF, TXT or DOCX.
*
* @param {string} src - The source file to read the data from. This must be provided if neither `buffer` nor `base64` is provided.
* @param {Buffer|string} buffer - Document data buffer. Recommended to use `src` instead unless you want to use a string.
* @param {string} base64 - Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.
* @param {'auto'|'pdf'|'docx'|'txt'} parser - The parser to use for reading the data. If not provided, it will be inferred from the file extension.
* @param {boolean} multimedia - If true, multimedia content will be displayed. If false, the alt strings will be displayed at best effort. Default is `true`.
* @param {string} selectedPages - The pages to be selected. This is only available **for PDF documents**. If not provided, all pages will be selected.
* You can use a string like `2` to specify a single page, or slice like `2:4` to specify a range of pages (2 inclusive, 4 exclusive).
* The pages selected are **0-indexed**. Negative indexes like `-1` are not supported here.
*
* @see {@link Inline} for other props available.
*
* @example
* To display a Word document without including the real multimedia:
* ```xml
* <Document src="sample.docx" multimedia="false"/>
* ```
*/
component('Document', { aliases: ['doc'], asynchorous: true })((props) => {
    const { buffer, parser: requestedParser, base64, ...others } = props;
    // Normalise the supplied content (base64 string, plain string or Buffer)
    // into the `buffer`/`parser` pair that autoParseDocument expects.
    let parser = requestedParser;
    let parsedBuffer;
    if (base64) {
        if (buffer !== undefined) {
            throw new Error('Either buffer or base64 should be provided, not both.');
        }
        parsedBuffer = Buffer.from(base64, 'base64');
    }
    else if (typeof buffer === 'string') {
        parsedBuffer = Buffer.from(buffer, 'utf-8');
        // A plain string carries no file extension to infer from, so an
        // unspecified parser falls back to plain text.
        const parserUnspecified = parser === undefined || parser === 'auto';
        if (parserUnspecified) {
            parser = 'txt';
        }
    }
    else {
        parsedBuffer = buffer;
    }
    const pendingDocument = autoParseDocument({ buffer: parsedBuffer, parser, ...others });
    const document = useWithCatch(pendingDocument, others);
    return React.createElement(React.Fragment, null, document ?? null);
});
export { htmlToPoml, readDocx, readPdf, readTxt };
//# sourceMappingURL=document.js.map