UNPKG

@anyparser/core

Version:

The `@anyparser/core` TypeScript SDK enables developers to quickly extract structured data from a wide variety of file formats such as PDFs, images, websites, audio, and videos.

584 lines (568 loc) 16.2 kB
// Node.js built-ins used by the validators below.
import * as fsapi from "node:fs";
import { basename } from "node:path";
import * as fs from "node:fs/promises";
import { access } from "node:fs/promises";

// ---------------------------------------------------------------------------
// Runtime helpers
// NOTE(review): these `__`-prefixed helpers look like bundler-generated shims
// for object spread, async functions and `for await`; they are kept so the
// module does not require newer syntax than the original build used.
// ---------------------------------------------------------------------------

const __defProp = Object.defineProperty;
const __getOwnPropSymbols = Object.getOwnPropertySymbols;
const __hasOwnProp = Object.prototype.hasOwnProperty;
const __propIsEnum = Object.prototype.propertyIsEnumerable;

/** Returns `Symbol[name]` when it exists, otherwise the registry symbol `Symbol.<name>`. */
const __knownSymbol = (name, symbol) =>
  (symbol = Symbol[name]) ? symbol : Symbol.for("Symbol." + name);

/** Sets `obj[key] = value`, redefining the property as a plain data property when `key` is already in `obj`. */
const __defNormalProp = (obj, key, value) =>
  key in obj
    ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value })
    : (obj[key] = value);

/** Copies own enumerable string- and symbol-keyed properties of `b` onto `a` and returns `a`. */
const __spreadValues = (a, b) => {
  const source = b || {};
  for (const prop in source) {
    if (__hasOwnProp.call(source, prop)) {
      __defNormalProp(a, prop, source[prop]);
    }
  }
  if (__getOwnPropSymbols) {
    for (const sym of __getOwnPropSymbols(source)) {
      if (__propIsEnum.call(source, sym)) {
        __defNormalProp(a, sym, source[sym]);
      }
    }
  }
  return a;
};

/**
 * Drives a generator function as an async function: every yielded value is
 * awaited and its result (or rejection) is fed back into the generator.
 * Returns a promise for the generator's return value.
 */
const __async = (__this, __arguments, generator) => {
  return new Promise((resolve, reject) => {
    const fulfilled = (value) => {
      try {
        step(generator.next(value));
      } catch (e) {
        reject(e);
      }
    };
    const rejected = (value) => {
      try {
        step(generator.throw(value));
      } catch (e) {
        reject(e);
      }
    };
    const step = (x) =>
      x.done ? resolve(x.value) : Promise.resolve(x.value).then(fulfilled, rejected);
    step((generator = generator.apply(__this, __arguments)).next());
  });
};

/**
 * Returns an async iterator for `obj`: its own async iterator when it has one,
 * otherwise its sync iterator with `next`/`return` wrapped to return promises.
 */
const __forAwait = (obj) => {
  const asyncFactory = obj[__knownSymbol("asyncIterator")];
  if (asyncFactory) {
    return asyncFactory.call(obj);
  }
  const syncIterator = obj[__knownSymbol("iterator")]();
  const wrapped = {};
  const adapt = (key) => {
    const fn = syncIterator[key];
    if (fn) {
      wrapped[key] = (arg) =>
        new Promise((resolve, reject) => {
          const result = fn.call(syncIterator, arg);
          const done = result.done;
          Promise.resolve(result.value).then((value) => resolve({ value, done }), reject);
        });
    }
  };
  adapt("next");
  adapt("return");
  return wrapped;
};

// src/version.ts
const ANYPARSER_VERSION = "1.0.1";
const version = ANYPARSER_VERSION;

// src/config/hardcoded.ts

/** API base URL used when `ANYPARSER_API_URL` is unset or not a valid URL. */
const FALLBACK_API_URL = "https://anyparserapi.com";

/** Values accepted for the `ocrPreset` option (checked by `validateOption`). */
const OCR_PRESETS = Object.freeze({
  DOCUMENT: "document",
  HANDWRITING: "handwriting",
  SCAN: "scan",
  RECEIPT: "receipt",
  MAGAZINE: "magazine",
  INVOICE: "invoice",
  BUSINESS_CARD: "business-card",
  PASSPORT: "passport",
  DRIVER_LICENSE: "driver-license"
});

/**
 * Values accepted for entries of the `ocrLanguage` option (checked by `validateOption`).
 *
 * NOTE(review): the codes appear to follow Tesseract traineddata names. If so,
 * a few key names look mismatched with their codes and should be confirmed with
 * the API owners: `IGBO: "iku"` (`iku` is Inuktitut), `EQUATORIAL_GUINEAN: "equ"`
 * (`equ` is the math/equation module), `DIVESH: "div"` (Dhivehi),
 * `ODISHA: "ori"` (Odia/Oriya), `SUNDIANESE: "sun"` (Sundanese).
 * Keys and values are left untouched because they are part of the public export.
 */
const OCR_LANGUAGES = Object.freeze({
  AFRIKAANS: "afr",
  AMHARIC: "amh",
  ARABIC: "ara",
  ASSAMESE: "asm",
  AZERBAIJANI: "aze",
  AZERBAIJANI_CYRILLIC: "aze_cyrl",
  BELARUSIAN: "bel",
  BENGALI: "ben",
  TIBETAN: "bod",
  BOSNIAN: "bos",
  BRETON: "bre",
  BULGARIAN: "bul",
  CATALAN: "cat",
  CEBUANO: "ceb",
  CZECH: "ces",
  SIMPLIFIED_CHINESE: "chi_sim",
  SIMPLIFIED_CHINESE_VERTICAL: "chi_sim_vert",
  TRADITIONAL_CHINESE: "chi_tra",
  TRADITIONAL_CHINESE_VERTICAL: "chi_tra_vert",
  CHEROKEE: "chr",
  CORSICAN: "cos",
  WELSH: "cym",
  DANISH: "dan",
  DANISH_FRAKTUR: "dan_frak",
  GERMAN: "deu",
  GERMAN_FRAKTUR: "deu_frak",
  GERMAN_LATIN: "deu_latf",
  DIVESH: "div",
  DZONGKHA: "dzo",
  GREEK: "ell",
  ENGLISH: "eng",
  MIDDLE_ENGLISH: "enm",
  ESPERANTO: "epo",
  EQUATORIAL_GUINEAN: "equ",
  ESTONIAN: "est",
  BASQUE: "eus",
  FAROESE: "fao",
  PERSIAN: "fas",
  FILIPINO: "fil",
  FINNISH: "fin",
  FRENCH: "fra",
  OLD_FRENCH: "frm",
  FRISIAN: "fry",
  SCOTTISH_GAELIC: "gla",
  IRISH: "gle",
  GALICIAN: "glg",
  ANCIENT_GREEK: "grc",
  GUJARATI: "guj",
  HAITIAN_CREOLE: "hat",
  HEBREW: "heb",
  HINDI: "hin",
  CROATIAN: "hrv",
  HUNGARIAN: "hun",
  ARMENIAN: "hye",
  IGBO: "iku",
  INDONESIAN: "ind",
  ICELANDIC: "isl",
  ITALIAN: "ita",
  OLD_ITALIAN: "ita_old",
  JAVANESE: "jav",
  JAPANESE: "jpn",
  JAPANESE_VERTICAL: "jpn_vert",
  KANNADA: "kan",
  GEORGIAN: "kat",
  OLD_GEORGIAN: "kat_old",
  KAZAKH: "kaz",
  KHMER: "khm",
  KIRGHIZ: "kir",
  KURDISH: "kmr",
  KOREAN: "kor",
  KOREAN_VERTICAL: "kor_vert",
  LAO: "lao",
  LATIN: "lat",
  LATVIAN: "lav",
  LITHUANIAN: "lit",
  LUXEMBOURGISH: "ltz",
  MALAYALAM: "mal",
  MARATHI: "mar",
  MACEDONIAN: "mkd",
  MALTESE: "mlt",
  MONGOLIAN: "mon",
  MAORI: "mri",
  MALAY: "msa",
  MYANMAR: "mya",
  NEPALI: "nep",
  DUTCH: "nld",
  NORWEGIAN: "nor",
  OCCITAN: "oci",
  ODISHA: "ori",
  OSD: "osd",
  PUNJABI: "pan",
  POLISH: "pol",
  PORTUGUESE: "por",
  PASHTO: "pus",
  QUECHUA: "que",
  ROMANIAN: "ron",
  RUSSIAN: "rus",
  SANSKRIT: "san",
  SINHALA: "sin",
  SLOVAK: "slk",
  SLOVAK_FRAKTUR: "slk_frak",
  SLOVENIAN: "slv",
  SINDHI: "snd",
  SPANISH: "spa",
  OLD_SPANISH: "spa_old",
  ALBANIAN: "sqi",
  SERBIAN: "srp",
  SERBIAN_LATIN: "srp_latn",
  SUNDIANESE: "sun",
  SWAHILI: "swa",
  SWEDISH: "swe",
  SYRIAC: "syr",
  TAMIL: "tam",
  TATAR: "tat",
  TELUGU: "tel",
  TAJIK: "tgk",
  TAGALOG: "tgl",
  THAI: "tha",
  TIGRINYA: "tir",
  TONGAN: "ton",
  TURKISH: "tur",
  UIGHUR: "uig",
  UKRAINIAN: "ukr",
  URDU: "urd",
  UZBEK: "uzb",
  UZBEK_CYRILLIC: "uzb_cyrl",
  VIETNAMESE: "vie",
  YIDDISH: "yid",
  YORUBA: "yor"
});

// src/utils/fetcher.ts

/** Error thrown by `wrappedFetch` for non-OK responses; carries the body (as `cause`) and the HTTP status. */
const WrappedError = class extends Error {
  /**
   * @param message - Human-readable summary
   * @param cause - Underlying error (for `wrappedFetch`, an Error holding the response body text)
   * @param statusCode - HTTP status code of the failed response
   */
  constructor(message, cause, statusCode) {
    super(message);
    this.name = "WrappedError";
    this.cause = cause;
    this.statusCode = statusCode;
  }
};

/**
 * Calls `fetch` and rejects with a `WrappedError` when `response.ok` is false.
 * @param input - Request URL
 * @param options - Options forwarded to `fetch`
 * @returns The response, only when `response.ok` is true
 */
const wrappedFetch = (input, options) =>
  __async(void 0, null, function* () {
    const response = yield fetch(input, options);
    if (!response.ok) {
      const { status, statusText } = response;
      const text = yield response.text();
      throw new WrappedError(`HTTP ${status} ${statusText}: ${input}`, new Error(text), status);
    }
    return response;
  });

// src/form.ts

/**
 * Builds the multipart form for a parse request from validated options.
 *
 * - `format` and `model` are always sent.
 * - `image`/`table` are sent only for models other than "ocr" and "crawler".
 * - `ocrLanguage`/`ocrPreset` are sent only for the "ocr" model.
 * - The "crawler" model sends `url` and crawl settings; every other model sends the files.
 *
 * @param parsed - Options object produced by `validateAndParse`
 * @returns The populated FormData
 */
function buildForm(parsed) {
  const formData = new FormData();
  const { model } = parsed;

  formData.append("format", parsed.format);
  formData.append("model", model);

  if (model !== "ocr" && model !== "crawler") {
    if (parsed.image !== void 0) {
      formData.append("image", String(parsed.image));
    }
    if (parsed.table !== void 0) {
      formData.append("table", String(parsed.table));
    }
  }

  if (model === "ocr") {
    if (parsed.ocrLanguage != null && parsed.ocrLanguage.length) {
      formData.append("ocrLanguage", parsed.ocrLanguage.join(","));
    }
    if (parsed.ocrPreset) {
      formData.append("ocrPreset", parsed.ocrPreset);
    }
  }

  if (model === "crawler") {
    formData.append("url", parsed.url != null ? parsed.url : "");
    if (parsed.maxDepth !== void 0) {
      formData.append("maxDepth", String(parsed.maxDepth));
    }
    if (parsed.maxExecutions !== void 0) {
      formData.append("maxExecutions", String(parsed.maxExecutions));
    }
    if (parsed.strategy) {
      formData.append("strategy", parsed.strategy);
    }
    if (parsed.traversalScope) {
      formData.append("traversalScope", parsed.traversalScope);
    }
  } else if (parsed.files) {
    for (const file of parsed.files) {
      formData.append("files", file.contents, file.fileName);
    }
  }

  return formData;
}

// src/utils/nullable.ts

/** True for `undefined`, `null`, and strings that are empty after trimming. */
const isNullOrUndefined = (suspect) => {
  if (typeof suspect === "undefined" || suspect === null) {
    return true;
  }
  if (typeof suspect === "string") {
    return suspect.trim() === "";
  }
  return false;
};

/** True for any non-null value whose `typeof` is "object" (arrays included). */
const isValidObject = (suspect) => {
  return typeof suspect === "object" && suspect !== void 0 && suspect !== null;
};

// src/utils/env.ts

/**
 * Reads an environment variable.
 * @param key - Variable name
 * @param fallback - Value used when the variable is unset or blank
 * @returns The variable, else the fallback, else ""
 */
const env = (key, fallback = "") => {
  const value = process.env[key];
  if (!isNullOrUndefined(value)) {
    return value;
  }
  if (!isNullOrUndefined(fallback)) {
    return fallback;
  }
  return "";
};

// src/options.default.ts

/** Resolves the API base URL from `ANYPARSER_API_URL`, falling back to `FALLBACK_API_URL` when it is not a valid URL. */
const getApiUrl = () => {
  const value = env("ANYPARSER_API_URL", FALLBACK_API_URL);
  try {
    return new URL(value);
  } catch (e) {
    console.error("Invalid API URL %s", value);
  }
  console.debug("Defaulting to %s", FALLBACK_API_URL);
  return new URL(FALLBACK_API_URL);
};

/** Defaults merged under caller options; the environment is read once, when the module loads. */
const defaultOptions = {
  apiUrl: getApiUrl(),
  apiKey: env("ANYPARSER_API_KEY"),
  format: "json",
  model: "text",
  image: true,
  table: true
};

// src/options.ts

/** Throws unless `apiKey` is a non-blank string. */
function validateApiKey(apiKey) {
  if (!apiKey) {
    throw new Error("API key is required");
  }
  if (typeof apiKey !== "string" || apiKey.trim().length === 0) {
    throw new Error("API key must be a non-empty string");
  }
}

/**
 * Merges caller options over `defaultOptions` and validates the credentials.
 * @param options - Optional caller-supplied options
 * @returns The normalized options object
 * @throws Error when the API key or API URL is missing or invalid
 */
function buildOptions(options) {
  const mergedOptions = __spreadValues(__spreadValues({}, defaultOptions), options);

  // An explicit `undefined`/`null` from the caller (e.g. `{ apiUrl: undefined }`)
  // would otherwise erase the environment-derived default during the merge.
  if (mergedOptions.apiKey == null) {
    mergedOptions.apiKey = defaultOptions.apiKey;
  }
  if (mergedOptions.apiUrl == null) {
    mergedOptions.apiUrl = defaultOptions.apiUrl;
  }

  validateApiKey(mergedOptions.apiKey);
  if (!mergedOptions.apiUrl) {
    throw new Error("API URL is required");
  }

  return {
    apiUrl: mergedOptions.apiUrl,
    apiKey: mergedOptions.apiKey,
    format: mergedOptions.format || "json",
    model: mergedOptions.model || "text",
    encoding: mergedOptions.encoding || "utf-8",
    image: mergedOptions.image,
    table: mergedOptions.table,
    ocrLanguage: mergedOptions.ocrLanguage,
    ocrPreset: mergedOptions.ocrPreset,
    url: mergedOptions.url,
    maxDepth: mergedOptions.maxDepth,
    maxExecutions: mergedOptions.maxExecutions,
    strategy: mergedOptions.strategy,
    traversalScope: mergedOptions.traversalScope
  };
}

// src/validator/option.ts

/**
 * Checks the API URL and the OCR-related options.
 * @throws Error when the API URL is missing, or an OCR language/preset is not a known value
 */
const validateOption = (parsed) => {
  if (isNullOrUndefined(parsed.apiUrl)) {
    throw new Error("API URL is required");
  }
  if (!isNullOrUndefined(parsed.ocrLanguage)) {
    const knownLanguages = Object.values(OCR_LANGUAGES);
    parsed.ocrLanguage.forEach((language) => {
      if (!knownLanguages.includes(language)) {
        throw new Error("Invalid OCR language");
      }
    });
  }
  if (!isNullOrUndefined(parsed.ocrPreset)) {
    if (!Object.values(OCR_PRESETS).includes(parsed.ocrPreset)) {
      throw new Error("Invalid OCR preset");
    }
  }
};

// src/validator/path.ts

/**
 * Checks that at least one path was given and that each one is accessible.
 * @param filePaths - A single path or an array of paths
 * @returns `{ valid: true, files }` or `{ valid: false, error }`; never rejects for a bad path
 */
const validatePath = (filePaths) =>
  __async(void 0, null, function* () {
    if (!filePaths) {
      return { valid: false, error: new Error("No files provided") };
    }
    const files = Array.isArray(filePaths) ? filePaths : [filePaths];
    if (files.length === 0) {
      return { valid: false, error: new Error("No files provided") };
    }
    for (const filePath of files) {
      try {
        yield access(filePath);
      } catch (error) {
        return { valid: false, error };
      }
    }
    return { valid: true, files };
  });

// src/validator/crawler.ts

/**
 * Picks the start URL for a crawl and normalizes it through `URL`.
 * @param filePaths - A URL string, or an array whose first non-blank entry is used
 * @returns The normalized URL string
 * @throws TypeError when no usable entry exists or the entry is not a valid URL
 */
const getURLToCrawl = (filePaths) => {
  const candidate = Array.isArray(filePaths)
    ? filePaths.find((x) => !isNullOrUndefined(x))
    : filePaths;
  if (isNullOrUndefined(candidate)) {
    // Previously this fell through to `new URL(<array>)`, which failed with an opaque "Invalid URL".
    throw new TypeError("A URL to crawl is required");
  }
  return new URL(candidate).toString();
};

// src/validator/index.ts

/**
 * Verifies that a file exists and can be opened for reading.
 * @throws Error with a friendlier message for ENOENT and for EBUSY/ELOCK; other errors are rethrown as-is
 */
function checkFileAccess(filePath) {
  return __async(this, null, function* () {
    try {
      yield fs.access(filePath);
    } catch (error) {
      if (error instanceof Error && "code" in error && error.code === "ENOENT") {
        throw new Error(`File ${filePath} was not found or was removed`);
      }
      throw error;
    }
    try {
      const fileHandle = yield fs.open(filePath, "r");
      yield fileHandle.close();
    } catch (error) {
      if (
        error instanceof Error &&
        "code" in error &&
        (error.code === "EBUSY" || error.code === "ELOCK")
      ) {
        throw new Error(`File ${filePath} is locked by another process`);
      }
      throw error;
    }
  });
}

/**
 * Reads a whole file into a single Buffer by draining a read stream.
 * The loop is the down-levelled form of `for await (const chunk of fileStream)`.
 */
function readFileContents(filePath) {
  return __async(this, null, function* () {
    const fileStream = fsapi.createReadStream(filePath);
    const chunks = [];
    try {
      for (
        var iter = __forAwait(fileStream), more, temp, error;
        (more = !(temp = yield iter.next()).done);
        more = false
      ) {
        chunks.push(temp.value);
      }
    } catch (caught) {
      // Boxed in an array so that a thrown falsy value is still detected below.
      error = [caught];
    } finally {
      try {
        // Close the iterator only when the loop stopped before it was exhausted.
        more && (temp = iter.return) && (yield temp.call(iter));
      } finally {
        if (error) throw error[0];
      }
    }
    return Buffer.concat(chunks);
  });
}

/**
 * Normalizes options, validates them, and loads the inputs for a request.
 * For the "crawler" model the input is treated as a start URL; otherwise every
 * path is checked, read fully into memory and wrapped in a `File`.
 *
 * @param filePaths - File path(s), or a start URL when `options.model` is "crawler"
 * @param options - Caller-supplied options
 * @returns The options object consumed by `buildForm`, with `url` or `files` filled in
 * @throws Error for invalid options, an unsupported format, or inaccessible files
 */
function validateAndParse(filePaths, options) {
  return __async(this, null, function* () {
    const parsed = buildOptions(options);
    validateOption(parsed);
    if (!["json", "markdown", "html"].includes(parsed.format)) {
      throw new Error(`Unsupported format: ${parsed.format}`);
    }

    const isCrawler = (options == null ? void 0 : options.model) === "crawler";
    const result = isCrawler
      ? { valid: true, files: [getURLToCrawl(filePaths)] }
      : yield validatePath(filePaths);
    if (result.valid === false) {
      throw result.error;
    }

    const parsedOption = {
      apiUrl: parsed.apiUrl,
      apiKey: parsed.apiKey,
      format: parsed.format,
      model: parsed.model,
      image: parsed.image,
      table: parsed.table,
      ocrLanguage: parsed.ocrLanguage,
      ocrPreset: parsed.ocrPreset,
      url: parsed.url,
      maxDepth: parsed.maxDepth,
      maxExecutions: parsed.maxExecutions,
      strategy: parsed.strategy,
      traversalScope: parsed.traversalScope,
      encoding: parsed.encoding
    };

    if (isCrawler) {
      parsedOption.url = result.files[0];
      return parsedOption;
    }

    const processed = [];
    for (const filePath of result.files) {
      yield checkFileAccess(filePath);
      const buffer = yield readFileContents(filePath);
      const fileName = basename(filePath);
      const contents = new File([buffer], fileName, { type: "application/octet-stream" });
      processed.push({ fileName, contents });
    }
    parsedOption.files = processed;
    return parsedOption;
  });
}

// src/utils/casing.ts

/** Converts `snake_case` to `camelCase`: each run of underscores plus the next character becomes that character upper-cased. */
const underscoreToCamel = (x) => x.replace(/_+(.)/g, (_, c) => c.toUpperCase());

// src/utils/camel-case.ts

/**
 * Recursively converts object keys from snake_case to camelCase.
 * Arrays, Maps (keys and values) and Sets are rebuilt with converted members;
 * Date, RegExp, URL, functions, null/undefined and primitives are returned unchanged.
 */
const transformToCamel = (item) => {
  if (item instanceof Date || item instanceof RegExp || item instanceof URL) {
    return item;
  }
  if (typeof item === "function") {
    return item;
  }
  if (item === null || item === void 0) {
    return item;
  }
  if (Array.isArray(item)) {
    return item.map((el) => transformToCamel(el));
  }
  if (item instanceof Map) {
    const transformedMap = new Map();
    item.forEach((value, key) => {
      transformedMap.set(transformToCamel(key), transformToCamel(value));
    });
    return transformedMap;
  }
  if (item instanceof Set) {
    const transformedSet = new Set();
    item.forEach((value) => {
      transformedSet.add(transformToCamel(value));
    });
    return transformedSet;
  }
  if (isValidObject(item)) {
    return Object.keys(item).reduce((acc, key) => {
      acc[underscoreToCamel(key)] = transformToCamel(item[key]);
      return acc;
    }, {});
  }
  return item;
};

// src/parser.ts
const Anyparser = class {
  /**
   * Initialize the parser with optional configuration.
   * @param options - Configuration options for the parser
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Parse files using the Anyparser API.
   * @param filePathsOrUrl - A single file path or list of file paths to parse, or a start URL for crawling
   * @returns The JSON response with keys converted to camelCase when format is "json",
   *          or the raw response text when format is "markdown" or "html"
   * @throws Error if validation fails, the format is unsupported, or the API request fails
   */
  parse(filePathsOrUrl) {
    return __async(this, null, function* () {
      const parsed = yield validateAndParse(filePathsOrUrl, this.options);
      const { apiUrl, apiKey } = parsed;
      const formData = buildForm(parsed);

      const headers = apiKey
        ? {
            Authorization: `Bearer ${apiKey}`,
            "User-Agent": `@anyparser/core@${ANYPARSER_VERSION}`
          }
        : {};
      const fetchOptions = { method: "POST", body: formData, headers };

      const url = new URL("/parse/v1", apiUrl);
      const response = yield wrappedFetch(url, fetchOptions);

      switch (parsed.format) {
        case "json":
          return transformToCamel(yield response.json());
        case "markdown":
        case "html":
          return yield response.text();
        default:
          throw new Error(`Unsupported format: ${parsed.format}`);
      }
    });
  }
};

export { ANYPARSER_VERSION, Anyparser, OCR_LANGUAGES, OCR_PRESETS, version };
//# sourceMappingURL=index.js.map