@anyparser/core
Version:
The `@anyparser/core` TypeScript SDK enables developers to quickly extract structured data from a wide variety of file formats such as PDFs, images, websites, audio, and videos.
584 lines (568 loc) • 16.2 kB
JavaScript
// Cached references to the Object built-ins used by the bundler helpers below.
var __defProp = Object.defineProperty;
var __getOwnPropSymbols = Object.getOwnPropertySymbols;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __propIsEnum = Object.prototype.propertyIsEnumerable;
// Returns the well-known symbol `Symbol[name]` when the runtime defines it,
// otherwise the registry symbol `Symbol.for("Symbol." + name)`.
var __knownSymbol = (name, symbol) => {
  symbol = Symbol[name];
  return symbol ? symbol : Symbol.for("Symbol." + name);
};
// Sets `obj[key] = value`. When `key` is already reachable on `obj` (own or
// inherited) the property is (re)defined as a plain enumerable, configurable,
// writable data property instead of going through a normal assignment.
var __defNormalProp = (obj, key, value) => {
  if (key in obj) {
    return __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value });
  }
  return obj[key] = value;
};
// Bundler helper behind object spread: copies the own enumerable string-keyed
// and symbol-keyed properties of `b` onto `a` and returns `a`.
// A falsy `b` is treated as an empty object.
var __spreadValues = (a, b) => {
  b || (b = {});
  for (var key in b) {
    // `for...in` also walks inherited keys; keep own ones only.
    if (__hasOwnProp.call(b, key)) {
      __defNormalProp(a, key, b[key]);
    }
  }
  if (__getOwnPropSymbols) {
    for (var sym of __getOwnPropSymbols(b)) {
      if (__propIsEnum.call(b, sym)) {
        __defNormalProp(a, sym, b[sym]);
      }
    }
  }
  return a;
};
// Bundler helper that runs a generator function as if it were an async
// function: every yielded value is awaited, its result (or rejection) is fed
// back into the generator, and the returned promise settles with the
// generator's return value or with the first error that escapes it.
var __async = (__this, __arguments, generator) => {
  return new Promise((resolve, reject) => {
    // Settle on completion, otherwise wait for the yielded value.
    var advance = (result) => {
      if (result.done) {
        return resolve(result.value);
      }
      return Promise.resolve(result.value).then(onFulfilled, onRejected);
    };
    // Resume the generator with the awaited value.
    var onFulfilled = (value) => {
      try {
        advance(generator.next(value));
      } catch (e) {
        reject(e);
      }
    };
    // Re-throw the rejection inside the generator so its try/catch can see it.
    var onRejected = (reason) => {
      try {
        advance(generator.throw(reason));
      } catch (e) {
        reject(e);
      }
    };
    generator = generator.apply(__this, __arguments);
    advance(generator.next());
  });
};
// Bundler helper backing lowered `for await` loops.
// - If `obj` exposes a Symbol.asyncIterator method, that method is called and
//   its result is returned as-is.
// - Otherwise the sync iterator from `obj[Symbol.iterator]()` is wrapped: for
//   each of "next" and "return" that the sync iterator actually provides, the
//   wrapper gets a method that returns a Promise resolving to `{ value, done }`,
//   where `value` is the awaited result value and `done` is copied from the
//   sync result. Rejections of the awaited value reject the returned Promise.
// `it` and `method` are scratch slots; callers pass only `obj`.
var __forAwait = (obj, it, method) => (it = obj[__knownSymbol("asyncIterator")]) ? it.call(obj) : (obj = obj[__knownSymbol("iterator")](), it = {}, method = (key, fn) => (fn = obj[key]) && (it[key] = (arg) => new Promise((yes, no, done) => (arg = fn.call(obj, arg), done = arg.done, Promise.resolve(arg.value).then((value) => yes({ value, done }), no)))), method("next"), method("return"), it);
// src/version.ts
// Version string of this package; Anyparser.parse() embeds it in the
// User-Agent request header.
var ANYPARSER_VERSION = "1.0.1";
// Alias of ANYPARSER_VERSION, exported under the shorter name `version`.
var version = ANYPARSER_VERSION;
// src/config/hardcoded.ts
// Base URL used by getApiUrl() when ANYPARSER_API_URL is unset, blank, or
// cannot be parsed as a URL.
var FALLBACK_API_URL = "https://anyparserapi.com";
// Frozen lookup of the OCR preset identifiers. validateOption() rejects any
// `ocrPreset` whose value is not one of the values listed here.
var OCR_PRESETS = Object.freeze({
DOCUMENT: "document",
HANDWRITING: "handwriting",
SCAN: "scan",
RECEIPT: "receipt",
MAGAZINE: "magazine",
INVOICE: "invoice",
BUSINESS_CARD: "business-card",
PASSPORT: "passport",
DRIVER_LICENSE: "driver-license"
});
// Frozen lookup from a readable language name to the OCR language code sent
// to the API. validateOption() accepts an `ocrLanguage` entry only if it is
// one of the values listed here.
//
// A few legacy keys name the wrong language for their code (the codes follow
// the Tesseract traineddata naming). The legacy keys are kept unchanged for
// backward compatibility, and a correctly named alias with the same value is
// provided next to each of them.
var OCR_LANGUAGES = Object.freeze({
  AFRIKAANS: "afr",
  AMHARIC: "amh",
  ARABIC: "ara",
  ASSAMESE: "asm",
  AZERBAIJANI: "aze",
  AZERBAIJANI_CYRILLIC: "aze_cyrl",
  BELARUSIAN: "bel",
  BENGALI: "ben",
  TIBETAN: "bod",
  BOSNIAN: "bos",
  BRETON: "bre",
  BULGARIAN: "bul",
  CATALAN: "cat",
  CEBUANO: "ceb",
  CZECH: "ces",
  SIMPLIFIED_CHINESE: "chi_sim",
  SIMPLIFIED_CHINESE_VERTICAL: "chi_sim_vert",
  TRADITIONAL_CHINESE: "chi_tra",
  TRADITIONAL_CHINESE_VERTICAL: "chi_tra_vert",
  CHEROKEE: "chr",
  CORSICAN: "cos",
  WELSH: "cym",
  DANISH: "dan",
  DANISH_FRAKTUR: "dan_frak",
  GERMAN: "deu",
  GERMAN_FRAKTUR: "deu_frak",
  GERMAN_LATIN: "deu_latf",
  // Legacy misspelling; `div` is Dhivehi. Prefer DHIVEHI.
  DIVESH: "div",
  DHIVEHI: "div",
  DZONGKHA: "dzo",
  GREEK: "ell",
  ENGLISH: "eng",
  MIDDLE_ENGLISH: "enm",
  ESPERANTO: "epo",
  // Legacy misnomer; `equ` is the math/equation detection module, not a
  // language. Prefer MATH_EQUATION.
  EQUATORIAL_GUINEAN: "equ",
  MATH_EQUATION: "equ",
  ESTONIAN: "est",
  BASQUE: "eus",
  FAROESE: "fao",
  PERSIAN: "fas",
  FILIPINO: "fil",
  FINNISH: "fin",
  FRENCH: "fra",
  OLD_FRENCH: "frm",
  FRISIAN: "fry",
  SCOTTISH_GAELIC: "gla",
  IRISH: "gle",
  GALICIAN: "glg",
  ANCIENT_GREEK: "grc",
  GUJARATI: "guj",
  HAITIAN_CREOLE: "hat",
  HEBREW: "heb",
  HINDI: "hin",
  CROATIAN: "hrv",
  HUNGARIAN: "hun",
  ARMENIAN: "hye",
  // Legacy misnomer; `iku` is Inuktitut (Igbo would be `ibo`). Prefer INUKTITUT.
  IGBO: "iku",
  INUKTITUT: "iku",
  INDONESIAN: "ind",
  ICELANDIC: "isl",
  ITALIAN: "ita",
  OLD_ITALIAN: "ita_old",
  JAVANESE: "jav",
  JAPANESE: "jpn",
  JAPANESE_VERTICAL: "jpn_vert",
  KANNADA: "kan",
  GEORGIAN: "kat",
  OLD_GEORGIAN: "kat_old",
  KAZAKH: "kaz",
  KHMER: "khm",
  KIRGHIZ: "kir",
  KURDISH: "kmr",
  KOREAN: "kor",
  KOREAN_VERTICAL: "kor_vert",
  LAO: "lao",
  LATIN: "lat",
  LATVIAN: "lav",
  LITHUANIAN: "lit",
  LUXEMBOURGISH: "ltz",
  MALAYALAM: "mal",
  MARATHI: "mar",
  MACEDONIAN: "mkd",
  MALTESE: "mlt",
  MONGOLIAN: "mon",
  MAORI: "mri",
  MALAY: "msa",
  MYANMAR: "mya",
  NEPALI: "nep",
  DUTCH: "nld",
  NORWEGIAN: "nor",
  OCCITAN: "oci",
  // Legacy name (Odisha is the region); `ori` is the Odia/Oriya language.
  // Prefer ODIA.
  ODISHA: "ori",
  ODIA: "ori",
  OSD: "osd",
  PUNJABI: "pan",
  POLISH: "pol",
  PORTUGUESE: "por",
  PASHTO: "pus",
  QUECHUA: "que",
  ROMANIAN: "ron",
  RUSSIAN: "rus",
  SANSKRIT: "san",
  SINHALA: "sin",
  SLOVAK: "slk",
  SLOVAK_FRAKTUR: "slk_frak",
  SLOVENIAN: "slv",
  SINDHI: "snd",
  SPANISH: "spa",
  OLD_SPANISH: "spa_old",
  ALBANIAN: "sqi",
  SERBIAN: "srp",
  SERBIAN_LATIN: "srp_latn",
  // Legacy misspelling; `sun` is Sundanese. Prefer SUNDANESE.
  SUNDIANESE: "sun",
  SUNDANESE: "sun",
  SWAHILI: "swa",
  SWEDISH: "swe",
  SYRIAC: "syr",
  TAMIL: "tam",
  TATAR: "tat",
  TELUGU: "tel",
  TAJIK: "tgk",
  TAGALOG: "tgl",
  THAI: "tha",
  TIGRINYA: "tir",
  TONGAN: "ton",
  TURKISH: "tur",
  UIGHUR: "uig",
  UKRAINIAN: "ukr",
  URDU: "urd",
  UZBEK: "uzb",
  UZBEK_CYRILLIC: "uzb_cyrl",
  VIETNAMESE: "vie",
  YIDDISH: "yid",
  YORUBA: "yor"
});
// src/utils/fetcher.ts
/**
 * Error raised by wrappedFetch() for non-OK HTTP responses.
 * Carries the underlying cause and the HTTP status code as own properties.
 */
var WrappedError = class extends Error {
  /**
   * @param message - Human-readable description of the failure
   * @param cause - Underlying error (wrappedFetch passes the response body wrapped in an Error)
   * @param statusCode - HTTP status code of the failed response
   */
  constructor(message, cause, statusCode) {
    super(message);
    Object.assign(this, { name: "WrappedError", cause, statusCode });
  }
};
/**
 * Calls the global `fetch` and turns non-OK responses into exceptions.
 * @param input - Request target, forwarded to fetch and interpolated into the error message
 * @param options - Request init object, forwarded to fetch
 * @returns The Response when `response.ok` is true
 * @throws WrappedError carrying the status code, with the response body text
 *   wrapped in an Error as its cause, when `response.ok` is false
 */
var wrappedFetch = (input, options) => __async(void 0, null, function* () {
  const response = yield fetch(input, options);
  if (response.ok) {
    return response;
  }
  const { status: code, statusText: reason } = response;
  const body = yield response.text();
  const cause = new Error(body);
  throw new WrappedError(`HTTP ${code} ${reason}: ${input}`, cause, code);
});
// src/form.ts
/**
 * Builds the multipart form body for a parse request.
 *
 * `format` and `model` are always sent. The remaining fields depend on the model:
 * - "crawler": `url` (empty string when missing) plus the crawl tuning fields; no files.
 * - "ocr": `ocrLanguage` (comma-joined, only when non-empty) and `ocrPreset`, then files.
 * - anything else: `image` and `table` (when defined), then files.
 *
 * @param parsed - Validated options, as produced by validateAndParse()
 * @returns The populated FormData
 */
function buildForm(parsed) {
  const form = new FormData();
  const { model } = parsed;
  // Sends `String(parsed[key])` unless the option is undefined.
  const appendDefined = (key) => {
    if (parsed[key] !== void 0) {
      form.append(key, String(parsed[key]));
    }
  };
  // Sends `parsed[key]` as-is unless it is falsy.
  const appendTruthy = (key) => {
    if (parsed[key]) {
      form.append(key, parsed[key]);
    }
  };

  form.append("format", parsed.format);
  form.append("model", parsed.model);

  if (model === "crawler") {
    form.append("url", parsed.url != null ? parsed.url : "");
    appendDefined("maxDepth");
    appendDefined("maxExecutions");
    appendTruthy("strategy");
    appendTruthy("traversalScope");
    return form;
  }

  if (model === "ocr") {
    const languages = parsed.ocrLanguage;
    if (languages != null && languages.length) {
      form.append("ocrLanguage", languages.join(","));
    }
    appendTruthy("ocrPreset");
  } else {
    appendDefined("image");
    appendDefined("table");
  }

  if (parsed.files) {
    for (const { contents, fileName } of parsed.files) {
      form.append("files", contents, fileName);
    }
  }
  return form;
}
// src/validator/index.ts
import * as fsapi from "node:fs";
import { basename } from "node:path";
import * as fs from "node:fs/promises";
// src/utils/nullable.ts
/**
 * Tells whether a value should be treated as "missing".
 * @param suspect - Any value
 * @returns true for undefined, null, and strings that are empty or only
 *   whitespace; false for everything else
 */
var isNullOrUndefined = (suspect) => {
  if (suspect === void 0 || suspect === null) {
    return true;
  }
  return typeof suspect === "string" && suspect.trim() === "";
};
/**
 * Tells whether a value is a non-null value whose `typeof` is "object"
 * (this includes arrays and class instances, and excludes functions).
 * @param suspect - Any value
 * @returns true when `suspect` is an object and not null
 */
var isValidObject = (suspect) => suspect !== null && typeof suspect === "object";
// src/utils/env.ts
/**
 * Reads an environment variable with a fallback.
 * @param key - Name of the variable in `process.env`
 * @param fallback - Value used when the variable is missing or blank (default "")
 * @returns The variable's value, else the fallback, else "" when both are
 *   missing or blank according to isNullOrUndefined()
 */
var env = (key, fallback = "") => {
  // First usable candidate wins: the environment, then the fallback.
  for (const candidate of [process.env[key], fallback]) {
    if (!isNullOrUndefined(candidate)) {
      return candidate;
    }
  }
  return "";
};
// src/options.default.ts
/**
 * Resolves the API base URL from ANYPARSER_API_URL, falling back to
 * FALLBACK_API_URL when the variable is unset or blank.
 * If the configured value cannot be parsed as a URL, the problem is logged
 * with console.error / console.debug and the fallback URL is used instead.
 * @returns A URL object for the API base
 */
var getApiUrl = () => {
  const configured = env("ANYPARSER_API_URL", FALLBACK_API_URL);
  let apiUrl;
  try {
    apiUrl = new URL(configured);
  } catch (e) {
    console.error("Invalid API URL %s", configured);
    console.debug("Defaulting to %s", FALLBACK_API_URL);
    apiUrl = new URL(FALLBACK_API_URL);
  }
  return apiUrl;
};
// Baseline option values; buildOptions() spreads the caller's options over
// these. This literal is evaluated once, when the module is loaded, so
// `apiUrl` and `apiKey` hold whatever getApiUrl() and env() returned at that
// moment; later changes to the environment variables are not picked up here.
var defaultOptions = {
apiUrl: getApiUrl(),
// Empty string when ANYPARSER_API_KEY is unset or blank.
apiKey: env("ANYPARSER_API_KEY"),
format: "json",
model: "text",
image: true,
table: true
};
// src/options.ts
/**
 * Ensures an API key is present and usable.
 * @param apiKey - Candidate API key
 * @throws Error "API key is required" when the value is falsy
 * @throws Error "API key must be a non-empty string" when it is not a string
 *   or contains only whitespace
 */
function validateApiKey(apiKey) {
  if (!apiKey) {
    throw new Error("API key is required");
  }
  const isUsable = typeof apiKey === "string" && apiKey.trim().length !== 0;
  if (!isUsable) {
    throw new Error("API key must be a non-empty string");
  }
}
/**
 * Merges caller options over defaultOptions and returns the normalized
 * option object used by the rest of the pipeline.
 * @param options - Caller-supplied options (may be undefined)
 * @returns Options with `format`, `model` and `encoding` defaulted to
 *   "json", "text" and "utf-8" when falsy; all other fields are copied as-is
 * @throws Error when the merged API key is invalid (see validateApiKey) or
 *   the merged API URL is falsy
 */
function buildOptions(options) {
  const merged = __spreadValues(__spreadValues({}, defaultOptions), options);
  validateApiKey(merged.apiKey);
  if (!merged.apiUrl) {
    throw new Error("API URL is required");
  }
  const {
    apiUrl,
    apiKey,
    image,
    table,
    ocrLanguage,
    ocrPreset,
    url,
    maxDepth,
    maxExecutions,
    strategy,
    traversalScope
  } = merged;
  return {
    apiUrl,
    apiKey,
    format: merged.format || "json",
    model: merged.model || "text",
    encoding: merged.encoding || "utf-8",
    image,
    table,
    ocrLanguage,
    ocrPreset,
    url,
    maxDepth,
    maxExecutions,
    strategy,
    traversalScope
  };
}
// src/validator/option.ts
/**
 * Validates the normalized options before a request is built.
 * @param parsed - Options produced by buildOptions()
 * @throws Error "API URL is required" when `apiUrl` is missing or blank
 * @throws Error "Invalid OCR language" when any `ocrLanguage` entry is not a value of OCR_LANGUAGES
 * @throws Error "Invalid OCR preset" when `ocrPreset` is not a value of OCR_PRESETS
 */
var validateOption = (parsed) => {
  // True when `value` is one of the values of the given lookup table.
  const isListedIn = (table, value) => Object.values(table).includes(value);

  if (isNullOrUndefined(parsed.apiUrl)) {
    throw new Error("API URL is required");
  }

  if (!isNullOrUndefined(parsed.ocrLanguage)) {
    const assertKnownLanguage = (language) => {
      if (!isListedIn(OCR_LANGUAGES, language)) {
        throw new Error("Invalid OCR language");
      }
    };
    parsed.ocrLanguage.forEach(assertKnownLanguage);
  }

  const presetGiven = !isNullOrUndefined(parsed.ocrPreset);
  if (presetGiven && !isListedIn(OCR_PRESETS, parsed.ocrPreset)) {
    throw new Error("Invalid OCR preset");
  }
};
// src/validator/path.ts
import { access } from "node:fs/promises";
/**
 * Checks that every given path is accessible.
 * @param filePaths - A single path or an array of paths
 * @returns A promise for `{ valid: true, files }` (paths normalized to an
 *   array), or `{ valid: false, error }` when nothing was provided or when
 *   `access()` fails for a path (the error is the one raised by `access`)
 */
var validatePath = (filePaths) => __async(void 0, null, function* () {
  const nothingProvided = () => ({
    valid: false,
    error: new Error("No files provided")
  });
  if (!filePaths) {
    return nothingProvided();
  }
  const files = Array.isArray(filePaths) ? filePaths : [filePaths];
  if (files.length === 0) {
    return nothingProvided();
  }
  // Paths are probed one at a time; the first failure ends the check.
  for (let index = 0; index < files.length; index += 1) {
    try {
      yield access(files[index]);
    } catch (error) {
      return { valid: false, error };
    }
  }
  return { valid: true, files };
});
// src/validator/crawler.ts
/**
 * Picks the start URL for a crawl and normalizes it through `URL`.
 * @param filePaths - A URL string, or an array whose first non-blank entry is the URL
 * @returns The normalized URL string
 * @throws TypeError (from the URL constructor) when the chosen value is not a valid URL
 */
var getURLToCrawl = (filePaths) => {
  const firstUsable = Array.isArray(filePaths) ? filePaths.find((entry) => !isNullOrUndefined(entry)) : void 0;
  // Without a usable array entry the original argument itself is parsed.
  const target = isNullOrUndefined(firstUsable) ? filePaths : firstUsable;
  return new URL(target).toString();
};
// src/validator/index.ts
/**
 * Verifies that a file exists and can be opened for reading.
 * @param filePath - Path of the file to probe
 * @returns A promise that resolves with no value when both probes succeed
 * @throws Error "File ... was not found or was removed" when access() fails with ENOENT
 * @throws Error "File ... is locked by another process" when opening fails with EBUSY or ELOCK
 * @throws The original error for any other failure
 */
function checkFileAccess(filePath) {
  return __async(this, null, function* () {
    // Error code of a Node-style error, or undefined when there is none.
    const codeOf = (err) => err instanceof Error && "code" in err ? err.code : void 0;

    try {
      yield fs.access(filePath);
    } catch (error) {
      if (codeOf(error) === "ENOENT") {
        throw new Error(`File ${filePath} was not found or was removed`);
      }
      throw error;
    }

    try {
      // Open and immediately close: only the ability to open matters here.
      const handle = yield fs.open(filePath, "r");
      yield handle.close();
    } catch (error) {
      const code = codeOf(error);
      if (code === "EBUSY" || code === "ELOCK") {
        throw new Error(`File ${filePath} is locked by another process`);
      }
      throw error;
    }
  });
}
/**
 * Normalizes and validates the options, then resolves the request input:
 * the start URL for the "crawler" model, or the contents of every file for
 * all other models.
 *
 * @param filePaths - A file path, an array of file paths, or (crawler) a start URL
 * @param options - Caller-supplied options, merged with the defaults by buildOptions()
 * @returns A promise for the normalized options, extended with `url` (crawler)
 *   or `files` (`{ fileName, contents }` entries, `contents` being a File)
 * @throws Error when the options are invalid, the format is not one of
 *   json/markdown/html, a path fails validation, or a file cannot be read
 */
function validateAndParse(filePaths, options) {
  return __async(this, null, function* () {
    const parsed = buildOptions(options);
    validateOption(parsed);
    if (!["json", "markdown", "html"].includes(parsed.format)) {
      throw new Error(`Unsupported format: ${parsed.format}`);
    }
    // Decide on the normalized model, the same field buildForm() switches on,
    // so the input handling here and the form contents cannot disagree.
    const isCrawler = parsed.model === "crawler";
    const result = isCrawler ? { valid: true, files: [getURLToCrawl(filePaths)] } : yield validatePath(filePaths);
    if (result.valid === false) {
      throw result.error;
    }
    const parsedOption = {
      apiUrl: parsed.apiUrl,
      apiKey: parsed.apiKey,
      format: parsed.format,
      model: parsed.model,
      image: parsed.image,
      table: parsed.table,
      ocrLanguage: parsed.ocrLanguage,
      ocrPreset: parsed.ocrPreset,
      url: parsed.url,
      maxDepth: parsed.maxDepth,
      maxExecutions: parsed.maxExecutions,
      strategy: parsed.strategy,
      traversalScope: parsed.traversalScope,
      encoding: parsed.encoding
    };
    if (isCrawler) {
      parsedOption.url = result.files[0];
      return parsedOption;
    }
    const processed = [];
    for (const filePath of result.files) {
      yield checkFileAccess(filePath);
      // The whole file is needed in memory to build the upload, so read it in
      // one call instead of collecting stream chunks by hand.
      const buffer = yield fs.readFile(filePath);
      const fileName = basename(filePath);
      const contents = new File([buffer], fileName, {
        type: "application/octet-stream"
      });
      processed.push({ fileName, contents });
    }
    parsedOption.files = processed;
    return parsedOption;
  });
}
// src/utils/casing.ts
var underscoreToCamel = (x) => x.replace(/_+(.)/g, (_, c) => c.toUpperCase());
// src/utils/camel-case.ts
/**
 * Recursively rewrites the keys of plain objects from snake_case to camelCase.
 *
 * - Date, RegExp and URL instances, functions, null and undefined are returned unchanged.
 * - Arrays, Maps and Sets are rebuilt with every element (and Map key) transformed.
 * - Any other object is rebuilt from its own enumerable string keys, with
 *   each key converted by underscoreToCamel() and each value transformed.
 * - Primitives are returned unchanged.
 *
 * @param item - Value to transform
 * @returns The transformed copy, or the value itself when nothing applies
 */
var transformToCamel = (item) => {
  const isPassThrough = item instanceof Date || item instanceof RegExp || item instanceof URL || typeof item === "function" || item === null || item === void 0;
  if (isPassThrough) {
    return item;
  }
  if (Array.isArray(item)) {
    return item.map((el) => transformToCamel(el));
  }
  if (item instanceof Map) {
    const converted = /* @__PURE__ */ new Map();
    for (const [key, value] of item) {
      converted.set(transformToCamel(key), transformToCamel(value));
    }
    return converted;
  }
  if (item instanceof Set) {
    const converted = /* @__PURE__ */ new Set();
    for (const value of item) {
      converted.add(transformToCamel(value));
    }
    return converted;
  }
  if (isValidObject(item)) {
    const converted = {};
    for (const key of Object.keys(item)) {
      converted[underscoreToCamel(key)] = transformToCamel(item[key]);
    }
    return converted;
  }
  return item;
};
// src/parser.ts
/**
 * Client for the Anyparser API.
 */
var Anyparser = class {
  /**
   * Create a parser.
   * @param options - Optional configuration; stored as-is and validated on each parse() call
   */
  constructor(options) {
    this.options = options;
  }
  /**
   * Send files (or a crawl start URL) to the Anyparser API and return the result.
   * The request is a POST to `/parse/v1`, resolved against the configured API URL.
   * @param filePathsOrUrl - A single file path, a list of file paths, or a start URL for crawling
   * @returns The response JSON with keys converted to camelCase when the format is
   *   "json", or the raw response text when the format is "markdown" or "html"
   * @throws Error if validation fails, the API request fails, or the format is unsupported
   */
  parse(filePathsOrUrl) {
    return __async(this, null, function* () {
      const parsed = yield validateAndParse(filePathsOrUrl, this.options);
      const body = buildForm(parsed);
      // Credentials and client identification are only attached when a key is present.
      const headers = parsed.apiKey ? {
        Authorization: `Bearer ${parsed.apiKey}`,
        "User-Agent": `@anyparser/core@${ANYPARSER_VERSION}`
      } : {};
      const endpoint = new URL("/parse/v1", parsed.apiUrl);
      const response = yield wrappedFetch(endpoint, { method: "POST", body, headers });
      const { format } = parsed;
      if (format === "json") {
        return transformToCamel(yield response.json());
      }
      if (format === "markdown" || format === "html") {
        return yield response.text();
      }
      throw new Error(`Unsupported format: ${format}`);
    });
  }
};
export {
ANYPARSER_VERSION,
Anyparser,
OCR_LANGUAGES,
OCR_PRESETS,
version
};
//# sourceMappingURL=index.js.map