@anyparser/core

Version: 1.0.1

The `@anyparser/core` TypeScript SDK enables developers to quickly extract structured data from a wide variety of file formats like PDFs, images, websites, audio, and videos.

github.com/anyparser/anyparserjs

270 lines (253 loc) • 8 kB

TypeScript

/** Package version string. */
declare const ANYPARSER_VERSION = "1.0.1";
/** Same value as `ANYPARSER_VERSION`, exported under a second name. */
declare const version = "1.0.1";

/** Named OCR presets accepted by `AnyparserOption.ocrPreset`. */
declare const OCR_PRESETS: Readonly<{
  readonly DOCUMENT: "document";
  readonly HANDWRITING: "handwriting";
  readonly SCAN: "scan";
  readonly RECEIPT: "receipt";
  readonly MAGAZINE: "magazine";
  readonly INVOICE: "invoice";
  readonly BUSINESS_CARD: "business-card";
  readonly PASSPORT: "passport";
  readonly DRIVER_LICENSE: "driver-license";
}>;

/**
 * Language identifiers accepted by `AnyparserOption.ocrLanguage`.
 *
 * NOTE(review): a few key names look inconsistent with their codes
 * (e.g. `IGBO` -> "iku", `DIVESH`, `SUNDIANESE`, `ODISHA`). They are kept
 * unchanged here because they are part of the public API; confirm against
 * the upstream language list before renaming.
 */
declare const OCR_LANGUAGES: Readonly<{
  readonly AFRIKAANS: "afr";
  readonly AMHARIC: "amh";
  readonly ARABIC: "ara";
  readonly ASSAMESE: "asm";
  readonly AZERBAIJANI: "aze";
  readonly AZERBAIJANI_CYRILLIC: "aze_cyrl";
  readonly BELARUSIAN: "bel";
  readonly BENGALI: "ben";
  readonly TIBETAN: "bod";
  readonly BOSNIAN: "bos";
  readonly BRETON: "bre";
  readonly BULGARIAN: "bul";
  readonly CATALAN: "cat";
  readonly CEBUANO: "ceb";
  readonly CZECH: "ces";
  readonly SIMPLIFIED_CHINESE: "chi_sim";
  readonly SIMPLIFIED_CHINESE_VERTICAL: "chi_sim_vert";
  readonly TRADITIONAL_CHINESE: "chi_tra";
  readonly TRADITIONAL_CHINESE_VERTICAL: "chi_tra_vert";
  readonly CHEROKEE: "chr";
  readonly CORSICAN: "cos";
  readonly WELSH: "cym";
  readonly DANISH: "dan";
  readonly DANISH_FRAKTUR: "dan_frak";
  readonly GERMAN: "deu";
  readonly GERMAN_FRAKTUR: "deu_frak";
  readonly GERMAN_LATIN: "deu_latf";
  readonly DIVESH: "div";
  readonly DZONGKHA: "dzo";
  readonly GREEK: "ell";
  readonly ENGLISH: "eng";
  readonly MIDDLE_ENGLISH: "enm";
  readonly ESPERANTO: "epo";
  readonly EQUATORIAL_GUINEAN: "equ";
  readonly ESTONIAN: "est";
  readonly BASQUE: "eus";
  readonly FAROESE: "fao";
  readonly PERSIAN: "fas";
  readonly FILIPINO: "fil";
  readonly FINNISH: "fin";
  readonly FRENCH: "fra";
  readonly OLD_FRENCH: "frm";
  readonly FRISIAN: "fry";
  readonly SCOTTISH_GAELIC: "gla";
  readonly IRISH: "gle";
  readonly GALICIAN: "glg";
  readonly ANCIENT_GREEK: "grc";
  readonly GUJARATI: "guj";
  readonly HAITIAN_CREOLE: "hat";
  readonly HEBREW: "heb";
  readonly HINDI: "hin";
  readonly CROATIAN: "hrv";
  readonly HUNGARIAN: "hun";
  readonly ARMENIAN: "hye";
  readonly IGBO: "iku";
  readonly INDONESIAN: "ind";
  readonly ICELANDIC: "isl";
  readonly ITALIAN: "ita";
  readonly OLD_ITALIAN: "ita_old";
  readonly JAVANESE: "jav";
  readonly JAPANESE: "jpn";
  readonly JAPANESE_VERTICAL: "jpn_vert";
  readonly KANNADA: "kan";
  readonly GEORGIAN: "kat";
  readonly OLD_GEORGIAN: "kat_old";
  readonly KAZAKH: "kaz";
  readonly KHMER: "khm";
  readonly KIRGHIZ: "kir";
  readonly KURDISH: "kmr";
  readonly KOREAN: "kor";
  readonly KOREAN_VERTICAL: "kor_vert";
  readonly LAO: "lao";
  readonly LATIN: "lat";
  readonly LATVIAN: "lav";
  readonly LITHUANIAN: "lit";
  readonly LUXEMBOURGISH: "ltz";
  readonly MALAYALAM: "mal";
  readonly MARATHI: "mar";
  readonly MACEDONIAN: "mkd";
  readonly MALTESE: "mlt";
  readonly MONGOLIAN: "mon";
  readonly MAORI: "mri";
  readonly MALAY: "msa";
  readonly MYANMAR: "mya";
  readonly NEPALI: "nep";
  readonly DUTCH: "nld";
  readonly NORWEGIAN: "nor";
  readonly OCCITAN: "oci";
  readonly ODISHA: "ori";
  readonly OSD: "osd";
  readonly PUNJABI: "pan";
  readonly POLISH: "pol";
  readonly PORTUGUESE: "por";
  readonly PASHTO: "pus";
  readonly QUECHUA: "que";
  readonly ROMANIAN: "ron";
  readonly RUSSIAN: "rus";
  readonly SANSKRIT: "san";
  readonly SINHALA: "sin";
  readonly SLOVAK: "slk";
  readonly SLOVAK_FRAKTUR: "slk_frak";
  readonly SLOVENIAN: "slv";
  readonly SINDHI: "snd";
  readonly SPANISH: "spa";
  readonly OLD_SPANISH: "spa_old";
  readonly ALBANIAN: "sqi";
  readonly SERBIAN: "srp";
  readonly SERBIAN_LATIN: "srp_latn";
  readonly SUNDIANESE: "sun";
  readonly SWAHILI: "swa";
  readonly SWEDISH: "swe";
  readonly SYRIAC: "syr";
  readonly TAMIL: "tam";
  readonly TATAR: "tat";
  readonly TELUGU: "tel";
  readonly TAJIK: "tgk";
  readonly TAGALOG: "tgl";
  readonly THAI: "tha";
  readonly TIGRINYA: "tir";
  readonly TONGAN: "ton";
  readonly TURKISH: "tur";
  readonly UIGHUR: "uig";
  readonly UKRAINIAN: "ukr";
  readonly URDU: "urd";
  readonly UZBEK: "uzb";
  readonly UZBEK_CYRILLIC: "uzb_cyrl";
  readonly VIETNAMESE: "vie";
  readonly YIDDISH: "yid";
  readonly YORUBA: "yor";
}>;

/** Union of every value in `OCR_PRESETS`. */
type OcrPresetType = (typeof OCR_PRESETS)[keyof typeof OCR_PRESETS];
/** Union of every value in `OCR_LANGUAGES`. */
type OcrLanguageType = (typeof OCR_LANGUAGES)[keyof typeof OCR_LANGUAGES];

/**
 * Main class for parsing items using the Anyparser API.
 */
declare class Anyparser {
  /** Options supplied at construction time, if any. */
  options?: AnyparserOption;

  /**
   * Initialize the parser with optional configuration.
   * @param options - Configuration options for the parser
   */
  constructor(options?: AnyparserOption);

  /**
   * Parse files using the Anyparser API.
   * @param filePathsOrUrl - A single file path or list of file paths to parse, or a start URL for crawling
   * @returns List of parsed file results if format is JSON, or raw text content if format is text/markdown
   * @throws Error if the API request fails
   *
   * NOTE(review): `AnyparserFormatType` lists 'html' rather than 'text';
   * presumably every non-JSON format yields the string branch of `Result` — confirm.
   */
  parse(filePathsOrUrl: string | string[]): Promise<Result>;
}

/** Output formats selectable through `AnyparserOption.format`. */
type AnyparserFormatType = 'json' | 'markdown' | 'html';
/** Processing models selectable through `AnyparserOption.model`. */
type AnyparserModelType = 'text' | 'ocr' | 'vlm' | 'lam' | 'crawler';
/** Text encodings selectable through `AnyparserOption.encoding`. */
type AnyparserEncodingType = 'utf-8' | 'latin1';

/** Configuration accepted by the `Anyparser` constructor; every field is optional. */
interface AnyparserOption {
  apiUrl?: URL;
  apiKey?: string;
  format?: AnyparserFormatType;
  model?: AnyparserModelType;
  encoding?: AnyparserEncodingType;
  image?: boolean;
  table?: boolean;
  files?: string | string[];
  ocrLanguage?: OcrLanguageType[];
  ocrPreset?: OcrPresetType;
  url?: string;
  maxDepth?: number;
  maxExecutions?: number;
  strategy?: 'LIFO' | 'FIFO';
  traversalScope?: 'subtree' | 'domain';
}

// ---- Parser

/** An image attached to a parsed page or URL, carried as base64 data. */
interface AnyparserImageReference {
  base64Data: string;
  displayName: string;
  page?: number;
  imageIndex: number;
}

/** Fields shared by file-based parse results. */
interface AnyparserResultBase {
  rid: string;
  originalFilename: string;
  checksum: string;
  totalCharacters?: number;
  markdown?: string;
}

/** A single crawl directive together with the kind of source it came from (`type`). */
interface AnyparserCrawlDirectiveBase {
  type: 'HTTP Header' | 'HTML Meta' | 'Combined';
  priority: number;
  name?: string;
  noindex?: boolean;
  nofollow?: boolean;
  crawlDelay?: number;
  unavailableAfter?: Date;
}

/** A 'Combined' directive that also lists the directives it was built from. */
interface AnyparserCrawlDirective extends AnyparserCrawlDirectiveBase {
  type: 'Combined';
  name: undefined;
  underlying: AnyparserCrawlDirectiveBase[];
}

/** One URL entry within a crawl result. */
interface AnyparserUrl {
  url: URL;
  title?: string;
  crawledAt?: string;
  statusCode: number;
  statusMessage: string;
  directive: AnyparserCrawlDirective;
  totalCharacters?: number;
  markdown?: string;
  images?: AnyparserImageReference[];
  text?: string;
  politenessDelay: number;
}

/** Allow/disallow rules and optional crawl delay for one user agent. */
interface AnyparserRobotsTxtDirective {
  userAgent: string;
  disallow: Set<string>;
  allow: Set<string>;
  crawlDelay?: number;
}

/** Content of a single PDF page. */
interface AnyparserPdfPage {
  pageNumber: number;
  markdown?: string;
  text?: string;
  images?: AnyparserImageReference[];
}

/** File result that additionally carries per-page items. */
interface AnyparserPdfResult extends AnyparserResultBase {
  totalItems?: number;
  items?: AnyparserPdfPage[];
}

/** Result of a crawl that began at `startUrl`. */
interface AnyparserCrawlResult {
  rid: string;
  startUrl: URL;
  totalCharacters: number;
  totalItems: number;
  markdown: string;
  items?: AnyparserUrl[];
  robotsDirective: AnyparserRobotsTxtDirective;
}

/** Any single structured result. */
type AnyparserResult = AnyparserCrawlResult | AnyparserPdfResult | AnyparserResultBase;

/** Value resolved by `Anyparser.parse`: a list of structured results or a raw string. */
type Result = AnyparserResult[] | string;

export {
  ANYPARSER_VERSION,
  Anyparser,
  type AnyparserCrawlDirective,
  type AnyparserCrawlDirectiveBase,
  type AnyparserCrawlResult,
  type AnyparserImageReference,
  type AnyparserOption,
  type AnyparserPdfPage,
  type AnyparserPdfResult,
  type AnyparserResult,
  type AnyparserResultBase,
  type AnyparserRobotsTxtDirective,
  type AnyparserUrl,
  OCR_LANGUAGES,
  OCR_PRESETS,
  type OcrLanguageType,
  type OcrPresetType,
  type Result,
  version,
};