@anyparser/core
Version:
The `@anyparser/core` TypeScript SDK enables developers to quickly extract structured data from a wide variety of sources, such as PDFs, images, websites, audio, and videos.
270 lines (253 loc) • 8 kB
TypeScript
/** Version constant, typed as the string literal `"1.0.1"`. */
declare const ANYPARSER_VERSION = "1.0.1";
/** Second version constant carrying the same literal type as `ANYPARSER_VERSION`. */
declare const version = "1.0.1";
/**
* Read-only lookup table mapping preset names to their string identifiers.
* The union of the values below is exposed as `OcrPresetType`, which is the
* type of `AnyparserOption.ocrPreset`.
*/
declare const OCR_PRESETS: Readonly<{
readonly DOCUMENT: "document";
readonly HANDWRITING: "handwriting";
readonly SCAN: "scan";
readonly RECEIPT: "receipt";
readonly MAGAZINE: "magazine";
readonly INVOICE: "invoice";
readonly BUSINESS_CARD: "business-card";
readonly PASSPORT: "passport";
readonly DRIVER_LICENSE: "driver-license";
}>;
/**
* Read-only lookup table mapping language names to OCR language codes.
* The union of the values below is exposed as `OcrLanguageType`, which is the
* element type of `AnyparserOption.ocrLanguage`.
*
* NOTE(review): the codes look like Tesseract traineddata names (three-letter
* codes plus suffixes such as `_vert`, `_frak`, `_old`, and the `osd` entry) —
* confirm against the backend. This is only a declaration; the keys cannot be
* renamed or extended here without changing the runtime object as well.
*/
declare const OCR_LANGUAGES: Readonly<{
readonly AFRIKAANS: "afr";
readonly AMHARIC: "amh";
readonly ARABIC: "ara";
readonly ASSAMESE: "asm";
readonly AZERBAIJANI: "aze";
readonly AZERBAIJANI_CYRILLIC: "aze_cyrl";
readonly BELARUSIAN: "bel";
readonly BENGALI: "ben";
readonly TIBETAN: "bod";
readonly BOSNIAN: "bos";
readonly BRETON: "bre";
readonly BULGARIAN: "bul";
readonly CATALAN: "cat";
readonly CEBUANO: "ceb";
readonly CZECH: "ces";
readonly SIMPLIFIED_CHINESE: "chi_sim";
readonly SIMPLIFIED_CHINESE_VERTICAL: "chi_sim_vert";
readonly TRADITIONAL_CHINESE: "chi_tra";
readonly TRADITIONAL_CHINESE_VERTICAL: "chi_tra_vert";
readonly CHEROKEE: "chr";
readonly CORSICAN: "cos";
readonly WELSH: "cym";
readonly DANISH: "dan";
readonly DANISH_FRAKTUR: "dan_frak";
readonly GERMAN: "deu";
readonly GERMAN_FRAKTUR: "deu_frak";
readonly GERMAN_LATIN: "deu_latf";
// NOTE(review): `div` is the ISO 639 code for Dhivehi (Maldivian); the key
// name looks like a misspelling of that language name.
readonly DIVESH: "div";
readonly DZONGKHA: "dzo";
readonly GREEK: "ell";
readonly ENGLISH: "eng";
readonly MIDDLE_ENGLISH: "enm";
readonly ESPERANTO: "epo";
// NOTE(review): in Tesseract, `equ` is the math/equation detection module,
// not a language — the key name is likely misleading; confirm.
readonly EQUATORIAL_GUINEAN: "equ";
readonly ESTONIAN: "est";
readonly BASQUE: "eus";
readonly FAROESE: "fao";
readonly PERSIAN: "fas";
readonly FILIPINO: "fil";
readonly FINNISH: "fin";
readonly FRENCH: "fra";
// NOTE(review): ISO 639 `frm` denotes Middle French; Old French is `fro`.
readonly OLD_FRENCH: "frm";
readonly FRISIAN: "fry";
readonly SCOTTISH_GAELIC: "gla";
readonly IRISH: "gle";
readonly GALICIAN: "glg";
readonly ANCIENT_GREEK: "grc";
readonly GUJARATI: "guj";
readonly HAITIAN_CREOLE: "hat";
readonly HEBREW: "heb";
readonly HINDI: "hin";
readonly CROATIAN: "hrv";
readonly HUNGARIAN: "hun";
readonly ARMENIAN: "hye";
// NOTE(review): ISO 639 `iku` denotes Inuktitut; Igbo is `ibo`. Selecting
// IGBO therefore presumably requests Inuktitut OCR — confirm and fix upstream.
readonly IGBO: "iku";
readonly INDONESIAN: "ind";
readonly ICELANDIC: "isl";
readonly ITALIAN: "ita";
readonly OLD_ITALIAN: "ita_old";
readonly JAVANESE: "jav";
readonly JAPANESE: "jpn";
readonly JAPANESE_VERTICAL: "jpn_vert";
readonly KANNADA: "kan";
readonly GEORGIAN: "kat";
readonly OLD_GEORGIAN: "kat_old";
readonly KAZAKH: "kaz";
readonly KHMER: "khm";
readonly KIRGHIZ: "kir";
readonly KURDISH: "kmr";
readonly KOREAN: "kor";
readonly KOREAN_VERTICAL: "kor_vert";
readonly LAO: "lao";
readonly LATIN: "lat";
readonly LATVIAN: "lav";
readonly LITHUANIAN: "lit";
readonly LUXEMBOURGISH: "ltz";
readonly MALAYALAM: "mal";
readonly MARATHI: "mar";
readonly MACEDONIAN: "mkd";
readonly MALTESE: "mlt";
readonly MONGOLIAN: "mon";
readonly MAORI: "mri";
readonly MALAY: "msa";
readonly MYANMAR: "mya";
readonly NEPALI: "nep";
readonly DUTCH: "nld";
readonly NORWEGIAN: "nor";
readonly OCCITAN: "oci";
// NOTE(review): `ori` is the ISO 639 code for the Oriya/Odia language;
// "Odisha" is the name of the Indian state, not the language.
readonly ODISHA: "ori";
// NOTE(review): presumably Tesseract's orientation-and-script-detection
// data rather than a language — confirm.
readonly OSD: "osd";
readonly PUNJABI: "pan";
readonly POLISH: "pol";
readonly PORTUGUESE: "por";
readonly PASHTO: "pus";
readonly QUECHUA: "que";
readonly ROMANIAN: "ron";
readonly RUSSIAN: "rus";
readonly SANSKRIT: "san";
readonly SINHALA: "sin";
readonly SLOVAK: "slk";
readonly SLOVAK_FRAKTUR: "slk_frak";
readonly SLOVENIAN: "slv";
readonly SINDHI: "snd";
readonly SPANISH: "spa";
readonly OLD_SPANISH: "spa_old";
readonly ALBANIAN: "sqi";
readonly SERBIAN: "srp";
readonly SERBIAN_LATIN: "srp_latn";
// NOTE(review): `sun` is the ISO 639 code for Sundanese; the key name looks
// like a misspelling.
readonly SUNDIANESE: "sun";
readonly SWAHILI: "swa";
readonly SWEDISH: "swe";
readonly SYRIAC: "syr";
readonly TAMIL: "tam";
readonly TATAR: "tat";
readonly TELUGU: "tel";
readonly TAJIK: "tgk";
readonly TAGALOG: "tgl";
readonly THAI: "tha";
readonly TIGRINYA: "tir";
readonly TONGAN: "ton";
readonly TURKISH: "tur";
readonly UIGHUR: "uig";
readonly UKRAINIAN: "ukr";
readonly URDU: "urd";
readonly UZBEK: "uzb";
readonly UZBEK_CYRILLIC: "uzb_cyrl";
readonly VIETNAMESE: "vie";
readonly YIDDISH: "yid";
readonly YORUBA: "yor";
}>;
/** Union of every value in `OCR_PRESETS` (e.g. `"document"`, `"business-card"`). */
type OcrPresetType = (typeof OCR_PRESETS)[keyof typeof OCR_PRESETS];
/** Union of every value in `OCR_LANGUAGES` (e.g. `"eng"`, `"chi_sim_vert"`). */
type OcrLanguageType = (typeof OCR_LANGUAGES)[keyof typeof OCR_LANGUAGES];
/**
* Main class for parsing items using the Anyparser API.
*/
declare class Anyparser {
/** Options held on the instance; optional, mirroring the optional constructor argument. */
options?: AnyparserOption;
/**
* Initialize the parser with optional configuration.
* @param options - Configuration options for the parser
*/
constructor(options?: AnyparserOption);
/**
* Parse files using the Anyparser API.
* @param filePathsOrUrl - A single file path or list of file paths to parse, or a start URL for crawling
* @returns A `Result`: either an array of `AnyparserResult` objects or a string.
* NOTE(review): the previous wording said the string form is returned for
* "text/markdown", but `AnyparserFormatType` only allows 'json' | 'markdown' | 'html';
* presumably the array is returned for 'json' and the raw string for the other
* formats — confirm against the implementation.
* @throws Error if the API request fails
*/
parse(filePathsOrUrl: string | string[]): Promise<Result>;
}
/** Allowed values for `AnyparserOption.format`. */
type AnyparserFormatType = 'json' | 'markdown' | 'html'
/** Allowed values for `AnyparserOption.model`. */
type AnyparserModelType = 'text' | 'ocr' | 'vlm' | 'lam' | 'crawler'
/** Allowed values for `AnyparserOption.encoding`. */
type AnyparserEncodingType = 'utf-8' | 'latin1'
/**
* Configuration accepted by the `Anyparser` constructor. Every field is optional.
* Defaults are not visible in this declaration file.
*/
interface AnyparserOption {
/** API endpoint, typed as a `URL` object (not a string). */
apiUrl?: URL
/** API key string. */
apiKey?: string
/** Requested output format: 'json', 'markdown' or 'html'. */
format?: AnyparserFormatType
/** Processing model: 'text', 'ocr', 'vlm', 'lam' or 'crawler'. */
model?: AnyparserModelType
/** Text encoding: 'utf-8' or 'latin1'. */
encoding?: AnyparserEncodingType
/** Boolean flag; presumably toggles image extraction — TODO confirm. */
image?: boolean
/** Boolean flag; presumably toggles table extraction — TODO confirm. */
table?: boolean
/**
* One path or a list of paths.
* NOTE(review): `Anyparser.parse` also takes paths as its argument; how the
* two interact is not visible here.
*/
files?: string | string[]
/** List of OCR language codes drawn from `OCR_LANGUAGES`. */
ocrLanguage?: OcrLanguageType[]
/** Single OCR preset drawn from `OCR_PRESETS`. */
ocrPreset?: OcrPresetType
/** URL given as a plain string (unlike `apiUrl`, which is a `URL` object). */
url?: string
/** Numeric limit; presumably the maximum crawl depth — TODO confirm. */
maxDepth?: number
/** Numeric limit; presumably caps the number of crawl executions — TODO confirm. */
maxExecutions?: number
/** Either 'LIFO' or 'FIFO'; presumably the crawl queue ordering — TODO confirm. */
strategy?: 'LIFO' | 'FIFO'
/** Either 'subtree' or 'domain'; presumably bounds which links are followed — TODO confirm. */
traversalScope?: 'subtree' | 'domain'
}
// ---- Parser
/**
* Describes one image attached to a PDF page (`AnyparserPdfPage.images`) or a
* crawled URL (`AnyparserUrl.images`).
*/
interface AnyparserImageReference {
/** Image payload as a string; the name indicates base64 encoding. */
base64Data: string
/** Display name of the image. */
displayName: string
/** Optional page number; indexing base (0 or 1) is not visible here. */
page?: number
/** Index of the image; indexing base (0 or 1) is not visible here. */
imageIndex: number
}
/**
* Fields common to file-based results. Extended by `AnyparserPdfResult` and
* also a member of the `AnyparserResult` union in its own right.
*/
interface AnyparserResultBase {
/** Result identifier string; presumably a request/result id — TODO confirm. */
rid: string
/** Name of the source file as a string. */
originalFilename: string
/** Checksum string; the algorithm is not visible here. */
checksum: string
/** Optional character count. */
totalCharacters?: number
/** Optional Markdown content. */
markdown?: string
}
/**
* Base shape of a crawl directive. `AnyparserCrawlDirective` narrows it to the
* 'Combined' case and holds a list of these base directives in `underlying`.
*/
interface AnyparserCrawlDirectiveBase {
/** Directive kind: 'HTTP Header', 'HTML Meta' or 'Combined'. */
type: 'HTTP Header' | 'HTML Meta' | 'Combined'
/** Numeric priority; ordering semantics are not visible here. */
priority: number
/** Optional directive name. */
name?: string
/** Optional noindex flag. */
noindex?: boolean
/** Optional nofollow flag. */
nofollow?: boolean
/** Optional crawl delay; the unit is not visible here. */
crawlDelay?: number
/**
* Optional `Date`.
* NOTE(review): a plain JSON response cannot carry a `Date`; confirm the SDK
* converts this value, otherwise it is a string at runtime.
*/
unavailableAfter?: Date
}
/**
* Specialisation of `AnyparserCrawlDirectiveBase` whose `type` is fixed to
* 'Combined' and which carries the list of directives in `underlying`.
*/
interface AnyparserCrawlDirective extends AnyparserCrawlDirectiveBase {
/** Always 'Combined' for this interface. */
type: 'Combined'
/**
* Declared as a required property of type `undefined` (no `?`), so object
* literals of this type must spell out `name: undefined`.
* NOTE(review): `name?: undefined` may have been intended — confirm.
*/
name: undefined
/** The base directives held by this combined directive. */
underlying: AnyparserCrawlDirectiveBase[]
}
/** One entry of `AnyparserCrawlResult.items`. */
interface AnyparserUrl {
/**
* Typed as a `URL` object.
* NOTE(review): a plain JSON response would deliver a string — confirm the
* SDK converts it.
*/
url: URL
/** Optional title string. */
title?: string
/** Optional timestamp, typed as a string (unlike `unavailableAfter`, which is a `Date`). */
crawledAt?: string
/** Numeric status code; presumably the HTTP status — TODO confirm. */
statusCode: number
/** Status message string. */
statusMessage: string
/** Combined crawl directive for this entry. */
directive: AnyparserCrawlDirective
/** Optional character count. */
totalCharacters?: number
/** Optional Markdown content. */
markdown?: string
/** Optional list of image references. */
images?: AnyparserImageReference[]
/** Optional text content. */
text?: string
/** Numeric delay; the unit is not visible here. */
politenessDelay: number
}
/**
* Type of `AnyparserCrawlResult.robotsDirective`.
* NOTE(review): `disallow` and `allow` are typed as `Set<string>`; a plain JSON
* response cannot carry a `Set`, so confirm the SDK builds them.
*/
interface AnyparserRobotsTxtDirective {
/** User-agent string. */
userAgent: string
/** Set of disallow entries. */
disallow: Set<string>
/** Set of allow entries. */
allow: Set<string>
/** Optional crawl delay; the unit is not visible here. */
crawlDelay?: number
}
/** One entry of `AnyparserPdfResult.items`. */
interface AnyparserPdfPage {
/** Page number; indexing base (0 or 1) is not visible here. */
pageNumber: number
/** Optional Markdown content. */
markdown?: string
/** Optional text content. */
text?: string
/** Optional list of image references. */
images?: AnyparserImageReference[]
}
/** `AnyparserResultBase` extended with an optional list of pages. */
interface AnyparserPdfResult extends AnyparserResultBase {
/** Optional item count; presumably the length of `items` — TODO confirm. */
totalItems?: number
/** Optional list of pages. */
items?: AnyparserPdfPage[]
}
/**
* Crawl result. Does not extend `AnyparserResultBase`: it has no
* `originalFilename` or `checksum`, and `totalCharacters` and `markdown` are
* required here rather than optional.
*/
interface AnyparserCrawlResult {
/** Result identifier string. */
rid: string
/**
* Typed as a `URL` object.
* NOTE(review): a plain JSON response would deliver a string — confirm the
* SDK converts it.
*/
startUrl: URL
/** Character count (required). */
totalCharacters: number
/** Item count (required). */
totalItems: number
/** Markdown content (required). */
markdown: string
/** Optional list of crawled URL entries. */
items?: AnyparserUrl[]
/** robots.txt directive associated with the crawl. */
robotsDirective: AnyparserRobotsTxtDirective
}
/**
* Any single result object. The union has no shared literal tag, so callers
* must tell the members apart by property presence (e.g. `startUrl` exists
* only on `AnyparserCrawlResult`).
*/
type AnyparserResult = AnyparserCrawlResult | AnyparserPdfResult | AnyparserResultBase
/** Resolved value of `Anyparser.parse`: a list of result objects or a string. */
type Result = AnyparserResult[] | string
// Public surface of the module: three value constants, the `Anyparser` class, and type-only exports.
export { ANYPARSER_VERSION, Anyparser, type AnyparserCrawlDirective, type AnyparserCrawlDirectiveBase, type AnyparserCrawlResult, type AnyparserImageReference, type AnyparserOption, type AnyparserPdfPage, type AnyparserPdfResult, type AnyparserResult, type AnyparserResultBase, type AnyparserRobotsTxtDirective, type AnyparserUrl, OCR_LANGUAGES, OCR_PRESETS, type OcrLanguageType, type OcrPresetType, type Result, version };