UNPKG

pdfvector

Version:

Official TypeScript/JavaScript SDK for the PDF Vector API. Parse PDF, Word, image, and Excel documents into clean, structured Markdown, and search academic publications across multiple databases.

732 lines 28 kB
import type {
    AcademicFetchResponse,
    AcademicSearchProvider,
    AcademicSearchPublicationField,
    AcademicSearchResponse,
    AskIdResponse,
    AskInvoiceResponse,
    AskResponse,
    ExtractIdResponse,
    ExtractInvoiceResponse,
    ExtractResponse,
    ListImagesResponse,
    ParseIdResponse,
    ParseInvoiceResponse,
    ParseResponse
} from "./types.js";

export {
    AcademicSearchProviderValues,
    AcademicSearchPublicationFieldValues,
} from "./types.js";

export type {
    AcademicFetchError,
    AcademicFetchResponse,
    AcademicFetchResult,
    AcademicSearchArxivData,
    AcademicSearchAuthor,
    AcademicSearchBasePublication,
    AcademicSearchEricData,
    AcademicSearchEuropePmcData,
    AcademicSearchGoogleScholarData,
    AcademicSearchOpenAlexData,
    AcademicSearchProvider,
    AcademicSearchProviderData,
    AcademicSearchProviderError,
    AcademicSearchPublication,
    AcademicSearchPublicationField,
    AcademicSearchPubMedData,
    AcademicSearchResponse,
    AcademicSearchSemanticScholarData,
    AskIdResponse,
    AskInvoiceResponse,
    AskResponse,
    ExtractIdResponse,
    ExtractInvoiceResponse,
    ExtractResponse,
    ListImagesResponse,
    ParseIdResponse,
    ParseInvoiceResponse,
    ParseResponse,
} from "./types.js";

/**
 * Options common to every parse request, regardless of how the document is
 * supplied.
 */
export interface ParseBaseRequest {
    /** LLM parsing mode. Documented default when omitted: "auto". */
    useLLM?: "auto" | "always" | "never";
}

/**
 * Parse request that points at a remote document.
 */
export interface ParseURLRequest extends ParseBaseRequest {
    /** Direct URL of the PDF/Word document to parse and convert to markdown. */
    url: string;
}

/**
 * Parse request that carries the document contents inline.
 */
export interface ParseDataRequest extends ParseBaseRequest {
    /**
     * Raw document contents. Accepted as a string, Buffer, Uint8Array,
     * ArrayBuffer, Blob, or ReadableStream.
     */
    data: string | Buffer | Uint8Array | ArrayBuffer | Blob | ReadableStream;
    /**
     * MIME type of `data`, e.g. "application/pdf" or "application/msword"
     * (the ID-document examples on the client also pass "image/jpeg").
     */
    contentType: string;
}

/**
 * Options common to every ask request.
 */
export interface AskBaseRequest {
    /** Question or prompt to ask about the document. */
    prompt: string;
}

/**
 * Ask request that points at a remote document.
 */
export interface AskURLRequest extends AskBaseRequest {
    /** Direct URL of the PDF/Word document to analyze. */
    url: string;
}

/**
 * Ask request that carries the document contents inline.
 */
export interface AskDataRequest extends AskBaseRequest {
    /**
     * Raw document contents. Accepted as a string, Buffer, Uint8Array,
     * ArrayBuffer, Blob, or ReadableStream.
     */
    data: string | Buffer | Uint8Array | ArrayBuffer | Blob | ReadableStream;
    /**
     * MIME type of `data`, e.g. "application/pdf" or "application/msword"
     * (the ID-document examples on the client also pass "image/png").
     */
    contentType: string;
}

/**
 * Options common to every extract request.
 */
export interface ExtractBaseRequest {
    /** Instructions describing which structured data to extract. */
    prompt: string;
    /**
     * JSON Schema object describing the expected output structure. It should
     * follow the JSON Schema specification and include the
     * `additionalProperties` property.
     */
    schema: Record<string, unknown>;
}

/**
 * Extract request that points at a remote document.
 */
export interface ExtractURLRequest extends ExtractBaseRequest {
    /** Direct URL of the PDF/Word document to analyze. */
    url: string;
}

/**
 * Extract request that carries the document contents inline.
 */
export interface ExtractDataRequest extends ExtractBaseRequest {
    /**
     * Raw document contents. Accepted as a string, Buffer, Uint8Array,
     * ArrayBuffer, Blob, or ReadableStream.
     */
    data: string | Buffer | Uint8Array | ArrayBuffer | Blob | ReadableStream;
    /**
     * MIME type of `data`, e.g. "application/pdf" or "application/msword"
     * (the ID-document examples on the client also pass "image/jpeg").
     */
    contentType: string;
}

/**
 * Settings accepted by the {@link PDFVector} constructor.
 */
export interface PDFVectorConfig {
    /** API key used for authentication (format: pdfvector_xxx). */
    apiKey: string;
    /** API base URL. Documented default: https://www.pdfvector.com */
    baseUrl?: string;
}

/**
 * Parameters for an academic publication search.
 */
export interface SearchRequest {
    /** Search query string. */
    query: string;
    /** Providers to query. Documented default: ["semantic-scholar"]. */
    providers?: AcademicSearchProvider[];
    /** Number of results to skip, for pagination. Documented default: 0. */
    offset?: number;
    /** Maximum number of results to return (1-100). Documented default: 20. */
    limit?: number;
    /** Lower bound (from) of the publication-year filter. */
    yearFrom?: number;
    /** Upper bound (to) of the publication-year filter. */
    yearTo?: number;
    /**
     * Fields to include in the response. When omitted, all base publication
     * fields are returned. Add 'providerData' to include provider-specific
     * metadata.
     */
    fields?: AcademicSearchPublicationField[];
}

/**
 * Parameters for fetching specific academic publications by ID.
 */
export interface FetchRequest {
    /** Publication IDs, given as strings; the ID format is auto-detected. */
    ids: string[];
    /**
     * Fields to include in the response. When omitted, all base publication
     * fields are returned. Add 'providerData' to include provider-specific
     * metadata.
     */
    fields?: AcademicSearchPublicationField[];
}

/**
 * Options common to every list-images request.
 */
export interface ListImagesBaseRequest {
    /** LLM parsing mode. Documented default when omitted: "auto". */
    useLLM?: "auto" | "always" | "never";
}

/**
 * List-images request that points at a remote document.
 */
export interface ListImagesURLRequest extends ListImagesBaseRequest {
    /** Direct URL of the PDF/Word document to analyze. */
    url: string;
}

/**
 * List-images request that carries the document contents inline.
 */
export interface ListImagesDataRequest extends ListImagesBaseRequest {
    /**
     * Raw document contents. Accepted as a string, Buffer, Uint8Array,
     * ArrayBuffer, Blob, or ReadableStream.
     */
    data: string | Buffer | Uint8Array | ArrayBuffer | Blob | ReadableStream;
    /** MIME type of `data`, e.g. "application/pdf" or "application/msword". */
    contentType: string;
}

/**
 * Error type thrown for PDF Vector API failures.
 */
export declare class PDFVectorError extends Error {
    /** HTTP status code, when available. */
    status?: number | undefined;
    /** Custom error code, when available. */
    code?: string | undefined;
    /**
     * @param message Human-readable error message
     * @param status HTTP status code
     * @param code Custom error code
     */
    constructor(
        message: string,
        /** HTTP status code */
        status?: number | undefined,
        /** Custom error code */
        code?: string | undefined
    );
}

/**
 * Client for the PDF Vector API (TypeScript/JavaScript SDK).
 *
 * @example
 * Parse a document from a URL:
 * ```typescript
 * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
 * const result = await client.parse({
 *   url: "https://example.com/document.pdf",
 *   useLLM: "auto"
 * });
 * console.log(result.markdown);
 * ```
 *
 * @example
 * Parse a document from in-memory data:
 * ```typescript
 * import { readFile } from "fs/promises";
 *
 * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
 * const result = await client.parse({
 *   data: await readFile("document.pdf"),
 *   contentType: "application/pdf",
 *   useLLM: "auto"
 * });
 * console.log(result.markdown);
 * ```
 *
 * @example
 * Ask a question about a document:
 * ```typescript
 * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
 * const result = await client.ask({
 *   url: "https://example.com/research-paper.pdf",
 *   prompt: "What are the main findings and methodology?"
 * });
 * console.log(result.markdown);
 * console.log(`Cost: ${result.creditCount} credits`);
 * ```
 *
 * @example
 * Extract structured data:
 * ```typescript
 * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
 * const result = await client.extract({
 *   url: "https://example.com/invoice.pdf",
 *   prompt: "Extract invoice details",
 *   schema: {
 *     type: "object",
 *     properties: {
 *       invoiceNumber: { type: "string" },
 *       date: { type: "string" },
 *       totalAmount: { type: "number" },
 *       items: {
 *         type: "array",
 *         items: {
 *           type: "object",
 *           properties: {
 *             description: { type: "string" },
 *             quantity: { type: "number" },
 *             price: { type: "number" }
 *           }
 *         }
 *       }
 *     },
 *     required: ["invoiceNumber", "date", "totalAmount", "items"],
 *     additionalProperties: false
 *   }
 * });
 * console.log(result.data);
 * ```
 *
 * @example
 * Search academic publications:
 * ```typescript
 * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
 * const results = await client.academicSearch({
 *   query: "machine learning transformers",
 *   providers: ["semantic-scholar", "arxiv"],
 *   limit: 10,
 *   yearFrom: 2020
 * });
 * console.log(results.results);
 * ```
 */
export declare class PDFVector {
    /** API key used for authentication (format: pdfvector_xxx). */
    readonly apiKey: string;
    /** API base URL. Documented default: https://www.pdfvector.com */
    readonly baseUrl: string;

    /**
     * Creates a new PDF Vector client.
     * @param config Configuration holding the API key and an optional base URL
     */
    constructor(config: PDFVectorConfig);

    /**
     * Internal helper for handling API response errors.
     * @private
     */
    private handleResponseError;

    /**
     * Internal helper that converts data to a base64 string.
     * @private
     */
    private dataToBase64;

    /**
     * Parses a PDF/Word document, supplied by URL or as data, and converts it
     * to markdown.
     * @param request Parse request parameters (URL or data)
     * @returns Promise resolving to the parsed document details
     * @throws {PDFVectorError} When the API request fails
     */
    parse(request: ParseURLRequest | ParseDataRequest): Promise<ParseResponse>;

    /**
     * Asks a question about a PDF/Word document and returns an AI-powered
     * answer in natural language.
     * @param request Ask request parameters (URL or data, plus a prompt)
     * @returns Promise resolving to the AI-generated answer in markdown format
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Ask from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.ask({
     *   url: "https://example.com/research-paper.pdf",
     *   prompt: "What are the key findings in this research?"
     * });
     * console.log(result.markdown);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Ask from data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.ask({
     *   data: await readFile("document.pdf"),
     *   contentType: "application/pdf",
     *   prompt: "Summarize the main points of this document"
     * });
     * console.log(result.markdown);
     * ```
     */
    ask(request: AskURLRequest | AskDataRequest): Promise<AskResponse>;

    /**
     * Extracts structured data from a PDF/Word document according to a JSON
     * Schema.
     * @param request Extract request parameters (URL or data, plus prompt and schema)
     * @returns Promise resolving to structured data matching the provided schema
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Extract from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.extract({
     *   url: "https://example.com/invoice.pdf",
     *   prompt: "Extract invoice details from this document",
     *   schema: {
     *     type: "object",
     *     properties: {
     *       invoiceNumber: { type: "string" },
     *       date: { type: "string" },
     *       totalAmount: { type: "number" },
     *       items: {
     *         type: "array",
     *         items: {
     *           type: "object",
     *           properties: {
     *             description: { type: "string" },
     *             quantity: { type: "number" },
     *             price: { type: "number" }
     *           }
     *         }
     *       }
     *     },
     *     required: ["invoiceNumber", "date", "totalAmount", "items"],
     *     additionalProperties: false
     *   }
     * });
     * console.log(result.data);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Extract from data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.extract({
     *   data: await readFile("research-paper.pdf"),
     *   contentType: "application/pdf",
     *   prompt: "Extract the title, authors, abstract, and key findings",
     *   schema: {
     *     type: "object",
     *     properties: {
     *       title: { type: "string" },
     *       authors: { type: "array", items: { type: "string" } },
     *       abstract: { type: "string" },
     *       keyFindings: { type: "array", items: { type: "string" } },
     *       publicationDate: { type: "string" }
     *     },
     *     required: ["title", "authors", "abstract", "keyFindings"],
     *     additionalProperties: false
     *   }
     * });
     * console.log(result.data);
     * ```
     */
    extract(request: ExtractURLRequest | ExtractDataRequest): Promise<ExtractResponse>;

    /**
     * Searches academic publications across multiple databases.
     * @param request Search request parameters
     * @returns Promise resolving to the search results
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * ```typescript
     * const results = await client.academicSearch({
     *   query: "machine learning",
     *   providers: ["semantic-scholar", "arxiv"],
     *   limit: 20,
     *   fields: ["title", "authors", "abstract", "year"]
     * });
     * ```
     */
    academicSearch(request: SearchRequest): Promise<AcademicSearchResponse>;

    /**
     * Alias for {@link PDFVector.academicSearch}.
     * @param request Search request parameters
     * @returns Promise resolving to the search results
     * @throws {PDFVectorError} When the API request fails
     */
    search(request: SearchRequest): Promise<AcademicSearchResponse>;

    /**
     * Fetches specific academic publications by ID; the ID format is
     * auto-detected.
     * @param request Fetch request parameters containing the IDs and optional fields
     * @returns Promise resolving to the fetch results and errors
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Auto-detection with mixed ID formats:
     * ```typescript
     * const results = await client.academicFetch({
     *   ids: [
     *     "10.1038/nature12373",                      // DOI auto-detected
     *     "12345678",                                 // PubMed ID auto-detected
     *     "2301.00001",                               // ArXiv ID auto-detected
     *     "arXiv:2507.16298v1",                       // ArXiv with prefix
     *     "ED123456",                                 // ERIC ID auto-detected
     *     "0f40b1f08821e22e859c6050916cec3667778613", // Semantic Scholar
     *     "pubmed:98765432",                          // PDFVector format
     *     "some-unknown-id"                           // Will try all providers
     *   ]
     * });
     * ```
     *
     * @example
     * Fetching specific fields only:
     * ```typescript
     * const results = await client.academicFetch({
     *   ids: ["10.1038/nature12373", "pubmed:12345678"],
     *   fields: ["title", "authors", "year", "abstract"]
     * });
     * ```
     *
     * @example
     * Handling results:
     * ```typescript
     * // Successfully fetched publications
     * results.results.forEach(pub => {
     *   console.log(`Fetched: ${pub.title}`);
     *   console.log(`Provider: ${pub.detectedProvider}`);
     *   console.log(`Requested as: ${pub.id}`);
     * });
     *
     * // Handle errors for IDs that couldn't be fetched
     * results.errors?.forEach(error => {
     *   console.log(`Failed to fetch ${error.id}: ${error.error}`);
     *   if (error.code === "NOT_FOUND") {
     *     console.log("Publication not found in any provider");
     *   }
     * });
     * ```
     */
    academicFetch(request: FetchRequest): Promise<AcademicFetchResponse>;

    /**
     * Alias for {@link PDFVector.academicFetch}.
     * @param request Fetch request parameters containing the IDs and optional fields
     * @returns Promise resolving to the fetch results and errors
     * @throws {PDFVectorError} When the API request fails
     */
    fetch(request: FetchRequest): Promise<AcademicFetchResponse>;

    /**
     * Lists all images found in a PDF/Word document.
     * @param request List images request parameters (URL or data)
     * @returns Promise resolving to the list of image URLs and document details
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * List images from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.listImages({
     *   url: "https://example.com/document.pdf",
     *   useLLM: "auto"
     * });
     * console.log(result.images); // Array of image URLs
     * console.log(`Found ${result.images.length} images`);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * List images from data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.listImages({
     *   data: await readFile("document.pdf"),
     *   contentType: "application/pdf",
     *   useLLM: "auto"
     * });
     *
     * // Download and save images
     * for (const imageUrl of result.images) {
     *   const response = await fetch(imageUrl);
     *   const buffer = await response.arrayBuffer();
     *   // Save buffer to file...
     * }
     * ```
     */
    listImages(request: ListImagesURLRequest | ListImagesDataRequest): Promise<ListImagesResponse>;

    /**
     * Parses an invoice, supplied by URL or as data, and converts it to
     * markdown.
     * @param request Parse request parameters (URL or data)
     * @returns Promise resolving to the parsed invoice details
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Parse invoice from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.invoiceParse({
     *   url: "https://example.com/invoice.pdf"
     * });
     * console.log(result.markdown);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Parse invoice from data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.invoiceParse({
     *   data: await readFile("invoice.pdf"),
     *   contentType: "application/pdf"
     * });
     * console.log(result.markdown);
     * ```
     */
    invoiceParse(request: ParseURLRequest | ParseDataRequest): Promise<ParseInvoiceResponse>;

    /**
     * Asks a question about an invoice and returns an AI-powered answer in
     * natural language.
     * @param request Ask request parameters (URL or data, plus a prompt)
     * @returns Promise resolving to the AI-generated answer about the invoice in markdown format
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Ask questions about an invoice from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.invoiceAsk({
     *   url: "https://example.com/invoice.pdf",
     *   prompt: "What is the total amount and due date?"
     * });
     * console.log(result.markdown);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Ask about invoice from data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.invoiceAsk({
     *   data: await readFile("invoice.pdf"),
     *   contentType: "application/pdf",
     *   prompt: "List all the line items and their costs"
     * });
     * console.log(result.markdown);
     * ```
     */
    invoiceAsk(request: AskURLRequest | AskDataRequest): Promise<AskInvoiceResponse>;

    /**
     * Extracts structured data from an invoice according to a JSON Schema.
     * @param request Extract request parameters (URL or data, plus prompt and schema)
     * @returns Promise resolving to structured invoice data matching the provided schema
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Extract invoice data from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.invoiceExtract({
     *   url: "https://example.com/invoice.pdf",
     *   prompt: "Extract invoice details from this document",
     *   schema: {
     *     type: "object",
     *     properties: {
     *       invoiceNumber: { type: "string" },
     *       date: { type: "string" },
     *       totalAmount: { type: "number" },
     *       items: {
     *         type: "array",
     *         items: {
     *           type: "object",
     *           properties: {
     *             description: { type: "string" },
     *             quantity: { type: "number" },
     *             price: { type: "number" }
     *           }
     *         }
     *       }
     *     },
     *     required: ["invoiceNumber", "date", "totalAmount", "items"],
     *     additionalProperties: false
     *   }
     * });
     * console.log(result.data);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Extract from invoice data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.invoiceExtract({
     *   data: await readFile("invoice.pdf"),
     *   contentType: "application/pdf",
     *   prompt: "Extract all vendor and payment information",
     *   schema: {
     *     type: "object",
     *     properties: {
     *       vendorName: { type: "string" },
     *       vendorAddress: { type: "string" },
     *       paymentTerms: { type: "string" },
     *       dueDate: { type: "string" },
     *       totalAmount: { type: "number" }
     *     },
     *     required: ["vendorName", "totalAmount"],
     *     additionalProperties: false
     *   }
     * });
     * console.log(result.data);
     * ```
     */
    invoiceExtract(request: ExtractURLRequest | ExtractDataRequest): Promise<ExtractInvoiceResponse>;

    /**
     * Parses an ID document (passport, driver's license, ID card), supplied
     * by URL or as data, and converts it to markdown.
     * @param request Parse request parameters (URL or data)
     * @returns Promise resolving to the parsed ID document details
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Parse ID from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.idParse({
     *   url: "https://example.com/passport.pdf"
     * });
     * console.log(result.markdown);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Parse ID from data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.idParse({
     *   data: await readFile("drivers-license.jpg"),
     *   contentType: "image/jpeg"
     * });
     * console.log(result.markdown);
     * ```
     */
    idParse(request: ParseURLRequest | ParseDataRequest): Promise<ParseIdResponse>;

    /**
     * Asks a question about an ID document and returns an AI-powered answer
     * in natural language.
     * @param request Ask request parameters (URL or data, plus a prompt)
     * @returns Promise resolving to the AI-generated answer about the ID document in markdown format
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Ask questions about an ID from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.idAsk({
     *   url: "https://example.com/passport.pdf",
     *   prompt: "What is the full name and date of birth on this document?"
     * });
     * console.log(result.markdown);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Ask about ID from data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.idAsk({
     *   data: await readFile("id-card.png"),
     *   contentType: "image/png",
     *   prompt: "When does this ID expire?"
     * });
     * console.log(result.markdown);
     * ```
     */
    idAsk(request: AskURLRequest | AskDataRequest): Promise<AskIdResponse>;

    /**
     * Extracts structured data from an ID document according to a JSON Schema.
     * @param request Extract request parameters (URL or data, plus prompt and schema)
     * @returns Promise resolving to structured ID document data matching the provided schema
     * @throws {PDFVectorError} When the API request fails
     *
     * @example
     * Extract ID data from URL:
     * ```typescript
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.idExtract({
     *   url: "https://example.com/passport.pdf",
     *   prompt: "Extract passport details from this document",
     *   schema: {
     *     type: "object",
     *     properties: {
     *       fullName: { type: "string" },
     *       dateOfBirth: { type: "string" },
     *       documentNumber: { type: "string" },
     *       nationality: { type: "string" },
     *       expirationDate: { type: "string" },
     *       issuingCountry: { type: "string" }
     *     },
     *     required: ["fullName", "documentNumber"],
     *     additionalProperties: false
     *   }
     * });
     * console.log(result.data);
     * console.log(`Cost: ${result.creditCount} credits (${result.pageCount} pages)`);
     * ```
     *
     * @example
     * Extract from ID data:
     * ```typescript
     * import { readFile } from "fs/promises";
     *
     * const client = new PDFVector({ apiKey: "pdfvector_xxx" });
     * const result = await client.idExtract({
     *   data: await readFile("drivers-license.jpg"),
     *   contentType: "image/jpeg",
     *   prompt: "Extract driver's license information",
     *   schema: {
     *     type: "object",
     *     properties: {
     *       fullName: { type: "string" },
     *       address: { type: "string" },
     *       licenseNumber: { type: "string" },
     *       dateOfBirth: { type: "string" },
     *       expirationDate: { type: "string" },
     *       licenseClass: { type: "string" }
     *     },
     *     required: ["fullName", "licenseNumber"],
     *     additionalProperties: false
     *   }
     * });
     * console.log(result.data);
     * ```
     */
    idExtract(request: ExtractURLRequest | ExtractDataRequest): Promise<ExtractIdResponse>;
}
//# sourceMappingURL=index.d.ts.map