UNPKG

@langchain/community

Version:
1 lines 9.64 kB
{"version":3,"file":"arxiv.cjs","names":["XMLParser","PDFLoader","Document"],"sources":["../../src/utils/arxiv.ts"],"sourcesContent":["/* oxlint-disable typescript/no-explicit-any */\nimport { Document } from \"@langchain/core/documents\";\nimport { XMLParser } from \"fast-xml-parser\";\n\nimport { PDFLoader } from \"../document_loaders/fs/pdf.js\";\n\n// Interface for processed arXiv entry\ninterface ArxivEntry {\n id: string;\n title: string;\n summary: string;\n published: string;\n updated: string;\n authors: string[];\n pdfUrl: string;\n links: any[];\n}\n\n// Used to check if the query is an arXiv ID, or a natural language query\nexport function isArXivIdentifier(query: string): boolean {\n const arxivIdRegex = /^\\d{4}\\.\\d{4,5}(v\\d+)?$|^\\d{7}(\\.\\d+)?(v\\d+)?$/;\n return arxivIdRegex.test(query.trim());\n}\n\n// Used to fetch direct arXiv articles by IDs (supports multiple IDs)\nexport async function fetchDirectArxivArticle(\n arxivIds: string\n): Promise<ArxivEntry[]> {\n try {\n const idList = arxivIds\n .split(/[\\s,]+/)\n .map((id) => id.trim())\n .filter(Boolean)\n .join(\",\");\n const url = `http://export.arxiv.org/api/query?id_list=${idList}`;\n const response = await fetch(url);\n\n if (!response.ok) {\n throw new Error(`HTTP error! status: ${response.status}`);\n }\n\n const xml = await response.text();\n\n const parser = new XMLParser({\n ignoreAttributes: false,\n attributeNamePrefix: \"@_\",\n });\n const result = parser.parse(xml);\n let entries = result.feed.entry;\n\n if (!entries) {\n return [];\n }\n\n // Ensure entries is an array\n if (!Array.isArray(entries)) {\n entries = [entries];\n }\n\n const processedEntries = entries.map(processEntry);\n\n return processedEntries;\n } catch {\n throw new Error(`Failed to fetch articles with IDs ${arxivIds}`);\n }\n}\n\n// Used to fetch arXiv results by natural language query with maxResults parameter\nexport async function fetchArxivResultsByQuery(\n query: string,\n start = 0,\n maxResults = 10\n): Promise<ArxivEntry[]> {\n try {\n const encodedQuery = encodeURIComponent(query);\n const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`;\n const response = await fetch(url);\n\n if (!response.ok) {\n throw new Error(`HTTP error! status: ${response.status}`);\n }\n\n const xml = await response.text();\n\n const parser = new XMLParser({\n ignoreAttributes: false,\n attributeNamePrefix: \"@_\",\n });\n const result = parser.parse(xml);\n let entries = result.feed.entry;\n\n if (!entries) {\n return [];\n }\n\n // Ensure entries is an array\n if (!Array.isArray(entries)) {\n entries = [entries];\n }\n\n const processedEntries = entries.map(processEntry);\n\n return processedEntries;\n } catch {\n throw new Error(`Failed to fetch articles with query \"${query}\"`);\n }\n}\n\n// Used to search for arXiv articles with a maxResults parameter\nexport async function searchArxiv(\n query: string,\n maxResults = 3\n): Promise<ArxivEntry[]> {\n if (isArXivIdentifier(query)) {\n return await fetchDirectArxivArticle(query);\n } else {\n return await fetchArxivResultsByQuery(query, 0, maxResults);\n }\n}\n\n// Used to fetch and parse PDF to text\nexport async function fetchAndParsePDF(pdfUrl: string): Promise<string> {\n try {\n // Fetch the PDF\n const response = await fetch(pdfUrl);\n\n if (!response.ok) {\n throw new Error(`HTTP error! status: ${response.status}`);\n }\n\n const buffer = await response.arrayBuffer();\n\n // Convert the ArrayBuffer to a Blob\n const blob = new Blob([buffer], { type: \"application/pdf\" });\n\n // Use PDFLoader to process the PDF\n const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob\n const docs: Document[] = await loader.load();\n\n // Combine all document content into a single string\n const content = docs.map((doc) => doc.pageContent).join(\"\\n\\n\");\n return content;\n } catch {\n throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`);\n }\n}\n\n// Used to load raw text from each search result, and convert to Document instances\nexport async function loadDocsFromResults(\n results: ArxivEntry[]\n): Promise<Document[]> {\n const docs: Document[] = [];\n for (const result of results) {\n const pdfUrl = result.pdfUrl;\n try {\n const pdfContent = await fetchAndParsePDF(pdfUrl);\n const metadata = {\n id: result.id,\n title: result.title,\n authors: result.authors,\n published: result.published,\n updated: result.updated,\n source: \"arxiv\",\n url: result.id,\n summary: result.summary,\n };\n const doc = new Document({\n pageContent: pdfContent,\n metadata,\n });\n docs.push(doc);\n } catch {\n throw new Error(`Error loading document from ${pdfUrl}`);\n }\n }\n return docs;\n}\n\n// Used to convert metadata and summaries to Document instances\nexport function getDocsFromSummaries(results: ArxivEntry[]): Document[] {\n const docs: Document[] = [];\n for (const result of results) {\n const metadata = {\n id: result.id,\n title: result.title,\n authors: result.authors,\n published: result.published,\n updated: result.updated,\n source: \"arxiv\",\n url: result.id,\n };\n const doc = new Document({\n pageContent: result.summary,\n metadata,\n });\n docs.push(doc);\n }\n return docs;\n}\n\n// Helper function to process each arXiv entry\nfunction processEntry(entry: any): ArxivEntry {\n const id = entry.id;\n const title = entry.title.replace(/\\s+/g, \" \").trim();\n const summary = entry.summary.replace(/\\s+/g, \" \").trim();\n const published = entry.published;\n const updated = entry.updated;\n\n // Extract authors\n let authors: string[] = [];\n if (Array.isArray(entry.author)) {\n authors = entry.author.map((author: any) => author.name);\n } else if (entry.author) {\n authors = [entry.author.name];\n }\n\n // Extract links\n let links: any[] = [];\n if (Array.isArray(entry.link)) {\n links = entry.link;\n } else if (entry.link) {\n links = [entry.link];\n }\n\n // Extract PDF link\n let pdfUrl = `${id.replace(\"/abs/\", \"/pdf/\")}.pdf`;\n const pdfLinkObj = links.find((link: any) => link[\"@_title\"] === \"pdf\");\n if (pdfLinkObj && pdfLinkObj[\"@_href\"]) {\n pdfUrl = pdfLinkObj[\"@_href\"];\n }\n\n return {\n id,\n title,\n summary,\n published,\n updated,\n authors,\n pdfUrl,\n links,\n };\n}\n"],"mappings":";;;;;AAmBA,SAAgB,kBAAkB,OAAwB;AAExD,QADqB,iDACD,KAAK,MAAM,MAAM,CAAC;;AAIxC,eAAsB,wBACpB,UACuB;AACvB,KAAI;EAMF,MAAM,MAAM,6CALG,SACZ,MAAM,SAAS,CACf,KAAK,OAAO,GAAG,MAAM,CAAC,CACtB,OAAO,QAAQ,CACf,KAAK,IAAI;EAEZ,MAAM,WAAW,MAAM,MAAM,IAAI;AAEjC,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MAAM,uBAAuB,SAAS,SAAS;EAG3D,MAAM,MAAM,MAAM,SAAS,MAAM;EAOjC,IAAI,UALW,IAAIA,gBAAAA,UAAU;GAC3B,kBAAkB;GAClB,qBAAqB;GACtB,CAAC,CACoB,MAAM,IAAI,CACX,KAAK;AAE1B,MAAI,CAAC,QACH,QAAO,EAAE;AAIX,MAAI,CAAC,MAAM,QAAQ,QAAQ,CACzB,WAAU,CAAC,QAAQ;AAKrB,SAFyB,QAAQ,IAAI,aAAa;SAG5C;AACN,QAAM,IAAI,MAAM,qCAAqC,WAAW;;;AAKpE,eAAsB,yBACpB,OACA,QAAQ,GACR,aAAa,IACU;AACvB,KAAI;EAEF,MAAM,MAAM,sDADS,mBAAmB,MAAM,CACiC,SAAS,MAAM,eAAe;EAC7G,MAAM,WAAW,MAAM,MAAM,IAAI;AAEjC,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MAAM,uBAAuB,SAAS,SAAS;EAG3D,MAAM,MAAM,MAAM,SAAS,MAAM;EAOjC,IAAI,UALW,IAAIA,gBAAAA,UAAU;GAC3B,kBAAkB;GAClB,qBAAqB;GACtB,CAAC,CACoB,MAAM,IAAI,CACX,KAAK;AAE1B,MAAI,CAAC,QACH,QAAO,EAAE;AAIX,MAAI,CAAC,MAAM,QAAQ,QAAQ,CACzB,WAAU,CAAC,QAAQ;AAKrB,SAFyB,QAAQ,IAAI,aAAa;SAG5C;AACN,QAAM,IAAI,MAAM,wCAAwC,MAAM,GAAG;;;AAKrE,eAAsB,YACpB,OACA,aAAa,GACU;AACvB,KAAI,kBAAkB,MAAM,CAC1B,QAAO,MAAM,wBAAwB,MAAM;KAE3C,QAAO,MAAM,yBAAyB,OAAO,GAAG,WAAW;;AAK/D,eAAsB,iBAAiB,QAAiC;AACtE,KAAI;EAEF,MAAM,WAAW,MAAM,MAAM,OAAO;AAEpC,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MAAM,uBAAuB,SAAS,SAAS;EAG3D,MAAM,SAAS,MAAM,SAAS,aAAa;AAW3C,UAJyB,MADV,IAAIC,gCAAAA,UAHN,IAAI,KAAK,CAAC,OAAO,EAAE,EAAE,MAAM,mBAAmB,CAAC,EAGzB,EAAE,YAAY,OAAO,CAAC,CACnB,MAAM,EAGvB,KAAK,QAAQ,IAAI,YAAY,CAAC,KAAK,OAAO;SAEzD;AACN,QAAM,IAAI,MAAM,qCAAqC,SAAS;;;AAKlE,eAAsB,oBACpB,SACqB;CACrB,MAAM,OAAmB,EAAE;AAC3B,MAAK,MAAM,UAAU,SAAS;EAC5B,MAAM,SAAS,OAAO;AACtB,MAAI;GAYF,MAAM,MAAM,IAAIC,0BAAAA,SAAS;IACvB,aAZiB,MAAM,iBAAiB,OAAO;IAa/C,UAZe;KACf,IAAI,OAAO;KACX,OAAO,OAAO;KACd,SAAS,OAAO;KAChB,WAAW,OAAO;KAClB,SAAS,OAAO;KAChB,QAAQ;KACR,KAAK,OAAO;KACZ,SAAS,OAAO;KACjB;IAIA,CAAC;AACF,QAAK,KAAK,IAAI;UACR;AACN,SAAM,IAAI,MAAM,+BAA+B,SAAS;;;AAG5D,QAAO;;AAIT,SAAgB,qBAAqB,SAAmC;CACtE,MAAM,OAAmB,EAAE;AAC3B,MAAK,MAAM,UAAU,SAAS;EAC5B,MAAM,WAAW;GACf,IAAI,OAAO;GACX,OAAO,OAAO;GACd,SAAS,OAAO;GAChB,WAAW,OAAO;GAClB,SAAS,OAAO;GAChB,QAAQ;GACR,KAAK,OAAO;GACb;EACD,MAAM,MAAM,IAAIA,0BAAAA,SAAS;GACvB,aAAa,OAAO;GACpB;GACD,CAAC;AACF,OAAK,KAAK,IAAI;;AAEhB,QAAO;;AAIT,SAAS,aAAa,OAAwB;CAC5C,MAAM,KAAK,MAAM;CACjB,MAAM,QAAQ,MAAM,MAAM,QAAQ,QAAQ,IAAI,CAAC,MAAM;CACrD,MAAM,UAAU,MAAM,QAAQ,QAAQ,QAAQ,IAAI,CAAC,MAAM;CACzD,MAAM,YAAY,MAAM;CACxB,MAAM,UAAU,MAAM;CAGtB,IAAI,UAAoB,EAAE;AAC1B,KAAI,MAAM,QAAQ,MAAM,OAAO,CAC7B,WAAU,MAAM,OAAO,KAAK,WAAgB,OAAO,KAAK;UAC/C,MAAM,OACf,WAAU,CAAC,MAAM,OAAO,KAAK;CAI/B,IAAI,QAAe,EAAE;AACrB,KAAI,MAAM,QAAQ,MAAM,KAAK,CAC3B,SAAQ,MAAM;UACL,MAAM,KACf,SAAQ,CAAC,MAAM,KAAK;CAItB,IAAI,SAAS,GAAG,GAAG,QAAQ,SAAS,QAAQ,CAAC;CAC7C,MAAM,aAAa,MAAM,MAAM,SAAc,KAAK,eAAe,MAAM;AACvE,KAAI,cAAc,WAAW,UAC3B,UAAS,WAAW;AAGtB,QAAO;EACL;EACA;EACA;EACA;EACA;EACA;EACA;EACA;EACD"}