@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 4.21 kB
Source Map (JSON)
{"version":3,"file":"hn.cjs","names":["CheerioWebBaseLoader","Document"],"sources":["../../../src/document_loaders/web/hn.ts"],"sourcesContent":["import type { CheerioAPI } from \"cheerio\";\nimport { Document } from \"@langchain/core/documents\";\nimport { CheerioWebBaseLoader } from \"./cheerio.js\";\n\n/**\n * A class that extends the CheerioWebBaseLoader class. It represents a\n * loader for loading web pages from the Hacker News website.\n */\nexport class HNLoader extends CheerioWebBaseLoader {\n constructor(public webPath: string) {\n super(webPath);\n }\n\n /**\n * An asynchronous method that loads the web page. If the webPath includes\n * \"item\", it calls the loadComments() method to load the comments from\n * the web page. Otherwise, it calls the loadResults() method to load the\n * results from the web page.\n * @returns A Promise that resolves to an array of Document instances.\n */\n public async load(): Promise<Document[]> {\n const $ = await this.scrape();\n if (this.webPath.includes(\"item\")) {\n return this.loadComments($);\n }\n return this.loadResults($);\n }\n\n /**\n * A private method that loads the comments from the web page. It selects\n * the elements with the class \"athing comtr\" using the $ function\n * provided by Cheerio. It also extracts the title of the web page from\n * the element with the id \"pagespace\". It creates Document instances for\n * each comment, with the comment text as the page content and the source\n * and title as metadata.\n * @param $ A CheerioAPI instance.\n * @returns An array of Document instances.\n */\n private loadComments($: CheerioAPI): Document[] {\n const comments = $(\"tr[class='athing comtr']\");\n const title = $(\"tr[id='pagespace']\").attr(\"title\");\n const documents: Document[] = [];\n comments.each((_index, comment) => {\n const text = $(comment).text().trim();\n const metadata = { source: this.webPath, title };\n documents.push(new Document({ pageContent: text, metadata }));\n });\n return documents;\n }\n\n /**\n * A private method that loads the results from the web page. It selects\n * the elements with the class \"athing\" using the $ function provided by\n * Cheerio. It extracts the ranking, link, title, and other metadata from\n * each result item. It creates Document instances for each result item,\n * with the title as the page content and the source, title, link, and\n * ranking as metadata.\n * @param $ A CheerioAPI instance.\n * @returns An array of Document instances.\n */\n private loadResults($: CheerioAPI): Document[] {\n const items = $(\"tr[class='athing']\");\n const documents: Document[] = [];\n items.each((_index, item) => {\n const ranking = $(item).find(\"span[class='rank']\").text();\n const link = $(item).find(\"span[class='titleline'] a\").attr(\"href\");\n const title = $(item).find(\"span[class='titleline']\").text().trim();\n const metadata = {\n source: this.webPath,\n title,\n link,\n ranking,\n };\n documents.push(new Document({ pageContent: title, metadata }));\n });\n return documents;\n }\n}\n"],"mappings":";;;;;;;;;;AAQA,IAAa,WAAb,cAA8BA,qCAAAA,qBAAqB;CACjD,YAAY,SAAwB;AAClC,QAAM,QAAQ;AADG,OAAA,UAAA;;;;;;;;;CAWnB,MAAa,OAA4B;EACvC,MAAM,IAAI,MAAM,KAAK,QAAQ;AAC7B,MAAI,KAAK,QAAQ,SAAS,OAAO,CAC/B,QAAO,KAAK,aAAa,EAAE;AAE7B,SAAO,KAAK,YAAY,EAAE;;;;;;;;;;;;CAa5B,aAAqB,GAA2B;EAC9C,MAAM,WAAW,EAAE,2BAA2B;EAC9C,MAAM,QAAQ,EAAE,qBAAqB,CAAC,KAAK,QAAQ;EACnD,MAAM,YAAwB,EAAE;AAChC,WAAS,MAAM,QAAQ,YAAY;GACjC,MAAM,OAAO,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM;GACrC,MAAM,WAAW;IAAE,QAAQ,KAAK;IAAS;IAAO;AAChD,aAAU,KAAK,IAAIC,0BAAAA,SAAS;IAAE,aAAa;IAAM;IAAU,CAAC,CAAC;IAC7D;AACF,SAAO;;;;;;;;;;;;CAaT,YAAoB,GAA2B;EAC7C,MAAM,QAAQ,EAAE,qBAAqB;EACrC,MAAM,YAAwB,EAAE;AAChC,QAAM,MAAM,QAAQ,SAAS;GAC3B,MAAM,UAAU,EAAE,KAAK,CAAC,KAAK,qBAAqB,CAAC,MAAM;GACzD,MAAM,OAAO,EAAE,KAAK,CAAC,KAAK,4BAA4B,CAAC,KAAK,OAAO;GACnE,MAAM,QAAQ,EAAE,KAAK,CAAC,KAAK,0BAA0B,CAAC,MAAM,CAAC,MAAM;GACnE,MAAM,WAAW;IACf,QAAQ,KAAK;IACb;IACA;IACA;IACD;AACD,aAAU,KAAK,IAAIA,0BAAAA,SAAS;IAAE,aAAa;IAAO;IAAU,CAAC,CAAC;IAC9D;AACF,SAAO"}