@langchain/community
Version:
Third-party integrations for LangChain.js
95 lines (94 loc) • 3.31 kB
JavaScript
// Tag the CommonJS exports object with Symbol.toStringTag = "Module", so that
// Object.prototype.toString reports it as "[object Module]".
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
const require_runtime = require("../../_virtual/_rolldown/runtime.cjs");
const require_document_loaders_web_cheerio = require("./cheerio.cjs");
let _langchain_core_documents = require("@langchain/core/documents");
//#region src/document_loaders/web/hn.ts
// Export map for this module. HNLoader is passed as a thunk, so the binding is
// read when the thunk is invoked rather than at this statement (the class is
// assigned further down in the file).
// NOTE(review): __exportAll comes from the rolldown runtime and is not visible
// here — presumably it turns each thunk into a getter on the returned object; confirm.
var hn_exports = /* @__PURE__ */ require_runtime.__exportAll({ HNLoader: () => HNLoader });
/**
 * Document loader for Hacker News pages, built on CheerioWebBaseLoader.
 *
 * A page whose URL contains "item" is treated as a discussion page and yields
 * one Document per comment. Any other page is treated as a story listing and
 * yields one Document per story row.
 */
var HNLoader = class extends require_document_loaders_web_cheerio.CheerioWebBaseLoader {
	/**
	 * @param webPath URL of the Hacker News page to load. It is also recorded
	 * as the `source` metadata of every Document produced.
	 */
	constructor(webPath) {
		super(webPath);
		this.webPath = webPath;
	}
	/**
	 * Scrapes the page and converts it into Documents. If the webPath
	 * includes "item", the page is parsed with loadComments(); otherwise it
	 * is parsed with loadResults().
	 * @returns A Promise that resolves to an array of Document instances.
	 */
	async load() {
		const $ = await this.scrape();
		// Deliberately kept as a plain substring test on the whole URL so that
		// every webPath callers pass today is dispatched exactly as before.
		if (this.webPath.includes("item")) return this.loadComments($);
		return this.loadResults($);
	}
	/**
	 * Builds one Document per comment row of a discussion page. The trimmed
	 * text of the row becomes the page content; the metadata carries the
	 * source URL and the `title` attribute of the row with id "pagespace"
	 * (undefined when that row or attribute is absent).
	 * @param $ A CheerioAPI instance.
	 * @returns An array of Document instances.
	 */
	loadComments($) {
		// Select by class tokens instead of the exact attribute string
		// "athing comtr": an exact match breaks as soon as the markup gains an
		// extra class or reorders the tokens, silently yielding no documents.
		const comments = $("tr.athing.comtr");
		const title = $("tr#pagespace").attr("title");
		const documents = [];
		comments.each((_index, comment) => {
			const text = $(comment).text().trim();
			// A fresh metadata object per comment, so Documents never share state.
			const metadata = {
				source: this.webPath,
				title
			};
			documents.push(new _langchain_core_documents.Document({
				pageContent: text,
				metadata
			}));
		});
		return documents;
	}
	/**
	 * Builds one Document per story row of a listing page. The trimmed text
	 * of the row's "titleline" span is used both as the page content and as
	 * the `title` metadata; the metadata also carries the source URL, the
	 * href of the first link inside the title line, and the text of the
	 * "rank" span.
	 * @param $ A CheerioAPI instance.
	 * @returns An array of Document instances.
	 */
	loadResults($) {
		// Select story rows by class token. An exact match on class="athing"
		// misses rows that carry additional classes.
		// NOTE(review): current Hacker News markup appears to use
		// class="athing submission" for story rows — confirm against the live site.
		// Comment rows (which also carry "athing") stay excluded, exactly as
		// they were under the exact-match selector.
		const items = $("tr.athing:not(.comtr)");
		const documents = [];
		items.each((_index, item) => {
			const row = $(item);
			const ranking = row.find("span.rank").text();
			// attr() reads from the first matched element, i.e. the first
			// anchor inside the title line.
			const link = row.find("span.titleline a").attr("href");
			const title = row.find("span.titleline").text().trim();
			const metadata = {
				source: this.webPath,
				title,
				link,
				ranking
			};
			documents.push(new _langchain_core_documents.Document({
				pageContent: title,
				metadata
			}));
		});
		return documents;
	}
};
//#endregion
// CommonJS export of the loader class itself.
exports.HNLoader = HNLoader;
// Also expose the module's export map as an enumerable accessor property;
// the getter returns the hn_exports object on each access.
Object.defineProperty(exports, "hn_exports", {
enumerable: true,
get: function() {
return hn_exports;
}
});
//# sourceMappingURL=hn.cjs.map