@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 2.2 kB
Source Map (JSON)
{"version":3,"file":"mozilla_readability.cjs","names":["MappingDocumentTransformer","Document","Readability","JSDOM"],"sources":["../../src/document_transformers/mozilla_readability.ts"],"sourcesContent":["import { Readability } from \"@mozilla/readability\";\nimport { JSDOM } from \"jsdom\";\nimport type { Options } from \"mozilla-readability\";\nimport {\n MappingDocumentTransformer,\n Document,\n} from \"@langchain/core/documents\";\n\n/**\n * A transformer that uses the Mozilla Readability library to extract the\n * main content from a web page.\n * @example\n * ```typescript\n * const loader = new HTMLWebBaseLoader(\"https://example.com/article\");\n * const docs = await loader.load();\n *\n * const splitter = new RecursiveCharacterTextSplitter({\n * maxCharacterCount: 5000,\n * });\n * const transformer = new MozillaReadabilityTransformer();\n *\n * // The sequence processes the loaded documents through the splitter and then the transformer.\n * const sequence = transformer.pipe(splitter);\n *\n * // Invoke the sequence to transform the documents into a more readable format.\n * const newDocuments = await sequence.invoke(docs);\n *\n * console.log(newDocuments);\n * ```\n */\nexport class MozillaReadabilityTransformer extends MappingDocumentTransformer {\n static lc_name() {\n return \"MozillaReadabilityTransformer\";\n }\n\n constructor(protected options: Options = {}) {\n super(options);\n }\n\n async _transformDocument(document: Document): Promise<Document> {\n const doc = new JSDOM(document.pageContent);\n\n const readability = new Readability(doc.window.document, this.options);\n\n const result = readability.parse();\n\n return new Document({\n pageContent: result?.textContent ?? \"\",\n metadata: {\n ...document.metadata,\n },\n });\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA8BA,IAAa,gCAAb,cAAmDA,0BAAAA,2BAA2B;CAC5E,OAAO,UAAU;AACf,SAAO;;CAGT,YAAY,UAA6B,EAAE,EAAE;AAC3C,QAAM,QAAQ;AADM,OAAA,UAAA;;CAItB,MAAM,mBAAmB,UAAuC;AAO9D,SAAO,IAAIC,0BAAAA,SAAS;GAClB,aALkB,IAAIC,qBAAAA,YAFZ,IAAIC,MAAAA,MAAM,SAAS,YAAY,CAEH,OAAO,UAAU,KAAK,QAAQ,CAE3C,OAAO,EAGX,eAAe;GACpC,UAAU,EACR,GAAG,SAAS,UACb;GACF,CAAC"}