@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 1.81 kB
Source Map (JSON)
{"version":3,"file":"html_to_text.cjs","names":["MappingDocumentTransformer","Document"],"sources":["../../src/document_transformers/html_to_text.ts"],"sourcesContent":["import { htmlToText, type HtmlToTextOptions } from \"html-to-text\";\nimport {\n MappingDocumentTransformer,\n Document,\n} from \"@langchain/core/documents\";\n\n/**\n * A transformer that converts HTML content to plain text.\n * @example\n * ```typescript\n * const loader = new CheerioWebBaseLoader(\"https://example.com/some-page\");\n * const docs = await loader.load();\n *\n * const splitter = new RecursiveCharacterTextSplitter({\n * maxCharacterCount: 1000,\n * });\n * const transformer = new HtmlToTextTransformer();\n *\n * // The sequence of text splitting followed by HTML to text transformation\n * const sequence = splitter.pipe(transformer);\n *\n * // Processing the loaded documents through the sequence\n * const newDocuments = await sequence.invoke(docs);\n *\n * console.log(newDocuments);\n * ```\n */\nexport class HtmlToTextTransformer extends MappingDocumentTransformer {\n static lc_name() {\n return \"HtmlToTextTransformer\";\n }\n\n constructor(protected options: HtmlToTextOptions = {}) {\n super(options);\n }\n\n async _transformDocument(document: Document): Promise<Document> {\n const extractedContent = htmlToText(document.pageContent, this.options);\n return new Document({\n pageContent: extractedContent,\n metadata: { ...document.metadata },\n });\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;AA2BA,IAAa,wBAAb,cAA2CA,0BAAAA,2BAA2B;CACpE,OAAO,UAAU;AACf,SAAO;;CAGT,YAAY,UAAuC,EAAE,EAAE;AACrD,QAAM,QAAQ;AADM,OAAA,UAAA;;CAItB,MAAM,mBAAmB,UAAuC;AAE9D,SAAO,IAAIC,0BAAAA,SAAS;GAClB,cAAA,GAAA,aAAA,YAFkC,SAAS,aAAa,KAAK,QAAQ;GAGrE,UAAU,EAAE,GAAG,SAAS,UAAU;GACnC,CAAC"}