UNPKG

@langchain/community

Version:
1 lines 5.26 kB
{"version":3,"file":"youtube.cjs","names":["BaseDocumentLoader","Innertube","Document"],"sources":["../../../src/document_loaders/web/youtube.ts"],"sourcesContent":["import { Innertube } from \"youtubei.js\";\nimport { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\n/**\n * Configuration options for the YoutubeLoader class. Includes properties\n * such as the videoId, language, and addVideoInfo.\n */\ninterface YoutubeConfig {\n videoId: string;\n language?: string;\n addVideoInfo?: boolean;\n}\n\n/**\n * Metadata of a YouTube video. Includes properties such as the source\n * (videoId), description, title, view_count, author, and category.\n */\ninterface VideoMetadata {\n source: string;\n description?: string;\n title?: string;\n view_count?: number;\n author?: string;\n category?: string;\n}\n\n/**\n * A document loader for loading data from YouTube videos. It uses the\n * youtubei.js library to fetch the transcript and video metadata.\n * @example\n * ```typescript\n * const loader = new YoutubeLoader(\n * \"https:\n * \"en\",\n * true,\n * );\n * const docs = await loader.load();\n * ```\n */\nexport class YoutubeLoader extends BaseDocumentLoader {\n private videoId: string;\n\n private language?: string;\n\n private addVideoInfo: boolean;\n\n constructor(config: YoutubeConfig) {\n super();\n this.videoId = config.videoId;\n this.language = config?.language;\n this.addVideoInfo = config?.addVideoInfo ?? false;\n }\n\n /**\n * Extracts the videoId from a YouTube video URL.\n * @param url The URL of the YouTube video.\n * @returns The videoId of the YouTube video.\n */\n private static getVideoID(url: string): string {\n // YouTube video IDs are exactly 11 characters: alphanumeric, underscores, and hyphens\n // Using a bounded pattern to avoid ReDoS vulnerabilities\n const match = url.match(\n /(?:youtu\\.be\\/|youtube\\.com\\/(?:v\\/|u\\/\\w\\/|embed\\/|watch\\?v=|shorts\\/))([a-zA-Z0-9_-]{11})(?:[?&#]|$)/\n );\n if (match !== null) {\n return match[1];\n } else {\n throw new Error(\"Failed to get youtube video id from the url\");\n }\n }\n\n /**\n * Creates a new instance of the YoutubeLoader class from a YouTube video\n * URL.\n * @param url The URL of the YouTube video.\n * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId.\n * @returns A new instance of the YoutubeLoader class.\n */\n static createFromUrl(\n url: string,\n config?: Omit<YoutubeConfig, \"videoId\">\n ): YoutubeLoader {\n const videoId = YoutubeLoader.getVideoID(url);\n return new YoutubeLoader({ ...config, videoId });\n }\n\n /**\n * Loads the transcript and video metadata from the specified YouTube\n * video. It uses the youtubei.js library to fetch the video metadata and transcripts.\n * @returns An array of Documents representing the retrieved data.\n */\n async load(): Promise<Document[]> {\n let transcript: string | undefined;\n const metadata: VideoMetadata = {\n source: this.videoId,\n };\n try {\n const youtube = await Innertube.create({\n lang: this.language,\n retrieve_player: false,\n });\n const info = await youtube.getInfo(this.videoId);\n const transcriptData = await info.getTranscript();\n transcript =\n transcriptData.transcript.content?.body?.initial_segments\n .map((segment) => segment.snippet.text)\n .join(\" \") ?? \"\";\n if (transcript === undefined) {\n throw new Error(\"Transcription not found\");\n }\n if (this.addVideoInfo) {\n const basicInfo = info.basic_info;\n metadata.description = basicInfo.short_description;\n metadata.title = basicInfo.title;\n metadata.view_count = basicInfo.view_count;\n metadata.author = basicInfo.author;\n }\n } catch (e: unknown) {\n throw new Error(\n `Failed to get YouTube video transcription: ${(e as Error).message}`\n );\n }\n const document = new Document({\n pageContent: transcript,\n metadata,\n });\n\n return [document];\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAwCA,IAAa,gBAAb,MAAa,sBAAsBA,sCAAAA,mBAAmB;CACpD;CAEA;CAEA;CAEA,YAAY,QAAuB;AACjC,SAAO;AACP,OAAK,UAAU,OAAO;AACtB,OAAK,WAAW,QAAQ;AACxB,OAAK,eAAe,QAAQ,gBAAgB;;;;;;;CAQ9C,OAAe,WAAW,KAAqB;EAG7C,MAAM,QAAQ,IAAI,MAChB,yGACD;AACD,MAAI,UAAU,KACZ,QAAO,MAAM;MAEb,OAAM,IAAI,MAAM,8CAA8C;;;;;;;;;CAWlE,OAAO,cACL,KACA,QACe;EACf,MAAM,UAAU,cAAc,WAAW,IAAI;AAC7C,SAAO,IAAI,cAAc;GAAE,GAAG;GAAQ;GAAS,CAAC;;;;;;;CAQlD,MAAM,OAA4B;EAChC,IAAI;EACJ,MAAM,WAA0B,EAC9B,QAAQ,KAAK,SACd;AACD,MAAI;GAKF,MAAM,OAAO,OAJG,MAAMC,YAAAA,UAAU,OAAO;IACrC,MAAM,KAAK;IACX,iBAAiB;IAClB,CAAC,EACyB,QAAQ,KAAK,QAAQ;AAEhD,iBADuB,MAAM,KAAK,eAAe,EAEhC,WAAW,SAAS,MAAM,iBACtC,KAAK,YAAY,QAAQ,QAAQ,KAAK,CACtC,KAAK,IAAI,IAAI;AAClB,OAAI,eAAe,KAAA,EACjB,OAAM,IAAI,MAAM,0BAA0B;AAE5C,OAAI,KAAK,cAAc;IACrB,MAAM,YAAY,KAAK;AACvB,aAAS,cAAc,UAAU;AACjC,aAAS,QAAQ,UAAU;AAC3B,aAAS,aAAa,UAAU;AAChC,aAAS,SAAS,UAAU;;WAEvB,GAAY;AACnB,SAAM,IAAI,MACR,8CAA+C,EAAY,UAC5D;;AAOH,SAAO,CALU,IAAIC,0BAAAA,SAAS;GAC5B,aAAa;GACb;GACD,CAAC,CAEe"}