UNPKG

t-youtube-transcript-fetcher

Version:

An enhanced TypeScript library for fetching YouTube transcripts with proxy support (based on youtube-transcript)

163 lines (162 loc) 6.27 kB
import { CONSTANTS } from './constants.js';
import {
  TranscriptError,
  RateLimitError,
  VideoUnavailableError,
  TranscriptDisabledError,
  NoTranscriptError,
  LanguageNotFoundError,
} from './errors.js';
import fetch from 'node-fetch';
import { HttpsProxyAgent } from 'https-proxy-agent';
import { URL } from 'url';

/**
 * Service class for fetching YouTube video transcripts.
 *
 * All members are static. The flow implemented by `fetchTranscript` is:
 * resolve the video ID, download the watch page, cut the `"captions":` JSON
 * out of the page, pick a caption track, then download and parse that track.
 */
export class YoutubeTranscript {
  /**
   * Fetches the transcript for a YouTube video.
   *
   * @param videoId Video URL or ID
   * @param config Optional configuration; this class reads `lang`,
   *   `proxyAgent` and `proxy` from it
   * @returns Array of transcript segments (`text`, `duration`, `offset`, `lang`)
   * @throws TranscriptError when no video ID can be extracted from `videoId`
   * @throws RateLimitError, VideoUnavailableError, TranscriptDisabledError,
   *   NoTranscriptError, LanguageNotFoundError as raised by the helper steps
   */
  static async fetchTranscript(videoId, config) {
    const identifier = this.retrieveVideoId(videoId);
    const pageContent = await this.fetchVideoPage(identifier, config);
    const captionsData = this.parseCaptionsData(pageContent, videoId);
    const requestedLang = config?.lang;
    const transcriptUrl = this.getTranscriptUrl(captionsData, videoId, requestedLang);
    // getTranscriptUrl has already thrown if there is no usable track, so the
    // first track exists here; it is the one used when no language is requested.
    const defaultLang = captionsData.playerCaptionsTracklistRenderer.captionTracks[0].languageCode;
    return this.fetchAndParseTranscript(transcriptUrl, requestedLang, defaultLang, config, videoId);
  }

  /**
   * Creates fetch options with proxy configuration if provided.
   *
   * @param config Optional configuration. `config.proxyAgent` is used as-is
   *   when present; otherwise `config.proxy` (`host`, optional `auth`) is
   *   turned into an `HttpsProxyAgent`
   * @param extraHeaders Headers merged over the default `User-Agent` header
   * @returns Options object with `headers` and, when a proxy is configured, `agent`
   */
  static getFetchOptions(config, extraHeaders = {}) {
    const headers = {
      'User-Agent': CONSTANTS.USER_AGENT,
      ...extraHeaders,
    };
    const options = { headers };

    if (config?.proxyAgent) {
      // Use pre-configured proxy agent if provided
      options.agent = config.proxyAgent;
    } else if (config?.proxy) {
      // Otherwise, create a proxy agent from the proxy configuration
      const proxyUrl = new URL(config.proxy.host);
      if (config.proxy.auth) {
        proxyUrl.username = config.proxy.auth.username;
        proxyUrl.password = config.proxy.auth.password;
      }
      options.agent = new HttpsProxyAgent(proxyUrl.toString());
    }

    return options;
  }

  /**
   * Fetches the video page content.
   *
   * @param videoId Video ID placed into the watch URL
   * @param config Optional configuration; `config.lang` is sent as `Accept-Language`
   * @returns The response body as text
   * @throws RateLimitError when the server answers with HTTP 429
   */
  static async fetchVideoPage(videoId, config) {
    const extraHeaders = {};
    if (config?.lang) {
      extraHeaders['Accept-Language'] = config.lang;
    }
    const options = this.getFetchOptions(config, extraHeaders);
    const response = await fetch(`https://www.youtube.com/watch?v=${videoId}`, options);
    // Surface throttling explicitly instead of letting the page parser
    // misreport it as an unavailable video.
    if (response.status === 429) {
      throw new RateLimitError();
    }
    return response.text();
  }

  /**
   * Extracts and validates captions data from the video page.
   *
   * @param pageContent Watch page body
   * @param videoId Identifier passed to the errors thrown here
   * @returns The parsed captions object; its
   *   `playerCaptionsTracklistRenderer.captionTracks` is guaranteed to be an array
   * @throws RateLimitError | VideoUnavailableError | TranscriptDisabledError
   *   when the page has no `"captions":` section (see `handlePageErrors`)
   * @throws TranscriptDisabledError when the captions JSON cannot be parsed or
   *   has no `playerCaptionsTracklistRenderer` object
   * @throws NoTranscriptError when `captionTracks` is missing or not an array
   */
  static parseCaptionsData(pageContent, videoId) {
    const htmlParts = pageContent.split('"captions":');
    if (htmlParts.length <= 1) {
      // Always throws; control never continues past this call.
      this.handlePageErrors(pageContent, videoId);
    }

    const captionsData = this.extractCaptionsJson(htmlParts[1]);
    const renderer = captionsData?.playerCaptionsTracklistRenderer;
    // Guard before inspecting the renderer: the `in` operator throws a
    // TypeError on undefined or primitive operands.
    if (renderer === null || typeof renderer !== 'object') {
      throw new TranscriptDisabledError(videoId);
    }
    if (!Array.isArray(renderer.captionTracks)) {
      throw new NoTranscriptError(videoId);
    }
    return captionsData;
  }

  /**
   * Extracts captions JSON data from the page content.
   *
   * @param captionsSection Page text following the `"captions":` marker
   * @returns The parsed value, or `undefined` when parsing fails
   */
  static extractCaptionsJson(captionsSection) {
    try {
      const jsonStr = captionsSection
        .split(',"videoDetails')[0]
        // Strip every newline; a string pattern would only remove the first one.
        .replace(/\n/g, '');
      return JSON.parse(jsonStr);
    } catch {
      // Deliberate best effort: the caller maps `undefined` to a typed error.
      return undefined;
    }
  }

  /**
   * Handles various error cases from the video page. Always throws.
   *
   * @param pageContent Watch page body
   * @param videoId Identifier passed to the errors thrown here
   * @throws RateLimitError when the page contains a reCAPTCHA element
   * @throws VideoUnavailableError when the page has no `"playabilityStatus":` key
   * @throws TranscriptDisabledError in every other case
   */
  static handlePageErrors(pageContent, videoId) {
    if (pageContent.includes('class="g-recaptcha"')) {
      throw new RateLimitError();
    }
    if (!pageContent.includes('"playabilityStatus":')) {
      throw new VideoUnavailableError(videoId);
    }
    throw new TranscriptDisabledError(videoId);
  }

  /**
   * Gets the URL for the transcript in the requested language.
   *
   * @param captionsData Parsed captions object
   * @param videoId Identifier passed to the errors thrown here
   * @param requestedLang Language code to select; when falsy the first track is used
   * @returns The `baseUrl` of the selected track
   * @throws LanguageNotFoundError when `requestedLang` matches no track
   * @throws NoTranscriptError when no track can be selected
   */
  static getTranscriptUrl(captionsData, videoId, requestedLang) {
    if (requestedLang) {
      this.validateLanguageAvailability(captionsData, requestedLang, videoId);
    }

    const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
    const track = requestedLang
      ? tracks.find((candidate) => candidate.languageCode === requestedLang)
      : tracks[0];
    if (!track) {
      throw new NoTranscriptError(videoId);
    }
    return track.baseUrl;
  }

  /**
   * Validates that the requested language is available.
   *
   * @param captionsData Parsed captions object
   * @param lang Language code that must match a track's `languageCode`
   * @param videoId Identifier passed to the error thrown here
   * @throws LanguageNotFoundError carrying the list of available language codes
   */
  static validateLanguageAvailability(captionsData, lang, videoId) {
    const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
    const isLanguageAvailable = tracks.some((track) => track.languageCode === lang);
    if (!isLanguageAvailable) {
      const availableLanguages = tracks.map((track) => track.languageCode);
      throw new LanguageNotFoundError(lang, availableLanguages, videoId);
    }
  }

  /**
   * Fetches and parses the transcript XML.
   *
   * @param transcriptUrl URL of the caption track
   * @param requestedLang Language requested by the caller, if any; sent as
   *   `Accept-Language` and reported on each segment
   * @param defaultLang Language reported on each segment when none was requested
   * @param config Optional configuration forwarded to `getFetchOptions`
   * @param videoId Optional identifier reported in the error; falls back to
   *   `transcriptUrl` when omitted, which keeps older call sites working
   * @returns One object per regex match with `text`, `duration`, `offset`, `lang`
   * @throws NoTranscriptError when the response status is not OK
   */
  static async fetchAndParseTranscript(transcriptUrl, requestedLang, defaultLang, config, videoId) {
    const extraHeaders = {};
    if (requestedLang) {
      extraHeaders['Accept-Language'] = requestedLang;
    }
    const options = this.getFetchOptions(config, extraHeaders);
    const response = await fetch(transcriptUrl, options);
    if (!response.ok) {
      throw new NoTranscriptError(videoId ?? transcriptUrl);
    }

    const transcriptText = await response.text();
    // An empty-string language is treated as "not requested" when the track is
    // chosen, so apply the same rule when labelling the segments.
    const lang = requestedLang || defaultLang;
    const matches = [...transcriptText.matchAll(CONSTANTS.TRANSCRIPT_XML_REGEX)];
    return matches.map((match) => ({
      text: match[3],
      duration: parseFloat(match[2]),
      offset: parseFloat(match[1]),
      lang,
    }));
  }

  /**
   * Extracts the video ID from either a full URL or direct ID.
   *
   * @param videoId An 11-character string (returned unchanged) or a string
   *   matched against `CONSTANTS.VIDEO_ID_REGEX`
   * @returns The video ID
   * @throws TranscriptError when the input is not a string or no ID can be extracted
   */
  static retrieveVideoId(videoId) {
    if (typeof videoId === 'string') {
      if (videoId.length === 11) {
        return videoId;
      }
      const match = videoId.match(CONSTANTS.VIDEO_ID_REGEX);
      if (match?.[1]) {
        return match[1];
      }
    }
    throw new TranscriptError('Could not extract YouTube video ID from the provided string');
  }
}