UNPKG

youtube-transcript-node

Version:

This is a Node.js API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and supports translating subtitles.

155 lines (154 loc) 6.63 kB
import { decode } from 'html-entities'; import { WATCH_URL, INNERTUBE_API_URL, INNERTUBE_CONTEXT } from './utils/constants.js'; import { TranscriptsDisabled, YouTubeDataUnparsable, YouTubeRequestFailed, InvalidVideoId, VideoUnavailable, VideoUnplayable, IpBlocked, RequestBlocked, AgeRestricted, FailedToCreateConsentCookie } from './errors/index.js'; import { Transcript, TranscriptList } from './transcript.js'; const PlayabilityStatus = { OK: 'OK', ERROR: 'ERROR', LOGIN_REQUIRED: 'LOGIN_REQUIRED' }; const PlayabilityFailedReason = { BOT_DETECTED: 'Sign in to confirm you\'re not a bot', AGE_RESTRICTED: 'This video may be inappropriate for some users.', VIDEO_UNAVAILABLE: 'This video is unavailable' }; export class TranscriptListFetcher { _httpClient; _proxyConfig; constructor(httpClient, proxyConfig = null) { this._httpClient = httpClient; this._proxyConfig = proxyConfig; } async fetch(videoId) { const captionsJson = await this._fetchCaptionsJson(videoId); return TranscriptList.build(this._httpClient, videoId, captionsJson); } async _fetchCaptionsJson(videoId, tryNumber = 0) { try { const html = await this._fetchVideoHtml(videoId); const apiKey = this._extractInnertubeApiKey(html, videoId); const innertubeData = await this._fetchInnertubeData(videoId, apiKey); return this._extractCaptionsJson(innertubeData, videoId); } catch (error) { if (error instanceof RequestBlocked && this._proxyConfig) { const retries = this._proxyConfig.retriesWhenBlocked || 0; if (tryNumber + 1 < retries) { return this._fetchCaptionsJson(videoId, tryNumber + 1); } } throw error; } } _extractInnertubeApiKey(html, videoId) { const pattern = /"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/; const match = html.match(pattern); if (match && match[1]) { return match[1]; } if (html.includes('class="g-recaptcha"')) { throw new IpBlocked(videoId); } throw new YouTubeDataUnparsable(videoId); } _extractCaptionsJson(innertubeData, videoId) { this._assertPlayability(innertubeData.playabilityStatus, videoId); const captionsJson = innertubeData.captions?.playerCaptionsTracklistRenderer; if (!captionsJson || !captionsJson.captionTracks) { throw new TranscriptsDisabled(videoId); } return captionsJson; } _assertPlayability(playabilityStatusData, videoId) { if (!playabilityStatusData) return; const playabilityStatus = playabilityStatusData.status; if (playabilityStatus === PlayabilityStatus.OK || !playabilityStatus) { return; } const reason = playabilityStatusData.reason || ''; if (playabilityStatus === PlayabilityStatus.LOGIN_REQUIRED) { if (reason === PlayabilityFailedReason.BOT_DETECTED) { throw new RequestBlocked(videoId); } if (reason === PlayabilityFailedReason.AGE_RESTRICTED) { throw new AgeRestricted(videoId); } } if (playabilityStatus === PlayabilityStatus.ERROR && reason === PlayabilityFailedReason.VIDEO_UNAVAILABLE) { if (videoId.startsWith('http://') || videoId.startsWith('https://')) { throw new InvalidVideoId(videoId); } throw new VideoUnavailable(videoId); } const subReasons = playabilityStatusData.errorScreen ?.playerErrorMessageRenderer ?.subreason ?.runs ?.map(run => run.text || '') || []; throw new VideoUnplayable(videoId, reason, subReasons); } async _createConsentCookie(html, videoId) { const match = html.match(/name="v" value="(.*?)"/); if (!match) { throw new FailedToCreateConsentCookie(videoId); } // Set cookie using the http client's cookie jar this._httpClient.defaults.headers.Cookie = `CONSENT=YES+${match[1]}`; } async _fetchVideoHtml(videoId) { let html = await this._fetchHtml(videoId); if (html.includes('action="https://consent.youtube.com/s"')) { await this._createConsentCookie(html, videoId); html = await this._fetchHtml(videoId); if (html.includes('action="https://consent.youtube.com/s"')) { throw new FailedToCreateConsentCookie(videoId); } } return html; } async _fetchHtml(videoId) { try { const response = await this._httpClient.get(WATCH_URL.replace('{video_id}', videoId)); return decode(response.data); } catch (error) { const axiosError = error; if (axiosError.response?.status === 429) { throw new IpBlocked(videoId); } throw new YouTubeRequestFailed(videoId, error); } } async _fetchInnertubeData(videoId, apiKey) { try { const response = await this._httpClient.post(INNERTUBE_API_URL.replace('{api_key}', apiKey), { context: INNERTUBE_CONTEXT, videoId: videoId }); return response.data; } catch (error) { const axiosError = error; if (axiosError.response?.status === 429) { throw new IpBlocked(videoId); } throw new YouTubeRequestFailed(videoId, error); } } } // Static build method for TranscriptList TranscriptList.build = function (httpClient, videoId, captionsJson) { const translationLanguages = (captionsJson.translationLanguages || []).map(lang => ({ language: lang.languageName.runs[0].text, languageCode: lang.languageCode })); const manuallyCreatedTranscripts = {}; const generatedTranscripts = {}; for (const caption of captionsJson.captionTracks) { const transcriptDict = caption.kind === 'asr' ? generatedTranscripts : manuallyCreatedTranscripts; transcriptDict[caption.languageCode] = new Transcript(httpClient, videoId, caption.baseUrl.replace('&fmt=srv3', ''), caption.name.runs[0].text, caption.languageCode, caption.kind === 'asr', caption.isTranslatable ? translationLanguages : []); } return new TranscriptList(videoId, manuallyCreatedTranscripts, generatedTranscripts, translationLanguages); };