/*
 * t-youtube-transcript-fetcher
 * Version: (not recorded)
 * An enhanced TypeScript library for fetching YouTube transcripts with proxy support (based on youtube-transcript)
 * 163 lines (162 loc) • 6.27 kB
 * JavaScript
 */
import { CONSTANTS } from './constants.js';
import { TranscriptError, RateLimitError, VideoUnavailableError, TranscriptDisabledError, NoTranscriptError, LanguageNotFoundError, } from './errors.js';
import fetch from 'node-fetch';
import { HttpsProxyAgent } from 'https-proxy-agent';
import { URL } from 'url';
/**
 * Service class for fetching YouTube video transcripts.
 *
 * Every member is static. The pipeline is: normalise the video ID, download
 * the watch page, cut the embedded captions JSON out of the HTML, pick a
 * caption track, then download that track and split it into segments.
 */
export class YoutubeTranscript {
    /**
     * Fetches the transcript for a YouTube video.
     * @param videoId Video URL or ID
     * @param config Configuration options (`lang`, `proxy`, `proxyAgent` are read here)
     * @returns Array of transcript segments (`text`, `duration`, `offset`, `lang`)
     * @throws TranscriptError and its siblings from './errors.js' when the
     *         page, the captions data or the requested language is unusable
     */
    static async fetchTranscript(videoId, config) {
        const identifier = this.retrieveVideoId(videoId);
        const pageContent = await this.fetchVideoPage(identifier, config);
        // Errors are reported against the value the caller supplied (URL or ID).
        const captionsData = this.parseCaptionsData(pageContent, videoId);
        const transcriptUrl = this.getTranscriptUrl(captionsData, videoId, config?.lang);
        // When no language is requested the first track is the one fetched,
        // so its language code labels the returned segments.
        const defaultLang = captionsData.playerCaptionsTracklistRenderer.captionTracks[0].languageCode;
        return this.fetchAndParseTranscript(transcriptUrl, config?.lang, defaultLang, config);
    }

    /**
     * Creates fetch options with proxy configuration if provided.
     * @param config Optional configuration; `proxyAgent` takes precedence over `proxy`
     * @param extraHeaders Headers merged over the default `User-Agent`
     * @returns Options object with `headers` and, when proxying, `agent`
     */
    static getFetchOptions(config, extraHeaders = {}) {
        const options = {
            headers: {
                'User-Agent': CONSTANTS.USER_AGENT,
                ...extraHeaders,
            },
        };
        if (config?.proxyAgent) {
            // Use pre-configured proxy agent if provided
            options.agent = config.proxyAgent;
            return options;
        }
        if (config?.proxy) {
            // Otherwise, create a proxy agent from the proxy configuration
            const proxyUrl = new URL(config.proxy.host);
            if (config.proxy.auth) {
                proxyUrl.username = config.proxy.auth.username;
                proxyUrl.password = config.proxy.auth.password;
            }
            options.agent = new HttpsProxyAgent(proxyUrl.toString());
        }
        return options;
    }

    /**
     * Fetches the video page content.
     * @param videoId Video ID interpolated into the watch URL
     * @param config Optional configuration; `lang` is sent as `Accept-Language`
     * @returns The response body as text (the HTTP status is not inspected here)
     */
    static async fetchVideoPage(videoId, config) {
        const extraHeaders = config?.lang ? { 'Accept-Language': config.lang } : {};
        const options = this.getFetchOptions(config, extraHeaders);
        const response = await fetch(`https://www.youtube.com/watch?v=${videoId}`, options);
        return response.text();
    }

    /**
     * Extracts and validates captions data from the video page.
     * @param pageContent Watch page HTML
     * @param videoId Identifier used in error reporting
     * @returns Parsed captions object containing
     *          `playerCaptionsTracklistRenderer.captionTracks`
     * @throws RateLimitError | VideoUnavailableError | TranscriptDisabledError
     *         when the page has no `"captions":` section
     * @throws TranscriptDisabledError when the captions section is not valid JSON
     * @throws NoTranscriptError when the captions JSON carries no caption tracks
     */
    static parseCaptionsData(pageContent, videoId) {
        const htmlParts = pageContent.split('"captions":');
        if (htmlParts.length <= 1) {
            // handlePageErrors always throws, so execution never continues past it.
            this.handlePageErrors(pageContent, videoId);
        }
        const captionsData = this.extractCaptionsJson(htmlParts[1]);
        if (!captionsData) {
            throw new TranscriptDisabledError(videoId);
        }
        const renderer = captionsData.playerCaptionsTracklistRenderer;
        // The `in` operator throws a TypeError on a non-object operand, so a
        // missing renderer is reported as "no transcript" rather than crashing.
        const hasTracks = renderer !== null
            && typeof renderer === 'object'
            && 'captionTracks' in renderer;
        if (!hasTracks) {
            throw new NoTranscriptError(videoId);
        }
        return captionsData;
    }

    /**
     * Extracts captions JSON data from the page content.
     * @param captionsSection Page text following the `"captions":` marker
     * @returns The parsed value, or `undefined` when it cannot be parsed
     */
    static extractCaptionsJson(captionsSection) {
        try {
            const [rawJson] = captionsSection.split(',"videoDetails');
            // Strip every raw newline: a string pattern passed to replace()
            // would remove only the first one.
            return JSON.parse(rawJson.replace(/\n/g, ''));
        }
        catch {
            // Deliberate best-effort: the caller maps `undefined` to a typed error.
            return undefined;
        }
    }

    /**
     * Handles various error cases from the video page. Always throws.
     * @param pageContent Watch page HTML that has no captions section
     * @param videoId Identifier used in error reporting
     * @throws RateLimitError when the page contains a reCAPTCHA widget
     * @throws VideoUnavailableError when the page has no playability status
     * @throws TranscriptDisabledError otherwise
     */
    static handlePageErrors(pageContent, videoId) {
        if (pageContent.includes('class="g-recaptcha"')) {
            throw new RateLimitError();
        }
        if (!pageContent.includes('"playabilityStatus":')) {
            throw new VideoUnavailableError(videoId);
        }
        throw new TranscriptDisabledError(videoId);
    }

    /**
     * Gets the URL for the transcript in the requested language.
     * @param captionsData Parsed captions object
     * @param videoId Identifier used in error reporting
     * @param requestedLang Optional language code; the first track is used when omitted
     * @returns The selected track's `baseUrl`
     * @throws LanguageNotFoundError when `requestedLang` matches no track
     * @throws NoTranscriptError when there is no track to select
     */
    static getTranscriptUrl(captionsData, videoId, requestedLang) {
        if (requestedLang) {
            this.validateLanguageAvailability(captionsData, requestedLang, videoId);
        }
        const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
        const track = requestedLang
            ? tracks.find((candidate) => candidate.languageCode === requestedLang)
            : tracks[0];
        if (!track) {
            throw new NoTranscriptError(videoId);
        }
        return track.baseUrl;
    }

    /**
     * Validates that the requested language is available.
     * @param captionsData Parsed captions object
     * @param lang Language code to look for
     * @param videoId Identifier used in error reporting
     * @throws LanguageNotFoundError listing the language codes that do exist
     */
    static validateLanguageAvailability(captionsData, lang, videoId) {
        const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
        const isLanguageAvailable = tracks.some((track) => track.languageCode === lang);
        if (!isLanguageAvailable) {
            const availableLanguages = tracks.map((track) => track.languageCode);
            throw new LanguageNotFoundError(lang, availableLanguages, videoId);
        }
    }

    /**
     * Fetches and parses the transcript XML.
     * @param transcriptUrl Caption track URL
     * @param requestedLang Optional language code, sent as `Accept-Language`
     * @param defaultLang Language code used for labelling when none was requested
     * @param config Optional configuration forwarded to getFetchOptions
     * @returns One segment per match of CONSTANTS.TRANSCRIPT_XML_REGEX
     * @throws NoTranscriptError when the response status is not OK
     */
    static async fetchAndParseTranscript(transcriptUrl, requestedLang, defaultLang, config) {
        const extraHeaders = requestedLang ? { 'Accept-Language': requestedLang } : {};
        const options = this.getFetchOptions(config, extraHeaders);
        const response = await fetch(transcriptUrl, options);
        if (!response.ok) {
            throw new NoTranscriptError(transcriptUrl);
        }
        const transcriptText = await response.text();
        const lang = requestedLang ?? defaultLang;
        // Capture groups are read as: 1 = offset, 2 = duration, 3 = text.
        return Array.from(
            transcriptText.matchAll(CONSTANTS.TRANSCRIPT_XML_REGEX),
            (match) => ({
                text: match[3],
                duration: parseFloat(match[2]),
                offset: parseFloat(match[1]),
                lang,
            }),
        );
    }

    /**
     * Extracts the video ID from either a full URL or direct ID.
     * @param videoId Video URL or ID; any 11-character string is taken as an ID
     * @returns The video ID
     * @throws TranscriptError when the input is not a string or no ID can be found
     */
    static retrieveVideoId(videoId) {
        if (typeof videoId !== 'string') {
            // Fail with the library's own error type instead of a TypeError on `.length`.
            throw new TranscriptError('Could not extract YouTube video ID from the provided string');
        }
        if (videoId.length === 11) {
            return videoId;
        }
        const match = videoId.match(CONSTANTS.VIDEO_ID_REGEX);
        if (match?.[1]) {
            return match[1];
        }
        throw new TranscriptError('Could not extract YouTube video ID from the provided string');
    }
}