@rolme/ytscript
Version:
A CLI tool to download YouTube transcripts and generate summaries
93 lines (92 loc) • 3.18 kB
JavaScript
import { TranscriptError } from '../../errors.js';
/**
 * Reads the timing attributes of a caption `<text>` opening tag.
 *
 * The tag must carry `start="…"` immediately followed by `dur="…"`, each made
 * of digits and dots; the first such tag found anywhere in `text` is used.
 *
 * @param {string} text - Markup to search.
 * @returns {{offset: number, duration: number} | null} The parsed `start`
 *   (as `offset`) and `dur` (as `duration`), or `null` when no tag matches.
 */
function parseTimestamp(text) {
    const found = text.match(/<text start="([0-9.]+)" dur="([0-9.]+)"/);
    if (found === null) {
        return null;
    }
    const [, startAttr, durAttr] = found;
    return { offset: parseFloat(startAttr), duration: parseFloat(durAttr) };
}
/**
 * Decodes the character references found in caption text and trims the result.
 *
 * Handles the five predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`) plus decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric
 * references. Unknown named entities and out-of-range code points are left
 * as written.
 *
 * @param {string} text - Text that may contain character references.
 * @returns {string} The decoded text with surrounding whitespace removed.
 */
function decodeHtmlEntities(text) {
    const namedEntities = new Map([
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
    ]);
    // One pass over the input: the output of a replacement is never rescanned,
    // so "&amp;lt;" becomes "&lt;" rather than being decoded twice into "<".
    const decoded = text.replace(
        /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
        (entity, decimal, hex, name) => {
            if (name !== undefined) {
                return namedEntities.has(name) ? namedEntities.get(name) : entity;
            }
            const codePoint = decimal !== undefined
                ? Number.parseInt(decimal, 10)
                : Number.parseInt(hex, 16);
            // String.fromCodePoint throws above U+10FFFF; keep such input verbatim.
            return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
        }
    );
    return decoded.trim();
}
/**
 * Converts a caption XML document into a list of timed text segments.
 *
 * Every `<text …>…</text>` element is located wherever it sits in the
 * document, so the result does not depend on how the XML is broken into
 * lines: a document delivered on a single line and caption text that itself
 * contains newlines are both handled. Elements whose timing attributes cannot
 * be read by `parseTimestamp`, and elements with no text, are skipped.
 *
 * @param {string} xml - Raw caption XML.
 * @returns {Array<{text: string, offset: number, duration: number}>} Segments
 *   in document order, with entity-decoded text.
 */
function parseXmlToSegments(xml) {
    const segments = [];
    // Either a self-closing <text …/> (no content) or <text …>content</text>.
    const elementPattern = /<text\b[^>]*?(?:\/>|>([\s\S]*?)<\/text>)/g;
    for (const element of xml.matchAll(elementPattern)) {
        const timing = parseTimestamp(element[0]);
        if (!timing)
            continue;
        const inner = element[1] || '';
        const text = inner.replace(/<\/?[^>]+(>|$)/g, '').trim(); // Remove nested XML tags
        if (text) {
            segments.push({
                text: decodeHtmlEntities(text),
                ...timing
            });
        }
    }
    return segments;
}
/**
 * Downloads the caption track for a video and returns it as plain text plus
 * timed segments.
 *
 * @param {object} videoInfo - Video metadata. Caption tracks are read from
 *   `player_response.captions.playerCaptionsTracklistRenderer.captionTracks`
 *   and the id from `videoDetails.videoId`.
 * @param {object} [options] - Optional settings.
 * @param {string} [options.lang] - Caption language code to select; falls
 *   back to `'en'` when missing or falsy. Only an exact `languageCode` match
 *   is accepted.
 * @returns {Promise<{transcript: string, segments: Array, videoId: *}>}
 *   `transcript` is the segment texts joined with newlines. Both `transcript`
 *   and `segments` are empty when the downloaded body does not contain
 *   `<transcript>` and `</transcript>`, or when parsing it throws.
 * @throws {TranscriptError} When captions, tracks or the requested language
 *   are missing, when the download response is not ok, or wrapping any other
 *   error raised along the way.
 */
export async function getTranscript(videoInfo, options = {}) {
    try {
        const captions = videoInfo.player_response.captions;
        const renderer = captions && captions.playerCaptionsTracklistRenderer;
        if (!renderer) {
            throw new TranscriptError('No captions available for this video');
        }

        const captionTracks = renderer.captionTracks;
        if (!captionTracks || captionTracks.length === 0) {
            throw new TranscriptError('No caption tracks found');
        }

        // Requested language, or English when none was given.
        const targetLang = options.lang || 'en';
        const selected = captionTracks.find((candidate) => candidate.languageCode === targetLang);
        if (!selected) {
            throw new TranscriptError(`No captions found for language: ${targetLang}`);
        }

        // Fetch the selected caption track.
        const response = await fetch(selected.baseUrl);
        if (!response.ok) {
            throw new TranscriptError(`Failed to download caption track: ${response.statusText}`);
        }
        const rawText = await response.text();

        // Start from the empty result; it is only replaced when the body looks
        // like a transcript document and parses without throwing.
        let segments = [];
        let transcript = '';
        const looksLikeXml = rawText.includes('<transcript>') && rawText.includes('</transcript>');
        if (looksLikeXml) {
            try {
                const parsed = parseXmlToSegments(rawText);
                transcript = parsed.map((segment) => segment.text).join('\n');
                segments = parsed;
            }
            catch (parseFailure) {
                // Best effort: a parse failure leaves the empty result in place.
            }
        }

        return {
            transcript,
            segments,
            videoId: videoInfo.videoDetails.videoId
        };
    }
    catch (error) {
        // Domain errors pass through untouched; anything else is wrapped.
        if (!(error instanceof TranscriptError)) {
            const reason = error instanceof Error ? error.message : 'Unknown error occurred';
            throw new TranscriptError(`Failed to download transcript: ${reason}`);
        }
        throw error;
    }
}