headless-youtube-captions
Version:
Extract YouTube video transcripts, channel videos, comments, and comprehensive video metadata using headless browser automation
243 lines (200 loc) • 7.9 kB
JavaScript
import { createBrowser, createPage, handleCookieConsent } from './utils/browser.js';
import { scrollAndWaitForMore } from './utils/scroll.js';
import { extractVideoData, extractChannelInfo } from './utils/extract.js';
/**
 * Build the URL of a channel's "Videos" tab from a channel reference.
 *
 * Accepted forms:
 * - Full URL (starts with "http"): "/videos" is appended to the path unless the
 *   path already ends with it. A query string or fragment stays after the path.
 * - Handle (starts with "@"): https://youtube.com/<handle>/videos
 * - Channel ID (starts with "UC"): https://youtube.com/channel/<id>/videos
 * - Anything else is treated as a custom name: https://youtube.com/c/<name>/videos
 *
 * @param {string} channelURL - Channel reference in one of the forms above.
 * @returns {string} URL of the channel's videos tab.
 */
function buildChannelVideosURL(channelURL) {
  if (!channelURL.startsWith('http')) {
    if (channelURL.startsWith('@')) {
      return `https://youtube.com/${channelURL}/videos`;
    }
    if (channelURL.startsWith('UC')) {
      return `https://youtube.com/channel/${channelURL}/videos`;
    }
    return `https://youtube.com/c/${channelURL}/videos`;
  }

  // Split off "?query" / "#fragment" so "/videos" lands on the path, not after them.
  const suffixStart = channelURL.search(/[?#]/);
  const base = suffixStart === -1 ? channelURL : channelURL.slice(0, suffixStart);
  const suffix = suffixStart === -1 ? '' : channelURL.slice(suffixStart);
  const path = base.replace(/\/$/, '');

  // Only the END of the path counts: a channel named e.g. "/c/videosgalore"
  // contains "/videos" but is not on the videos tab yet.
  if (path.endsWith('/videos')) {
    return channelURL;
  }
  return `${path}/videos${suffix}`;
}

/**
 * Load a channel's videos tab in a headless browser and extract its videos.
 *
 * The page is scrolled repeatedly until at least `limit` videos are present or
 * no further videos appear. Progress messages are written with console.error.
 *
 * @param {object} options
 * @param {string} options.channelURL - Full channel URL, "@handle", "UC..." channel ID,
 *   or custom channel name.
 * @param {number} [options.limit=30] - Maximum number of videos to return.
 * @param {?string} [options.pageToken=null] - Accepted for interface compatibility;
 *   not used by this function.
 * @returns {Promise<{channel: *, videos: Array, totalLoaded: number, hasMore: boolean}>}
 *   `channel` is whatever extractChannelInfo returns; `videos` holds at most `limit`
 *   entries from extractVideoData; `totalLoaded` is the number of videos found on the
 *   page; `hasMore` is true when more than `limit` videos were loaded.
 * @throws Rethrows any navigation or extraction error after logging it. The browser
 *   is closed in every case.
 */
export async function getChannelVideos({ channelURL, limit = 30, pageToken = null }) {
  const browser = await createBrowser();
  try {
    const page = await createPage(browser);

    const fullURL = buildChannelVideosURL(channelURL);
    console.error(`Navigating to ${fullURL}`);
    await page.goto(fullURL, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    // Handle cookie consent
    await handleCookieConsent(page);

    // Wait a bit for dynamic content
    await new Promise(resolve => setTimeout(resolve, 3000));

    // Wait for initial videos to load
    await page.waitForSelector('ytd-rich-item-renderer', { timeout: 30000 });
    console.error('Initial videos loaded');

    // Extract channel info
    const channelInfo = await extractChannelInfo(page);

    let allVideos = [];
    let previousCount = 0;

    // Load videos up to the limit
    while (allVideos.length < limit) {
      allVideos = await extractVideoData(page);
      if (allVideos.length === previousCount) {
        // Extraction found nothing new since the last pass
        break;
      }
      previousCount = allVideos.length;
      if (previousCount >= limit) {
        break;
      }

      // Try to load more videos
      const newCount = await scrollAndWaitForMore(page, 'ytd-rich-item-renderer', previousCount);
      if (newCount === previousCount) {
        break; // No new videos loaded
      }
    }

    // Trim to requested limit
    const resultVideos = allVideos.slice(0, limit);
    console.error(`Successfully extracted ${resultVideos.length} videos`);

    return {
      channel: channelInfo,
      videos: resultVideos,
      totalLoaded: allVideos.length,
      hasMore: allVideos.length > limit
    };
  } catch (error) {
    console.error('Error extracting channel videos:', error);
    throw error;
  } finally {
    await browser.close();
  }
}
/**
 * Click the search button in a channel page's header.
 *
 * Tries a list of known selectors first and clicks the first match that has a
 * non-zero bounding box. If none works, falls back to scanning every `yt-icon`
 * element for the search icon and clicking its enclosing button.
 *
 * @param {object} page - Browser page (as returned by createPage) showing a channel.
 * @returns {Promise<boolean>} true if a search button was clicked.
 */
async function clickChannelSearchButton(page) {
  const searchButtonSelectors = [
    'ytd-channel-header-renderer yt-icon-button[aria-label*="Search"]',
    'ytd-channel-header-renderer button[aria-label*="Search"]',
    '#channel-header yt-icon-button[aria-label*="Search"]',
    'yt-icon[icon="yt-icons:search"]'
  ];

  for (const selector of searchButtonSelectors) {
    try {
      const searchButton = await page.$(selector);
      if (!searchButton) {
        continue;
      }
      const isVisible = await searchButton.evaluate(el => {
        const rect = el.getBoundingClientRect();
        return rect.width > 0 && rect.height > 0;
      });
      if (isVisible) {
        await searchButton.click();
        console.error('Clicked search button');
        return true;
      }
    } catch (e) {
      // Best effort: report the failure and try the next selector.
      console.error(`Search button selector "${selector}" failed: ${e?.message ?? e}`);
    }
  }

  // Try clicking on the search icon itself
  const clicked = await page.evaluate(() => {
    const icons = document.querySelectorAll('yt-icon');
    for (const icon of icons) {
      if (icon.getAttribute('icon') === 'yt-icons:search') {
        const button = icon.closest('button') || icon.closest('yt-icon-button');
        if (button) {
          button.click();
          return true;
        }
      }
    }
    return false;
  });
  if (clicked) {
    console.error('Clicked search icon');
  }
  return clicked;
}

/**
 * Search within a channel using the channel page's own search box and return
 * the videos listed in the results.
 *
 * Only results present on the page after the search loads are read; the page is
 * not scrolled for more. Progress messages are written with console.error.
 *
 * @param {object} options
 * @param {string} options.channelURL - Full channel URL, "@handle", "UC..." channel ID,
 *   or custom channel name. A trailing "/videos" on a full URL is removed.
 * @param {string} options.query - Search text; must be a non-empty string.
 * @param {number} [options.limit=30] - Maximum number of results to return.
 * @returns {Promise<{query: string, results: Array<{id: string, title: string,
 *   views: string, uploadTime: string, duration: string, thumbnail: string,
 *   url: string}>, totalFound: number}>} `results` holds at most `limit` entries;
 *   `totalFound` counts all entries extracted from the page. Text fields are the
 *   trimmed text shown on the page.
 * @throws {Error} If `query` is not a non-empty string (checked before a browser
 *   is launched), if no channel search button can be found, or if navigation or
 *   a selector wait fails. Browser-side errors are logged and rethrown.
 */
export async function searchChannelVideos({ channelURL, query, limit = 30 }) {
  // Fail fast: typing an empty query would leave the channel page unchanged and
  // its existing videos would be reported as search results.
  if (typeof query !== 'string' || query.trim() === '') {
    throw new Error('searchChannelVideos requires a non-empty string query');
  }

  const browser = await createBrowser();
  try {
    const page = await createPage(browser);

    // Navigate to channel page
    let fullURL;
    if (channelURL.startsWith('http')) {
      // Remove /videos if present to get to main channel page
      fullURL = channelURL.replace(/\/videos\/?$/, '');
    } else if (channelURL.startsWith('@')) {
      fullURL = `https://youtube.com/${channelURL}`;
    } else if (channelURL.startsWith('UC')) {
      fullURL = `https://youtube.com/channel/${channelURL}`;
    } else {
      fullURL = `https://youtube.com/c/${channelURL}`;
    }

    console.error(`Navigating to ${fullURL}`);
    await page.goto(fullURL, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    // Handle cookie consent
    await handleCookieConsent(page);

    // Wait for page to load
    await new Promise(resolve => setTimeout(resolve, 3000));

    const searchClicked = await clickChannelSearchButton(page);
    if (!searchClicked) {
      throw new Error('Could not find channel search button');
    }

    // Wait for search input to appear
    await page.waitForSelector('input[placeholder*="Search"]', { timeout: 5000 });

    // Type search query
    await page.type('input[placeholder*="Search"]', query);
    await page.keyboard.press('Enter');

    // Wait for search results
    await new Promise(resolve => setTimeout(resolve, 3000));
    await page.waitForSelector('ytd-video-renderer, ytd-rich-item-renderer', { timeout: 10000 });

    // Extract search results. This callback is evaluated in the page, so it
    // must not reference anything from the enclosing scope.
    const searchResults = await page.evaluate(() => {
      // Try different selectors for search results
      let videos = document.querySelectorAll('ytd-video-renderer');
      if (videos.length === 0) {
        videos = document.querySelectorAll('ytd-rich-item-renderer');
      }

      return Array.from(videos).map(video => {
        // Extract video ID
        const link = video.querySelector('a#video-title, a#video-title-link');
        const href = link ? link.href : '';
        const videoId = href.match(/watch\?v=([^&]+)/)?.[1] || '';

        // Extract title
        const titleElement = video.querySelector('#video-title');
        const title = titleElement ? titleElement.textContent.trim() : '';

        // Extract metadata (trimmed, like title and duration)
        const viewsElement = video.querySelector('#metadata-line span:first-child, .view-count');
        const views = viewsElement ? viewsElement.textContent.trim() : '';
        const timeElement = video.querySelector('#metadata-line span:last-child, .published-time');
        const uploadTime = timeElement ? timeElement.textContent.trim() : '';

        // Extract duration
        const durationElement = video.querySelector('ytd-thumbnail-overlay-time-status-renderer span, .video-time');
        const duration = durationElement ? durationElement.textContent.trim() : '';

        // Extract thumbnail
        const thumbnail = video.querySelector('img#img')?.src || '';

        return {
          id: videoId,
          title,
          views,
          uploadTime,
          duration,
          thumbnail,
          url: `https://youtube.com/watch?v=${videoId}`
        };
      }).filter(video => video.id && video.title); // drop entries without a watch link or title
    });

    console.error(`Found ${searchResults.length} videos matching "${query}"`);

    return {
      query,
      results: searchResults.slice(0, limit),
      totalFound: searchResults.length
    };
  } catch (error) {
    console.error('Error searching channel videos:', error);
    throw error;
  } finally {
    await browser.close();
  }
}