UNPKG

@phunky/scrape-channel-listings

Version:

A TypeScript library for scraping TV channel listings from various providers

github.com/phunky/scrape-channel-listings

phunky/scrape-channel-listings

232 lines (231 loc) • 8 kB

JavaScript

"use strict";
/**
 * Core scraping utility module for channel listing extraction.
 *
 * Provides the shared plumbing used by provider-specific scrapers:
 * - Browser setup and guaranteed cleanup (Playwright / Chromium)
 * - Request interception that aborts heavy resource types
 * - Retry with exponential backoff around the scrape callback
 * - Channel name normalization and override mapping
 * - Optional JSON file output
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runScraper = runScraper;
exports.runScraperCLI = runScraperCLI;
const playwright_1 = __importDefault(require("playwright"));
const random_useragent_1 = __importDefault(require("random-useragent"));
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));

/**
 * Reads an integer from an environment variable.
 * Falls back when the variable is unset, empty, not parseable as an integer,
 * or below `min`, so a malformed value can never leak `NaN` into the config.
 * @param {string} name - Environment variable name
 * @param {number} fallback - Value used when the variable is unusable
 * @param {number} [min] - Smallest accepted value
 * @returns {number} Parsed value or the fallback
 */
const readIntEnv = (name, fallback, min = 0) => {
    const raw = process.env[name];
    if (!raw) {
        return fallback;
    }
    const parsed = Number.parseInt(raw, 10);
    return Number.isNaN(parsed) || parsed < min ? fallback : parsed;
};

/**
 * Global configuration settings for the scraper.
 * Values are resolved once, when the module is loaded.
 */
const CONFIG = {
    // Browser settings: headless unless HEADLESS is exactly 'false'.
    BROWSER: {
        HEADLESS: process.env.HEADLESS !== 'false',
        USER_AGENT: random_useragent_1.default.getRandom(),
    },
    // Resource types aborted by the request interceptor in setupPage.
    BLOCKED_RESOURCES: ['image', 'stylesheet', 'font', 'media'],
    // Retry settings: at least one attempt; DELAY is the base backoff in ms.
    RETRY: {
        ATTEMPTS: readIntEnv('RETRY_ATTEMPTS', 1, 1),
        DELAY: readIntEnv('RETRY_DELAY', 1000),
    },
    // Page load settings passed to page.goto.
    PAGE_LOAD: {
        TIMEOUT: readIntEnv('PAGE_TIMEOUT', 30000),
        WAIT_UNTIL: 'networkidle',
    },
    // Output directory; joined below the parent of this file's directory.
    OUTPUT_DIR: process.env.OUTPUT_DIR || '../data',
};

/**
 * Launches Chromium and opens a browser context with the configured
 * user agent and CSP bypass enabled.
 * @returns {Promise<{browser: playwright.Browser, context: playwright.BrowserContext}>}
 * @throws {Error} If the browser or the context cannot be created
 */
const setupBrowser = async () => {
    const browser = await playwright_1.default.chromium.launch({
        headless: CONFIG.BROWSER.HEADLESS,
    });
    try {
        const context = await browser.newContext({
            userAgent: CONFIG.BROWSER.USER_AGENT,
            bypassCSP: true,
        });
        return { browser, context };
    }
    catch (error) {
        // The caller never receives the browser handle on this path, so it
        // has to be closed here or the process would be leaked.
        await browser.close();
        throw error;
    }
};

/**
 * Normalizes a channel name to a standard format across providers.
 *
 * Steps: upper-case; drop parenthesised segments such as "(HD)"; drop every
 * character that is not a word character, whitespace, '&', '+' or an
 * apostrophe; collapse whitespace runs to one space; attach a trailing
 * "+1" directly to the name; trim.
 *
 * NOTE(review): whitespace following '&' is removed but whitespace before it
 * is kept ("A & B" -> "A &B"). This is preserved as-is because existing
 * override maps may be keyed on that form.
 *
 * @param {string} name - Raw channel name from provider
 * @returns {string} Normalized channel name
 */
const normalizeChannelName = (name) => {
    return String(name)
        .toUpperCase()
        .replace(/\([^)]*\)|[^\w\s&+']|\s+|&\s*/g, (match) => {
            const first = match[0];
            // Parentheticals are checked first so "(A&B)" is removed rather
            // than being mistaken for an ampersand match.
            if (first === '(') {
                return '';
            }
            if (first === '&') {
                return '&';
            }
            // Any whitespace run (tabs, newlines, NBSP, repeated spaces)
            // becomes a single space instead of being deleted.
            if (/\s/.test(first)) {
                return ' ';
            }
            // Remaining matches are single disallowed characters.
            return '';
        })
        .replace(/\s+\+1/g, '+1')
        .replace(/\s+/g, ' ')
        .trim();
};

/**
 * Runs an async function, retrying on failure with exponential backoff.
 * The wait before retry number k (1-based) is `delay * 2^(k-1)` milliseconds.
 * At least one attempt is always made, even if `retries` is 0, negative or NaN.
 * @template T - Return type of the function being retried
 * @param {() => Promise<T>} fn - Function to retry
 * @param {number} [retries] - Maximum number of attempts
 * @param {number} [delay] - Base delay between attempts in milliseconds
 * @returns {Promise<T>} Result of the first successful attempt
 * @throws {Error} The error from the final attempt if all attempts fail
 */
const retry = async (fn, retries = CONFIG.RETRY.ATTEMPTS, delay = CONFIG.RETRY.DELAY) => {
    // `retries >= 1` is false for NaN, 0 and negatives, which all fall back to 1.
    const attempts = retries >= 1 ? retries : 1;
    for (let i = 0; i < attempts; i++) {
        try {
            return await fn();
        }
        catch (error) {
            // Guard against non-Error throwables (including null/undefined).
            console.log(`Attempt ${i + 1} failed: ${error?.message ?? error}`);
            if (i >= attempts - 1) {
                throw error;
            }
            await new Promise(resolve => setTimeout(resolve, delay * Math.pow(2, i)));
        }
    }
    // Not reachable when attempts >= 1; kept as a defensive terminator.
    throw new Error('Retry failed');
};

/**
 * Opens a page, installs request interception and navigates to the URL.
 * @param {playwright.BrowserContext} context - Browser context to create page from
 * @param {string} url - URL to navigate to
 * @returns {Promise<playwright.Page>} Page after navigation has completed
 */
const setupPage = async (context, url) => {
    const page = await context.newPage();
    // Abort requests for blocked resource types; let everything else through.
    await page.route('**/*', (route) => {
        const resourceType = route.request().resourceType();
        const action = CONFIG.BLOCKED_RESOURCES.includes(resourceType) ? 'abort' : 'continue';
        route[action]();
    });
    await page.goto(url, {
        timeout: CONFIG.PAGE_LOAD.TIMEOUT,
        waitUntil: CONFIG.PAGE_LOAD.WAIT_UNTIL,
    });
    return page;
};

/**
 * Writes channel data to a pretty-printed JSON file.
 * The target is `<this dir>/../<OUTPUT_DIR>/<filename>`; missing directories
 * are created first.
 * @param {Channel[]} output - Channel data to write
 * @param {string} filename - Name of the output file
 * @throws {Error} If the directory or file cannot be written (logged, then rethrown)
 */
const writeOutputToFile = (output, filename) => {
    const outputPath = path_1.default.join(__dirname, '..', CONFIG.OUTPUT_DIR, filename);
    try {
        fs_1.default.mkdirSync(path_1.default.dirname(outputPath), { recursive: true });
        fs_1.default.writeFileSync(outputPath, JSON.stringify(output, null, 2));
    }
    catch (error) {
        console.error(`Error writing to file ${filename}:`, error);
        throw error;
    }
};

/**
 * Converts raw scraped entries into the standard channel shape.
 * Entries with a missing or falsy `name` or `number` are dropped; names are
 * normalized and then replaced by the matching override, if any.
 * @param {Partial<Channel>[]} data - Raw channel data
 * @param {Record<string, string>} [overrides] - Map of normalized name to final name
 * @returns {Channel[]} Processed channel list
 */
const processData = (data, overrides = {}) => {
    // A default parameter does not cover an explicit null.
    const nameOverrides = overrides ?? {};
    const channels = [];
    for (const item of data) {
        if (!item?.name || !item?.number) {
            continue;
        }
        const normalizedName = normalizeChannelName(item.name);
        channels.push({
            number: item.number,
            name: nameOverrides[normalizedName] || normalizedName,
        });
    }
    return channels;
};

/**
 * Executes a scraper with the given configuration.
 * Launches a browser, loads `config.url`, runs `config.scrapeFunction(page)`
 * under the retry policy, post-processes the result and optionally writes it
 * to `config.outputFile`. The browser is always closed afterwards.
 * @param {ScraperConfig} config - Scraper configuration
 * @returns {Promise<Channel[]>} Array of scraped channels
 */
async function runScraper(config) {
    const { browser, context } = await setupBrowser();
    try {
        const page = await setupPage(context, config.url);
        const data = await retry(() => config.scrapeFunction(page));
        const channels = processData(data, config.overrides);
        // Write to file only if an output file was requested.
        if (config.outputFile) {
            writeOutputToFile(channels, config.outputFile);
        }
        return channels;
    }
    finally {
        await browser.close();
    }
}

/**
 * Parses command line arguments.
 * @returns {{writeFiles: boolean}} `writeFiles` is true when `--files` was passed
 */
function parseArgs() {
    return {
        writeFiles: process.argv.includes('--files'),
    };
}

/**
 * Executes a scraper configuration from the command line.
 * With `--files` the result is written to the configured output file;
 * without it the channels are printed to stdout as JSON. If the config
 * supplies `runCustom`, that is called instead of `runScraper`.
 * Any error is logged and the process exits with code 1.
 * @param {ScraperConfig} config - Scraper configuration
 * @returns {Promise<void>}
 */
async function runScraperCLI(config) {
    const { writeFiles } = parseArgs();
    try {
        let channels;
        if (config.runCustom) {
            channels = await config.runCustom({ writeFiles });
        }
        else {
            // Drop outputFile from the config when not writing to files.
            const runConfig = {
                ...config,
                outputFile: writeFiles ? config.outputFile : undefined,
            };
            channels = await runScraper(runConfig);
        }
        if (!writeFiles) {
            // Output JSON directly.
            console.log(JSON.stringify(channels, null, 2));
        }
    }
    catch (error) {
        console.error('Error:', error);
        process.exit(1);
    }
}