UNPKG

@phunky/scrape-channel-listings

Version:

A TypeScript library for scraping TV channel listings from various providers

github.com/phunky/scrape-channel-listings

phunky/scrape-channel-listings

169 lines (168 loc) • 5.09 kB

JavaScript

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runScraper = void 0;
const playwright_1 = __importDefault(require("playwright"));
const random_useragent_1 = __importDefault(require("random-useragent"));
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));

/**
 * Configuration constants shared by every helper in this module.
 */
const CONFIG = {
    // Browser settings. The user agent is picked once, when the module is loaded,
    // and reused for every context created afterwards.
    BROWSER: {
        HEADLESS: true,
        USER_AGENT: random_useragent_1.default.getRandom(),
    },
    // Resource types whose requests are aborted by the route handler in setupPage.
    BLOCKED_RESOURCES: ['image', 'stylesheet', 'font', 'media'],
    // Retry settings. ATTEMPTS is the total number of calls, so 1 means "no retry".
    RETRY: {
        ATTEMPTS: 1,
        DELAY: 1000,
    },
    // Options forwarded to page.goto().
    PAGE_LOAD: {
        TIMEOUT: 30000,
        WAIT_UNTIL: 'networkidle',
    },
    // Output directory, resolved relative to the parent of this file's directory.
    OUTPUT_DIR: 'data',
};

/**
 * Launches Chromium and opens a browser context with the configured user agent.
 *
 * @returns {Promise<{browser: object, context: object}>} The launched browser and its context.
 * @throws Rethrows any launch or context-creation error. If context creation
 *   fails, the browser is closed first.
 */
const setupBrowser = async () => {
    const browser = await playwright_1.default.chromium.launch({ headless: CONFIG.BROWSER.HEADLESS });
    try {
        const context = await browser.newContext({
            userAgent: CONFIG.BROWSER.USER_AGENT,
            bypassCSP: true,
        });
        return { browser, context };
    }
    catch (error) {
        // The caller never receives the browser handle on this path, so it has to
        // be released here. A failure while closing is deliberately ignored so the
        // original error is the one that surfaces.
        await browser.close().catch(() => { });
        throw error;
    }
};

/**
 * Normalizes a scraped channel name so it can be used as a lookup key.
 *
 * Steps, in order:
 *   1. upper-case the name;
 *   2. drop parenthesised segments such as "(HD)";
 *   3. drop every character that is not an ASCII word character, whitespace,
 *      "&", "+" or "'";
 *   4. put exactly one space on each side of every "&";
 *   5. collapse each whitespace run to a single space and trim the ends.
 *
 * Step 4 runs before step 5 so the spaces it inserts are collapsed together
 * with any that were already there.
 *
 * NOTE(review): the previous single-regex version listed the apostrophe
 * alternative twice, which suggests typographic quotes may have been intended
 * in the TypeScript source - confirm before adding such a mapping.
 *
 * @param {string} name - Raw channel name.
 * @returns {string} The normalized name.
 */
const normalizeChannelName = (name) => {
    return name
        .toUpperCase()
        .replace(/\([^)]*\)/g, '')
        .replace(/[^\w\s&+']/g, '')
        .replace(/\s*&\s*/g, ' & ')
        .replace(/\s+/g, ' ')
        .trim();
};

/**
 * Calls `fn` up to `retries` times, waiting `delay * 2^attempt` milliseconds
 * between failed attempts.
 *
 * @param {() => Promise<*>} fn - Operation to attempt.
 * @param {number} [retries] - Total number of attempts (defaults to CONFIG.RETRY.ATTEMPTS).
 * @param {number} [delay] - Base delay in milliseconds (defaults to CONFIG.RETRY.DELAY).
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The error of the last failed attempt, or Error('Retry failed') when
 *   `retries` is less than 1 and `fn` is therefore never called.
 */
const retry = async (fn, retries = CONFIG.RETRY.ATTEMPTS, delay = CONFIG.RETRY.DELAY) => {
    for (let i = 0; i < retries; i++) {
        try {
            return await fn();
        }
        catch (error) {
            // Thrown values are not guaranteed to be Error instances (or even
            // non-null), so fall back to the value itself for the log line.
            console.log(`Attempt ${i + 1} failed: ${error?.message ?? error}`);
            if (i >= retries - 1) {
                throw error;
            }
            await new Promise((resolve) => setTimeout(resolve, delay * 2 ** i));
        }
    }
    throw new Error('Retry failed');
};

/**
 * Opens a new page in `context`, installs a route handler that aborts requests
 * whose resource type is listed in CONFIG.BLOCKED_RESOURCES, and navigates to `url`.
 *
 * @param {object} context - Browser context as returned by setupBrowser.
 * @param {string} url - Address to navigate to.
 * @returns {Promise<object>} The page, after navigation has completed.
 * @throws Rethrows routing or navigation errors. The page is closed first.
 */
const setupPage = async (context, url) => {
    const page = await context.newPage();
    try {
        await page.route('**/*', (route) => {
            const isBlocked = CONFIG.BLOCKED_RESOURCES.includes(route.request().resourceType());
            // Return the promise so it is not left floating.
            return isBlocked ? route.abort() : route.continue();
        });
        await page.goto(url, {
            timeout: CONFIG.PAGE_LOAD.TIMEOUT,
            waitUntil: CONFIG.PAGE_LOAD.WAIT_UNTIL,
        });
        return page;
    }
    catch (error) {
        // The caller only closes pages it has received, so a page whose setup
        // failed must be closed here. Close failures are ignored so the original
        // error is the one that propagates.
        await page.close().catch(() => { });
        throw error;
    }
};

/**
 * Serializes `output` as pretty-printed JSON into
 * `<this file's directory>/../<CONFIG.OUTPUT_DIR>/<filename>`, creating the
 * directory if it does not exist.
 *
 * @param {*} output - Value to serialize.
 * @param {string} filename - File name relative to the output directory.
 * @throws Rethrows any file-system error after logging it.
 */
const writeOutputToFile = (output, filename) => {
    const outputPath = path_1.default.join(__dirname, '..', CONFIG.OUTPUT_DIR, filename);
    try {
        fs_1.default.mkdirSync(path_1.default.dirname(outputPath), { recursive: true });
        fs_1.default.writeFileSync(outputPath, JSON.stringify(output, null, 2));
    }
    catch (error) {
        console.error(`Error writing to file ${filename}:`, error);
        throw error;
    }
};

/**
 * Converts raw scraped items into `{ number, name }` channel records.
 *
 * Items without a truthy `name` or `number` are skipped. Each name is
 * normalized, then replaced by `overrides[normalizedName]` when that entry is
 * truthy. Channels for which `excludeChannels` returns a truthy value are dropped.
 *
 * @param {Array<{name: string, number: *}>} data - Raw scraped items.
 * @param {Object<string, string>} [overrides] - Map from normalized name to replacement name.
 * @param {(channel: {number: *, name: string}) => boolean} [excludeChannels] - Exclusion predicate.
 * @returns {Array<{number: *, name: string}>} The processed channels.
 * @throws {TypeError} When `data` is not an array.
 */
const processData = (data, overrides = {}, excludeChannels = () => false) => {
    if (!Array.isArray(data)) {
        throw new TypeError('Scraped data must be an array of { name, number } items');
    }
    return data
        .map((item) => {
            if (!item?.name || !item?.number) {
                return null;
            }
            const normalizedName = normalizeChannelName(item.name);
            const channel = {
                number: item.number,
                name: overrides[normalizedName] || normalizedName,
            };
            return excludeChannels(channel) ? null : channel;
        })
        .filter((item) => item !== null);
};

/**
 * Runs one scrape: launches a browser, loads `url`, calls `scrapeFunction`
 * with the page, processes the result and writes it to `outputFile`.
 *
 * @param {object} options
 * @param {string} options.url - Page to scrape.
 * @param {(page: object) => Promise<Array>} options.scrapeFunction - Extracts raw items from the page.
 * @param {Object<string, string>} [options.overrides] - Name overrides, see processData.
 * @param {Function} [options.excludeChannels] - Exclusion predicate, see processData.
 * @param {string} options.outputFile - Output file name, see writeOutputToFile.
 * @returns {Promise<Array<{number: *, name: string}>>} The processed channels.
 * @throws Rethrows any error from setup, scraping or writing after logging it.
 *   The browser, if one was launched, is closed in every case.
 */
const runScraper = async ({ url, scrapeFunction, overrides = {}, excludeChannels = () => false, outputFile }) => {
    let browser;
    let output = [];
    try {
        const session = await setupBrowser();
        browser = session.browser;
        const scrapePage = async () => {
            const page = await setupPage(session.context, url);
            try {
                const data = await scrapeFunction(page);
                return processData(data, overrides, excludeChannels);
            }
            finally {
                await page.close();
            }
        };
        output = await retry(scrapePage);
        writeOutputToFile(output, outputFile);
    }
    catch (error) {
        console.error('Error during the scraping process:', error);
        throw error;
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
    console.log(`Scraped ${output.length} channels`);
    return output;
};
exports.runScraper = runScraper;