/*
 * Package: @phunky/scrape-channel-listings
 * Version: (not shown in this listing)
 * A TypeScript library for scraping TV channel listings from various providers.
 * Listing metadata: 232 lines (231 loc), 8 kB, JavaScript.
 */
;
/**
* Core scraping utility module for channel listing extraction
* Provides a robust framework for scraping channel information from various providers
* Features include:
* - Automated browser setup and cleanup
* - Request interception and resource blocking
* - Retry mechanism with exponential backoff
* - Channel name normalization
* - Configurable error handling
* - Structured output generation
*/
// Default-import interop helper (the shape TypeScript emits for CommonJS output).
// Reuses `this.__importDefault` when one is already present; otherwise defines a
// function that returns modules flagged with `__esModule` unchanged and wraps
// anything else as `{ default: mod }`, so callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runScraper = runScraper;
exports.runScraperCLI = runScraperCLI;
const playwright_1 = __importDefault(require("playwright"));
const random_useragent_1 = __importDefault(require("random-useragent"));
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
/**
 * Parses an integer-valued environment variable.
 * Falls back to the default when the variable is unset, empty, not numeric
 * (parseInt yields NaN) or below the accepted minimum, so a typo in the
 * environment cannot inject NaN or a nonsensical value into CONFIG.
 * @param {string | undefined} raw - Raw environment value
 * @param {number} fallback - Value used when `raw` is missing or invalid
 * @param {number} [min] - Smallest accepted value
 * @returns {number} Parsed integer, or `fallback`
 */
const parseIntEnv = (raw, fallback, min = 0) => {
    if (!raw) {
        return fallback;
    }
    const parsed = parseInt(raw, 10);
    return Number.isNaN(parsed) || parsed < min ? fallback : parsed;
};
/**
 * Global configuration settings for the scraper
 * Includes browser settings, resource blocking, retry logic, and output configuration
 * Numeric settings can be overridden through environment variables; invalid
 * overrides are ignored in favour of the defaults shown below.
 */
const CONFIG = {
    // Browser settings
    BROWSER: {
        // Headless unless HEADLESS is set to exactly 'false'
        HEADLESS: process.env.HEADLESS !== 'false',
        // Picked once, when this module is loaded
        USER_AGENT: random_useragent_1.default.getRandom(),
    },
    // Request interception: resource types that are aborted instead of fetched
    BLOCKED_RESOURCES: ['image', 'stylesheet', 'font', 'media'],
    // Retry settings
    RETRY: {
        // At least one attempt is required, otherwise nothing would ever be scraped
        ATTEMPTS: parseIntEnv(process.env.RETRY_ATTEMPTS, 1, 1),
        // Base delay in milliseconds
        DELAY: parseIntEnv(process.env.RETRY_DELAY, 1000),
    },
    // Page load settings
    PAGE_LOAD: {
        // Navigation timeout in milliseconds
        TIMEOUT: parseIntEnv(process.env.PAGE_TIMEOUT, 30000),
        WAIT_UNTIL: 'networkidle',
    },
    // Output directory
    OUTPUT_DIR: process.env.OUTPUT_DIR || '../data',
};
/**
 * Launches Chromium and opens a browser context using CONFIG.BROWSER
 * (headless flag and user agent), with CSP bypass enabled on the context.
 * If the context cannot be created, the freshly launched browser is closed
 * again: the caller never receives a handle, so nobody else could clean it up.
 * @returns {Promise<{browser: playwright.Browser, context: playwright.BrowserContext}>}
 * @throws {Error} If launching the browser or creating the context fails
 */
const setupBrowser = async () => {
    const browser = await playwright_1.default.chromium.launch({
        headless: CONFIG.BROWSER.HEADLESS
    });
    try {
        const context = await browser.newContext({
            userAgent: CONFIG.BROWSER.USER_AGENT,
            bypassCSP: true
        });
        return { browser, context };
    }
    catch (error) {
        // Best-effort cleanup: a failure while closing must not mask the original error.
        await browser.close().catch(() => undefined);
        throw error;
    }
};
/**
 * Normalizes channel names to a standard format across providers.
 *
 * Steps, applied in order to the upper-cased name:
 *  1. parenthesised qualifiers such as "(HD)" are dropped;
 *  2. typographic apostrophes become the ASCII apostrophe;
 *  3. every character other than ASCII word characters, whitespace, `&`, `+`
 *     and `'` is removed (so accented letters are stripped as well);
 *  4. whitespace around `&` is removed ("A & B" and "A&B" both give "A&B");
 *  5. whitespace before a "+1" suffix is removed ("ITV +1" gives "ITV+1");
 *  6. remaining whitespace runs (spaces, tabs, newlines) collapse to a single
 *     space and the result is trimmed.
 *
 * @param {string} name - Raw channel name from provider
 * @returns {string} Normalized channel name
 */
const normalizeChannelName = (name) => {
    return name
        .toUpperCase()
        .replace(/\([^)]*\)/g, '')
        // Map curly apostrophes before stripping symbols, otherwise they are deleted
        // and "MEN\u2019S" / "MEN'S" normalize to different names.
        .replace(/[\u2018\u2019]/g, "'")
        .replace(/[^\w\s&+']/g, '')
        // Must run before whitespace collapsing so a leading space is not left behind.
        .replace(/\s*&\s*/g, '&')
        .replace(/\s+\+1/g, '+1')
        .replace(/\s+/g, ' ')
        .trim();
};
/**
 * Runs an async function until it succeeds or the attempts are used up,
 * waiting with exponential backoff (delay, 2 x delay, 4 x delay, ...) between
 * failed attempts.
 *
 * Failure notices are written with console.warn (stderr): runScraperCLI prints
 * its JSON result on stdout, which must stay free of diagnostics.
 *
 * @template T - Return type of the function being retried
 * @param {() => Promise<T>} fn - Function to retry
 * @param {number} [retries] - Maximum number of attempts; values below 1 or
 *   non-numeric values are treated as 1 so `fn` always runs at least once
 * @param {number} [delay] - Base delay between attempts in milliseconds;
 *   negative or non-numeric values are treated as 0
 * @returns {Promise<T>} Result of the first successful attempt
 * @throws The value thrown by the last failed attempt
 */
const retry = async (fn, retries = CONFIG.RETRY.ATTEMPTS, delay = CONFIG.RETRY.DELAY) => {
    const maxAttempts = retries >= 1 ? Math.ceil(retries) : 1;
    const baseDelay = Math.max(0, Number(delay) || 0);
    for (let attempt = 1;; attempt += 1) {
        try {
            return await fn();
        }
        catch (error) {
            // `error` may be any thrown value, including null or a plain string.
            const reason = error?.message ?? String(error);
            console.warn(`Attempt ${attempt} failed: ${reason}`);
            if (attempt >= maxAttempts) {
                throw error;
            }
            const backoff = baseDelay * Math.pow(2, attempt - 1);
            await new Promise(resolve => setTimeout(resolve, backoff));
        }
    }
};
/**
 * Opens a new page in the given context, installs a catch-all route that
 * aborts requests whose resource type is listed in CONFIG.BLOCKED_RESOURCES
 * (all other requests continue), then navigates to the URL using the
 * timeout and wait condition from CONFIG.PAGE_LOAD.
 * @param {playwright.BrowserContext} context - Browser context to create page from
 * @param {string} url - URL to navigate to
 * @returns {Promise<playwright.Page>} The page, after navigation has completed
 */
const setupPage = async (context, url) => {
    const page = await context.newPage();
    // Skip resources that are not needed for scraping to speed up page loads
    const filterRequest = (route) => {
        const resourceType = route.request().resourceType();
        if (CONFIG.BLOCKED_RESOURCES.includes(resourceType)) {
            route.abort();
        }
        else {
            route.continue();
        }
    };
    await page.route('**/*', filterRequest);
    const navigationOptions = {
        timeout: CONFIG.PAGE_LOAD.TIMEOUT,
        waitUntil: CONFIG.PAGE_LOAD.WAIT_UNTIL
    };
    await page.goto(url, navigationOptions);
    return page;
};
/**
 * Writes channel data to a pretty-printed JSON file (2-space indent).
 *
 * The target directory is CONFIG.OUTPUT_DIR. A relative value is resolved
 * against the parent of this module's directory; an absolute value (for
 * example OUTPUT_DIR=/tmp/out) is used as given instead of being appended to
 * the package path. Missing directories are created.
 *
 * @param {Channel[]} output - Channel data to write
 * @param {string} filename - Name of the output file, relative to the output directory
 * @throws {Error} If creating the directory or writing the file fails (the error is logged, then rethrown)
 */
const writeOutputToFile = (output, filename) => {
    // resolve() lets an absolute OUTPUT_DIR win; for relative values the result
    // equals joining the same segments.
    const outputDir = path_1.default.resolve(__dirname, '..', CONFIG.OUTPUT_DIR);
    const outputPath = path_1.default.join(outputDir, filename);
    try {
        fs_1.default.mkdirSync(path_1.default.dirname(outputPath), { recursive: true });
        fs_1.default.writeFileSync(outputPath, JSON.stringify(output, null, 2));
    }
    catch (error) {
        console.error(`Error writing to file ${filename}:`, error);
        throw error;
    }
};
/**
 * Converts raw scraped entries into the standard channel shape.
 * Entries that are missing, or whose name or number is falsy, are dropped.
 * Each remaining name is normalized and then replaced by its override when
 * the override map holds a truthy value for the normalized name.
 * @param {Partial<Channel>[]} data - Raw channel data
 * @param {Record<string, string>} [overrides] - Channel name mappings, keyed by normalized name
 * @returns {Channel[]} Processed channel list containing only `number` and `name`
 */
const processData = (data, overrides = {}) => {
    return data.flatMap((entry) => {
        const isUsable = Boolean(entry?.name) && Boolean(entry?.number);
        if (!isUsable) {
            return [];
        }
        const canonicalName = normalizeChannelName(entry.name);
        const mappedName = overrides[canonicalName];
        return [{
                number: entry.number,
                name: mappedName ? mappedName : canonicalName
            }];
    });
};
/**
 * Runs one scrape end to end: starts a browser, opens the configured URL,
 * calls `config.scrapeFunction(page)` through the retry helper, and processes
 * the result with `config.overrides`. The processed channels are also written
 * to `config.outputFile` when that is set. The browser is closed on success
 * and on failure.
 * @param {ScraperConfig} config - Scraper configuration
 * @returns {Promise<Channel[]>} Array of scraped channels
 */
async function runScraper(config) {
    const session = await setupBrowser();
    try {
        const page = await setupPage(session.context, config.url);
        const scrapeOnce = () => config.scrapeFunction(page);
        const rawEntries = await retry(scrapeOnce);
        const channels = processData(rawEntries, config.overrides);
        // Persisting is optional; without an output file the result is only returned
        const targetFile = config.outputFile;
        if (targetFile) {
            writeOutputToFile(channels, targetFile);
        }
        return channels;
    }
    finally {
        await session.browser.close();
    }
}
/**
 * Reads the command line flags understood by the CLI.
 * @returns {{writeFiles: boolean}} `writeFiles` is true when `--files` appears
 *   as an exact entry anywhere in process.argv
 */
function parseArgs() {
    const writeFiles = process.argv.includes('--files');
    return { writeFiles };
}
/**
 * Command line entry point for a scraper configuration.
 * Uses `config.runCustom` when provided (passing it the parsed `writeFiles`
 * flag), otherwise runs the standard scraper. Without `--files` the output
 * file is suppressed and the channels are printed to stdout as JSON; with
 * `--files` nothing is printed. Any error is logged and the process exits
 * with status 1.
 * @param {object} config - Scraper configuration, optionally with a `runCustom` function
 * @returns {Promise<void>}
 */
async function runScraperCLI(config) {
    const cliFlags = parseArgs();
    const shouldWriteFiles = cliFlags.writeFiles;
    // Picks the custom runner when present, else the default scraper
    const collectChannels = () => {
        if (config.runCustom) {
            return config.runCustom({ writeFiles: shouldWriteFiles });
        }
        // Keep outputFile out of the run when files are not requested
        const outputFile = shouldWriteFiles ? config.outputFile : undefined;
        return runScraper({ ...config, outputFile });
    };
    try {
        const channels = await collectChannels();
        if (shouldWriteFiles) {
            return;
        }
        // Output JSON directly
        console.log(JSON.stringify(channels, null, 2));
    }
    catch (error) {
        console.error('Error:', error);
        process.exit(1);
    }
}