@phunky/scrape-channel-listings
Version:
A TypeScript library for scraping TV channel listings from various providers
169 lines (168 loc) • 5.09 kB
JavaScript
;
// CommonJS/ES-module interop helper: reuses an `__importDefault` already present
// on `this` when there is one; otherwise wraps a required module so that its
// value is always reachable through `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    // Modules flagged with `__esModule` already carry their own `default` export.
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runScraper = void 0;
const playwright_1 = __importDefault(require("playwright"));
const random_useragent_1 = __importDefault(require("random-useragent"));
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
/**
* Configuration constants shared by the helpers in this module.
*/
const CONFIG = {
// Browser settings
BROWSER: {
// Passed as the `headless` option when Chromium is launched.
HEADLESS: true,
// Evaluated once, when this module is loaded; the same value is then used
// for every browser context created by setupBrowser.
USER_AGENT: random_useragent_1.default.getRandom(),
},
// Request interception: requests whose resource type is listed here are aborted.
BLOCKED_RESOURCES: ['image', 'stylesheet', 'font', 'media'],
// Retry settings (defaults for the retry helper)
RETRY: {
// Total number of attempts; with 1 the operation runs once and is not retried.
ATTEMPTS: 1,
// Base delay handed to setTimeout (milliseconds); doubled after each failed attempt.
DELAY: 1000,
},
// Page load settings, forwarded to page.goto as `timeout` and `waitUntil`.
PAGE_LOAD: {
TIMEOUT: 30000,
WAIT_UNTIL: 'networkidle',
},
// Output directory, resolved relative to the parent of this file's directory.
OUTPUT_DIR: 'data',
};
/**
 * Launches Chromium and opens a browser context using the settings in
 * CONFIG.BROWSER (headless flag and user agent) with CSP bypass enabled.
 *
 * @returns {Promise<{browser: object, context: object}>} The launched browser
 *   and the context created on it. The caller is responsible for closing the
 *   browser.
 * @throws Rethrows any launch or context-creation error. When context creation
 *   fails, the already-launched browser is closed first so it is not leaked.
 */
const setupBrowser = async () => {
    const browser = await playwright_1.default.chromium.launch({
        headless: CONFIG.BROWSER.HEADLESS
    });
    try {
        const context = await browser.newContext({
            userAgent: CONFIG.BROWSER.USER_AGENT,
            bypassCSP: true
        });
        return { browser, context };
    }
    catch (error) {
        // The caller never receives the browser handle on this path, so this is
        // the only place it can be released. A failure while closing is ignored
        // on purpose so the original error is the one that propagates.
        await browser.close().catch(() => { });
        throw error;
    }
};
/**
 * Normalizes a channel name so that differently formatted spellings compare equal.
 *
 * Steps, in order:
 *  1. upper-case the name;
 *  2. drop parenthesised qualifiers such as "(HD)";
 *  3. turn typographic apostrophes into a plain "'";
 *  4. remove every character that is not a word character, whitespace, "&", "+" or "'";
 *  5. put exactly one space on each side of "&";
 *  6. collapse every run of whitespace into a single space and trim the ends.
 *
 * The steps are applied as separate passes because deciding per match (in one
 * combined regex) deleted multi-character whitespace runs and "&" with
 * surrounding spaces, and left double spaces where a parenthesised group was removed.
 *
 * @param {string} name - Raw channel name.
 * @returns {string} The normalized, upper-cased name.
 */
const normalizeChannelName = (name) => {
    return name
        .toUpperCase()
        .replace(/\([^)]*\)/g, '')
        // NOTE(review): the original had two identical apostrophe branches, which
        // suggests a typographic apostrophe was intended for one of them - confirm.
        .replace(/[\u2018\u2019]/g, "'")
        .replace(/[^\w\s&+']/g, '')
        .replace(/\s*&\s*/g, ' & ')
        .replace(/\s+/g, ' ')
        .trim();
};
/**
 * Runs an async operation, re-running it after a failure with exponential backoff.
 *
 * @param {Function} fn - Operation to run; called with no arguments and awaited.
 * @param {number} [retries=CONFIG.RETRY.ATTEMPTS] - Total number of attempts
 *   (the first run counts as one).
 * @param {number} [delay=CONFIG.RETRY.DELAY] - Base wait in milliseconds; the
 *   wait after failed attempt number k (1-based) is delay * 2^(k-1).
 * @returns {Promise<*>} The value produced by the first successful attempt.
 * @throws Rethrows, unchanged, whatever the last attempt threw. Throws
 *   Error('Retry failed') when `retries` allows no attempt at all.
 */
const retry = async (fn, retries = CONFIG.RETRY.ATTEMPTS, delay = CONFIG.RETRY.DELAY) => {
    for (let i = 0; i < retries; i++) {
        try {
            return await fn();
        }
        catch (error) {
            // fn may reject with a non-Error value (even null or undefined);
            // reading `.message` unguarded would throw here and hide the real failure.
            const reason = error?.message ?? String(error);
            console.log(`Attempt ${i + 1} failed: ${reason}`);
            const isLastAttempt = !(i < retries - 1);
            if (isLastAttempt) {
                throw error;
            }
            await new Promise(resolve => setTimeout(resolve, delay * Math.pow(2, i)));
        }
    }
    throw new Error('Retry failed');
};
/**
 * Opens a new page in the given context, installs request interception that
 * aborts every request whose resource type is listed in
 * CONFIG.BLOCKED_RESOURCES, and navigates to `url` using the CONFIG.PAGE_LOAD
 * timeout and wait condition.
 *
 * @param {object} context - Browser context providing `newPage()`.
 * @param {string} url - Address to navigate to.
 * @returns {Promise<object>} The page, after navigation has completed. The
 *   caller is responsible for closing it.
 * @throws Rethrows any interception-setup or navigation error. The page is
 *   closed first, so failed (and retried) navigations do not accumulate pages.
 */
const setupPage = async (context, url) => {
    const page = await context.newPage();
    try {
        await page.route('**/*', (route) => {
            const resourceType = route.request().resourceType();
            const isBlocked = CONFIG.BLOCKED_RESOURCES.includes(resourceType);
            // Return the promise so it is not left floating inside the handler.
            return isBlocked ? route.abort() : route.continue();
        });
        await page.goto(url, {
            timeout: CONFIG.PAGE_LOAD.TIMEOUT,
            waitUntil: CONFIG.PAGE_LOAD.WAIT_UNTIL
        });
        return page;
    }
    catch (error) {
        // The caller never receives the page on this path. A failure while
        // closing is ignored on purpose so the original error propagates.
        await page.close().catch(() => { });
        throw error;
    }
};
/**
 * Serializes `output` as pretty-printed JSON (2-space indent) and writes it to
 * `<parent of this file's directory>/<CONFIG.OUTPUT_DIR>/<filename>`, creating
 * missing directories first.
 *
 * @param {*} output - Value to serialize.
 * @param {string} filename - File name (or relative path) inside the output directory.
 * @throws Logs and rethrows any error raised while creating the directory,
 *   serializing, or writing.
 */
const writeOutputToFile = (output, filename) => {
    const segments = [__dirname, '..', CONFIG.OUTPUT_DIR, filename];
    const targetFile = path_1.default.join(...segments);
    try {
        const targetDir = path_1.default.dirname(targetFile);
        fs_1.default.mkdirSync(targetDir, { recursive: true });
        const serialized = JSON.stringify(output, null, 2);
        fs_1.default.writeFileSync(targetFile, serialized);
    }
    catch (error) {
        console.error(`Error writing to file ${filename}:`, error);
        throw error;
    }
};
/**
 * Converts raw scraped items into `{ number, name }` channel records.
 *
 * Items lacking a truthy `name` or `number` are dropped. Each remaining name
 * is normalized, then replaced by `overrides[normalizedName]` when that entry
 * is truthy. Records for which `excludeChannels` returns a truthy value are
 * dropped as well.
 *
 * @param {Array} data - Raw items; each is expected to expose `name` and `number`.
 * @param {Object} [overrides={}] - Map from normalized name to replacement name.
 * @param {Function} [excludeChannels] - Predicate called with each channel
 *   record; a truthy result removes the record. Defaults to keeping everything.
 * @returns {Array<{number: *, name: *}>} The surviving records, in input order.
 */
const processData = (data, overrides = {}, excludeChannels = () => false) => {
    const toChannelList = (item) => {
        if (!(item?.name && item?.number)) {
            return [];
        }
        const normalizedName = normalizeChannelName(item.name);
        const channel = {
            number: item.number,
            name: overrides[normalizedName] || normalizedName
        };
        if (excludeChannels(channel)) {
            return [];
        }
        return [channel];
    };
    // Zero-or-one-element arrays are flattened away, so rejected items vanish.
    return data.flatMap((item) => toChannelList(item));
};
/**
 * Scrapes channel listings from a page and writes them to a JSON file.
 *
 * Launches a browser, opens `url`, passes the page to `scrapeFunction`,
 * post-processes the result (name normalization, overrides, exclusions),
 * writes the channels to `outputFile` and returns them. The page is closed
 * after every attempt and the browser is closed whether or not scraping succeeds.
 *
 * @param {Object} options
 * @param {string} options.url - Page to open.
 * @param {Function} options.scrapeFunction - Called with the opened page; its
 *   awaited result is handed to processData as the raw item list.
 * @param {Object} [options.overrides={}] - Map from normalized name to replacement name.
 * @param {Function} [options.excludeChannels] - Predicate; a truthy result
 *   removes the channel. Defaults to keeping everything.
 * @param {string} options.outputFile - File name inside the output directory.
 * @returns {Promise<Array>} The processed channels.
 * @throws Logs and rethrows any error raised while scraping or writing.
 */
const runScraper = async ({ url, scrapeFunction, overrides = {}, excludeChannels = () => false, outputFile }) => {
    let browser;
    let output = [];
    try {
        const { browser: newBrowser, context } = await setupBrowser();
        // Assigned as soon as it exists so the `finally` below can always close it.
        browser = newBrowser;
        const scrapePage = async () => {
            const page = await setupPage(context, url);
            try {
                const data = await scrapeFunction(page);
                return processData(data, overrides, excludeChannels);
            }
            finally {
                // Each attempt opens its own page; close it even when scraping throws.
                await page.close();
            }
        };
        output = await retry(scrapePage);
        writeOutputToFile(output, outputFile);
    }
    catch (error) {
        console.error('Error during the scraping process:', error);
        throw error;
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
    console.log(`Scraped ${output.length} channels`);
    return output;
};
exports.runScraper = runScraper;