defuddle
Version:
Extract article content and metadata from web pages.
115 lines • 3.88 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
exports.ExtractorRegistry = void 0;
// Extractors
const reddit_1 = require("./extractors/reddit");
const twitter_1 = require("./extractors/twitter");
const youtube_1 = require("./extractors/youtube");
const hackernews_1 = require("./extractors/hackernews");
const chatgpt_1 = require("./extractors/chatgpt");
const claude_1 = require("./extractors/claude");
const grok_1 = require("./extractors/grok");
const gemini_1 = require("./extractors/gemini");
/**
 * Static registry that maps URLs to site-specific extractor classes.
 *
 * Each mapping is `{ patterns, extractor }`, where `patterns` is a list of
 * - strings: hostnames, matched against the URL's hostname (exact host or
 *   any subdomain of it), and
 * - RegExp objects: tested against the full URL string.
 *
 * Mappings are consulted in registration order; the first mapping with any
 * matching pattern wins.
 *
 * The static fields `mappings` (array) and `domainCache` (Map) are assigned
 * outside the class body, after the class definition.
 */
class ExtractorRegistry {
    /**
     * Registers the built-in extractors together with their URL patterns.
     */
    static initialize() {
        // Register all extractors with their URL patterns
        this.register({
            patterns: [
                'twitter.com',
                /\/x\.com\/.*/,
            ],
            extractor: twitter_1.TwitterExtractor
        });
        this.register({
            patterns: [
                'reddit.com',
                'old.reddit.com',
                'new.reddit.com',
                /^https:\/\/[^\/]+\.reddit\.com/
            ],
            extractor: reddit_1.RedditExtractor
        });
        this.register({
            patterns: [
                'youtube.com',
                'youtu.be',
                /youtube\.com\/watch\?v=.*/,
                /youtu\.be\/.*/
            ],
            extractor: youtube_1.YoutubeExtractor
        });
        this.register({
            patterns: [
                /news\.ycombinator\.com\/item\?id=.*/
            ],
            extractor: hackernews_1.HackerNewsExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/chatgpt\.com\/(c|share)\/.*/
            ],
            extractor: chatgpt_1.ChatGPTExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/claude\.ai\/(chat|share)\/.*/
            ],
            extractor: claude_1.ClaudeExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/grok\.com\/(chat|share)(\/.*)?$/
            ],
            extractor: grok_1.GrokExtractor,
        });
        this.register({
            patterns: [
                /^https?:\/\/gemini\.google\.com\/app\/.*/
            ],
            extractor: gemini_1.GeminiExtractor
        });
    }

    /**
     * Appends a mapping to the registry.
     *
     * @param {{patterns: Array<string|RegExp>, extractor: Function}} mapping
     *   Patterns to match and the extractor class to instantiate on a match.
     */
    static register(mapping) {
        this.mappings.push(mapping);
    }

    /**
     * Finds the first registered extractor matching `url` and instantiates it.
     *
     * @param {*} document - Passed through to the extractor constructor.
     * @param {string} url - Absolute URL of the page being processed.
     * @param {*} schemaOrgData - Passed through to the extractor constructor.
     * @returns {object|null} A new extractor instance, or `null` when nothing
     *   matches or when an error occurs (e.g. `url` cannot be parsed); errors
     *   are logged with `console.error` rather than thrown.
     */
    static findExtractor(document, url, schemaOrgData) {
        try {
            const domain = new URL(url).hostname;
            // Check cache first. Only hostname-determined results are ever
            // stored (see below), so a cached entry is valid for any URL on
            // this host.
            if (this.domainCache.has(domain)) {
                const cachedExtractor = this.domainCache.get(domain);
                return cachedExtractor ? new cachedExtractor(document, url, schemaOrgData) : null;
            }
            // Becomes true as soon as any RegExp pattern is consulted. From
            // then on the outcome may differ for another URL on the same host,
            // so it must not be memoised under the hostname.
            let dependsOnUrl = false;
            // Find matching extractor
            for (const { patterns, extractor } of this.mappings) {
                const matches = patterns.some((pattern) => {
                    if (pattern instanceof RegExp) {
                        dependsOnUrl = true;
                        return pattern.test(url);
                    }
                    // Exact host or a dot-delimited subdomain. A plain
                    // substring test would also accept look-alike hosts such
                    // as "notreddit.com" or "twitter.com.evil.example".
                    return domain === pattern || domain.endsWith(`.${pattern}`);
                });
                if (matches) {
                    if (!dependsOnUrl) {
                        // Cache the result
                        this.domainCache.set(domain, extractor);
                    }
                    return new extractor(document, url, schemaOrgData);
                }
            }
            if (!dependsOnUrl) {
                // Cache the negative result
                this.domainCache.set(domain, null);
            }
            return null;
        }
        catch (error) {
            console.error('Error in findExtractor:', error);
            return null;
        }
    }

    /**
     * Empties the hostname cache; registered mappings are left untouched.
     */
    static clearCache() {
        this.domainCache.clear();
    }
}
// Publish the class on the CommonJS exports object.
exports.ExtractorRegistry = ExtractorRegistry;
// Static state, assigned after the class definition and before initialize():
// - mappings: ordered list of { patterns, extractor } entries added via register().
// - domainCache: Map from URL hostname to an extractor class, or null when
//   findExtractor recorded that nothing matched.
ExtractorRegistry.mappings = [];
ExtractorRegistry.domainCache = new Map();
// Register the built-in extractors once, when this module is first loaded.
// This must run after the two assignments above, since register() pushes
// into `mappings`.
ExtractorRegistry.initialize();
//# sourceMappingURL=extractor-registry.js.map
;