UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

github.com/kepano/defuddle

kepano/defuddle

115 lines • 3.88 kB

JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ExtractorRegistry = void 0;
// Extractors
const reddit_1 = require("./extractors/reddit");
const twitter_1 = require("./extractors/twitter");
const youtube_1 = require("./extractors/youtube");
const hackernews_1 = require("./extractors/hackernews");
const chatgpt_1 = require("./extractors/chatgpt");
const claude_1 = require("./extractors/claude");
const grok_1 = require("./extractors/grok");
const gemini_1 = require("./extractors/gemini");
/**
 * Reports whether `hostname` is `domain` itself or one of its subdomains.
 *
 * The comparison is anchored on a label boundary, so `twitter.com` matches
 * `twitter.com` and `mobile.twitter.com` but neither `nottwitter.com` nor
 * `twitter.com.example.org` (a plain substring test would accept both).
 *
 * @param {string} hostname - Hostname taken from a parsed URL.
 * @param {*} domain - Registered pattern; anything but a non-empty string never matches.
 * @returns {boolean} True when the hostname belongs to the domain.
 */
function hostnameMatches(hostname, domain) {
    if (typeof domain !== 'string' || domain === '') {
        return false;
    }
    const expected = domain.toLowerCase();
    return hostname === expected || hostname.endsWith(`.${expected}`);
}
/**
 * Reports whether a registered regular-expression pattern matches the full URL.
 *
 * @param {string} url - The complete page URL.
 * @param {*} pattern - Registered pattern; anything but a RegExp never matches.
 * @returns {boolean} True when the pattern is a RegExp that matches `url`.
 */
function urlMatches(url, pattern) {
    if (!(pattern instanceof RegExp)) {
        return false;
    }
    // A /g or /y regex keeps state between test() calls; always start from index 0.
    pattern.lastIndex = 0;
    return pattern.test(url);
}
/**
 * Registry that maps page URLs to site-specific extractor classes.
 *
 * A mapping is `{ patterns, extractor }`. String patterns are compared with the
 * URL's hostname; RegExp patterns are tested against the whole URL. Mappings are
 * consulted in registration order and the first one that matches wins.
 */
class ExtractorRegistry {
    /**
     * Registers the built-in extractors together with their URL patterns.
     */
    static initialize() {
        // Register all extractors with their URL patterns
        this.register({
            patterns: [
                'twitter.com',
                /\/x\.com\/.*/,
            ],
            extractor: twitter_1.TwitterExtractor
        });
        this.register({
            patterns: [
                'reddit.com',
                'old.reddit.com',
                'new.reddit.com',
                /^https:\/\/[^\/]+\.reddit\.com/
            ],
            extractor: reddit_1.RedditExtractor
        });
        this.register({
            patterns: [
                'youtube.com',
                'youtu.be',
                /youtube\.com\/watch\?v=.*/,
                /youtu\.be\/.*/
            ],
            extractor: youtube_1.YoutubeExtractor
        });
        this.register({
            patterns: [
                /news\.ycombinator\.com\/item\?id=.*/
            ],
            extractor: hackernews_1.HackerNewsExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/chatgpt\.com\/(c|share)\/.*/
            ],
            extractor: chatgpt_1.ChatGPTExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/claude\.ai\/(chat|share)\/.*/
            ],
            extractor: claude_1.ClaudeExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/grok\.com\/(chat|share)(\/.*)?$/
            ],
            extractor: grok_1.GrokExtractor,
        });
        this.register({
            patterns: [
                /^https?:\/\/gemini\.google\.com\/app\/.*/
            ],
            extractor: gemini_1.GeminiExtractor
        });
    }
    /**
     * Adds a mapping to the end of the registry.
     *
     * @param {{patterns: Array<string|RegExp>, extractor: Function}} mapping -
     *   Patterns and the extractor class to instantiate when one of them matches.
     */
    static register(mapping) {
        this.mappings.push(mapping);
        // Memoized hostname lookups (negative ones in particular) were computed
        // without this mapping, so they can no longer be trusted.
        this.domainCache.clear();
    }
    /**
     * Instantiates the extractor registered for `url`, if any.
     *
     * @param {*} document - Passed through to the extractor constructor.
     * @param {string} url - Absolute URL of the page.
     * @param {*} schemaOrgData - Passed through to the extractor constructor.
     * @returns {object|null} A new extractor instance, or null when no mapping
     *   matches or when an error occurs (for example, `url` cannot be parsed).
     */
    static findExtractor(document, url, schemaOrgData) {
        try {
            const domain = new URL(url).hostname;
            // Only the hostname-based answer is memoized. A regex result can differ
            // between two URLs on the same host, so regexes are evaluated every time.
            const domainExtractor = this.findDomainExtractor(domain);
            for (const { patterns, extractor } of this.mappings) {
                const matchedByDomain = domainExtractor !== null && extractor === domainExtractor;
                if (matchedByDomain || patterns.some((pattern) => urlMatches(url, pattern))) {
                    return new extractor(document, url, schemaOrgData);
                }
            }
            return null;
        }
        catch (error) {
            console.error('Error in findExtractor:', error);
            return null;
        }
    }
    /**
     * Returns the extractor class of the first mapping that has a string pattern
     * matching `domain`, memoizing the answer (including "none") per hostname.
     *
     * @param {string} domain - Hostname of the page URL.
     * @returns {Function|null} The extractor class, or null when no string pattern matches.
     */
    static findDomainExtractor(domain) {
        if (this.domainCache.has(domain)) {
            return this.domainCache.get(domain);
        }
        const mapping = this.mappings.find(({ patterns }) => patterns.some((pattern) => hostnameMatches(domain, pattern)));
        const extractor = mapping ? mapping.extractor : null;
        this.domainCache.set(domain, extractor);
        return extractor;
    }
    /**
     * Forgets every memoized hostname lookup.
     */
    static clearCache() {
        this.domainCache.clear();
    }
}
exports.ExtractorRegistry = ExtractorRegistry;
ExtractorRegistry.mappings = [];
ExtractorRegistry.domainCache = new Map();
// Initialize extractors
ExtractorRegistry.initialize();
//# sourceMappingURL=extractor-registry.js.map