UNPKG

@springmusk/who-is

Version:

Professional WHOIS data scraping module with structured parsing and batch processing capabilities

349 lines (302 loc) 11.8 kB
'use strict'; const axios = require('axios'); const cheerio = require('cheerio'); /** * @module who-is * @description A professional WHOIS data scraping module with structured parsing and batch processing capabilities */ /** * Custom error class for WHOIS scraping related errors * @extends Error */ class WhoisScrapingError extends Error { constructor(message, cause = null) { super(message); this.name = 'WhoisScrapingError'; this.cause = cause; } } class WhoisParsingError extends Error { constructor(message, cause = null) { super(message); this.name = 'WhoisParsingError'; this.cause = cause; } } /** * HTTP client wrapper for making requests */ class HttpClient { constructor(config = {}) { this.timeout = config.timeout || 30000; this.userAgent = config.userAgent || 'Mozilla/5.0 (compatible; Whois/1.0)'; this.maxRetries = config.maxRetries || 3; this.retryDelay = config.retryDelay || 1000; } /** * Make HTTP GET request with retry logic * @param {string} url - Target URL * @param {Object} params - Query parameters * @param {number} retryCount - Current retry attempt * @returns {Promise<string>} Response data */ async get(url, params = {}, retryCount = 0) { try { const response = await axios.get(url, { params, timeout: this.timeout, headers: { 'User-Agent': this.userAgent } }); return response.data; } catch (error) { if (retryCount < this.maxRetries) { await this.#delay(this.retryDelay * (retryCount + 1)); return this.get(url, params, retryCount + 1); } throw new WhoisScrapingError(`HTTP request failed after ${this.maxRetries} retries`, error); } } /** * Private method to create delay * @param {number} ms - Milliseconds to delay */ #delay(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } } /** * WHOIS data parser */ class WhoisParser { /** * Parse raw WHOIS data into structured JSON * @param {string} rawData - Raw WHOIS text data * @returns {Object} Parsed WHOIS data */ static parseRawWhoisToJson(rawData) { if (!rawData || typeof rawData !== 'string') { throw new WhoisParsingError('Invalid raw WHOIS data provided'); } try { const lines = rawData.split("\n"); const result = {}; const postWhoisDisclaimer = []; let foundLastUpdate = false; for (const line of lines) { const trimmedLine = line.trim(); if (!trimmedLine) continue; // Check for 'Last update of whois database' marker if (trimmedLine.startsWith(">>> Last update of whois database:")) { const match = trimmedLine.match(/Last update of whois database:\s*(.*?)\s*<<<$/); if (match) { result.lastWhoisUpdate = match[1]; } foundLastUpdate = true; continue; } // After last update line, capture all remaining as disclaimer if (foundLastUpdate) { postWhoisDisclaimer.push(trimmedLine); continue; } const colonIndex = trimmedLine.indexOf(":"); if (colonIndex === -1 || colonIndex === 0) continue; const key = trimmedLine.slice(0, colonIndex).trim(); const value = trimmedLine.slice(colonIndex + 1).trim(); if (!key || !value) continue; this.#addToResult(result, key, value); } if (postWhoisDisclaimer.length > 0) { result.postWhoisDisclaimer = postWhoisDisclaimer.join(" "); } return result; } catch (error) { throw new WhoisParsingError('Failed to parse WHOIS data', error); } } /** * Private method to add key-value pairs to result object * @param {Object} result - Result object to modify * @param {string} key - Property key * @param {string} value - Property value */ static #addToResult(result, key, value) { if (result.hasOwnProperty(key)) { if (!Array.isArray(result[key])) { result[key] = [result[key]]; } result[key].push(value); } else { result[key] = value; } } } /** * WHOIS data extractor from HTML content */ class WhoisExtractor { /** * Extract WHOIS data from HTML content * @param {string} htmlContent - HTML content from who.is * @returns {Object} Extracted WHOIS data */ static extractWhoisData(htmlContent) { if (!htmlContent) { throw new WhoisScrapingError('No HTML content provided'); } try { const $ = cheerio.load(htmlContent); const scripts = $('script').map((_, el) => $(el).html()).get(); const extractedData = { registryData: null, registrarData: null, pageTitle: $('title').text() || 'Unknown' }; for (const scriptContent of scripts) { if (!scriptContent) continue; const matches = [...scriptContent.matchAll(/self\.__next_f\.push\(\[(\d+),\s*(".*?")\]\)/gs)]; for (const match of matches) { try { const text = JSON.parse(match[2]); if (!text.includes('Domain Name:')) continue; // Registry WHOIS Data (contains Updated Date but not Registrant State) if (text.includes('Updated Date:') && !text.includes('Registrant State')) { extractedData.registryData = text.replace(/\r\n/g, '\n').trim(); } // Registrar WHOIS Data (contains Registrar WHOIS Server) if (text.includes('Registrar WHOIS Server')) { extractedData.registrarData = text.replace(/\r\n/g, '\n').trim(); } } catch (parseError) { // Skip invalid JSON, continue processing continue; } } } return extractedData; } catch (error) { throw new WhoisScrapingError('Failed to extract WHOIS data from HTML', error); } } } /** * Main WHOIS scraper class */ class Whois { constructor(config = {}) { this.httpClient = new HttpClient(config.http); this.baseUrl = config.baseUrl || 'https://who.is/whois'; this.outputFormat = config.outputFormat || 'json'; // 'json' or 'raw' } /** * Scrape WHOIS data for a domain * @param {string} domain - Domain name to lookup * @returns {Promise<Object>} Scraped and parsed WHOIS data */ async scrape(domain) { if (!domain || typeof domain !== 'string') { throw new WhoisScrapingError('Invalid domain provided'); } const cleanDomain = domain.trim().toLowerCase(); if (!this.#isValidDomain(cleanDomain)) { throw new WhoisScrapingError(`Invalid domain format: ${cleanDomain}`); } try { const url = `${this.baseUrl}/${cleanDomain}`; const htmlContent = await this.httpClient.get(url); const extractedData = WhoisExtractor.extractWhoisData(htmlContent); const result = { domain: cleanDomain, scrapedAt: new Date().toISOString(), pageTitle: extractedData.pageTitle, registry: null, registrar: null }; // Parse registry data if available if (extractedData.registryData) { result.registry = this.outputFormat === 'raw' ? extractedData.registryData : WhoisParser.parseRawWhoisToJson(extractedData.registryData); } // Parse registrar data if available if (extractedData.registrarData) { result.registrar = this.outputFormat === 'raw' ? extractedData.registrarData : WhoisParser.parseRawWhoisToJson(extractedData.registrarData); } return result; } catch (error) { if (error instanceof WhoisScrapingError || error instanceof WhoisParsingError) { throw error; } throw new WhoisScrapingError(`Failed to scrape WHOIS data for ${cleanDomain}`, error); } } /** * Scrape multiple domains * @param {string[]} domains - Array of domain names * @param {Object} options - Scraping options * @returns {Promise<Object[]>} Array of scraped WHOIS data */ async scrapeMultiple(domains, options = {}) { if (!Array.isArray(domains)) { throw new WhoisScrapingError('Domains must be provided as an array'); } const { concurrent = 3, continueOnError = true } = options; const results = []; const errors = []; // Process domains in batches to avoid overwhelming the server for (let i = 0; i < domains.length; i += concurrent) { const batch = domains.slice(i, i + concurrent); const batchPromises = batch.map(async (domain) => { try { return await this.scrape(domain); } catch (error) { const errorInfo = { domain, error: error.message, timestamp: new Date().toISOString() }; if (continueOnError) { errors.push(errorInfo); return null; } else { throw error; } } }); const batchResults = await Promise.all(batchPromises); results.push(...batchResults.filter(result => result !== null)); } return { results, errors: errors.length > 0 ? errors : undefined, summary: { total: domains.length, successful: results.length, failed: errors.length } }; } /** * Private method to validate domain format * @param {string} domain - Domain to validate * @returns {boolean} True if valid domain format */ #isValidDomain(domain) { const domainRegex = /^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$/; return domainRegex.test(domain) && domain.length <= 253; } } // Export the module module.exports = { Whois, WhoisParser, WhoisExtractor, HttpClient, WhoisScrapingError, WhoisParsingError, default: Whois };