// @springmusk/who-is
// Version: (not captured)
// Professional WHOIS data scraping module with structured parsing and batch processing capabilities
// 349 lines (302 loc) • 11.8 kB
// JavaScript
;
const axios = require('axios');
const cheerio = require('cheerio');
/**
* @module who-is
* @description A professional WHOIS data scraping module with structured parsing and batch processing capabilities
*/
/**
 * Error type thrown when fetching a page or extracting WHOIS data from it fails.
 * @extends Error
 */
class WhoisScrapingError extends Error {
  /** Own `name` property identifying this error type. */
  name = 'WhoisScrapingError';

  /**
   * @param {string} message - Human-readable description of the failure
   * @param {*} [cause=null] - Underlying error (or value) that led to this one; null when absent
   */
  constructor(message, cause = null) {
    super(message);
    this.cause = cause;
  }
}
/**
 * Error type thrown when raw WHOIS text cannot be parsed.
 * @extends Error
 */
class WhoisParsingError extends Error {
  /** Own `name` property identifying this error type. */
  name = 'WhoisParsingError';

  /**
   * @param {string} message - Human-readable description of the failure
   * @param {*} [cause=null] - Underlying error (or value) that led to this one; null when absent
   */
  constructor(message, cause = null) {
    super(message);
    this.cause = cause;
  }
}
/**
 * HTTP client wrapper that issues GET requests through axios and retries
 * failed requests with a linearly growing delay.
 */
class HttpClient {
  /**
   * @param {Object} [config] - Client options; null/undefined means "all defaults"
   * @param {number} [config.timeout=30000] - Request timeout in milliseconds, forwarded to axios.
   *   An explicit 0 is forwarded unchanged (axios documents 0 as "no timeout").
   * @param {string} [config.userAgent] - Value sent in the User-Agent header
   * @param {number} [config.maxRetries=3] - Retries attempted after the first failure; 0 disables retrying
   * @param {number} [config.retryDelay=1000] - Base delay in milliseconds between attempts; 0 retries immediately
   */
  constructor(config = {}) {
    const { timeout, userAgent, maxRetries, retryDelay } = config ?? {};
    // `??` instead of `||` for the numeric options: an explicit 0 is a valid
    // setting and must not be replaced by the default.
    this.timeout = timeout ?? 30000;
    this.userAgent = userAgent || 'Mozilla/5.0 (compatible; Whois/1.0)';
    this.maxRetries = maxRetries ?? 3;
    this.retryDelay = retryDelay ?? 1000;
  }

  /**
   * Make HTTP GET request with retry logic.
   *
   * Any failure of the underlying request triggers a retry until `maxRetries`
   * is reached; the wait before retry k (1-based) is `retryDelay * k`.
   *
   * @param {string} url - Target URL
   * @param {Object} params - Query parameters
   * @param {number} retryCount - Current retry attempt (used by the recursive retry; callers normally omit it)
   * @returns {Promise<string>} Response data
   * @throws {WhoisScrapingError} When the request still fails after the last retry; the
   *   last underlying error is passed as the cause
   */
  async get(url, params = {}, retryCount = 0) {
    try {
      const response = await axios.get(url, {
        params,
        timeout: this.timeout,
        headers: {
          'User-Agent': this.userAgent,
        },
      });
      return response.data;
    } catch (error) {
      if (retryCount < this.maxRetries) {
        await this.#delay(this.retryDelay * (retryCount + 1));
        return this.get(url, params, retryCount + 1);
      }
      throw new WhoisScrapingError(`HTTP request failed after ${this.maxRetries} retries`, error);
    }
  }

  /**
   * Private method to create delay
   * @param {number} ms - Milliseconds to delay
   * @returns {Promise<void>} Resolves once the timer fires
   */
  #delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
/**
 * WHOIS data parser
 */
class WhoisParser {
  /**
   * Parse raw WHOIS data into structured JSON.
   *
   * Each non-empty line of the form `Key: Value` becomes a property; a key that
   * occurs more than once is collected into an array in order of appearance.
   * Lines without a colon, with a leading colon, or with an empty value are
   * skipped. The `>>> Last update of whois database: ... <<<` marker is stored
   * as `lastWhoisUpdate`, and every non-empty line after it is joined with
   * single spaces into `postWhoisDisclaimer`.
   *
   * @param {string} rawData - Raw WHOIS text data
   * @returns {Object} Parsed WHOIS data
   * @throws {WhoisParsingError} If `rawData` is empty or not a string, or parsing fails
   */
  static parseRawWhoisToJson(rawData) {
    if (!rawData || typeof rawData !== 'string') {
      throw new WhoisParsingError('Invalid raw WHOIS data provided');
    }
    try {
      const result = {};
      const postWhoisDisclaimer = [];
      let foundLastUpdate = false;
      for (const line of rawData.split('\n')) {
        const trimmedLine = line.trim();
        if (!trimmedLine) continue;
        // Check for 'Last update of whois database' marker
        if (trimmedLine.startsWith('>>> Last update of whois database:')) {
          const match = trimmedLine.match(/Last update of whois database:\s*(.*?)\s*<<<$/);
          if (match) {
            result.lastWhoisUpdate = match[1];
          }
          foundLastUpdate = true;
          continue;
        }
        // After last update line, capture all remaining as disclaimer
        if (foundLastUpdate) {
          postWhoisDisclaimer.push(trimmedLine);
          continue;
        }
        // Split on the FIRST colon only, so values such as timestamps and URLs stay intact.
        const colonIndex = trimmedLine.indexOf(':');
        if (colonIndex === -1 || colonIndex === 0) continue;
        const key = trimmedLine.slice(0, colonIndex).trim();
        const value = trimmedLine.slice(colonIndex + 1).trim();
        if (!key || !value) continue;
        this.#addToResult(result, key, value);
      }
      if (postWhoisDisclaimer.length > 0) {
        result.postWhoisDisclaimer = postWhoisDisclaimer.join(' ');
      }
      return result;
    } catch (error) {
      throw new WhoisParsingError('Failed to parse WHOIS data', error);
    }
  }

  /**
   * Private method to add key-value pairs to result object.
   *
   * Keys come from untrusted WHOIS text, so nothing here relies on methods or
   * accessors inherited by `result`: a key such as `hasOwnProperty` must not
   * break later lookups, and a key such as `__proto__` must be stored as an
   * ordinary own property instead of going through the prototype setter.
   *
   * @param {Object} result - Result object to modify
   * @param {string} key - Property key
   * @param {string} value - Property value
   */
  static #addToResult(result, key, value) {
    if (!Object.hasOwn(result, key)) {
      // defineProperty creates a plain data property even for `__proto__`.
      Object.defineProperty(result, key, {
        value,
        writable: true,
        enumerable: true,
        configurable: true,
      });
      return;
    }
    const existing = result[key];
    if (Array.isArray(existing)) {
      existing.push(value);
    } else {
      // Second occurrence of a key: promote the scalar to an array.
      result[key] = [existing, value];
    }
  }
}
/**
 * Pulls raw WHOIS text blocks out of a fetched HTML page.
 */
class WhoisExtractor {
  /**
   * Extract WHOIS data from HTML content.
   *
   * Scans every <script> element for `self.__next_f.push([<n>, "<json string>"])`
   * calls, decodes the JSON string payload, and keeps payloads that contain
   * `Domain Name:`. When several payloads qualify for the same slot, the last
   * one wins.
   *
   * @param {string} htmlContent - HTML content from who.is
   * @returns {Object} `{ registryData, registrarData, pageTitle }`; the two data
   *   fields are newline-normalised, trimmed strings or null, and `pageTitle`
   *   falls back to 'Unknown' when the <title> text is empty
   * @throws {WhoisScrapingError} If no content is given or extraction fails
   */
  static extractWhoisData(htmlContent) {
    if (!htmlContent) {
      throw new WhoisScrapingError('No HTML content provided');
    }

    // CRLF -> LF, then strip surrounding whitespace.
    const tidy = (text) => text.replace(/\r\n/g, '\n').trim();
    // matchAll works on a copy of the pattern, so one shared literal is safe.
    const pushCallPattern = /self\.__next_f\.push\(\[(\d+),\s*(".*?")\]\)/gs;

    try {
      const $ = cheerio.load(htmlContent);
      const inlineScripts = $('script').map((_, el) => $(el).html()).get();
      const extractedData = {
        registryData: null,
        registrarData: null,
        pageTitle: $('title').text() || 'Unknown',
      };

      for (const source of inlineScripts.filter(Boolean)) {
        for (const [, , jsonLiteral] of source.matchAll(pushCallPattern)) {
          try {
            const text = JSON.parse(jsonLiteral);
            if (!text.includes('Domain Name:')) continue;

            // Registry WHOIS Data (contains Updated Date but not Registrant State)
            const looksLikeRegistry = text.includes('Updated Date:') && !text.includes('Registrant State');
            // Registrar WHOIS Data (contains Registrar WHOIS Server)
            const looksLikeRegistrar = text.includes('Registrar WHOIS Server');

            if (looksLikeRegistry) {
              extractedData.registryData = tidy(text);
            }
            if (looksLikeRegistrar) {
              extractedData.registrarData = tidy(text);
            }
          } catch {
            // Payload is not valid JSON: skip it and keep scanning.
            continue;
          }
        }
      }

      return extractedData;
    } catch (error) {
      throw new WhoisScrapingError('Failed to extract WHOIS data from HTML', error);
    }
  }
}
/**
 * Main WHOIS scraper class: fetches a domain's page, extracts the registry and
 * registrar WHOIS blocks, and returns them raw or parsed.
 */
class Whois {
  /**
   * @param {Object} [config] - Scraper options
   * @param {Object} [config.http] - Options handed to the HttpClient constructor
   * @param {string} [config.baseUrl='https://who.is/whois'] - URL prefix; the domain is appended as a path segment
   * @param {string} [config.outputFormat='json'] - 'raw' returns the WHOIS text unchanged; any other value returns parsed objects
   */
  constructor(config = {}) {
    this.httpClient = new HttpClient(config.http);
    this.baseUrl = config.baseUrl || 'https://who.is/whois';
    this.outputFormat = config.outputFormat || 'json'; // 'json' or 'raw'
  }

  /**
   * Scrape WHOIS data for a domain.
   * @param {string} domain - Domain name to lookup (trimmed and lower-cased before use)
   * @returns {Promise<Object>} `{ domain, scrapedAt, pageTitle, registry, registrar }`;
   *   `registry` / `registrar` are null when the page did not contain that block
   * @throws {WhoisScrapingError} On invalid input, or when fetching/extraction fails
   * @throws {WhoisParsingError} When a WHOIS block cannot be parsed
   */
  async scrape(domain) {
    if (!domain || typeof domain !== 'string') {
      throw new WhoisScrapingError('Invalid domain provided');
    }
    const cleanDomain = domain.trim().toLowerCase();
    if (!this.#isValidDomain(cleanDomain)) {
      throw new WhoisScrapingError(`Invalid domain format: ${cleanDomain}`);
    }
    try {
      const url = `${this.baseUrl}/${cleanDomain}`;
      const htmlContent = await this.httpClient.get(url);
      const extractedData = WhoisExtractor.extractWhoisData(htmlContent);
      return {
        domain: cleanDomain,
        scrapedAt: new Date().toISOString(),
        pageTitle: extractedData.pageTitle,
        registry: this.#formatSection(extractedData.registryData),
        registrar: this.#formatSection(extractedData.registrarData),
      };
    } catch (error) {
      // Errors already raised in this module's own types pass through untouched.
      if (error instanceof WhoisScrapingError || error instanceof WhoisParsingError) {
        throw error;
      }
      throw new WhoisScrapingError(`Failed to scrape WHOIS data for ${cleanDomain}`, error);
    }
  }

  /**
   * Scrape multiple domains in fixed-size batches; each batch runs in parallel
   * and the next batch starts only after the previous one has settled.
   *
   * @param {string[]} domains - Array of domain names
   * @param {Object} [options] - Scraping options
   * @param {number} [options.concurrent=3] - Batch size. Values that are not a positive
   *   number (0, negative, NaN, null) are treated as 1; fractions are rounded down.
   * @param {boolean} [options.continueOnError=true] - When true, failures are collected in
   *   `errors`; when false, the first failure rejects the returned promise
   * @returns {Promise<{results: Object[], errors: (Object[]|undefined), summary: {total: number, successful: number, failed: number}}>}
   *   Successful results in input order, the collected `{ domain, error, timestamp }`
   *   records (undefined when there were none), and counts
   * @throws {WhoisScrapingError} If `domains` is not an array
   */
  async scrapeMultiple(domains, options = {}) {
    if (!Array.isArray(domains)) {
      throw new WhoisScrapingError('Domains must be provided as an array');
    }
    const { concurrent = 3, continueOnError = true } = options ?? {};
    // A step of 0, a negative step or NaN would make the loop below spin forever
    // or skip every domain, so force the batch size to an integer >= 1.
    const batchSize = Math.max(1, Math.floor(Number(concurrent)) || 1);
    const results = [];
    const errors = [];
    // Process domains in batches to avoid overwhelming the server
    for (let i = 0; i < domains.length; i += batchSize) {
      const batch = domains.slice(i, i + batchSize);
      const batchPromises = batch.map(async (domain) => {
        try {
          return await this.scrape(domain);
        } catch (error) {
          if (!continueOnError) {
            throw error;
          }
          errors.push({
            domain,
            error: error.message,
            timestamp: new Date().toISOString(),
          });
          return null;
        }
      });
      const batchResults = await Promise.all(batchPromises);
      results.push(...batchResults.filter((result) => result !== null));
    }
    return {
      results,
      errors: errors.length > 0 ? errors : undefined,
      summary: {
        total: domains.length,
        successful: results.length,
        failed: errors.length,
      },
    };
  }

  /**
   * Private method that renders one extracted WHOIS block per `outputFormat`.
   * @param {?string} rawSection - Raw WHOIS text, or null/empty when the block is missing
   * @returns {?(string|Object)} null when missing, the text itself in 'raw' mode, otherwise the parsed object
   */
  #formatSection(rawSection) {
    if (!rawSection) {
      return null;
    }
    return this.outputFormat === 'raw'
      ? rawSection
      : WhoisParser.parseRawWhoisToJson(rawSection);
  }

  /**
   * Private method to validate domain format.
   *
   * Accepts dot-separated labels of 1-63 letters, digits or hyphens that neither
   * start nor end with a hyphen, with a total length of at most 253 characters.
   *
   * @param {string} domain - Domain to validate
   * @returns {boolean} True if valid domain format
   */
  #isValidDomain(domain) {
    // Cheap length check first so over-long input never reaches the regex.
    if (domain.length > 253) {
      return false;
    }
    const domainRegex = /^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$/;
    return domainRegex.test(domain);
  }
}
// Public API (CommonJS): the scraper class, the parser, the extractor, the
// HTTP client wrapper, and both error classes.
module.exports = {
Whois,
WhoisParser,
WhoisExtractor,
HttpClient,
WhoisScrapingError,
WhoisParsingError,
// Same class as `Whois`, also exposed under the `default` key.
// NOTE(review): presumably for ESM/TypeScript default-import interop — confirm.
default: Whois
};