// Retrieved via UNPKG (package CDN file view).
// Version: (not captured in scrape) — 291 lines (254 loc), 7.69 kB
import axios from 'axios';
import cheerio from 'cheerio';
import fs from 'fs';
import path from 'path';
import { createLogger, transports, format } from 'winston';

/**
 * A node in a website crawl tree: one URL plus its fetched HTTP metadata,
 * extracted page content, and parent/child/reference links to other nodes.
 *
 * NOTE: this class name shadows the global WHATWG `URL` class; within this
 * module the built-in is reached via `globalThis.URL`.
 */
class URL {
  /**
   * Create a URL node.
   * @param {string} url - The URL of the node (an http:// prefix is added if missing).
   * @param {URL} [parent=null] - The parent node in the crawl tree.
   */
  constructor(url, parent = null) {
    this.url = this.ensureProtocol(url);

    // Metadata populated by fetchMetadata().
    this.statusCode = null;
    this.responseTime = null;
    this.contentLength = null;
    this.contentType = null;
    this.headers = null;
    this.redirects = 0;
    this.isOnline = false;
    this.errorMessage = null;
    this.title = null;
    this.timestamp = null;

    // Content populated by fetchContent().
    this.htmlContent = null;
    this.hyperlinks = [];
    this.images = [];
    this.videos = [];
    this.textContent = null;

    // Tree structure.
    this.parent = parent;
    this.children = [];
    this.references = [];

    this.logger = this.setupLogger();
  }

  /**
   * Ensure the URL has an http(s) protocol prefix.
   * @param {string} url - The URL to check.
   * @returns {string} The URL with a protocol.
   */
  ensureProtocol(url) {
    return /^https?:\/\//i.test(url) ? url : `http://${url}`;
  }

  /**
   * Set up a winston logger writing to the console and url_fetch.log.
   * @returns {Object} The logger instance.
   */
  setupLogger() {
    return createLogger({
      level: 'info',
      format: format.combine(
        format.colorize(),
        format.timestamp(),
        format.printf(({ timestamp, level, message }) => {
          return `${timestamp} ${level}: ${message}`;
        })
      ),
      transports: [
        new transports.Console(),
        new transports.File({ filename: 'url_fetch.log' }),
      ],
    });
  }

  /**
   * Fetch HTTP metadata (status, headers, redirect count, page title).
   * On failure, sets isOnline=false and records the error instead of throwing.
   * @async
   */
  async fetchMetadata() {
    const startTime = Date.now();
    this.timestamp = new Date(startTime).toISOString();
    try {
      const response = await axios.head(this.url, { maxRedirects: 10 });
      this.statusCode = response.status;
      this.responseTime = Date.now() - startTime;
      this.contentLength = response.headers['content-length'];
      this.contentType = response.headers['content-type'];
      this.headers = response.headers;
      // _redirectable is a follow-redirects internal used by axios; guard it
      // so an implementation change degrades to 0 instead of throwing.
      this.redirects = response.request?._redirectable?._redirectCount ?? 0;
      this.isOnline = true;

      const titleResponse = await axios.get(this.url, { maxRedirects: 10 });
      const $ = cheerio.load(titleResponse.data);
      this.title = $('title').text();
      this.logger.info(`Successfully fetched metadata from ${this.url}`);
    } catch (error) {
      this.isOnline = false;
      if (error.response) {
        this.statusCode = error.response.status;
        this.headers = error.response.headers;
        this.responseTime = Date.now() - startTime;
        this.logger.error(`Failed to fetch metadata from ${this.url} - Status: ${this.statusCode}`);
      } else {
        this.errorMessage = error.message;
        this.logger.error(`Failed to fetch metadata from ${this.url} - Error: ${this.errorMessage}`);
      }
    }
  }

  /**
   * Fetch the page HTML and extract hyperlinks, images, videos and body text,
   * then download discovered media. Requires a prior successful
   * fetchMetadata() (isOnline must be true).
   * @async
   */
  async fetchContent() {
    if (!this.isOnline) {
      this.logger.error(`Cannot fetch content. URL ${this.url} is offline.`);
      return;
    }
    try {
      const response = await axios.get(this.url, { maxRedirects: 10 });
      this.htmlContent = response.data;
      const $ = cheerio.load(this.htmlContent);
      this.hyperlinks = $('a').map((i, el) => $(el).attr('href')).get();
      this.images = $('img').map((i, el) => $(el).attr('src')).get();
      this.videos = $('video').map((i, el) => $(el).attr('src')).get();
      this.textContent = $('body').text();
      await this.downloadMediaFiles();
      this.logger.info(`Successfully fetched content from ${this.url}`);
    } catch (error) {
      this.errorMessage = error.message;
      this.logger.error(`Failed to fetch content from ${this.url} - Error: ${this.errorMessage}`);
    }
  }

  /**
   * Download all discovered images and videos into ./media, sequentially.
   * @async
   */
  async downloadMediaFiles() {
    const mediaDir = './media';
    // recursive:true is a no-op when the directory already exists, which also
    // removes the race between an existsSync check and mkdirSync.
    fs.mkdirSync(mediaDir, { recursive: true });
    for (const mediaUrl of [...this.images, ...this.videos]) {
      await this.downloadFile(mediaUrl, mediaDir);
    }
  }

  /**
   * Download a single file and stream it to disk. Failures are logged,
   * not thrown, so one bad media URL does not abort the whole page.
   * @async
   * @param {string} url - The (possibly relative) URL of the file to download.
   * @param {string} directory - The directory to save the downloaded file.
   * @returns {Promise<void>} Resolves when the file is fully written.
   */
  async downloadFile(url, directory) {
    try {
      // Resolve relative src attributes (e.g. "/logo.png") against the page
      // URL; passing them to axios verbatim fails with a missing-protocol error.
      const absoluteUrl = new globalThis.URL(url, this.url).href;
      const response = await axios({
        url: absoluteUrl,
        method: 'GET',
        responseType: 'stream',
      });
      // Derive the local filename from the pathname only, so query strings
      // ("img.png?v=2") do not leak into the filename.
      const { pathname } = new globalThis.URL(absoluteUrl);
      const fileName = path.basename(pathname) || `download-${Date.now()}`;
      const filePath = path.join(directory, fileName);
      const writer = fs.createWriteStream(filePath);
      response.data.pipe(writer);
      // await (rather than return the bare promise) so a writer error is
      // caught and logged below instead of escaping this try/catch.
      return await new Promise((resolve, reject) => {
        writer.on('finish', resolve);
        writer.on('error', reject);
      });
    } catch (error) {
      this.logger.error(`Failed to download file from ${url} - Error: ${error.message}`);
    }
  }

  /**
   * Get the metadata of the URL.
   * @returns {Object} The metadata of the URL.
   */
  getMetadata() {
    return {
      url: this.url,
      statusCode: this.statusCode,
      responseTime: this.responseTime,
      contentLength: this.contentLength,
      contentType: this.contentType,
      headers: this.headers,
      redirects: this.redirects,
      isOnline: this.isOnline,
      errorMessage: this.errorMessage,
      title: this.title,
      timestamp: this.timestamp,
    };
  }

  /**
   * Get the full content of the URL.
   * @returns {Object} The content of the URL.
   */
  getContent() {
    return {
      url: this.url,
      htmlContent: this.htmlContent,
      hyperlinks: this.hyperlinks,
      images: this.images,
      videos: this.videos,
      textContent: this.textContent,
    };
  }

  /**
   * Add a child node (also re-points the child's parent to this node).
   * @param {URL} childNode - The child node to add.
   */
  addChild(childNode) {
    childNode.parent = this;
    this.children.push(childNode);
  }

  /**
   * Add a reference to another node.
   * @param {URL} referenceNode - The node being referenced.
   */
  addReference(referenceNode) {
    this.references.push(referenceNode);
  }

  /**
   * Print the subtree rooted at this node, one URL per line,
   * indented by depth.
   * @param {number} [level=0] - The current level of the node.
   */
  displayStructure(level = 0) {
    console.log(' '.repeat(level) + this.url);
    for (const child of this.children) {
      child.displayStructure(level + 1);
    }
  }

  /**
   * Breadth-first crawl starting from this node's URL.
   * @async
   * @param {number} [depth=2] - The maximum depth to crawl.
   */
  async crawl(depth = 2) {
    const queue = [{ url: this.url, depth: 0, parent: this }];
    const visited = new Set();
    while (queue.length > 0) {
      const { url, depth: currentDepth, parent } = queue.shift();
      if (visited.has(url) || currentDepth > depth) {
        continue;
      }
      visited.add(url);
      const page = new URL(url, parent);
      await page.fetchMetadata();
      await page.fetchContent();
      console.log(`Crawled: ${url}`);
      console.log('Metadata:', page.getMetadata());
      console.log('Content:', page.getContent());
      parent.addChild(page);
      for (const link of page.hyperlinks) {
        if (!link) continue;
        let resolved;
        try {
          // Resolve relative hrefs against the current page; running them
          // through ensureProtocol() turned "/about" into "http:///about".
          resolved = new globalThis.URL(link, url).href;
        } catch {
          continue; // unparseable href — skip it
        }
        // Only crawl http(s) links (skip mailto:, javascript:, etc.).
        if (!/^https?:/i.test(resolved)) continue;
        if (!visited.has(resolved)) {
          queue.push({ url: resolved, depth: currentDepth + 1, parent: page });
        }
      }
    }
  }
}

export default URL;