UNPKG

markdown-crawler

Version:

A powerful web crawler that extracts content from web pages and converts them to clean Markdown format, with support for code blocks and GitHub Flavored Markdown

github.com/gkctou/md-crawler

gkctou/md-crawler

155 lines (154 loc) • 7.7 kB

JavaScript

"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS interop helpers (generated code, kept as emitted).
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawl = crawl;
const clipper_js_1 = require("./clipper.js");
const minimatch_1 = require("minimatch");
const cheerio = __importStar(require("cheerio"));

/**
 * Rewrites the targets of inline Markdown links and images as absolute URLs.
 *
 * Targets that already start with `scheme://` or `//` are left untouched.
 * Everything else is resolved against `baseUrl` with the WHATWG `URL` parser.
 * A target that cannot be resolved is left as-is and a warning is logged.
 *
 * @param {string} markdown - Markdown text to scan.
 * @param {string} baseUrl - Absolute URL the relative targets are resolved against.
 * @returns {string} The Markdown with resolvable relative targets made absolute.
 */
function resolveRelativeUrls(markdown, baseUrl) {
    // Anchoring on "](" rather than on the opening "[" means the outer target
    // of a linked image, `[![alt](img.png)](page.html)`, is resolved as well.
    //   group 1: the "](" that precedes the target
    //   group 2: the target; it must not start with ")" so that an empty
    //            target `[x]()` is skipped instead of swallowing text up to
    //            the next ")" on the line
    //   group 3: an optional quoted title plus the closing ")", so a title
    //            such as `"Logo"` is never fed to the URL parser
    const linkTarget = /(\]\()([^)\n].*?)((?:\s+(?:"[^"]*"|'[^']*'))?\))/g;
    return markdown.replace(linkTarget, (match, prefix, url, suffix) => {
        try {
            // Already absolute (scheme://...) or protocol-relative (//...): keep as written.
            if (/^(?:[a-z]+:)?\/\//i.test(url)) {
                return match;
            }
            const absoluteUrl = new URL(url, baseUrl).href;
            return `${prefix}${absoluteUrl}${suffix}`;
        }
        catch (resolveError) {
            // Best effort: an unresolvable target must not abort the whole conversion.
            console.warn(`Could not resolve URL '${url}' against base '${baseUrl}':`, resolveError instanceof Error ? resolveError.message : resolveError);
            return match;
        }
    });
}

/**
 * Turns an `href` attribute value into a normalized, crawlable absolute URL.
 *
 * @param {string | undefined} href - Raw `href` attribute value.
 * @param {string} pageUrl - URL of the page the link was found on.
 * @returns {string | null} Absolute http(s) URL without its fragment, or
 *   `null` when the link is missing, invalid, or not http(s).
 */
function toCrawlableUrl(href, pageUrl) {
    if (!href) {
        return null;
    }
    let parsed;
    try {
        // Handles relative paths, absolute paths, and already absolute URLs.
        parsed = new URL(href, pageUrl);
    }
    catch (e) {
        console.warn(`Skipping invalid URL '${href}' on page ${pageUrl}:`, e instanceof Error ? e.message : e);
        return null;
    }
    // Only http(s) is crawlable; this also rejects mailto:, tel:, javascript:, data:, ...
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        return null;
    }
    // Drop the fragment so the same logical page is not crawled once per anchor.
    parsed.hash = '';
    return parsed.href;
}

/**
 * Crawls pages breadth-first starting at `url` and converts each HTML page to Markdown.
 *
 * The start page is always fetched. Links found on fetched pages are followed
 * only when their absolute URL matches one of the glob patterns in
 * `additionalGlobalUrls`; with no patterns, only the start page is processed.
 * Pages that fail to load, return a non-2xx status, or are not `text/html`
 * are skipped with a log message instead of aborting the crawl.
 *
 * @param {string} url - URL of the first page to fetch.
 * @param {string[]} [additionalGlobalUrls=[]] - minimatch glob patterns that
 *   discovered links must match to be enqueued.
 * @returns {Promise<Array<{title: *, url: string, markdown: string, html: string, at: number}>>}
 *   One entry per converted page, in crawl order. `url` is the URL that was
 *   requested and `at` is the `Date.now()` timestamp taken when the page was stored.
 */
async function crawl(url, additionalGlobalUrls = []) {
    const pages = [];
    const visitedUrls = new Set();
    const urlsToVisit = [url];
    const urlsToVisitSet = new Set(urlsToVisit); // Mirrors the queue for O(1) membership checks
    // Custom user agent to mimic a browser
    const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36';
    console.log('additionalGlobalUrls:', additionalGlobalUrls);
    while (urlsToVisit.length > 0) {
        const currentUrl = urlsToVisit.shift();
        if (!currentUrl || visitedUrls.has(currentUrl)) {
            continue;
        }
        visitedUrls.add(currentUrl);
        try {
            console.info(`Fetching ${currentUrl}`);
            const response = await fetch(currentUrl, {
                headers: {
                    'User-Agent': userAgent
                }
            });
            if (!response.ok) {
                console.error(`Got ${response.status} for ${currentUrl}`);
                // Discard the unread body so the underlying connection is released.
                await response.body?.cancel();
                continue;
            }
            const contentType = response.headers.get('content-type');
            if (!contentType?.includes('text/html')) {
                console.warn(`Skipping ${currentUrl}: content-type '${contentType}' is not text/html`);
                await response.body?.cancel();
                continue;
            }
            const html = await response.text();
            // After a redirect, relative links belong to the final URL, not the requested one.
            const pageUrl = response.url || currentUrl;
            visitedUrls.add(pageUrl);
            const [title, rawMarkdown] = await (0, clipper_js_1.extract_from_html)(html);
            const resolvedMarkdown = resolveRelativeUrls(rawMarkdown, pageUrl);
            pages.push({
                title,
                url: currentUrl,
                markdown: resolvedMarkdown,
                html,
                at: Date.now()
            });
            // Without patterns nothing can be enqueued, so skip parsing the DOM.
            if (additionalGlobalUrls.length === 0) {
                continue;
            }
            const $ = cheerio.load(html);
            $('a').each((_, element) => {
                const fullUrl = toCrawlableUrl($(element).attr('href'), pageUrl);
                if (!fullUrl) {
                    return;
                }
                // Skip URLs that were already fetched or are already queued.
                if (visitedUrls.has(fullUrl) || urlsToVisitSet.has(fullUrl)) {
                    return;
                }
                // The first matching pattern wins; later patterns are not evaluated.
                const pattern = additionalGlobalUrls.find((candidate) => (0, minimatch_1.minimatch)(fullUrl, candidate));
                if (pattern === undefined) {
                    return;
                }
                console.log(`Enqueuing URL: ${fullUrl} (matched pattern: ${pattern})`);
                urlsToVisit.push(fullUrl);
                urlsToVisitSet.add(fullUrl);
            });
        }
        catch (error) {
            // A failure on one page (network error, extraction error) must not stop the crawl.
            console.error(`Error processing ${currentUrl}:`, error);
        }
    }
    return pages;
}