UNPKG

@mintlify/scraping

Version:

Scrape documentation frameworks to Mintlify docs

136 lines (120 loc) 3.8 kB
import type { Root as HastRoot, Element } from 'hast';
import { join } from 'node:path';
import { Browser } from 'puppeteer';
import { EXIT, visit } from 'unist-util-visit';

import { framework } from '../utils/detectFramework.js';
import { getErrorMessage } from '../utils/errors.js';
import { downloadImage } from '../utils/images.js';
import { log } from '../utils/log.js';
import { fetchPageHtml } from '../utils/network.js';
import { htmlToHast } from './root.js';

/**
 * Tells whether an element's `className` property is an array that contains
 * the given class name.
 */
function hasClass(node: Element, className: string): boolean {
  const classes = node.properties.className;
  return Array.isArray(classes) && classes.includes(className);
}

/**
 * Collects every `<img>` element carrying the `rm-Logo-img` class.
 *
 * @returns The matching elements, or `undefined` when there are none.
 */
function findReadmeLogoNodes(root: HastRoot): Array<Element> | undefined {
  const elements: Array<Element> = [];
  visit(root, 'element', (node) => {
    if (node.tagName === 'img' && hasClass(node, 'rm-Logo-img')) {
      elements.push(node);
    }
  });
  return elements.length ? elements : undefined;
}

/**
 * Collects every `<img>` element whose `alt` property is exactly `Logo`.
 *
 * @returns The matching elements, or `undefined` when there are none.
 */
function findGitBookLogoNodes(root: HastRoot): Array<Element> | undefined {
  const elements: Array<Element> = [];
  visit(root, 'element', (node) => {
    if (node.tagName === 'img' && node.properties.alt === 'Logo') {
      elements.push(node);
    }
  });
  return elements.length ? elements : undefined;
}

/**
 * Collects the `<img>` elements nested inside the first `<div>` that has the
 * `navbar__brand` class. Traversal stops after that first match.
 *
 * @returns The matching elements, or `undefined` when there are none.
 */
function findDocusaurusLogoNodes(root: HastRoot): Array<Element> | undefined {
  const elements: Array<Element> = [];
  visit(root, 'element', (node) => {
    if (node.tagName !== 'div' || !hasClass(node, 'navbar__brand')) {
      return undefined;
    }
    visit(node, 'element', (subNode) => {
      if (subNode.tagName === 'img') elements.push(subNode);
    });
    // Only the first brand container is inspected.
    return EXIT;
  });
  return elements.length ? elements : undefined;
}

/**
 * Parses `html`, locates logo `<img>` elements with `downloadFn`, downloads
 * each one into `<cwd>/images`, and appends the resulting paths to
 * `filepaths`.
 *
 * Failed downloads and images without a usable `src` contribute nothing, so
 * `filepaths` never receives empty entries.
 *
 * @param html - Raw HTML of the page to inspect.
 * @param downloadFn - Finder that returns the logo elements of the parsed
 *   tree (despite its name it only locates nodes; it does not download).
 * @param filepaths - Output array, mutated in place.
 */
async function findLogosFromHtml(
  html: string,
  downloadFn: (root: HastRoot) => Array<Element> | undefined,
  filepaths: Array<string>
): Promise<void> {
  const imgNodes = downloadFn(htmlToHast(html));
  if (!imgNodes) return;

  const imagesDir = join(process.cwd(), 'images');
  const downloaded = await Promise.all(
    imgNodes.map(async (node) => {
      const src = node.properties.src;
      // An <img> without a string `src` has nothing to download.
      if (typeof src !== 'string' || src === '') return '';

      const res = await downloadImage(src, imagesDir);
      // `res.data[1]` is used as the saved file path (per the original code).
      return res.success && res.data ? res.data[1] : '';
    })
  );

  // Filter instead of splicing while iterating: splicing inside `forEach`
  // skips the element that follows each removal and leaves '' entries behind.
  filepaths.push(
    ...downloaded.filter(
      (filepath): filepath is string =>
        typeof filepath === 'string' && filepath !== ''
    )
  );
}

/**
 * Downloads the logo image(s) referenced by the documentation page at `url`.
 *
 * With a `browser`, the page is rendered, its HTML captured, the
 * `.rm-ThemeToggle` element clicked, and the HTML captured again; both
 * captures are searched with the ReadMe finder. Without a browser the page
 * is fetched directly and searched with the GitBook or Docusaurus finder,
 * chosen by `framework.vendor`.
 *
 * @param url - Page to inspect.
 * @param browser - Puppeteer browser used for rendering, if available.
 * @returns A single path when one distinct logo was saved, a
 *   `{ light, dark }` pair when at least two were saved (first and second
 *   distinct paths, in capture order), or `undefined` when none were.
 * @throws TypeError if `url` cannot be parsed as a URL.
 */
export async function downloadLogos(
  url: string | URL,
  browser: Browser | undefined
): Promise<string | { light: string; dark: string } | undefined> {
  const pageUrl = new URL(url);
  const filepaths: Array<string> = [];

  if (browser) {
    const htmls: Array<string> = [];
    try {
      const page = await browser.newPage();
      try {
        await page.goto(pageUrl.toString(), {
          waitUntil: 'networkidle2',
        });
        htmls.push(await page.content());

        try {
          await page.click('.rm-ThemeToggle');
          htmls.push(await page.content());
        } catch {
          // Best effort: the page may have no theme toggle, in which case
          // only the first capture is used.
        }
      } finally {
        // Always release the page opened above; a failure to close must not
        // mask the result.
        await page.close().catch(() => undefined);
      }
    } catch (error) {
      const errorMessage = getErrorMessage(error);
      log(`Failed to retrieve logo from HTML: ${errorMessage}`);
    }

    // Process captures one after another so `filepaths` keeps capture order
    // (first capture first); running them concurrently let download timing
    // decide which logo ended up as `light` and which as `dark`.
    for (const html of htmls) {
      await findLogosFromHtml(html, findReadmeLogoNodes, filepaths);
    }
  } else {
    try {
      const html = await fetchPageHtml(pageUrl);
      await findLogosFromHtml(
        html,
        framework.vendor === 'gitbook'
          ? findGitBookLogoNodes
          : findDocusaurusLogoNodes,
        filepaths
      );
    } catch (error) {
      const errorMessage = getErrorMessage(error);
      log(`Failed to retrieve logo from HTML: ${errorMessage}`);
    }
  }

  const [light, dark] = [...new Set(filepaths)];
  if (light === undefined) return undefined;
  return dark === undefined ? light : { light, dark };
}