UNPKG

@mintlify/scraping

Version:

Scrape documentation frameworks to Mintlify docs

76 lines (70 loc) 2.67 kB
import { convertStrToTitle } from '@mintlify/common';
import type { Browser } from 'puppeteer';

import { OVERVIEW_PAGE_SLUG } from '../constants.js';
import type { Result } from '../types/result.js';
import { getErrorMessage } from '../utils/errors.js';
import { intoChunks } from '../utils/intoChunks.js';
import { log } from '../utils/log.js';
import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
import { scrapePage } from './page.js';

/**
 * Scrapes every URL of a navigation group, processing the group chunk by chunk
 * (chunks come from `intoChunks`) and all URLs of a chunk concurrently.
 *
 * A failure on a single URL does not abort the group: it is converted into a
 * `{ success: false, ... }` entry carrying the URL and an error message.
 * A failure outside the per-URL handling is logged and the results collected
 * so far are returned.
 *
 * @param navGroup - URLs to scrape.
 * @param needsBrowser - When true, a Puppeteer browser is started up front,
 *   passed to `fetchPageHtml`, and closed before returning.
 * @param opts - `externalLinks`: treat every URL as an external link (no HTML
 *   is fetched; a slug-like title derived from the last path segment is passed
 *   to `scrapePage` instead). `rootPaths`: optional root paths, looked up by
 *   the URL's position in `navGroup`.
 * @returns One result per processed URL, in processing order.
 */
export async function scrapePageGroup(
  navGroup: Array<URL>,
  needsBrowser: boolean,
  opts: {
    externalLinks: boolean;
    rootPaths?: Array<string>;
  } = { externalLinks: false }
): Promise<Array<Result<[string, string]>>> {
  const browser: Browser | undefined = needsBrowser ? await startPuppeteer() : undefined;
  const allResults: Array<Result<[string, string]>> = [];

  try {
    // Position of the current chunk's first URL within `navGroup`.
    // NOTE(review): assumes `intoChunks` yields contiguous, in-order slices of
    // its input — confirm against its implementation.
    let chunkStart = 0;

    for (const chunk of intoChunks(navGroup)) {
      const offset = chunkStart;

      const res = await Promise.all(
        chunk.map(async (url, indexInChunk) => {
          // Index relative to the whole group, not to the chunk, so that
          // `rootPaths` lookups and fallback titles stay correct past the
          // first chunk.
          const index = offset + indexInChunk;
          // The URL actually scraped; may be rewritten for overview pages and
          // is the one reported if scraping fails.
          let pageUrl = url;

          try {
            if (opts.externalLinks) {
              const lastSegment = pageUrl.pathname.split('/').at(-1) ?? pageUrl.pathname;
              const externalLinkTitle = (convertStrToTitle(lastSegment) || `external-link-${index}`)
                .replace(/\s+/g, '-')
                .toLowerCase();

              // `await` here so that a rejection is handled by the catch
              // below instead of rejecting the whole chunk.
              return await scrapePage(externalLinkTitle, pageUrl, { externalLink: true });
            }

            const href = pageUrl.toString();
            const isOverviewPage = href.endsWith(OVERVIEW_PAGE_SLUG);
            if (isOverviewPage) {
              // Strip the trailing slug specifically; a plain `replace` would
              // remove the first occurrence, which may sit earlier in the URL.
              pageUrl = new URL(href.slice(0, href.length - OVERVIEW_PAGE_SLUG.length));
            }

            const html = await fetchPageHtml(pageUrl, browser);
            return await scrapePage(html, pageUrl, {
              externalLink: false,
              isOverviewPage,
              rootPath: opts.rootPaths ? opts.rootPaths[index] : undefined,
            });
          } catch (error) {
            const errorMessage = getErrorMessage(error);
            return {
              success: false,
              message: `We encountered an error when scraping ${pageUrl}${errorMessage}`,
              data: [pageUrl.toString(), ''] as [string, string],
            };
          }
        })
      );

      allResults.push(...res);
      chunkStart += chunk.length;
    }
  } catch (error) {
    // The previous implementation re-threw here, but its `return` inside
    // `finally` always overrode that throw. The effective contract — log the
    // failure and hand back whatever was collected — is kept explicitly.
    const errorMessage = getErrorMessage(error);
    log(
      `We encountered an error when scraping the page group from ${
        navGroup[0]?.origin ?? 'the URL provided'
      }${errorMessage}`
    );
  } finally {
    // Single place where the browser is closed, on success and failure alike.
    if (browser) await browser.close();
  }

  return allResults;
}