UNPKG

@mintlify/scraping

Version:

Scrape documentation frameworks to Mintlify docs

157 lines (141 loc) 5.7 kB
import type { Root as HastRoot } from 'hast';
import type { Root as MdastRoot } from 'mdast';
import remarkGfm from 'remark-gfm';
import remarkMdx from 'remark-mdx';
import remarkStringify from 'remark-stringify';
import { unified } from 'unified';

import { convertHeaderLinksToText } from '../components/link.js';
import { CONTENT_FAILURE_MSG, MDAST_FAILURE_MSG } from '../constants.js';
import {
  createCallout,
  createCard,
  createAccordion,
  createAccordionGroup,
  createFrame,
  createCodeGroup,
  createTabs,
  createCardGroup,
} from '../customComponents/create.js';
import { rehypeToRemarkCustomComponents } from '../customComponents/plugin.js';
import { selectiveRehypeRemark } from '../customComponents/selective.js';
import { retrieveRootContent } from '../root/retrieve.js';
import type { Result } from '../types/result.js';
import { unifiedRemoveBreadCrumbs } from '../utils/breadcrumbs.js';
import { unifiedRemoveBreaks } from '../utils/breaks.js';
import { unifiedRemoveClassNames } from '../utils/className.js';
import { unifiedRemoveCopyButtons } from '../utils/copyButton.js';
import { detectFramework, framework } from '../utils/detectFramework.js';
import { remarkRemoveEmptyEmphases } from '../utils/emptyEmphasis.js';
import { unifiedRemoveEmptyParagraphs } from '../utils/emptyParagraphs.js';
import { getErrorMessage, logErrorResults } from '../utils/errors.js';
import { writePage } from '../utils/file.js';
import { remarkProperlyFormatEmphasis } from '../utils/formatEmphasis.js';
import { removeHastComments } from '../utils/hastComments.js';
import { remarkSpaceListsOut } from '../utils/lists.js';
import { log } from '../utils/log.js';
import { remarkRemoveBottomMetadata } from '../utils/metadata.js';
import { unifiedRemoveNestedRoots } from '../utils/nestedRoots.js';
import { unifiedRemovePositions } from '../utils/position.js';
import { removeLeadingSlash, removeTrailingSlash } from '../utils/strings.js';
import { remarkRemoveCodeBlocksInCells } from '../utils/tableCells.js';
import { getDescriptionFromRoot, getTitleFromHeading } from '../utils/title.js';
import { unifiedRemoveTableOfContents } from '../utils/toc.js';
import { remarkRemoveUpdatedAt } from '../utils/updatedAt.js';
import { downloadImagesFromFile } from './images.js';
import { htmlToHast } from './root.js';

/**
 * Scrapes a single page: turns its HTML into an MDX string and hands the
 * result to `writePage`.
 *
 * Flow for a regular page:
 *  1. Parse `html` with `htmlToHast` and strip comments from the tree.
 *  2. Run `detectFramework` when `framework.vendor` is not set yet.
 *  3. Pick the content node with `retrieveRootContent`; bail out with a
 *     failure result if there is none.
 *  4. Run the HAST -> MDAST plugin pipeline synchronously.
 *  5. Download the images referenced by the tree and log per-image errors.
 *  6. Stringify the tree with `remark-mdx`, `remark-gfm` and
 *     `remark-stringify`, collapse runs of three or more newlines into two,
 *     and write the page.
 *
 * @param html - Page markup. When `opts.externalLink` is true this value is
 *   used as the output file name instead (without the `.mdx` extension).
 * @param url - Address of the page; wrapped in `new URL(...)`, so an invalid
 *   string makes the constructor throw.
 * @param opts - `externalLink` short-circuits scraping and only writes a stub
 *   page; `isOverviewPage` forces the title `'Overview'`; `rootPath`, when
 *   set, is resolved against the page origin and used as the write target.
 * @returns On success for an external link: `[url, filename]`. On success
 *   with `rootPath`: the page pathname (run through `removeTrailingSlash` and
 *   `removeLeadingSlash`) paired with `rootPath`. On success without
 *   `rootPath`: `data` is `undefined`. On failure: a message prefixed with
 *   the page URL and `data` of `[url, '']`.
 * @throws Re-throws whatever `downloadImagesFromFile` / `logErrorResults`
 *   throw, after logging it.
 */
export async function scrapePage(
  html: string,
  url: string | URL,
  opts: {
    externalLink: boolean;
    isOverviewPage?: boolean;
    rootPath?: string;
  } = { externalLink: false }
): Promise<Result<[string, string]>> {
  const pageUrl = new URL(url);
  const urlStr = pageUrl.toString();

  if (opts.externalLink) {
    // In this mode `html` carries the file name, not markup. The title,
    // description and content slots are left empty.
    writePage(`${html}.mdx`, '', '', '', urlStr);
    return { success: true, data: [urlStr, html] };
  }

  const hast = htmlToHast(html);
  removeHastComments(hast);

  if (!framework.vendor) {
    detectFramework(hast);
  }

  const content = retrieveRootContent(hast);
  if (!content) {
    return {
      success: false,
      message: `${urlStr}: ${CONTENT_FAILURE_MSG}`,
      data: [urlStr, ''],
    };
  }

  const contentAsRoot: HastRoot = { type: 'root', children: [content] };

  // The plugin order below is significant; do not reorder.
  const mdastTree: MdastRoot = unified()
    .use(unifiedRemoveBreaks)
    .use(unifiedRemoveBreadCrumbs)
    .use(unifiedRemoveTableOfContents)
    .use(unifiedRemoveCopyButtons)
    .use(createCard)
    .use(createAccordion)
    .use(createFrame)
    .use(createCallout)
    .use(createCardGroup)
    .use(createAccordionGroup)
    .use(createCodeGroup)
    .use(createTabs)
    .use(unifiedRemoveClassNames)
    .use(unifiedRemoveEmptyParagraphs)
    .use(unifiedRemovePositions)
    .use(selectiveRehypeRemark)
    // Cleans up any nested components left untouched
    // by `selectiveRehypeRemark`, and converts them to
    // MDX compatible components
    .use(rehypeToRemarkCustomComponents)
    .use(convertHeaderLinksToText)
    .use(unifiedRemoveNestedRoots)
    .use(remarkSpaceListsOut)
    .use(remarkRemoveBottomMetadata)
    .use(remarkRemoveUpdatedAt)
    .use(remarkRemoveEmptyEmphases)
    .use(remarkProperlyFormatEmphasis)
    .use(remarkRemoveCodeBlocksInCells)
    // @ts-expect-error moving some of the pipeline around results in contentAsRoot being treated differently than its type which is Root Element
    .runSync(contentAsRoot) as MdastRoot;

  try {
    const downloads = await downloadImagesFromFile(mdastTree, pageUrl);
    logErrorResults(`scraping images from ${pageUrl.toString()}`, downloads);
  } catch (error) {
    // Unlike the other failure paths, an image failure is logged and propagated.
    const detail = getErrorMessage(error);
    log(`We encountered an error when scraping the images from ${pageUrl.toString()}${detail}`);
    throw error;
  }

  const title = getTitleFromHeading(mdastTree);
  const description = getDescriptionFromRoot(mdastTree);

  try {
    const serialized = unified()
      .use(remarkMdx)
      .use(remarkGfm)
      .use(remarkStringify)
      .stringify(mdastTree);
    // Collapse any run of 3+ newlines down to a single blank line.
    const mdx = String(serialized).replace(/\n{3,}/g, '\n\n');

    // Decide where the page is written: an explicit root path wins; a URL
    // that equals its own origin (presumably the site root) maps to `home`;
    // anything else keeps the page URL.
    let destination = pageUrl;
    if (opts.rootPath) {
      destination = new URL(opts.rootPath, pageUrl.origin);
    } else if (pageUrl.origin === removeTrailingSlash(pageUrl.toString())) {
      destination = new URL('home', pageUrl.origin);
    }

    const pageTitle = opts.isOverviewPage ? 'Overview' : title;
    writePage(destination, pageTitle, description, mdx);

    if (!opts.rootPath) {
      return { success: true, data: undefined };
    }

    const originalPath = removeLeadingSlash(removeTrailingSlash(new URL(urlStr).pathname));
    return { success: true, data: [originalPath, opts.rootPath] };
  } catch (error) {
    // Stringifying, URL resolution and writing failures are reported as a Result.
    const detail = getErrorMessage(error);
    return {
      success: false,
      message: `${urlStr}: ${MDAST_FAILURE_MSG}${detail}`,
      data: [urlStr, ''],
    };
  }
}