@mintlify/scraping
Version:
Scrape documentation frameworks to Mintlify docs
100 lines (87 loc) • 3.18 kB
text/typescript
import type { Colors, Navigation, NavigationEntry } from '@mintlify/models';
import { MintConfig, Tab } from '@mintlify/models';
import { retrieveTabLinks } from '../tabs/retrieve.js';
import type { Result } from '../types/result.js';
import { detectFramework, framework } from '../utils/detectFramework.js';
import { getErrorMessage } from '../utils/errors.js';
import { log } from '../utils/log.js';
import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
import { getTitleFromLink } from '../utils/title.js';
import { defaultColors } from './color.js';
import { downloadLogos } from './logo.js';
import { htmlToHast } from './root.js';
import { scrapeSite } from './site.js';
import { downloadTitle } from './title.js';
/**
 * Scrapes a documentation site and, when the detected framework is one of
 * `readme`, `docusaurus` or `gitbook`, scrapes every tab link found on the
 * page and merges the per-tab results into one config.
 *
 * For any other framework, or when no tab links are found (or the only tab
 * link is the current page), this delegates to a plain `scrapeSite` call on
 * the given page.
 *
 * Tabs that fail to fetch or scrape are logged and skipped; they do not fail
 * the overall result.
 *
 * @param html - HTML of the page at `url`.
 * @param url - Address of the page; only its pathname is swapped per tab.
 * @returns The result of `scrapeSite` on the single-page path, or a
 *   successful result holding the merged config on the multi-tab path.
 */
export async function scrapeAllSiteTabs(
  html: string,
  url: string | URL
): Promise<Result<MintConfig>> {
  const hast = htmlToHast(html);
  // Work on a local URL object instead of reassigning the parameter, so the
  // type stays `URL` inside the callbacks below.
  const siteUrl = new URL(url);
  detectFramework(hast);

  const supportsTabs =
    framework.vendor === 'readme' ||
    framework.vendor === 'docusaurus' ||
    framework.vendor === 'gitbook';
  if (!supportsTabs) return scrapeSite(html, siteUrl, { hast });

  const links = retrieveTabLinks(hast, siteUrl);
  const onlyTabIsCurrentPage =
    !!links && links.length === 1 && links[0]?.url === siteUrl.pathname;
  if (!links || links.length === 0 || onlyTabIsCurrentPage) {
    return scrapeSite(html, siteUrl, { hast });
  }

  // Make sure the page we were given is scraped too when none of the tab
  // links is a prefix of its path. A new array is built rather than mutating
  // the one returned by retrieveTabLinks.
  const currentPageCovered = links.some((link) => siteUrl.pathname.startsWith(link.url));
  const tabLinks = currentPageCovered
    ? links
    : [...links, { name: getTitleFromLink(siteUrl.pathname), url: siteUrl.pathname }];

  const results = await Promise.all(
    tabLinks.map(async (tabEntry) => {
      const tabUrl = new URL(siteUrl);
      tabUrl.pathname = tabEntry.url;
      try {
        const tabHtml = await fetchPageHtml(tabUrl, undefined);
        return await scrapeSite(tabHtml, tabUrl, { tabs: [tabEntry] });
      } catch (error) {
        // Convert a thrown error into a failed result so one broken tab does
        // not reject the whole Promise.all.
        return { success: false as const, message: getErrorMessage(error) };
      }
    })
  );

  const navigations: Array<NavigationEntry> = [];
  const tabs: Array<Tab> = [];
  let favicon = '/favicon.svg';
  let colors: Colors = defaultColors;

  for (const result of results) {
    if (!result.success) {
      log('Failed to scrape tab: ' + result.message);
      continue;
    }
    if (!result.data) continue;

    navigations.push(...result.data.navigation);
    if (result.data.tabs) tabs.push(...result.data.tabs);
    // Later tabs win: any non-default favicon/colors replaces the current one.
    if (result.data.favicon !== '/favicon.svg') favicon = result.data.favicon;
    if (result.data.colors !== defaultColors) colors = result.data.colors;
  }

  // Read the vendor here, after the tab scrapes, exactly as before; a browser
  // is started only for the 'readme' vendor.
  const needsBrowser = framework.vendor === 'readme';
  const browser = needsBrowser ? await startPuppeteer() : undefined;

  let logo: Awaited<ReturnType<typeof downloadLogos>>;
  let name: Awaited<ReturnType<typeof downloadTitle>>;
  try {
    logo = await downloadLogos(siteUrl, browser);
    name = await downloadTitle(hast);
  } finally {
    // Always release the browser, even when a download throws.
    if (browser) await browser.close();
  }

  return {
    success: true,
    data: {
      $schema: 'https://mintlify.com/schema.json',
      name,
      logo,
      colors,
      favicon,
      navigation: navigations as Navigation,
      tabs,
    },
  };
}