@mintlify/scraping
Version:
Scrape documentation frameworks to Mintlify docs
235 lines (214 loc) • 7.84 kB
text/typescript
import { AsyncAPIDocumentInterface, exponentialBackoff, validateAsyncApi } from '@mintlify/common';
import yaml from 'js-yaml';
import { OpenAPI } from 'openapi-types';
import { Browser, launch } from 'puppeteer';
import { framework } from './detectFramework.js';
import { getErrorMessage } from './errors.js';
import { log } from './log.js';
const userAgents = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
] as const;
const headers = {
'Accept-Language': 'en-US,en;q=0.9',
Accept:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
'Accept-Encoding': 'gzip, deflate, br, zstd',
Connection: 'keep-alive',
} as const;
export async function startPuppeteer() {
try {
return await launch({
headless: true,
ignoreHTTPSErrors: true,
});
} catch (error) {
if (error instanceof Error) {
log(`Could not create a browser instance: ${error.message}`);
}
}
}
export async function getHtmlWithPuppeteer(
browser: Browser,
url: string | URL
): Promise<string | undefined> {
try {
const page = await browser.newPage();
await page.setViewport({
width: 3072,
height: 2048,
deviceScaleFactor: 2,
isMobile: false,
hasTouch: false,
isLandscape: true,
});
await page.setExtraHTTPHeaders(headers);
await page.setUserAgent(
userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0]
);
await page.setJavaScriptEnabled(true);
await exponentialBackoff(() =>
page.goto(url.toString(), {
waitUntil: 'networkidle2',
timeout: 30000,
})
);
if (framework.vendor === 'docusaurus') {
await page.evaluate(() => {
document.addEventListener(
'click',
(e) => {
if (e.target instanceof Element && e.target.classList.contains('menu__link--sublist'))
e.preventDefault();
},
true
);
function clickItems(el: HTMLElement | Document) {
const menuItems = el.getElementsByClassName(
'menu__link--sublist'
) as HTMLCollectionOf<HTMLElement>;
for (const item of menuItems) {
item.click();
clickItems(item);
}
}
clickItems(document);
});
}
if (framework.vendor === 'gitbook') {
for (let round = 0; round < 10; round++) {
const clickedCount = await page.evaluate(() => {
const tocEl = document.getElementById('table-of-contents');
if (!tocEl) return 0;
let count = 0;
const items = tocEl.querySelectorAll('li.page-document-item');
items.forEach((li) => {
const btn = li.querySelector(':scope > a button');
if (!btn || !(btn instanceof HTMLElement)) return;
const anchor = btn.closest('a');
if (!anchor) return;
const sibling = anchor.nextElementSibling;
if (
sibling instanceof HTMLElement &&
sibling.style.opacity === '1' &&
sibling.style.height === 'auto'
)
return;
btn.click();
count++;
});
return count;
});
if (clickedCount === 0) break;
await page
.waitForFunction(
() => {
const tocEl = document.getElementById('table-of-contents');
if (!tocEl) return true;
const anchors = tocEl.querySelectorAll('li.page-document-item > a');
return Array.from(anchors).every((a) => {
const btn = a.querySelector('button');
if (!btn) return true;
const sibling = a.nextElementSibling;
if (!sibling || !(sibling instanceof HTMLElement)) return true;
return sibling.style.opacity === '1' && sibling.style.height === 'auto';
});
},
{ timeout: 5000 }
)
.catch(() => {});
}
}
const content = await exponentialBackoff(() => page.content());
await page.close();
return content;
} catch (error) {
const errorMessage = getErrorMessage(error);
throw new Error(`Failed to download page from Puppeteer${errorMessage}`);
}
}
async function fetchPageResponse(url: string | URL): Promise<string> {
try {
const res = await fetch(url);
if (!res.ok) {
throw new Error(`${res.status} ${res.statusText}`);
}
return await res.text();
} catch (error) {
const errorMessage = getErrorMessage(error);
throw new Error(`Failed to fetch page from source${errorMessage}`);
}
}
export async function fetchPageHtml(
url: string | URL,
browser: Browser | undefined = undefined
): Promise<string> {
try {
let res: string | undefined = undefined;
if (browser) {
res = await getHtmlWithPuppeteer(browser, url);
} else {
res = await exponentialBackoff(() => fetchPageResponse(url));
}
if (res) return res;
throw new Error('An unknown error occured.');
} catch (error) {
const errorMessage = getErrorMessage(error);
throw new Error(`Error retrieving HTML for ${url.toString()}${errorMessage}`);
}
}
export async function fetchImage(url: string): Promise<NodeJS.TypedArray> {
try {
const res = await exponentialBackoff(() => fetch(url));
if (!res.ok) {
throw new Error(`${res.status} ${res.statusText}`);
}
const imageBuffer = await res.arrayBuffer();
const imageData = new Uint8Array(imageBuffer);
return imageData;
} catch (error) {
const errorMessage = getErrorMessage(error);
throw new Error(`${url} - failed to retrieve image from source${errorMessage}`);
}
}
export async function fetchOpenApi(url: URL): Promise<OpenAPI.Document> {
try {
const res = await exponentialBackoff(async () => {
const response = await fetch(url);
if (!response.ok) {
throw new Error(`${response.status} ${response.statusText}`);
}
return response;
});
const file = await res.text();
return yaml.load(file) as OpenAPI.Document;
} catch (error) {
const errorMessage = getErrorMessage(error);
throw new Error(`${url} - failed to retrieve OpenAPI file from source${errorMessage}`);
}
}
export async function fetchAsyncApi(url: URL): Promise<AsyncAPIDocumentInterface> {
try {
const res = await exponentialBackoff(async () => {
const response = await fetch(url);
if (!response.ok) {
throw new Error(`${response.status} ${response.statusText}`);
}
return response;
});
const file = await res.text();
const { document, errorMessage } = await validateAsyncApi(file);
if (!document) {
throw new Error(`${url} - this document is not a valid AsyncAPI document - ${errorMessage}`);
}
return document;
} catch (error) {
const errorMessage = getErrorMessage(error);
throw new Error(`${url} - failed to retrieve AsyncAPI file from source - ${errorMessage}`);
}
}