// openai-code
// Version:
// An unofficial proxy layer that lets you use Anthropic Claude Code with any OpenAI API backend.
// 87 lines (74 loc) • 3.34 kB
// JavaScript
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { sendOpenAIRequest } from './openai.mjs';
// Register the stealth plugin on the shared puppeteer-extra instance at module load,
// so it is already installed when a browser is launched later in this file.
puppeteer.use(StealthPlugin());
/**
 * Converts an HTML fragment into Markdown by asking a chat model to do it.
 *
 * The HTML is sent as the user message to the 'gpt-4o-mini' model through
 * `sendOpenAIRequest`, preceded by a system prompt that requests the
 * conversion. The text of the first returned choice is the result.
 *
 * @param {string} html - HTML markup to convert.
 * @returns {Promise<string>} The Markdown produced by the model, or an empty
 *   string when `html` is empty or whitespace-only (no request is sent).
 * @throws Rethrows, after logging, any error raised by the request or while
 *   reading `choices[0].message.content` from the response.
 */
export async function markdownify(html) {
  // Nothing to convert: skip the request instead of letting the model
  // invent output for an empty user message.
  if (typeof html === 'string' && html.trim() === '') {
    return '';
  }

  try {
    const completion = await sendOpenAIRequest({
      model: 'gpt-4o-mini',
      messages: [
        {
          role: "system",
          content: "Convert the following HTML into copyable Markdown:",
        },
        {
          role: "user",
          content: html,
        },
      ],
    });
    return completion.choices[0].message.content;
  } catch (e) {
    console.error("Error converting HTML to Markdown:", e);
    throw e;
  }
}
// Desktop browser user-agent strings; one is chosen at random for each scrape
// so requests do not all carry the same identifier.
const SCRAPE_USER_AGENTS = Object.freeze([
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
]);

/**
 * Scrapes a page and turns the selected element's HTML into Markdown.
 *
 * Launches a headless browser, sets a random user agent and an
 * Accept-Language header, waits a random 1-4 seconds, loads `url`, reads the
 * `innerHTML` of the first element matching `selector`, strips <script> and
 * <style> elements from it, and passes the result to `markdownify`.
 *
 * @param {string} url - Address of the page to load.
 * @param {string} [selector="body"] - CSS selector of the element to extract.
 * @returns {Promise<{html: string, markdown: string}>} The cleaned HTML and
 *   its Markdown conversion. Both are empty strings when no element matches
 *   `selector` or the element has no content.
 * @throws Rethrows, after logging, any error raised once the browser is
 *   running (navigation, evaluation, conversion). The browser is closed in
 *   every case.
 */
export const scrape = async (url, selector = "body") => {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();

    // --- Anti-scraping measures start ---
    const userAgent = SCRAPE_USER_AGENTS[Math.floor(Math.random() * SCRAPE_USER_AGENTS.length)];
    await page.setUserAgent(userAgent);
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });
    // Pause for a random 1-4 seconds before navigating to look less automated.
    const delayMs = Math.floor(Math.random() * 3000) + 1000;
    await new Promise((resolve) => setTimeout(resolve, delayMs));
    // --- Anti-scraping measures end ---

    // Wait for the network to go idle so late-loaded markup is present.
    await page.goto(url, { waitUntil: 'networkidle0', timeout: 60000 });

    // This callback is evaluated inside the page, so it must not reference
    // anything from the enclosing scope; the selector is passed as an argument.
    const htmlContent = await page.evaluate((cssSelector) => {
      const element = document.querySelector(cssSelector);
      return element ? element.innerHTML : '';
    }, selector);

    // Drop <script> and <style> elements, including their contents.
    const cleanedHtml = htmlContent.replace(/<(script|style)(\s[^>]*?)?>[\s\S]*?<\/\1>/gi, '');

    // With nothing left to convert, skip the conversion call entirely.
    const markdownContent = cleanedHtml.trim() === '' ? '' : await markdownify(cleanedHtml);

    return {
      html: cleanedHtml,
      markdown: markdownContent,
    };
  } catch (error) {
    console.error('Error in scrape:', error);
    throw error;
  } finally {
    // Always close the browser to free up resources.
    await browser.close();
  }
};