UNPKG

openai-code

Version:

An unofficial proxy layer that lets you use Anthropic Claude Code with any OpenAI API backend.

87 lines (74 loc) 3.34 kB
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { sendOpenAIRequest } from './openai.mjs';

// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// User agents to pick from at random so requests look like they come from
// ordinary desktop browsers. Defined once at module level rather than per call.
const USER_AGENTS = Object.freeze([
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
]);

// Matches <script>...</script> and <style>...</style> elements, including their contents.
const SCRIPT_AND_STYLE_PATTERN = /<(script|style)(\s[^>]*?)?>[\s\S]*?<\/\1>/gi;

// Resolves after a random pause of 1000-3999 ms, used to simulate human pacing.
const randomDelay = () =>
  new Promise((resolve) => {
    setTimeout(resolve, Math.floor(Math.random() * 3000) + 1000);
  });

/**
 * Converts HTML into Markdown by asking the 'gpt-4o-mini' model (via
 * `sendOpenAIRequest`) to perform the conversion.
 *
 * @param {string} html - The HTML content to convert.
 * @returns {Promise<string>} The Markdown text taken from the first completion
 *   choice. Empty or whitespace-only input yields '' without calling the model.
 * @throws Rethrows any error raised while sending the request or reading the
 *   response, after logging it.
 */
export async function markdownify(html) {
  // Nothing to convert: skip the request instead of sending the model an
  // empty prompt (scrape passes '' when its selector matches no element).
  if (typeof html === 'string' && html.trim() === '') {
    return '';
  }

  try {
    const completion = await sendOpenAIRequest({
      model: 'gpt-4o-mini',
      messages: [
        {
          role: "system",
          content: "Convert the following HTML into copyable Markdown:",
        },
        {
          role: "user",
          content: html,
        },
      ],
    });
    return completion.choices[0].message.content;
  } catch (e) {
    console.error("Error converting HTML to Markdown:", e);
    throw e;
  }
}

/**
 * Loads a page in a headless browser, extracts the inner HTML of the first
 * element matching `selector`, strips <script>/<style> elements from it and
 * converts the result to Markdown.
 *
 * @param {string} url - The address of the page to scrape.
 * @param {string} [selector="body"] - CSS selector of the element to extract.
 * @returns {Promise<{html: string, markdown: string}>} The cleaned HTML and its
 *   Markdown conversion. Both are '' when no element matches `selector`.
 * @throws Rethrows any error raised while browsing or converting, after
 *   logging it.
 */
export const scrape = async (url, selector = "body") => {
  // Launch a headless browser instance using puppeteer.
  const browser = await puppeteer.launch({
    headless: 'new',
  });

  try {
    const page = await browser.newPage();

    // --- Anti-scraping measures start ---
    // Use a randomly chosen user agent to mimic a regular browser.
    const randomUserAgent = USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
    await page.setUserAgent(randomUserAgent);

    // Set extra HTTP headers to mimic real browser requests.
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });

    // Pause for a random interval before navigating.
    await randomDelay();
    // --- Anti-scraping measures end ---

    // Navigate and wait until the network is idle so the full HTML is loaded.
    await page.goto(url, { waitUntil: 'networkidle0', timeout: 60000 });

    // Runs inside the page: grab the inner HTML of the first matching element,
    // or '' when nothing matches the selector.
    const htmlContent = await page.evaluate((selector) => {
      const element = document.querySelector(selector);
      return element ? element.innerHTML : '';
    }, selector);

    // Remove <script> and <style> elements from the HTML content.
    const cleanedHtml = htmlContent.replace(SCRIPT_AND_STYLE_PATTERN, '');

    // Convert the cleaned HTML content to Markdown.
    const markdownContent = await markdownify(cleanedHtml);

    return {
      html: cleanedHtml,
      markdown: markdownContent,
    };
  } catch (error) {
    console.error('Error in scrape:', error);
    throw error;
  } finally {
    // Always close the browser to free up resources.
    await browser.close();
  }
};