UNPKG

@plastichub/osr-ai-tools

Version:

CLI and library for LLM tools

125 lines (116 loc) 6.17 kB
import * as path from 'path'
import { sync as write } from '@plastichub/fs/write'
import { RunnableToolFunction } from 'openai/lib/RunnableFunction'
import puppeteer from 'puppeteer'
import TurndownService from 'turndown'
const turndown = new TurndownService()

import { toolLogger } from '../..'
import { IKBotTask } from '../../types'

/** Arguments read by the `browse_page` tool implementation. */
interface BrowsePageParams {
    /** URL of the page to open. */
    url: string
    /** Optional CSS selector; when set, only that element's inner HTML is converted. */
    selector?: string
    /** Optional CSS selector to wait for after navigation, before extracting content. */
    wait_for?: string
}

/** Extracts a printable message from an arbitrary thrown value. */
const errorMessage = (error: unknown): string =>
    error instanceof Error ? error.message : String(error)

/**
 * Builds the web-browsing tool set.
 *
 * The returned array holds a single `browse_page` function tool. When invoked it
 * launches a headless puppeteer browser, opens the requested URL, collects the
 * page HTML (whole body, or the element matched by `selector`), up to 20 absolute
 * links, up to 20 absolute images and the `og:image` meta value, and converts the
 * HTML to markdown with turndown.
 *
 * Side effects: the last result is written below `<target>/tmp` as
 * `web_puppeteer_last.md`, `web_puppeteer_last.html` and `web_puppeteer.json`.
 *
 * @param target  Base directory under which the `tmp` output files are written.
 * @param options Task options, forwarded to `toolLogger`.
 * @returns Array containing the `browse_page` tool definition. The tool never
 *          rejects: failures are reported as `{ success: false, error, url }`.
 */
export const tools = (target: string, options: IKBotTask): Array<any> => {
    const logger = toolLogger(path.parse(__filename).name, options)
    return [
        {
            type: 'function',
            function: {
                name: 'browse_page',
                description: 'Browse a webpage and return its content as markdown, all links, images and pages main image',
                parameters: {
                    type: 'object',
                    properties: {
                        url: {
                            type: 'string',
                            description: 'URL of the webpage to browse'
                        },
                        // Both options below were already honoured by the implementation
                        // but were missing from the schema, so callers could not discover them.
                        selector: {
                            type: 'string',
                            description: 'Optional CSS selector; when given, only the content of the first matching element is returned instead of the whole page body'
                        },
                        wait_for: {
                            type: 'string',
                            description: 'Optional CSS selector to wait for before extracting the page content'
                        }
                    },
                    required: ['url']
                },
                function: async (params: BrowsePageParams) => {
                    try {
                        logger.debug(`Tool::BrowsePage Browsing ${params.url}`);
                        const browser = await puppeteer.launch({
                            headless: true,
                            args: ['--no-sandbox', '--disable-setuid-sandbox']
                        })
                        try {
                            const page = await browser.newPage()
                            logger.debug(`Tool::Web::BrowsePage Opening page ${params.url}`)
                            await page.goto(params.url, { waitUntil: 'networkidle2' })
                            if (params.wait_for) {
                                await page.waitForSelector(params.wait_for)
                            }

                            const pageData = await page.evaluate((selector) => {
                                // Read og:image BEFORE the cleanup below: the cleanup removes
                                // every <meta> element, which would make this lookup always fail.
                                const mainImage =
                                    document.querySelector('meta[property="og:image"]')?.getAttribute('content') ||
                                    document.querySelector('meta[name="og:image"]')?.getAttribute('content')

                                // Strip non-content and hidden elements so they do not end up in the markdown.
                                const elementsToRemove = document.querySelectorAll(
                                    'script, style, link, meta, noscript, iframe, [style*="display:none"],[style*="display: none"], .hidden'
                                )
                                elementsToRemove.forEach(el => el.remove())

                                // Extract all links (limited to 20)
                                const links = Array.from(document.querySelectorAll('a'))
                                    .map(a => ({
                                        text: a.textContent?.trim() || '',
                                        href: a.href
                                    }))
                                    .filter(link => link.href && link.href.startsWith('http'))
                                    .slice(0, 20)

                                // Extract all images (limited to 20)
                                const images = Array.from(document.querySelectorAll('img'))
                                    .map(img => ({
                                        src: img.src,
                                        alt: img.alt || '',
                                        width: img.width,
                                        height: img.height
                                    }))
                                    .filter(img => img.src && img.src.startsWith('http'))
                                    .slice(0, 20)

                                // Content is the selected element's HTML, or the whole body
                                // when no selector was supplied; empty string if nothing matches.
                                let content
                                if (selector) {
                                    const element = document.querySelector(selector)
                                    content = element ? element.innerHTML : ''
                                } else {
                                    const body = document.body
                                    content = body ? body.innerHTML : ''
                                }

                                return {
                                    content,
                                    links,
                                    images,
                                    ogImage: mainImage
                                }
                            }, params.selector)

                            const markdown = turndown.turndown(pageData.content)
                            write(path.join(target, 'tmp', 'web_puppeteer_last.md'), markdown)
                            write(path.join(target, 'tmp', 'web_puppeteer_last.html'), pageData.content)

                            const ret = {
                                success: true,
                                markdown: markdown,
                                links: pageData.links,
                                images: pageData.images,
                                mainImage: pageData.ogImage,
                                url: params.url
                            }
                            write(path.join(target, 'tmp', 'web_puppeteer.json'), ret)
                            return ret
                        } catch (error: unknown) {
                            logger.debug('Error browsing page:', errorMessage(error), error);
                            throw error
                        } finally {
                            // Single teardown point: the browser is closed on success and on failure.
                            await browser.close()
                        }
                    } catch (error: unknown) {
                        const message = errorMessage(error)
                        logger.debug('Error browsing page:', message);
                        return {
                            success: false,
                            error: message,
                            url: params.url
                        }
                    }
                },
                parse: JSON.parse
            }
        } as RunnableToolFunction<any>
    ]
}