UNPKG

generator-begcode

Version:

Spring Boot + Angular/React/Vue in one handy generator

107 lines (106 loc) 3.9 kB
import { load } from 'cheerio';
import axios from 'axios';
import { AgentOutputType, ChatMessageBuilder, trimText } from '../agent-core/index.js';
import { AgentFunctionBase, fetchHTML } from './utils/index.js';
import { FUNCTION_CALL_FAILED, FUNCTION_CALL_SUCCESS_CONTENT } from '../agents/Scripter/utils.js';

/**
 * Reports whether a URL appears to point at a PDF document.
 *
 * The raw string is checked first (case-insensitively), so every URL that
 * ends in ".pdf" is matched. The parsed pathname is checked as well so that
 * a trailing query string or fragment ("doc.pdf?download=1") does not hide
 * the extension.
 *
 * @param {string} url - URL supplied by the caller.
 * @returns {boolean} true when the URL or its path ends in ".pdf".
 */
function pointsToPdf(url) {
  if (url.toLowerCase().endsWith('.pdf')) {
    return true;
  }
  try {
    return new URL(url).pathname.toLowerCase().endsWith('.pdf');
  } catch {
    // Not an absolute URL; the raw-suffix check above is all we can do.
    return false;
  }
}

/**
 * Agent function `web_scrapeText`: loads a web page and returns the text
 * found in its HTML.
 */
export class ScrapeTextFunction extends AgentFunctionBase {
  /** Human-readable description of this function. */
  get description() {
    return "This is a naive function that opens a web page and extracts all text present in the HTML. Due to its broad approach, it may retrieve a large amount of irrelevant or extraneous data. It's recommended to use this function as a last resort when more precise methods fail or are unavailable";
  }

  name = 'web_scrapeText';

  /** JSON-schema style description of the accepted parameters. */
  parameters = {
    type: 'object',
    properties: {
      url: {
        type: 'string',
      },
    },
    required: ['url'],
    additionalProperties: false,
  };

  /**
   * Builds the async executor that performs the scrape.
   *
   * @param {*} _ - Unused.
   * @returns {(params: {url: string}, rawParams: *) => Promise<object>}
   *   Executor resolving to the object produced by `onSuccess` or `onError`.
   *   Any exception raised while scraping is converted into an `onError`
   *   result instead of propagating.
   */
  buildExecutor(_) {
    return async (params, rawParams) => {
      try {
        if (pointsToPdf(params.url)) {
          return this.onError(params, 'Scrape text function can not scrape pdf files', rawParams);
        }
        let response;
        if (typeof window === 'object') {
          // A `window` global exists: delegate to the HTTP endpoint. The
          // target URL must be percent-encoded, otherwise characters such as
          // `&` or `#` inside it would be parsed as part of the endpoint's
          // own query string and the `url` parameter would be truncated.
          const result = await axios.get(`/api/process-web-page?url=${encodeURIComponent(params.url)}`);
          response = result.data.text;
        } else {
          response = await this.processWebpage(params.url);
        }
        return this.onSuccess(params, JSON.stringify(response), rawParams);
      } catch (err) {
        // String() also copes with a thrown null/undefined, where
        // err.toString() would itself throw.
        return this.onError(params, String(err), rawParams);
      }
    };
  }

  /**
   * Fetches a page and extracts its text.
   *
   * `script`, `style`, `noscript`, `link` and `head` elements are removed
   * first. Every remaining element then contributes its trimmed, sanitized
   * text. Because an element's text includes the text of its descendants,
   * nested content appears once per enclosing element.
   *
   * @param {string} url - Page to fetch.
   * @returns {Promise<string[]>} Sanitized, non-empty text of each element.
   */
  async processWebpage(url) {
    const response = await fetchHTML(url);
    const html = response.data;
    const $ = load(html);
    $('script').remove();
    $('style').remove();
    $('noscript').remove();
    $('link').remove();
    $('head').remove();
    const texts = [];
    $('*').each((_, element) => {
      const text = $(element).text().trim();
      if (text) {
        const sanitizedText = this.sanitize(text);
        texts.push(sanitizedText);
      }
    });
    return texts;
  }

  /**
   * Normalizes whitespace in extracted text.
   *
   * In order: removes tab characters; removes the literal sequence of two
   * backslashes followed by `t`; turns newlines into spaces; turns the
   * literal sequence of two backslashes followed by `n` into a newline;
   * collapses runs of spaces into a single space; trims both ends.
   *
   * @param {string} html - Text to clean.
   * @returns {string} The cleaned text.
   */
  sanitize(html) {
    return html
      .replaceAll('\t', '')
      .replaceAll('\\\\t', '')
      .replaceAll('\n', ' ')
      .replaceAll('\\\\n', '\n')
      // Drops every space that is directly followed by another space.
      .replace(/ +(?= )/g, '')
      .trim();
  }

  /**
   * Builds the result object for a successful scrape.
   *
   * @param {{url: string}} params - Parsed call parameters.
   * @param {string} result - Scraped text, already serialized to a string.
   * @param {*} rawParams - Raw parameters, echoed into the function-call message.
   * @returns {{outputs: object[], messages: object[]}} One success output plus
   *   the function-call and function-call-result chat messages.
   */
  onSuccess(params, result, rawParams) {
    return {
      outputs: [
        {
          type: AgentOutputType.Success,
          title: `Scrape text from '${params.url}'`,
          content: FUNCTION_CALL_SUCCESS_CONTENT(this.name, params, `Found the following text in '${params.url}': -------------- ${result}\n -------------- `),
        },
      ],
      messages: [
        ChatMessageBuilder.functionCall(this.name, rawParams),
        ChatMessageBuilder.functionCallResult(this.name, `Scrape text from '${params.url}'\`\`\` ${result}\n\`\`\``),
      ],
    };
  }

  /**
   * Builds the result object for a failed scrape.
   *
   * @param {{url: string}} params - Parsed call parameters.
   * @param {string} error - Error description; passed through
   *   `trimText(error, 300)` for the chat message.
   * @param {*} rawParams - Raw parameters, echoed into the function-call message.
   * @returns {{outputs: object[], messages: object[]}} One error output plus
   *   the function-call and function-call-result chat messages.
   */
  onError(params, error, rawParams) {
    return {
      outputs: [
        {
          type: AgentOutputType.Error,
          title: `Scrape text from '${params.url}'`,
          content: FUNCTION_CALL_FAILED(params, this.name, error),
        },
      ],
      messages: [
        ChatMessageBuilder.functionCall(this.name, rawParams),
        ChatMessageBuilder.functionCallResult(this.name, `Error scraping text from '${params.url}'\n\`\`\` ${trimText(error, 300)}\n\`\`\``),
      ],
    };
  }
}