generator-begcode
Version:
Spring Boot + Angular/React/Vue in one handy generator
107 lines (106 loc) • 3.9 kB
JavaScript
import { load } from 'cheerio';
import axios from 'axios';
import { AgentOutputType, ChatMessageBuilder, trimText } from '../agent-core/index.js';
import { AgentFunctionBase, fetchHTML } from './utils/index.js';
import { FUNCTION_CALL_FAILED, FUNCTION_CALL_SUCCESS_CONTENT } from '../agents/Scripter/utils.js';
/**
 * Agent function that opens a web page and returns the text found in its HTML.
 *
 * When a global `window` object exists the work is delegated to the
 * `/api/process-web-page` endpoint; otherwise the page is fetched and parsed
 * locally with cheerio.
 */
export class ScrapeTextFunction extends AgentFunctionBase {
  /** Human-readable description of the function. */
  get description() {
    return "This is a naive function that opens a web page and extracts all text present in the HTML. Due to its broad approach, it may retrieve a large amount of irrelevant or extraneous data. It's recommended to use this function as a last resort when more precise methods fail or are unavailable";
  }

  name = 'web_scrapeText';

  parameters = {
    type: 'object',
    properties: {
      url: {
        type: 'string',
      },
    },
    required: ['url'],
    additionalProperties: false,
  };

  /**
   * Builds the async executor for this function.
   *
   * @param {*} _ - Unused.
   * @returns {(params: {url: string}, rawParams: *) => Promise<object>}
   *   Executor resolving to the structure produced by `onSuccess` or `onError`.
   *   Failures raised while scraping are reported through `onError`.
   */
  buildExecutor(_) {
    return async (params, rawParams) => {
      try {
        if (this.#hasPdfExtension(params.url)) {
          return this.onError(params, 'Scrape text function can not scrape pdf files', rawParams);
        }
        let response;
        if (typeof window === 'object') {
          // The target URL travels as a query-string value, so it has to be
          // percent-encoded; otherwise any '&', '#', '+' or space inside it
          // would truncate or corrupt the `url` parameter.
          const endpoint = `/api/process-web-page?url=${encodeURIComponent(params.url)}`;
          const result = await axios.get(endpoint);
          response = result.data.text;
        } else {
          response = await this.processWebpage(params.url);
        }
        return this.onSuccess(params, JSON.stringify(response), rawParams);
      } catch (err) {
        // String() also copes with null/undefined rejection values, on which
        // calling `.toString()` would itself throw.
        return this.onError(params, String(err), rawParams);
      }
    };
  }

  /**
   * Tells whether a URL points at a PDF file.
   *
   * Keeps the plain suffix check on the whole URL and additionally inspects
   * the URL path, so upper-case extensions and URLs carrying a query string
   * or fragment are recognised as well.
   *
   * @param {string} url - URL to inspect.
   * @returns {boolean} `true` when the URL looks like a PDF document.
   */
  #hasPdfExtension(url) {
    if (url.toLowerCase().endsWith('.pdf')) {
      return true;
    }
    try {
      return new URL(url).pathname.toLowerCase().endsWith('.pdf');
    } catch {
      // Not an absolute URL: only the suffix check above applies.
      return false;
    }
  }

  /**
   * Fetches a page and collects the sanitized text of every element.
   *
   * Each element contributes its full text content, so text nested inside
   * several elements appears once per ancestor.
   *
   * @param {string} url - Page to fetch.
   * @returns {Promise<string[]>} Sanitized, non-empty text of each element.
   */
  async processWebpage(url) {
    const response = await fetchHTML(url);
    const $ = load(response.data);
    // Drop elements whose content is not meant to be read as page text.
    $('script, style, noscript, link, head').remove();
    const texts = [];
    $('*').each((_, element) => {
      const text = $(element).text().trim();
      if (text) {
        texts.push(this.sanitize(text));
      }
    });
    return texts;
  }

  /**
   * Normalizes whitespace in extracted text.
   *
   * @param {string} html - Text to clean.
   * @returns {string} Text without tabs, with newlines turned into spaces,
   *   runs of spaces collapsed to one, and surrounding whitespace trimmed.
   */
  sanitize(html) {
    return html
      .replaceAll('\t', '')
      // NOTE(review): matches two literal backslashes followed by 't' -
      // presumably double-escaped tabs; confirm against real input.
      .replaceAll('\\\\t', '')
      .replaceAll('\n', ' ')
      // NOTE(review): two literal backslashes followed by 'n' become a real
      // newline - presumably double-escaped newlines; confirm.
      .replaceAll('\\\\n', '\n')
      // Removes every space that is followed by another space.
      .replace(/ +(?= )/g, '')
      .trim();
  }

  /**
   * Builds the result returned for a successful scrape.
   *
   * @param {{url: string}} params - Parsed call parameters.
   * @param {string} result - JSON-serialized scraped text.
   * @param {*} rawParams - Raw parameters, echoed in the function-call message.
   * @returns {{outputs: object[], messages: object[]}} Outputs and chat messages.
   */
  onSuccess(params, result, rawParams) {
    const title = `Scrape text from '${params.url}'`;
    const separator = '--------------';
    const report = `Found the following text in '${params.url}':\n${separator}\n${result}\n\n${separator}\n`;
    return {
      outputs: [
        {
          type: AgentOutputType.Success,
          title,
          content: FUNCTION_CALL_SUCCESS_CONTENT(this.name, params, report),
        },
      ],
      messages: [
        ChatMessageBuilder.functionCall(this.name, rawParams),
        ChatMessageBuilder.functionCallResult(
          this.name,
          `Scrape text from '${params.url}'\`\`\`\n${result}\n\`\`\``,
        ),
      ],
    };
  }

  /**
   * Builds the result returned for a failed scrape.
   *
   * @param {{url: string}} params - Parsed call parameters.
   * @param {string} error - Error description; trimmed to 300 via `trimText`
   *   in the chat message.
   * @param {*} rawParams - Raw parameters, echoed in the function-call message.
   * @returns {{outputs: object[], messages: object[]}} Outputs and chat messages.
   */
  onError(params, error, rawParams) {
    return {
      outputs: [
        {
          type: AgentOutputType.Error,
          title: `Scrape text from '${params.url}'`,
          content: FUNCTION_CALL_FAILED(params, this.name, error),
        },
      ],
      messages: [
        ChatMessageBuilder.functionCall(this.name, rawParams),
        ChatMessageBuilder.functionCallResult(
          this.name,
          `Error scraping text from '${params.url}'\n\`\`\`\n${trimText(error, 300)}\n\`\`\``,
        ),
      ],
    };
  }
}