@terranlabs/appflow-langchain
Version:
Use langchain in appflow
294 lines (245 loc) • 10.8 kB
JavaScript
const axios = require('axios');
const cheerio = require('cheerio');
// NOTE(review/security): assigning 0 here is stored as the string '0', which
// makes Node.js skip TLS certificate validation for every HTTPS connection
// opened by this process (not only the requests issued from this module).
// Presumably added so that pages with broken certificates can still be
// crawled — confirm whether a per-request https agent would be sufficient.
process.env['NODE_TLS_REJECT_UNAUTHORIZED'] = 0
const LLMStrategy = require("./llm_strategy");
const PromptTemplate = require('./template');
/**
 * Download a web page and return the text of its <body> with whitespace
 * collapsed.
 *
 * The text is read through cheerio's `innerText` property. Runs of blank lines
 * are reduced to one newline, then every run of two or more whitespace
 * characters is replaced by a single newline.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} Normalised body text, or an empty string when the
 *   response status is not 200 or no text could be read from the document.
 * @throws Propagates whatever `axios.get` rejects with (e.g. network failure).
 */
async function getPageByUrl(url) {
  const response = await axios.get(url);
  if (response.status !== 200) {
    return '';
  }

  const $ = cheerio.load(response.data);
  const rawText = $('body').prop('innerText');
  // A document without readable body text must not abort the caller's crawl
  // with a TypeError on `.replace`; report it as "no text" instead.
  if (typeof rawText !== 'string') {
    return '';
  }

  return rawText
    .replace(/\n\n+/g, '\n')
    .replace(/\s\s+/g, '\n');
}
/**
 * Pull the host name out of a URL-like string.
 *
 * An optional `http://` / `https://` scheme, an optional `user@` part and an
 * optional leading `www.` are skipped; the host is everything after that up to
 * the first `:`, `/`, `?` or newline.
 *
 * @param {string} url - URL or bare host to inspect.
 * @returns {string|undefined} The captured host, or `undefined` when nothing
 *   matches (e.g. for an empty string).
 */
function getDomain(url) {
  const hostPattern = /^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)/igm;
  // Only the first match is ever used, so one exec on a fresh regex suffices.
  const firstMatch = hostPattern.exec(url);
  if (firstMatch === null) {
    return undefined;
  }
  return firstMatch[1];
}
/**
 * Delete phone/fax-number-like sequences from a text.
 *
 * A sequence is 7 to 30 characters drawn from digits and `+ . ( ) / \ -`, each
 * optionally surrounded by a single whitespace character. An optional leading
 * `f`/`F` (optionally followed by `ax` in any case, then an optional `-` or
 * `:`) is removed together with the number.
 *
 * @param {string} content - Text to clean.
 * @returns {string} The text with every matching sequence removed.
 */
function removeTelFax(content) {
  const numberLikePattern = /(?:(f|F)((a|A)(X|x))?(?:\s?(\-|\:)?\s?))(?:\s?[0-9\+\.\(\)\/\\\-]\/?\\?\s?){7,30}|(?:\s?[0-9\+\.\(\)\/\\\-]\/?\\?\s?){7,30}/g;
  return content.replace(numberLikePattern, '');
}
/**
 * Delete CSS-rule-like fragments from a text.
 *
 * A fragment is a non-whitespace token followed by a `{ ... }` group that
 * contains no `}`; consecutive fragments (with optional whitespace between
 * them) are removed as one match.
 *
 * @param {string} content - Text to clean.
 * @returns {string} The text without the matched fragments.
 */
function removeCssCode(content) {
  const cssRulePattern = /(?:\s*\S+\s*{[^}]*})+/g;
  return content.replace(cssRulePattern, '');
}
/**
 * Remove the text matched by the pattern below from `content`.
 *
 * The pattern has two alternatives:
 *  1. a run of characters that are neither CR/LF nor inside the listed
 *     character class, provided the run is not immediately followed by a
 *     character of that class or by a digit;
 *  2. any single one of the punctuation marks … “ ” ’ " ' –.
 *
 * NOTE(review): as written, the character class contains a bare `-` before
 * `ゟ` and before `一`, so those are literal hyphens rather than range
 * operators. This looks like range start characters were lost at some point
 * (presumably the ranges were meant to cover hiragana and full-width forms) —
 * confirm against the intended code points before relying on it.
 *
 * @param {string} content - Text to clean.
 * @returns {string} The text with every match replaced by the empty string.
 */
function removeNotJapaneseChar(content) {
const regex = /[^、-〿-ゟ゠-ヿ-一-龯\r\n]+(?![、-〿-ゟ゠-ヿ-一-龯\d])|[…“”’\"'–]/g;
// Alternative syntax using RegExp constructor
// const regex = new RegExp('[^、-〿-ゟ゠-ヿ-一-龯\\r\\n]+(?![、-〿-ゟ゠-ヿ-一-龯\\d])|[…“”’\\"'–]', 'g')
const str = content;
// Matches are replaced by the empty string, i.e. deleted.
const subst = ``;
// The substituted value will be contained in the result variable
const result = str.replace(regex, subst);
return result;
}
/**
 * Cut over-long text down to its central window.
 *
 * When `content` is longer than `maxLength`, an equal amount (rounded down at
 * the front) is dropped from both ends so that the middle `maxLength`
 * characters remain. Shorter input is returned unchanged.
 *
 * @param {string} content - Text to trim.
 * @param {number} [maxLength=3600] - Maximum number of characters to keep.
 * @returns {string} `content` itself, or its middle `maxLength` characters.
 */
function paddingTwoEnd(content, maxLength = 3600) {
  if (content.length > maxLength) {
    const start = Math.floor((content.length - maxLength) / 2);
    return content.slice(start, start + maxLength);
  }
  return content;
}
/**
 * Run a query against the Serper search endpoint (Japanese locale: gl=jp,
 * hl=ja) and return the payload with the hits exposed as `results`.
 *
 * The `organic` field of the response is renamed to `results`; every other
 * field of the payload is passed through unchanged.
 *
 * @param {string} query - Search terms.
 * @param {{apiKey?: string}} [configs={}] - Must carry the Serper `apiKey`.
 * @returns {Promise<object>} Response payload where `results` is the former
 *   `organic` array, or an empty array when the payload has none.
 * @throws {Error} When no `apiKey` is supplied; also propagates whatever
 *   `axios.request` rejects with.
 */
async function serperSearch(query, configs = {}) {
  const { apiKey } = configs;
  if (!apiKey) throw new Error("Serper Google Search: no apiKey provided !");

  const response = await axios.request({
    method: 'post',
    maxBodyLength: Infinity,
    url: 'https://google.serper.dev/search',
    headers: {
      'X-API-KEY': apiKey,
      'Content-Type': 'application/json',
    },
    data: JSON.stringify({ q: query, gl: "jp", hl: "ja" }),
  });

  // Callers read `results.length` / `results[0]`, so a payload without
  // `organic` must still yield an array rather than `undefined`.
  const { organic, ...rest } = response.data ?? {};
  return { ...rest, results: organic ?? [] };
}
/**
 * Run a query against the Google Custom Search JSON endpoint (Japanese
 * locale: gl=jp, hl=ja).
 *
 * @param {string} query - Search terms.
 * @param {{cx?: string, apiKey?: string}} [configs={}] - `apiKey` is required;
 *   `cx` selects the search engine and falls back to a built-in id.
 * @returns {Promise<{results: Array<object>}>} The response's `items`, or an
 *   empty array when the response carries none.
 * @throws {Error} When no `apiKey` is supplied; also propagates whatever
 *   `axios.request` rejects with.
 */
async function googleSearch(query, configs = {}) {
  const { cx, apiKey } = configs;
  if (!apiKey) throw new Error("Google Custom Search: no apiKey provided !");

  const defaultCx = "5245d658fce0140c9";
  const response = await axios.request({
    method: "get",
    url: "https://customsearch.googleapis.com/customsearch/v1",
    headers: { 'Content-Type': 'application/json' },
    params: {
      q: query, gl: "jp", hl: "ja",
      cx: cx || defaultCx,
      key: apiKey,
    },
  });

  // Callers read `results.length` / `results[0]`; a response without `items`
  // must therefore map to an empty list instead of `undefined`.
  return { results: response.data?.items ?? [] };
}
/**
 * Look up `query` with the given search function and return the domain of the
 * first hit.
 *
 * The lookup is best-effort: a failing search is logged and reported as "no
 * domain" rather than thrown.
 *
 * @param {string} query - Search terms (typically a company name).
 * @param {Function} [search=googleSearch] - Async function called as
 *   `search(query, {})` that resolves to `{ results: [...] }`, each result
 *   carrying a `url` or `link`. Note that the default `googleSearch` rejects
 *   when it receives no `apiKey`, so callers normally pass a pre-configured
 *   search function.
 * @returns {Promise<string|undefined>} Domain of the first result, or
 *   `undefined` when the search failed, returned nothing, or the first result
 *   has no address.
 */
async function findDomain(query, search = googleSearch) {
  let response;
  try {
    response = await search(query, {});
  } catch (error) {
    console.error(`fn findDomain: ${error.message}`);
    return undefined;
  }

  // Block-scoped on purpose: an undeclared `firstResult` would leak a global
  // and could hand back a stale hit from an earlier call.
  const firstResult = response?.results?.[0];
  if (!firstResult) {
    return undefined;
  }

  const site = firstResult.url || firstResult.link;
  if (!site) {
    return undefined;
  }
  return getDomain(site);
}
/**
 * Build a single-argument search function for the requested strategy.
 *
 * @param {{strategyName: string, strategyConfig: object}} strategy - Name of
 *   the backend ("google" or "serper") and the config object handed to it on
 *   every call. Any other name falls back to `googleSearch`.
 * @returns {(query: string) => Promise<object>} Async function that forwards
 *   `query` and `strategyConfig` to the selected backend.
 */
const getSearch = (strategy) => {
  const backends = {
    "google": googleSearch,
    "serper": serperSearch
  };
  const { strategyName, strategyConfig } = strategy;
  console.log(`Search strategy: ${strategyName}`);

  return async function search(query) {
    const backend = backends[strategyName] || googleSearch;
    return await backend(query, strategyConfig);
  };
}
/**
 * Search for a company's officer/overview pages, crawl the top hits and return
 * their text together with the detected domain.
 *
 * The query is prefixed with a fixed list of Japanese keywords (officer list,
 * company profile, ...). Hits pointing at office documents (pdf/doc/xls/ppt
 * and their x-variants) are skipped and at most five pages are downloaded.
 * The whole operation is best-effort: a failing search or a page that cannot
 * be downloaded is logged and simply contributes no content.
 *
 * @param {string} query - Company name, optionally containing ` site:<domain>`.
 * @param {{strategyName: string, strategyConfig: object}} strategy - Search
 *   backend selection, forwarded to `getSearch`.
 * @returns {Promise<string>} Text of the form
 *   "Domain: <domain>\nContent: <pages joined by blank lines>". The domain is
 *   the text after `site:` when the query has one, else the domain of the
 *   first hit, else empty.
 */
async function customSearch(query, strategy) {
  const MAX_PAGES = 5;
  const SKIPPED_EXTENSIONS = ['.pdf', '.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt'];

  const search = getSearch(strategy);
  const searchQuery = `役員一覧|役員紹介|取締役・監査役|会社案内|会社概要|組合概要|企業情報|代表者|役員 ${query}`;

  let results = [];
  try {
    const response = await search(searchQuery);
    if (Array.isArray(response?.results)) {
      results = response.results;
    }
  } catch (error) {
    // Only HTTP errors carry `response.data.message`; reading it blindly would
    // throw from inside the handler for network/config errors.
    const detail = error?.response?.data?.message;
    console.error(`fn customSearch: ${error?.message}${detail ? ` - ${detail}` : ''}`);
  }

  let domain = '';
  if (searchQuery.includes(' site:')) {
    domain = searchQuery.split('site:')[1];
  } else {
    const firstHit = results[0];
    domain = getDomain(firstHit?.url || firstHit?.link || '') ?? '';
  }

  const urlsToCrawl = results
    .map((hit) => hit?.url || hit?.link)
    .filter((url) => typeof url === 'string'
      && !SKIPPED_EXTENSIONS.some((extension) => url.includes(extension)))
    .slice(0, MAX_PAGES);

  // allSettled keeps the pages that did download when one of them fails.
  const outcomes = await Promise.allSettled(urlsToCrawl.map((url) => getPageByUrl(url)));
  const pages = [];
  outcomes.forEach((outcome, index) => {
    if (outcome.status === 'fulfilled') {
      pages.push(outcome.value);
    } else {
      console.log(`Can't crawl url ${urlsToCrawl[index]}: ${outcome.reason?.message}`);
    }
  });
  const pageContent = pages.join('\n\n');

  return `Domain: ${domain}\nContent: ${pageContent}`;
}
/**
 * Agent that finds a company's web presence through a search backend and asks
 * an LLM to extract board-member information from the crawled pages.
 */
class AgentExtractLeads {
  /**
   * @param {object} [config] - Optional settings.
   * @param {string} [config.llmStrategy="google_gemini"] - Key into the
   *   `LLMStrategy` export selecting the LLM class.
   * @param {object} [config.llmConfig={}] - Passed to the LLM constructor.
   * @param {{strategyName: string, strategyConfig: object}} [config.searchStrategy]
   *   - Search backend; defaults to Serper with a built-in key.
   * @throws {TypeError} When `config.llmStrategy` names no known LLM class.
   */
  constructor(config) {
    const llmStrategyName = config?.llmStrategy || "google_gemini";
    const llmConfig = config?.llmConfig || {};
    // `config` is intentionally not logged: it carries API keys.

    // NOTE(review/security): this fallback embeds a live-looking API key in
    // source. It is kept so existing callers that rely on the default keep
    // working; it should be rotated and supplied through configuration.
    const searchStrategy = config?.searchStrategy || { strategyConfig: { apiKey: "0a24a19060471364107f39d8abc58d7c5633291f" }, strategyName: "serper" };

    const LLMClass = LLMStrategy[llmStrategyName];
    if (typeof LLMClass !== 'function') {
      throw new TypeError(`Unknown llmStrategy: ${llmStrategyName}`);
    }

    this.searchTool = async (query) => await customSearch(query, searchStrategy);
    this.findDomain = async (query) => await findDomain(query, getSearch(searchStrategy));
    this.llm = new LLMClass(llmConfig);
  }

  /**
   * Search and crawl pages for `query`.
   *
   * @param {string} query - Company name.
   * @returns {Promise<{domain: string|undefined, pageContent: string}>} The
   *   domain parsed from the first line ("Domain: ...") of the search tool's
   *   output, and that full output as `pageContent`.
   */
  async search(query) {
    const pageContent = await this.searchTool(query);
    const [firstLine] = pageContent.split('\n');
    const domain = firstLine.split('Domain: ')[1];
    return { domain, pageContent };
  }

  /**
   * Fill `templateContent` with the page text and send it to the LLM.
   *
   * @param {string} pageContent - Text inserted as the template's `context`.
   * @param {string} templateContent - Prompt template source.
   * @returns {Promise<*>} Whatever the LLM's `generate` resolves to.
   */
  async findBods(pageContent, templateContent) {
    const template = new PromptTemplate({ template: templateContent });
    const prompt = template.format({ context: pageContent });
    return await this.llm.generate(prompt);
  }

  /**
   * Entry point of the agent.
   *
   * @param {string} query - Company name.
   * @param {boolean} [findDomainOnly] - When truthy, only the domain lookup
   *   runs and no LLM call is made.
   * @returns {Promise<{output: string|undefined}|{domain: string|undefined, bods: *}>}
   *   `{ output }` in domain-only mode, otherwise `{ domain, bods }`.
   */
  async run(query, findDomainOnly) {
    if (findDomainOnly) {
      const output = await this.findDomain(query);
      return { output };
    }

    const extractionPrompt = `{{context}}\nExtract board members or representative information in the text above. Desire output format is JSON like: {<name>: <position>}. If no board members information is provied, response empty JSON. Provide JSON only.`;
    const { domain, pageContent } = await this.search(query);
    const bods = await this.findBods(pageContent, extractionPrompt);
    return { domain, bods };
  }
}
module.exports = { AgentExtractLeads, customSearch };
// (async () => {
//"<REDACTED_API_KEY>"
//"<REDACTED_API_KEY>"
//"<REDACTED_BEARER_TOKEN>"
// const agent = new AgentExtractLeads({ llmStrategy: 'google_gemini', llmConfig: { api_key: "<LLM_API_KEY>", base_url: "" }, searchStrategy: { strategyConfig: { apiKey: "<SERPER_API_KEY>" }, strategyName: "serper" } });
// agent.run("三田証券株式会社", true)
// })()
// main().then((result) => console.log("finish"))
// findDomain('NAL Vietnam').then(data => console.log(data))
// getPageByUrl('https://corp.toyokeizai.net/who-we-are/outline/').then(x => console.log(x));
// customSearch("SGシステム株式会社").then((data) => { console.log(data.length, data) })
// const agent = new AgentBod()
// agent.run(`代表取締役 天野 岳夫
// 取締役 鈴木 敏之
// 取締役 須田 超一
// 取締役 谷津 裕`)