UNPKG

@terranlabs/appflow-langchain

Version:

Use langchain in appflow

294 lines (245 loc) 10.8 kB
const axios = require('axios');
const cheerio = require('cheerio');

// SECURITY NOTE(review): this turns off TLS certificate verification for every
// HTTPS request made by the whole Node.js process, not only for this module.
// It is kept because existing callers may rely on crawling hosts with broken
// certificates; prefer a per-request `httpsAgent` and remove this line.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';

const LLMStrategy = require('./llm_strategy');
const PromptTemplate = require('./template');

/** Maximum number of search hits whose pages are downloaded by customSearch. */
const MAX_PAGES_TO_CRAWL = 5;

/** Default window size (in UTF-16 code units) used by paddingTwoEnd. */
const DEFAULT_MAX_CONTENT_LENGTH = 3600;

/** Programmable Search Engine id used when the caller does not pass `cx`. */
const DEFAULT_GOOGLE_CX = '5245d658fce0140c9';

// FIXME(security): credential committed to source. Kept only as a last-resort
// fallback so callers that pass no `searchStrategy` keep working; supply
// `config.searchStrategy` or set SERPER_API_KEY instead, and rotate this key.
const LEGACY_DEFAULT_SERPER_API_KEY = '0a24a19060471364107f39d8abc58d7c5633291f';

/** Matches a URL path that ends in an office-document / PDF extension. */
const DOCUMENT_EXTENSION = /\.(?:pdf|docx?|xlsx?|pptx?)$/i;

/**
 * Download a page and return the text of its <body>, with runs of whitespace
 * collapsed to single newlines.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @returns {Promise<string>} The collapsed body text, or '' when the response
 *   status is not 200 or no body text could be extracted.
 * @throws Whatever axios / cheerio throw (network error, non-2xx status, ...).
 */
async function getPageByUrl(url) {
  const response = await axios.get(url);
  if (response.status !== 200) {
    return '';
  }

  const $ = cheerio.load(response.data);
  // NOTE(review): relies on cheerio emulating the `innerText` property;
  // confirm that the installed cheerio version supports it.
  const rawText = $('body').prop('innerText');
  if (typeof rawText !== 'string') {
    return '';
  }

  return rawText.replace(/\n\n+/g, '\n').replace(/\s\s+/g, '\n');
}

/**
 * Extract the host part of a URL-like string, without scheme, credentials,
 * a leading "www.", port, path or query.
 *
 * @param {string} url - URL or bare host name.
 * @returns {string|undefined} The host, or undefined when nothing matches
 *   (for example for an empty string).
 */
function getDomain(url) {
  const match = /^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)/im.exec(url);
  return match ? match[1] : undefined;
}

/**
 * Remove phone/fax-like sequences: 7 to 30 digit/punctuation characters,
 * optionally introduced by an "F"/"FAX" label.
 *
 * @param {string} content - Text to clean.
 * @returns {string} The text with the matched sequences removed.
 */
function removeTelFax(content) {
  const regex = /(?:(f|F)((a|A)(X|x))?(?:\s?(\-|\:)?\s?))(?:\s?[0-9\+\.\(\)\/\\\-]\/?\\?\s?){7,30}|(?:\s?[0-9\+\.\(\)\/\\\-]\/?\\?\s?){7,30}/g;
  return content.replace(regex, '');
}

/**
 * Remove CSS-rule-like fragments of the form `selector { ... }`.
 *
 * @param {string} content - Text to clean.
 * @returns {string} The text with the matched fragments removed.
 */
function removeCssCode(content) {
  const regex = /(?:\s*\S+\s*{[^}]*})+/g;
  return content.replace(regex, '');
}

/**
 * Remove runs of characters outside the Japanese ranges listed in the pattern
 * (unless the run is directly followed by such a character or a digit), plus a
 * few quote/ellipsis/dash characters.
 *
 * @param {string} content - Text to clean.
 * @returns {string} The filtered text.
 */
function removeNotJapaneseChar(content) {
  const regex = /[^、-〿぀-ゟ゠-ヿ＀-￯一-龯\r\n]+(?![、-〿぀-ゟ゠-ヿ＀-￯一-龯\d])|[…“”’\"'–]/g;
  return content.replace(regex, '');
}

/**
 * Crop `content` to at most `maxLength` characters by keeping the window
 * centred in the text. Despite the name, nothing is padded: equal amounts are
 * cut from both ends.
 *
 * @param {string} content - Text to crop.
 * @param {number} [maxLength=3600] - Maximum length of the returned text.
 * @returns {string} `content` unchanged when it is short enough, otherwise its
 *   central `maxLength` characters.
 */
function paddingTwoEnd(content, maxLength = DEFAULT_MAX_CONTENT_LENGTH) {
  if (content.length <= maxLength) {
    return content;
  }
  const start = Math.floor((content.length - maxLength) / 2);
  return content.slice(start, start + maxLength);
}

/**
 * Run a search through the Serper API (Japanese locale: gl=jp, hl=ja).
 *
 * @param {string} query - Search query.
 * @param {{apiKey?: string}} [configs] - Serper credentials.
 * @returns {Promise<object>} The Serper response body with its `organic` list
 *   exposed as `results` (an empty array when Serper returned none).
 * @throws {Error} When no apiKey is provided, or when the HTTP request fails.
 */
async function serperSearch(query, configs = {}) {
  const { apiKey } = configs;
  if (!apiKey) throw new Error("Serper Google Search: no apiKey provided !");

  const response = await axios.request({
    method: 'post',
    maxBodyLength: Infinity,
    url: 'https://google.serper.dev/search',
    headers: {
      'X-API-KEY': apiKey,
      'Content-Type': 'application/json',
    },
    data: JSON.stringify({ q: query, gl: 'jp', hl: 'ja' }),
  });

  const { organic, ...rest } = response.data ?? {};
  return { ...rest, results: organic ?? [] };
}

/**
 * Run a search through the Google Custom Search JSON API (gl=jp, hl=ja).
 *
 * @param {string} query - Search query.
 * @param {{cx?: string, apiKey?: string}} [configs] - Search engine id and key.
 * @returns {Promise<{results: object[]}>} The returned items (empty array when
 *   the API returned none).
 * @throws {Error} When no apiKey is provided, or when the HTTP request fails.
 */
async function googleSearch(query, configs = {}) {
  const { cx, apiKey } = configs;
  if (!apiKey) throw new Error("Google Custom Search: no apiKey provided !");

  const response = await axios.request({
    method: 'get',
    url: 'https://customsearch.googleapis.com/customsearch/v1',
    headers: {
      'Content-Type': 'application/json',
    },
    params: {
      q: query,
      gl: 'jp',
      hl: 'ja',
      cx: cx || DEFAULT_GOOGLE_CX,
      key: apiKey,
    },
  });

  return { results: response.data?.items ?? [] };
}

/** Search back-ends selectable through `strategy.strategyName`. */
const SEARCH_STRATEGIES = new Map([
  ['google', googleSearch],
  ['serper', serperSearch],
]);

/**
 * Resolve the domain of the first search hit for `query`.
 *
 * @param {string} query - Search query (typically a company name).
 * @param {Function} [search=googleSearch] - Search function; it is called as
 *   `search(query, {})` and must resolve to `{ results: [...] }`.
 * @returns {Promise<string|undefined>} The domain of the first hit, or
 *   undefined when the search failed (the error is logged) or had no hits.
 */
async function findDomain(query, search = googleSearch) {
  let response;
  try {
    response = await search(query, {});
  } catch (error) {
    console.error(`fn findDomain: ${error.message}`);
    return undefined;
  }

  const firstResult = response?.results?.[0];
  const site = firstResult?.url || firstResult?.link;
  return site ? getDomain(site) : undefined;
}

/**
 * Build a one-argument search function bound to a strategy.
 *
 * @param {{strategyName?: string, strategyConfig?: object}} [strategy] -
 *   Back-end name ("google" or "serper") and the config passed to it. Unknown
 *   or missing names fall back to Google Custom Search.
 * @returns {(query: string) => Promise<object>} The bound search function.
 */
const getSearch = (strategy) => {
  const { strategyName, strategyConfig } = strategy ?? {};
  console.log(`Search strategy: ${strategyName}`);
  const searchFn = SEARCH_STRATEGIES.get(strategyName) || googleSearch;
  return async (query) => searchFn(query, strategyConfig);
};

/**
 * Tell whether a URL points at a PDF / Word / Excel / PowerPoint file, judged
 * by the extension of its path only, so hosts such as "docs.example.com" are
 * not mistaken for documents.
 *
 * @param {string} url - URL to inspect.
 * @returns {boolean} True when the path ends in a document extension.
 */
function isDocumentUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: fall back to the text before any query/fragment.
    pathname = String(url).split(/[?#]/)[0];
  }
  return DOCUMENT_EXTENSION.test(pathname);
}

/**
 * Search for company-profile / board-member pages about `query`, download up
 * to five of the hits and return their text together with the domain of the
 * first hit.
 *
 * Failures are best-effort: a failed search yields an empty domain and empty
 * content, and a page that cannot be downloaded is skipped (both are logged).
 *
 * @param {string} query - Company name, optionally containing " site:<domain>".
 * @param {{strategyName?: string, strategyConfig?: object}} [strategy] -
 *   Search back-end selection, see getSearch.
 * @returns {Promise<string>} Text of the form "Domain: <domain>\nContent: <text>".
 */
async function customSearch(query, strategy) {
  const search = getSearch(strategy);
  const searchQuery = `役員一覧|役員紹介|取締役・監査役|会社案内|会社概要|組合概要|企業情報|代表者|役員 ${query}`;

  let results = [];
  try {
    const response = await search(searchQuery);
    results = Array.isArray(response?.results) ? response.results : [];
  } catch (error) {
    // `error.response` only exists for HTTP errors raised by axios.
    const detail = error.response?.data?.message;
    console.error(`fn customSearch: ${error.message}${detail ? ` - ${detail}` : ''}`);
  }

  let domain;
  if (searchQuery.includes(' site:')) {
    // Keep only the token right after "site:", not the rest of the query.
    domain = searchQuery.split('site:')[1].trim().split(/\s+/)[0];
  } else {
    domain = getDomain(results[0]?.url || results[0]?.link || '') ?? '';
  }

  const urls = results
    .map((result) => result?.url || result?.link)
    .filter((url) => typeof url === 'string' && !isDocumentUrl(url))
    .slice(0, MAX_PAGES_TO_CRAWL);

  // allSettled: one unreachable page must not discard the pages that loaded.
  const outcomes = await Promise.allSettled(urls.map((url) => getPageByUrl(url)));
  const pages = [];
  outcomes.forEach((outcome, index) => {
    if (outcome.status === 'fulfilled') {
      pages.push(outcome.value);
    } else {
      console.error(`fn customSearch: can't crawl ${urls[index]}: ${outcome.reason?.message}`);
    }
  });
  const pageContent = pages.join('\n\n');

  // The newline after the domain is load-bearing: AgentExtractLeads.search
  // reads the domain back from the first line of this string.
  return `Domain: ${domain}\nContent: ${pageContent}`;
}

/**
 * Agent that searches the web for a company and asks an LLM to extract its
 * board members from the crawled pages.
 */
class AgentExtractLeads {
  /**
   * @param {object} [config]
   * @param {string} [config.llmStrategy="google_gemini"] - Key of the LLM
   *   class to instantiate from the `./llm_strategy` module.
   * @param {object} [config.llmConfig={}] - Passed to the LLM constructor.
   * @param {{strategyName: string, strategyConfig: object}} [config.searchStrategy] -
   *   Search back-end; defaults to Serper with the key from SERPER_API_KEY
   *   (or the legacy built-in key).
   * @throws {TypeError} When `config.llmStrategy` names no known LLM class.
   */
  constructor(config) {
    // The config is deliberately not logged: it carries API keys.
    const llmStrategyName = config?.llmStrategy || 'google_gemini';
    const llmConfig = config?.llmConfig || {};
    const searchStrategy = config?.searchStrategy || {
      strategyConfig: {
        apiKey: process.env.SERPER_API_KEY || LEGACY_DEFAULT_SERPER_API_KEY,
      },
      strategyName: 'serper',
    };

    const LLM = LLMStrategy[llmStrategyName];
    if (typeof LLM !== 'function') {
      throw new TypeError(`Unknown LLM strategy: ${llmStrategyName}`);
    }

    this.searchTool = async (query) => customSearch(query, searchStrategy);
    this.findDomain = async (query) => findDomain(query, getSearch(searchStrategy));
    this.llm = new LLM(llmConfig);
  }

  /**
   * Run the crawl for `query`.
   *
   * @param {string} query - Company name.
   * @returns {Promise<{domain: string, pageContent: string}>} The domain read
   *   from the first line of the crawl output, and the full crawl output.
   */
  async search(query) {
    const searchResult = await this.searchTool(query);
    const domain = searchResult.split('\n')[0].split('Domain: ')[1];
    return { domain, pageContent: searchResult };
  }

  /**
   * Ask the LLM to extract board members from `pageContent`.
   *
   * @param {string} pageContent - Text substituted for `context` in the template.
   * @param {string} templateContent - Prompt template text.
   * @returns {Promise<*>} Whatever `this.llm.generate` resolves to.
   */
  async findBods(pageContent, templateContent) {
    const template = new PromptTemplate({ template: templateContent });
    const prompt = template.format({ context: pageContent });
    return this.llm.generate(prompt);
  }

  /**
   * Entry point.
   *
   * @param {string} query - Company name.
   * @param {boolean} [findDomainOnly] - When truthy, only resolve the domain.
   * @returns {Promise<{output: (string|undefined)}|{domain: string, bods: *}>}
   *   `{ output }` in domain-only mode, otherwise `{ domain, bods }`.
   */
  async run(query, findDomainOnly) {
    if (findDomainOnly) {
      const domain = await this.findDomain(query);
      return { output: domain };
    }

    const { domain, pageContent } = await this.search(query);
    const bods = await this.findBods(
      pageContent,
      `{{context}}\nExtract board members or representative information in the text above. Desire output format is JSON like: {<name>: <position>}. If no board members information is provied, response empty JSON. Provide JSON only.`,
    );
    return { domain, bods };
  }
}

module.exports = { AgentExtractLeads, customSearch };

// NOTE(review): the commented-out scratch code that used to follow embedded
// what look like live credentials (API keys and a JWT). It has been removed;
// treat those credentials as leaked and rotate them.