UNPKG

@kaufman-bot/html-scraper-server

Version:
182 lines 9.2 kB
"use strict"; var ScraperService_1; Object.defineProperty(exports, "__esModule", { value: true }); exports.ScraperService = void 0; const tslib_1 = require("tslib"); const core_server_1 = require("@kaufman-bot/core-server"); const common_1 = require("@nestjs/common"); const axios_1 = tslib_1.__importDefault(require("axios")); const charset_1 = tslib_1.__importDefault(require("charset")); const cheerio_1 = tslib_1.__importDefault(require("cheerio")); const class_validator_multi_lang_1 = require("class-validator-multi-lang"); const encoding_1 = tslib_1.__importDefault(require("encoding")); const html_to_text_1 = tslib_1.__importDefault(require("html-to-text")); const jschardet_1 = tslib_1.__importDefault(require("jschardet")); const mustache_1 = require("mustache"); const scraper_config_1 = require("../scraper-config/scraper.config"); let ScraperService = ScraperService_1 = class ScraperService { constructor(scraperConfig, botCommandsToolsService) { this.scraperConfig = scraperConfig; this.botCommandsToolsService = botCommandsToolsService; this.handlerId = ScraperService_1.name; this.logger = new common_1.Logger(ScraperService_1.name); } onContextBotCommands(msg) { return tslib_1.__awaiter(this, void 0, void 0, function* () { if (this.botCommandsToolsService.checkCommands(msg.text, [(0, class_validator_multi_lang_1.getText)('more'), (0, class_validator_multi_lang_1.getText)('next')], msg.locale)) { msg.text = `${core_server_1.BotCommandsEnum.get} ${this.scraperConfig.name}`; return { type: 'message', message: msg, }; } return null; }); } onHelp(msg, ctx, loggerContext) { return tslib_1.__awaiter(this, void 0, void 0, function* () { return yield this.onMessage(Object.assign(Object.assign({}, msg), { text: `${this.scraperConfig.name} ${core_server_1.BotCommandsEnum.help}` }), ctx, loggerContext); }); } onMessage(msg, ctx, loggerContext) { return tslib_1.__awaiter(this, void 0, void 0, function* () { const spyWord = this.botCommandsToolsService.checkSpyWords({ msg, spyWords: this.scraperConfig.spyWords, }); if (spyWord) { if (!msg.locale) { throw new Error(`locale not set`); } if (this.botCommandsToolsService.checkCommands(msg.text, [core_server_1.BotCommandsEnum.help], msg.locale)) { return { type: 'markdown', message: msg, markdown: this.botCommandsToolsService.generateHelpMessage(msg, { locale: msg.locale, name: this.scraperConfig.title, contextUsage: this.scraperConfig.contextUsage, descriptions: this.scraperConfig.descriptions, usage: this.scraperConfig.usage, category: this.scraperConfig.category, }), }; } const preparedText = this.botCommandsToolsService.clearCommands(msg.text, [ spyWord, core_server_1.BotCommandsEnum.help, ...(this.scraperConfig.removeWords || []), ], msg.locale); const replayMessage = yield this.scrap(msg.locale, preparedText, loggerContext); if (replayMessage) { return { type: 'text', message: msg, text: replayMessage, }; } this.logger.warn(`Unhandled commands for text: "${msg.text}"`, loggerContext || ScraperService_1.name); this.logger.debug(msg, loggerContext || ScraperService_1.name); } return null; }); } scrap(locale, text, loggerContext) { return tslib_1.__awaiter(this, void 0, void 0, function* () { /*const parsedVariables = parse(this.scraperConfig.uri) .filter((arr) => arr[0] === 'name') .map((arr) => arr[1]); const otherText = text;*/ const replaceVariables = { text: encodeURIComponent(text.trim()), locale }; (this.scraperConfig.removeWords || []).forEach((removeWord) => { text = text .replace(new RegExp(removeWord, 'ig'), '') .replace(new RegExp(' {2}', 'ig'), ' ') .trim(); }); const textArray = text.split(' '); if (textArray.length > 0) { textArray.forEach((textArrayItem, textArrayIndex) => { replaceVariables[`text ${textArrayIndex + 1}`] = textArrayItem; }); textArray.forEach((textArrayItem, textArrayIndex) => { replaceVariables[`text${textArrayIndex + 1}`] = textArrayItem.toLowerCase(); }); textArray.forEach((textArrayItem, textArrayIndex) => { replaceVariables[`TEXT${textArrayIndex + 1}`] = textArrayItem.toUpperCase(); }); } const replacedUri = (0, mustache_1.render)(this.scraperConfig.uri, replaceVariables); const headers = this.scraperConfig.headers || [ { Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.9', 'Sec-Ch-Ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', 'Sec-Ch-Ua-Mobile': '?0', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Sec-Fetch-User': '?1', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36', }, { Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.5', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Sec-Fetch-User': '?1', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0', }, ]; const axiosInstance = axios_1.default.create({ timeout: this.scraperConfig.timeout || 3000, responseEncoding: this.scraperConfig.contentCodepage || 'binary', }); try { const response = yield axiosInstance.get(replacedUri, { headers: this.botCommandsToolsService.getRandomItem(headers), }); const $ = cheerio_1.default.load(String(response.data)); let content = this.scraperConfig.contentSelector .split(',') .map((selector) => html_to_text_1.default.fromString($(selector).html())) .join('\n\n'); const enc = (0, charset_1.default)(response.headers, response.data) || jschardet_1.default.detect(response.data).encoding.toLowerCase(); if (enc !== 'utf8') { content = encoding_1.default .convert(Buffer.from(content, 'binary'), 'utf8', enc, true) .toString('utf8'); } if (!content) { this.logger.debug(JSON.stringify({ scraperConfig: this.scraperConfig, replacedUri, data: response.data, enc, selectors: this.scraperConfig.contentSelector, })); } return content; } catch (err) { this.logger.error(err, err.stack, loggerContext || ScraperService_1.name); return err.toString(); } }); } }; ScraperService = ScraperService_1 = tslib_1.__decorate([ (0, common_1.Injectable)(), tslib_1.__param(0, (0, common_1.Inject)(scraper_config_1.SCRAPER_CONFIG)), tslib_1.__metadata("design:paramtypes", [Object, core_server_1.BotCommandsToolsService]) ], ScraperService); exports.ScraperService = ScraperService; //# sourceMappingURL=scraper.service.js.map