@kaufman-bot/html-scraper-server
Version:
Html scraper commands and tools
182 lines • 9.2 kB
JavaScript
"use strict";
var ScraperService_1;
Object.defineProperty(exports, "__esModule", { value: true });
exports.ScraperService = void 0;
const tslib_1 = require("tslib");
const core_server_1 = require("@kaufman-bot/core-server");
const common_1 = require("@nestjs/common");
const axios_1 = tslib_1.__importDefault(require("axios"));
const charset_1 = tslib_1.__importDefault(require("charset"));
const cheerio_1 = tslib_1.__importDefault(require("cheerio"));
const class_validator_multi_lang_1 = require("class-validator-multi-lang");
const encoding_1 = tslib_1.__importDefault(require("encoding"));
const html_to_text_1 = tslib_1.__importDefault(require("html-to-text"));
const jschardet_1 = tslib_1.__importDefault(require("jschardet"));
const mustache_1 = require("mustache");
const scraper_config_1 = require("../scraper-config/scraper.config");
/**
 * Bot command handler that fetches a configured web page and replies with the
 * text extracted from it.
 *
 * The target URI, CSS selectors, spy words and request settings all come from
 * the injected scraper config; the message text supplies the template values.
 */
let ScraperService = ScraperService_1 = class ScraperService {
    /**
     * @param scraperConfig Scraper settings read by this class: `name`, `title`,
     *   `uri`, `contentSelector`, `spyWords`, `removeWords`, `headers`,
     *   `timeout`, `contentCodepage`, plus the help fields `usage`,
     *   `contextUsage`, `descriptions` and `category`.
     * @param botCommandsToolsService Helper used for command/spy-word matching,
     *   help generation and random item selection.
     */
    constructor(scraperConfig, botCommandsToolsService) {
        this.scraperConfig = scraperConfig;
        this.botCommandsToolsService = botCommandsToolsService;
        this.handlerId = ScraperService_1.name;
        this.logger = new common_1.Logger(ScraperService_1.name);
    }
    /**
     * Handles follow-up commands inside an existing conversation context.
     *
     * When the text matches the localized "more" or "next" command, `msg.text`
     * is rewritten in place to `<get command> <scraper name>` and the message is
     * returned so it can be processed again; otherwise resolves to `null`.
     *
     * @param msg Incoming message (`text` and `locale` are read, `text` may be overwritten).
     * @returns Promise of `{ type: 'message', message }` or `null`.
     */
    onContextBotCommands(msg) {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            const isFollowUp = this.botCommandsToolsService.checkCommands(msg.text, [
                (0, class_validator_multi_lang_1.getText)('more'),
                (0, class_validator_multi_lang_1.getText)('next'),
            ], msg.locale);
            if (!isFollowUp) {
                return null;
            }
            msg.text = `${core_server_1.BotCommandsEnum.get} ${this.scraperConfig.name}`;
            return {
                type: 'message',
                message: msg,
            };
        });
    }
    /**
     * Produces the help reply by delegating to `onMessage` with a copy of the
     * message whose text is `<scraper name> <help command>`.
     *
     * @param msg Incoming message (not mutated; a shallow copy is forwarded).
     * @param ctx Passed through to `onMessage` unchanged.
     * @param loggerContext Optional logger context passed through to `onMessage`.
     * @returns Promise of whatever `onMessage` resolves to.
     */
    onHelp(msg, ctx, loggerContext) {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            const helpMessage = Object.assign(Object.assign({}, msg), {
                text: `${this.scraperConfig.name} ${core_server_1.BotCommandsEnum.help}`,
            });
            return yield this.onMessage(helpMessage, ctx, loggerContext);
        });
    }
    /**
     * Main message handler.
     *
     * Does nothing (resolves to `null`) unless one of the configured spy words
     * is found in the message. For a help command it returns a markdown help
     * message; otherwise it strips the spy word, the help command and the
     * configured remove-words from the text and scrapes the configured page.
     *
     * @param msg Incoming message (`text` and `locale` are read).
     * @param ctx Unused by this handler.
     * @param loggerContext Optional logger context; defaults to the class name.
     * @returns Promise of a `markdown` or `text` reply, or `null` when the
     *   message is not for this handler or scraping produced no content.
     *   The promise rejects with `Error('locale not set')` when a spy word
     *   matched but `msg.locale` is empty.
     */
    onMessage(msg, ctx, loggerContext) {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            const spyWord = this.botCommandsToolsService.checkSpyWords({
                msg,
                spyWords: this.scraperConfig.spyWords,
            });
            if (!spyWord) {
                return null;
            }
            if (!msg.locale) {
                throw new Error(`locale not set`);
            }
            if (this.botCommandsToolsService.checkCommands(msg.text, [core_server_1.BotCommandsEnum.help], msg.locale)) {
                return {
                    type: 'markdown',
                    message: msg,
                    markdown: this.botCommandsToolsService.generateHelpMessage(msg, {
                        locale: msg.locale,
                        name: this.scraperConfig.title,
                        contextUsage: this.scraperConfig.contextUsage,
                        descriptions: this.scraperConfig.descriptions,
                        usage: this.scraperConfig.usage,
                        category: this.scraperConfig.category,
                    }),
                };
            }
            const preparedText = this.botCommandsToolsService.clearCommands(msg.text, [
                spyWord,
                core_server_1.BotCommandsEnum.help,
                ...(this.scraperConfig.removeWords || []),
            ], msg.locale);
            const replayMessage = yield this.scrap(msg.locale, preparedText, loggerContext);
            if (replayMessage) {
                return {
                    type: 'text',
                    message: msg,
                    text: replayMessage,
                };
            }
            this.logger.warn(`Unhandled commands for text: "${msg.text}"`, loggerContext || ScraperService_1.name);
            this.logger.debug(msg, loggerContext || ScraperService_1.name);
            return null;
        });
    }
    /**
     * Builds the view object used to render the URI template.
     *
     * Keys produced:
     *  - `text`   — the trimmed text, URI-component encoded;
     *  - `locale` — the locale as given;
     *  - `text N`, `textN`, `TEXTN` (N starting at 1) — the N-th word of the
     *    text after the configured remove-words are stripped, in original,
     *    lower and upper case respectively.
     *
     * @param locale Locale exposed to the template.
     * @param text User text (already cleared of commands by the caller).
     * @returns Plain object with the template variables.
     * @private
     */
    buildReplaceVariables(locale, text) {
        const replaceVariables = {
            // encodeURIComponent leaves "'" untouched, but Mustache's default
            // HTML escaping would turn it into "&#39;" and corrupt the URL,
            // so it is percent-encoded up front.
            text: encodeURIComponent(text.trim()).replace(/'/g, '%27'),
            locale,
        };
        // Collapse any run of spaces (not just pairs) so removed words never
        // leave gaps that later turn into empty positional placeholders.
        const collapseSpaces = (value) => value.replace(/ {2,}/g, ' ').trim();
        let remainingText = collapseSpaces(text);
        // NOTE(review): removeWords are used as regular-expression sources, as
        // in the original code — confirm they never contain unescaped metacharacters.
        (this.scraperConfig.removeWords || []).forEach((removeWord) => {
            remainingText = collapseSpaces(remainingText.replace(new RegExp(removeWord, 'ig'), ''));
        });
        remainingText
            .split(' ')
            .filter((word) => word !== '')
            .forEach((word, index) => {
            const position = index + 1;
            replaceVariables[`text ${position}`] = word;
            replaceVariables[`text${position}`] = word.toLowerCase();
            replaceVariables[`TEXT${position}`] = word.toUpperCase();
        });
        return replaceVariables;
    }
    /**
     * Renders the configured URI with the message text, downloads the page and
     * returns the text found under the configured selectors.
     *
     * Each comma-separated selector in `contentSelector` is converted to text
     * separately and the parts are joined with a blank line. If the detected
     * charset is not `utf8`, the content is re-decoded from its binary form.
     *
     * @param locale Locale made available to the URI template.
     * @param text User text used to fill the URI template.
     * @param loggerContext Optional logger context; defaults to the class name.
     * @returns Promise of the extracted text (possibly empty), or of the
     *   stringified error when the request or parsing fails.
     */
    scrap(locale, text, loggerContext) {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            const replaceVariables = this.buildReplaceVariables(locale, text);
            const replacedUri = (0, mustache_1.render)(this.scraperConfig.uri, replaceVariables);
            // Browser-like header sets; one is picked at random per request.
            const headers = this.scraperConfig.headers || [
                {
                    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Sec-Ch-Ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
                    'Sec-Ch-Ua-Mobile': '?0',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
                },
                {
                    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
                },
            ];
            const axiosInstance = axios_1.default.create({
                timeout: this.scraperConfig.timeout || 3000,
                responseEncoding: this.scraperConfig.contentCodepage || 'binary',
            });
            try {
                const response = yield axiosInstance.get(replacedUri, {
                    headers: this.botCommandsToolsService.getRandomItem(headers),
                });
                const $ = cheerio_1.default.load(String(response.data));
                let content = this.scraperConfig.contentSelector
                    .split(',')
                    // html() yields null when the selector matches nothing;
                    // pass an empty string instead of null to the converter.
                    .map((selector) => html_to_text_1.default.fromString($(selector).html() || ''))
                    .join('\n\n');
                let enc = (0, charset_1.default)(response.headers, response.data);
                if (!enc) {
                    // The detector may report no encoding at all; in that case
                    // keep the content as is instead of failing on a null value.
                    const detected = jschardet_1.default.detect(response.data);
                    enc = detected && detected.encoding
                        ? detected.encoding.toLowerCase()
                        : 'utf8';
                }
                if (enc !== 'utf8') {
                    content = encoding_1.default
                        .convert(Buffer.from(content, 'binary'), 'utf8', enc, true)
                        .toString('utf8');
                }
                if (!content) {
                    this.logger.debug(JSON.stringify({
                        scraperConfig: this.scraperConfig,
                        replacedUri,
                        data: response.data,
                        enc,
                        selectors: this.scraperConfig.contentSelector,
                    }));
                }
                return content;
            }
            catch (err) {
                this.logger.error(err, err.stack, loggerContext || ScraperService_1.name);
                return err.toString();
            }
        });
    }
};
// Compiler-emitted decorator wiring for the class defined above:
//  - Injectable() is applied to the class itself;
//  - constructor parameter 0 is decorated with Inject(SCRAPER_CONFIG);
//  - "design:paramtypes" metadata records the constructor parameter types
//    as [Object, BotCommandsToolsService].
// The decorated result is stored back into both the public binding and the
// internal alias the class body refers to.
ScraperService = ScraperService_1 = tslib_1.__decorate([
(0, common_1.Injectable)(),
tslib_1.__param(0, (0, common_1.Inject)(scraper_config_1.SCRAPER_CONFIG)),
tslib_1.__metadata("design:paramtypes", [Object, core_server_1.BotCommandsToolsService])
], ScraperService);
// Publish the decorated class as the module's named export.
exports.ScraperService = ScraperService;
//# sourceMappingURL=scraper.service.js.map