UNPKG

pons.js

Version:

node.js Pons Wörterbuch / Dictionary Crawler

363 lines (314 loc) 11.1 kB
'use strict';

const http = require('http');
const events = require('events');
const cheerio = require('cheerio');
const isEmpty = require('lodash/isEmpty');

const eURIc = encodeURIComponent;

/**
 * Small stateless helpers used by the crawler.
 */
class Utils {
  /**
   * Extracts a name from the source text of `obj.constructor` by matching
   * `function <name>(`.
   *
   * @param {*} obj - Any value that has a `constructor`.
   * @returns {string} The captured name, or '' when the pattern does not match.
   */
  static getConstructorName(obj) {
    const funcNameRegex = /function (.{1,})\(/;
    const results = funcNameRegex.exec(obj.constructor.toString());
    return (results && results.length > 1) ? results[1] : '';
  }

  /**
   * Percent-decodes a string and unescapes the HTML entities `&lt;`, `&gt;`,
   * `&amp;` and `&#39;`.
   *
   * @param {?string} escapedHTML - Escaped text; `null`/`undefined` yield ''.
   * @returns {string} The unescaped text.
   */
  static ueHTML(escapedHTML) {
    if (escapedHTML == null) {
      return '';
    }
    let decoded;
    try {
      decoded = decodeURIComponent(escapedHTML);
    } catch (e) {
      // A stray '%' (e.g. "50 %") is not valid percent-encoding; keep the
      // text as it is instead of failing the whole parse.
      decoded = String(escapedHTML);
    }
    // '&amp;' is handled last so that '&amp;lt;' becomes '&lt;' and not '<'.
    return decoded
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&amp;/g, '&')
      .replace(/&#39;/g, "'");
  }
}

/**
 * Crawler for the Pons online dictionary.
 *
 * Events emitted:
 *  - 'ready'                    the list of translation directions was loaded
 *  - 'translating' (word)       a translation request is about to be sent
 *  - 'translated' (word, json)  a translation was parsed (and filtered)
 *  - 'error' (err)              the list of directions could not be loaded
 */
class PonsDepth extends events.EventEmitter {
  /**
   * @param {Object} [opts] - Every own property is copied onto the instance.
   *   The instance later reads `exact`, `contain` and `noExample` to decide
   *   how to filter results.
   */
  constructor(opts) {
    super();
    opts && Object.getOwnPropertyNames(opts).forEach(val => this[val] = opts[val]);
    this.ready = false;
    this.possibleDirections = [];

    // get the list of possible translation directions
    http.get('http://api.pons.com/v1/dictionaries?language=en', res => {
      let body = '';
      // Decode as UTF-8 at the stream level so multi-byte characters that
      // are split across chunks are not corrupted.
      res.setEncoding('utf8');
      res.on('data', chunk => {
        body += chunk;
      });
      res.on('end', () => {
        let directions;
        try {
          directions = JSON.parse(body)
            .map(val => val.key)
            .filter(val => val !== 'dede' && val !== 'dedx');
        } catch (e) {
          return this.emit('error', e);
        }
        this.possibleDirections = directions;
        this.ready = true;
        this.emit('ready');
      });
    }).on('error', err => this.emit('error', err));
  }

  /**
   * Translates `word` between two languages.
   *
   * @param {string} word
   * @param {string} _from - Source language code.
   * @param {string} _to - Target language code.
   * @param {function(?Error, Array=)} callback
   */
  translate(word, _from, _to, callback) {
    this._translate(word, _from, _to, callback);
  }

  /**
   * Translates `word` into `_to` from every language that shares a direction
   * key with `_to`, and concatenates all answers.
   *
   * @param {string} word
   * @param {string} _to - Target language code.
   * @param {function(?Error, Array=)} callback - Called exactly once: with the
   *   first error, or with the concatenated results ([] when no direction
   *   contains `_to`).
   */
  unitranslate(word, _to, callback) {
    this.waitress(() => {
      const positives = this.possibleDirections
        .map(value => (value.indexOf(_to) >= 0) && value.replace(_to, ''))
        .filter(value => value);

      if (positives.length === 0) {
        return callback(null, []);
      }

      let counter = 0;
      let sum = [];
      let failed = false;
      positives.forEach(value => {
        this._translate(word, value, _to, (err, answer) => {
          if (failed) {
            return; // an error was already reported
          }
          if (err) {
            failed = true;
            return callback(err);
          }
          sum = sum.concat(answer);
          if (++counter === positives.length) {
            // if completed then return sum
            callback(null, sum);
          }
        });
      });
    });
  }

  /** Not implemented. */
  nullitranslate(/* word, callback */) {
  }

  /**
   * Requests the translation page for `word`, parses it and post-processes
   * the result.
   *
   * @param {string} word
   * @param {string} _from
   * @param {string} _to
   * @param {function(?Error, Array=)} callback - Called at most once.
   */
  _translate(word, _from, _to, callback) {
    // TODO: rewrite with error handling in mind, using promises
    this.waitress(() => {
      let options;
      try {
        options = {
          hostname: 'en.pons.com',
          path: `/translate?q=${eURIc(word)}&l=${this._getBiLangKey(_from, _to)}&in=&lf=${_from}&cid=`,
          headers: {
            'pragma': 'no-cache',
            'dnt': '1',
            'accept-language': 'en-US;q=0.6,en;q=0.4',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'cache-control': 'no-cache'
          }
        };
      } catch (e) {
        return callback(e);
      }

      // Several event sources (request error, response error, end) can fire;
      // make sure the caller hears back only once.
      let settled = false;
      const done = (err, result) => {
        if (settled) {
          return;
        }
        settled = true;
        callback(err, result);
      };

      this.emit('translating', word);
      http.get(options, res => {
        let body = '';
        // See constructor: avoid corrupting split multi-byte characters.
        res.setEncoding('utf8');
        res.on('error', done);
        res.on('data', chunk => {
          body += chunk;
        });
        res.on('end', () => {
          this._parseBody(body, (err, trResObj) => {
            if (err) {
              return done(err);
            }
            this._afterTranslationProcess(word, trResObj, done);
          });
        });
      }).on('error', done);
    });
  }

  /**
   * Applies the configured filters, emits 'translated' and hands the result
   * to `callback`.
   *
   * @param {string} word
   * @param {Array} json - Parsed result as produced by `_parseBody`.
   * @param {function(?Error, Array=)} callback
   */
  _afterTranslationProcess(word, json, callback) {
    if (this.exact || this.contain || this.noExample) {
      try {
        json = this._filterTranslations(word, json);
      } catch (e) {
        return callback(e);
      }
    }
    this.emit('translated', word, json);
    callback(null, json);
  }

  /**
   * Removes, in place, the pairs rejected by the active filters and then
   * every container that became empty because of it.
   *
   *  - `this.exact`:     keep pairs whose query equals `word` (case-insensitive)
   *  - `this.contain`:   keep pairs whose query contains `word` (case-insensitive)
   *  - `this.noExample`: drop pairs flagged as `example`
   *
   * @param {string} word
   * @param {Array} json - Four nested levels linked through `.words`.
   * @returns {Array} The same `json` array.
   */
  _filterTranslations(word, json) {
    // All loops run backwards so that splice() does not shift the indices
    // that are still to be visited.
    for (let langIndex = json.length - 1; langIndex >= 0; langIndex--) {
      const roms = json[langIndex].words;
      for (let romIndex = roms.length - 1; romIndex >= 0; romIndex--) {
        const groups = roms[romIndex].words;
        for (let groupIndex = groups.length - 1; groupIndex >= 0; groupIndex--) {
          const pairs = groups[groupIndex].words;
          for (let pairIndex = pairs.length - 1; pairIndex >= 0; pairIndex--) {
            const pair = pairs[pairIndex];
            const rejected =
              (this.exact && pair.query.toLowerCase() !== word.toLowerCase()) ||
              (this.contain && pair.query.toLowerCase().indexOf(word.toLowerCase()) === -1) ||
              (this.noExample && pair.example);
            if (rejected) {
              pairs.splice(pairIndex, 1);
            }
          }
          if (pairs.length === 0) {
            groups.splice(groupIndex, 1);
          }
        }
        if (groups.length === 0) {
          roms.splice(romIndex, 1);
        }
      }
      if (roms.length === 0) {
        json.splice(langIndex, 1);
      }
    }
    return json;
  }

  /**
   * Runs `action` immediately when the instance is ready, otherwise once the
   * 'ready' event fires.
   *
   * @param {function()} action
   */
  waitress(action) {
    if (this.ready) {
      action();
    } else {
      this.once('ready', action);
    }
  }

  /**
   * Parses a translation result page into nested plain objects:
   * language sections (`from`, `to`, `words`) -> `.rom` blocks (`origin`,
   * header properties, `words`) -> `.translations` groups (heading
   * properties, `words`) -> pairs (`query`, `result`, properties).
   *
   * @param {string} body - HTML of the result page.
   * @param {function(?Error, Array=)} callback - Receives the parse error or
   *   the parsed array.
   */
  _parseBody(body, callback) {
    let $;

    // First class name of an element; throws when it has no class attribute.
    const firstClassOf = $el => Utils.ueHTML($el.attr('class').split(' ')[0]);

    // Stores the title of an <acronym> under its parent's first class name.
    // A second value for the same key turns the entry into an array.
    const addAcronymTitle = (target, $acronym) => {
      const key = firstClassOf($acronym.parent());
      const title = Utils.ueHTML($acronym.attr('title'));
      if (target[key]) {
        if (typeof target[key] === 'string') {
          target[key] = [target[key]];
        }
        target[key].push(title);
      } else {
        target[key] = title;
      }
    };

    // Reads the properties found under `selectDad` into `objToSave`, attaches
    // the children as `words` and appends the object to `objToPush`.
    const propProcedure = ($obj, selectDad, objToSave, objToTakeFrom, objToPush) => {
      $obj.find(selectDad + ' span:not(:has(*))').each((index, element) => { // flexion, sense etc.
        const $thus = $(element);
        const text = Utils.ueHTML($thus.html());
        const key = firstClassOf($thus);
        const first = text.charAt(0);
        // Values wrapped in <>, () or [] are stored without the wrapper.
        const isWrapped = first === '<' || first === '(' || first === '[';
        objToSave[key] = isWrapped ? text.slice(1, -1) : text;
      });
      $obj.find(selectDad + ' acronym').each((index, element) => { // wordclass, genus, etc.
        addAcronymTitle(objToSave, $(element));
      });
      objToSave.words = objToTakeFrom;
      objToPush.push(objToSave);
    };

    // Fills `pair[field]` with the joined link texts of one side of a pair
    // and `pair[field + '_properties']` with its annotations (if any).
    const pairFieldProcedure = (pair, $obj, field, jqStrHead) => {
      if ($obj.find('.example').html()) {
        pair.example = true;
      }

      const wanted = [];
      $obj.find(jqStrHead + 'a:not(.info a):not(acronym:only-child a)').each((index, element) => {
        wanted.push(Utils.ueHTML($(element).html()));
      });
      pair[field] = wanted.join(' ');
      if (pair[field] === '' && jqStrHead) {
        // Nothing matched under the restricted selector; retry without it.
        return pairFieldProcedure(pair, $obj, field, '');
      }

      const propObj = pair[field + '_properties'] = {};

      // Pons' in-translation comments: raw html of every '.sense a'.
      const senses = [];
      $obj.find('.sense a').each((index, element) => {
        senses.push($(element).html());
      });
      if (senses.length !== 0) {
        propObj.sense = senses.join(' ');
      }

      $obj.find('acronym:only-child').each((index, element) => {
        addAcronymTitle(propObj, $(element));
      });

      if (isEmpty(propObj)) {
        delete pair[field + '_properties'];
      }
    };

    // One object per '.kne' element: a query/result pair.
    const parsePairs = $translations => {
      const transList = [];
      $translations.find('.kne').each((index, element) => {
        const $kne = $(element);
        const pair = {};
        transList.push(pair);
        pairFieldProcedure(pair, $kne.find('.source'), 'query', ':first-child ');
        pairFieldProcedure(pair, $kne.find('.target'), 'result', '');
        if (!pair['result'.concat('_properties')]) {
          // Without result properties the query properties are flattened
          // onto the pair itself.
          for (let key in pair['query'.concat('_properties')]) {
            pair[key] = pair['query'.concat('_properties')][key];
          }
          delete pair['query'.concat('_properties')];
        }
      });
      return transList;
    };

    // One object per '.translations' element, described by its 'h3'.
    const parseTranslations = $rom => {
      const transTypes = [];
      $rom.find('.translations').each((index, element) => {
        const $translations = $(element);
        propProcedure($translations, 'h3', {}, parsePairs($translations), transTypes);
      });
      return transTypes;
    };

    // Appends one object per '.rom' element of the entry to `romlar`.
    const parseRoms = ($girdi, romlar) => {
      $girdi.find('.rom').each((index, element) => {
        const $rom = $(element);
        const transTypes = parseTranslations($rom);
        const romObj = {};
        romObj.origin = Utils.ueHTML($girdi.attr('rel'));
        propProcedure($rom, '.romhead', romObj, transTypes, romlar);
      });
    };

    // Flat list of the rom objects of every '.entry' inside this language
    // section (only this section's entries are visited).
    const parseGirdis = $lang => {
      const romlar = [];
      $lang.find('.entry').each((index, element) => {
        parseRoms($(element), romlar);
      });
      return romlar;
    };

    // One object per '.result_list > .lang' section.
    const parseLangs = () => {
      const resultier = [];
      $('.result_list > .lang').each((index, element) => {
        const $lang = $(element);
        const romlar = parseGirdis($lang);
        const $flags = $lang.find('.lang_dir > .flag');
        resultier.push({
          // the language code is the last two characters of the flag's class
          from: $($flags[0]).attr('class').slice(-2),
          to: $($flags[1]).attr('class').slice(-2),
          words: romlar
        });
      });
      return resultier;
    };

    let parsed;
    try {
      $ = cheerio.load(body, { decodeEntities: false });
      $('.roman, .separator').remove();
      parsed = parseLangs();
    } catch (e) {
      return callback(e);
    }
    // Called outside the try block so that an exception thrown by the
    // callback itself is not reported back to it as a parse error.
    return callback(null, parsed);
  }

  /**
   * Builds the direction key for a language pair: `erste + zweite` when that
   * key is a known direction, otherwise `zweite + erste`.
   *
   * @param {string} erste
   * @param {string} zweite
   * @returns {string}
   */
  _getBiLangKey(erste, zweite) {
    return this.possibleDirections.indexOf(erste.concat(zweite)) >= 0
      ? erste.concat(zweite)
      : zweite.concat(erste);
  }
}

module.exports = PonsDepth;