pons.js
Version:
node.js Pons Wörterbuch / Dictionary Crawler
363 lines (314 loc) • 11.1 kB
JavaScript
'use strict';
const http = require('http');
const events = require('events');
const cheerio = require('cheerio');
const isEmpty = require('lodash/isEmpty');
// Short alias for encodeURIComponent; used to escape the search word in the request path.
const eURIc = encodeURIComponent;
/**
 * Stateless helper functions used by the crawler.
 */
class Utils {
  /**
   * Returns the name of the constructor that created `obj`.
   *
   * Reads `constructor.name` instead of regex-parsing `constructor.toString()`,
   * so ES6 classes are supported and one-line function sources are not
   * over-matched.
   *
   * @param {*} obj - Any value.
   * @returns {string} The constructor name, or '' when it cannot be determined
   *   (null/undefined input, prototype-less object, anonymous constructor).
   */
  static getConstructorName(obj) {
    const ctor = (obj === null || obj === undefined) ? undefined : obj.constructor;
    return (typeof ctor === 'function' && ctor.name) || '';
  }

  /**
   * Un-escapes a piece of scraped HTML text: percent-decodes it and then turns
   * the basic HTML entities back into their literal characters.
   *
   * @param {?string} escapedHTML - Escaped text; null/undefined are treated as ''.
   * @returns {string} The decoded text.
   */
  static ueHTML(escapedHTML) {
    if (escapedHTML === null || escapedHTML === undefined) {
      return '';
    }
    let text = String(escapedHTML);
    try {
      text = decodeURIComponent(text);
    } catch (e) {
      // URIError: the text contains a '%' that is not part of a valid escape
      // sequence (e.g. "100% sure"). Deliberately keep the raw text.
    }
    const entities = {
      'lt': '<',
      'gt': '>',
      'amp': '&',
      'quot': '"',
      'apos': "'",
      '#39': "'"
    };
    // Single pass, so "&amp;lt;" becomes "&lt;" and is not decoded twice.
    return text.replace(/&(lt|gt|amp|quot|apos|#39);/g, (match, name) => entities[name]);
  }
}
/**
 * Crawler for the PONS online dictionary.
 *
 * On construction the list of available translation directions is fetched from
 * the PONS API; every translate call is deferred until that list has arrived.
 *
 * Events:
 *  - 'ready'                      the direction list has been loaded
 *  - 'error'       (err)          the direction list could not be loaded or parsed
 *  - 'translating' (word)         a translation request is about to be sent
 *  - 'translated'  (word, result) a translation has been parsed (and filtered)
 *
 * Options (copied verbatim onto the instance):
 *  - exact      keep only pairs whose query equals the word (case-insensitive)
 *  - contain    keep only pairs whose query contains the word (case-insensitive)
 *  - noExample  drop pairs that are flagged as examples
 */
class PonsDepth extends events.EventEmitter {
  /**
   * @param {Object} [opts] - Own properties are copied onto the instance.
   */
  constructor(opts) {
    super();
    opts && Object.getOwnPropertyNames(opts).forEach(val => this[val] = opts[val]);
    this.ready = false;
    this.possibleDirections = [];
    // As with any EventEmitter, an 'error' without a listener is thrown.
    const fail = err => this.emit('error', err);
    // get the list of possible translation directions
    http.get('http://api.pons.com/v1/dictionaries?language=en', res => {
      let body = '';
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8');
      res.on('data', chunk => {
        body += chunk;
      });
      res.on('error', fail);
      res.on('end', () => {
        let directions;
        try {
          directions = JSON.parse(body).map(val => val.key).filter(val => val !== 'dede' && val !== 'dedx');
        } catch (e) {
          return fail(e);
        }
        this.possibleDirections = directions;
        this.ready = true;
        this.emit('ready');
      });
    }).on('error', fail);
  }

  /**
   * Translates `word` between two languages.
   *
   * @param {string} word - The word to look up.
   * @param {string} _from - Source language code.
   * @param {string} _to - Target language code.
   * @param {function(?Error, Array=)} callback - Receives the parsed result.
   */
  translate(word, _from, _to, callback) {
    this._translate(word, _from, _to, callback);
  }

  /**
   * Translates `word` into `_to` from every language that has a dictionary
   * paired with `_to`, and concatenates the results.
   *
   * The callback is invoked exactly once: with the first error, or with the
   * combined results (an empty array when no direction involves `_to`).
   *
   * @param {string} word - The word to look up.
   * @param {string} _to - Target language code.
   * @param {function(?Error, Array=)} callback
   */
  unitranslate(word, _to, callback) {
    this.waitress(() => {
      // A direction key is two language codes glued together; only accept
      // `_to` as a whole prefix or suffix so that e.g. 'deno' is not taken
      // for a direction involving 'en'.
      const partners = this.possibleDirections.map(key => {
        if (key.startsWith(_to)) {
          return key.slice(_to.length);
        }
        if (key.endsWith(_to)) {
          return key.slice(0, key.length - _to.length);
        }
        return '';
      }).filter(value => value);
      const positives = Array.from(new Set(partners));
      if (positives.length === 0) {
        return callback(null, []);
      }
      let pending = positives.length;
      let failed = false;
      let sum = [];
      positives.forEach(value => {
        this._translate(word, value, _to, (err, answer) => {
          if (failed) {
            return; // an error has already been reported
          }
          if (err) {
            failed = true;
            return callback(err);
          }
          sum = sum.concat(answer);
          if (--pending === 0) { // if completed then return sum
            callback(null, sum);
          }
        });
      });
    });
  }

  /**
   * Not implemented yet (placeholder).
   */
  nullitranslate(/* word, callback */) {
  }

  /**
   * Requests the translation page for `word` and hands the parsed, filtered
   * result to `callback`. Waits for the direction list if necessary.
   *
   * @param {string} word
   * @param {string} _from - Source language code.
   * @param {string} _to - Target language code.
   * @param {function(?Error, Array=)} callback - Invoked at most once.
   */
  _translate(word, _from, _to, callback) {
    this.waitress(() => {
      let options;
      try {
        options = {
          hostname: 'en.pons.com',
          path: `/translate?q=${eURIc(word)}&l=${eURIc(this._getBiLangKey(_from, _to))}&in=&lf=${eURIc(_from)}&cid=`,
          headers: {
            'pragma': 'no-cache',
            'dnt': '1',
            'accept-language': 'en-US;q=0.6,en;q=0.4',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'cache-control': 'no-cache'
          }
        };
      } catch (e) {
        return callback(e);
      }
      // Request and response errors may both fire; report only the first outcome.
      let settled = false;
      const finish = (err, result) => {
        if (settled) {
          return;
        }
        settled = true;
        callback(err, result);
      };
      this.emit('translating', word);
      http.get(options, res => {
        let body = '';
        // Decode as a stream so multi-byte characters split across chunks survive.
        res.setEncoding('utf8');
        res.on('data', chunk => {
          body += chunk;
        });
        res.on('error', finish);
        res.on('end', () => {
          this._parseBody(body, (err, trResObj) => {
            if (err) {
              return finish(err);
            }
            this._afterTranslationProcess(word, trResObj, finish);
          });
        });
      }).on('error', finish);
    });
  }

  /**
   * Applies the configured filters, emits 'translated' and calls back.
   *
   * @param {string} word
   * @param {Array} json - Parsed translation tree.
   * @param {function(?Error, Array=)} callback
   */
  _afterTranslationProcess(word, json, callback) {
    if (this.exact || this.contain || this.noExample) {
      try {
        json = this._filterTranslations(word, json);
      } catch (e) {
        return callback(e);
      }
    }
    this.emit('translated', word, json);
    callback(null, json);
  }

  /**
   * Removes, in place, every translation pair rejected by the `exact`,
   * `contain` or `noExample` option, then removes every container that is
   * left without entries.
   *
   * Tree shape: json[] -> .words[] -> .words[] -> .words[] (pairs).
   *
   * @param {string} word
   * @param {Array} json - Parsed translation tree (mutated).
   * @returns {Array} The same `json` array.
   */
  _filterTranslations(word, json) {
    const needle = String(word).toLowerCase();
    const isRejected = pair => Boolean(
      (this.exact && pair.query.toLowerCase() !== needle) ||
      (this.contain && pair.query.toLowerCase().indexOf(needle) === -1) ||
      (this.noExample && pair.example)
    );
    // Drops matching items back-to-front (safe while splicing) and reports
    // whether the list ended up empty.
    const prune = (list, shouldDrop) => {
      for (let i = list.length - 1; i >= 0; i--) {
        if (shouldDrop(list[i])) {
          list.splice(i, 1);
        }
      }
      return list.length === 0;
    };
    prune(json, lang =>
      prune(lang.words, rom =>
        prune(rom.words, transType =>
          prune(transType.words, isRejected))));
    return json;
  }

  /**
   * Runs `action` now if the direction list is loaded, otherwise once the
   * 'ready' event fires.
   *
   * @param {function()} action
   */
  waitress(action) {
    if (this.ready) {
      action();
    } else {
      this.once('ready', action);
    }
  }

  /**
   * Parses a PONS result page into a tree:
   *   [{ from, to, words: [rom { origin, ...props, words: [translation group
   *     { ...props, words: [pair { query, result, ... }] }] }] }]
   *
   * @param {string} body - HTML of the result page.
   * @param {function(?Error, Array=)} callback - Called exactly once.
   */
  _parseBody(body, callback) {
    let $;

    // First class name of an element, used as the property key.
    const firstClass = $el => Utils.ueHTML($el.attr('class').split(' ')[0]);

    // Stores `value` under `key`; a second value for the same key turns the
    // property into an array.
    const addProperty = (target, key, value) => {
      if (target[key]) {
        if (typeof target[key] === 'string') {
          target[key] = [target[key]];
        }
        target[key].push(value);
      } else {
        target[key] = value;
      }
    };

    // Collects header properties found below `selectDad` into `objToSave`,
    // attaches the children and appends the object to `objToPush`.
    const propProcedure = ($obj, selectDad, objToSave, objToTakeFrom, objToPush) => {
      $obj.find(selectDad + ' span:not(:has(*))').each((indis, el) => { // flexion, sense etc.
        const $thus = $(el);
        const text = Utils.ueHTML($thus.html());
        const first = text.charAt(0);
        // Values wrapped in <>, () or [] are stored without the wrapper.
        const wrapped = first === '<' || first === '(' || first === '[';
        objToSave[firstClass($thus)] = wrapped ? text.slice(1, -1) : text;
      });
      $obj.find(selectDad + ' acronym').each((indis, el) => { // wordclass, genus, etc.
        const $thus = $(el);
        addProperty(objToSave, firstClass($thus.parent()), Utils.ueHTML($thus.attr('title')));
      });
      objToSave.words = objToTakeFrom;
      objToPush.push(objToSave);
    };

    // Fills pair[field] with the joined link texts of `$obj` and
    // pair[field + '_properties'] with its annotations (if any).
    const pairFieldProcedure = (pair, $obj, field, jqStrHead) => {
      if ($obj.find('.example').html()) {
        pair.example = true;
      }
      const wanted = [];
      $obj.find(jqStrHead + 'a:not(.info a):not(acronym:only-child a)').each((l, el) => {
        wanted.push(Utils.ueHTML($(el).html()));
      });
      pair[field] = wanted.join(' ');
      if (pair[field] === '' && jqStrHead) {
        // Nothing matched with the narrowing prefix; retry without it.
        return pairFieldProcedure(pair, $obj, field, '');
      }
      const propObj = pair[field + '_properties'] = {};
      const senses = [];
      $obj.find('.sense a').each((value, el) => {
        senses.push($(el).html());
      });
      if (senses.length !== 0) {
        propObj.sense = senses.join(' ');
      }
      $obj.find('acronym:only-child').each((indis, el) => {
        const $thus = $(el);
        addProperty(propObj, firstClass($thus.parent()), Utils.ueHTML($thus.attr('title')));
      });
      isEmpty(propObj) && delete pair[field + '_properties'];
    };

    // .kne elements -> list of { query, result, ... } pairs.
    const parsePairs = $translations => {
      const transList = [];
      $translations.find('.kne').each((time, el) => {
        const $kne = $(el);
        const pair = {};
        transList.push(pair);
        pairFieldProcedure(pair, $kne.find('.source'), 'query', ':first-child ');
        pairFieldProcedure(pair, $kne.find('.target'), 'result', '');
        if (!pair.result_properties) {
          // No target-side annotations: hoist the source-side ones onto the pair.
          for (let key in pair.query_properties) {
            pair[key] = pair.query_properties[key];
          }
          delete pair.query_properties;
        }
      });
      return transList;
    };

    // .translations elements -> list of translation groups.
    const parseTranslations = $rom => {
      const transTypes = [];
      $rom.find('.translations').each((time, el) => {
        const $translations = $(el);
        propProcedure($translations, 'h3', {}, parsePairs($translations), transTypes);
      });
      return transTypes;
    };

    // Appends one object per .rom of the entry to `romlar`.
    const parseRoms = ($girdi, romlar) => {
      $girdi.find('.rom').each((time, el) => {
        const $rom = $(el);
        const transTypes = parseTranslations($rom);
        const romObj = {};
        romObj.origin = Utils.ueHTML($girdi.attr('rel'));
        propProcedure($rom, '.romhead', romObj, transTypes, romlar);
      });
    };

    // All entries that belong to one language section.
    const parseGirdis = $lang => {
      const romlar = [];
      $lang.find('.entry').each((time, el) => {
        parseRoms($(el), romlar);
      });
      return romlar;
    };

    // One result object per language section of the page.
    const parseLangs = () => {
      const resultier = [];
      $('.result_list > .lang').each((time, el) => {
        const $lang = $(el);
        const romlar = parseGirdis($lang);
        const $flags = $lang.find('.lang_dir > .flag');
        resultier.push({
          from: $($flags[0]).attr('class').slice(-2),
          to: $($flags[1]).attr('class').slice(-2),
          words: romlar
        });
      });
      return resultier;
    };

    let parsed;
    try {
      $ = cheerio.load(body, {
        decodeEntities: false
      });
      $('.roman, .separator').remove();
      parsed = parseLangs();
    } catch (e) {
      return callback(e);
    }
    // Called outside the try block so that an exception thrown by the callback
    // itself is not mistaken for a parse error and reported a second time.
    return callback(null, parsed);
  }

  /**
   * Returns the dictionary key for a pair of languages, in whichever order the
   * direction list knows it (falls back to `zweite + erste`).
   *
   * @param {string} erste
   * @param {string} zweite
   * @returns {string}
   */
  _getBiLangKey(erste, zweite) {
    const forward = erste.concat(zweite);
    return this.possibleDirections.indexOf(forward) >= 0 ? forward : zweite.concat(erste);
  }
}
// The crawler class is the module's sole export.
module.exports = PonsDepth;