yaspeller
Version:
A tool for finding typos in text, files, and websites
525 lines (442 loc) • 13.7 kB
JavaScript
;
const async = require('async');
const entities = require('entities');
const fs = require('fs');
const eyo = require('./eyo');
const formatModule = require('./format');
const ignore = require('./ignore');
const isutf8 = require('isutf8');
const request = require('request');
const pth = require('path');
const showdown = require('showdown');
const xml2js = require('xml2js');
const yaspellerApi = require('yandex-speller');
const markdownConverter = new showdown.Converter();
const printDebug = require('../lib/debug').print;
// Upper bound for the length of a single chunk of text sent in one API request
// (used as the chunk size when splitting long texts).
const MAX_LEN_TEXT = 10000; // Max length of text for Yandex.Speller API
// Error code that is left out of the exported error list and skipped when
// positions are computed. NOTE(review): presumably the API's "too many errors"
// code - confirm against the errorCodes of yandex-speller.
const TOO_MANY_ERRORS = 4;
/**
 * Get the limit of simultaneous requests.
 *
 * @param {Object} settings
 * @param {number} [settings.maxRequest]
 * @returns {number} `settings.maxRequest` when it is truthy, otherwise 2.
 */
function getMaxRequest(settings) {
    const DEFAULT_LIMIT = 2;
    const {maxRequest} = settings;

    return maxRequest ? maxRequest : DEFAULT_LIMIT;
}
/**
 * Replace every opening or closing HTML tag with a single space.
 *
 * Only sequences that start with `<` or `</` followed by a latin letter are
 * treated as tags, so comments and lone `<` characters are left untouched.
 *
 * @param {string} html
 * @returns {string}
 */
function stripTags(html) {
    const TAG_RE = /<\/?[a-z][^>]*>/gi;

    return html.replace(TAG_RE, ' ');
}
/**
 * Check text for typos.
 *
 * The text is cleaned up (ignored fragments, markup, special symbols), split
 * into chunks and every chunk is sent to the API. The callback receives
 * `(err, data, originalText)`, where `data` is either the merged list of
 * typos or, if a request failed, the error of the first failed request.
 *
 * @param {string} originalText
 * @param {Function} callback
 * @param {Object} [settings] See {@tutorial settings}
 * @param {string} [settings.format] Text format: plain or html.
 * @param {string|Array} [settings.lang] Language: en, ru or uk.
 * @param {Array<RegExp>} [settings.ignoreText]
 * @param {Object} [settings.options]
 */
function checkText(originalText, callback, settings) {
    // Work on a copy, the settings of the caller are not modified.
    const apiSettings = Object.assign({}, settings);
    const format = formatModule.getFormat(originalText, apiSettings);

    const lang = apiSettings.lang || 'en,ru';
    apiSettings.lang = Array.isArray(lang) ? lang.join(',') : lang;

    let text = originalText;

    if (Array.isArray(apiSettings.ignoreText)) {
        text = apiSettings.ignoreText.reduce((acc, re) => acc.replace(re, ''), text);
    }

    if (ignore.hasIgnoredText(text)) {
        text = ignore.blocks(ignore.lines(text));
    }

    const isMarkdown = format === 'markdown';
    if (isMarkdown || format === 'html') {
        if (isMarkdown) {
            text = markdownConverter.makeHtml(text);
        }

        if (apiSettings.ignoreTags) {
            text = ignore.tags(text, apiSettings.ignoreTags);
        }

        text = entities.decodeHTML(stripTags(ignore.comments(text)));
    }

    text = prepareText(text, format);

    const chunks = splitText(text);
    apiSettings.format = formatModule.getApiFormat(format);

    const tasks = chunks.map((chunk, index) => {
        printDebug({
            request: index,
            format: format,
            apiFormat: apiSettings.format,
            lang: apiSettings.lang,
            options: apiSettings.options,
            text: chunk.substring(0, 128)
        });

        // A task never fails from the point of view of async: the outcome
        // is reported as a pair [hasError, payload].
        return (done) => {
            yaspellerApi.checkText(chunk, (error, body) => {
                done(false, error ? [true, error] : [false, body]);
            }, apiSettings);
        };
    });

    async.parallelLimit(tasks, getMaxRequest(apiSettings), (err, data) => {
        const merged = mergeResults(data);

        if (!merged.err && apiSettings.checkYo) {
            checkYo(text, merged.data);
        }

        callback(merged.err, merged.data, originalText);
    });
}
/**
 * Append the findings of the eyo checker to the list of typos.
 * Every finding is added with the code 100.
 *
 * @param {string} text
 * @param {Object[]} data - Array of typos, extended in place.
 */
function checkYo(text, data) {
    const YO_CODE = 100;

    eyo(text).forEach(({position, before, after, count}) => {
        data.push({
            code: YO_CODE,
            position,
            word: before,
            s: [after],
            count
        });
    });
}
/**
 * Split text into chunks that are not longer than MAX_LEN_TEXT.
 * The place of every cut is chosen by getPosition(); joined together
 * the chunks give the original text.
 *
 * @param {string} text
 * @returns {string[]} Empty array for an empty text.
 */
function splitText(text) {
    const chunks = [];
    let start = 0;

    // Cut off full-sized chunks while the rest does not fit into one request.
    while (start + MAX_LEN_TEXT < text.length) {
        const end = getPosition(text, start + MAX_LEN_TEXT);
        chunks.push(text.substring(start, end));
        start = end;
    }

    // The tail (or the whole text when it is short enough).
    if (start < text.length) {
        chunks.push(text.substring(start));
    }

    return chunks;
}
/**
 * Find a place to cut the text: the nearest space, line feed or tab
 * before `start`, looking back at most 500 symbols.
 *
 * @param {string} text
 * @param {number} start
 * @returns {number} Index of the found whitespace symbol or `start` if nothing was found.
 */
function getPosition(text, start) {
    const LOOKBACK = 500; // MAX_LEN_TEXT / 20
    const BREAKS = new Set([' ', '\n', '\t']);
    const lowest = start - LOOKBACK;

    let index = start;
    while (--index >= lowest) {
        if (BREAKS.has(text[index])) {
            return index;
        }
    }

    return start;
}
/**
 * Merge the results of several requests.
 *
 * Every result is a pair `[hasError, payload]`. If any request failed,
 * the payload of the first failed request is returned as data,
 * otherwise the payloads of all requests are concatenated.
 *
 * @param {Array[]} res
 * @returns {{err: boolean, data: *}}
 */
function mergeResults(res) {
    let failed = null;

    const allSucceeded = res.every((entry) => {
        if (entry[0]) {
            failed = entry;
            return false;
        }

        return true;
    });

    if (!allSucceeded) {
        return {err: true, data: failed[1]};
    }

    return {
        err: false,
        data: res.reduce((merged, entry) => merged.concat(entry[1]), [])
    };
}
/**
 * Check text in file on typos.
 *
 * The file is read synchronously. On failure the callback is called with
 * `(true, Error)`, on success with `(false, {resource, data, time}, originalText)`.
 * The extension of the file is stored in `settings.extname`.
 *
 * @param {string} file
 * @param {Function} callback
 * @param {Object} [settings] See {@tutorial options}
 */
function checkFile(file, callback, settings) {
    settings = settings || {};
    settings.extname = pth.extname(file);

    printDebug('get: ' + file);

    if (!fs.existsSync(file)) {
        callback(true, Error(file + ': is not exists'));
        return;
    }

    if (!fs.statSync(file).isFile()) {
        callback(true, Error(file + ': is not file'));
        return;
    }

    const buf = fs.readFileSync(file);
    if (!isutf8(buf)) {
        callback(true, Error(file + ': is not utf-8'));
        return;
    }

    printDebug('post text -> Yandex.Speller API: ' + file);

    const startTime = Date.now();
    const onChecked = function(err, data, originalText) {
        const payload = err ?
            data :
            {resource: file, data: data, time: Date.now() - startTime};

        callback(err, payload, originalText);
    };

    checkText(buf.toString(), onChecked, settings);
}
/**
 * Check text on link for typos.
 *
 * On failure (request error or status code other than 200) the callback
 * is called with `(true, Error)`, on success with
 * `(false, {resource, data, time}, originalText)`.
 * The extension of the url is stored in `settings.extname`.
 *
 * @param {string} url
 * @param {Function} callback
 * @param {Object} [settings] See {@tutorial settings}
 */
function checkUrl(url, callback, settings) {
    const options = settings || {};
    options.extname = pth.extname(url);

    printDebug('get: ' + url);

    const requestParams = {
        method: 'GET',
        uri: url,
        gzip: true
    };

    request.get(requestParams, function(error, response, text) {
        if (error) {
            callback(true, error);
            return;
        }

        const status = response.statusCode;
        if (status !== 200) {
            callback(true, Error(url + ': returns status code is ' + status));
            return;
        }

        const startTime = Date.now();
        checkText(text, function(err, data, originalText) {
            const payload = err ?
                data :
                {resource: url, data: data, time: Date.now() - startTime};

            callback(err, payload, originalText);
        }, options);
    });
}
/**
 * Check text on pages of sitemap.xml.
 *
 * The sitemap is downloaded and parsed, then every `loc` of every `url`
 * entry is checked with checkUrl(). If the sitemap cannot be fetched or
 * parsed, a single failed result `[true, Error]` is reported.
 *
 * @param {string} url
 * @param {Function} commonCallback - Common callback, receives the array of results `[err, data]`.
 * @param {Object} [settings] See {@tutorial settings}
 * @param {Function} [callback] callback - Callback on each url.
 */
function checkSitemap(url, commonCallback, settings, callback) {
    settings = settings || {};

    const results = [];

    // Report a failure of the sitemap itself and finish.
    // `context` is forwarded as `this` of the per-url callback.
    function fail(context, error) {
        const entry = [true, error];
        results.push(entry);

        if (callback) {
            callback.apply(context, entry);
        }

        commonCallback(results);
    }

    printDebug('get: ' + url);

    request.get(url, function(error, response, xml) {
        if (error) {
            fail(this, error);
            return;
        }

        if (response.statusCode !== 200) {
            fail(this, Error(url + ': returns status code is ' + response.statusCode));
            return;
        }

        const parser = new xml2js.Parser();
        parser.parseString(xml, function(parseError, result) {
            if (parseError) {
                fail(this, Error(url + ': error parsing xml'));
                return;
            }

            const tasks = [];
            const hasUrls = result && result.urlset && Array.isArray(result.urlset.url);

            if (hasUrls) {
                result.urlset.url.forEach(function(entry) {
                    if (!entry.loc) {
                        return;
                    }

                    entry.loc.forEach(function(pageUrl) {
                        tasks.push(function(done) {
                            checkUrl(pageUrl, function(err, data, originalText) {
                                if (callback) {
                                    callback(err, data, originalText);
                                }

                                // The task itself never fails, the outcome is a pair.
                                done(false, [err, data]);
                            }, settings);
                        });
                    });
                });
            }

            async.parallelLimit(tasks, getMaxRequest(settings), function(err, data) {
                commonCallback(data);
            });
        });
    });
}
/**
 * Add positions (line number and column number) for typos.
 *
 * For every typo without a position the text is searched for whole-word
 * occurrences of `item.word` (not surrounded by latin or cyrillic letters).
 * The found positions are stored in `item.position`; if more occurrences
 * are found than `item.count` reports, the positions are considered
 * unreliable and an empty array is stored instead.
 *
 * Items with the code TOO_MANY_ERRORS and items that already have
 * a position are left untouched.
 *
 * @param {string} text
 * @param {Object[]} data - Array of typos, modified in place.
 */
function addPositions(text, data) {
    const NON_LETTER = '[^a-zA-Zа-яА-ЯЁёҐґЄєІіЇї]';
    const nonLetterRe = new RegExp(NON_LETTER);
    const lineBreakRe = /\r\n|\n|\r/;

    data.forEach(function(item) {
        if (item.code === TOO_MANY_ERRORS || item.position) {
            return;
        }

        // The word is searched literally: without escaping, words such as
        // "c++" produce an invalid pattern and "a.b" matches unrelated text.
        const escapedWord = String(item.word).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const wordRe = new RegExp(escapedWord + '(?:' + NON_LETTER + '|$)', 'g');

        const result = [];
        let match;

        while ((match = wordRe.exec(text)) !== null) {
            // Protection from an endless loop on an empty match.
            if (match[0] === '') {
                wordRe.lastIndex++;
            }

            const index = match.index;
            const prevSymbol = text[index - 1];

            // A letter right before the match means it is a part of a longer word.
            if (prevSymbol && !nonLetterRe.test(prevSymbol)) {
                continue;
            }

            const lines = text.slice(0, index).split(lineBreakRe);
            result.push({
                line: lines.length,
                column: lines[lines.length - 1].length + 1
            });
        }

        item.position = item.count >= result.length ? result : [];
    });
}
/**
 * Remove duplicates in typos.
 *
 * Typos with the same code and word are merged into one item: the counts
 * are summed up (a missing count is treated as 1) and the positions are
 * concatenated. Suggestions (`s`) of the first occurrence are kept as
 * `suggest`. Items without a word are dropped.
 *
 * The result is grouped by code, the words inside a group are sorted.
 *
 * @param {Object[]} data - Array of typos.
 * @returns {Object[]}
 */
function removeDuplicates(data) {
    // Dictionaries without a prototype: words like "toString" or "constructor"
    // must not be mistaken for inherited members of Object.prototype.
    const byCode = Object.create(null);

    data.forEach(function(el) {
        const code = el.code;
        const word = el.word;
        const s = el.s;

        if (!word) {
            return;
        }

        const hasPosition = Array.isArray(el.position);
        const words = byCode[code] || (byCode[code] = Object.create(null));
        const known = words[word];

        if (!known) {
            const entry = {
                code,
                word,
                count: el.count || 1,
            };

            if (Array.isArray(s) && s.length) {
                entry.suggest = s;
            }

            if (hasPosition) {
                entry.position = el.position;
            }

            words[word] = entry;
            return;
        }

        known.count += el.count || 1;

        if (hasPosition) {
            known.position = Array.isArray(known.position) ?
                known.position.concat(el.position) :
                el.position;
        }
    });

    const result = [];

    Object.keys(byCode).forEach(function(code) {
        const words = byCode[code];

        Object.keys(words).sort().forEach(function(word) {
            result.push(words[word]);
        });
    });

    return result;
}
/**
 * Sort results by positions.
 *
 * The array is sorted in place: first by code, then by the first position
 * (line, then column). Items without positions go after items with
 * positions and are sorted by word, case-insensitively. A missing
 * `position` property is treated as an empty list of positions.
 *
 * @param {Object[]} data
 */
function sortByPositions(data) {
    // Three-way comparison, returns 0 for equal values so that
    // the comparator stays consistent.
    function compare(x, y) {
        if (x > y) {
            return 1;
        }

        if (x < y) {
            return -1;
        }

        return 0;
    }

    data.sort(function(a, b) {
        // Sort by a code
        const byCode = compare(a.code, b.code);
        if (byCode !== 0) {
            return byCode;
        }

        // Items may have no positions at all (e.g. after removeDuplicates).
        const posA = a.position || [];
        const posB = b.position || [];

        // No position
        if (!posA.length || !posB.length) {
            if (posA.length === posB.length) {
                // Sort by a word
                return compare((a.word || '').toLowerCase(), (b.word || '').toLowerCase());
            }

            return posA.length < posB.length ? 1 : -1;
        }

        // Sort by a line, then by a column
        return compare(posA[0].line, posB[0].line) ||
            compare(posA[0].column, posB[0].column);
    });
}
/**
 * Prepare text for sending to the API: normalize line endings,
 * remove special symbols and trim the result.
 *
 * @param {string} text
 * @returns {string}
 */
function prepareText(text) {
    const normalized = fixLineEndings(text);

    return removeSpecialSymbols(normalized).trim();
}
/**
 * Normalize line endings and whitespace.
 *
 * Line endings are converted to `\n`, trailing whitespace of a line is
 * removed, runs of whitespace inside a line are collapsed into one space
 * and repeated line endings into one line ending.
 *
 * @param {string} text
 * @returns {string}
 */
function fixLineEndings(text) {
    // [^\S\n] is "any whitespace except a line feed": a plain \s would also
    // swallow the line feeds and glue all the lines into one.
    return text.replace(/\r\n/g, '\n') // Fix Windows
        .replace(/\r/g, '\n') // Fix MacOS
        .replace(/[^\S\n]+\n/g, '\n') // Trailing spaces
        .replace(/[^\S\n]+/g, ' ') // Repeat spaces
        .replace(/\n+/g, '\n'); // Repeat line endings
}
/**
 * Remove symbols that do not affect the meaning of the text:
 * the acute accent after a vowel, zero-width (non-)joiners and soft hyphens.
 *
 * @param {string} text
 * @returns {string}
 */
function removeSpecialSymbols(text) {
    // Vowels that may carry the accent
    // en: aeiouy
    // ru: аеёиоуыэюя
    // uk: аеєиіїоуюя
    const ACCENTED_VOWEL_RE = /([aeiouyаеёиоуыэюяєії])\u0301/gi; // Acute accent
    // eslint-disable-next-line no-misleading-character-class
    const INVISIBLE_RE = /[\u200c\u200d\u00ad]/g; // Zero-width non-joiner, Zero-width joiner and shy

    const withoutAccents = text.replace(ACCENTED_VOWEL_RE, '$1');

    return withoutAccents.replace(INVISIBLE_RE, '');
}
/**
 * Build the list of error kinds: the error codes of the API
 * without TOO_MANY_ERRORS, followed by the error for the letter Ё.
 *
 * @returns {Array<{code: number, title: string}>}
 */
function getErrors() {
    const errors = [];

    yaspellerApi.errorCodes.forEach(function(el) {
        if (el.code !== TOO_MANY_ERRORS) {
            errors.push({code: el.code, title: el.text});
        }
    });

    errors.push({
        code: 100, // ERROR_EYO
        title: 'Letter Ё (Yo)'
    });

    return errors;
}
// Public API of the module.
module.exports = {
addPositions,
// The list of error kinds; getErrors() is called once, when the module is loaded.
errors: getErrors(),
checkFile,
checkSitemap,
checkText,
checkUrl,
removeDuplicates,
sortByPositions
};