// rosaenlg-filter
// Version:
// Filtering feature of RosaeNLG
// 117 lines • 4.91 kB
// JavaScript
;
/**
* @license
* Copyright 2019 Ludan Stoecklé
* SPDX-License-Identifier: Apache-2.0
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.addCaps = exports.quotes = exports.parenthesis = exports.cleanSpacesPunctuation = exports.duplicatePunctuation = exports.EATSPACE = void 0;
const rosaenlg_commons_1 = require("rosaenlg-commons");
// Marker token: cleanSpacesPunctuation removes every occurrence of it together
// with any whitespace or '¤' characters directly around it.
exports.EATSPACE = 'EATSPACE';
/**
 * Collapses runs of punctuation so that only the first mark of each run is kept.
 *
 * Steps:
 *  1. every literal `...` becomes the single ellipsis character `…`
 *     (example from the original: 'bla ...' -> 'bla…' once spaces are cleaned);
 *  2. for each punctuation mark from `allPunctList`, any following marks from
 *     `stdPunctList` (possibly separated by spacing / non-blocking characters)
 *     are removed; the separators themselves are kept as they are.
 *
 * Warning: both `allPunctList` (language specific) and `stdPunctList`
 * (rosaenlg-commons) are used on purpose, to manage Spanish properly.
 *
 * @param {string} input - text to clean
 * @param {object} languageFilter - must expose `constants.allPunctList` and
 *   `constants.spaceOrNonBlockingClass`, both used as regex fragments
 * @returns {string} the text with duplicated punctuation removed
 */
function duplicatePunctuation(input, languageFilter) {
  const { allPunctList, spaceOrNonBlockingClass } = languageFilter.constants;
  const { stdPunctList } = rosaenlg_commons_1.Constants;

  // first mark of a run, then every "separator* + standard mark" that follows it
  const punctRunRe = new RegExp(
    `([${allPunctList}])((?:${spaceOrNonBlockingClass}*[${stdPunctList}])*)`,
    'g',
  );
  // built once per call instead of once per match; String.prototype.replace
  // resets lastIndex for global regexes, so reuse across matches is safe
  const followingPunctRe = new RegExp(`[${stdPunctList}]`, 'g');

  return input
    .replace(/\.\.\./g, '…')
    .replace(
      punctRunRe,
      (_match, firstPunct, trailing) => `${firstPunct}${trailing.replace(followingPunctRe, '')}`,
    );
}
exports.duplicatePunctuation = duplicatePunctuation;
/**
 * Normalizes spacing around punctuation and around the ☛ / ☚ marker characters.
 *
 * Order of operations:
 *  1. runs of 2+ whitespace characters become a single space;
 *  2. `languageFilter.cleanSpacesPunctuation` applies language specific rules;
 *  3. if `languageFilter.cleanSpacesPunctuationDoDefault` is truthy, each standard
 *     punctuation mark loses the whitespace around it and gets exactly one space
 *     after it (non-whitespace separators such as '¤' are kept);
 *  4. whitespace before ☚ and after ☛ is removed
 *     (example from the original: 'bla . </p>' -> 'bla.</p>');
 *  5. the result is trimmed;
 *  6. every EATSPACE marker is removed together with surrounding whitespace / '¤';
 *  7. `languageFilter.cleanSpacesPunctuationCorrect` applies final corrections.
 *
 * @param {string} input - text to clean
 * @param {object} languageFilter - must expose `constants.spaceOrNonBlockingClass`,
 *   `cleanSpacesPunctuation(str)`, `cleanSpacesPunctuationDoDefault` and
 *   `cleanSpacesPunctuationCorrect(str)`
 * @returns {string} the cleaned text
 */
function cleanSpacesPunctuation(input, languageFilter) {
  // 2 spaces or more
  let res = input.replace(/\s{2,}/g, ' ');

  res = languageFilter.cleanSpacesPunctuation(res);

  if (languageFilter.cleanSpacesPunctuationDoDefault) {
    const separators = languageFilter.constants.spaceOrNonBlockingClass;
    // stdPunctList and not allPunctList: on purpose, as special Spanish is managed just before
    const regexPunct = new RegExp(
      `(${separators}*)([${rosaenlg_commons_1.Constants.stdPunctList}])(${separators}*)`,
      'g',
    );
    const dropWhitespace = (chunk) => chunk.replace(/\s/g, '');
    res = res.replace(
      regexPunct,
      (_match, before, punct, after) => `${dropWhitespace(before)}${punct} ${dropWhitespace(after)}`,
    );
  }

  // ['bla . </p>', 'bla.</p>']
  // A single pass per marker is enough: removing whitespace after ☛ can never
  // create new whitespace in front of ☚, so the former second ☚ pass was a no-op.
  res = res.replace(/\s+☚/g, '☚');
  res = res.replace(/☛\s+/g, '☛');

  // spaces at the very end (and at the very start)
  res = res.trim();

  // eat spaces
  const eatspaceRe = new RegExp(`[\\s¤]*${exports.EATSPACE}[\\s¤]*`, 'g');
  res = res.replace(eatspaceRe, '');

  return languageFilter.cleanSpacesPunctuationCorrect(res);
}
exports.cleanSpacesPunctuation = cleanSpacesPunctuation;
/**
 * Fixes spacing around parentheses.
 *
 * Whitespace directly inside the parentheses is removed ('( a )' -> '(a)'),
 * then a space is inserted between a letter and a following '(' and between
 * a ')' and a following letter ('a(b)c' -> 'a (b) c').
 *
 * @param {string} input - text to clean
 * @param {object} languageFilter - must expose `constants.tousCaracteresMinMajRe`,
 *   the content of a regex character class listing the characters treated as letters
 * @returns {string} the text with normalized parenthesis spacing
 */
function parenthesis(input, languageFilter) {
  // no blanks right after '(' nor right before ')'
  const tightened = input.replace(/\(\s+/g, '(').replace(/\s+\)/g, ')');

  const letters = languageFilter.constants.tousCaracteresMinMajRe;
  const letterThenOpening = new RegExp(`[${letters}]\\(`, 'g');
  const closingThenLetter = new RegExp(`\\)[${letters}]`, 'g');

  // each hit is exactly two characters long: the letter and the parenthesis
  return tightened
    .replace(letterThenOpening, (hit) => `${hit[0]} (`)
    .replace(closingThenLetter, (hit) => `) ${hit[1]}`);
}
exports.parenthesis = parenthesis;
/**
 * Normalizes spacing around double quotes.
 *
 * Double quotes are treated alternately as opening and closing: an opening quote
 * (with its surrounding whitespace) becomes ' "', a closing one becomes '" '.
 * A warning is logged when an opening quote is never closed.
 * Quotes adjacent to parentheses are then regrouped as ' ("' and '") '.
 *
 * @param {string} input - text to clean
 * @returns {string} the text with normalized quote spacing
 */
function quotes(input) {
  let insideQuote = false;

  let res = input.replace(/(\s*)"(\s*)/g, () => {
    // each quote flips the state: odd occurrences open, even occurrences close
    insideQuote = !insideQuote;
    return insideQuote ? ' "' : '" ';
  });

  // trigger a warning if an end is missing
  if (insideQuote) {
    console.log(`WARNING: did find a starting " but not the ending one`);
  }

  // mixes of quotes and parenthesis
  res = res.replace(/\(\s*"/g, ' ("');
  res = res.replace(/"\s*\)/g, '") ');

  return res;
}
exports.quotes = quotes;
/**
 * Capitalizes the first letter that follows a sentence-level trigger.
 *
 * 1. After '.', '!', '?', '¡' or '¿': whitespace between the mark and the letter
 *    is removed (other separators are kept), exactly one space is inserted before
 *    the letter, and the letter is upper-cased.
 * 2. `languageFilter.addCapsSpecific` then applies language specific rules.
 * 3. After a ☛ or ☚ marker: plain spaces between the marker and the letter are
 *    removed (only ' ', not other whitespace) and the letter is upper-cased,
 *    without inserting a space.
 *
 * @param {string} input - text to process
 * @param {object} languageFilter - must expose `constants.spaceOrNonBlockingClass`,
 *   `constants.tousCaracteresMinMajRe` (regex fragments) and `addCapsSpecific(str)`
 * @returns {string} the text with capitals added
 */
function addCaps(input, languageFilter) {
  // trigger, then optional separators, then the letter to capitalize
  const capsRegexAfter = (trigger) =>
    new RegExp(
      `(${trigger})(${languageFilter.constants.spaceOrNonBlockingClass}*)([${languageFilter.constants.tousCaracteresMinMajRe}])`,
      'g',
    );

  const afterSentenceMarks = input.replace(
    capsRegexAfter('[\\.!\\?¡¿]'),
    (_match, punct, gap, letter) => `${punct}${gap.replace(/\s/g, '')} ${letter.toUpperCase()}`,
  );

  const afterLanguageRules = languageFilter.addCapsSpecific(afterSentenceMarks);

  return afterLanguageRules.replace(
    capsRegexAfter('[☛☚]'),
    (_match, marker, gap, letter) => `${marker}${gap.replace(/ /g, '')}${letter.toUpperCase()}`,
  );
}
exports.addCaps = addCaps;
//# sourceMappingURL=punctuation.js.map