UNPKG

rosaenlg-filter

Version:

Filtering feature of RosaeNLG

117 lines 4.91 kB
"use strict";
/**
 * @license
 * Copyright 2019 Ludan Stoecklé
 * SPDX-License-Identifier: Apache-2.0
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.addCaps = exports.quotes = exports.parenthesis = exports.cleanSpacesPunctuation = exports.duplicatePunctuation = exports.EATSPACE = void 0;
const rosaenlg_commons_1 = require("rosaenlg-commons");
/**
 * Marker token. `cleanSpacesPunctuation` deletes every occurrence of it,
 * together with any whitespace and `¤` characters directly around it.
 */
exports.EATSPACE = 'EATSPACE';
/**
 * Collapses runs of punctuation so that only the first sign of a run survives.
 *
 * `...` is first turned into `…`. Then, for every character of
 * `languageFilter.constants.allPunctList` followed by any number of
 * "spacing + standard punctuation" groups, the first punctuation is kept and
 * the following standard punctuation characters are dropped (the spacing that
 * separated them is left in place).
 *
 * warning: we use both allPunctList and stdPunctList to manage Spanish properly
 *
 * @param {string} input - text to process
 * @param {object} languageFilter - provides `constants.allPunctList` and
 *   `constants.spaceOrNonBlockingClass` (regex source fragments)
 * @returns {string} the text without duplicated punctuation
 */
function duplicatePunctuation(input, languageFilter) {
    let res = input;
    // ['bla ...', 'bla…'],
    res = res.replace(/\.\.\./g, '…');
    // ['bla ! . bla', 'Bla! Bla'],
    const { allPunctList, spaceOrNonBlockingClass } = languageFilter.constants;
    const { stdPunctList } = rosaenlg_commons_1.Constants;
    const regexDoublePunct = new RegExp(`([${allPunctList}])((?:${spaceOrNonBlockingClass}*[${stdPunctList}])*)`, 'g');
    // built once per call rather than once per match; String.prototype.replace
    // resets lastIndex on a global regex, so sharing it across matches is safe
    const regexRemovePunct = new RegExp(`[${stdPunctList}]`, 'g');
    res = res.replace(regexDoublePunct, (_match, firstPunct, otherStuff) => {
        return `${firstPunct}${otherStuff.replace(regexRemovePunct, '')}`;
    });
    return res;
}
exports.duplicatePunctuation = duplicatePunctuation;
/**
 * Normalises the spacing around punctuation and around the `☛` / `☚` markers.
 *
 * Steps, in order: collapse whitespace runs to one space, run the
 * language-specific `languageFilter.cleanSpacesPunctuation`, optionally apply
 * the default rule (no whitespace before a standard punctuation sign, exactly
 * one space after it), glue `☛` / `☚` to the adjacent text, trim, remove
 * `EATSPACE` markers, and finally run
 * `languageFilter.cleanSpacesPunctuationCorrect`.
 *
 * @param {string} input - text to process
 * @param {object} languageFilter - provides `cleanSpacesPunctuation(text)`,
 *   `cleanSpacesPunctuationDoDefault`, `cleanSpacesPunctuationCorrect(text)`
 *   and `constants.spaceOrNonBlockingClass`
 * @returns {string} the text with cleaned spacing
 */
function cleanSpacesPunctuation(input, languageFilter) {
    let res = input;
    // 2 spaces or more
    res = res.replace(/\s{2,}/g, ' ');
    res = languageFilter.cleanSpacesPunctuation(res);
    if (languageFilter.cleanSpacesPunctuationDoDefault) {
        const spacing = languageFilter.constants.spaceOrNonBlockingClass;
        // stdPunctList and not allPunctList: on purpose, as special Spanish is managed just before
        const regexPunct = new RegExp(`(${spacing}*)([${rosaenlg_commons_1.Constants.stdPunctList}])(${spacing}*)`, 'g');
        res = res.replace(regexPunct, (_match, before, punct, after) => {
            // only whitespace is stripped from the captured spacing; any other
            // character matched by spaceOrNonBlockingClass is preserved
            return `${before.replace(/\s/g, '')}${punct} ${after.replace(/\s/g, '')}`;
        });
    }
    // ['bla . </p>', 'bla.</p>']
    res = res.replace(/\s+☚/g, '☚');
    // Removing whitespace after '☛' can never put whitespace back in front of
    // a '☚', so one '\s+☚' pass is enough.
    res = res.replace(/☛\s+/g, '☛');
    // spaces at the very end
    res = res.trim();
    // eat spaces
    const eatspaceRe = new RegExp(`[\\s¤]*${exports.EATSPACE}[\\s¤]*`, 'g');
    res = res.replace(eatspaceRe, '');
    res = languageFilter.cleanSpacesPunctuationCorrect(res);
    return res;
}
exports.cleanSpacesPunctuation = cleanSpacesPunctuation;
/**
 * Fixes spacing around parentheses: no whitespace just inside them, and a
 * space between a letter and an adjacent opening / closing parenthesis.
 *
 * @param {string} input - text to process
 * @param {object} languageFilter - provides `constants.tousCaracteresMinMajRe`,
 *   the content of a regex character class (presumably all lower and upper
 *   case letters of the language - confirm against rosaenlg-commons)
 * @returns {string} the text with normalised parentheses
 */
function parenthesis(input, languageFilter) {
    let res = input;
    // remove spaces after '(' or before ')'
    res = res.replace(/\(\s+/g, '(');
    res = res.replace(/\s+\)/g, ')');
    // add spaces before '(' or after ')'
    const letters = languageFilter.constants.tousCaracteresMinMajRe;
    const regexSpaceBeforePar = new RegExp(`[${letters}]\\(`, 'g');
    res = res.replace(regexSpaceBeforePar, (match) => `${match.charAt(0)} (`);
    const regexSpaceAfterPar = new RegExp(`\\)[${letters}]`, 'g');
    res = res.replace(regexSpaceAfterPar, (match) => `) ${match.charAt(1)}`);
    return res;
}
exports.parenthesis = parenthesis;
/**
 * Normalises spacing around straight double quotes.
 *
 * Quotes are treated alternately as opening (rewritten to ` "`) and closing
 * (rewritten to `" `), whatever whitespace surrounded them. A warning is
 * logged with `console.log` when the last quote found is an opening one.
 * Quote / parenthesis combinations are then rewritten to ` ("` and `") `.
 *
 * @param {string} input - text to process
 * @returns {string} the text with normalised quotes
 */
function quotes(input) {
    let res = input;
    let alreadyStarted = false;
    res = res.replace(/(\s*)"(\s*)/g, () => {
        // each quote flips the state: odd occurrences open, even ones close
        alreadyStarted = !alreadyStarted;
        return alreadyStarted ? ' "' : '" ';
    });
    // trigger a warning if an end is missing
    if (alreadyStarted) {
        console.log(`WARNING: did find a starting " but not the ending one`);
    }
    // mixes of quotes and parenthesis
    res = res.replace(/\(\s*"/g, ' ("');
    res = res.replace(/"\s*\)/g, '") ');
    return res;
}
exports.quotes = quotes;
/**
 * Capitalises the first letter that follows sentence punctuation or a
 * `☛` / `☚` marker.
 *
 * @param {string} input - text to process
 * @param {object} languageFilter - provides `addCapsSpecific(text)`,
 *   `constants.spaceOrNonBlockingClass` and `constants.tousCaracteresMinMajRe`
 * @returns {string} the text with capitals added
 */
function addCaps(input, languageFilter) {
    let res = input;
    {
        const triggerCapsWithSpace = '[\\.!\\?¡¿]';
        const regexCapsAfterDot = new RegExp(`(${triggerCapsWithSpace})(${languageFilter.constants.spaceOrNonBlockingClass}*)([${languageFilter.constants.tousCaracteresMinMajRe}])`, 'g');
        res = res.replace(regexCapsAfterDot, (_match, punct, before, firstWord) => {
            // whitespace between the sign and the letter is replaced by a
            // single space placed right before the capitalised letter
            return `${punct}${before.replace(/\s/g, '')} ${firstWord.toUpperCase()}`;
        });
    }
    res = languageFilter.addCapsSpecific(res);
    {
        const regexCapsAfterP = new RegExp(`([☛☚])(${languageFilter.constants.spaceOrNonBlockingClass}*)([${languageFilter.constants.tousCaracteresMinMajRe}])`, 'g');
        res = res.replace(regexCapsAfterP, (_match, start, between, char) => {
            // note: only plain spaces are stripped here, not every whitespace
            return `${start}${between.replace(/ /g, '')}${char.toUpperCase()}`;
        });
    }
    return res;
}
exports.addCaps = addCaps;
//# sourceMappingURL=punctuation.js.map