// rosaenlg-filter
// Version:
// Filtering feature of RosaeNLG
// 153 lines • 6.42 kB
// JavaScript
;
/**
* @license
* Copyright 2023 Ludan Stoecklé
* SPDX-License-Identifier: Apache-2.0
*/
/**
 * TypeScript interop helper: exposes property `key` of `source` on `target`,
 * optionally under a different name (`alias`).
 *
 * When `Object.create` exists, the property is installed with
 * `Object.defineProperty`. The source's own descriptor is reused only when it
 * is an accessor on an `__esModule` object, or a data property that is neither
 * writable nor configurable; in every other case a forwarding getter is
 * installed so that `target` always reads the current `source[key]`.
 * Without `Object.create`, the value is simply copied.
 *
 * An already-defined `this.__createBinding` takes precedence.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exposedName = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        const mustForward = !descriptor ||
            ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (mustForward) {
            descriptor = { enumerable: true, get: () => source[key] };
        }
        Object.defineProperty(target, exposedName, descriptor);
    }
    : function (target, source, key, alias) {
        target[alias === undefined ? key : alias] = source[key];
    });
/**
 * TypeScript interop helper: stores `value` as the `default` member of
 * `namespace`.
 *
 * With `Object.create` available, the member is defined as an enumerable data
 * property (not writable, not configurable, since those flags are left at
 * their `defineProperty` defaults); otherwise it is a plain assignment.
 * An already-defined `this.__setModuleDefault` takes precedence.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? (namespace, value) => {
        Object.defineProperty(namespace, "default", { value: value, enumerable: true });
    }
    : (namespace, value) => {
        namespace["default"] = value;
    });
/**
 * TypeScript interop helper backing `import * as x from "..."`.
 *
 * A truthy module flagged `__esModule` is returned untouched. Anything else is
 * wrapped in a fresh namespace object: every own enumerable key except
 * "default" is bound through `__createBinding`, and the module itself becomes
 * the namespace's `default` through `__setModuleDefault`.
 * An already-defined `this.__importStar` takes precedence.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    const namespace = {};
    if (mod != null) {
        Object.keys(mod)
            .filter((key) => key !== "default")
            .forEach((key) => __createBinding(namespace, mod, key));
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
// Flag the exports object with `__esModule`, the marker that interop helpers
// such as `__importStar` test before wrapping a module.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder value; the class itself is assigned to this export once it has
// been defined, further down in this file.
exports.LanguageFilterFrench = void 0;
const LanguageFilter_1 = require("./LanguageFilter");
const french_contractions_1 = require("french-contractions");
const titleCaseFrFr = __importStar(require("titlecase-french"));
/**
 * French text filter: protects raw numbers, applies "ce" -> "cet", elisions
 * ("le arbre" -> "l'arbre"), two-word contractions ("de le" -> "du"),
 * title-casing and French punctuation spacing.
 *
 * Uses `this.constants`, `this.dictManager` and `this.contract2elts`, none of
 * which are defined in this class; presumably they come from the
 * `LanguageFilter` base class (confirm there).
 */
class LanguageFilterFrench extends LanguageFilter_1.LanguageFilter {
    /**
     * @param languageCommon passed through unchanged to the base constructor.
     */
    constructor(languageCommon) {
        super(languageCommon);
        // NOTE(review): presumably tells the base filter to skip its default
        // punctuation spacing in favour of cleanSpacesPunctuation below; confirm.
        this.cleanSpacesPunctuationDoDefault = false;
    }

    /**
     * Wraps numbers in `<protect>…</protect>` tags.
     *
     * A number is either 1-3 digits followed by whitespace-separated groups of
     * three digits, or a plain run of digits, with an optional `,digits`
     * decimal part. It must be preceded by one non-digit character and then by
     * whatever `constants.stdBeforeWithParenthesis` matches; both are kept in
     * front of the tag.
     *
     * @param {string} input text to scan.
     * @returns {string} the text with matched numbers wrapped.
     */
    protectRawNumbers(input) {
        const numberPattern = new RegExp(`([^\\d])${this.constants.stdBeforeWithParenthesis}((\\d{1,3}(?:\\s\\d{3})*|(?:\\d+))(?:,\\d+)?)`, 'g');
        return input.replace(numberPattern, (_whole, leadingChar, prefix, number) => `${leadingChar}${prefix}<protect>${number}</protect>`);
    }

    /**
     * Builds the regex source matching what follows a determiner: the standard
     * "between" pattern, a captured in-between part, then a captured word that
     * starts with a vowel (or h/H).
     *
     * @param beforeProtect forwarded to `constants.getInBetween`.
     * @returns {string} regex source (three capture groups, assuming
     *   `stdBetweenWithParenthesis` contributes exactly one; confirm).
     */
    getAfterDeterminer(beforeProtect) {
        const constants = this.constants;
        const betweenPart = constants.stdBetweenWithParenthesis;
        const inBetweenGroup = `(${constants.getInBetween(beforeProtect)})`;
        const wordGroup = `([${constants.toutesVoyellesMinMaj}hH][${constants.tousCaracteresMinMajRe}]*)`;
        return `${betweenPart}${inBetweenGroup}${wordGroup}`;
    }

    /**
     * Applies the fixed two-word contractions (de + le -> du, à + les -> aux,
     * si + il -> s'il, ...), in list order, through `contract2elts`.
     *
     * @param {string} input text to process.
     * @returns the result of the last `contract2elts` call.
     */
    twoWordsContractions(input) {
        // each entry: [first word, second word, contracted form]
        const contractionTable = [
            ['de', 'le', 'du'],
            ['de', 'Le', 'du'],
            ['de', 'les', 'des'],
            ['de', 'Les', 'des'],
            ['de', 'des', 'de'],
            ['de', 'lequel', 'duquel'],
            ['de', 'lesquels', 'desquels'],
            ['de', 'lesquelles', 'desquelles'],
            ['des', 'les', 'des'],
            ['à', 'le', 'au'],
            ['à', 'Le', 'au'],
            ['à', 'lequel', 'auquel'],
            ['à', 'les', 'aux'],
            ['à', 'Les', 'aux'],
            ['à', 'lesquels', 'auxquels'],
            ['à', 'lesquelles', 'auxquelles'],
            ['si', 'il', "s'il"],
            ['si', 'ils', "s'ils"],
        ];
        return contractionTable.reduce((text, [first, second, contracted]) => this.contract2elts(first, second, contracted, text), input);
    }

    /**
     * Elides determiners/conjunctions in front of a vowel-initial word
     * (de + vowel, que + vowel, etc.), whenever `contracts` from
     * "french-contractions" accepts the following word.
     *
     * @param {string} input text to process.
     * @param beforeProtect forwarded to `getAfterDeterminer`.
     * @returns {string} the processed text.
     */
    articlesContractionsGeneric(input, beforeProtect) {
        const elidableWords = [
            '[Dd]e',
            '[Qq]ue',
            '[Ll]e',
            '[Ll]a',
            '[Ss]e',
            '[Jj]e',
            '[Tt]e',
            '[Mm]e',
            '[Nn]e',
            '[Pp]uisque',
            '[Jj]usque',
            '[Ll]orsque',
        ];
        return elidableWords.reduce((text, elidable) => {
            // also handles the case where the word (e.g. 'de') starts the sentence
            const elisionRegex = new RegExp(`${this.constants.stdBeforeWithParenthesis}(${elidable})${this.getAfterDeterminer(beforeProtect)}`, 'g');
            return text.replace(elisionRegex, (_whole, before, determiner, between, beforeWord, word) => {
                // whitespace and ¤ markers are dropped from the text between determiner and word
                const newBetween = (between + beforeWord).replace(/[\s¤]+/g, '');
                const elides = (0, french_contractions_1.contracts)(word, this.dictManager.getAdjsWordsData());
                // Elision: last letter of the determiner becomes an apostrophe, no space kept.
                // No elision: determiner, one space, word; the stripped in-between
                // text is emitted BEFORE the determiner.
                // NOTE(review): confirm that placement is intended.
                return elides
                    ? `${before}${determiner.slice(0, -1)}'${newBetween}${word}`
                    : `${before}${newBetween}${determiner} ${word}`;
            });
        }, input);
    }

    /**
     * ce arbre => cet arbre: appends "t" to "ce"/"Ce" when `contracts` from
     * "french-contractions" accepts the following vowel-initial word; the text
     * between determiner and word is kept as matched.
     *
     * @param {string} input text to process.
     * @param beforeProtect forwarded to `getAfterDeterminer`.
     * @returns {string} the processed text.
     */
    ceCetGeneric(input, beforeProtect) {
        const ceRegex = new RegExp(`${this.constants.stdBeforeWithParenthesis}([Cc]e)${this.getAfterDeterminer(beforeProtect)}`, 'g');
        return input.replace(ceRegex, (_whole, before, determiner, between, beforeWord, word) => {
            const ending = (0, french_contractions_1.contracts)(word, this.dictManager.getAdjsWordsData()) ? 't' : '';
            return `${before}${determiner}${ending}${between + beforeWord}${word}`;
        });
    }

    /**
     * Runs all contraction passes in order: ce/cet, then elisions, then
     * two-word contractions (the first two with `beforeProtect` = false).
     *
     * @param {string} input text to process.
     * @returns the fully contracted text.
     */
    contractions(input) {
        const withCet = this.ceCetGeneric(input, false);
        const withElisions = this.articlesContractionsGeneric(withCet, false);
        return this.twoWordsContractions(withElisions);
    }

    /**
     * Title-cases the input by delegating to the "titlecase-french" package.
     *
     * @param input text to convert.
     * @returns whatever `convert` returns.
     */
    titlecase(input) {
        return titleCaseFrFr.convert(input);
    }

    /**
     * Normalises spacing around punctuation.
     *
     * For `: ! ? ;` the result is: preceding run (whitespace removed),
     * a non-breaking space, the mark, a plain space, following run (whitespace
     * removed). For `. , …` it is the same without the non-breaking space.
     * The "runs" are sequences matched by `constants.spaceOrNonBlockingClass`.
     *
     * @param {string} input text to process.
     * @returns {string} the re-spaced text.
     */
    cleanSpacesPunctuation(input) {
        const spacingGroup = `(${this.constants.spaceOrNonBlockingClass}*)`;
        const dropWhitespace = (text) => text.replace(/\s/g, '');
        // all but . and ,
        const doubleMarks = new RegExp(`${spacingGroup}([:!\\?;])${spacingGroup}`, 'g');
        // . and , and …
        const simpleMarks = new RegExp(`${spacingGroup}([\\.,…])${spacingGroup}`, 'g');
        return input
            .replace(doubleMarks, (_whole, before, punc, after) => `${dropWhitespace(before)}\xa0${punc} ${dropWhitespace(after)}`)
            .replace(simpleMarks, (_whole, before, punc, after) => `${dropWhitespace(before)}${punc} ${dropWhitespace(after)}`);
    }
}
// Publish the class as the module's `LanguageFilterFrench` export.
exports.LanguageFilterFrench = LanguageFilterFrench;
//# sourceMappingURL=LanguageFilterFrench.js.map