UNPKG

rosaenlg-filter

Version:

Filtering feature of RosaeNLG

153 lines 6.42 kB
"use strict";
/**
 * @license
 * Copyright 2023 Ludan Stoecklé
 * SPDX-License-Identifier: Apache-2.0
 */
// Module-interop helpers (they look compiler-emitted; kept token-for-token).
// __createBinding re-exposes property `k` of module `m` on object `o` (as `k2`),
// through a getter when Object.create is available, by plain assignment otherwise.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// __setModuleDefault stores `v` as the "default" property of `o`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// __importStar returns `mod` unchanged when it is flagged __esModule; otherwise it
// builds a namespace-like object from the own properties of `mod` (except "default")
// and sets `mod` itself as its "default".
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.LanguageFilterFrench = void 0;
const LanguageFilter_1 = require("./LanguageFilter");
const french_contractions_1 = require("french-contractions");
const titleCaseFrFr = __importStar(require("titlecase-french"));

/**
 * French-specific text filter: number protection, contractions
 * (elision, "ce" -> "cet", preposition + article), title casing and
 * spacing around punctuation.
 *
 * Relies on members that are not defined in this file and are presumably
 * provided by the LanguageFilter base class: `this.constants`,
 * `this.dictManager` and `this.contract2elts` (TODO confirm).
 */
class LanguageFilterFrench extends LanguageFilter_1.LanguageFilter {
    /**
     * @param languageCommon - forwarded unchanged to the base class constructor.
     */
    constructor(languageCommon) {
        super(languageCommon);
        // NOTE(review): presumably tells the base class to skip its default
        // punctuation clean-up in favour of cleanSpacesPunctuation below; confirm.
        this.cleanSpacesPunctuationDoDefault = false;
    }

    /**
     * Wraps raw numbers in `<protect>…</protect>` tags.
     *
     * A number is either 1-3 digits followed by whitespace-separated groups of
     * 3 digits, or a plain run of digits, optionally followed by `,` and more
     * digits. It must be preceded by one non-digit character and by the
     * `stdBeforeWithParenthesis` pattern.
     *
     * @param {string} input - text to process.
     * @returns {string} the text with matching numbers protected.
     */
    protectRawNumbers(input) {
        const regexNumber = new RegExp(`([^\\d])${this.constants.stdBeforeWithParenthesis}((\\d{1,3}(?:\\s\\d{3})*|(?:\\d+))(?:,\\d+)?)`, 'g');
        // Assumes stdBeforeWithParenthesis contributes exactly one capture group
        // (before2); TODO confirm against the constants definition.
        return input.replace(
            regexNumber,
            (_match, before1, before2, content) => `${before1}${before2}<protect>${content}</protect>`,
        );
    }

    /**
     * Builds the regex source that matches what follows a determiner: the
     * standard "between" pattern, an in-between group, then a word starting
     * with a vowel or h/H.
     *
     * The returned expression is kept on the same line as `return` on purpose:
     * a line break right after `return` would make the method return undefined.
     *
     * @param {boolean} beforeProtect - forwarded to `constants.getInBetween`.
     * @returns {string} regex source fragment (contains capture groups).
     */
    getAfterDeterminer(beforeProtect) {
        return `${this.constants.stdBetweenWithParenthesis}(${this.constants.getInBetween(beforeProtect)})([${this.constants.toutesVoyellesMinMaj}hH][${this.constants.tousCaracteresMinMajRe}]*)`;
    }

    /**
     * Applies fixed two-word contractions (e.g. "de le" -> "du",
     * "à les" -> "aux", "si il" -> "s'il") by delegating each
     * [first, second, contracted] triple to `contract2elts`, in list order.
     *
     * @param {string} input - text to process.
     * @returns {string} the text after all pairs have been processed.
     */
    twoWordsContractions(input) {
        const contrList = [
            ['de', 'le', 'du'],
            ['de', 'Le', 'du'],
            ['de', 'les', 'des'],
            ['de', 'Les', 'des'],
            ['de', 'des', 'de'],
            ['de', 'lequel', 'duquel'],
            ['de', 'lesquels', 'desquels'],
            ['de', 'lesquelles', 'desquelles'],
            ['des', 'les', 'des'],
            ['à', 'le', 'au'],
            ['à', 'Le', 'au'],
            ['à', 'lequel', 'auquel'],
            ['à', 'les', 'aux'],
            ['à', 'Les', 'aux'],
            ['à', 'lesquels', 'auxquels'],
            ['à', 'lesquelles', 'auxquelles'],
            ['si', 'il', "s'il"],
            ['si', 'ils', "s'ils"],
        ];
        let res = input;
        for (const [first, second, contracted] of contrList) {
            res = this.contract2elts(first, second, contracted, res);
        }
        return res;
    }

    /**
     * Elides short words before a vowel or h: de + vowel, que + vowel, etc.
     *
     * For each determiner pattern, when `contracts(word, …)` from
     * french-contractions is truthy, the determiner's last character is
     * replaced by an apostrophe and glued to the word.
     *
     * @param {string} input - text to process.
     * @param {boolean} beforeProtect - forwarded to `getAfterDeterminer`.
     * @returns {string} the text with elisions applied.
     */
    articlesContractionsGeneric(input, beforeProtect) {
        const contrList = [
            '[Dd]e',
            '[Qq]ue',
            '[Ll]e',
            '[Ll]a',
            '[Ss]e',
            '[Jj]e',
            '[Tt]e',
            '[Mm]e',
            '[Nn]e',
            '[Pp]uisque',
            '[Jj]usque',
            '[Ll]orsque',
        ];
        let res = input;
        for (const contr of contrList) {
            // The leading stdBefore pattern also handles the case where the
            // determiner (e.g. 'de') is at the start of a sentence.
            const regexDe = new RegExp(`${this.constants.stdBeforeWithParenthesis}(${contr})${this.getAfterDeterminer(beforeProtect)}`, 'g');
            res = res.replace(regexDe, (_match, before, determiner, between, beforeWord, word) => {
                // We contract, thus keep no whitespace (nor '¤') in between.
                const newBetween = (between + beforeWord).replace(/[\s¤]+/g, '');
                if ((0, french_contractions_1.contracts)(word, this.dictManager.getAdjsWordsData())) {
                    return `${before}${determiner.substring(0, determiner.length - 1)}'${newBetween}${word}`;
                }
                // No contraction.
                // NOTE(review): the original comment said "do nothing", yet the
                // match is rebuilt with the cleaned in-between content moved
                // before the determiner and a single space before the word;
                // confirm this is intended.
                return `${before}${newBetween}${determiner} ${word}`;
            });
        }
        return res;
    }

    /**
     * Turns "ce"/"Ce" into "cet"/"Cet" when the following word triggers a
     * contraction, e.g. "ce arbre" => "cet arbre". Text between the
     * determiner and the word is kept as is.
     *
     * @param {string} input - text to process.
     * @param {boolean} beforeProtect - forwarded to `getAfterDeterminer`.
     * @returns {string} the processed text.
     */
    ceCetGeneric(input, beforeProtect) {
        const regexCe = new RegExp(`${this.constants.stdBeforeWithParenthesis}([Cc]e)${this.getAfterDeterminer(beforeProtect)}`, 'g');
        return input.replace(regexCe, (_match, before, determiner, between, beforeWord, word) => {
            const newBetween = between + beforeWord;
            if ((0, french_contractions_1.contracts)(word, this.dictManager.getAdjsWordsData())) {
                return `${before}${determiner}t${newBetween}${word}`;
            }
            // No contraction: re-emit the match unchanged.
            return `${before}${determiner}${newBetween}${word}`;
        });
    }

    /**
     * Runs all French contractions, in this order: ce/cet, elisions, then
     * two-word contractions.
     *
     * @param {string} input - text to process.
     * @returns {string} the contracted text.
     */
    contractions(input) {
        let res = input;
        res = this.ceCetGeneric(res, false);
        res = this.articlesContractionsGeneric(res, false);
        res = this.twoWordsContractions(res);
        return res;
    }

    /**
     * Title-cases the input by delegating to `titlecase-french`'s `convert`.
     *
     * @param {string} input - text to convert.
     * @returns {string} the value returned by `convert`.
     */
    titlecase(input) {
        return titleCaseFrFr.convert(input);
    }

    /**
     * Normalizes spacing around punctuation:
     * - `:`, `!`, `?`, `;` get a non-breaking space (\xa0) before and a regular
     *   space after;
     * - `.`, `,`, `…` get nothing before and a regular space after.
     *
     * In both cases only whitespace is removed from the runs captured on each
     * side of the punctuation; any other captured character is kept.
     *
     * @param {string} input - text to process.
     * @returns {string} the text with normalized spacing.
     */
    cleanSpacesPunctuation(input) {
        const stripWhitespace = (chunk) => chunk.replace(/\s/g, '');
        // all but . and ,
        const regexAllButDot = new RegExp(`(${this.constants.spaceOrNonBlockingClass}*)([:!\\?;])(${this.constants.spaceOrNonBlockingClass}*)`, 'g');
        // . and , and …
        const regexDot = new RegExp(`(${this.constants.spaceOrNonBlockingClass}*)([\\.,…])(${this.constants.spaceOrNonBlockingClass}*)`, 'g');
        return input
            .replace(
                regexAllButDot,
                (_match, before, punc, after) => `${stripWhitespace(before)}\xa0${punc} ${stripWhitespace(after)}`,
            )
            .replace(
                regexDot,
                (_match, before, punc, after) => `${stripWhitespace(before)}${punc} ${stripWhitespace(after)}`,
            );
    }
}
exports.LanguageFilterFrench = LanguageFilterFrench;
//# sourceMappingURL=LanguageFilterFrench.js.map