UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

58 lines (57 loc) 1.66 kB
/**
 * Build a loose comparison key from free text: lower-cased, diacritics
 * stripped, every character outside `[a-z0-9]` and whitespace turned into a
 * space, whitespace runs collapsed, and the result trimmed.
 *
 * Letters that have no NFD decomposition into an ASCII base letter (for
 * example "œ") are not in `[a-z0-9]` and therefore become spaces.
 *
 * @param {string | null | undefined} t - Text to normalize; falsy values are treated as "".
 * @returns {string} The normalized key (possibly empty).
 */
export function normalizeText(t) {
  return (t || "")
    .toLowerCase()
    .normalize("NFD")
    .replace(/\p{Diacritic}/gu, "")
    .replace(/[^a-z0-9\s]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/** Named character references understood by {@link decodeHtmlEntities}. */
const NAMED_HTML_ENTITIES = Object.freeze({
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
});

/** Matches one hexadecimal, decimal or supported named character reference. */
const HTML_ENTITY_RE = /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(amp|lt|gt|quot|apos));/g;

/** Highest valid Unicode code point; `String.fromCodePoint` throws above it. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decode numeric (`&#233;`, `&#xE9;`) and the basic named (`&amp;`, `&lt;`,
 * `&gt;`, `&quot;`, `&apos;`) HTML character references.
 *
 * Decoding is done in a single pass so that already-decoded output is never
 * decoded a second time (`&amp;lt;` yields `&lt;`, not `<`). Numeric references
 * outside the Unicode range are left untouched instead of throwing.
 *
 * @param {string | null | undefined} s - Text possibly containing character references.
 * @returns {string} The decoded text, or "" when `s` is falsy.
 */
export function decodeHtmlEntities(s) {
  if (!s) return "";
  return s.replace(HTML_ENTITY_RE, (match, hex, dec, name) => {
    if (name !== undefined) return NAMED_HTML_ENTITIES[name];
    const codePoint =
      hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec, 10);
    // Out-of-range values (including Infinity from very long digit runs) would
    // make String.fromCodePoint throw a RangeError; keep the raw reference.
    return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : match;
  });
}

/**
 * Remove a single trailing `:`, `,`, `.` or `;` together with the whitespace
 * around it, then trim the result.
 *
 * @param {string} s - Text to clean.
 * @returns {string} The text without that trailing punctuation mark.
 */
export function stripTrailingPunct(s) {
  return s.replace(/\s*([:,.;])\s*$/u, "").trim();
}

/**
 * Collapse a speaker label that was emitted twice in a row
 * ("X. X", "X, X" or "X X") into a single occurrence.
 *
 * The label is whitespace-normalized and stripped of one trailing punctuation
 * mark first; a remaining final period is removed from the result.
 *
 * @param {string | null | undefined} raw - Raw speaker label.
 * @returns {string} The de-duplicated label.
 */
export function dedupeSpeaker(raw) {
  let s = stripTrailingPunct(norm(raw));
  // Tried in order; the first pattern that matches the whole label wins.
  const dupPatterns = [
    /^(.+?)\s*[.]\s*\1$/u, // "X. X"
    /^(.+?)\s*,\s*\1,?$/u, // "X, X" or "X, X,"
    /^(.+?)\s+\1$/u, // "X X"
  ];
  for (const re of dupPatterns) {
    const m = s.match(re);
    if (m) {
      s = m[1];
      break;
    }
  }
  return s.replace(/\.\s*$/, "");
}

/**
 * Tighten spacing around typographic apostrophes (’) and before punctuation.
 *
 * All whitespace on either side of every ’ is removed, which also covers the
 * elision cases ("l ’article", "aujourd’ hui"), so no separate passes are
 * needed for them. Whitespace before `, ; : . ! ?` is removed as well.
 *
 * @param {string} s - Text to fix.
 * @returns {string} The text with tightened apostrophes and punctuation.
 */
export function fixApostrophes(s) {
  return s.replace(/\s*’\s*/g, "’").replace(/\s+([,;:.!?])/g, "$1");
}

/**
 * Collapse every run of whitespace into a single ASCII space and trim.
 *
 * No-break spaces need no special handling: `\s` already matches U+00A0.
 *
 * @param {string | null | undefined} s - Text to normalize; falsy values are treated as "".
 * @returns {string} The whitespace-normalized text.
 */
export function norm(s) {
  return (s || "").replace(/\s+/g, " ").trim();
}

/**
 * Collapse whitespace runs (including no-break spaces) into single spaces and
 * trim. Same behaviour as {@link norm}, kept as a separate export for callers
 * that use this name.
 *
 * @param {string | null | undefined} s - Text to normalize; falsy values are treated as "".
 * @returns {string} The whitespace-normalized text.
 */
export function normalizeSpaces(s) {
  return norm(s);
}