@tricoteuses/senat
Version:
Handle French Sénat's open data
58 lines (57 loc) • 1.66 kB
JavaScript
/**
 * Reduce a string to a lowercase, accent-free, ASCII alphanumeric form.
 *
 * Every character that is not `a-z`, `0-9` or whitespace becomes a space,
 * runs of whitespace collapse to one space, and the result is trimmed.
 *
 * @param {string | null | undefined} t - Text to normalize; falsy values are treated as "".
 * @returns {string} The normalized text.
 */
export function normalizeText(t) {
  const lowered = (t || "").toLowerCase();
  // NFD splits accented letters into base letter + combining mark, so the
  // marks can be removed on their own.
  const withoutAccents = lowered.normalize("NFD").replace(/\p{Diacritic}/gu, "");
  const alphanumericOnly = withoutAccents.replace(/[^a-z0-9\s]/g, " ");
  return alphanumericOnly.replace(/\s+/g, " ").trim();
}
/**
 * Decode HTML character references in a string.
 *
 * Handles hexadecimal (`&#xE9;`) and decimal (`&#233;`) numeric references and
 * the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;`.
 *
 * All references are decoded in a single pass, so text produced by one
 * replacement is never decoded a second time (`&amp;lt;` yields `&lt;`, not `<`).
 * Numeric references above U+10FFFF are left as written instead of making
 * `String.fromCodePoint` throw a RangeError.
 *
 * @param {string | null | undefined} s - Text that may contain character references.
 * @returns {string} The decoded text, or "" when `s` is falsy.
 */
export function decodeHtmlEntities(s) {
  if (!s) {
    return "";
  }
  const namedEntities = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  const maxCodePoint = 0x10ffff;
  return s.replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(amp|lt|gt|quot|apos));/g,
    (reference, hex, dec, name) => {
      if (name !== undefined) {
        return namedEntities[name];
      }
      const codePoint = hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec, 10);
      // Out-of-range references cannot be represented; keep the source text.
      return codePoint <= maxCodePoint ? String.fromCodePoint(codePoint) : reference;
    },
  );
}
/**
 * Remove one trailing `:`, `,`, `.` or `;` (with any whitespace around it)
 * from the end of a string, then trim the result.
 *
 * Only the last punctuation mark is removed: "Hello.." becomes "Hello.".
 *
 * @param {string} s - Text to clean.
 * @returns {string} The text without its final punctuation mark.
 */
export function stripTrailingPunct(s) {
  const trailingPunct = /\s*([:,.;])\s*$/u;
  const withoutPunct = s.replace(trailingPunct, "");
  return withoutPunct.trim();
}
/**
 * Collapse a speaker label in which the same text appears twice in a row.
 *
 * The input is whitespace-normalized with `norm` and stripped of one trailing
 * punctuation mark with `stripTrailingPunct`. It is then tested, in order,
 * against three shapes: "X. X", "X, X" (optionally with a trailing comma) and
 * "X X". The first shape that matches reduces the label to "X". Finally a
 * trailing period, if any, is removed.
 *
 * @param {string | null | undefined} raw - Raw speaker label.
 * @returns {string} The label with the repetition removed.
 */
export function dedupeSpeaker(raw) {
  const cleaned = stripTrailingPunct(norm(raw));
  const duplicateShapes = [
    /^(.+?)\s*[.]\s*\1$/u,
    /^(.+?)\s*,\s*\1,?$/u,
    /^(.+?)\s+\1$/u,
  ];
  let speaker = cleaned;
  for (const shape of duplicateShapes) {
    const hit = shape.exec(cleaned);
    if (hit !== null) {
      // Keep only the first copy of the repeated text.
      speaker = hit[1];
      break;
    }
  }
  return speaker.replace(/\.\s*$/, "");
}
/**
 * Tighten spacing around typographic apostrophes (’) and before punctuation.
 *
 * Four passes run in order:
 *   1. remove all whitespace on either side of every ’;
 *   2. re-attach a single elision letter (d, l, j, c, t, m, s, n) to a following ’;
 *   3. remove whitespace between a ’ and a following Latin letter;
 *   4. remove whitespace before `,`, `;`, `:`, `.`, `!` or `?`.
 *
 * @param {string} s - Text to fix.
 * @returns {string} The text with corrected spacing.
 */
export function fixApostrophes(s) {
  return s
    .replace(/\s*’\s*/g, "’")
    .replace(/\b([dljctmsn])\s*’/gi, (_, letter) => `${letter}’`)
    .replace(/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1")
    .replace(/\s+([,;:.!?])/g, "$1");
}
/**
 * Normalize the whitespace of a string: non-breaking spaces become regular
 * spaces, runs of whitespace collapse to a single space, and the ends are trimmed.
 *
 * @param {string | null | undefined} s - Text to normalize; falsy values are treated as "".
 * @returns {string} The whitespace-normalized text.
 */
export function norm(s) {
  const text = s || "";
  const withPlainSpaces = text.replace(/\u00A0/g, " ");
  return withPlainSpaces.replace(/\s+/g, " ").trim();
}
/**
 * Normalize the whitespace of a string: non-breaking spaces become regular
 * spaces, runs of whitespace collapse to a single space, and the ends are trimmed.
 *
 * Falsy input (`null`, `undefined`, "") yields "" instead of throwing a TypeError.
 *
 * @param {string | null | undefined} s - Text to normalize.
 * @returns {string} The whitespace-normalized text.
 */
export function normalizeSpaces(s) {
  return (s || "")
    .replace(/\u00A0/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}