@tricoteuses/senat
Version:
Handle French Sénat's open data
98 lines (97 loc) • 3.42 kB
JavaScript
import { toFRDate } from "../utils/date";
import { MAX_CANDIDATES, SENAT_VIDEOS_SEARCH_AJAX } from "./config";
import * as cheerio from "cheerio";
/**
 * Download a URL with the global `fetch` and return the response body as text.
 *
 * @param {string} url - Address passed straight to `fetch`.
 * @returns {Promise<string|null>} The body text, or `null` when the response
 *   reports a non-OK status (`ok` is falsy).
 */
export async function fetchText(url) {
  const response = await fetch(url);
  // A non-OK response is reported as null rather than thrown.
  return response.ok ? await response.text() : null;
}
/**
 * Download a URL with the global `fetch` and return the body as a `Buffer`.
 *
 * @param {string} url - Address passed straight to `fetch`.
 * @returns {Promise<Buffer|null>} A Buffer built from the response's
 *   ArrayBuffer, or `null` when the response reports a non-OK status.
 */
export async function fetchBuffer(url) {
  const response = await fetch(url);
  if (response.ok) {
    const bytes = await response.arrayBuffer();
    return Buffer.from(bytes);
  }
  return null;
}
/**
 * Classify an agenda entry from its `organe` field.
 *
 * @param {{ organe?: string }} agenda - Entry whose `organe` is inspected;
 *   a missing or falsy `organe` is treated as the empty string.
 * @returns {"Séance publique"|"Commission"} "Séance publique" when `organe`
 *   contains that phrase (case-insensitive), otherwise "Commission".
 */
export function getAgendaType(agenda) {
  const organeLabel = agenda.organe || "";
  const isPublicSitting = /séance publique/i.test(organeLabel);
  return isPublicSitting ? "Séance publique" : "Commission";
}
/**
 * Fetch successive search result pages, one request at a time.
 *
 * Each request URL is `SENAT_VIDEOS_SEARCH_AJAX` plus a query string built
 * from `args` with `page` set to the current page number (as a string).
 * Paging stops when a fetch yields nothing (null or empty text), when a page
 * contains no video link, or once `maxPages` pages have been requested.
 * A page without video links is still included in the result; it only ends
 * the loop.
 *
 * @param {Object} args - Search parameters forwarded to `queryString`.
 * @param {number} [maxPages=3] - Upper bound on the number of pages requested.
 * @returns {Promise<string[]>} HTML of every page fetched, in page order.
 */
export async function fetchAllSearchPages(args, maxPages = 3) {
  // Presence of at least one video link signals that further pages may exist.
  const videoLinkPattern = /href="\/?video\.\d+_[a-z0-9]+\./i;
  const collected = [];
  let pageNumber = 1;
  while (pageNumber <= maxPages) {
    const html = await fetchText(
      `${SENAT_VIDEOS_SEARCH_AJAX}?${queryString({ ...args, page: String(pageNumber) })}`,
    );
    if (!html) {
      break;
    }
    collected.push(html);
    if (!videoLinkPattern.test(html)) {
      break;
    }
    pageNumber += 1;
  }
  return collected;
}
/**
 * Serialize an object's own enumerable entries as a URL query string.
 *
 * Keys and values are each passed through `encodeURIComponent` and joined as
 * `key=value` pairs separated by `&`. No leading `?` is added.
 *
 * @param {Object} obj - Parameters to serialize.
 * @returns {string} The encoded query string; empty when `obj` has no entries.
 */
export function queryString(obj) {
  const pairs = [];
  for (const [key, value] of Object.entries(obj)) {
    pairs.push(`${encodeURIComponent(key)}=${encodeURIComponent(value)}`);
  }
  return pairs.join("&");
}
/**
 * Look up candidate videos for an agenda entry.
 *
 * Builds search parameters from the agenda (`videotype` from
 * `getAgendaType`, a one-day custom period when `agenda.date` is set, and
 * `organe` when present), fetches the result pages, parses them and keeps at
 * most `MAX_CANDIDATES` entries.
 *
 * @param {Object} agenda - Agenda entry; `uid`, `date` and `organe` are read.
 * @param {{ silent?: boolean }} [options] - When `silent` is truthy, the
 *   "[miss]" log lines are suppressed. May be omitted or null.
 * @returns {Promise<Array|null>} The parsed candidates, or `null` when no
 *   page was fetched or parsing produced no candidate.
 */
export async function fetchCandidatesForAgenda(agenda, options) {
  const searchParams = {
    search: "true",
    videotype: getAgendaType(agenda),
  };
  if (agenda.date) {
    // Same day for both bounds: the search is restricted to the agenda date.
    const fr = toFRDate(agenda.date);
    searchParams.period = "custom";
    searchParams.begin = fr;
    searchParams.end = fr;
  }
  if (agenda.organe) {
    searchParams.organe = agenda.organe;
  }

  // Optional chaining keeps a missing/null `options` from throwing on the
  // miss paths; it is then treated as "not silent".
  const logMiss = (reason) => {
    if (options?.silent) {
      return;
    }
    console.log(`[miss] ${agenda.uid} ${reason} (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
  };

  const pages = await fetchAllSearchPages(searchParams);
  if (!pages.length) {
    logMiss("no candidates");
    return null;
  }

  const combinedHtml = pages.join("\n<!-- PAGE SPLIT -->\n");
  const candidates = extractCandidatesFromSearchHtml(combinedHtml).slice(0, MAX_CANDIDATES);
  if (!candidates.length) {
    logMiss("no candidates after parse");
    return null;
  }
  return candidates;
}
/**
 * Parse search result HTML and list the distinct videos it links to.
 *
 * Scans anchors matching `h3.card-title a.stretched-link[href*="video."]`,
 * pulls the numeric id and hash out of each href (`video.<id>_<hash>`), and
 * keeps the first occurrence of every id/hash pair in document order.
 *
 * @param {string} html - Markup handed to `cheerio.load`.
 * @returns {Array<{id: string, hash: string, pageUrl: string,
 *   title: (string|undefined), isSeancePublique: boolean}>} One entry per
 *   distinct video. `title` comes from the anchor's `title` attribute or its
 *   text, whitespace-collapsed, and is `undefined` when empty.
 */
export function extractCandidatesFromSearchHtml(html) {
  const $ = cheerio.load(html);
  const videoRefPattern = /video\.(\d+)_([a-z0-9]+)/i;
  const seenKeys = new Set();
  const candidates = [];

  $('h3.card-title a.stretched-link[href*="video."]').each((_, anchor) => {
    const link = $(anchor);
    const match = videoRefPattern.exec(link.attr("href") || "");
    if (match === null) {
      return;
    }
    const [, id, hash] = match;

    // Keep only the first anchor seen for a given id/hash pair.
    const key = `${id}_${hash}`;
    if (seenKeys.has(key)) {
      return;
    }
    seenKeys.add(key);

    const rawTitle = link.attr("title") || link.text() || "";
    const title = rawTitle.replace(/\s+/g, " ").trim() || undefined;
    const isSeancePublique =
      title === undefined ? false : title.toLowerCase().includes("séance publique");

    candidates.push({
      id,
      hash,
      pageUrl: `https://videos.senat.fr/video.${id}_${hash}.html`,
      title,
      isSeancePublique,
    });
  });

  return candidates;
}