UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

98 lines (97 loc) 3.42 kB
import { toFRDate } from "../utils/date";
import { MAX_CANDIDATES, SENAT_VIDEOS_SEARCH_AJAX } from "./config";
import * as cheerio from "cheerio";

/**
 * Fetch a URL and return the response body as text.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string|null>} The body text, or `null` when the response
 *   status is not OK. Rejections raised by `fetch` itself are not caught here.
 */
export async function fetchText(url) {
  const res = await fetch(url);
  if (!res.ok) return null;
  return await res.text();
}

/**
 * Fetch a URL and return the response body as a Node.js `Buffer`.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<Buffer|null>} The body bytes, or `null` when the response
 *   status is not OK. Rejections raised by `fetch` itself are not caught here.
 */
export async function fetchBuffer(url) {
  const res = await fetch(url);
  if (!res.ok) return null;
  const bytes = await res.arrayBuffer();
  return Buffer.from(bytes);
}

/**
 * Derive the `videotype` search value from an agenda entry.
 *
 * @param {{ organe?: string }} agenda - Agenda entry; only `organe` is read.
 * @returns {"Séance publique"|"Commission"} "Séance publique" when `organe`
 *   contains that phrase (case-insensitive), otherwise "Commission".
 */
export function getAgendaType(agenda) {
  const organe = agenda.organe || "";
  return /séance publique/i.test(organe) ? "Séance publique" : "Commission";
}

/**
 * Fetch search result pages one after another, starting at page 1.
 *
 * Stops early when a page cannot be fetched (or is empty), or right after
 * storing a page that contains no video link.
 *
 * @param {Object} args - Search parameters; a `page` entry is added per request.
 * @param {number} [maxPages=3] - Upper bound on the number of pages requested.
 * @returns {Promise<string[]>} HTML of each fetched page, in page order.
 */
export async function fetchAllSearchPages(args, maxPages = 3) {
  const pages = [];
  for (let page = 1; page <= maxPages; page++) {
    const query = queryString({ ...args, page: String(page) });
    const html = await fetchText(`${SENAT_VIDEOS_SEARCH_AJAX}?${query}`);
    if (!html) break;
    pages.push(html);
    // A page without any video link is taken as the end of the results.
    if (!/href="\/?video\.\d+_[a-z0-9]+\./i.test(html)) break;
  }
  return pages;
}

/**
 * Serialise a flat object into a URL query string (without the leading "?").
 *
 * Keys and values are percent-encoded with `encodeURIComponent`. Entries whose
 * value is `null` or `undefined` are skipped so that the literal strings
 * "null" / "undefined" are never sent as parameter values.
 *
 * @param {Object} obj - Parameters to serialise.
 * @returns {string} `key=value` pairs joined with "&".
 */
export function queryString(obj) {
  return Object.entries(obj)
    .filter(([, value]) => value != null)
    .map(([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`)
    .join("&");
}

/**
 * Format the search parameters shown at the end of "[miss]" log lines.
 *
 * @param {Object} searchParams - Parameters used for the search.
 * @returns {string} Text of the form `(videotype=…, organe=…, date=…)`.
 */
function describeSearch(searchParams) {
  return `(videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`;
}

/**
 * Search the video site for candidates matching an agenda entry.
 *
 * @param {{ uid?: string, date?: string, organe?: string }} agenda - Agenda
 *   entry; `date` (when set) restricts the search to that single day and
 *   `organe` (when set) is passed through as a search parameter.
 * @param {{ silent?: boolean }} [options={}] - When `silent` is truthy, the
 *   "[miss]" log lines are suppressed.
 * @returns {Promise<Array<Object>|null>} At most `MAX_CANDIDATES` candidates,
 *   or `null` when no page was fetched or no candidate was parsed.
 */
export async function fetchCandidatesForAgenda(agenda, options = {}) {
  // `options` may be omitted (or null) by callers; treat that as "not silent".
  const silent = Boolean(options?.silent);

  const searchParams = {
    search: "true",
    videotype: getAgendaType(agenda),
  };
  if (agenda.date) {
    // presumably toFRDate yields the date format the search form expects — confirm
    const fr = toFRDate(agenda.date);
    searchParams.period = "custom";
    searchParams.begin = fr;
    searchParams.end = fr;
  }
  if (agenda.organe) {
    searchParams.organe = agenda.organe;
  }

  const pages = await fetchAllSearchPages(searchParams);
  if (!pages.length) {
    if (!silent) {
      console.log(`[miss] ${agenda.uid} no candidates ${describeSearch(searchParams)}`);
    }
    return null;
  }

  const combinedHtml = pages.join("\n<!-- PAGE SPLIT -->\n");
  const candidates = extractCandidatesFromSearchHtml(combinedHtml).slice(0, MAX_CANDIDATES);
  if (!candidates.length) {
    if (!silent) {
      // NOTE(review): this message has a line break before the search context,
      // unlike the message above — confirm it is intentional.
      console.log(`[miss] ${agenda.uid} no candidates after parse \n${describeSearch(searchParams)}`);
    }
    return null;
  }
  return candidates;
}

/**
 * Extract unique video candidates from search result HTML.
 *
 * Only links matching `h3.card-title a.stretched-link` whose `href` contains
 * `video.<digits>_<hash>` are considered. The first occurrence of each
 * `<id>_<hash>` pair is kept; later duplicates are dropped.
 *
 * @param {string} html - Search result markup (one or several pages joined).
 * @returns {Array<{ id: string, hash: string, pageUrl: string, title: (string|undefined), isSeancePublique: boolean }>}
 *   Candidates in document order.
 */
export function extractCandidatesFromSearchHtml(html) {
  const $ = cheerio.load(html);
  const videoRef = /video\.(\d+)_([a-z0-9]+)/i;
  const seen = new Set();
  const candidates = [];

  $('h3.card-title a.stretched-link[href*="video."]').each((_, a) => {
    const link = $(a);
    const match = (link.attr("href") || "").match(videoRef);
    if (!match) return;
    const [, id, hash] = match;

    // Keep only the first link seen for a given video.
    const key = `${id}_${hash}`;
    if (seen.has(key)) return;
    seen.add(key);

    const pageUrl = `https://videos.senat.fr/video.${id}_${hash}.html`;
    // Prefer the title attribute, fall back to the link text; collapse whitespace.
    const title = (link.attr("title") || link.text() || "").replace(/\s+/g, " ").trim() || undefined;
    const isSeancePublique = title?.toLowerCase().includes("séance publique") ?? false;
    candidates.push({ id, hash, pageUrl, title, isSeancePublique });
  });

  return candidates;
}