UNPKG

@tricoteuses/senat

Version:

Handle French Sénat's open data

202 lines (201 loc) 8.13 kB
// scripts/retrieve_senat_videos_from_agendas.ts
import assert from "assert";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import fsp from "fs/promises";
import path from "path";
import * as git from "../git.js";
import { AGENDA_FOLDER, iterLoadSenatAgendas } from "../loaders";
import { getSessionsFromStart } from "../types/sessions";
import { commonOptions } from "./shared/cli_helpers";
import { getAgendaSegmentTimecodes, buildSenatVodMasterM3u8FromNvs } from "../utils/nvs-parsing";
import { epochToParisDateTime, isAmbiguousTimeOriginal, toTargetEpoch } from "../utils/date";
import { pathToFileURL } from "url";
import { fetchCandidatesForAgenda, fetchText } from "../videos/search";
import { matchAgendaToVideo } from "../videos/match";
import { SENAT_DATAS_ROOT, STATS, VIDEOS_ROOT_FOLDER, weights } from "../videos/config";
import { processBisIfNeeded, processOneReunionMatch, writeIfChanged } from "../videos";

// Command-line options are parsed once at module load and shared by every helper below.
const optionsDefinitions = [...commonOptions];
const options = commandLineArgs(optionsDefinitions);

/**
 * Tells whether an agenda must be ignored by the video matching pipeline.
 *
 * An agenda is skipped when it has no video capture flag, when its date or
 * start time is missing, when its uid ends with "Bis", or when its start lies
 * in the future.
 *
 * @param agenda Agenda (reunion) entry loaded from the dataset.
 * @returns `true` when the agenda must not be processed.
 */
function shouldSkipAgenda(agenda) {
  if (!agenda.captationVideo) return true;
  if (!agenda.date || !agenda.startTime) return true;
  // Don't reprocess bis reunions
  if (agenda.uid.endsWith("Bis")) return true;

  // NOTE(review): toTargetEpoch presumably yields epoch seconds (hence the * 1000) — confirm.
  const agendaTs = toTargetEpoch(agenda.startTime, agenda.date);
  const now = Date.now();
  if (agendaTs && agendaTs * 1000 > now) return true;

  return false;
}

/**
 * Builds the per-reunion processing context and makes sure its output
 * directory (`<dataDir>/<VIDEOS_ROOT_FOLDER>/<session>/<reunionUid>`) exists.
 *
 * @param agenda Agenda being processed.
 * @param session Session identifier the agenda belongs to.
 * @param dataDir Root data directory.
 * @returns The context `{ session, dataDir, baseDir, reunionUid, agendaTs }`;
 *   `agendaTs` is `null` when the agenda lacks a date or a start time.
 */
async function computeContext(agenda, session, dataDir) {
  const agendaTs = agenda.startTime && agenda.date ? toTargetEpoch(agenda.startTime, agenda.date) : null;
  const reunionUid = agenda.uid;
  const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
  await fs.ensureDir(baseDir);
  return { session, dataDir, baseDir, reunionUid, agendaTs };
}

/**
 * Decides whether matching and downloading can be skipped for an agenda.
 *
 * Skipping only happens when the "only-recent" option (a number of days) is
 * set, the reunion is not on or after the resulting cutoff, and both NVS
 * files are already present in `baseDir`.
 *
 * @param agenda Agenda being processed; `agenda.date` is parsed with `Date.parse`.
 * @param baseDir Directory holding the reunion's artifacts.
 * @returns `true` when the existing local NVS files should be reused as-is.
 */
function shouldSkipDownload(agenda, baseDir) {
  if (!options["only-recent"]) return false;

  const now = Date.now();
  const cutoff = now - options["only-recent"] * 24 * 3600 * 1000;
  const reunionTs = Date.parse(agenda.date);
  // Recent reunions are always re-fetched.
  if (reunionTs >= cutoff) return false;

  const dataNvsPath = path.join(baseDir, "data.nvs");
  const finalplayerNvsPath = path.join(baseDir, "finalplayer.nvs");
  return fs.existsSync(dataNvsPath) && fs.existsSync(finalplayerNvsPath);
}

/**
 * Builds the summary of a matched video candidate stored in metadata.json.
 * Key order is significant: it defines the layout of the serialized JSON.
 *
 * @param candidate Video candidate (`best` or `secondBest`) from the matcher.
 * @returns Plain object where missing epoch, date, time and title are `null`.
 */
function describeCandidate(candidate) {
  const dt = candidate.epoch ? epochToParisDateTime(candidate.epoch) : null;
  return {
    id: candidate.id,
    hash: candidate.hash,
    pageUrl: candidate.pageUrl,
    epoch: candidate.epoch ?? null,
    date: dt?.date ?? null,
    startTime: dt?.startTime ?? null,
    title: candidate.vtitle ?? null,
    score: candidate.score,
  };
}

/**
 * Writes the artifacts of an accepted match into the reunion directory:
 * metadata.json (through `writeIfChanged`), then data.nvs and finalplayer.nvs
 * fetched for the best candidate. An NVS file is only written when its fetch
 * returned non-empty text.
 *
 * @param args `{ agenda, ctx, best, secondBest }`; `secondBest` may be falsy.
 */
async function writeMatchArtifacts(args) {
  const { agenda, ctx, best, secondBest } = args;

  const metadata = {
    reunionUid: ctx.reunionUid,
    session: ctx.session,
    accepted: true,
    threshold: weights.minAccept,
    agenda: {
      date: agenda.date,
      startTime: agenda.startTime,
      titre: agenda.titre,
      organe: agenda.organe ?? undefined,
      uid: agenda.uid,
    },
    best: describeCandidate(best),
    secondBest: secondBest ? describeCandidate(secondBest) : null,
  };
  await writeIfChanged(path.join(ctx.baseDir, "metadata.json"), JSON.stringify(metadata, null, 2));

  const dataUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/data.nvs`;
  const finalUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/finalplayer.nvs`;
  const dataTxt = await fetchText(dataUrl);
  const finalTxt = await fetchText(finalUrl);
  if (dataTxt) await fsp.writeFile(path.join(ctx.baseDir, "data.nvs"), dataTxt, "utf-8");
  if (finalTxt) await fsp.writeFile(path.join(ctx.baseDir, "finalplayer.nvs"), finalTxt, "utf-8");
}

/**
 * Processes one agenda: optionally matches it to a video and downloads the
 * match artifacts, then always runs the local NVS post-processing steps.
 *
 * Returns early (without post-processing) when the agenda is skipped, when no
 * candidate is found, or when no candidate is accepted by the matcher.
 *
 * @param agenda Agenda being processed.
 * @param session Session identifier the agenda belongs to.
 * @param dataDir Root data directory.
 * @param lastByVideo Map shared across the agendas of one session and handed
 *   to the downstream processors.
 */
async function processGroupedReunion(agenda, session, dataDir, lastByVideo) {
  // 1) GuardRails
  if (shouldSkipAgenda(agenda)) return;

  const ctx = await computeContext(agenda, session, dataDir);
  const skipDownload = shouldSkipDownload(agenda, ctx.baseDir);
  let best = null;
  let secondBest = null;

  // 2) Match + download artifacts (only if not skipped)
  if (!skipDownload) {
    STATS.total++;
    const candidates = await fetchCandidatesForAgenda(agenda, options);
    if (!candidates) {
      console.log(`[warn] ${agenda.uid} No candidate found for this reunion. Probably VOD not published yet.`);
      return;
    }
    const match = await matchAgendaToVideo({ agenda, agendaTs: ctx.agendaTs, candidates, options });
    if (!match) {
      console.log(`[miss] ${agenda.uid} No match found for this reunion`);
      return;
    }
    best = match.best;
    secondBest = match.secondBest;
    STATS.accepted++;
    await writeMatchArtifacts({ agenda, ctx, best, secondBest });
  }

  // Guard the first event: an agenda without events must not abort the
  // NVS post-processing below with a TypeError.
  const firstEvent = agenda.events?.[0];
  if (best && firstEvent && isAmbiguousTimeOriginal(firstEvent.timeOriginal)) {
    if (!options["silent"]) console.log("If the time is ambiguous, update agenda startTime from matched video");
    // Work on a copy so the caller's agenda object is left untouched.
    agenda = { ...agenda, startTime: epochToParisDateTime(best.epoch)?.startTime ?? agenda.startTime };
  }

  // 3) Always update BEST agenda JSON from local NVS
  await processOneReunionMatch({
    agenda,
    best,
    baseDir: ctx.baseDir,
    dataDir: ctx.dataDir,
    session: ctx.session,
    options,
    writeIfChanged,
    lastByVideo, // NEW
    getAgendaSegmentTimecodes,
    buildSenatVodMasterM3u8FromNvs,
  });

  await processBisIfNeeded({
    agenda,
    secondBest,
    ctx,
    skipDownload,
    options,
    lastByVideo,
    writeIfChanged,
    processOneReunionMatch,
    getAgendaSegmentTimecodes,
    buildSenatVodMasterM3u8FromNvs,
  });
}

/**
 * Processes every agenda of every requested session, then commits and pushes
 * the agenda folder when requested by the options.
 *
 * A failure on one agenda is logged and does not stop the loop.
 *
 * @param dataDir Root data directory.
 * @param sessions Iterable of session identifiers to process.
 */
async function processAll(dataDir, sessions) {
  console.log("Process all Agendas and fetch video's url");
  for (const session of sessions) {
    // One map per session, shared by all agendas of that session.
    const lastByVideo = new Map();
    for (const { item: agenda } of iterLoadSenatAgendas(dataDir, session)) {
      try {
        await processGroupedReunion(agenda, session, dataDir, lastByVideo);
      } catch (e) {
        console.error(`[error] ${agenda?.uid ?? "unknown-uid"}:`, e?.message || e);
      }
    }
  }
  const debatsDir = path.join(dataDir, AGENDA_FOLDER);
  // The returned code is intentionally not turned into a process exit status here.
  commitAndPushGit(debatsDir, options);
}

/**
 * Commits and pushes the dataset directory when `options.commit` is set.
 *
 * @param datasetDir Directory of the git repository to commit.
 * @param options Parsed options; reads `commit` and `remote`.
 * @returns Exit code: 10 when nothing was committed (no modification or commit
 *   not requested), otherwise the code reported by `git.commitAndPush`
 *   (0: some data changed).
 */
function commitAndPushGit(datasetDir, options) {
  let exitCode = 10; // 0: some data changed, 10: no modification
  if (options.commit) {
    const errorCode = git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
    if (errorCode !== 10) {
      exitCode = errorCode;
    }
  }
  return exitCode;
}

/**
 * Script entry point: validates the data directory option, processes all
 * sessions starting at the "fromSession" option, and prints timing plus the
 * match acceptance summary.
 */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  const sessions = getSessionsFromStart(options["fromSession"]);

  const TIMER = "senat-agendas→videos processing time";
  console.time(TIMER);
  await processAll(dataDir, sessions);
  console.timeEnd(TIMER);

  const { total, accepted } = STATS;
  const ratio = total ? ((accepted / total) * 100).toFixed(1) : "0.0";
  console.log(`[summary] accepted=${accepted} / total=${total} (${ratio}%)`);
}

// Run only when this module is executed directly, not when it is imported.
if (import.meta.url === pathToFileURL(process.argv[1]).href) {
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });
}