/*
 * @tricoteuses/senat
 * Handle French Sénat's open data
 */
import fsex from "fs-extra";
import fs from "fs";
import path from "path";
import * as git from "./git";
import { datasets } from "./datasets";
import { UNDEFINED_SESSION } from "./types/sessions";
export { EnabledDatasets } from "./datasets";
// Folder and file names used by the loaders in this file to build paths below
// the data directory.
// NOTE(review): the layout descriptions below are derived from how this file
// joins the names together — confirm against the code that writes the data.

// Top-level folder read by iterLoadSenatAgendas.
export const AGENDA_FOLDER = "agenda";
// Top-level folder read by iterLoadSenatComptesRendusSeances and loadSenatCompteRenduContent.
export const COMPTES_RENDUS_FOLDER = "seances";
// Top-level folder read by iterLoadSenatComptesRendusCommissions.
export const COMMISSION_FOLDER = "commissions";
// Sub-folder of the `datasets.dosleg.database` folder read by iterLoadSenatDossiersLegislatifs.
export const DOSLEG_DOSSIERS_FOLDER = "dossiers";
// Not referenced by any loader in this file.
export const ENRICHED_TEXTE_FOLDER = "leg_enrichi";
// Not referenced by any loader in this file (iterLoadSenatScrutins spells the same name as a literal).
export const SCRUTINS_FOLDER = "scrutins";
// Sub-folders of the `datasets.sens.database` folder.
export const SENS_CIRCONSCRIPTIONS_FOLDER = "circonscriptions";
export const SENS_ORGANISMES_FOLDER = "organismes";
export const SENS_SENATEURS_FOLDER = "senateurs";
// Top-level folder for textes (iterLoadSenatTexteUrls, iterLoadSenatTextes, loadSenatTexteContent).
export const TEXTE_FOLDER = "leg";
// Top-level folder for rapports (iterLoadSenatRapportUrls, iterLoadSenatRapports).
export const RAPPORT_FOLDER = "rap";
// Sub-folder names placed directly below a top-level folder by the loaders.
export const DATA_ORIGINAL_FOLDER = "original";
export const DATA_TRANSFORMED_FOLDER = "transformed";
// File name matched by iterLoadSenatRapportUrls and iterLoadSenatTexteUrls.
export const DOCUMENT_METADATA_FILE = "metadata.json";
/**
 * Recursively yields the path of every regular file found under `dirPath`.
 *
 * Yields nothing when `dirPath` is falsy or does not exist.
 *
 * @param {string} [dirPath] - Directory to walk.
 * @yields {string} The entry's parent directory joined with its name.
 */
export function* iterFilePaths(dirPath) {
  if (!dirPath || !fs.existsSync(dirPath)) {
    return;
  }
  const entries = fs.readdirSync(dirPath, {
    withFileTypes: true,
    recursive: true,
  });
  for (const entry of entries) {
    if (!entry.isFile()) {
      continue;
    }
    // `Dirent.parentPath` is only available on recent Node.js releases; older
    // releases that already support `recursive` expose the same directory as
    // `Dirent.path`. Without the fallback, `path.join` throws on `undefined`.
    yield path.join(entry.parentPath ?? entry.path, entry.name);
  }
}
/**
 * Loads the `.json` files stored under a dataset folder and yields them one by one.
 *
 * The folder that is walked is `dataDir/dataName[/subDir][/legislatureOrSession]`.
 * When `sinceCommit` is given, only files reported by
 * `git.getChangedFilesSinceCommit` with filter "AMR" are loaded, and files
 * reported with filter "D" are yielded afterwards as placeholder items.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} dataName - Dataset folder name below `dataDir`.
 * @param {string|number} [legislatureOrSession] - Optional last path segment; ignored when falsy.
 * @param {string} [subDir] - Optional folder between the dataset folder and the session folder.
 * @param {object} [options]
 * @param {boolean} [options.log=false] - Log progress with `console.log`.
 * @param {string} [options.sinceCommit] - Restrict the iteration to files changed since this commit.
 * @yields {{item: object, filePathFromDataset: string, legislature: *, gitStatus?: string}}
 *   `filePathFromDataset` is the file path relative to the dataset folder, with a
 *   leading path separator. Files that cannot be read or parsed are skipped with a warning.
 */
function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, { log = false, sinceCommit } = {}) {
  const datasetDir = path.join(dataDir, dataName);
  let itemsDir = datasetDir;
  if (subDir) {
    itemsDir = path.join(itemsDir, subDir);
  }
  if (legislatureOrSession) {
    itemsDir = path.join(itemsDir, String(legislatureOrSession));
  }

  // Derive the dataset-relative path from the dataset directory itself rather
  // than from the first occurrence of `dataName` in the string: `dataDir` may
  // contain the dataset name too (e.g. "/srv/leg-data/leg/...").
  const toPathFromDataset = (filePath) => `${path.sep}${path.relative(datasetDir, filePath)}`;

  // Get changed files if sinceCommit is specified (excluding deleted files)
  const changedFiles = sinceCommit
    ? git.getChangedFilesSinceCommit(itemsDir, sinceCommit, {
        diffFilter: "AMR", // Added, Modified, Renamed
      })
    : null;
  if (log && sinceCommit) {
    console.log(`Filtering files changed since commit ${sinceCommit} in ${itemsDir}`);
    console.log(`Found ${changedFiles?.size || 0} changed files (AMR)`);
  }

  for (const filePath of iterFilePaths(itemsDir)) {
    if (!filePath.endsWith(".json")) {
      continue;
    }
    // NOTE(review): changed files are looked up by their path relative to the
    // dataset folder, whereas deleted files below are joined onto `itemsDir`.
    // Confirm which base `git.getChangedFilesSinceCommit` uses for its keys.
    const relativePath = path.relative(datasetDir, filePath);
    const gitStatus = changedFiles?.get(relativePath);
    // Filter by changed files if sinceCommit is specified
    if (changedFiles && !gitStatus) {
      // Skip files not in the change set
      continue;
    }
    if (log) {
      console.log(`Loading file: ${filePath}…${gitStatus ? ` (${gitStatus})` : ""}`);
    }
    let item;
    try {
      const itemJson = fs.readFileSync(filePath, { encoding: "utf8" });
      item = JSON.parse(itemJson);
    } catch (err) {
      console.warn(`[iterLoadSenatItems] skipped invalid JSON: ${filePath} (${err.message})`);
      continue;
    }
    yield {
      item,
      filePathFromDataset: toPathFromDataset(filePath),
      legislature: legislatureOrSession,
      ...(gitStatus && { gitStatus }), // Include gitStatus only when known
    };
  }

  // Yield deleted files at the end if sinceCommit is specified
  if (sinceCommit) {
    const deletedFiles = git.getChangedFilesSinceCommit(itemsDir, sinceCommit, {
      diffFilter: "D", // Deleted
    });
    if (log) {
      console.log(`Found ${deletedFiles.size || 0} deleted files (D)`);
    }
    for (const [relativePath, status] of deletedFiles.entries()) {
      const deletedFilePath = path.join(itemsDir, relativePath);
      if (log) {
        console.log(`Deleted file: ${deletedFilePath}`);
      }
      // The file no longer exists, so build a placeholder item whose `uid` is
      // the file name without its extension (".json" assumed when there is none).
      const fileExtension = path.extname(relativePath) || ".json";
      const filename = path.basename(relativePath, fileExtension);
      yield {
        item: { uid: filename },
        filePathFromDataset: toPathFromDataset(deletedFilePath),
        legislature: legislatureOrSession,
        gitStatus: status,
      };
    }
  }
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `datasets.ameli.database` folder, narrowed to `session` when one is given.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatAmendements(dataDir, session, options = {}) {
  const datasetFolder = datasets.ameli.database;
  yield* iterLoadSenatItems(dataDir, datasetFolder, session, undefined, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `datasets.debats.database` folder, narrowed to `session` when one is given.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatDebats(dataDir, session, options = {}) {
  const datasetFolder = datasets.debats.database;
  yield* iterLoadSenatItems(dataDir, datasetFolder, session, undefined, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `COMPTES_RENDUS_FOLDER/DATA_TRANSFORMED_FOLDER` folder, narrowed to `session`
 * when one is given.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatComptesRendusSeances(dataDir, session, options = {}) {
  yield* iterLoadSenatItems(dataDir, COMPTES_RENDUS_FOLDER, session, DATA_TRANSFORMED_FOLDER, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `COMMISSION_FOLDER/DATA_TRANSFORMED_FOLDER` folder, narrowed to `session`
 * when one is given.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatComptesRendusCommissions(dataDir, session, options = {}) {
  yield* iterLoadSenatItems(dataDir, COMMISSION_FOLDER, session, DATA_TRANSFORMED_FOLDER, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `DOSLEG_DOSSIERS_FOLDER` sub-folder of the `datasets.dosleg.database` folder,
 * narrowed to `session` when one is given.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatDossiersLegislatifs(dataDir, session, options = {}) {
  const datasetFolder = datasets.dosleg.database;
  yield* iterLoadSenatItems(dataDir, datasetFolder, session, DOSLEG_DOSSIERS_FOLDER, options);
}
/**
 * Walks `dataDir/RAPPORT_FOLDER/DATA_ORIGINAL_FOLDER[/session]` and yields the
 * parsed content of every file named `DOCUMENT_METADATA_FILE`.
 *
 * A metadata file that cannot be parsed makes the iteration throw.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read; all sessions when falsy.
 * @yields {{item: object}} Parsed metadata document.
 */
export function* iterLoadSenatRapportUrls(dataDir, session) {
  const segments = [dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER];
  if (session) {
    segments.push(session.toString());
  }
  const rootDir = path.join(...segments);
  for (const candidate of iterFilePaths(rootDir)) {
    // Only the metadata document of each rapport is of interest here.
    if (path.basename(candidate) !== DOCUMENT_METADATA_FILE) {
      continue;
    }
    const item = JSON.parse(fs.readFileSync(candidate, { encoding: "utf8" }));
    yield { item };
  }
}
/**
 * Walks `dataDir/TEXTE_FOLDER/DATA_ORIGINAL_FOLDER[/session]` and yields the
 * parsed content of every file named `DOCUMENT_METADATA_FILE`.
 *
 * A metadata file that cannot be parsed makes the iteration throw.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read; all sessions when falsy.
 * @yields {{item: object}} Parsed metadata document.
 */
export function* iterLoadSenatTexteUrls(dataDir, session) {
  const segments = [dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER];
  if (session) {
    segments.push(session.toString());
  }
  const rootDir = path.join(...segments);
  for (const candidate of iterFilePaths(rootDir)) {
    // Only the metadata document of each texte is of interest here.
    if (path.basename(candidate) !== DOCUMENT_METADATA_FILE) {
      continue;
    }
    const item = JSON.parse(fs.readFileSync(candidate, { encoding: "utf8" }));
    yield { item };
  }
}
/**
 * Yields the items that iterLoadSenatItems produces for the
 * `RAPPORT_FOLDER/DATA_ORIGINAL_FOLDER` folder, keeping only those whose
 * parsed content has a truthy `id`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatRapports(dataDir, session, options = {}) {
  // Use the shared constant instead of a duplicated "original" literal so this
  // loader stays aligned with the other loaders that read the same folder.
  for (const iterItem of iterLoadSenatItems(dataDir, RAPPORT_FOLDER, session, DATA_ORIGINAL_FOLDER, options)) {
    if (iterItem.item?.["id"]) {
      yield iterItem;
    }
  }
}
/**
 * Yields the texte items found under `TEXTE_FOLDER/DATA_ORIGINAL_FOLDER`,
 * each merged with its content from loadSenatTexteContent when that exists.
 *
 * Items whose parsed content has no truthy `id` are skipped. When content is
 * found, the yielded entry's `item` is replaced by the content overlaid with
 * the original fields, so the original fields win on key conflicts.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatTextes(dataDir, session, options = {}) {
  const entries = iterLoadSenatItems(dataDir, TEXTE_FOLDER, session, DATA_ORIGINAL_FOLDER, options);
  for (const entry of entries) {
    const metadata = entry.item;
    const texteId = metadata?.["id"];
    if (!texteId) {
      continue;
    }
    // The content is looked up with the session recorded in the item itself,
    // not with the `session` argument.
    const content = loadSenatTexteContent(dataDir, metadata["session"], texteId).item;
    if (content) {
      entry.item = { ...content, ...metadata };
    }
    yield entry;
  }
}
/**
 * Reads the content of one texte from
 * `dataDir/TEXTE_FOLDER/DATA_TRANSFORMED_FOLDER/<session>/<texteId>/<texteId>.json`.
 *
 * `UNDEFINED_SESSION` is used as the session folder when `session` is null or undefined.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number|null|undefined} session - Session of the texte.
 * @param {string} texteId - Identifier of the texte.
 * @returns {{item: object|null}} Parsed file content, or `null` when the file does not exist.
 */
export function loadSenatTexteContent(dataDir, session, texteId) {
  const sessionFolder = String(session ?? UNDEFINED_SESSION);
  const texteFile = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER, sessionFolder, texteId, `${texteId}.json`);
  const item = fs.existsSync(texteFile)
    ? JSON.parse(fs.readFileSync(texteFile, { encoding: "utf8" }))
    : null;
  return { item };
}
/**
 * Reads the content of one compte rendu from
 * `dataDir/COMPTES_RENDUS_FOLDER/DATA_TRANSFORMED_FOLDER/<session>/<debatId>.json`.
 *
 * `session` is stringified as-is, so an undefined session maps to a folder
 * literally named "undefined".
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} session - Session of the debate.
 * @param {string} debatId - Identifier of the debate.
 * @returns {{item: object|null}} Parsed file content, or `null` when the file does not exist.
 */
export function loadSenatCompteRenduContent(dataDir, session, debatId) {
  const sessionFolder = String(session);
  const compteRenduFile = path.join(dataDir, COMPTES_RENDUS_FOLDER, DATA_TRANSFORMED_FOLDER, sessionFolder, `${debatId}.json`);
  const item = fs.existsSync(compteRenduFile)
    ? JSON.parse(fs.readFileSync(compteRenduFile, { encoding: "utf8" }))
    : null;
  return { item };
}
/**
 * Yields the agenda documents stored directly in
 * `dataDir/AGENDA_FOLDER/DATA_TRANSFORMED_FOLDER[/session]`.
 *
 * Only files whose name starts with "RUSN" and ends with ".json" are read, in
 * sorted file-name order. Files that cannot be read or parsed are skipped with
 * a warning. Documents that are not objects or lack `uid`, `date` or `titre`
 * are skipped. A missing or non-array `events` is replaced by an empty array.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder; the base folder itself when null or undefined.
 * @yields {{item: object}} Parsed agenda document.
 */
export function* iterLoadSenatAgendas(dataDir, session) {
  const baseDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session ?? ""));
  if (!fs.existsSync(baseDir)) {
    return;
  }
  const fileNames = fs
    .readdirSync(baseDir)
    .filter((name) => name.startsWith("RUSN") && name.endsWith(".json"))
    .sort();
  for (const fileName of fileNames) {
    const filePath = path.join(baseDir, fileName);
    let agenda;
    try {
      agenda = fsex.readJSONSync(filePath);
    } catch (err) {
      // Report the skipped file instead of dropping it silently, as
      // iterLoadSenatItems does for its own unreadable files.
      console.warn(`[iterLoadSenatAgendas] skipped unreadable or invalid JSON: ${filePath} (${err.message})`);
      continue;
    }
    if (!agenda || typeof agenda !== "object") {
      continue;
    }
    if (!agenda.uid || !agenda.date || !agenda.titre) {
      continue;
    }
    if (!Array.isArray(agenda.events)) {
      agenda.events = [];
    }
    yield { item: agenda };
  }
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `SENS_CIRCONSCRIPTIONS_FOLDER` sub-folder of the `datasets.sens.database`
 * folder. No session or legislature is applied.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatCirconscriptions(dataDir, options = {}) {
  const datasetFolder = datasets.sens.database;
  yield* iterLoadSenatItems(dataDir, datasetFolder, undefined, SENS_CIRCONSCRIPTIONS_FOLDER, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `SENS_ORGANISMES_FOLDER` sub-folder of the `datasets.sens.database` folder.
 * No session or legislature is applied.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatOrganismes(dataDir, options = {}) {
  const datasetFolder = datasets.sens.database;
  yield* iterLoadSenatItems(dataDir, datasetFolder, undefined, SENS_ORGANISMES_FOLDER, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `SENS_SENATEURS_FOLDER` sub-folder of the `datasets.sens.database` folder.
 * No session or legislature is applied.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatSenateurs(dataDir, options = {}) {
  const datasetFolder = datasets.sens.database;
  yield* iterLoadSenatItems(dataDir, datasetFolder, undefined, SENS_SENATEURS_FOLDER, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the
 * `datasets.questions.database` folder, narrowed to `legislature` when one is given.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [legislature] - Legislature folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatQuestions(dataDir, legislature, options = {}) {
  const datasetFolder = datasets.questions.database;
  yield* iterLoadSenatItems(dataDir, datasetFolder, legislature, undefined, options);
}
/**
 * Yields every item that iterLoadSenatItems produces for the `SCRUTINS_FOLDER`
 * folder, narrowed to `session` when one is given.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string|number} [session] - Session folder to read.
 * @param {object} [options] - Forwarded unchanged to iterLoadSenatItems.
 */
export function* iterLoadSenatScrutins(dataDir, session, options = {}) {
  // Use the exported constant rather than a duplicated "scrutins" literal so
  // the folder name is defined in a single place.
  for (const scrutinItem of iterLoadSenatItems(dataDir, SCRUTINS_FOLDER, session, undefined, options)) {
    yield scrutinItem;
  }
}