@tricoteuses/assemblee
Version:
Retrieve, clean up & handle French Assemblée nationale's open data
134 lines (130 loc) • 18.4 kB
JavaScript
import fs from "fs-extra";
import path from "path";
import stream from "stream";
import StreamZip from "node-stream-zip";
import util from "util";
import { walkDir } from "../file_systems.mjs";
import { XMLParser } from "fast-xml-parser";
const pipeline = util.promisify(stream.pipeline);
/**
 * Downloads a ZIP archive to disk, retrying when an attempt fails.
 *
 * An attempt counts as failed when the server answers with a non-2xx status,
 * when `fetch` itself throws (network error), or when streaming the body to
 * disk fails. Each new attempt re-creates the write stream, so a partial file
 * left by a previous attempt is overwritten.
 *
 * @param {string} url - Address of the ZIP archive.
 * @param {string} zipFilePath - Destination path of the downloaded archive.
 * @param {number} [retries=3] - Maximum number of attempts; values below 1
 *   are treated as 1 so that the download is always tried at least once.
 * @returns {Promise<void>} Resolves once the archive is fully written.
 * @throws {Error} `Fetch failed: <url>` after the last failed attempt; the
 *   underlying failure is available as `error.cause`.
 */
async function fetchZipFile(url, zipFilePath, retries = 3) {
  const maxAttempts = Math.max(1, retries);
  let reason;
  let cause;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const response = await fetch(url);
      if (response.ok) {
        await pipeline(response.body, fs.createWriteStream(zipFilePath));
        return;
      }
      reason = response.statusText;
      cause = new Error(`Unexpected HTTP response: ${response.statusText}`);
    } catch (error) {
      // Network-level and streaming failures are retried like HTTP errors.
      reason = error instanceof Error ? error.message : String(error);
      cause = error;
    }
    if (attempt < maxAttempts) {
      console.warn(`Retrying fetch for ${url}: ${reason}`);
    }
  }
  console.error(`Failed to fetch ${url} after ${retries} attempts`);
  throw new Error(`Fetch failed: ${url}`, { cause });
}
/**
 * Extracts every entry of a ZIP archive into a directory.
 *
 * @param {string} zipFilePath - Path of the archive to read.
 * @param {string} outputDir - Directory receiving the extracted entries.
 * @returns {Promise<void>} Resolves when extraction is finished; rejects when
 *   the archive cannot be opened/read or when extraction reports an error.
 */
async function extractZip(zipFilePath, outputDir) {
  return new Promise((resolve, reject) => {
    const zip = new StreamZip({
      file: zipFilePath,
      storeEntries: true,
    });
    // Without this listener a missing or corrupt archive emits an unhandled
    // "error" event and the promise is never settled.
    zip.on("error", (err) => {
      zip.close();
      reject(err);
    });
    zip.on("ready", () => {
      // A null entry asks for the whole archive to be extracted.
      zip.extract(null, outputDir, (err) => {
        zip.close();
        if (err) {
          reject(err);
        } else {
          resolve();
        }
      });
    });
  });
}
/**
 * Rewrites JSON files in place with a 2-space indentation.
 *
 * When given a directory, every path yielded by `walkDir` that ends with
 * ".json" is rewritten; any other path is left untouched. When given anything
 * that is not a directory, that path itself is parsed as JSON and rewritten,
 * whatever its extension.
 *
 * @param {string} dataDirOrFilePath - Directory to scan or single file to rewrite.
 * @returns {void}
 */
function reindentJsonFiles(dataDirOrFilePath) {
  // Parse then re-serialize one file, overwriting it.
  const rewriteIndented = (jsonFilePath) => {
    const text = fs.readFileSync(jsonFilePath, { encoding: "utf-8" });
    const parsed = JSON.parse(text);
    fs.writeFileSync(jsonFilePath, JSON.stringify(parsed, null, 2));
  };

  const targetIsDirectory = fs.statSync(dataDirOrFilePath).isDirectory();
  if (!targetIsDirectory) {
    rewriteIndented(dataDirOrFilePath);
    return;
  }

  for (const pathSegments of walkDir(dataDirOrFilePath)) {
    const candidatePath = path.join(dataDirOrFilePath, ...pathSegments);
    if (!candidatePath.endsWith(".json")) {
      continue;
    }
    rewriteIndented(candidatePath);
  }
}
// Shared fast-xml-parser instance used to turn the dataset's XML files into
// plain objects before they are serialized as JSON.
const xmlParser = new XMLParser({
  // Parts of the document that are skipped.
  ignoreDeclaration: true,
  ignorePiTags: true,

  // Attributes are kept, under their bare name (empty prefix).
  ignoreAttributes: false,
  attributeNamePrefix: "",

  // Text content is stored under the "_" key.
  textNodeName: "_",
  trimValues: true,

  // Tag and attribute values are not converted to other types, and entities
  // are not processed.
  parseTagValue: false,
  parseAttributeValue: false,
  processEntities: false,

  // Content of these tags is not parsed further.
  stopNodes: ["*.texte", "*.intitule"],

  // Only the "Para" tag is renamed (to "para"); every other name is unchanged.
  transformTagName(tagName) {
    if (tagName === "Para") {
      return "para";
    }
    return tagName;
  },
});
/**
 * Processes a dataset: optionally downloads its ZIP archive, extracts it,
 * runs the dataset's optional repair hook, converts XML files to JSON when the
 * dataset is an XML one, then re-indents the resulting JSON files.
 *
 * Any previous extraction at `<dataDir>/<dataset.filename>` is removed first.
 * The returned promise settles only after every conversion has been written,
 * so the JSON files exist when the final re-indentation pass runs.
 *
 * @param {object} args - Processing arguments.
 * @param {any} args.dataset - Dataset description; this function reads its
 *   `filename`, `title`, `url` and optional `repairZip(dataset, dataDir)` hook.
 * @param {string} args.dataDir - Directory holding the ZIP archive and
 *   receiving the extracted files.
 * @param {any} args.options - Behavior switches: `fetch` (download the archive
 *   before extracting) and `silent` (suppress progress logs).
 * @returns {Promise<void>} A promise that resolves when the dataset processing
 *   is complete.
 */
export async function processDataset(args) {
  const { dataset, dataDir, options } = args;
  const zipFilePath = path.join(dataDir, `${dataset.filename}.zip`);
  const dataDirOrFilePath = path.join(dataDir, dataset.filename);
  fs.removeSync(dataDirOrFilePath);

  if (options.fetch) {
    // Fetch & save ZIP file.
    if (!options.silent) {
      console.log(`Loading ${dataset.title}: ${dataset.filename}.zip`);
    }
    await fetchZipFile(dataset.url, zipFilePath);
  }

  if (!options.silent) {
    console.log(`Unzipping ${dataset.title}: ${dataset.filename}.zip`);
  }
  await extractZip(zipFilePath, dataDir);

  if (dataset.repairZip !== undefined) {
    if (!options.silent) {
      console.log(`Repairing ${dataset.title}: ${dataset.filename}`);
    }
    // Awaited so that an asynchronous hook finishes before conversion starts;
    // harmless when the hook is synchronous.
    await dataset.repairZip(dataset, dataDir);
  }

  // Convert xml to JSON.
  if (dataset.filename.endsWith(".xml")) {
    // List the XML files before writing anything, so the walk never observes
    // the JSON files created below.
    const xmlFilePaths = [];
    for (const dataFileSplitPath of walkDir(dataDirOrFilePath)) {
      const dataFilePath = path.join(dataDirOrFilePath, ...dataFileSplitPath);
      if (dataFilePath.endsWith(".xml")) {
        xmlFilePaths.push(dataFilePath);
      }
    }

    for (const xmlFilePath of xmlFilePaths) {
      try {
        const xml = await fs.readFile(xmlFilePath);
        const result = xmlParser.parse(xml);
        const jsonFilePath = xmlFilePath.replace(/\.[^.]+$/, ".json");
        await fs.writeFile(jsonFilePath, JSON.stringify(result, null, 2));
      } catch (err) {
        // Best effort: a file that cannot be converted is reported and
        // skipped, the remaining files are still processed.
        console.error(err);
      }
    }
  }

  // Reindent JSON file.
  if (!options.silent) {
    console.log(`Reindenting ${dataset.title}: ${dataset.filename}`);
  }
  reindentJsonFiles(dataDirOrFilePath);
}
//# sourceMappingURL=data:application/json;charset=utf-8;base64,{"version":3,"names":["fs","path","stream","StreamZip","util","walkDir","XMLParser","pipeline","promisify","fetchZipFile","url","zipFilePath","retries","attempt","response","fetch","ok","body","createWriteStream","console","warn","statusText","error","Error","extractZip","outputDir","Promise","resolve","reject","zip","file","storeEntries","on","extract","err","close","reindentJsonFiles","dataDirOrFilePath","statSync","isDirectory","dataFileSplitPath","dataFilePath","join","endsWith","data","JSON","parse","readFileSync","encoding","writeFileSync","stringify","xmlParser","ignoreDeclaration","ignoreAttributes","ignorePiTags","attributeNamePrefix","textNodeName","processEntities","trimValues","stopNodes","parseTagValue","parseAttributeValue","transformTagName","tagName","processDataset","args","dataset","dataDir","options","filename","removeSync","silent","log","title","repairZip","undefined","readFile","result","newDataFilePath","replace","copy"],"sources":["../../src/scripts/process_open_dataset.ts"],"sourcesContent":["import fs from \"fs-extra\"\nimport path from \"path\"\nimport stream from \"stream\"\nimport StreamZip from \"node-stream-zip\"\nimport util from \"util\"\n\nimport { walkDir } from \"../file_systems\"\nimport { XMLParser } from \"fast-xml-parser\"\n\nconst pipeline = util.promisify(stream.pipeline)\n\n// Function to fetch zip files with retry mechanism\nasync function fetchZipFile(\n  url: string,\n  zipFilePath: string,\n  retries = 3,\n): Promise<void> {\n  for (let attempt = 0; attempt < retries; attempt++) {\n    const response = await fetch(url)\n    if (response.ok) {\n      await pipeline(\n        response.body as unknown as NodeJS.ReadableStream,\n        fs.createWriteStream(zipFilePath),\n      )\n      return\n    } else if (attempt < retries - 1) {\n      console.warn(`Retrying fetch for ${url}: ${response.statusText}`)\n    } else {\n      console.error(`Failed to fetch 
${url} after ${retries} attempts`)\n      throw new Error(`Fetch failed: ${url}`)\n    }\n  }\n}\n\nasync function extractZip(\n  zipFilePath: string,\n  outputDir: string,\n): Promise<void> {\n  return new Promise<void>((resolve, reject) => {\n    const zip = new StreamZip({\n      file: zipFilePath,\n      storeEntries: true,\n    })\n\n    zip.on(\"ready\", () => {\n      zip.extract(null!, outputDir, (err) => {\n        zip.close()\n        if (err) {\n          reject(err)\n        } else {\n          resolve()\n        }\n      })\n    })\n  })\n}\n\nfunction reindentJsonFiles(dataDirOrFilePath: string): void {\n  if (fs.statSync(dataDirOrFilePath).isDirectory()) {\n    for (const dataFileSplitPath of walkDir(dataDirOrFilePath)) {\n      const dataFilePath = path.join(dataDirOrFilePath, ...dataFileSplitPath)\n      if (dataFilePath.endsWith(\".json\")) {\n        const data = JSON.parse(\n          fs.readFileSync(dataFilePath, { encoding: \"utf-8\" }),\n        )\n        fs.writeFileSync(dataFilePath, JSON.stringify(data, null, 2))\n      }\n    }\n  } else {\n    const data = JSON.parse(\n      fs.readFileSync(dataDirOrFilePath, { encoding: \"utf-8\" }),\n    )\n    fs.writeFileSync(dataDirOrFilePath, JSON.stringify(data, null, 2))\n  }\n}\n\ntype ProcessDatasetArguments = {\n  dataset: any\n  dataDir: string\n  options: any\n}\n\nconst xmlParser = new XMLParser({\n  ignoreDeclaration: true,\n  ignoreAttributes: false,\n  ignorePiTags: true,\n  attributeNamePrefix: \"\",\n  textNodeName: \"_\",\n  processEntities: false,\n  trimValues: true,\n  stopNodes: [\"*.texte\", \"*.intitule\"],\n  parseTagValue: false,\n  parseAttributeValue: false,\n  transformTagName: (tagName: string) =>\n    tagName === \"Para\" ? 
\"para\" : tagName,\n})\n\n/**\n * Processes a dataset by fetching, unzipping, and converting files from XML to JSON format.\n *\n * @param {any} dataset - The dataset object containing metadata and processing instructions.\n * @param {string} dataDir - The directory where the dataset files will be stored.\n * @param {any} options - Options to control the processing behavior, such as fetching and silent mode.\n * @returns {Promise<void>} A promise that resolves when the dataset processing is complete.\n */\nexport async function processDataset(\n  args: ProcessDatasetArguments,\n): Promise<void> {\n  const { dataset, dataDir, options } = args\n  const zipFilePath = path.join(dataDir, `${dataset.filename}.zip`)\n  const dataDirOrFilePath = path.join(dataDir, dataset.filename)\n  fs.removeSync(dataDirOrFilePath)\n\n  if (options.fetch) {\n    // Fetch & save ZIP file.\n    if (!options.silent) {\n      console.log(`Loading ${dataset.title}: ${dataset.filename}.zip`)\n    }\n    await fetchZipFile(dataset.url, zipFilePath)\n  }\n  if (!options.silent) {\n    console.log(`Unzipping ${dataset.title}: ${dataset.filename}.zip`)\n  }\n  await extractZip(zipFilePath, dataDir)\n\n  if (dataset.repairZip !== undefined) {\n    if (!options.silent) {\n      console.log(`Repairing ${dataset.title}: ${dataset.filename}`)\n    }\n    dataset.repairZip(dataset, dataDir)\n  }\n\n  // Convert xml to JSON.\n  if (dataset.filename.endsWith(\".xml\")) {\n    for (const dataFileSplitPath of walkDir(dataDirOrFilePath)) {\n      const dataFilePath = path.join(dataDirOrFilePath, ...dataFileSplitPath)\n      if (dataFilePath.endsWith(\".xml\")) {\n        fs.readFile(dataFilePath, function (err, data) {\n          if (err) return console.error(err)\n          let result = xmlParser.parse(data)\n          const newDataFilePath = dataFilePath.replace(/\\.[^.]+$/, \".json\")\n          fs.copy(dataFilePath, newDataFilePath, (err) => {\n            fs.writeFileSync(newDataFilePath, 
JSON.stringify(result, null, 2))\n            if (err) return console.error(err)\n          })\n        })\n      }\n    }\n  }\n\n  // Reindent JSON file.\n  if (!options.silent) {\n    console.log(`Reidenting ${dataset.title}: ${dataset.filename}`)\n  }\n  reindentJsonFiles(dataDirOrFilePath)\n}\n"],"mappings":"AAAA,OAAOA,EAAE,MAAM,UAAU;AACzB,OAAOC,IAAI,MAAM,MAAM;AACvB,OAAOC,MAAM,MAAM,QAAQ;AAC3B,OAAOC,SAAS,MAAM,iBAAiB;AACvC,OAAOC,IAAI,MAAM,MAAM;AAAA,SAEdC,OAAO;AAChB,SAASC,SAAS,QAAQ,iBAAiB;AAE3C,MAAMC,QAAQ,GAAGH,IAAI,CAACI,SAAS,CAACN,MAAM,CAACK,QAAQ,CAAC;;AAEhD;AACA,eAAeE,YAAYA,CACzBC,GAAW,EACXC,WAAmB,EACnBC,OAAO,GAAG,CAAC,EACI;EACf,KAAK,IAAIC,OAAO,GAAG,CAAC,EAAEA,OAAO,GAAGD,OAAO,EAAEC,OAAO,EAAE,EAAE;IAClD,MAAMC,QAAQ,GAAG,MAAMC,KAAK,CAACL,GAAG,CAAC;IACjC,IAAII,QAAQ,CAACE,EAAE,EAAE;MACf,MAAMT,QAAQ,CACZO,QAAQ,CAACG,IAAI,EACbjB,EAAE,CAACkB,iBAAiB,CAACP,WAAW,CAClC,CAAC;MACD;IACF,CAAC,MAAM,IAAIE,OAAO,GAAGD,OAAO,GAAG,CAAC,EAAE;MAChCO,OAAO,CAACC,IAAI,CAAC,sBAAsBV,GAAG,KAAKI,QAAQ,CAACO,UAAU,EAAE,CAAC;IACnE,CAAC,MAAM;MACLF,OAAO,CAACG,KAAK,CAAC,mBAAmBZ,GAAG,UAAUE,OAAO,WAAW,CAAC;MACjE,MAAM,IAAIW,KAAK,CAAC,iBAAiBb,GAAG,EAAE,CAAC;IACzC;EACF;AACF;AAEA,eAAec,UAAUA,CACvBb,WAAmB,EACnBc,SAAiB,EACF;EACf,OAAO,IAAIC,OAAO,CAAO,CAACC,OAAO,EAAEC,MAAM,KAAK;IAC5C,MAAMC,GAAG,GAAG,IAAI1B,SAAS,CAAC;MACxB2B,IAAI,EAAEnB,WAAW;MACjBoB,YAAY,EAAE;IAChB,CAAC,CAAC;IAEFF,GAAG,CAACG,EAAE,CAAC,OAAO,EAAE,MAAM;MACpBH,GAAG,CAACI,OAAO,CAAC,IAAI,EAAGR,SAAS,EAAGS,GAAG,IAAK;QACrCL,GAAG,CAACM,KAAK,CAAC,CAAC;QACX,IAAID,GAAG,EAAE;UACPN,MAAM,CAACM,GAAG,CAAC;QACb,CAAC,MAAM;UACLP,OAAO,CAAC,CAAC;QACX;MACF,CAAC,CAAC;IACJ,CAAC,CAAC;EACJ,CAAC,CAAC;AACJ;AAEA,SAASS,iBAAiBA,CAACC,iBAAyB,EAAQ;EAC1D,IAAIrC,EAAE,CAACsC,QAAQ,CAACD,iBAAiB,CAAC,CAACE,WAAW,CAAC,CAAC,EAAE;IAChD,KAAK,MAAMC,iBAAiB,IAAInC,OAAO,CAACgC,iBAAiB,CAAC,EAAE;MAC1D,MAAMI,YAAY,GAAGxC,IAAI,CAACyC,IAAI,CAACL,iBAAiB,EAAE,GAAGG,iBAAiB,CAAC;MACvE,IAAIC,YAAY,CAACE,QAAQ,CAAC,OAAO,CAAC,EAAE;QAClC,MAAMC,IAAI,GAAGC,IAAI,CAACC,KAAK,CACrB9C,EAAE,CAAC+C,YAAY,CAACN,YAAY,EAAE
;UAAEO,QAAQ,EAAE;QAAQ,CAAC,CACrD,CAAC;QACDhD,EAAE,CAACiD,aAAa,CAACR,YAAY,EAAEI,IAAI,CAACK,SAAS,CAACN,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;MAC/D;IACF;EACF,CAAC,MAAM;IACL,MAAMA,IAAI,GAAGC,IAAI,CAACC,KAAK,CACrB9C,EAAE,CAAC+C,YAAY,CAACV,iBAAiB,EAAE;MAAEW,QAAQ,EAAE;IAAQ,CAAC,CAC1D,CAAC;IACDhD,EAAE,CAACiD,aAAa,CAACZ,iBAAiB,EAAEQ,IAAI,CAACK,SAAS,CAACN,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;EACpE;AACF;AAQA,MAAMO,SAAS,GAAG,IAAI7C,SAAS,CAAC;EAC9B8C,iBAAiB,EAAE,IAAI;EACvBC,gBAAgB,EAAE,KAAK;EACvBC,YAAY,EAAE,IAAI;EAClBC,mBAAmB,EAAE,EAAE;EACvBC,YAAY,EAAE,GAAG;EACjBC,eAAe,EAAE,KAAK;EACtBC,UAAU,EAAE,IAAI;EAChBC,SAAS,EAAE,CAAC,SAAS,EAAE,YAAY,CAAC;EACpCC,aAAa,EAAE,KAAK;EACpBC,mBAAmB,EAAE,KAAK;EAC1BC,gBAAgB,EAAGC,OAAe,IAChCA,OAAO,KAAK,MAAM,GAAG,MAAM,GAAGA;AAClC,CAAC,CAAC;;AAEF;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,eAAeC,cAAcA,CAClCC,IAA6B,EACd;EACf,MAAM;IAAEC,OAAO;IAAEC,OAAO;IAAEC;EAAQ,CAAC,GAAGH,IAAI;EAC1C,MAAMtD,WAAW,GAAGV,IAAI,CAACyC,IAAI,CAACyB,OAAO,EAAE,GAAGD,OAAO,CAACG,QAAQ,MAAM,CAAC;EACjE,MAAMhC,iBAAiB,GAAGpC,IAAI,CAACyC,IAAI,CAACyB,OAAO,EAAED,OAAO,CAACG,QAAQ,CAAC;EAC9DrE,EAAE,CAACsE,UAAU,CAACjC,iBAAiB,CAAC;EAEhC,IAAI+B,OAAO,CAACrD,KAAK,EAAE;IACjB;IACA,IAAI,CAACqD,OAAO,CAACG,MAAM,EAAE;MACnBpD,OAAO,CAACqD,GAAG,CAAC,WAAWN,OAAO,CAACO,KAAK,KAAKP,OAAO,CAACG,QAAQ,MAAM,CAAC;IAClE;IACA,MAAM5D,YAAY,CAACyD,OAAO,CAACxD,GAAG,EAAEC,WAAW,CAAC;EAC9C;EACA,IAAI,CAACyD,OAAO,CAACG,MAAM,EAAE;IACnBpD,OAAO,CAACqD,GAAG,CAAC,aAAaN,OAAO,CAACO,KAAK,KAAKP,OAAO,CAACG,QAAQ,MAAM,CAAC;EACpE;EACA,MAAM7C,UAAU,CAACb,WAAW,EAAEwD,OAAO,CAAC;EAEtC,IAAID,OAAO,CAACQ,SAAS,KAAKC,SAAS,EAAE;IACnC,IAAI,CAACP,OAAO,CAACG,MAAM,EAAE;MACnBpD,OAAO,CAACqD,GAAG,CAAC,aAAaN,OAAO,CAACO,KAAK,KAAKP,OAAO,CAACG,QAAQ,EAAE,CAAC;IAChE;IACAH,OAAO,CAACQ,SAAS,CAACR,OAAO,EAAEC,OAAO,CAAC;EACrC;;EAEA;EACA,IAAID,OAAO,CAACG,QAAQ,CAAC1B,QAAQ,CAAC,MAAM,CAAC,EAAE;IACrC,KAAK,MAAMH,iBAAiB,IAAInC,OAAO,CAACgC,iBAAiB,CAAC,EAAE;MAC1D,MAAMI,YAAY,GAAGxC,IAAI,CAACyC,IAAI,CAACL,iBAAiB,EAAE,GAAGG,iBAAiB,CAAC;MACvE,IAAIC,YAAY,CAACE,QAAQ,CAAC,MAAM,CAAC,EAAE;QACj
C3C,EAAE,CAAC4E,QAAQ,CAACnC,YAAY,EAAE,UAAUP,GAAG,EAAEU,IAAI,EAAE;UAC7C,IAAIV,GAAG,EAAE,OAAOf,OAAO,CAACG,KAAK,CAACY,GAAG,CAAC;UAClC,IAAI2C,MAAM,GAAG1B,SAAS,CAACL,KAAK,CAACF,IAAI,CAAC;UAClC,MAAMkC,eAAe,GAAGrC,YAAY,CAACsC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC;UACjE/E,EAAE,CAACgF,IAAI,CAACvC,YAAY,EAAEqC,eAAe,EAAG5C,GAAG,IAAK;YAC9ClC,EAAE,CAACiD,aAAa,CAAC6B,eAAe,EAAEjC,IAAI,CAACK,SAAS,CAAC2B,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YAClE,IAAI3C,GAAG,EAAE,OAAOf,OAAO,CAACG,KAAK,CAACY,GAAG,CAAC;UACpC,CAAC,CAAC;QACJ,CAAC,CAAC;MACJ;IACF;EACF;;EAEA;EACA,IAAI,CAACkC,OAAO,CAACG,MAAM,EAAE;IACnBpD,OAAO,CAACqD,GAAG,CAAC,cAAcN,OAAO,CAACO,KAAK,KAAKP,OAAO,CAACG,QAAQ,EAAE,CAAC;EACjE;EACAjC,iBAAiB,CAACC,iBAAiB,CAAC;AACtC","ignoreList":[]}