UNPKG

@tricoteuses/arbre-de-la-loi

Version:

Generate ASTs from the French bills & laws; manipulate & export them to Markdown, etc.

198 lines (197 loc) 6.89 kB
import {
  EnabledDatasets,
  loadAssembleeData,
  pathFromDocumentUid,
} from "@tricoteuses/assemblee/lib/loaders";
import assert from "assert";
import { execSync } from "child_process";
import commandLineArgs from "command-line-args";
import fs from "fs-extra";
import hastFromParse5 from "hast-util-from-parse5";
import hastToHtml from "hast-util-to-html";
import parse5 from "parse5";
import path from "path";
import prettier from "prettier";
import unistRemovePosition from "unist-util-remove-position";
import vfile from "vfile";
import {
  mergeSpanElementsWithSameAttributes,
  mergeTexts,
  removeEmptyStyleAttributes,
  removeLetterSpacingStyleAttributes,
  removeStyleElements,
  replaceSpanElementsContainingOnlySpacesWithTexts,
  replaceSpanElementsWithoutPropertiesWithTexts,
} from "../cleaners";

// Command-line options understood by this script (parsed by command-line-args).
const optionsDefinitions = [
  {
    alias: "c",
    help: "commit documents",
    name: "commit",
    type: Boolean,
  },
  {
    alias: "f",
    help: "clean even already retrieved documents",
    name: "full",
    type: Boolean,
  },
  {
    alias: "l",
    defaultValue: "15",
    name: "legislature",
    type: String,
  },
  {
    alias: "o",
    help: "clean only one document",
    name: "one",
    type: Boolean,
  },
  {
    alias: "r",
    help: "push commit to given remote",
    multiple: true,
    name: "remote",
    type: String,
  },
  {
    alias: "s",
    help: "don't log anything",
    name: "silent",
    type: Boolean,
  },
  {
    alias: "u",
    help: `UID of first Assemblée's "texte de loi" to clean`,
    name: "uid",
    type: String,
  },
  {
    alias: "v",
    help: "verbose logs",
    name: "verbose",
    type: Boolean,
  },
  {
    defaultOption: true,
    help: "directory containing Assemblée open data files",
    name: "dataDir",
    type: String,
  },
];
const options = commandLineArgs(optionsDefinitions);

/**
 * Stage and commit everything in `repositoryDir`, then push `master` to each
 * remote.
 *
 * Does nothing when `commit` is falsy. A failing `git commit` is rethrown
 * unless its stdout contains "nothing to commit". A failing push is logged
 * and reported through the return value, but does not stop the other pushes.
 *
 * @param {string} repositoryDir - Working directory in which git is run.
 * @param {boolean} [commit] - Whether to commit (and push) at all.
 * @param {string[]} [remotes] - Names of the git remotes to push to.
 * @returns {boolean} `true` when at least one push failed, else `false`.
 */
function commitAndPush(repositoryDir, commit, remotes) {
  if (!commit) {
    return false;
  }

  execSync("git add .", {
    cwd: repositoryDir,
    env: process.env,
    encoding: "utf-8",
    stdio: ["ignore", "ignore", "pipe"],
  });

  try {
    execSync('git commit -m "Nouvelle moisson"', {
      cwd: repositoryDir,
      env: process.env,
      encoding: "utf-8",
    });
  } catch (childProcess) {
    // An empty commit is not an error: only rethrow real failures.
    if (
      childProcess.stderr === null ||
      !/nothing to commit/.test(childProcess.stdout)
    ) {
      console.error(childProcess.output);
      throw childProcess;
    }
  }

  let error = false;
  for (const remote of remotes || []) {
    try {
      // NOTE(review): `remote` is interpolated into a shell command line; it
      // comes from the command-line options, so only trusted values should
      // be given.
      execSync(`git push ${remote} master`, {
        cwd: repositoryDir,
        env: process.env,
        encoding: "utf-8",
        stdio: ["ignore", "ignore", "pipe"],
      });
    } catch (childProcess) {
      // Don't stop when push fails.
      console.error(childProcess.output);
      error = true;
    }
  }
  return error;
}

/**
 * Run the cleaning pipeline on the raw HTML of one document.
 *
 * @param {string} documentRawHtml - Raw HTML source of the document.
 * @param {string} documentRawHtmlFilePath - Path the raw HTML was read from
 *   (attached to the vfile given to the parse5 → hast conversion).
 * @returns {string} Cleaned HTML, formatted by prettier.
 */
function cleanDocumentHtml(documentRawHtml, documentRawHtmlFilePath) {
  // First pass: cleaners working directly on the HTML string.
  let documentHtml = documentRawHtml;
  documentHtml = removeLetterSpacingStyleAttributes(documentHtml);
  documentHtml = removeEmptyStyleAttributes(documentHtml);

  const documentParse5Ast = parse5.parse(documentHtml, {
    sourceCodeLocationInfo: true,
  });
  const documentVfile = vfile({
    contents: documentRawHtml,
    path: documentRawHtmlFilePath,
  });
  const documentHast = hastFromParse5(documentParse5Ast, {
    file: documentVfile,
  });

  // Second pass: cleaners working on the hast tree (positions are dropped
  // first).
  unistRemovePosition(documentHast);
  removeStyleElements(documentHast);
  mergeSpanElementsWithSameAttributes(documentHast);
  replaceSpanElementsContainingOnlySpacesWithTexts(documentHast);
  replaceSpanElementsWithoutPropertiesWithTexts(documentHast);
  mergeTexts(documentHast);

  return prettier.format(hastToHtml(documentHast), {
    parser: "html",
  });
}

/**
 * Clean the raw HTML of Assemblée documents found in
 * `<dataDir>/documents_html` and write the result to
 * `<dataDir>/documents_html_nettoye`, then optionally commit and push the
 * cleaned directory.
 *
 * Behaviour is driven by the command-line `options`:
 * - `full`: also clean documents that already have a cleaned file; when
 *   neither `one` nor `uid` is set, the cleaned directory is emptied first
 *   (entries starting with "." are kept).
 * - `uid`: skip every document sorted before this UID.
 * - `one`: stop after the first cleaned document.
 * - `commit` / `remote`: commit the cleaned directory and push it.
 *
 * Exits the process with status 1 when a push failed.
 *
 * @returns {Promise<void>}
 */
async function convertTextesLois() {
  // "commit" publishes a whole harvest, so it can't be combined with the
  // options that restrict the run to part of the documents.
  assert(
    !(options.commit && options.one),
    'Options "commit" & "one" are incompatible',
  );
  assert(
    !(options.commit && options.uid),
    'Options "commit" & "uid" are incompatible',
  );

  const dataDir = options.dataDir;
  assert(dataDir, 'Missing argument "dataDir"');

  const { documentByUid } = loadAssembleeData(
    dataDir,
    EnabledDatasets.DossiersLegislatifs,
    options.legislature,
  );

  const documentsRawHtmlDir = path.join(dataDir, "documents_html");
  assert(
    fs.existsSync(documentsRawHtmlDir),
    `Directory "${documentsRawHtmlDir}" doesn't exist`,
  );
  const documentsCleanHtmlDir = path.join(dataDir, "documents_html_nettoye");
  fs.ensureDirSync(documentsCleanHtmlDir);

  if (options.full && !options.one && !options.uid) {
    // Full run: start from an empty directory, keeping dot entries.
    for (const filename of fs.readdirSync(documentsCleanHtmlDir)) {
      if (filename[0] === ".") {
        continue;
      }
      fs.removeSync(path.join(documentsCleanHtmlDir, filename));
    }
  }

  const documents = Object.values(documentByUid)
    // Ignore documents from Sénat.
    .filter((document) => document.uid.substring(4, 6) !== "SN")
    .sort((a, b) => a.uid.localeCompare(b.uid));

  const firstUid = options.uid;
  let skip = !!firstUid;
  for (const document of documents) {
    if (skip) {
      if (document.uid !== firstUid) {
        continue;
      }
      skip = false;
    }

    const documentCleanHtmlBasename = pathFromDocumentUid(
      documentsCleanHtmlDir,
      document.uid,
    );
    const documentCleanHtmlFilePath = `${documentCleanHtmlBasename}.html`;
    if (!options.full && fs.existsSync(documentCleanHtmlFilePath)) {
      continue;
    }

    const documentRawHtmlBasename = pathFromDocumentUid(
      documentsRawHtmlDir,
      document.uid,
    );
    const documentRawHtmlFilePath = `${documentRawHtmlBasename}.html`;
    if (!fs.existsSync(documentRawHtmlFilePath)) {
      continue;
    }

    if (!options.silent) {
      console.log(`Cleaning HTML of document ${document.uid}…`);
    }

    const documentRawHtml = fs.readFileSync(documentRawHtmlFilePath, {
      encoding: "utf8",
    });
    const documentCleanHtml = cleanDocumentHtml(
      documentRawHtml,
      documentRawHtmlFilePath,
    );

    // pathFromDocumentUid may place the file in sub-directories (not visible
    // from here — TODO confirm); make sure the parent directory exists.
    fs.ensureDirSync(path.dirname(documentCleanHtmlFilePath));
    fs.writeFileSync(documentCleanHtmlFilePath, documentCleanHtml, {
      encoding: "utf8",
    });

    if (options.one) {
      break;
    }
  }

  if (commitAndPush(documentsCleanHtmlDir, options.commit, options.remote)) {
    process.exit(1);
  }
}

convertTextesLois().catch((error) => {
  console.error(error);
  process.exit(1);
});