UNPKG

@gracexwho/model-card-generator

Version:

Tool for generating model cards for Jupyter Notebook.

551 lines (457 loc) 23.4 kB
"use strict"; exports.__esModule = true; // COMMAND: node main.js ../assets/News_Categorization_MNB.ipynb const BULK_RUN_PATH = "../tests/notebooks/"; //const BULK_RUN_PATH = "../assets/" const SCHEMAS_PATH = "/../lib/lale/sklearn/"; const MODEL_CARDS_PATH = "../assets/model_cards/"; var graphing = require("./Graph.js").Graph; var py = require("modified-python-program-analysis/dist/es5"); var fs = require('fs'); var path = require('path'); var ic = require("./infocell"); var dep = require("./cell_deps.js"); var args = process.argv.slice(2); var filePath = args[0]; class ModelCard { constructor() { this.JSONSchema = { modelname:{title:"", Filename:"", cell_ids:[]}, author:{title:"Author"}, datasets: {title: "Datasets", description:"", links:"", cell_ids:[]}, references: {title:"References", source:"", links:[], cell_ids:[]}, libraries:{title:"Libraries Used", lib:{}, info:{}, cell_ids:[]}, hyperparameters:{title:"Hyperparameters", cell_ids:[], lineNumbers:[], source:"", values:[], description:""}, misc:{title:"Miscellaneous", cell_ids:[], cells:[], lineNumbers:[], source:"", markdown:"", imports:[], functions:[], figures:[], description:"", outputs:[]}, plotting:{title:"Plotting", cell_ids:[], cells:[], lineNumbers:[], source:"", markdown:"", imports:[], functions:[], figures:[], description:"", outputs:[]}, datacleaning:{title:"Data Cleaning", cell_ids:[], cells:[], lineNumbers:[], source:"", markdown:"", imports:[], functions:[], figures:[], description:"", outputs:[]}, preprocessing:{title:"Preprocessing", cell_ids:[], cells:[], lineNumbers:[], source:"", markdown:"", imports:[], functions:[], figures:[], description:"", outputs:[]}, modeltraining:{title:"Model Training", cell_ids:[], cells:[], lineNumbers:[], source:"", markdown:"", imports:[], functions:[], figures:[], description:"", outputs:[]}, modelevaluation:{title:"Evaluation", cell_ids:[], cells:[], lineNumbers:[], source:"", markdown:"", imports:[], functions:[], figures:[], description:"", outputs:[]} } this.line_to_cell = {}; this.markdown = ""; this.intended_use = ""; this.ethical_considerations = ""; this.developer_comments = ""; this.hyperparamschemas = {}; } getStageLineNumbers(stage_name) { return this.JSONSchema[stage_name]["lineNumbers"]; } getPLineNumbers() { return this.JSONSchema["plotting"]["lineNumbers"]; } getDCLineNumbers() { return this.JSONSchema["datacleaning"]["lineNumbers"]; } getPPLineNumbers() { return this.JSONSchema["preprocessing"]["lineNumbers"]; } getMTLineNumbers() { return this.JSONSchema["modeltraining"]["lineNumbers"]; } getMELineNumbers() { return this.JSONSchema["modelevaluation"]["lineNumbers"]; } } var model_card = new ModelCard(); function createCell(text, executionCount, output) { return new ic.InfoCell(text, executionCount, output); } function convertColorToLabel(filePath) { // data collection -> red // data cleaning -> yellow // data labeling -> green // feature engineering -> lightblue // training -> purple // evaluation -> orange // model deployment -> pink model_card = new ModelCard(); var color_map = dep.printLabels(filePath); var mapObj = {red:"Data collection",yellow:"Data cleaning", green:"Data labelling", "lightblue":"Plotting", "blue":"Feature Engineering", purple:"Training", orange:"Evaluation", pink:"Model deployment"}; var re = new RegExp(Object.keys(mapObj).join("|"),"gi"); color_map = color_map.replace(re, function(matched){ return mapObj[matched]; }); /**fs.writeFile((__dirname + "/../assets/" + filePath.split(".ipynb")[0] + '_labels.txt'), color_map, function (err) { if (err) throw err; //console.log('Labels file saved!'); }); **/ color_map = color_map.split("\n"); var new_color_map = {}; for (let element of color_map) { element = element.split("->"); new_color_map[element[0]] = element[1]; } var schemas = {}; var filenames = fs.readdirSync(__dirname + SCHEMAS_PATH); filenames.forEach(file => { var newname = file.replace("_", ""); newname = newname.replace(".py", ""); schemas[newname] = file; }); model_card.hyperparamschemas = schemas; return new_color_map; } function readCells(filePath, new_color_map) { var content = fs.readFileSync(path.resolve(__dirname, filePath)); let jsondata = JSON.parse(content); var notebookCode = "\n"; var notebookMarkdown = ""; const rewriter = new py.MagicsRewriter(); var currStage = "misc"; let id_count = 0; let flag = true; let programbuilder = new py.ProgramBuilder(); model_card.JSONSchema["modelname"]["Filename"] = filePath.split("/").slice(-1).toString(); var countLines = 0; //fs.mkdirSync("../assets/model_cards/" + model_card.JSONSchema["modelname"]["Filename"], { recursive: true }) for (let cell of jsondata['cells']) { let sourceCode = ""; if (cell['cell_type'] === 'markdown') { model_card.JSONSchema[currStage]["markdown"] += "\n" + cell['source']; for (let mdline of cell['source']) { var matches = mdline.match(/\bhttps?:\/\/[\S][^)]+/gi); if (matches !== null) { model_card.JSONSchema["references"]["cell_ids"].push(id_count); model_card.JSONSchema["references"]["links"] = model_card.JSONSchema["references"]["links"].concat(matches); } } if (id_count == 0 && flag) { flag = false; model_card.JSONSchema["modelname"]["title"] = cell['source'][0]; model_card.JSONSchema["modelname"]["cell_ids"] = id_count; } id_count += 1; notebookMarkdown += cell["source"]; } else if (cell['source'][0] != undefined){ id_count += 1; var key = cell['execution_count'].toString(); if (key in new_color_map) { var stage = new_color_map[key]; if (stage == "Data collection" || stage == "Data cleaning" || stage == "Data labelling") { currStage = "datacleaning"; } else if (stage == "Feature Engineering") { currStage = "preprocessing"; } else if (stage == "Training") { currStage = "modeltraining"; } else if (stage == "Evaluation") { currStage = "modelevaluation"; } else if (stage == "Plotting") { currStage = "plotting"; } } for (let line of cell['source']) { if (line[0] === "%") { line = rewriter.rewriteLineMagic(line); } countLines += 1; model_card.JSONSchema[currStage]["lineNumbers"].push(countLines); model_card.line_to_cell[countLines] = id_count; sourceCode += line; } notebookCode += sourceCode + '\n'; let code_cell = createCell(sourceCode, cell['execution_count'], cell['outputs'][0]); if (cell["outputs"].length != 0) { for (let output in cell["outputs"]) { if (cell["outputs"][output]['output_type'] == 'display_data') { var bitmap = new Buffer.from(cell["outputs"][output]['data']['image/png'], 'base64'); //fs.writeFileSync(__dirname + MODEL_CARDS_PATH + model_card.JSONSchema["modelname"]["Filename"] + "/" + code_cell.persistentId + ".jpg", bitmap); var image = "![Hello World](data:image/png;base64," + cell["outputs"][output]['data']['image/png'] + ")"; model_card.JSONSchema[currStage]["figures"].push(code_cell.persistentId + ".jpg"); } else if (cell["outputs"][output]['output_type'] == 'stream') { var info = cell["outputs"][output]["text"]; model_card.JSONSchema[currStage]["outputs"].push(info); } } } programbuilder.add(code_cell) model_card.JSONSchema[currStage]["cells"] += code_cell; model_card.JSONSchema[currStage]["source"] += sourceCode; model_card.JSONSchema[currStage]["cell_ids"].push(id_count); } } model_card.markdown += notebookMarkdown; printLineDefUse(notebookCode, model_card, countLines); return [notebookCode, notebookMarkdown, model_card]; } function printLineDefUse(code, model_card, countLines){ let tree = py.parse(code); let cfg = new py.ControlFlowGraph(tree); const analyzer = new py.DataflowAnalyzer(); const flows = analyzer.analyze(cfg).dataflows; var importScope = {}; var lineToCode = {}; var pLines = model_card.getPLineNumbers(); var dcLines = model_card.getDCLineNumbers(); var ppLines = model_card.getPPLineNumbers(); var mtLines = model_card.getMTLineNumbers(); var meLines = model_card.getMELineNumbers(); for (let flow of flows.items) { let fromNode = py.printNode(flow.fromNode).split("\n"); let toNode = py.printNode(flow.toNode).split("\n"); lineToCode[flow.fromNode.location.first_line] = fromNode[0]; lineToCode[flow.fromNode.location.last_line] = fromNode[fromNode.length-1]; lineToCode[flow.toNode.location.last_line] = toNode[toNode.length-1]; lineToCode[flow.toNode.location.first_line] = toNode[0]; if (flow.fromNode.type === "from" || flow.fromNode.type === "import") { if (fromNode[0].includes("sklearn.datasets")) { model_card.JSONSchema["datasets"]["source"] += fromNode[0]; model_card.JSONSchema["datasets"]["cell_ids"].push(model_card.line_to_cell[flow.fromNode.location.first_line]); } //Check Hyperparameters var input = fromNode[0].toLowerCase(); var match = ""; var hyperparam_descriptions = {}; Object.keys(model_card.hyperparamschemas).forEach(function(key) { if (input.includes(key)) { var hcontents = fs.readFileSync(__dirname + "/../lib/lale/sklearn/" + model_card.hyperparamschemas[key], "utf8"); var hflag = false; var pflag = false; var hyperflag = false var hyperparams = ""; var hproperties = ""; var openbrackets = 0; for (let hline of hcontents.split("\n")) { if (hline.includes("_hyperparams_schema =")) { hyperflag = true; } if (hyperflag) { if (hline.includes("'properties':")) { pflag = true; } openbrackets += (hline.match(/{/g)||[]).length openbrackets -= (hline.match(/}/g)||[]).length } if (hline.includes("relevantToOptimizer")) { hflag = true; } if (hflag) { hyperparams += hline; } if (hline.includes("],")) { hflag = false; } if (pflag == true && hyperflag == true) { hproperties = hproperties + hline + "\n"; } if (hyperflag && openbrackets == 0) { break; } } hyperparams = hyperparams.substr(hyperparams.indexOf('[')+1); hyperparams = hyperparams.split("]")[0]; hyperparams = hyperparams.split(","); var parameters = []; for (let s of hyperparams) { s = s.replace(/['"]+/g, ""); s = s.trim(); if (s) { parameters.push(s); } } pflag = false; openbrackets=0; var desc = ""; var substring = null; var param = ""; function containsAny(str, substrings) { for (var i = 0; i != substrings.length; i++) { var substring = "'" + substrings[i] + "'"; if (str.indexOf(substring) != - 1) { return substring; } } return null; } for (let line of hproperties.split("\n")) { if (!pflag) { substring = containsAny(line, parameters); if (substring != null) { //parameters.some(function(v) {hyperparam_descriptions["'" + v + "'"] = ""; return line.indexOf("'" + v + "'") >= 0; }) pflag = true; param = substring; } } if (pflag) { if (line.includes("{")) { openbrackets += 1; } if (line.includes("}")) { openbrackets -=1; } desc = desc + line + "\n"; if (openbrackets <= 0) { pflag = false; hyperparam_descriptions[input] += desc; desc = ""; } } } model_card.JSONSchema["hyperparameters"]["values"] += parameters; model_card.JSONSchema["hyperparameters"]["lineNumbers"].push(flow.fromNode.location.first_line); model_card.JSONSchema["hyperparameters"]["cell_ids"].push(model_card.line_to_cell[flow.fromNode.location.first_line]); model_card.JSONSchema["hyperparameters"]["source"] += fromNode[0] + "\n"; model_card.JSONSchema["hyperparameters"]["description"] = hyperparam_descriptions; } }); importScope[flow.fromNode.location.first_line] = -1; model_card.JSONSchema["libraries"]["cell_ids"].push(model_card.line_to_cell[flow.fromNode.location.first_line]); } else if (flow.fromNode.type === "def") { if (flow.fromNode.location.first_line in pLines) { model_card.JSONSchema["plotting"]["functions"].push(py.printNode(flow.fromNode)); } else if (flow.fromNode.location.first_line in dcLines) { model_card.JSONSchema["datacleaning"]["functions"].push(py.printNode(flow.fromNode)); } else if (flow.fromNode.location.first_line in ppLines) { model_card.JSONSchema["preprocessing"]["functions"].push(py.printNode(flow.fromNode)); } else if (flow.fromNode.location.first_line in mtLines) { model_card.JSONSchema["modeltraining"]["functions"].push(py.printNode(flow.fromNode)); } else if (flow.fromNode.location.first_line in meLines) { model_card.JSONSchema["modelevaluation"]["functions"].push(py.printNode(flow.fromNode));s } } } var n = countLines; // need graph size to be size of lineToCode, not number of edges var numgraph = new graphing(n+1); for (let flow of flows.items) { numgraph.addEdge(flow.fromNode.location.first_line, flow.toNode.location.first_line); } findImportScope(importScope, lineToCode, numgraph, model_card); } function findImportScope(importScope, lineToCode, numgraph, model_card) { var importCode = Object.keys(importScope); var scopes = {}; var imports = {}; for (let lineNum of importCode) { var result = numgraph.findLongestPathSrc(numgraph.edge.length, parseInt(lineNum)) scopes[lineNum] = result[1]; imports[lineToCode[lineNum]] = "START:" + lineNum.toString() + "\t" + " END:" + scopes[lineNum]; if (model_card.getDCLineNumbers().includes(parseInt(lineNum))) { model_card.JSONSchema["datacleaning"]["imports"].push(lineToCode[lineNum]); } else if (model_card.getPPLineNumbers().includes(parseInt(lineNum))) { model_card.JSONSchema["preprocessing"]["imports"].push(lineToCode[lineNum]); }else if (model_card.getMTLineNumbers().includes(parseInt(lineNum))) { model_card.JSONSchema["modeltraining"]["imports"].push(lineToCode[lineNum]); }else if (model_card.getMELineNumbers().includes(parseInt(lineNum))) { model_card.JSONSchema["modelevaluation"]["imports"].push(lineToCode[lineNum]); } } generateLibraryInfo(imports); } function generateLibraryInfo(imports) { let library_defs = JSON.parse(fs.readFileSync(__dirname + "/../assets/library_defs.json")); var libraries = {"pandas":[], "numpy":[], "matplotlib":[], "sklearn":[], "tensorflow":[], "pytorch":[], "OTHER":[]}; for (let im of Object.keys(imports)) { if (im.includes("pandas")){ libraries["pandas"].push(im); } else if (im.includes("numpy")) { libraries["numpy"].push(im); } else if(im.includes("matplotlib")) { libraries["matplotlib"].push(im); } else if(im.includes("sklearn")) { libraries["sklearn"].push(im); } else if (im.includes("tensorflow")) { libraries["tensorflow"].push(im); } else if (im.includes("pytorch")) { libraries["pytorch"].push(im); } else { libraries["OTHER"].push(im); } } model_card.JSONSchema["libraries"]["lib"] = libraries; model_card.JSONSchema["libraries"]["info"] = library_defs; } function printCellsOfStage(stage_name, model_card) { for (let cell in model_card.JSONSchema[stage_name]["cells"]) { console.log(ic.printInfoCell(cell)); } } function getOutput() { // look at "output_type" of json" var hello = new InfoCell(text, executionCount, executionEventId); } function printModelCard(model_card) { console.log(JSON.stringify(model_card.JSONSchema)); } function generateMarkdown(model_card, notebookname="") { var markdown_contents = ""; var keys = Object.keys( model_card.JSONSchema ); for( var i = 0,length = keys.length; i < length; i++ ) { var stageKeys = Object.keys(model_card.JSONSchema[keys[i]]); for (let stageKey of stageKeys) { if (stageKey == 'title') { markdown_contents += "## " + model_card.JSONSchema[keys[i]][stageKey] + " ##" + "\n"; } else { if (stageKey == 'source') { markdown_contents += "### " + stageKey + " ###" + "\n"; markdown_contents += "``` " + "\n" + model_card.JSONSchema[keys[i]][stageKey] + "\n" + " ```" + "\n"; }else if (stageKey == "description" && keys[i] == "hyperparameters") { markdown_contents += "### " + stageKey + " ###" + "\n"; markdown_contents += JSON.stringify(model_card.JSONSchema[keys[i]][stageKey]) + "\n"; }else if (stageKey == "outputs") { markdown_contents += "### " + stageKey + " ###" + "\n"; markdown_contents += model_card.JSONSchema[keys[i]][stageKey] + "\n"; //var image = document.createElement('img'); //image.src = "data:image/png;base64," + base64JsonData; } else if (stageKey == "imports" || stageKey == "markdown") { continue; } else if (stageKey == "figures") { markdown_contents += "### " + stageKey + " ###" + "\n"; for (let image of model_card.JSONSchema[keys[i]][stageKey]) { //![id5](./image/id5.jpg) //markdown_contents += "![" + image + "](" + MODEL_CARDS_PATH + model_card.JSONSchema["modelname"]["Filename"] + "/" + image + ")" + "\n"; } } else if (keys[i] == "references" && stageKey == "links") { for (let link of model_card.JSONSchema['references']['links']) { markdown_contents += link + "\n"; } }else { markdown_contents += "### " + stageKey + " ###" + "\n"; markdown_contents += JSON.stringify(model_card.JSONSchema[keys[i]][stageKey]) + "\n"; } } } } fs.writeFile(MODEL_CARDS_PATH + "ModelCard" + notebookname + '.md', markdown_contents, (err) => { if (err) throw err; //console.log('Model card saved!'); //console.log(model_card); }); } function getExt(filename){ return filename.substring(filename.lastIndexOf('.')+1, filename.length); } function bulk_run() { fs.readdirSync(BULK_RUN_PATH).forEach(file => { var filePath = ""; if (getExt(file) === "ipynb"){ console.log('Currently processing:'); console.log(BULK_RUN_PATH + file + '\n'); filePath = BULK_RUN_PATH + file; try { var new_color = convertColorToLabel(filePath); var res = readCells(filePath, new_color); var notebookCode = res[0]; var notebookMarkdown = res[1]; var MC = res[2]; generateMarkdown(MC, "_" + file.split(".")[0]); } catch(err) { console.log(err); } } }); } function main() { var new_color = convertColorToLabel(filePath); var res = readCells(filePath, new_color); var notebookCode = res[0]; var notebookMarkdown = res[1]; var MC = res[2]; generateMarkdown(MC, notebookCode); } main(); //bulk_run();