UNPKG

decentralized-internet

Version:

An NPM library of programs to create decentralized web and distributed computing projects

286 lines (231 loc) 7.37 kB
module.exports = function (conf) { var fs = require('fs'); var Promise = require('bluebird'); var _ = require("underscore"); var spawn = require('child_process').spawn; var path = require('path'); var Joi = require('@hapi/joi'); var executionmethods = require('./executionserver.methods')(conf); var clustermodel = require('clusterpost-model'); var handler = {}; handler.submitJob = function(doc, cwd){ Joi.assert(doc, clustermodel.job); return new Promise(function(resolve, reject){ var command = 'sbatch'; var parameters = doc.parameters; var params = []; if(conf.hasbang){ params.push(conf.hash_bang); }else{ params.push("#!/bin/bash"); } if(doc.jobparameters){ for(var i = 0; i < doc.jobparameters.length; i++){ var param_script = ["#SBATCH"] var param = doc.jobparameters[i]; if(param.flag){ param_script.push(param.flag); } if(param.name){ param_script.push(param.name); } params.push(param_script.join(" ")); } } params.push(["#SBATCH", "-D", cwd].join(" ")); params.push(["#SBATCH", "-e", path.join(cwd, "stderr.err")].join(" ")); params.push(["#SBATCH", "-o", path.join(cwd, "stdout.out")].join(" ")); params.push(["#SBATCH", "-J", doc.userEmail].join(" ")); params_command = [] params_command.push(doc.executable); if(parameters){ for(var i = 0; i < parameters.length; i++){ var param = parameters[i]; if(param.flag){ params_command.push(param.flag); } if(param.name){ params_command.push(param.name); } } } params.push(params_command.join(" ")); var script_filename = path.join(cwd, "slurm_script.sh"); fs.writeFileSync(script_filename, params.join("\n")); try{ const runcommand = spawn(command, [script_filename]); var allerror = ""; runcommand.stderr.on('data', function(data){ allerror += data; }); var alldata = ""; runcommand.stdout.on('data', function(data){ alldata += data; }); //"sample: Submitted batch job 3053044" runcommand.on('close', function(code){ if(code){ resolve({ status: 'FAIL', error: allerror + alldata }); }else{ var stringfind = 'Submitted batch job '; var ind = alldata.indexOf(stringfind) + stringfind.length; var jobid = alldata.substr(ind, alldata.length - ind); resolve({ jobid : Number.parseInt(jobid), status: 'RUN' }); } }); }catch(e){ reject({ status: "FAIL", error: e }); } }); } handler.getJobStatus = function(doc){ Joi.assert(doc.jobstatus, clustermodel.jobstatus); return new Promise(function(resolve, reject){ try{ var jobid = doc.jobstatus.jobid; var params = ["-h", "-j", jobid]; const ps = spawn('squeue', params); var allerror = ""; ps.stderr.on('data', function(data){ allerror += data; }); var alldata = ""; ps.stdout.on('data', function(data){ alldata += data; }); //sample success: '3053968 general_b wrap jprieto PD 0:00 1 (Priority)' //sample fail: 'slurm_load_jobs error: Invalid job id specified' OR empty string ps.on('close', function(code){ if(alldata){ var job_status = alldata.replace(/ +/g, ' ').trim().split(' '); //JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) // BF BOOT_FAIL // Job terminated due to launch failure, typically due to a hardware failure (e.g. unable to boot the node or block and the job can not be requeued). // CA CANCELLED // Job was explicitly cancelled by the user or system administrator. The job may or may not have been initiated. // CD COMPLETED // Job has terminated all processes on all nodes with an exit code of zero. // CF CONFIGURING // Job has been allocated resources, but are waiting for them to become ready for use (e.g. booting). // CG COMPLETING // Job is in the process of completing. Some processes on some nodes may still be active. // DL DEADLINE // Job terminated on deadline. // F FAILED // Job terminated with non-zero exit code or other failure condition. // NF NODE_FAIL // Job terminated due to failure of one or more allocated nodes. // OOM OUT_OF_MEMORY // Job experienced out of memory error. // PD PENDING // Job is awaiting resource allocation. // PR PREEMPTED // Job terminated due to preemption. // R RUNNING // Job currently has an allocation. // RD RESV_DEL_HOLD // Job is held. // RF REQUEUE_FED // Job is being requeued by a federation. // RH REQUEUE_HOLD // Held job is being requeued. // RQ REQUEUED // Completing job is being requeued. // RS RESIZING // Job is about to change size. // RV REVOKED // Sibling was removed from cluster due to other cluster starting the job. // SI SIGNALING // Job is being signaled. // SE SPECIAL_EXIT // The job was requeued in a special state. This state can be set by users, typically in EpilogSlurmctld, if the job has terminated with a particular exit value. // SO STAGE_OUT // Job is staging out files. // ST STOPPED // Job has an allocation, but execution has been stopped with SIGSTOP signal. CPUS have been retained by this job. // S SUSPENDED // Job has an allocation, but execution has been suspended and CPUs have been released for other jobs. // TO TIMEOUT if(job_status.length > 5){ if(job_status[4] == 'R' || job_status[4] == 'PD'){ resolve({ status: 'RUN', stat: alldata }); }else if(job_status[4] == 'CD' || job_status[4] == 'CG'){ resolve({ status: 'DONE', stat: alldata }); }else if(job_status[4] == 'SE' || job_status[4] == 'CA'){ resolve({ status: 'EXIT', stat: alldata }); }else if(job_status[4] == 'F' || job_status[4] == 'NF'){ resolve({ status: 'FAIL', stat: alldata }); }else{ resolve({ status: 'DONE', stat: alldata }); } }else{ resolve({ jobid: jobid, status: 'DONE', stat: alldata }); } }else{ resolve({ jobid: jobid, status: 'DONE' }); } }); }catch(e){ reject(e); } }); } handler.killJob = function(doc){ Joi.assert(doc.jobstatus, clustermodel.jobstatus); return new Promise(function(resolve, reject){ try{ var jobid = doc.jobstatus.jobid; var params = [jobid]; const kill = spawn('scancel', params); var allerror = ""; kill.stderr.on('data', function(data){ allerror += data; }); var alldata = ""; kill.stdout.on('data', function(data){ alldata += data; }); kill.on('close', function(code){ resolve({ status: 'EXIT', stat: allerror + alldata }); }); }catch(e){ reject(e); } }); } return handler; }