@ifct2017/compositions
Version:
Detailed nutrient composition of 528 key foods in India.
188 lines (157 loc) • 5.26 kB
JavaScript
const fs = require('fs');
const path = require('path');
const lunr = require('lunr');
const csvx = require('csv-parse');
const esql = require('sql-extra');
const TEXTCOLS = new Set(['code', 'name', 'scie', 'lang', 'grup', 'tags']);
var corpus = new Map();
var index = null;
var ready = null;
function csv() {
return path.join(__dirname, 'index.csv');
}
function tsvector(tab, cols) {
var {code, name, scie, lang, grup, tags} = cols;
return `setweight(to_tsvector('english', "code"), '${code}')||`+
`setweight(to_tsvector('english', left("name", strpos("name", ','))), '${code}')||`+
`setweight(to_tsvector('english', "name"), '${name}')||`+
`setweight(to_tsvector('english', "scie"), '${scie}')||`+
`setweight(to_tsvector('english', ${tab}_lang_tags("lang")), '${lang}')||`+
`setweight(to_tsvector('english', "grup"), '${grup}')||`+
`setweight(to_tsvector('english', "tags"), '${tags}')`;
}
function createFunctionLangTags(tab) {
return `CREATE OR REPLACE FUNCTION "${tab}_lang_tags" (TEXT) RETURNS TEXT AS $$`+
` SELECT lower(regexp_replace(regexp_replace(regexp_replace($1, `+
` '\\[.*?\\]', '', 'g'), '\\w+\\.\\s([\\w\\'',\\/\\(\\)\\- ]+)[;\\.]?', '\\1', 'g'),`+
` '[,\\/\\(\\)\\- ]+', ' ', 'g')) $$`+
` LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT;\n`;
}
function createTable(tab, cols, opt={}, a='') {
var pre = ['code', 'name', 'scie', 'lang', 'grup', 'regn', 'tags'];
a += `CREATE TABLE IF NOT EXISTS "${tab}" (`;
for (var c of pre) {
var typ = c==='regn'? 'INT':'TEXT';
a += ` "${c}" ${typ} NOT NULL,`;
}
for (var c in cols) {
if (pre.includes(c)) continue;
a += ` "${c}" REAL NOT NULL,`;
}
if (opt.pk) a += ` PRIMARY KEY ("code"), `;
a = a.endsWith(', ')? a.substring(0, a.length-2) : a;
a += `);\n`;
return a;
}
function insertIntoBegin(tab, cols, a='') {
a += `INSERT INTO "${tab}" (`;
for (var c in cols)
a += `"${c}", `;
a = a.endsWith(', ')? a.substring(0, a.length-2) : a;
a += ') VALUES\n(';
return a;
}
function insertIntoMid(val, a='') {
for (var k in val)
a += `'${val[k]}', `;
a = a.endsWith(', ')? a.substring(0, a.length-2) : a;
a += `),\n(`;
return a;
}
function insertIntoEnd(a='') {
a = a.endsWith(',\n(')? a.substring(0, a.length-3) : a;
a += ';\n';
return a;
}
function sql(tab='compositions', opt={}) {
var i = -1, cols = null, a = '';
var opt = Object.assign({pk: 'code', index: true}, opt);
var tsv = tsvector(tab, {code: 'A', name: 'B', scie: 'B', lang: 'B', grup: 'C', tags: 'C'});
var stream = fs.createReadStream(csv()).pipe(csvx.parse({columns: true, comment: '#'}));
return new Promise((resolve, reject) => {
stream.on('error', reject);
stream.on('data', (r) => {
var x = fixColumns(r);
if (++i===0) { cols = x; a = createTable(tab, cols, opt, a); a = insertIntoBegin(tab, cols, a); }
a = insertIntoMid(x, a);
});
stream.on('end', () => {
a = insertIntoEnd(a);
a += createFunctionLangTags(tab);
a += esql.createView(`${tab}_tsvector`, `SELECT *, ${tsv} AS "tsvector" FROM "${tab}"`);
a += esql.createIndex(`${tab}_tsvector_idx`, tab, `(${tsv})`, {method: 'GIN'});
a = esql.setupTable.index(tab, cols, opt, a);
resolve(a);
});
});
}
// Fix column names of a row in the CSV file.
function fixColumns(row) {
var a = {};
for (var k in row) {
// Name of column is after the last semicolon.
var l = k.substring(k.lastIndexOf(';')+1).trim();
a[l] = row[k];
}
return a;
}
// Parse a row from the CSV file.
function parseRow(row) {
var a = {};
for (var k in row) {
var l = k.substring(k.lastIndexOf(';')+1).trim();
a[l] = TEXTCOLS.has(l)? row[k] : parseFloat(row[k]);
}
return a;
}
function loadCorpus() {
return new Promise((fres) => {
var s = fs.createReadStream(csv()).pipe(csvx.parse({columns: true, comment: '#'}));
s.on('data', (r) => { var x = parseRow(r); corpus.set(x.code, x); });
s.on('end', fres);
});
}
function createIndex() {
return lunr(function() {
this.ref('code');
this.field('code');
this.field('name');
this.field('scie');
this.field('lang');
this.field('grup');
this.field('tags');
for (var r of corpus.values()) {
var {code, name, scie, lang, grup, tags} = r;
name = name.replace(/^(\w+),/g, '$1 $1 $1 $1,');
lang = lang.replace(/\[.*?\]/g, '');
lang = lang.replace(/\w+\.\s([\w\',\/\(\)\- ]+)[;\.]?/g, '$1');
lang = lang.replace(/[,\/\(\)\- ]+/g, ' ');
this.add({code, name, scie, lang, grup, tags});
}
});
}
async function load() {
if (ready) await ready;
if (index) return corpus;
ready = loadCorpus();
await ready;
index = createIndex();
return corpus;
}
function matchRate(m) {
return Object.keys(m.matchData.metadata).length;
}
function compositions(txt) {
if (!index) { load(); return []; }
var a = [], txt = txt.replace(/\W/g, ' ');
var ms = index.search(txt), max = 0;
for (var m of ms)
max = Math.max(max, matchRate(m));
for (var m of ms)
if (matchRate(m)===max) a.push(corpus.get(m.ref));
return a;
}
compositions.load = load;
compositions.csv = csv;
compositions.sql = sql;
module.exports = compositions;