federer

Experiments in asynchronous federated learning and decentralized learning

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.oneHotEncode = exports.encodeChar = exports.encodeSentence = exports.getFullData = exports.sample = exports.readLeafJson = exports.convertLabels = exports.convertItems = exports.getFilepaths = exports.readRawDataFile = exports.processRawDataFromFilepaths = exports.processRawData = exports.NUM_LETTERS = void 0; const tslib_1 = require("tslib"); const path = tslib_1.__importStar(require("path")); const assert = require("assert"); const tf = tslib_1.__importStar(require("@tensorflow/tfjs-node")); const Reservoir = require("reservoir"); const common_1 = require("../../../common"); const coordinator_1 = require("../../../coordinator"); const ALL_LETTERS = "\n !\"&'(),-.0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz}"; exports.NUM_LETTERS = ALL_LETTERS.length; async function processRawData(dataset, environment, numberClients, numberLabelClasses) { const filepaths = getFilepaths(dataset); const trainpaths = await processRawDataFromFilepaths(filepaths.train, environment, dataset, numberLabelClasses, true, numberClients); const testpaths = await processRawDataFromFilepaths(filepaths.test, environment, dataset, numberLabelClasses, false); return { trainpaths: trainpaths, testpaths: testpaths[0], }; } exports.processRawData = processRawData; /** * Create shards for test and train datasets * * @param filepaths Holds file paths to both the train and test JSON files * @param environment Depending on the environment, data files will be read * either from local files, or from S3. * @param dataset The Leaf dataset to be used * @param numberLabelClasses Number of unique labels * @param train True if creating train shards * @param numberClients How many shards will be used for training. * @returns File paths to the train and test shards */ async function processRawDataFromFilepaths(filepaths, environment, dataset, numberLabelClasses, train, numberClients) { const directory = `data/processed/leaf/${dataset}`; common_1.mkdirp(directory); const shards = await readRawDataFile(filepaths, environment, dataset, numberLabelClasses, numberClients); return Promise.all(shards.map(async (shard, i) => { const path = shard.save(directory, train ? `shard-${i}` : "test"); shard.dispose(); return path; })); } exports.processRawDataFromFilepaths = processRawDataFromFilepaths; /** * Split the corresponding JSON file into shards * * @param file The path to train/test JSON file * @param environment Depending on the environment, data files will be read * either from local files, or from S3. * @param dataset The Leaf dataset to be used * @param numberLabelClasses Number of unique labels * @param numberClients How much of the total number * of shards will be used for training. This is * undefined if creating test shards * @returns An array of tensors grouped into subsets */ async function readRawDataFile(file, environment, dataset, numberLabelClasses, numberClients) { const json = await readLeafJson(file, environment); const data = Object.values(json.user_data); const sampled = numberClients !== undefined ? 
sample(data, numberClients) : [getFullData(data)]; return sampled.map((sample) => { const items = convertItems(sample.x, dataset); const labels = convertLabels(sample.y, dataset, numberLabelClasses); return new common_1.DataSubset({ items, labels }); }); } exports.readRawDataFile = readRawDataFile; const fileNames = { train: "train/all_data_niid_0_keep_0_train_9.json", test: "test/all_data_niid_0_keep_0_test_9.json", }; function getFilepaths(dataset) { return { train: getFilepath(dataset, fileNames.train), test: getFilepath(dataset, fileNames.test), }; } exports.getFilepaths = getFilepaths; function getFilepath(dataset, filename) { return path.join(common_1.absolutePath.data.raw(`leaf/${dataset}`), filename); } function convertItems(array, dataset) { switch (dataset) { case "shakespeare": return tf.tensor(array.map((sentence) => encodeSentence(sentence))); case "synthetic": return tf.tensor(array, undefined, "float32"); } } exports.convertItems = convertItems; function convertLabels(array, dataset, numberLabelClasses) { switch (dataset) { case "shakespeare": return tf.tensor(array.map((sentence) => oneHotEncode(sentence))); case "synthetic": return tf.oneHot(array, numberLabelClasses); } } exports.convertLabels = convertLabels; async function readLeafJson(file, environment) { const buffer = await coordinator_1.readFileInEnvironment(file, environment); return JSON.parse(buffer.toString("utf-8")); } exports.readLeafJson = readLeafJson; function sample(items, samples) { assert(0 <= samples && samples <= items.length); const reservoir = Reservoir(samples); reservoir.pushSome(...items); return reservoir; } exports.sample = sample; function getFullData(items) { const values = Object.values(items); const x = values.reduce((acc, entry) => acc.concat(entry.x), []); const y = values.reduce((acc, entry) => acc.concat(entry.y), []); return { x, y }; } exports.getFullData = getFullData; function encodeSentence(sentence) { const results = []; for (let i = 0; i < sentence.length; i++) { results.push(encodeChar(sentence[i])); } return results; } exports.encodeSentence = encodeSentence; function encodeChar(char) { return ALL_LETTERS.indexOf(char); } exports.encodeChar = encodeChar; function oneHotEncode(char) { const results = Array(exports.NUM_LETTERS).fill(0); results[ALL_LETTERS.indexOf(char)] = 1; return results; } exports.oneHotEncode = oneHotEncode; //# sourceMappingURL=read-json.js.map
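
A minimal usage sketch, not part of the package: it shards the LEAF "shakespeare" dataset into training shards plus one test shard via processRawData. The require path, the "local" environment value, and the shard count of 10 are assumptions made for illustration; only the exported names and the "shakespeare"/"synthetic" dataset strings come from the module above.

// usage-sketch.js -- hypothetical example; the require path and environment value are assumed.
const { processRawData, encodeSentence, oneHotEncode, NUM_LETTERS } = require("./read-json");

async function main() {
    // "shakespeare" labels are one-hot encoded over ALL_LETTERS,
    // so NUM_LETTERS is the natural number of label classes here.
    const { trainpaths, testpaths } = await processRawData(
        "shakespeare", // dataset
        "local",       // environment (assumed value; selects local files vs. S3)
        10,            // numberClients: number of training shards (assumed)
        NUM_LETTERS    // numberLabelClasses
    );
    console.log(trainpaths, testpaths);

    // Character-level encoding used for the "shakespeare" items and labels.
    console.log(encodeSentence("hello")); // indices into ALL_LETTERS
    console.log(oneHotEncode("a"));       // one-hot vector of length NUM_LETTERS
}

main().catch(console.error);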