"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.oneHotEncode = exports.encodeChar = exports.encodeSentence = exports.getFullData = exports.sample = exports.readLeafJson = exports.convertLabels = exports.convertItems = exports.getFilepaths = exports.readRawDataFile = exports.processRawDataFromFilepaths = exports.processRawData = exports.NUM_LETTERS = void 0;
const tslib_1 = require("tslib");
const path = tslib_1.__importStar(require("path"));
const assert = require("assert");
const tf = tslib_1.__importStar(require("@tensorflow/tfjs-node"));
const Reservoir = require("reservoir");
const common_1 = require("../../../common");
const coordinator_1 = require("../../../coordinator");
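// Character vocabulary of the Leaf Shakespeare dataset (80 distinct characters).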
const ALL_LETTERS = "\n !\"&'(),-.0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz}";
exports.NUM_LETTERS = ALL_LETTERS.length;
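/**
 * Read the raw Leaf data, split the training set into shards, and save
 * the shards to disk
 *
 * @param dataset The Leaf dataset to be used
 * @param environment Depending on the environment, data files will be read
 * either from local files, or from S3.
 * @param numberClients Number of shards to create for training
 * @param numberLabelClasses Number of unique labels
 * @returns File paths to the train shards and the single test shard
 */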
async function processRawData(dataset, environment, numberClients, numberLabelClasses) {
    const filepaths = getFilepaths(dataset);
    const trainpaths = await processRawDataFromFilepaths(filepaths.train, environment, dataset, numberLabelClasses, true, numberClients);
    const testpaths = await processRawDataFromFilepaths(filepaths.test, environment, dataset, numberLabelClasses, false);
    return {
        trainpaths: trainpaths,
        testpaths: testpaths[0],
    };
}
exports.processRawData = processRawData;
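// Hypothetical usage (all argument values below are illustrative):
//   const { trainpaths, testpaths } =
//       await processRawData("synthetic", environment, 10, 12);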
/**
 * Create shards for the test and train datasets
 *
 * @param filepaths Holds file paths to both the train and test JSON files
 * @param environment Depending on the environment, data files will be read
 * either from local files, or from S3.
 * @param dataset The Leaf dataset to be used
 * @param numberLabelClasses Number of unique labels
 * @param train True if creating train shards
 * @param numberClients Number of shards to create for training; undefined
 * when creating test shards
 * @returns File paths to the saved shards
 */
async function processRawDataFromFilepaths(filepaths, environment, dataset, numberLabelClasses, train, numberClients) {
    const directory = `data/processed/leaf/${dataset}`;
    common_1.mkdirp(directory);
    const shards = await readRawDataFile(filepaths, environment, dataset, numberLabelClasses, numberClients);
    return Promise.all(shards.map(async (shard, i) => {
        // Await in case save is asynchronous, so the shard's tensors are not
        // disposed early; `shardPath` avoids shadowing the imported `path` module.
        const shardPath = await shard.save(directory, train ? `shard-${i}` : "test");
        shard.dispose();
        return shardPath;
    }));
}
exports.processRawDataFromFilepaths = processRawDataFromFilepaths;
/**
 * Split the corresponding JSON file into shards
 *
 * @param file The path to the train or test JSON file
 * @param environment Depending on the environment, data files will be read
 * either from local files, or from S3.
 * @param dataset The Leaf dataset to be used
 * @param numberLabelClasses Number of unique labels
 * @param numberClients Number of client shards to sample for training;
 * undefined if creating test shards
 * @returns An array of DataSubsets, one per shard
 */
async function readRawDataFile(file, environment, dataset, numberLabelClasses, numberClients) {
    const json = await readLeafJson(file, environment);
    const data = Object.values(json.user_data);
    const sampled = numberClients !== undefined
        ? sample(data, numberClients)
        : [getFullData(data)];
    return sampled.map((userData) => {
        const items = convertItems(userData.x, dataset);
        const labels = convertLabels(userData.y, dataset, numberLabelClasses);
        return new common_1.DataSubset({ items, labels });
    });
}
exports.readRawDataFile = readRawDataFile;
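// File names as produced by the Leaf preprocessing scripts; the suffixes
// presumably encode the sampling options (non-IID split, keep threshold 0,
// 90% train fraction).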
const fileNames = {
    train: "train/all_data_niid_0_keep_0_train_9.json",
    test: "test/all_data_niid_0_keep_0_test_9.json",
};
function getFilepaths(dataset) {
    return {
        train: getFilepath(dataset, fileNames.train),
        test: getFilepath(dataset, fileNames.test),
    };
}
exports.getFilepaths = getFilepaths;
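/**
 * Resolve the path to a raw Leaf data file for the given dataset.
 */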
function getFilepath(dataset, filename) {
    return path.join(common_1.absolutePath.data.raw(`leaf/${dataset}`), filename);
}
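/**
 * Convert raw inputs into a tensor: Shakespeare sentences are encoded as
 * character indices; synthetic features are used as float32 values.
 */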
function convertItems(array, dataset) {
    switch (dataset) {
        case "shakespeare":
            return tf.tensor(array.map((sentence) => encodeSentence(sentence)));
        case "synthetic":
            return tf.tensor(array, undefined, "float32");
    }
}
exports.convertItems = convertItems;
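/**
 * Convert raw labels into a one-hot tensor: Shakespeare labels are single
 * characters; synthetic labels are integer class indices.
 */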
function convertLabels(array, dataset, numberLabelClasses) {
    switch (dataset) {
        case "shakespeare":
            return tf.tensor(array.map((char) => oneHotEncode(char)));
        case "synthetic":
            return tf.oneHot(array, numberLabelClasses);
    }
}
exports.convertLabels = convertLabels;
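/**
 * Read and parse a Leaf JSON file, either from local files or from S3,
 * depending on the environment.
 */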
async function readLeafJson(file, environment) {
    const buffer = await coordinator_1.readFileInEnvironment(file, environment);
    return JSON.parse(buffer.toString("utf-8"));
}
exports.readLeafJson = readLeafJson;
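/**
 * Draw a uniform random sample of `samples` items via reservoir sampling.
 */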
function sample(items, samples) {
    assert(0 <= samples && samples <= items.length);
    const reservoir = Reservoir(samples);
    reservoir.pushSome(...items);
    return reservoir;
}
exports.sample = sample;
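/**
 * Merge all users' data into a single { x, y } pair; used to build the
 * unsharded test set.
 */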
function getFullData(items) {
    const values = Object.values(items);
    const x = values.reduce((acc, entry) => acc.concat(entry.x), []);
    const y = values.reduce((acc, entry) => acc.concat(entry.y), []);
    return { x, y };
}
exports.getFullData = getFullData;
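/**
 * Encode a sentence as an array of character indices into ALL_LETTERS.
 */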
function encodeSentence(sentence) {
    const results = [];
    for (let i = 0; i < sentence.length; i++) {
        results.push(encodeChar(sentence[i]));
    }
    return results;
}
exports.encodeSentence = encodeSentence;
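/**
 * Index of a character in ALL_LETTERS, or -1 for unknown characters.
 */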
function encodeChar(char) {
    return ALL_LETTERS.indexOf(char);
}
exports.encodeChar = encodeChar;
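/**
 * One-hot encode a single character over the ALL_LETTERS vocabulary. Note
 * that an unknown character writes to index -1, a plain property rather
 * than an array slot.
 */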
function oneHotEncode(char) {
    const results = Array(exports.NUM_LETTERS).fill(0);
    results[ALL_LETTERS.indexOf(char)] = 1;
    return results;
}
exports.oneHotEncode = oneHotEncode;
//# sourceMappingURL=read-json.js.map