dataframe-js
Version:
Immutable and functional data structure for data scientists and developers
1,180 lines (1,105 loc) • 42.3 kB
JavaScript
import { text, json } from "d3-request";
import { dsvFormat } from "d3-dsv";
import {
match,
transpose,
chain,
iter,
arrayEqual,
saveFile,
compare,
asArray,
loadTextFile,
addFileProtocol
} from "./reusables";
import {
ArgumentTypeError,
WrongSchemaError,
MixedTypeError,
FileNotFoundError
} from "./errors";
import Row from "./row";
import GroupedDataFrame from "./groupedDataframe";
// Symbol keys under which every DataFrame instance stores its state:
// `__columns__` holds the column names, `__rows__` holds the Row objects.
// Being Symbols, they do not appear among the instance's string-keyed properties.
const __columns__ = Symbol("columns");
const __rows__ = Symbol("rows");
/**
 * DataFrame data structure providing an immutable, flexible and powerful way to manipulate data with columns and rows.
 */
class DataFrame {
    static defaultModules = [];

    /**
     * Set the default modules used in DataFrame instances.
     * @param {...Object} defaultModules DataFrame modules used by default.
     * @example
     * DataFrame.setDefaultModules(SQL, Stat)
     */
    static setDefaultModules(...defaultModules) {
        DataFrame.defaultModules = defaultModules;
    }

    /**
     * Create a DataFrame from a delimiter separated values text file. It returns a Promise.
     * The Promise rejects with FileNotFoundError when no content could be loaded,
     * and with the parser's error when the content cannot be parsed.
     * @param {String | File} pathOrFile A path to the file (url or local) or a browser File object.
     * @param {String} [sep=';'] The separator used to parse the file.
     * @param {Boolean} [header=true] A boolean indicating if the text has a header or not.
     * @returns {Promise<DataFrame>} A Promise resolving to the parsed DataFrame.
     * @example
     * DataFrame.fromDSV('http://myurl/myfile.txt').then(df => df.show())
     * // In browser only
     * DataFrame.fromDSV(myFile).then(df => df.show())
     * // From node.js only
     * DataFrame.fromDSV('/my/absolute/path/myfile.txt').then(df => df.show())
     * DataFrame.fromDSV('/my/absolute/path/myfile.txt', ';', true).then(df => df.show())
     */
    static fromDSV(pathOrFile, sep = ";", header = true) {
        const parser = dsvFormat(sep);
        return new Promise((resolve, reject) => {
            // NOTE(review): keep this callback at exactly one parameter. d3-request
            // appears to hand one-argument callbacks the response (or null on
            // failure) instead of (error, response) — confirm before changing arity.
            const parseText = fileContent => {
                try {
                    // A missing file shows up either as no content at all or as
                    // text carrying the ENOENT message.
                    if (
                        fileContent == null ||
                        fileContent.includes("Error: ENOENT")
                    ) {
                        return resolve(null);
                    }
                    return resolve(
                        header
                            ? parser.parse(fileContent)
                            : parser.parseRows(fileContent)
                    );
                } catch (error) {
                    // Without this the Promise would stay pending forever.
                    return reject(error);
                }
            };
            return typeof pathOrFile === "string"
                ? text(addFileProtocol(pathOrFile), parseText)
                : loadTextFile(pathOrFile, parseText);
        }).then(fileContent => {
            if (fileContent === null) {
                throw new FileNotFoundError(pathOrFile);
            }
            return new DataFrame(fileContent);
        });
    }

    /**
     * Create a DataFrame from a delimiter separated values text file. It returns a Promise. Alias of DataFrame.fromDSV.
     * @param {String | File} pathOrFile A path to the file (url or local) or a browser File object.
     * @param {String} [sep=';'] The separator used to parse the file.
     * @param {Boolean} [header=true] A boolean indicating if the text has a header or not.
     * @returns {Promise<DataFrame>} A Promise resolving to the parsed DataFrame.
     * @example
     * DataFrame.fromText('http://myurl/myfile.txt').then(df => df.show())
     * // In browser only
     * DataFrame.fromText(myFile).then(df => df.show())
     * // From node.js only
     * DataFrame.fromText('/my/absolute/path/myfile.txt').then(df => df.show())
     * DataFrame.fromText('/my/absolute/path/myfile.txt', ';', true).then(df => df.show())
     */
    static fromText(pathOrFile, sep = ";", header = true) {
        return DataFrame.fromDSV(pathOrFile, sep, header);
    }

    /**
     * Create a DataFrame from a comma separated values file. It returns a Promise.
     * @param {String | File} pathOrFile A path to the file (url or local) or a browser File object.
     * @param {Boolean} [header=true] A boolean indicating if the csv has a header or not.
     * @returns {Promise<DataFrame>} A Promise resolving to the parsed DataFrame.
     * @example
     * DataFrame.fromCSV('http://myurl/myfile.csv').then(df => df.show())
     * // For browser only
     * DataFrame.fromCSV(myFile).then(df => df.show())
     * // From node.js only
     * DataFrame.fromCSV('/my/absolute/path/myfile.csv').then(df => df.show())
     * DataFrame.fromCSV('/my/absolute/path/myfile.csv', true).then(df => df.show())
     */
    static fromCSV(pathOrFile, header = true) {
        return DataFrame.fromDSV(pathOrFile, ",", header);
    }

    /**
     * Create a DataFrame from a tab separated values file. It returns a Promise.
     * @param {String | File} pathOrFile A path to the file (url or local) or a browser File object.
     * @param {Boolean} [header=true] A boolean indicating if the tsv has a header or not.
     * @returns {Promise<DataFrame>} A Promise resolving to the parsed DataFrame.
     * @example
     * DataFrame.fromTSV('http://myurl/myfile.tsv').then(df => df.show())
     * // For browser only
     * DataFrame.fromTSV(myFile).then(df => df.show())
     * // From node.js only
     * DataFrame.fromTSV('/my/absolute/path/myfile.tsv').then(df => df.show())
     * DataFrame.fromTSV('/my/absolute/path/myfile.tsv', true).then(df => df.show())
     */
    static fromTSV(pathOrFile, header = true) {
        return DataFrame.fromDSV(pathOrFile, "\t", header);
    }

    /**
     * Create a DataFrame from a pipe separated values file. It returns a Promise.
     * @param {String | File} pathOrFile A path to the file (url or local) or a browser File object.
     * @param {Boolean} [header=true] A boolean indicating if the psv has a header or not.
     * @returns {Promise<DataFrame>} A Promise resolving to the parsed DataFrame.
     * @example
     * DataFrame.fromPSV('http://myurl/myfile.psv').then(df => df.show())
     * // For browser only
     * DataFrame.fromPSV(myFile).then(df => df.show())
     * // From node.js only
     * DataFrame.fromPSV('/my/absolute/path/myfile.psv').then(df => df.show())
     * DataFrame.fromPSV('/my/absolute/path/myfile.psv', true).then(df => df.show())
     */
    static fromPSV(pathOrFile, header = true) {
        return DataFrame.fromDSV(pathOrFile, "|", header);
    }

    /**
     * Create a DataFrame from a JSON file. It returns a Promise.
     * The Promise rejects with FileNotFoundError when the loaded content is null,
     * and with the JSON.parse error when a File object holds invalid JSON.
     * @param {String | File} pathOrFile A path to the file (url or local) or a browser File object.
     * @returns {Promise<DataFrame>} A Promise resolving to the parsed DataFrame.
     * @example
     * DataFrame.fromJSON('http://myurl/myfile.json').then(df => df.show())
     * // For browser only
     * DataFrame.fromJSON(myFile).then(df => df.show())
     * // From node.js only
     * DataFrame.fromJSON('/my/absolute/path/myfile.json').then(df => df.show())
     */
    static fromJSON(pathOrFile) {
        return new Promise((resolve, reject) => {
            if (typeof pathOrFile === "string") {
                // NOTE(review): `resolve` is passed untouched on purpose; it is a
                // one-argument callback, which d3-request appears to call with the
                // parsed data (or null on failure) — confirm before wrapping it.
                return json(addFileProtocol(pathOrFile), resolve);
            }
            return loadTextFile(pathOrFile, txt => {
                try {
                    resolve(JSON.parse(txt));
                } catch (error) {
                    // Invalid JSON must reject instead of leaving the Promise pending.
                    reject(error);
                }
            });
        }).then(fileContent => {
            if (fileContent === null) {
                throw new FileNotFoundError(pathOrFile);
            }
            return new DataFrame(fileContent);
        });
    }

    /**
     * Create a new DataFrame.
     * The options object is copied, never mutated, and modules listed both as
     * default modules and in options.modules are instantiated only once.
     * @param {Array | Object | DataFrame} data The data of the DataFrame.
     * @param {Array} columns The DataFrame column names.
     * @param {Object} options Additional options. Example: modules.
     * @example
     * new DataFrame({
     *     'column1': [3, 6, 8],
     *     'column2': [3, 4, 5, 6],
     * }, ['column1', 'column2'])
     *
     * new DataFrame([
     *     [1, 6, 9, 10, 12],
     *     [1, 2],
     *     [6, 6, 9, 8, 9, 12],
     * ], ['c1', 'c2', 'c3', 'c4', 'c5', 'c6'])
     *
     * new DataFrame([
     *     {c1: 1, c2: 6, c3: 9, c4: 10, c5: 12},
     *     {c4: 1, c3: 2},
     *     {c1: 6, c5: 6, c2: 9, c4: 8, c3: 9, c6: 12},
     * ], ['c1', 'c2', 'c3', 'c4', 'c5', 'c6'])
     *
     * new DataFrame(df);
     *
     * new DataFrame(yourData, yourColumns, {modules: [MyOwnModule, MyOtherModule]})
     */
    constructor(data, columns, options = {}) {
        [this[__rows__], this[__columns__]] = this._build(data, columns);
        // Derived instances are built from `this.options`, which already contains
        // the default modules: de-duplicate so the list cannot grow on every
        // operation, and copy so the caller's object is left untouched.
        const modules = [
            ...new Set([
                ...DataFrame.defaultModules,
                ...(options.modules || [])
            ])
        ];
        this.options = Object.assign({}, options, { modules });
        Object.assign(this, ...this.__instanciateModules__(modules));
    }

    /**
     * Iterate over the rows of the DataFrame.
     */
    *[Symbol.iterator]() {
        yield* this[__rows__];
    }

    /**
     * Check that two column lists hold the same names at the same positions.
     * @param {Array} columns The first column list.
     * @param {Array} [columns2=this columns] The second column list.
     * @returns {Boolean} True when both lists have the same size and the same entries.
     */
    _columnsAreEquals(columns, columns2 = this[__columns__]) {
        const keys = Object.keys(columns);
        // A strict prefix is not an equal schema: without this check, restructuring
        // to the first columns only would keep rows holding the dropped columns.
        if (keys.length !== Object.keys(columns2).length) return false;
        return keys.every(key => columns[key] === columns2[key]);
    }

    /**
     * Build a DataFrame sharing the options of this one.
     * When the data are Rows which already match the current schema, the rows are
     * reused as they are; otherwise the data go through the regular constructor.
     * @param {Array | Object} data The rows (or raw data) of the new DataFrame.
     * @param {Array} columns The column names of the new DataFrame.
     * @returns {DataFrame} The new DataFrame.
     */
    __newInstance__(data, columns) {
        if (!this._columnsAreEquals(columns) || !(data[0] instanceof Row)) {
            return new DataFrame(data, columns, this.options);
        }
        const firstRowColumns = Object.keys(data[0].toDict());
        if (!arrayEqual(firstRowColumns, this[__columns__], true)) {
            return new DataFrame(data, firstRowColumns, this.options);
        }
        // Fast path: the rows already have the right shape, skip rebuilding them.
        const newInstance = new DataFrame([], [], this.options);
        newInstance[__rows__] = [...data];
        newInstance[__columns__] = [...columns];
        return newInstance;
    }

    /**
     * Instantiate each module with a DataFrame and key it by the instance name.
     * @param {Array} modules The module constructors.
     * @param {DataFrame} [df=this] The DataFrame given to each module.
     * @returns {Array<Object>} One {name: instance} object per module.
     */
    __instanciateModules__(modules, df = undefined) {
        return modules.map(Plugin => {
            const pluginInstance = new Plugin(df ? df : this);
            return { [pluginInstance.name]: pluginInstance };
        });
    }

    /**
     * Convert the constructor input into a [rows, columns] pair.
     * When no columns are given for an Array input, they are inferred from the
     * keys of the first ten and the last ten rows.
     * @param {Array | Object | DataFrame} data The input data.
     * @param {Array} [columns] The column names.
     * @returns {Array} A [rows, columns] pair.
     * @throws {ArgumentTypeError} When data is neither a DataFrame, an Array nor an Object.
     */
    _build(data, columns) {
        const inferColumns = () => [
            ...new Set(
                // slice(-10) and not slice(-10, -1): the last row must take part
                // in the inference too.
                [...data.slice(0, 10), ...data.slice(-10)]
                    .map(row => Object.keys(row))
                    .reduce((p, n) => [...p, ...n], [])
            )
        ];
        return match(
            data,
            [
                value => value instanceof DataFrame,
                () =>
                    this._fromArray(
                        [...data[__rows__]],
                        columns ? columns : data[__columns__]
                    )
            ],
            [
                value => value instanceof Array && value.length !== 0,
                () => this._fromArray(data, columns ? columns : inferColumns())
            ],
            [
                value => value instanceof Array && value.length === 0,
                () => this._fromArray(data, columns ? columns : [])
            ],
            [
                value => value instanceof Object,
                () =>
                    this._fromDict(data, columns ? columns : Object.keys(data))
            ],
            [
                () => true,
                () => {
                    throw new ArgumentTypeError(
                        data,
                        "DataFrame | Array | Object"
                    );
                }
            ]
        );
    }

    /**
     * Build rows from a {columnName: values} object.
     * @param {Object} dict The columns of values.
     * @param {Array} columns The column names.
     * @returns {Array} A [rows, columns] pair.
     */
    _fromDict(dict, columns) {
        return [
            transpose(Object.values(dict)).map(row => new Row(row, columns)),
            columns
        ];
    }

    /**
     * Build rows from an Array of row-like items.
     * @param {Array} array The row-like items.
     * @param {Array} columns The column names.
     * @returns {Array} A [rows, columns] pair.
     */
    _fromArray(array, columns) {
        return [array.map(row => new Row(row, columns)), columns];
    }

    /**
     * Walk the groups of gdf1 and combine each of them with the group of gdf2
     * having the same hash, when there is one.
     * @param {GroupedDataFrame} gdf1 The grouped DataFrame to walk.
     * @param {GroupedDataFrame} gdf2 The grouped DataFrame to look up.
     * @param {String} type "full" keeps every group, "out" keeps the groups whose hash
     * is missing from gdf2, any other value keeps the groups whose hash is in gdf2.
     * @param {Array} newColumns The columns of the combined groups.
     * @returns {Array<DataFrame>} The groups which were kept.
     */
    _joinByType(gdf1, gdf2, type, newColumns) {
        // A Set keeps the membership test constant-time for each group.
        const gdf2Hashs = new Set(gdf2.listHashs());
        return gdf1
            .toCollection()
            .map(({ group, hash }) => {
                const isContained = gdf2Hashs.has(hash);
                const matching = gdf2.get(hash);
                let modifiedGroup = group;
                if (matching) {
                    const gdf2Collection = matching.group.toCollection();
                    // Every row is paired with every matching row; on a shared
                    // key, the value coming from gdf1 wins.
                    const combinedGroup = group
                        .toCollection()
                        .map(row =>
                            gdf2Collection.map(row2 =>
                                Object.assign({}, row2, row)
                            )
                        )
                        .reduce((p, n) => [...p, ...n], []);
                    modifiedGroup = this.__newInstance__(
                        combinedGroup,
                        newColumns
                    );
                }
                if (type === "full") return modifiedGroup;
                const isKept = type === "out" ? !isContained : isContained;
                return isKept ? modifiedGroup : false;
            })
            .filter(group => group);
    }

    /**
     * Shared implementation of the joins and of the diff.
     * @param {DataFrame} dfToJoin The DataFrame to join.
     * @param {String | Array} columnNames The columns used for the join.
     * @param {Array<String>} types The _joinByType modes: types[0] is applied from this
     * DataFrame to dfToJoin, types[1] (optional) from dfToJoin to this DataFrame.
     * @returns {DataFrame} The union of the kept groups, without duplicates.
     * @throws {ArgumentTypeError} When dfToJoin is not a DataFrame.
     */
    _join(dfToJoin, columnNames, types) {
        if (!(dfToJoin instanceof DataFrame)) {
            throw new ArgumentTypeError(dfToJoin, "DataFrame");
        }
        const newColumns = [
            ...new Set([...this.listColumns(), ...dfToJoin.listColumns()])
        ];
        const columns = Array.isArray(columnNames)
            ? columnNames
            : [columnNames];
        const gdf = this.groupBy(...columns);
        const gdfToJoin = dfToJoin.groupBy(...columns);
        return [
            // An empty DataFrame seeds the reduction, so it also works when no
            // group is kept.
            this.__newInstance__([], newColumns),
            ...iter(
                [
                    ...(types[0]
                        ? this._joinByType(gdf, gdfToJoin, types[0], newColumns)
                        : []),
                    ...(types[1]
                        ? this._joinByType(gdfToJoin, gdf, types[1], newColumns)
                        : [])
                ],
                group => group.restructure(newColumns)
            )
        ]
            .reduce((p, n) => p.union(n))
            .dropDuplicates();
    }

    /**
     * Replace the first "file://" of a save path by "/".
     * @param {String} path The path to clean.
     * @returns {String} The cleaned path.
     */
    _cleanSavePath(path) {
        return path.replace("file://", "/");
    }

    /**
     * Convert DataFrame into dict / hash / object.
     * @returns {Object} The DataFrame converted into dict.
     * @example
     * df.toDict()
     */
    toDict() {
        return Object.assign(
            {},
            ...Object.entries(this.transpose().toArray()).map(
                ([index, column]) => ({ [this[__columns__][index]]: column })
            )
        );
    }

    /**
     * Convert DataFrame into Array of Arrays. You can also extract only one column as Array.
     * @param {String} [columnName] Column Name to extract. By default, all columns are transformed.
     * @returns {Array} The DataFrame (or the column) converted into Array.
     * @example
     * df.toArray()
     */
    toArray(columnName) {
        return columnName
            ? [...this].map(row => row.get(columnName))
            : [...this].map(row => row.toArray());
    }

    /**
     * Convert DataFrame into Array of dictionaries. You can also return Rows instead of dictionaries.
     * @param {Boolean} [ofRows] Return a collection of Rows instead of dictionaries.
     * @returns {Array} The DataFrame converted into Array of dictionaries (or Rows).
     * @example
     * df.toCollection()
     */
    toCollection(ofRows) {
        return ofRows ? [...this] : [...this].map(row => row.toDict());
    }

    /**
     * Convert the DataFrame into a text delimiter separated values.
     * You can also save the file if you are using nodejs.
     * @param {String} [sep=';'] Column separator.
     * @param {Boolean} [header=true] Writing the header in the first line. If false, there will be no header.
     * @param {String} [path] The path to save the file. /!\ Works only on node.js, not into the browser.
     * @returns {String} The text file in raw string.
     * @example
     * df.toDSV()
     * df.toDSV(';')
     * df.toDSV(';', true)
     * // From node.js only
     * df.toDSV(';', true, '/my/absolute/path/dataframe.txt')
     */
    toDSV(sep = ";", header = true, path = undefined) {
        const parser = dsvFormat(sep);
        const csvContent = header
            ? parser.format(this.toCollection(), this[__columns__])
            : parser.formatRows(this.toArray());
        if (path) {
            saveFile(this._cleanSavePath(path), csvContent);
        }
        return csvContent;
    }

    /**
     * Convert the DataFrame into a text delimiter separated values. Alias for .toDSV.
     * You can also save the file if you are using nodejs.
     * @param {String} [sep=';'] Column separator.
     * @param {Boolean} [header=true] Writing the header in the first line. If false, there will be no header.
     * @param {String} [path] The path to save the file. /!\ Works only on node.js, not into the browser.
     * @returns {String} The text file in raw string.
     * @example
     * df.toText()
     * df.toText(';')
     * df.toText(';', true)
     * // From node.js only
     * df.toText(';', true, '/my/absolute/path/dataframe.txt')
     */
    toText(sep = ";", header = true, path = undefined) {
        return this.toDSV(sep, header, path);
    }

    /**
     * Convert the DataFrame into a comma separated values string.
     * You can also save the file if you are using nodejs.
     * @param {Boolean} [header=true] Writing the header in the first line. If false, there will be no header.
     * @param {String} [path] The path to save the file. /!\ Works only on node.js, not into the browser.
     * @returns {String} The csv file in raw string.
     * @example
     * df.toCSV()
     * df.toCSV(true)
     * // From node.js only
     * df.toCSV(true, '/my/absolute/path/dataframe.csv')
     */
    toCSV(header = true, path = undefined) {
        return this.toDSV(",", header, path);
    }

    /**
     * Convert the DataFrame into a tab separated values string.
     * You can also save the file if you are using nodejs.
     * @param {Boolean} [header=true] Writing the header in the first line. If false, there will be no header.
     * @param {String} [path] The path to save the file. /!\ Works only on node.js, not into the browser.
     * @returns {String} The tsv file in raw string.
     * @example
     * df.toTSV()
     * df.toTSV(true)
     * // From node.js only
     * df.toTSV(true, '/my/absolute/path/dataframe.tsv')
     */
    toTSV(header = true, path = undefined) {
        return this.toDSV("\t", header, path);
    }

    /**
     * Convert the DataFrame into a pipe separated values string.
     * You can also save the file if you are using nodejs.
     * @param {Boolean} [header=true] Writing the header in the first line. If false, there will be no header.
     * @param {String} [path] The path to save the file. /!\ Works only on node.js, not into the browser.
     * @returns {String} The psv file in raw string.
     * @example
     * df.toPSV()
     * df.toPSV(true)
     * // From node.js only
     * df.toPSV(true, '/my/absolute/path/dataframe.psv')
     */
    toPSV(header = true, path = undefined) {
        return this.toDSV("|", header, path);
    }

    /**
     * Convert the DataFrame into a json string. You can also save the file if you are using nodejs.
     * @param {Boolean} [asCollection=false] Writing the JSON as collection of Object.
     * @param {String} [path] The path to save the file. /!\ Works only on node.js, not into the browser.
     * @returns {String} The json file in raw string.
     * @example
     * df.toJSON()
     * // From node.js only
     * df.toJSON(true, '/my/absolute/path/dataframe.json')
     */
    toJSON(asCollection = false, path = undefined) {
        const jsonContent = JSON.stringify(
            asCollection ? this.toCollection() : this.toDict()
        );
        if (path) {
            saveFile(this._cleanSavePath(path), jsonContent);
        }
        return jsonContent;
    }

    /**
     * Display the DataFrame as String Table. Can only return a string instead of displaying the DataFrame.
     * @param {Number} [rows=10] The number of lines to display.
     * @param {Boolean} [quiet=false] Quiet mode. If true, only returns a string instead of console.log().
     * @returns {String} The DataFrame as String Table.
     * @example
     * df.show()
     * df.show(10)
     * const stringDF = df.show(10, true)
     */
    show(rows = 10, quiet = false) {
        // Each cell is exactly 9 characters wide: longer values are cut to 6
        // characters followed by "...", shorter ones are padded with spaces.
        const makeRow = row =>
            `| ${row
                .map(column => {
                    const columnAsString = String(column);
                    return columnAsString.length > 9
                        ? columnAsString.substring(0, 6) + "..."
                        : columnAsString +
                              Array(10 - columnAsString.length).join(" ");
                })
                .join(" | ")} |`;
        const header = makeRow(this[__columns__]);
        let token = 0;
        const toShow = [
            header,
            Array(header.length).join("-"),
            ...iter(
                this[__rows__],
                row => {
                    token++;
                    return makeRow(row.toArray());
                },
                // Third argument of iter: stop once `rows` lines were produced.
                () => token >= rows
            )
        ].join("\n");
        if (!quiet) {
            console.log(toShow);
        }
        return toShow;
    }

    /**
     * Get the DataFrame dimensions.
     * @returns {Array} The DataFrame dimensions. [height, width]
     * @example
     * const [height, width] = df.dim()
     */
    dim() {
        return [this.count(), this[__columns__].length];
    }

    /**
     * Transpose a DataFrame. Rows become columns and conversely. n x p => p x n.
     * The new columns are named after the former row indexes (0, 1, 2...).
     * @param {Boolean} [transposeColumnNames=false] An option to transpose columnNames in a rowNames column.
     * @returns {DataFrame} A new transposed DataFrame.
     * @example
     * df.transpose()
     */
    transpose(transposeColumnNames) {
        // The optional "rowNames" column comes last, because the column names are
        // pushed below as the last row before transposing.
        const newColumns = [
            ...Array(this.count()).keys(),
            ...(transposeColumnNames ? ["rowNames"] : [])
        ];
        const transposedRows = transpose(
            (transposeColumnNames
                ? this.push(this[__columns__])
                : this
            ).toArray()
        );
        return this.__newInstance__(transposedRows, newColumns).restructure(
            newColumns
        );
    }

    /**
     * Get the rows number.
     * @returns {Int} The number of DataFrame rows.
     * @example
     * df.count()
     */
    count() {
        return this[__rows__].length;
    }

    /**
     * Get the count of a value into a column.
     * @param valueToCount The value to count into the selected column.
     * @param {String} [columnName=this.listColumns()[0]] The column to count the value.
     * @returns {Int} The number of times the selected value appears.
     * @example
     * df.countValue(5, 'column2')
     * df.select('column1').countValue(5)
     */
    countValue(valueToCount, columnName = this[__columns__][0]) {
        return this.filter(row => row.get(columnName) === valueToCount).count();
    }

    /**
     * Push new rows into the DataFrame.
     * @param {...(Array | Row)} rows The rows to add.
     * @returns {DataFrame} A new DataFrame with the new rows.
     * @example
     * df.push([1,2,3], [1,4,9])
     */
    push(...rows) {
        return this.union(new DataFrame(rows, this[__columns__]));
    }

    /**
     * Replace a value by another in all the DataFrame or in a column.
     * @param value The value to replace.
     * @param replacement The new value.
     * @param {String | Array} [columnNames=this.listColumns()] The columns to apply the replacement.
     * @returns {DataFrame} A new DataFrame with replaced values.
     * @example
     * df.replace(undefined, 0)
     * df.replace(undefined, 0, 'column1')
     * df.replace(undefined, 0, ['column1', 'column2'])
     */
    replace(value, replacement, columnNames) {
        const columns = asArray(columnNames);
        return this.map(row =>
            (columns.length > 0 ? columns : this[__columns__]).reduce(
                (p, n) => (p.get(n) === value ? p.set(n, replacement) : p),
                row
            )
        );
    }

    /**
     * Compute unique values into a column.
     * @param {String} columnName The column to distinct.
     * @returns {DataFrame} A DataFrame containing the column with distinct values.
     * @example
     * df.distinct('column1')
     */
    distinct(columnName) {
        return this.__newInstance__(
            { [columnName]: [...new Set(this.toArray(columnName))] },
            [columnName]
        );
    }

    /**
     * Compute unique values into a column.
     * Alias from .distinct()
     * @param {String} columnName The column to distinct.
     * @returns {DataFrame} A DataFrame containing the column with distinct values.
     * @example
     * df.unique('column1')
     */
    unique(columnName) {
        return this.distinct(columnName);
    }

    /**
     * List DataFrame columns.
     * @returns {Array} An Array (a copy) containing DataFrame columnNames.
     * @example
     * df.listColumns()
     */
    listColumns() {
        return [...this[__columns__]];
    }

    /**
     * Select columns in the DataFrame.
     * @param {...String} columnNames The columns to select.
     * @returns {DataFrame} A new DataFrame containing selected columns.
     * @example
     * df.select('column1', 'column3')
     */
    select(...columnNames) {
        return this.__newInstance__(
            this[__rows__].map(row => row.select(...columnNames)),
            columnNames
        );
    }

    /**
     * Add a new column or set an existing one.
     * @param {String} columnName The column to modify or to create.
     * @param {Function} [func=(row, index) => undefined] The function to create the column.
     * @returns {DataFrame} A new DataFrame containing the new or modified column.
     * @example
     * df.withColumn('column4', () => 2)
     * df.withColumn('column2', (row) => row.get('column2') * 2)
     */
    withColumn(columnName, func = () => undefined) {
        return this.__newInstance__(
            this[__rows__].map((row, index) =>
                row.set(columnName, func(row, index))
            ),
            this[__columns__].includes(columnName)
                ? this[__columns__]
                : [...this[__columns__], columnName]
        );
    }

    /**
     * Modify the structure of the DataFrame by changing columns order, creating new columns or removing some columns.
     * @param {Array} newColumnNames The new columns of the DataFrame.
     * @returns {DataFrame} A new DataFrame with restructured columns (renamed, add or deleted).
     * @example
     * df.restructure(['column1', 'column4', 'column2', 'column3'])
     * df.restructure(['column1', 'column4'])
     * df.restructure(['column1', 'newColumn', 'column4'])
     */
    restructure(newColumnNames) {
        return this.__newInstance__(this[__rows__], newColumnNames);
    }

    /**
     * Rename each column.
     * @param {Array} newColumnNames The new column names of the DataFrame.
     * @returns {DataFrame} A new DataFrame with the new column names.
     * @throws {WrongSchemaError} When the number of names differs from the number of columns.
     * @example
     * df.renameAll(['column1', 'column3', 'column4'])
     */
    renameAll(newColumnNames) {
        if (newColumnNames.length !== this[__columns__].length) {
            throw new WrongSchemaError(newColumnNames, this[__columns__]);
        }
        return this.__newInstance__(this.toArray(), newColumnNames);
    }

    /**
     * Rename a column.
     * @param {String} columnName The column to rename.
     * @param {String} replacement The new name for the column.
     * @returns {DataFrame} A new DataFrame with the new column name.
     * @example
     * df.rename('column1', 'columnRenamed')
     */
    rename(columnName, replacement) {
        const newColumnNames = this[__columns__].map(column =>
            column === columnName ? replacement : column
        );
        return this.renameAll(newColumnNames);
    }

    /**
     * Cast each column into a given type.
     * @param {Array} typeFunctions The functions used to cast columns.
     * @returns {DataFrame} A new DataFrame with the columns having new types.
     * @throws {WrongSchemaError} When the number of functions differs from the number of columns.
     * @example
     * df.castAll([Number, String, (val) => new CustomClass(val)])
     */
    castAll(typeFunctions) {
        if (typeFunctions.length !== this[__columns__].length) {
            throw new WrongSchemaError(typeFunctions, this[__columns__]);
        }
        return this.map(
            row =>
                new Row(
                    row
                        .toArray()
                        .map((column, index) => typeFunctions[index](column)),
                    this[__columns__]
                )
        );
    }

    /**
     * Cast a column into a given type.
     * @param {String} columnName The column to cast.
     * @param {Function} typeFunction The function used to cast the column.
     * @returns {DataFrame} A new DataFrame with the column having a new type.
     * @example
     * df.cast('column1', Number)
     * df.cast('column1', (val) => new MyCustomClass(val))
     */
    cast(columnName, typeFunction) {
        return this.withColumn(columnName, row =>
            typeFunction(row.get(columnName))
        );
    }

    /**
     * Remove a single column.
     * @param {String} columnName The column to drop.
     * @returns {DataFrame} A new DataFrame without the dropped column.
     * @example
     * df.drop('column2')
     */
    drop(columnName) {
        return this.__newInstance__(
            this[__rows__].map(row => row.delete(columnName)),
            this[__columns__].filter(column => column !== columnName)
        );
    }

    /**
     * Chain maps and filters functions on DataFrame by optimizing their executions.
     * If a function returns boolean, it's a filter. Else it's a map.
     * It can be 10 - 100 x faster than standard chains of .map() and .filter().
     * @param {...Function} funcs Functions to apply on the DataFrame rows taking the row as parameter.
     * @returns {DataFrame} A new DataFrame with modified rows.
     * @example
     * df.chain(
     *      row => row.get('column1') > 3, // filter
     *      row => row.set('column1', 3),  // map
     *      row => row.get('column2') === '5' // filter
     * )
     */
    chain(...funcs) {
        return this.__newInstance__(
            [...chain(this[__rows__], ...funcs)],
            this[__columns__]
        );
    }

    /**
     * Filter DataFrame rows.
     * With a column/value object, a row is kept when each listed column holds the
     * given value (compared with Object.is); an empty object keeps every row.
     * When no row is kept, the returned DataFrame has neither rows nor columns.
     * @param {Function | Object} condition A filter function or a column/value object.
     * @returns {DataFrame} A new filtered DataFrame.
     * @example
     * df.filter(row => row.get('column1') >= 3)
     * df.filter({'column2': 5, 'column1': 3})
     */
    filter(condition) {
        const func =
            typeof condition === "object"
                ? row =>
                      // every() also copes with an empty condition object.
                      Object.entries(condition).every(([column, value]) =>
                          Object.is(row.get(column), value)
                      )
                : condition;
        const filteredRows = [
            ...iter(this[__rows__], (row, i) => (func(row, i) ? row : false))
        ];
        return filteredRows.length > 0
            ? this.__newInstance__(filteredRows, this[__columns__])
            : this.__newInstance__([], []);
    }

    /**
     * Filter DataFrame rows.
     * Alias of .filter()
     * @param {Function | Object} condition A filter function or a column/value object.
     * @returns {DataFrame} A new filtered DataFrame.
     * @example
     * df.where(row => row.get('column1') >= 3)
     * df.where({'column2': 5, 'column1': 3})
     */
    where(condition) {
        return this.filter(condition);
    }

    /**
     * Find a row (the first met) based on a condition.
     * @param {Function | Object} condition A filter function or a column/value object.
     * @returns {Row} The targeted Row, or undefined when no row matches.
     * @example
     * df.find(row => row.get('column1') === 3)
     * df.find({'column1': 3})
     */
    find(condition) {
        return this.filter(condition)[__rows__][0];
    }

    /**
     * Map on DataFrame rows. /!\ Prefer to use .chain().
     * @param {Function} func A function to apply on each row taking the row (and its index) as parameter.
     * @returns {DataFrame} A new DataFrame with modified rows.
     * @example
     * df.map(row => row.set('column1', row.get('column1') * 2))
     */
    map(func) {
        return this.__newInstance__(
            [...iter(this[__rows__], (row, i) => func(row, i))],
            this[__columns__]
        );
    }

    /**
     * Reduce DataFrame into a value.
     * @param {Function} func The reduce function taking 2 parameters, previous and next.
     * @param [init] The initial value of the reducer.
     * @returns A reduced value.
     * @example
     * df.reduce((p, n) => n.get('column1') + p, 0)
     * df2.reduce((p, n) => (
     *          n.set('column1', p.get('column1') + n.get('column1'))
     *           .set('column2', p.get('column2') + n.get('column2'))
     * ))
     */
    reduce(func, init) {
        // An undefined init means "no initial value", so it is not forwarded.
        return typeof init === "undefined"
            ? this[__rows__].reduce((p, n) => func(p, n))
            : this[__rows__].reduce((p, n) => func(p, n), init);
    }

    /**
     * Reduce DataFrame into a value, starting from the last row (see .reduce()).
     * @param {Function} func The reduce function taking 2 parameters, previous and next.
     * @param [init] The initial value of the reducer.
     * @returns A reduced value.
     * @example
     * df.reduceRight((p, n) => p > n ? p : n, 0)
     */
    reduceRight(func, init) {
        return typeof init === "undefined"
            ? this[__rows__].reduceRight((p, n) => func(p, n))
            : this[__rows__].reduceRight((p, n) => func(p, n), init);
    }

    /**
     * Return a DataFrame without duplicated rows.
     * @param {...String} columnNames The columns used to check unicity of rows. If omitted, unicity is checked on all columns.
     * @returns {DataFrame} A DataFrame without duplicated rows.
     * @example
     * df.dropDuplicates('id', 'name')
     */
    dropDuplicates(...columnNames) {
        const groupCols =
            columnNames.length > 0 ? columnNames : this[__columns__];
        return this.groupBy(...groupCols).filter((row, i) => i === 0);
    }

    /**
     * Return a shuffled DataFrame rows.
     * @returns {DataFrame} A shuffled DataFrame.
     * @example
     * df.shuffle()
     */
    shuffle() {
        // Fisher-Yates shuffle on a copy: the source rows are left untouched, and
        // empty or single-row DataFrames need no special case.
        const shuffledRows = [...this[__rows__]];
        for (let i = shuffledRows.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [shuffledRows[i], shuffledRows[j]] = [
                shuffledRows[j],
                shuffledRows[i]
            ];
        }
        return this.__newInstance__(shuffledRows, this[__columns__]);
    }

    /**
     * Return a random sample of rows.
     * @param {Number} percentage A percentage of the original DataFrame giving the sample size.
     * @returns {DataFrame} A sample DataFrame
     * @example
     * df.sample(0.3)
     */
    sample(percentage) {
        const nRows = this.count() * percentage;
        let token = 0;
        return this.__newInstance__(
            [
                ...iter(
                    this.shuffle()[__rows__],
                    row => {
                        token++;
                        return row;
                    },
                    () => token >= nRows
                )
            ],
            this[__columns__]
        );
    }

    /**
     * Randomly split a DataFrame into 2 DataFrames.
     * @param {Number} percentage A percentage of the original DataFrame giving the first DataFrame size. The second takes the rest.
     * @returns {Array} An Array containing the two DataFrames. First, the X% DataFrame then the rest DataFrame.
     * @example
     * const [df30, df70] = df.bisect(0.3)
     */
    bisect(percentage) {
        const nRows = this.count() * percentage;
        let token = 0;
        const restRows = [];
        return [
            this.__newInstance__(
                [
                    ...iter(this.shuffle()[__rows__], row => {
                        if (token < nRows) {
                            token++;
                            return row;
                        }
                        // Rows beyond the quota go to the second DataFrame; nothing
                        // is returned for them here.
                        restRows.push(row);
                    })
                ],
                this[__columns__]
            ),
            this.__newInstance__(restRows, this[__columns__])
        ];
    }

    /**
     * Group DataFrame rows by columns giving a GroupedDataFrame object. See its doc for more examples.
     * @param {...String} columnNames The columns used for the groupBy.
     * @returns {GroupedDataFrame} A GroupedDataFrame object.
     * @example
     * df.groupBy('column1')
     * df.groupBy('column1', 'column2')
     * df.groupBy('column1', 'column2').listGroups()
     * df.groupBy('column1', 'column2').show()
     * df.groupBy('column1', 'column2').aggregate((group) => group.count())
     */
    groupBy(...columnNames) {
        return new GroupedDataFrame(this, ...columnNames);
    }

    /**
     * Sort DataFrame rows based on column values. The row should contains only one variable type. Columns are sorted left-to-right.
     * The current DataFrame is left untouched.
     * @param {String | Array<string>} columnNames The columns giving order.
     * @param {Boolean} [reverse=false] Reverse mode. Reverse the order if true.
     * @returns {DataFrame} An ordered DataFrame.
     * @throws {MixedTypeError} When two compared values of a column have different types.
     * @example
     * df.sortBy('id')
     * df.sortBy(['id1', 'id2'])
     * df.sortBy(['id1'], true)
     */
    sortBy(columnNames, reverse = false) {
        // ensure unique columns
        const _columnNames = Array.from(new Set(asArray(columnNames)));
        // Array.prototype.sort works in place: sort a copy so the rows of this
        // DataFrame keep their order.
        const sortedRows = [...this[__rows__]].sort((p, n) =>
            _columnNames
                .map(col => {
                    const [pValue, nValue] = [p.get(col), n.get(col)];
                    if (typeof pValue !== typeof nValue) {
                        throw new MixedTypeError();
                    }
                    return compare(pValue, nValue, reverse);
                })
                // The first non-zero comparison decides; the initial 0 also
                // covers an empty list of columns.
                .reduce((acc, curr) => acc || curr, 0)
        );
        return this.__newInstance__(sortedRows, this[__columns__]);
    }

    /**
     * Concat two DataFrames.
     * @param {DataFrame} dfToUnion The DataFrame to concat.
     * @returns {DataFrame} A new concatenated DataFrame resulting of the union.
     * @throws {ArgumentTypeError} When dfToUnion is not a DataFrame.
     * @throws {WrongSchemaError} When the columns of both DataFrames do not match.
     * @example
     * df.union(df2)
     */
    union(dfToUnion) {
        if (!(dfToUnion instanceof DataFrame)) {
            throw new ArgumentTypeError(dfToUnion, "DataFrame");
        }
        if (!arrayEqual(this[__columns__], dfToUnion[__columns__])) {
            throw new WrongSchemaError(
                dfToUnion[__columns__],
                this[__columns__]
            );
        }
        return this.__newInstance__(
            [...this, ...dfToUnion.restructure(this[__columns__])],
            this[__columns__]
        );
    }

    /**
     * Join two DataFrames.
     * @param {DataFrame} dfToJoin The DataFrame to join.
     * @param {String | Array} columnNames The selected columns for the join.
     * @param {String} [how='inner'] The join mode. Can be: full, inner, outer, left, right.
     * @returns {DataFrame} The joined DataFrame.
     * @throws {ArgumentTypeError} When how is not one of the supported join modes.
     * @example
     * df.join(df2, 'column1', 'full')
     */
    join(dfToJoin, columnNames, how = "inner") {
        const joinMethods = {
            inner: () => this.innerJoin(dfToJoin, columnNames),
            full: () => this.fullJoin(dfToJoin, columnNames),
            outer: () => this.outerJoin(dfToJoin, columnNames),
            left: () => this.leftJoin(dfToJoin, columnNames),
            right: () => this.rightJoin(dfToJoin, columnNames)
        };
        // Own-property check: inherited keys such as "constructor" must not be
        // accepted as join modes.
        if (!Object.prototype.hasOwnProperty.call(joinMethods, how)) {
            throw new ArgumentTypeError(
                how,
                "full | inner | outer | left | right"
            );
        }
        return joinMethods[how]();
    }

    /**
     * Join two DataFrames with inner mode.
     * @param {DataFrame} dfToJoin The DataFrame to join.
     * @param {String | Array} columnNames The selected columns for the join.
     * @returns {DataFrame} The joined DataFrame.
     * @example
     * df.innerJoin(df2, 'id')
     * df.join(df2, 'id')
     * df.join(df2, 'id', 'inner')
     */
    innerJoin(dfToJoin, columnNames) {
        return this._join(dfToJoin, columnNames, ["in"]);
    }

    /**
     * Join two DataFrames with full mode.
     * @param {DataFrame} dfToJoin The DataFrame to join.
     * @param {String | Array} columnNames The selected columns for the join.
     * @returns {DataFrame} The joined DataFrame.
     * @example
     * df.fullJoin(df2, 'id')
     * df.join(df2, 'id', 'full')
     */
    fullJoin(dfToJoin, columnNames) {
        return this._join(dfToJoin, columnNames, ["full", "full"]);
    }

    /**
     * Join two DataFrames with outer mode. It delegates to .fullJoin().
     * @param {DataFrame} dfToJoin The DataFrame to join.
     * @param {String | Array} columnNames The selected columns for the join.
     * @returns {DataFrame} The joined DataFrame.
     * @example
     * df.outerJoin(df2, 'id')
     * df.join(df2, 'id', 'outer')
     */
    outerJoin(dfToJoin, columnNames) {
        return this.fullJoin(dfToJoin, columnNames);
    }

    /**
     * Join two DataFrames with left mode.
     * @param {DataFrame} dfToJoin The DataFrame to join.
     * @param {String | Array} columnNames The selected columns for the join.
     * @returns {DataFrame} The joined DataFrame.
     * @example
     * df.leftJoin(df2, 'id')
     * df.join(df2, 'id', 'left')
     */
    leftJoin(dfToJoin, columnNames) {
        return this._join(dfToJoin, columnNames, ["full", "in"]);
    }

    /**
     * Join two DataFrames with right mode.
     * @param {DataFrame} dfToJoin The DataFrame to join.
     * @param {String | Array} columnNames The selected columns for the join.
     * @returns {DataFrame} The joined DataFrame.
     * @example
     * df.rightJoin(df2, 'id')
     * df.join(df2, 'id', 'right')
     */
    rightJoin(dfToJoin, columnNames) {
        return this._join(dfToJoin, columnNames, ["in", "full"]);
    }

    /**
     * Find the differences between two DataFrames (reverse of join).
     * @param {DataFrame} dfToDiff The DataFrame to diff.
     * @param {String | Array} columnNames The selected columns for the diff.
     * @returns {DataFrame} The differences DataFrame.
     * @example
     * df.diff(df2, 'id')
     */
    diff(dfToDiff, columnNames) {
        return this._join(dfToDiff, columnNames, ["out", "out"]);
    }
}
// The DataFrame class is the default export of this module.
export default DataFrame;