@eagleoutice/flowr
Version:
Static Dataflow Analyzer and Program Slicer for the R Programming Language
1,136 lines • 54.4 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.mapDataFrameFunctionCall = mapDataFrameFunctionCall;
const config_1 = require("../../../config");
const make_argument_1 = require("../../../dataflow/internal/process/functions/call/argument/make-argument");
const built_in_source_1 = require("../../../dataflow/internal/process/functions/call/built-in/built-in-source");
const r_function_call_1 = require("../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call");
const type_1 = require("../../../r-bridge/lang-4.x/ast/model/type");
const retriever_1 = require("../../../r-bridge/retriever");
const assert_1 = require("../../../util/assert");
const dataframe_domain_1 = require("../dataframe-domain");
const resolve_args_1 = require("../resolve-args");
const arguments_1 = require("./arguments");
const identifier_1 = require("../../../dataflow/environments/identifier");
/**
 * The kinds of R data frames distinguished by the function mappers.
 * Each member maps a symbolic name to its string tag.
 */
const DataFrameType = {
    DataFrame: 'data.frame',
    Tibble: 'tibble',
    DataTable: 'data.table'
};
/**
 * Registry of the explicitly supported concrete data frame functions.
 * Every function name maps to `{ mapper, library, returnType }`: the mapper function that handles the call,
 * the library the function originates from, and the type of the data frame it returns.
 *
 * The registry is assembled from rows of `[mapper, library, returnType, names]`, so that functions
 * sharing the same mapper, library and return type are declared together (insertion order is kept).
 */
const DataFrameFunctionMapper = Object.fromEntries([
    [mapDataFrameCreate, 'base', DataFrameType.DataFrame, ['data.frame']],
    [mapDataFrameConvert, 'base', DataFrameType.DataFrame, ['as.data.frame']],
    [mapDataFrameRead, 'utils', DataFrameType.DataFrame, ['read.table', 'read.csv', 'read.csv2', 'read.delim', 'read.delim2']],
    [mapDataFrameRead, 'readr', DataFrameType.Tibble, ['read_table', 'read_csv', 'read_csv2', 'read_tsv', 'read_delim']],
    [mapDataFrameColBind, 'base', DataFrameType.DataFrame, ['cbind']],
    [mapDataFrameRowBind, 'base', DataFrameType.DataFrame, ['rbind']],
    [mapDataFrameHeadTail, 'utils', DataFrameType.DataFrame, ['head', 'tail']],
    [mapDataFrameSubset, 'base', DataFrameType.DataFrame, ['subset']],
    [mapDataFrameFilter, 'dplyr', DataFrameType.DataFrame, ['filter']],
    [mapDataFrameSelect, 'dplyr', DataFrameType.DataFrame, ['select']],
    [mapDataFrameMutate, 'dplyr', DataFrameType.DataFrame, ['mutate']],
    [mapDataFrameMutate, 'base', DataFrameType.DataFrame, ['transform']],
    [mapDataFrameGroupBy, 'dplyr', DataFrameType.Tibble, ['group_by']],
    [mapDataFrameSummarize, 'dplyr', DataFrameType.DataFrame, ['summarise', 'summarize']],
    [mapDataFrameJoin, 'dplyr', DataFrameType.DataFrame, ['inner_join', 'left_join', 'right_join', 'full_join']],
    [mapDataFrameJoin, 'base', DataFrameType.DataFrame, ['merge']],
    [mapDataFrameIdentity, 'dplyr', DataFrameType.DataFrame, ['relocate', 'arrange']]
].flatMap(([mapper, library, returnType, names]) => names.map(name => [name, { mapper, library, returnType }])));
/**
 * List of other data frame functions that are not explicitly supported but may return data frames.
 *
 * Every entry groups function `names` of one `library` together with the `returnType` of the result.
 * - `type: 'entry_point'` entries are mapped to an unknown data frame operation by {@link mapDataFrameFunctionCall}.
 * - `type: 'transformation'` entries are forwarded to `mapDataFrameUnknown`; where present, `dataFrame` gives
 *   the position and name of the parameter that holds the data frame operand.
 *
 * Lookups return the first entry containing a name, so a name listed twice (e.g. `melt`) resolves to the earlier entry.
 */
const OtherDataFrameFunctions = [
    {
        type: 'entry_point',
        names: ['anova', 'AIC', 'BIC'],
        // `anova`, `AIC` and `BIC` are generics of the `stats` package (there is no `anova` library providing them)
        library: 'stats',
        returnType: DataFrameType.DataFrame
    }, {
        type: 'entry_point',
        names: ['Anova', 'Manova'],
        library: 'car',
        returnType: DataFrameType.DataFrame
    }, {
        type: 'entry_point',
        names: ['lmer'],
        library: 'lme4',
        returnType: DataFrameType.DataFrame
    }, {
        type: 'entry_point',
        names: ['data_frame', 'as_data_frame'],
        library: 'dplyr',
        returnType: DataFrameType.DataFrame
    }, {
        type: 'entry_point',
        names: ['tbl', 'as.tbl'],
        library: 'dplyr',
        returnType: DataFrameType.Tibble
    }, {
        type: 'entry_point',
        names: ['read_fwf', 'read_log'],
        library: 'readr',
        returnType: DataFrameType.Tibble
    }, {
        type: 'entry_point',
        names: ['read_excel', 'read_xls', 'read_xlsx'],
        library: 'readxl',
        returnType: DataFrameType.Tibble
    }, {
        type: 'entry_point',
        names: ['tibble', 'tibble_row', 'as_tibble', 'tribble'],
        library: 'tibble',
        returnType: DataFrameType.Tibble
    }, {
        type: 'entry_point',
        names: ['data.table', 'as.data.table', 'fread'],
        library: 'data.table',
        returnType: DataFrameType.DataTable
    }, {
        type: 'transformation',
        names: ['na.omit'],
        library: 'stats',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'object' }
    }, {
        type: 'transformation',
        names: ['unique', 't'],
        library: 'base',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'x' }
    }, {
        type: 'transformation',
        names: ['aggregate'],
        library: 'stats',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'x' }
    }, {
        type: 'transformation',
        names: ['with', 'within'],
        library: 'base',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'data' }
    }, {
        type: 'transformation',
        names: ['reshape'],
        library: 'stats',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'data' }
    }, {
        type: 'transformation',
        names: ['melt'],
        library: 'reshape2',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'data' }
    }, {
        type: 'transformation',
        names: [
            'transmute', 'distinct', 'distinct_prepare', 'group_by_prepare', 'rename', 'rename_with', 'reframe',
            'slice', 'slice_head', 'slice_tail', 'slice_min', 'slice_max', 'slice_sample'
        ],
        library: 'dplyr',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: '.data' }
    }, {
        type: 'transformation',
        names: [
            'filter_if', 'filter_at', 'filter_all', 'select_if', 'select_at', 'select_all',
            'mutate_if', 'mutate_at', 'mutate_all', 'transmute_if', 'transmute_at', 'transmute_all',
            'distinct_if', 'distinct_at', 'distinct_all', 'group_by_if', 'group_by_at', 'group_by_all',
            'summarize_if', 'summarise_if', 'summarize_at', 'summarise_at', 'summarize_all', 'summarise_all',
            'arrange_if', 'arrange_at', 'arrange_all', 'rename_if', 'rename_at', 'rename_all'
        ],
        library: 'dplyr',
        returnType: DataFrameType.Tibble,
        dataFrame: { pos: 0, name: '.tbl' }
    }, {
        type: 'transformation',
        names: [
            'semi_join', 'anti_join', 'nest_join', 'cross_join',
            'ungroup', 'count', 'tally', 'add_count', 'add_tally',
            'rows_insert', 'rows_append', 'rows_update', 'rows_patch', 'rows_upsert', 'rows_delete'
        ],
        library: 'dplyr',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'x' }
    }, {
        // no `dataFrame` parameter is declared for the bind functions
        type: 'transformation',
        names: ['bind_cols', 'bind_rows'],
        library: 'dplyr',
        returnType: DataFrameType.DataFrame
    }, {
        type: 'transformation',
        names: [
            'drop_na', 'replace_na', 'pivot_longer', 'pivot_wider',
            'separate', 'separate_wider_position', 'separate_wider_delim', 'unite'
        ],
        library: 'tidyr',
        returnType: DataFrameType.DataFrame,
        dataFrame: { pos: 0, name: 'data' }
    }, {
        type: 'transformation',
        names: ['add_column', 'add_row', 'add_case'],
        library: 'tibble',
        returnType: DataFrameType.Tibble,
        dataFrame: { pos: 0, name: '.data' }
    }, {
        type: 'transformation',
        names: ['melt', 'dcast'],
        library: 'data.table',
        returnType: DataFrameType.DataTable,
        dataFrame: { pos: 0, name: 'data' }
    }
];
/**
 * Mapper for defining the location of all relevant function parameters for each supported data frame function of {@link DataFrameFunctionMapper}.
 *
 * A parameter location has the shape `{ pos, name?, default? }`, where `pos` is the positional index of the parameter.
 * Presumably `pos: -1` marks a parameter that cannot be passed by position (only by name, or not at all if no `name`
 * is given, in which case only the `default` applies) - confirm against the argument helpers.
 * - `special` lists argument names that are excluded from the effective arguments of variadic functions.
 * - `critical` lists parameters passed to `hasCriticalArgument`; if it reports a critical argument,
 *   the call is mapped to an unknown operation (see {@link mapDataFrameFunctionCall}).
 *
 * NOTE(review): the positions for `read.table` match the signature without the `tryLogical` parameter
 * (which newer R versions insert after `as.is`) - confirm which R version should be targeted.
 */
const DataFrameFunctionParamsMapper = {
    'data.frame': {
        checkNames: { pos: -1, name: 'check.names', default: true },
        noDupNames: { pos: -1, name: 'check.names', default: true },
        special: ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'],
        critical: [{ pos: -1, name: 'row.names' }]
    },
    'as.data.frame': {
        dataFrame: { pos: 0, name: 'x' },
        critical: []
    },
    'read.table': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'header', default: false },
        separator: { pos: 2, name: 'sep', default: '\\s' },
        quote: { pos: 3, name: 'quote', default: '"\'' },
        skipLines: { pos: 12, name: 'skip', default: 0 },
        checkNames: { pos: 13, name: 'check.names', default: true },
        noDupNames: { pos: 13, name: 'check.names', default: true },
        comment: { pos: 17, name: 'comment.char', default: '#' },
        text: { pos: 23, name: 'text' },
        critical: [
            { pos: 6, name: 'row.names' },
            { pos: 7, name: 'col.names' },
            { pos: 11, name: 'nrows', default: -1 },
            { pos: 15, name: 'strip.white', default: false },
            { pos: 16, name: 'blank.lines.skip', default: true },
            // the R argument is spelled `allowEscapes` (camel case), not `allow.escapes`
            { pos: 18, name: 'allowEscapes', default: false },
        ]
    },
    'read.csv': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'header', default: true },
        separator: { pos: 2, name: 'sep', default: ',' },
        quote: { pos: 3, name: 'quote', default: '"' },
        comment: { pos: 6, name: 'comment.char', default: '' },
        skipLines: { pos: -1, name: 'skip', default: 0 },
        checkNames: { pos: -1, name: 'check.names', default: true },
        noDupNames: { pos: -1, name: 'check.names', default: true },
        text: { pos: -1, name: 'text' },
        critical: [
            { pos: -1, name: 'row.names' },
            { pos: -1, name: 'col.names' },
            { pos: -1, name: 'nrows', default: -1 },
            { pos: -1, name: 'strip.white', default: false },
            { pos: -1, name: 'blank.lines.skip', default: true },
            { pos: -1, name: 'allowEscapes', default: false },
        ]
    },
    'read.csv2': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'header', default: true },
        separator: { pos: 2, name: 'sep', default: ';' },
        quote: { pos: 3, name: 'quote', default: '"' },
        comment: { pos: 6, name: 'comment.char', default: '' },
        skipLines: { pos: -1, name: 'skip', default: 0 },
        checkNames: { pos: -1, name: 'check.names', default: true },
        noDupNames: { pos: -1, name: 'check.names', default: true },
        text: { pos: -1, name: 'text' },
        critical: [
            { pos: -1, name: 'row.names' },
            { pos: -1, name: 'col.names' },
            { pos: -1, name: 'nrows', default: -1 },
            { pos: -1, name: 'strip.white', default: false },
            { pos: -1, name: 'blank.lines.skip', default: true },
            { pos: -1, name: 'allowEscapes', default: false },
        ]
    },
    'read.delim': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'header', default: true },
        separator: { pos: 2, name: 'sep', default: '\\t' },
        quote: { pos: 3, name: 'quote', default: '"' },
        comment: { pos: 6, name: 'comment.char', default: '' },
        skipLines: { pos: -1, name: 'skip', default: 0 },
        checkNames: { pos: -1, name: 'check.names', default: true },
        noDupNames: { pos: -1, name: 'check.names', default: true },
        text: { pos: -1, name: 'text' },
        critical: [
            { pos: -1, name: 'row.names' },
            { pos: -1, name: 'col.names' },
            { pos: -1, name: 'nrows', default: -1 },
            { pos: -1, name: 'strip.white', default: false },
            { pos: -1, name: 'blank.lines.skip', default: true },
            { pos: -1, name: 'allowEscapes', default: false },
        ]
    },
    'read.delim2': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'header', default: true },
        separator: { pos: 2, name: 'sep', default: '\\t' },
        quote: { pos: 3, name: 'quote', default: '"' },
        comment: { pos: 6, name: 'comment.char', default: '' },
        skipLines: { pos: -1, name: 'skip', default: 0 },
        checkNames: { pos: -1, name: 'check.names', default: true },
        noDupNames: { pos: -1, name: 'check.names', default: true },
        text: { pos: -1, name: 'text' },
        critical: [
            { pos: -1, name: 'row.names' },
            { pos: -1, name: 'col.names' },
            { pos: -1, name: 'nrows', default: -1 },
            { pos: -1, name: 'strip.white', default: false },
            { pos: -1, name: 'blank.lines.skip', default: true },
            { pos: -1, name: 'allowEscapes', default: false },
        ]
    },
    'read_table': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'col_names', default: true },
        separator: { pos: -1, default: '\\s' },
        quote: { pos: -1, default: '"' },
        skipLines: { pos: 5, name: 'skip', default: 0 },
        comment: { pos: 9, name: 'comment', default: '' },
        checkNames: { pos: -1, default: false },
        noDupNames: { pos: -1, default: true },
        critical: [
            { pos: 6, name: 'n_max', default: Infinity },
            { pos: 11, name: 'skip_empty_rows', default: true }
        ],
        noEmptyNames: true
    },
    'read_csv': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'col_names', default: true },
        separator: { pos: -1, default: ',' },
        quote: { pos: 8, name: 'quote', default: '"' },
        comment: { pos: 9, name: 'comment', default: '' },
        skipLines: { pos: 11, name: 'skip', default: 0 },
        checkNames: { pos: -1, default: false },
        noDupNames: { pos: -1, default: true },
        critical: [
            { pos: 3, name: 'col_select' },
            { pos: 4, name: 'id' },
            { pos: 10, name: 'trim_ws', default: true },
            { pos: 12, name: 'n_max', default: Infinity },
            { pos: 14, name: 'name_repair', default: 'unique' },
            { pos: 18, name: 'skip_empty_rows', default: true }
        ],
        noEmptyNames: true
    },
    'read_csv2': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'col_names', default: true },
        separator: { pos: -1, default: ';' },
        quote: { pos: 8, name: 'quote', default: '"' },
        comment: { pos: 9, name: 'comment', default: '' },
        skipLines: { pos: 11, name: 'skip', default: 0 },
        checkNames: { pos: -1, default: false },
        noDupNames: { pos: -1, default: true },
        critical: [
            { pos: 3, name: 'col_select' },
            { pos: 4, name: 'id' },
            { pos: 10, name: 'trim_ws', default: true },
            { pos: 12, name: 'n_max', default: Infinity },
            { pos: 14, name: 'name_repair', default: 'unique' },
            { pos: 18, name: 'skip_empty_rows', default: true }
        ],
        noEmptyNames: true
    },
    'read_tsv': {
        fileName: { pos: 0, name: 'file' },
        header: { pos: 1, name: 'col_names', default: true },
        separator: { pos: -1, default: '\\t' },
        quote: { pos: 8, name: 'quote', default: '"' },
        comment: { pos: 9, name: 'comment', default: '' },
        skipLines: { pos: 11, name: 'skip', default: 0 },
        checkNames: { pos: -1, default: false },
        noDupNames: { pos: -1, default: true },
        critical: [
            { pos: 3, name: 'col_select' },
            { pos: 4, name: 'id' },
            { pos: 10, name: 'trim_ws', default: true },
            { pos: 12, name: 'n_max', default: Infinity },
            { pos: 14, name: 'name_repair', default: 'unique' },
            { pos: 18, name: 'skip_empty_rows', default: true }
        ],
        noEmptyNames: true
    },
    'read_delim': {
        fileName: { pos: 0, name: 'file' },
        separator: { pos: 1, name: 'delim', default: '\t' },
        quote: { pos: 2, name: 'quote', default: '"' },
        header: { pos: 5, name: 'col_names', default: true },
        comment: { pos: 12, name: 'comment', default: '' },
        skipLines: { pos: 14, name: 'skip', default: 0 },
        checkNames: { pos: -1, default: false },
        noDupNames: { pos: -1, default: true },
        critical: [
            { pos: 3, name: 'escape_backslash', default: false },
            { pos: 4, name: 'escape_double', default: true },
            { pos: 7, name: 'col_select' },
            { pos: 8, name: 'id' },
            { pos: 13, name: 'trim_ws', default: false },
            { pos: 15, name: 'n_max', default: Infinity },
            { pos: 17, name: 'name_repair', default: 'unique' },
            { pos: 21, name: 'skip_empty_rows', default: true }
        ],
        noEmptyNames: true
    },
    'cbind': {
        special: ['deparse.level', 'make.row.names', 'stringsAsFactors', 'factor.exclude']
    },
    'rbind': {
        special: ['deparse.level', 'make.row.names', 'stringsAsFactors', 'factor.exclude']
    },
    'head': {
        dataFrame: { pos: 0, name: 'x' },
        amount: { pos: 1, name: 'n', default: 6 }
    },
    'tail': {
        dataFrame: { pos: 0, name: 'x' },
        amount: { pos: 1, name: 'n', default: 6 }
    },
    'subset': {
        dataFrame: { pos: 0, name: 'x' },
        subset: { pos: 1, name: 'subset' },
        select: { pos: 2, name: 'select' },
        drop: { pos: 3, name: 'drop', default: false }
    },
    'filter': {
        dataFrame: { pos: 0, name: '.data' },
        special: ['.by', '.preserve']
    },
    'select': {
        dataFrame: { pos: 0, name: '.data' },
        special: []
    },
    'mutate': {
        dataFrame: { pos: 0, name: '.data' },
        special: ['.by', '.keep', '.before', '.after'],
        critical: [{ pos: -1, name: '.keep' }],
        checkNames: false,
        noDupNames: false
    },
    'transform': {
        dataFrame: { pos: 0, name: '_data' },
        special: [],
        checkNames: true,
        noDupNames: true
    },
    'group_by': {
        dataFrame: { pos: 0, name: '.data' },
        by: { pos: 1 },
        special: ['.add', '.drop']
    },
    'summarise': {
        dataFrame: { pos: 0, name: '.data' },
        special: ['.by', '.groups']
    },
    'summarize': {
        dataFrame: { pos: 0, name: '.data' },
        special: ['.by', '.groups']
    },
    'inner_join': {
        dataFrame: { pos: 0, name: 'x' },
        otherDataFrame: { pos: 1, name: 'y' },
        by: { pos: 2, name: 'by' },
        joinAll: { pos: -1, default: false },
        joinLeft: { pos: -1, default: false },
        joinRight: { pos: -1, default: false },
        critical: [{ pos: -1, name: 'keep' }]
    },
    'left_join': {
        dataFrame: { pos: 0, name: 'x' },
        otherDataFrame: { pos: 1, name: 'y' },
        by: { pos: 2, name: 'by' },
        joinAll: { pos: -1, default: false },
        joinLeft: { pos: -1, default: true },
        joinRight: { pos: -1, default: false },
        critical: [{ pos: -1, name: 'keep' }]
    },
    'right_join': {
        dataFrame: { pos: 0, name: 'x' },
        otherDataFrame: { pos: 1, name: 'y' },
        by: { pos: 2, name: 'by' },
        joinAll: { pos: -1, default: false },
        joinLeft: { pos: -1, default: false },
        joinRight: { pos: -1, default: true },
        critical: [{ pos: -1, name: 'keep' }]
    },
    'full_join': {
        dataFrame: { pos: 0, name: 'x' },
        otherDataFrame: { pos: 1, name: 'y' },
        by: { pos: 2, name: 'by' },
        joinAll: { pos: -1, default: true },
        joinLeft: { pos: -1, default: false },
        joinRight: { pos: -1, default: false },
        critical: [{ pos: -1, name: 'keep' }]
    },
    'merge': {
        dataFrame: { pos: 0, name: 'x' },
        otherDataFrame: { pos: 1, name: 'y' },
        by: { pos: 2, name: 'by' },
        joinAll: { pos: 5, name: 'all', default: false },
        joinLeft: { pos: 6, name: 'all.x', default: false },
        joinRight: { pos: 7, name: 'all.y', default: false },
        critical: [
            { pos: 3, name: 'by.x' },
            { pos: 4, name: 'by.y' }
        ]
    },
    'relocate': {
        dataFrame: { pos: 0, name: '.data' },
        special: ['.before', '.after'],
        disallowNamedArgs: true
    },
    'arrange': {
        dataFrame: { pos: 0, name: '.data' },
        special: ['.by_group', '.locale']
    }
};
/**
 * Translates a concrete data frame function call into abstract data frame operations.
 *
 * Explicitly supported functions (see {@link DataFrameFunctionMapper}) are delegated to their mapper, unless a
 * critical argument is detected, in which case a single unknown operation is returned.
 * Functions of {@link OtherDataFrameFunctions} yield an unknown operation (entry points) or are delegated to `mapDataFrameUnknown`.
 * @param node - The R node of the function call
 * @param inference - The data frame shape inference visitor
 * @param dfg - The data flow graph for resolving the arguments
 * @param ctx - The current flowR analyzer context
 * @returns The mapped abstract data frame operations for the function call, or `undefined` if the node does not represent a data frame function call
 */
function mapDataFrameFunctionCall(node, inference, dfg, ctx) {
    const isNamedCall = node.type === type_1.RType.FunctionCall && node.named;
    if (!isNamedCall) {
        return;
    }
    const resolveInfo = { graph: dfg, idMap: dfg.idMap, full: true, resolve: config_1.VariableResolve.Alias, ctx };
    const calledName = identifier_1.Identifier.getName(node.functionName.content);
    if (isDataFrameFunction(calledName)) {
        const { mapper } = DataFrameFunctionMapper[calledName];
        const params = DataFrameFunctionParamsMapper[calledName];
        const callArgs = (0, arguments_1.getFunctionArguments)(node, dfg);
        // a critical argument changes the semantics in ways the mapper does not model
        if ((0, arguments_1.hasCriticalArgument)(callArgs, params.critical, resolveInfo)) {
            return [{ operation: 'unknown', operand: undefined }];
        }
        return mapper(callArgs, params, inference, resolveInfo);
    }
    const mapping = getOtherDataFrameFunction(calledName);
    if (mapping === undefined) {
        return;
    }
    switch (mapping.type) {
        case 'entry_point':
            return [{ operation: 'unknown', operand: undefined }];
        case 'transformation':
        case 'modification': {
            const callArgs = (0, arguments_1.getFunctionArguments)(node, dfg);
            return mapDataFrameUnknown(callArgs, mapping, inference, resolveInfo);
        }
        default:
            (0, assert_1.assertUnreachable)(mapping);
    }
}
/**
 * Checks whether a function name is one of the explicitly supported functions of {@link DataFrameFunctionMapper}.
 * @param functionName - The name of the called function
 * @returns `true` if the mapper has an own entry for the name
 */
function isDataFrameFunction(functionName) {
    // only own keys count: the `in` operator would also accept inherited names such as "toString"
    const isSupported = Object.hasOwn(DataFrameFunctionMapper, functionName);
    return isSupported;
}
/**
 * Looks up the entry of {@link OtherDataFrameFunctions} that lists the given function name.
 * @param functionName - The name of the called function
 * @returns The first entry whose `names` include the function name, or `undefined` if there is none
 */
function getOtherDataFrameFunction(functionName) {
    for (const entry of OtherDataFrameFunctions) {
        if (entry.names.includes(functionName)) {
            return entry;
        }
    }
    return undefined;
}
/**
 * Maps a data frame creation call to a single `create` operation.
 * The column names are taken from the resolved argument names and the number of rows is the maximum
 * resolved vector length of the arguments (0 if there are no arguments).
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor (not used here)
 * @param info - Information for resolving argument values
 * @returns A list with the `create` operation
 */
function mapDataFrameCreate(args, params, inference, info) {
    const checkNames = (0, arguments_1.getArgumentValue)(args, params.checkNames, info);
    const noDupNames = (0, arguments_1.getArgumentValue)(args, params.noDupNames, info);
    const columnArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const names = columnArgs.map(arg => (0, resolve_args_1.resolveIdToArgName)(arg, info));
    const lengths = columnArgs.map(arg => (0, resolve_args_1.resolveIdToArgVectorLength)(arg, info));
    const allVectors = lengths.every(assert_1.isNotUndefined);
    const rows = allVectors ? Math.max(...lengths, 0) : undefined;
    const flagsResolved = typeof checkNames === 'boolean' && typeof noDupNames === 'boolean';
    // the column names stay undefined (over-approximated) if arguments are present but cannot be resolved to values
    let colnames;
    if (allVectors && flagsResolved) {
        colnames = rows === 0 ? [] : (0, arguments_1.filterValidNames)(names, checkNames, noDupNames);
    }
    return [{
            operation: 'create',
            operand: undefined,
            colnames,
            rows
        }];
}
/**
 * Maps a data frame conversion call to an `identity` operation on the converted argument.
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor (not used here)
 * @param info - Information for resolving argument values
 * @returns An `identity` operation on the argument, or an `unknown` operation if the argument is empty or has no value
 */
function mapDataFrameConvert(args, params, inference, info) {
    const converted = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info);
    const hasValue = converted !== r_function_call_1.EmptyArgument && converted?.value !== undefined;
    if (!hasValue) {
        return [{ operation: 'unknown', operand: undefined }];
    }
    const operand = converted.value.info.id;
    return [{ operation: 'identity', operand }];
}
/**
 * Maps a data frame read call to a single `read` operation.
 * If the read request can be resolved, reading is enabled in the config, and all relevant arguments resolve to
 * values of the expected type, the content is parsed line by line to derive the column names and row count.
 * Otherwise, column names and rows are left undefined.
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor (not used here)
 * @param info - Information for resolving argument values (including the analyzer context with the config)
 * @returns A list with the `read` operation
 */
function mapDataFrameRead(args, params, inference, info) {
    const fileNameArg = (0, arguments_1.getFunctionArgument)(args, params.fileName, info);
    const textArg = params.text ? (0, arguments_1.getFunctionArgument)(args, params.text, info) : undefined;
    const { source, request } = getRequestFromRead(fileNameArg, textArg, params, info);
    const header = (0, arguments_1.getArgumentValue)(args, params.header, info);
    const separator = (0, arguments_1.getArgumentValue)(args, params.separator, info);
    const quote = (0, arguments_1.getArgumentValue)(args, params.quote, info);
    const comment = (0, arguments_1.getArgumentValue)(args, params.comment, info);
    const skipLines = (0, arguments_1.getArgumentValue)(args, params.skipLines, info);
    const checkNames = (0, arguments_1.getArgumentValue)(args, params.checkNames, info);
    const noDupNames = (0, arguments_1.getArgumentValue)(args, params.noDupNames, info);
    const readOperation = (colnames, rows) => [{
            operation: 'read',
            operand: undefined,
            source,
            colnames,
            rows
        }];
    const flagsResolved = [header, checkNames, noDupNames].every(value => typeof value === 'boolean');
    const textsResolved = [separator, quote, comment].every(value => typeof value === 'string');
    const validArguments = flagsResolved && textsResolved && typeof skipLines === 'number';
    if (request === undefined || !info.ctx.config.abstractInterpretation.dataFrame.readLoadedData.readExternalFiles || !validArguments) {
        return readOperation(undefined, undefined);
    }
    const commentPattern = new RegExp(`\\s*[${(0, arguments_1.escapeRegExp)(comment, true)}].*`);
    let headLine = undefined;
    let headLineNumber = 0;
    let rowCount = 0;
    const parseLine = (line, lineNumber) => {
        const raw = line.toString();
        // comments are only stripped if a comment character is configured
        const text = comment ? raw.replace(commentPattern, '') : raw;
        const isContentLine = text.length > 0 && lineNumber >= (skipLines ?? 0);
        if (!isContentLine) {
            return;
        }
        if (headLine === undefined) {
            headLine = getEntriesFromCsvLine(text, separator, quote, comment);
            headLineNumber = lineNumber;
        }
        // with a header, the first content line does not count as a row
        if (!header || lineNumber > headLineNumber) {
            rowCount++;
        }
    };
    const { maxReadLines } = info.ctx.config.abstractInterpretation.dataFrame.readLoadedData;
    const allLines = (0, arguments_1.parseRequestContent)(request, parseLine, maxReadLines);
    let colnames;
    if (header) {
        colnames = (0, arguments_1.filterValidNames)(headLine, checkNames, noDupNames, params.noEmptyNames);
    }
    else if (headLine !== undefined) {
        // without a header only the number of columns is known
        colnames = new Array(headLine.length).fill(undefined);
    }
    // if not all lines were read, the counted rows are only a lower bound
    return readOperation(colnames, allLines ? rowCount : [rowCount, Infinity]);
}
/**
 * Maps a column bind call to `concatCols` operations for every other data frame argument,
 * followed by an `addCols` operation for the remaining arguments.
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor used to look up abstract values of arguments
 * @param info - Information for resolving argument values
 * @returns The mapped operations, or `undefined` if no argument is a data frame
 */
function mapDataFrameColBind(args, params, inference, info) {
    const boundArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = boundArgs.find(arg => (0, arguments_1.isDataFrameArgument)(arg, inference));
    if (dataFrame === undefined) {
        return;
    }
    if (boundArgs.length === 1) {
        return [{ operation: 'identity', operand: dataFrame.value.info.id }];
    }
    const result = [];
    // only the first operation refers to the data frame node, later ones chain on the previous result
    let operand = dataFrame.value;
    let colnames = [];
    for (const arg of boundArgs) {
        if (arg === dataFrame || arg === r_function_call_1.EmptyArgument) {
            continue;
        }
        const otherDataFrame = inference.getAbstractValue(arg.value);
        if (otherDataFrame !== undefined) {
            result.push({
                operation: 'concatCols',
                operand: operand?.info.id,
                other: otherDataFrame
            });
            operand = undefined;
            continue;
        }
        if ((0, resolve_args_1.resolveIdToArgValue)(arg, info) === undefined) {
            // added columns are top if argument cannot be resolved to constant (vector-like) value
            colnames = undefined;
            continue;
        }
        const colname = (0, resolve_args_1.resolveIdToArgName)(arg, info);
        colnames?.push(colname);
    }
    const addsColumns = colnames === undefined || colnames.length > 0;
    if (addsColumns) {
        result.push({
            operation: 'addCols',
            operand: operand?.info.id,
            colnames
        });
    }
    return result;
}
/**
 * Maps a row bind call to `concatRows` operations for every other data frame argument,
 * followed by an `addRows` operation for the remaining arguments.
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor used to look up abstract values of arguments
 * @param info - Information for resolving argument values
 * @returns The mapped operations, or `undefined` if no argument is a data frame
 */
function mapDataFrameRowBind(args, params, inference, info) {
    const boundArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = boundArgs.find(arg => (0, arguments_1.isDataFrameArgument)(arg, inference));
    if (dataFrame === undefined) {
        return;
    }
    if (boundArgs.length === 1) {
        return [{ operation: 'identity', operand: dataFrame.value.info.id }];
    }
    const result = [];
    // only the first operation refers to the data frame node, later ones chain on the previous result
    let operand = dataFrame.value;
    let rows = 0;
    for (const arg of boundArgs) {
        if (arg === dataFrame || arg === r_function_call_1.EmptyArgument) {
            continue;
        }
        const otherDataFrame = inference.getAbstractValue(arg.value);
        if (otherDataFrame !== undefined) {
            result.push({
                operation: 'concatRows',
                operand: operand?.info.id,
                other: otherDataFrame
            });
            operand = undefined;
            continue;
        }
        if ((0, resolve_args_1.resolveIdToArgValue)(arg, info) === undefined) {
            // number of added rows is top if arguments cannot be resolved to constant (vector-like) value
            rows = undefined;
            continue;
        }
        // every resolvable argument contributes one row, unless the count is already unknown
        if (rows !== undefined) {
            rows = rows + 1;
        }
    }
    const addsRows = rows === undefined || rows > 0;
    if (addsRows) {
        result.push({
            operation: 'addRows',
            operand: operand?.info.id,
            rows
        });
    }
    return result;
}
/**
 * Maps a head/tail call to a row operation and, if a second amount is given, a column operation.
 * Non-negative amounts subset rows/columns, negative amounts remove them (using the absolute value).
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor used to check the data frame argument
 * @param info - Information for resolving argument values
 * @returns The mapped operations, or `undefined` if the data frame parameter is no data frame argument
 */
function mapDataFrameHeadTail(args, params, inference, info) {
    const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return;
    }
    const amount = (0, arguments_1.getArgumentValue)(args, params.amount, info);
    let rows = undefined;
    let cols = undefined;
    if (typeof amount === 'number') {
        rows = amount;
    }
    else if (Array.isArray(amount) && amount.length <= 2 && amount.every(value => typeof value === 'number')) {
        // a vector of up to two numbers holds the row amount followed by the column amount
        [rows, cols] = amount;
    }
    const result = [{
            operation: rows === undefined || rows >= 0 ? 'subsetRows' : 'removeRows',
            operand: dataFrame.value.info.id,
            rows: rows !== undefined ? Math.abs(rows) : undefined
        }];
    if (cols !== undefined) {
        // only the number of affected columns is known, not their names
        result.push({
            operation: cols >= 0 ? 'subsetCols' : 'removeCols',
            operand: undefined,
            colnames: Array(Math.abs(cols)).fill(undefined)
        });
    }
    return result;
}
/**
 * Maps a subset call to column access, row filter, and column removal/selection operations.
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor used to check the data frame argument
 * @param info - Information for resolving argument values
 * @returns The mapped operations, or `undefined` if the data frame parameter is no data frame argument
 */
function mapDataFrameSubset(args, params, inference, info) {
    const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return;
    }
    if (args.length === 1) {
        return [{ operation: 'identity', operand: dataFrame.value.info.id }];
    }
    const filterArg = (0, arguments_1.getFunctionArgument)(args, params.subset, info);
    const filterValue = (0, resolve_args_1.resolveIdToArgValue)(filterArg, info);
    const selectArg = (0, arguments_1.getFunctionArgument)(args, params.select, info);
    const dropArg = (0, arguments_1.getFunctionArgument)(args, params.drop, info);
    const condition = typeof filterValue === 'boolean' ? filterValue : undefined;
    const filterNames = (0, arguments_1.getUnresolvedSymbolsInExpression)(filterArg, info.graph);
    const { selectedCols, unselectedCols } = getSelectedColumns([selectArg], info);
    const accessedCols = [...filterNames, ...selectedCols ?? [], ...unselectedCols ?? []];
    const namedCols = accessedCols.filter(col => typeof col === 'string');
    const indexedCols = accessedCols.filter(col => typeof col === 'number');
    const mixedAccess = namedCols.length > 0 && indexedCols.length > 0;
    const duplicateCols = accessedCols.some((col, index, list) => col !== undefined && list.indexOf(col) !== index);
    const toColname = col => typeof col === 'string' ? col : undefined;
    const result = [];
    // the data frame node is only the operand until the first operation that produces a new data frame
    let operand = dataFrame.value;
    if (namedCols.length > 0) {
        result.push({
            operation: 'accessCols',
            operand: operand?.info.id,
            columns: namedCols
        });
    }
    if (indexedCols.length > 0) {
        result.push({
            operation: 'accessCols',
            operand: operand?.info.id,
            columns: indexedCols.map(col => Math.abs(col))
        });
    }
    if (filterArg !== undefined && filterArg !== r_function_call_1.EmptyArgument) {
        result.push({
            operation: 'filterRows',
            operand: operand?.info.id,
            condition: condition
        });
        operand = undefined;
    }
    // NOTE(review): this tests whether a `drop` argument was found, not its resolved value - confirm this is intended
    if (!dropArg || accessedCols.length > 1) {
        if (unselectedCols === undefined || unselectedCols.length > 0) {
            result.push({
                operation: 'removeCols',
                operand: operand?.info.id,
                colnames: unselectedCols?.map(toColname)
            });
            operand = undefined;
        }
        if (selectedCols === undefined || selectedCols.length > 0) {
            const subsetCols = {
                operation: 'subsetCols',
                operand: operand?.info.id,
                colnames: selectedCols?.map(toColname)
            };
            if (duplicateCols || mixedAccess) {
                subsetCols.options = { duplicateCols: true };
            }
            result.push(subsetCols);
            operand = undefined;
        }
    }
    return result;
}
/**
 * Maps a filter call to an optional `accessCols` operation for the unresolved symbols of the filter
 * expressions, followed by a `filterRows` operation.
 * The filter condition is only known if all filter arguments resolve to booleans.
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor used to check the data frame argument
 * @param info - Information for resolving argument values
 * @returns The mapped operations, or `undefined` if the data frame parameter is no data frame argument
 */
function mapDataFrameFilter(args, params, inference, info) {
    const effectiveArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = (0, arguments_1.getFunctionArgument)(effectiveArgs, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return;
    }
    const operand = dataFrame.value.info.id;
    if (effectiveArgs.length === 1) {
        return [{ operation: 'identity', operand }];
    }
    const filterArgs = effectiveArgs.filter(arg => arg !== dataFrame);
    const filterValues = filterArgs.map(arg => (0, resolve_args_1.resolveIdToArgValue)(arg, info));
    const accessedNames = filterArgs.flatMap(arg => (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph).map(identifier_1.Identifier.getName));
    const allBoolean = filterValues.every(value => typeof value === 'boolean');
    const condition = allBoolean ? filterValues.every(cond => cond) : undefined;
    const accessOperations = accessedNames.length > 0
        ? [{ operation: 'accessCols', operand, columns: accessedNames }]
        : [];
    return [
        ...accessOperations,
        { operation: 'filterRows', operand, condition: condition }
    ];
}
/**
 * Maps a select call to column access operations followed by column removal and/or selection operations.
 * @param args - The arguments of the function call
 * @param params - The parameter locations of the function
 * @param inference - The data frame shape inference visitor used to check the data frame argument
 * @param info - Information for resolving argument values
 * @returns The mapped operations, or `undefined` if the data frame parameter is no data frame argument
 */
function mapDataFrameSelect(args, params, inference, info) {
    const effectiveArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = (0, arguments_1.getFunctionArgument)(effectiveArgs, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return;
    }
    const selectArgs = effectiveArgs.filter(arg => arg !== dataFrame);
    let { selectedCols, unselectedCols } = getSelectedColumns(selectArgs, info);
    const accessedCols = [...selectedCols ?? [], ...unselectedCols ?? []];
    const namedCols = accessedCols.filter(col => typeof col === 'string');
    const indexedCols = accessedCols.filter(col => typeof col === 'number');
    const mixedAccess = namedCols.length > 0 && indexedCols.length > 0;
    const duplicateAccess = accessedCols.some((col, _, list) => col !== undefined && list.filter(other => other === col).length > 1);
    const renamedCols = selectArgs.some(arguments_1.isNamedArgument);
    const toColname = col => typeof col === 'string' ? col : undefined;
    // map to top if columns are selected mixed by string and number, or are selected duplicate
    if (mixedAccess || duplicateAccess) {
        selectedCols = undefined;
        unselectedCols = [];
    }
    const result = [];
    // the data frame node is only the operand until the first operation that produces a new data frame
    let operand = dataFrame.value;
    if (namedCols.length > 0) {
        result.push({
            operation: 'accessCols',
            operand: operand?.info.id,
            columns: namedCols
        });
    }
    if (indexedCols.length > 0) {
        result.push({
            operation: 'accessCols',
            operand: operand?.info.id,
            columns: indexedCols.map(col => Math.abs(col))
        });
    }
    if (unselectedCols === undefined || unselectedCols.length > 0) {
        result.push({
            operation: 'removeCols',
            operand: operand?.info.id,
            colnames: unselectedCols?.map(toColname)
        });
        operand = undefined;
    }
    if (selectedCols === undefined || selectedCols.length > 0 || unselectedCols?.length === 0) {
        const subsetCols = {
            operation: 'subsetCols',
            operand: operand?.info.id,
            colnames: selectedCols?.map(toColname)
        };
        if (renamedCols) {
            subsetCols.options = { renamedCols: true };
        }
        result.push(subsetCols);
        operand = undefined;
    }
    return result;
}
/**
 * Maps a column-mutating call on a data frame to abstract data frame operations.
 *
 * Arguments for which `isRNull` holds name columns to delete; all other non-data-frame arguments
 * name columns to create or overwrite. Both name lists are validated by `filterValidNames`.
 *
 * @param args - arguments of the call (first reduced by `getEffectiveArgs` using `params.special`)
 * @param params - mapper parameters; `params.dataFrame` locates the data frame argument, and
 * `params.checkNames`/`params.noDupNames` are forwarded to `filterValidNames`
 * @param inference - handed to `isDataFrameArgument` to decide whether the argument is a data frame
 * @param info - resolve information; `info.graph` is used to collect the unresolved symbols
 * @returns `undefined` if the located argument is not a data frame, a single `identity` operation
 * if the data frame is the only argument, otherwise an optional `accessCols`, an optional
 * `mutateCols` and an optional `removeCols` operation; only the first of `mutateCols`/`removeCols`
 * carries the data frame as operand
 */
function mapDataFrameMutate(args, params, inference, info) {
    const effectiveArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = (0, arguments_1.getFunctionArgument)(effectiveArgs, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return undefined;
    }
    if (effectiveArgs.length === 1) {
        return [{ operation: 'identity', operand: dataFrame.value.info.id }];
    }
    const argName = arg => (0, resolve_args_1.resolveIdToArgName)(arg, info);
    const validated = names => (0, arguments_1.filterValidNames)(names, params.checkNames, params.noDupNames, undefined, true);
    const mutateArgs = effectiveArgs.filter(arg => arg !== dataFrame);
    const rawDeletedCols = mutateArgs.filter(arguments_1.isRNull).map(arg => argName(arg));
    const rawMutatedCols = mutateArgs.filter(arg => !(0, arguments_1.isRNull)(arg)).map(arg => argName(arg));
    // only column names that are not created by the mutation itself are preconditions on the operand
    const accessedNames = mutateArgs
        .flatMap(arg => (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph).map(identifier_1.Identifier.toString))
        .filter(name => !rawMutatedCols.includes(name));
    const deletedCols = validated(rawDeletedCols);
    const mutatedCols = validated(rawMutatedCols);
    const source = dataFrame.value;
    const result = [];
    if (accessedNames.length > 0) {
        result.push({ operation: 'accessCols', operand: source?.info.id, columns: accessedNames });
    }
    const mutatesCols = mutatedCols === undefined || mutatedCols.length > 0 || deletedCols?.length === 0;
    if (mutatesCols) {
        result.push({ operation: 'mutateCols', operand: source?.info.id, colnames: mutatedCols });
    }
    if (deletedCols === undefined || deletedCols.length > 0) {
        result.push({
            operation: 'removeCols',
            // the data frame is no longer the operand once a mutation has been emitted before
            operand: mutatesCols ? undefined : source?.info.id,
            colnames: deletedCols,
            options: { maybe: true }
        });
    }
    return result;
}
/**
 * Maps a grouping call on a data frame to abstract data frame operations.
 *
 * Every argument other than the data frame argument is a grouping expression. A named argument
 * contributes its argument name, an unnamed argument the symbol name of its value.
 *
 * @param args - arguments of the call (first reduced by `getEffectiveArgs` using `params.special`)
 * @param params - mapper parameters; `params.dataFrame` locates the data frame argument
 * @param inference - handed to `isDataFrameArgument` to decide whether the argument is a data frame
 * @param info - resolve information; `info.graph` is used to collect the unresolved symbols
 * @returns `undefined` if the located argument is not a data frame, a single `identity` operation
 * if the data frame is the only argument, otherwise an optional `accessCols` operation followed by
 * a `groupBy` operation, which is flagged with `mutatedCols` if any grouping argument is named or
 * any grouping name could not be determined
 */
function mapDataFrameGroupBy(args, params, inference, info) {
    const effectiveArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = (0, arguments_1.getFunctionArgument)(effectiveArgs, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return undefined;
    }
    const operand = dataFrame.value.info.id;
    if (effectiveArgs.length === 1) {
        return [{ operation: 'identity', operand }];
    }
    const byArgs = effectiveArgs.filter(arg => arg !== dataFrame);
    const columns = byArgs
        .flatMap(arg => (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph))
        .map(identifier_1.Identifier.toString);
    const byNames = byArgs.map(arg => {
        if ((0, arguments_1.isNamedArgument)(arg)) {
            return (0, resolve_args_1.resolveIdToArgName)(arg, info);
        }
        return (0, resolve_args_1.resolveIdToArgValueSymbolName)(arg, info);
    });
    const mutatedCols = byArgs.some(arguments_1.isNamedArgument) || byNames.some(assert_1.isUndefined);
    const groupBy = { operation: 'groupBy', operand, by: byNames };
    if (mutatedCols) {
        groupBy.options = { mutatedCols: true };
    }
    return columns.length > 0
        ? [{ operation: 'accessCols', operand, columns }, groupBy]
        : [groupBy];
}
/**
 * Maps a summarizing call on a data frame to abstract data frame operations.
 *
 * Every argument other than the data frame argument describes a summarized column, named by its
 * argument name. Unresolved symbols in these arguments are reported as accessed columns unless
 * they are names of summarized columns themselves.
 *
 * @param args - arguments of the call (first reduced by `getEffectiveArgs` using `params.special`)
 * @param params - mapper parameters; `params.dataFrame` locates the data frame argument
 * @param inference - handed to `isDataFrameArgument` to decide whether the argument is a data frame
 * @param info - resolve information; `info.graph` is used to collect the unresolved symbols
 * @returns `undefined` if the located argument is not a data frame, otherwise an optional
 * `accessCols` operation followed by a `summarize` operation
 */
function mapDataFrameSummarize(args, params, inference, info) {
    const effectiveArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = (0, arguments_1.getFunctionArgument)(effectiveArgs, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return undefined;
    }
    const summarizeArgs = effectiveArgs.filter(arg => arg !== dataFrame);
    const colnames = summarizeArgs.map(arg => (0, resolve_args_1.resolveIdToArgName)(arg, info));
    // only column names that are not created by the summary itself are preconditions on the operand
    const columns = [];
    for (const arg of summarizeArgs) {
        const symbolNames = (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph).map(identifier_1.Identifier.toString);
        columns.push(...symbolNames.filter(name => !colnames.includes(name)));
    }
    const operand = dataFrame.value.info.id;
    const summarize = { operation: 'summarize', operand, colnames };
    return columns.length > 0
        ? [{ operation: 'accessCols', operand, columns }, summarize]
        : [summarize];
}
/**
 * Maps a join of two data frames to abstract data frame operations.
 *
 * The join type is derived by `getJoinType` from the values of the `joinAll`, `joinLeft` and
 * `joinRight` arguments. The abstract value of the other data frame is taken from `inference`,
 * falling back to the top element of the data frame domain. The join columns are only known if the
 * `by` argument resolves to a string, a number, or an array consisting only of strings or only of
 * numbers.
 *
 * @param args - arguments of the call
 * @param params - mapper parameters locating the `dataFrame`, `otherDataFrame`, `by`, `joinAll`,
 * `joinLeft` and `joinRight` arguments
 * @param inference - used to check the data frame argument and to get the value of the other one
 * @param info - resolve information; `info.ctx.config` provides the maximum number of column names
 * @returns `undefined` if the located argument is not a data frame, a single `identity` operation
 * if the data frame is the only argument, a single `unknown` operation if any of the join flags is
 * not a boolean, otherwise an optional `accessCols` operation followed by a `join` operation
 */
function mapDataFrameJoin(args, params, inference, info) {
    const flagValue = param => (0, arguments_1.getArgumentValue)(args, param, info);
    const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info);
    const joinAll = flagValue(params.joinAll);
    const joinLeft = flagValue(params.joinLeft);
    const joinRight = flagValue(params.joinRight);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return undefined;
    }
    const operand = dataFrame.value.info.id;
    if (args.length === 1) {
        return [{ operation: 'identity', operand }];
    }
    if ([joinAll, joinLeft, joinRight].some(flag => typeof flag !== 'boolean')) {
        return [{ operation: 'unknown', operand }];
    }
    const isName = by => typeof by === 'string';
    const isIndex = by => typeof by === 'number';
    const otherArg = (0, arguments_1.getFunctionArgument)(args, params.otherDataFrame, info);
    const byArg = (0, arguments_1.getFunctionArgument)(args, params.by, info);
    const other = inference.getAbstractValue(otherArg)
        ?? dataframe_domain_1.DataFrameDomain.top(info.ctx.config.abstractInterpretation.dataFrame.maxColNames);
    const join = getJoinType(joinAll, joinLeft, joinRight);
    let byCols;
    if (byArg !== undefined) {
        const byValue = (0, resolve_args_1.resolveIdToArgValue)(byArg, info);
        if (isName(byValue) || isIndex(byValue)) {
            byCols = [byValue];
        }
        else if (Array.isArray(byValue) && (byValue.every(isName) || byValue.every(isIndex))) {
            byCols = byValue;
        }
    }
    const result = [];
    // names and indices are reported as separate column accesses
    for (const matches of [isName, isIndex]) {
        if (byCols?.some(matches)) {
            result.push({ operation: 'accessCols', operand, columns: byCols.filter(matches) });
        }
    }
    result.push({
        operation: 'join',
        operand,
        other,
        by: byCols?.map(by => (isName(by) ? by : undefined)),
        options: { join, natural: byArg === undefined }
    });
    return result;
}
/**
 * Maps a call that leaves its data frame argument unchanged to an `identity` operation.
 *
 * @param args - arguments of the call (first reduced by `getEffectiveArgs` using `params.special`)
 * @param params - mapper parameters; `params.dataFrame` locates the data frame argument, and
 * `params.disallowNamedArgs` turns the result into `unknown` if any argument is named
 * @param inference - handed to `isDataFrameArgument` to decide whether the argument is a data frame
 * @param info - resolve information forwarded to `getFunctionArgument`
 * @returns `undefined` if the located argument is not a data frame, otherwise a single `identity`
 * operation, or a single `unknown` operation if named arguments are disallowed but present
 */
function mapDataFrameIdentity(args, params, inference, info) {
    const effectiveArgs = (0, arguments_1.getEffectiveArgs)(args, params.special);
    const dataFrame = (0, arguments_1.getFunctionArgument)(effectiveArgs, params.dataFrame, info);
    if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) {
        return undefined;
    }
    const hasDisallowedNames = params.disallowNamedArgs && effectiveArgs.some(arguments_1.isNamedArgument);
    return [{
        operation: hasDisallowedNames ? 'unknown' : 'identity',
        operand: dataFrame.value.info.id
    }];
}
/**
 * Maps a call with an unknown effect on a data frame to an `unknown` operation.
 *
 * The data frame is the argument located by `params.dataFrame` if that is given, and otherwise
 * the first argument accepted by `isDataFrameArgument`.
 *
 * @param args - arguments of the call
 * @param params - mapper parameters; `params.dataFrame` is optional, and `params.constraintType`
 * is attached to the operation as `type` if it is defined
 * @param inference - handed to `isDataFrameArgument` to decide whether an argument is a data frame
 * @param info - resolve information forwarded to `getFunctionArgument`
 * @returns `undefined` if no data frame argument is found, otherwise a single `unknown` operation
 */
function mapDataFrameUnknown(args, params, inference, info) {
    const isDataFrame = arg => (0, arguments_1.isDataFrameArgument)(arg, inference);
    const dataFrame = params.dataFrame !== undefined
        ? (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info)
        : args.find(arg => isDataFrame(arg));
    if (!isDataFrame(dataFrame)) {
        return undefined;
    }
    const unknown = { operation: 'unknown', operand: dataFrame.value.info.id };
    if (params.constraintType !== undefined) {
        unknown.type = params.constraintType;
    }
    return [unknown];
}
/**
 * Determines the source and the parse request for the data that a read call refers to.
 *
 * The file name argument takes precedence over the text argument: as soon as a file name argument
 * is present, the text argument is not considered. A file name that resolves to a string is looked
 * up with `findSource`; if exactly one source is found, a file request for it is created.
 * Otherwise, if `params.text` is undefined and the unescaped file name contains a newline, the
 * unescaped file name itself is used as input. A text argument that resolves to a string is always
 * used as input after unescaping.
 *
 * @param fileNameArg - file name argument of the call (may be `undefined` or `EmptyArgument`)
 * @param textArg - text argument of the call (may be `undefined` or `EmptyArgument`)
 * @param params - mapper parameters; only `params.text` is read
 * @param info - resolve information; `info.ctx` provides the source resolution config and the
 * files used to resolve the created request
 * @returns the `source` (found source path, raw file name, or raw text) and the resolved
 * `request`; each is `undefined` if it could not be determined
 */
function getRequestFromRead(fileNameArg, textArg, params, info) {
    const isGiven = arg => arg !== undefined && arg !== r_function_call_1.EmptyArgument;
    let source;
    let pending;
    if (isGiven(fileNameArg)) {
        const fileName = (0, resolve_args_1.resolveIdToArgValue)(fileNameArg, info);
        if (typeof fileName === 'string') {
            const unescaped = (0, resolve_args_1.unescapeSpecialChars)(fileName);
            const referenceChain = fileNameArg.info.file ? [fileNameArg.info.file] : [];
            const candidates = (0, built_in_source_1.findSource)(info.ctx.config.solver.resolveSource, fileName, { referenceChain, ctx: info.ctx });
            if (candidates?.length === 1) {
                // create the request from the resolved source file path
                source = candidates[0];
                pending = { request: 'file', content: candidates[0] };
            }
            else {
                source = fileName;
                if (params.text === undefined && unescaped.includes('\n')) {
                    // create the request from the string if the file name argument contains a newline
                    pending = (0, retriever_1.requestFromInput)(unescaped);
                }
            }
        }
    }
    else if (isGiven(textArg)) {
        const text = (0, resolve_args_1.resolveIdToArgValue)(textArg, info);
        if (typeof text === 'string') {
            source = text;
            pending = (0, retriever_1.requestFromInput)((0, resolve_args_1.unescapeSpecialChars)(text));
        }
    }
    const request = pending ? info.ctx.files.resolveRequest(pending).r : undefined;
    return { source, request };
}
/**
* Gets all entries from a line of a CSV file u