UNPKG

@eagleoutice/flowr

Version:

Static Dataflow Analyzer and Program Slicer for the R Programming Language

1,136 lines 54.4 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.mapDataFrameFunctionCall = mapDataFrameFunctionCall; const config_1 = require("../../../config"); const make_argument_1 = require("../../../dataflow/internal/process/functions/call/argument/make-argument"); const built_in_source_1 = require("../../../dataflow/internal/process/functions/call/built-in/built-in-source"); const r_function_call_1 = require("../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call"); const type_1 = require("../../../r-bridge/lang-4.x/ast/model/type"); const retriever_1 = require("../../../r-bridge/retriever"); const assert_1 = require("../../../util/assert"); const dataframe_domain_1 = require("../dataframe-domain"); const resolve_args_1 = require("../resolve-args"); const arguments_1 = require("./arguments"); const identifier_1 = require("../../../dataflow/environments/identifier"); /** * Represents the different types of data frames in R */ var DataFrameType; (function (DataFrameType) { DataFrameType["DataFrame"] = "data.frame"; DataFrameType["Tibble"] = "tibble"; DataFrameType["DataTable"] = "data.table"; })(DataFrameType || (DataFrameType = {})); /** * Mapper for mapping the supported concrete data frame functions to mapper functions, * including information about the origin library of the functions and the type of the returned data frame. */ const DataFrameFunctionMapper = { 'data.frame': { mapper: mapDataFrameCreate, library: 'base', returnType: DataFrameType.DataFrame }, 'as.data.frame': { mapper: mapDataFrameConvert, library: 'base', returnType: DataFrameType.DataFrame }, 'read.table': { mapper: mapDataFrameRead, library: 'utils', returnType: DataFrameType.DataFrame }, 'read.csv': { mapper: mapDataFrameRead, library: 'utils', returnType: DataFrameType.DataFrame }, 'read.csv2': { mapper: mapDataFrameRead, library: 'utils', returnType: DataFrameType.DataFrame }, 'read.delim': { mapper: mapDataFrameRead, library: 'utils', returnType: DataFrameType.DataFrame }, 'read.delim2': { mapper: mapDataFrameRead, library: 'utils', returnType: DataFrameType.DataFrame }, 'read_table': { mapper: mapDataFrameRead, library: 'readr', returnType: DataFrameType.Tibble }, 'read_csv': { mapper: mapDataFrameRead, library: 'readr', returnType: DataFrameType.Tibble }, 'read_csv2': { mapper: mapDataFrameRead, library: 'readr', returnType: DataFrameType.Tibble }, 'read_tsv': { mapper: mapDataFrameRead, library: 'readr', returnType: DataFrameType.Tibble }, 'read_delim': { mapper: mapDataFrameRead, library: 'readr', returnType: DataFrameType.Tibble }, 'cbind': { mapper: mapDataFrameColBind, library: 'base', returnType: DataFrameType.DataFrame }, 'rbind': { mapper: mapDataFrameRowBind, library: 'base', returnType: DataFrameType.DataFrame }, 'head': { mapper: mapDataFrameHeadTail, library: 'utils', returnType: DataFrameType.DataFrame }, 'tail': { mapper: mapDataFrameHeadTail, library: 'utils', returnType: DataFrameType.DataFrame }, 'subset': { mapper: mapDataFrameSubset, library: 'base', returnType: DataFrameType.DataFrame }, 'filter': { mapper: mapDataFrameFilter, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'select': { mapper: mapDataFrameSelect, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'mutate': { mapper: mapDataFrameMutate, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'transform': { mapper: mapDataFrameMutate, library: 'base', returnType: DataFrameType.DataFrame }, 'group_by': { mapper: mapDataFrameGroupBy, library: 'dplyr', returnType: DataFrameType.Tibble }, 'summarise': { mapper: mapDataFrameSummarize, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'summarize': { mapper: mapDataFrameSummarize, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'inner_join': { mapper: mapDataFrameJoin, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'left_join': { mapper: mapDataFrameJoin, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'right_join': { mapper: mapDataFrameJoin, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'full_join': { mapper: mapDataFrameJoin, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'merge': { mapper: mapDataFrameJoin, library: 'base', returnType: DataFrameType.DataFrame }, 'relocate': { mapper: mapDataFrameIdentity, library: 'dplyr', returnType: DataFrameType.DataFrame }, 'arrange': { mapper: mapDataFrameIdentity, library: 'dplyr', returnType: DataFrameType.DataFrame } }; /** * List of other data frame functions that are not explicitly supported but may return data frames. */ const OtherDataFrameFunctions = [ { type: 'entry_point', names: ['anova', 'AIC', 'BIC'], library: 'anova', returnType: DataFrameType.DataFrame }, { type: 'entry_point', names: ['Anova', 'Manova'], library: 'car', returnType: DataFrameType.DataFrame }, { type: 'entry_point', names: ['lmer'], library: 'lme4', returnType: DataFrameType.DataFrame }, { type: 'entry_point', names: ['data_frame', 'as_data_frame'], library: 'dplyr', returnType: DataFrameType.DataFrame }, { type: 'entry_point', names: ['tbl', 'as.tbl'], library: 'dplyr', returnType: DataFrameType.Tibble }, { type: 'entry_point', names: ['read_fwf', 'read_log'], library: 'readr', returnType: DataFrameType.Tibble }, { type: 'entry_point', names: ['read_excel', 'read_xls', 'read_xlsx'], library: 'readxl', returnType: DataFrameType.Tibble }, { type: 'entry_point', names: ['tibble', 'tibble_row', 'as_tibble', 'tribble'], library: 'tibble', returnType: DataFrameType.Tibble }, { type: 'entry_point', names: ['data.table', 'as.data.table', 'fread'], library: 'data.table', returnType: DataFrameType.DataTable }, { type: 'transformation', names: ['na.omit'], library: 'stats', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'object' } }, { type: 'transformation', names: ['unique', 't'], library: 'base', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'x' } }, { type: 'transformation', names: ['aggregate'], library: 'stats', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'x' } }, { type: 'transformation', names: ['with', 'within'], library: 'base', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'data' } }, { type: 'transformation', names: ['reshape'], library: 'stats', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'data' } }, { type: 'transformation', names: ['melt'], library: 'reshape2', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'data' } }, { type: 'transformation', names: [ 'transmute', 'distinct', 'distinct_prepare', 'group_by_prepare', 'rename', 'rename_with', 'reframe', 'slice', 'slice_head', 'slice_tail', 'slice_min', 'slice_max', 'slice_sample' ], library: 'dplyr', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: '.data' } }, { type: 'transformation', names: [ 'filter_if', 'filter_at', 'filter_all', 'select_if', 'select_at', 'select_all', 'mutate_if', 'mutate_at', 'mutate_all', 'transmute_if', 'transmute_at', 'transmute_all', 'distinct_if', 'distinct_at', 'distinct_all', 'group_by_if', 'group_by_at', 'group_by_all', 'summarize_if', 'summarise_if', 'summarize_at', 'summarise_at', 'summarize_all', 'summarise_all', 'arrange_if', 'arrange_at', 'arrange_all', 'rename_if', 'rename_at', 'rename_all' ], library: 'dplyr', returnType: DataFrameType.Tibble, dataFrame: { pos: 0, name: '.tbl' } }, { type: 'transformation', names: [ 'semi_join', 'anti_join', 'nest_join', 'cross_join', 'ungroup', 'count', 'tally', 'add_count', 'add_tally', 'rows_insert', 'rows_append', 'rows_update', 'rows_patch', 'rows_upsert', 'rows_delete' ], library: 'dplyr', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'x' } }, { type: 'transformation', names: ['bind_cols', 'bind_rows'], library: 'dplyr', returnType: DataFrameType.DataFrame }, { type: 'transformation', names: [ 'drop_na', 'replace_na', 'pivot_longer', 'pivot_wider', 'separate', 'separate_wider_position', 'separate_wider_delim', 'unite' ], library: 'tidyr', returnType: DataFrameType.DataFrame, dataFrame: { pos: 0, name: 'data' } }, { type: 'transformation', names: ['add_column', 'add_row', 'add_case'], library: 'tibble', returnType: DataFrameType.Tibble, dataFrame: { pos: 0, name: '.data' } }, { type: 'transformation', names: ['melt', 'dcast'], library: 'data.table', returnType: DataFrameType.DataTable, dataFrame: { pos: 0, name: 'data' } } ]; /** * Mapper for defining the location of all relevant function parameters for each supported data frame function of {@link DataFrameFunctionMapper}. */ const DataFrameFunctionParamsMapper = { 'data.frame': { checkNames: { pos: -1, name: 'check.names', default: true }, noDupNames: { pos: -1, name: 'check.names', default: true }, special: ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'], critical: [{ pos: -1, name: 'row.names' }] }, 'as.data.frame': { dataFrame: { pos: 0, name: 'x' }, critical: [] }, 'read.table': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'header', default: false }, separator: { pos: 2, name: 'sep', default: '\\s' }, quote: { pos: 3, name: 'quote', default: '"\'' }, skipLines: { pos: 12, name: 'skip', default: 0 }, checkNames: { pos: 13, name: 'check.names', default: true }, noDupNames: { pos: 13, name: 'check.names', default: true }, comment: { pos: 17, name: 'comment.char', default: '#' }, text: { pos: 23, name: 'text' }, critical: [ { pos: 6, name: 'row.names' }, { pos: 7, name: 'col.names' }, { pos: 11, name: 'nrows', default: -1 }, { pos: 15, name: 'strip.white', default: false }, { pos: 16, name: 'blank.lines.skip', default: true }, { pos: 18, name: 'allow.escapes', default: false }, ] }, 'read.csv': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'header', default: true }, separator: { pos: 2, name: 'sep', default: ',' }, quote: { pos: 3, name: 'quote', default: '"' }, comment: { pos: 6, name: 'comment.char', default: '' }, skipLines: { pos: -1, name: 'skip', default: 0 }, checkNames: { pos: -1, name: 'check.names', default: true }, noDupNames: { pos: -1, name: 'check.names', default: true }, text: { pos: -1, name: 'text' }, critical: [ { pos: -1, name: 'row.names' }, { pos: -1, name: 'col.names' }, { pos: -1, name: 'nrows', default: -1 }, { pos: -1, name: 'strip.white', default: false }, { pos: -1, name: 'blank.lines.skip', default: true }, { pos: -1, name: 'allow.escapes', default: false }, ] }, 'read.csv2': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'header', default: true }, separator: { pos: 2, name: 'sep', default: ';' }, quote: { pos: 3, name: 'quote', default: '"' }, comment: { pos: 6, name: 'comment.char', default: '' }, skipLines: { pos: -1, name: 'skip', default: 0 }, checkNames: { pos: -1, name: 'check.names', default: true }, noDupNames: { pos: -1, name: 'check.names', default: true }, text: { pos: -1, name: 'text' }, critical: [ { pos: -1, name: 'row.names' }, { pos: -1, name: 'col.names' }, { pos: -1, name: 'nrows', default: -1 }, { pos: -1, name: 'strip.white', default: false }, { pos: -1, name: 'blank.lines.skip', default: true }, { pos: -1, name: 'allow.escapes', default: false }, ] }, 'read.delim': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'header', default: true }, separator: { pos: 2, name: 'sep', default: '\\t' }, quote: { pos: 3, name: 'quote', default: '"' }, comment: { pos: 6, name: 'comment.char', default: '' }, skipLines: { pos: -1, name: 'skip', default: 0 }, checkNames: { pos: -1, name: 'check.names', default: true }, noDupNames: { pos: -1, name: 'check.names', default: true }, text: { pos: -1, name: 'text' }, critical: [ { pos: -1, name: 'row.names' }, { pos: -1, name: 'col.names' }, { pos: -1, name: 'nrows', default: -1 }, { pos: -1, name: 'strip.white', default: false }, { pos: -1, name: 'blank.lines.skip', default: true }, { pos: -1, name: 'allow.escapes', default: false }, ] }, 'read.delim2': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'header', default: true }, separator: { pos: 2, name: 'sep', default: '\\t' }, quote: { pos: 3, name: 'quote', default: '"' }, comment: { pos: 6, name: 'comment.char', default: '' }, skipLines: { pos: -1, name: 'skip', default: 0 }, checkNames: { pos: -1, name: 'check.names', default: true }, noDupNames: { pos: -1, name: 'check.names', default: true }, text: { pos: -1, name: 'text' }, critical: [ { pos: -1, name: 'row.names' }, { pos: -1, name: 'col.names' }, { pos: -1, name: 'nrows', default: -1 }, { pos: -1, name: 'strip.white', default: false }, { pos: -1, name: 'blank.lines.skip', default: true }, { pos: -1, name: 'allow.escapes', default: false }, ] }, 'read_table': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'col_names', default: true }, separator: { pos: -1, default: '\\s' }, quote: { pos: -1, default: '"' }, skipLines: { pos: 5, name: 'skip', default: 0 }, comment: { pos: 9, name: 'comment', default: '' }, checkNames: { pos: -1, default: false }, noDupNames: { pos: -1, default: true }, critical: [ { pos: 6, name: 'n_max', default: Infinity }, { pos: 11, name: 'skip_empty_rows', default: true } ], noEmptyNames: true }, 'read_csv': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'col_names', default: true }, separator: { pos: -1, default: ',' }, quote: { pos: 8, name: 'quote', default: '"' }, comment: { pos: 9, name: 'comment', default: '' }, skipLines: { pos: 11, name: 'skip', default: 0 }, checkNames: { pos: -1, default: false }, noDupNames: { pos: -1, default: true }, critical: [ { pos: 3, name: 'col_select' }, { pos: 4, name: 'id' }, { pos: 10, name: 'trim_ws', default: true }, { pos: 12, name: 'n_max', default: Infinity }, { pos: 14, name: 'name_repair', default: 'unique' }, { pos: 18, name: 'skip_empty_rows', default: true } ], noEmptyNames: true }, 'read_csv2': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'col_names', default: true }, separator: { pos: -1, default: ';' }, quote: { pos: 8, name: 'quote', default: '"' }, comment: { pos: 9, name: 'comment', default: '' }, skipLines: { pos: 11, name: 'skip', default: 0 }, checkNames: { pos: -1, default: false }, noDupNames: { pos: -1, default: true }, critical: [ { pos: 3, name: 'col_select' }, { pos: 4, name: 'id' }, { pos: 10, name: 'trim_ws', default: true }, { pos: 12, name: 'n_max', default: Infinity }, { pos: 14, name: 'name_repair', default: 'unique' }, { pos: 18, name: 'skip_empty_rows', default: true } ], noEmptyNames: true }, 'read_tsv': { fileName: { pos: 0, name: 'file' }, header: { pos: 1, name: 'col_names', default: true }, separator: { pos: -1, default: '\\t' }, quote: { pos: 8, name: 'quote', default: '"' }, comment: { pos: 9, name: 'comment', default: '' }, skipLines: { pos: 11, name: 'skip', default: 0 }, checkNames: { pos: -1, default: false }, noDupNames: { pos: -1, default: true }, critical: [ { pos: 3, name: 'col_select' }, { pos: 4, name: 'id' }, { pos: 10, name: 'trim_ws', default: true }, { pos: 12, name: 'n_max', default: Infinity }, { pos: 14, name: 'name_repair', default: 'unique' }, { pos: 18, name: 'skip_empty_rows', default: true } ], noEmptyNames: true }, 'read_delim': { fileName: { pos: 0, name: 'file' }, separator: { pos: 1, name: 'delim', default: '\t' }, quote: { pos: 2, name: 'quote', default: '"' }, header: { pos: 5, name: 'col_names', default: true }, comment: { pos: 12, name: 'comment', default: '' }, skipLines: { pos: 14, name: 'skip', default: 0 }, checkNames: { pos: -1, default: false }, noDupNames: { pos: -1, default: true }, critical: [ { pos: 3, name: 'escape_backslash', default: false }, { pos: 4, name: 'escape_double', default: true }, { pos: 7, name: 'col_select' }, { pos: 8, name: 'id' }, { pos: 13, name: 'trim_ws', default: false }, { pos: 15, name: 'n_max', default: Infinity }, { pos: 17, name: 'name_repair', default: 'unique' }, { pos: 21, name: 'skip_empty_rows', default: true } ], noEmptyNames: true }, 'cbind': { special: ['deparse.level', 'make.row.names', 'stringsAsFactors', 'factor.exclude'] }, 'rbind': { special: ['deparse.level', 'make.row.names', 'stringsAsFactors', 'factor.exclude'] }, 'head': { dataFrame: { pos: 0, name: 'x' }, amount: { pos: 1, name: 'n', default: 6 } }, 'tail': { dataFrame: { pos: 0, name: 'x' }, amount: { pos: 1, name: 'n', default: 6 } }, 'subset': { dataFrame: { pos: 0, name: 'x' }, subset: { pos: 1, name: 'subset' }, select: { pos: 2, name: 'select' }, drop: { pos: 3, name: 'drop', default: false } }, 'filter': { dataFrame: { pos: 0, name: '.data' }, special: ['.by', '.preserve'] }, 'select': { dataFrame: { pos: 0, name: '.data' }, special: [] }, 'mutate': { dataFrame: { pos: 0, name: '.data' }, special: ['.by', '.keep', '.before', '.after'], critical: [{ pos: -1, name: '.keep' }], checkNames: false, noDupNames: false }, 'transform': { dataFrame: { pos: 0, name: '_data' }, special: [], checkNames: true, noDupNames: true }, 'group_by': { dataFrame: { pos: 0, name: '.data' }, by: { pos: 1 }, special: ['.add', '.drop'] }, 'summarise': { dataFrame: { pos: 0, name: '.data' }, special: ['.by', '.groups'] }, 'summarize': { dataFrame: { pos: 0, name: '.data' }, special: ['.by', '.groups'] }, 'inner_join': { dataFrame: { pos: 0, name: 'x' }, otherDataFrame: { pos: 1, name: 'y' }, by: { pos: 2, name: 'by' }, joinAll: { pos: -1, default: false }, joinLeft: { pos: -1, default: false }, joinRight: { pos: -1, default: false }, critical: [{ pos: -1, name: 'keep' }] }, 'left_join': { dataFrame: { pos: 0, name: 'x' }, otherDataFrame: { pos: 1, name: 'y' }, by: { pos: 2, name: 'by' }, joinAll: { pos: -1, default: false }, joinLeft: { pos: -1, default: true }, joinRight: { pos: -1, default: false }, critical: [{ pos: -1, name: 'keep' }] }, 'right_join': { dataFrame: { pos: 0, name: 'x' }, otherDataFrame: { pos: 1, name: 'y' }, by: { pos: 2, name: 'by' }, joinAll: { pos: -1, default: false }, joinLeft: { pos: -1, default: false }, joinRight: { pos: -1, default: true }, critical: [{ pos: -1, name: 'keep' }] }, 'full_join': { dataFrame: { pos: 0, name: 'x' }, otherDataFrame: { pos: 1, name: 'y' }, by: { pos: 2, name: 'by' }, joinAll: { pos: -1, default: true }, joinLeft: { pos: -1, default: false }, joinRight: { pos: -1, default: false }, critical: [{ pos: -1, name: 'keep' }] }, 'merge': { dataFrame: { pos: 0, name: 'x' }, otherDataFrame: { pos: 1, name: 'y' }, by: { pos: 2, name: 'by' }, joinAll: { pos: 5, name: 'all', default: false }, joinLeft: { pos: 6, name: 'all.x', default: false }, joinRight: { pos: 7, name: 'all.y', default: false }, critical: [ { pos: 3, name: 'by.x' }, { pos: 4, name: 'by.y' } ] }, 'relocate': { dataFrame: { pos: 0, name: '.data' }, special: ['.before', '.after'], disallowNamedArgs: true }, 'arrange': { dataFrame: { pos: 0, name: '.data' }, special: ['.by_group', '.locale'] } }; /** * Maps a concrete data frame function call to abstract data frame operations. * @param node - The R node of the function call * @param inference - The data frame shape inference visitor * @param dfg - The data flow graph for resolving the arguments * @param ctx - The current flowR analyzer context * @returns The mapped abstract data frame operations for the function call, or `undefined` if the node does not represent a data frame function call */ function mapDataFrameFunctionCall(node, inference, dfg, ctx) { if (node.type !== type_1.RType.FunctionCall || !node.named) { return; } const resolveInfo = { graph: dfg, idMap: dfg.idMap, full: true, resolve: config_1.VariableResolve.Alias, ctx }; const n = identifier_1.Identifier.getName(node.functionName.content); if (isDataFrameFunction(n)) { const functionName = n; const mapper = DataFrameFunctionMapper[functionName].mapper; const params = DataFrameFunctionParamsMapper[functionName]; const args = (0, arguments_1.getFunctionArguments)(node, dfg); if ((0, arguments_1.hasCriticalArgument)(args, params.critical, resolveInfo)) { return [{ operation: 'unknown', operand: undefined }]; } else { return mapper(args, params, inference, resolveInfo); } } else { const mapping = getOtherDataFrameFunction(identifier_1.Identifier.getName(node.functionName.content)); if (mapping === undefined) { return; } else if (mapping.type === 'entry_point') { return [{ operation: 'unknown', operand: undefined }]; } else if (mapping.type === 'transformation' || mapping.type === 'modification') { const args = (0, arguments_1.getFunctionArguments)(node, dfg); return mapDataFrameUnknown(args, mapping, inference, resolveInfo); } else { (0, assert_1.assertUnreachable)(mapping); } } } function isDataFrameFunction(functionName) { // a check with `functionName in DataFrameFunctionMapper` would return true for "toString" return Object.hasOwn(DataFrameFunctionMapper, functionName); } function getOtherDataFrameFunction(functionName) { return OtherDataFrameFunctions.find(entry => entry.names.includes(functionName)); } function mapDataFrameCreate(args, params, inference, info) { const checkNames = (0, arguments_1.getArgumentValue)(args, params.checkNames, info); const noDupNames = (0, arguments_1.getArgumentValue)(args, params.noDupNames, info); args = (0, arguments_1.getEffectiveArgs)(args, params.special); const argNames = args.map(arg => (0, resolve_args_1.resolveIdToArgName)(arg, info)); const argLengths = args.map(arg => (0, resolve_args_1.resolveIdToArgVectorLength)(arg, info)); const allVectors = argLengths.every(assert_1.isNotUndefined); const rows = allVectors ? Math.max(...argLengths, 0) : undefined; let colnames = argNames; // over-approximate the column names if arguments are present but cannot be resolved to values if (!allVectors || typeof checkNames !== 'boolean' || typeof noDupNames !== 'boolean') { colnames = undefined; } else if (rows === 0) { colnames = []; } else { colnames = (0, arguments_1.filterValidNames)(colnames, checkNames, noDupNames); } return [{ operation: 'create', operand: undefined, colnames, rows }]; } function mapDataFrameConvert(args, params, inference, info) { const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (dataFrame === r_function_call_1.EmptyArgument || dataFrame?.value === undefined) { return [{ operation: 'unknown', operand: undefined }]; } return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } function mapDataFrameRead(args, params, inference, info) { const fileNameArg = (0, arguments_1.getFunctionArgument)(args, params.fileName, info); const textArg = params.text ? (0, arguments_1.getFunctionArgument)(args, params.text, info) : undefined; const { source, request } = getRequestFromRead(fileNameArg, textArg, params, info); const header = (0, arguments_1.getArgumentValue)(args, params.header, info); const separator = (0, arguments_1.getArgumentValue)(args, params.separator, info); const quote = (0, arguments_1.getArgumentValue)(args, params.quote, info); const comment = (0, arguments_1.getArgumentValue)(args, params.comment, info); const skipLines = (0, arguments_1.getArgumentValue)(args, params.skipLines, info); const checkNames = (0, arguments_1.getArgumentValue)(args, params.checkNames, info); const noDupNames = (0, arguments_1.getArgumentValue)(args, params.noDupNames, info); const validArguments = typeof header === 'boolean' && typeof separator === 'string' && typeof quote === 'string' && typeof comment === 'string' && typeof skipLines === 'number' && typeof checkNames === 'boolean' && typeof noDupNames === 'boolean'; if (request === undefined || !info.ctx.config.abstractInterpretation.dataFrame.readLoadedData.readExternalFiles || !validArguments) { return [{ operation: 'read', operand: undefined, source, colnames: undefined, rows: undefined }]; } const LineCommentRegex = new RegExp(`\\s*[${(0, arguments_1.escapeRegExp)(comment, true)}].*`); let firstLine = undefined; let firstLineNumber = 0; let rowCount = 0; const parseLine = (line, lineNumber) => { const text = comment ? line.toString().replace(LineCommentRegex, '') : line.toString(); if (text.length > 0 && lineNumber >= (skipLines ?? 0)) { if (firstLine === undefined) { firstLine = getEntriesFromCsvLine(text, separator, quote, comment); firstLineNumber = lineNumber; } if (!header || lineNumber > firstLineNumber) { rowCount++; } } }; const allLines = (0, arguments_1.parseRequestContent)(request, parseLine, info.ctx.config.abstractInterpretation.dataFrame.readLoadedData.maxReadLines); let colnames; if (header) { colnames = (0, arguments_1.filterValidNames)(firstLine, checkNames, noDupNames, params.noEmptyNames); } else if (firstLine !== undefined) { colnames = Array(firstLine.length).fill(undefined); } return [{ operation: 'read', operand: undefined, source, colnames, rows: allLines ? rowCount : [rowCount, Infinity] }]; } function mapDataFrameColBind(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = args.find(arg => (0, arguments_1.isDataFrameArgument)(arg, inference)); if (dataFrame === undefined) { return; } else if (args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } const result = []; let operand = dataFrame.value; let colnames = []; for (const arg of args) { if (arg !== dataFrame && arg !== r_function_call_1.EmptyArgument) { const otherDataFrame = inference.getAbstractValue(arg.value); if (otherDataFrame !== undefined) { result.push({ operation: 'concatCols', operand: operand?.info.id, other: otherDataFrame }); operand = undefined; // added columns are top if argument cannot be resolved to constant (vector-like) value } else if ((0, resolve_args_1.resolveIdToArgValue)(arg, info) !== undefined) { const colname = (0, resolve_args_1.resolveIdToArgName)(arg, info); colnames?.push(colname); } else { colnames = undefined; } } } if (colnames === undefined || colnames.length > 0) { result.push({ operation: 'addCols', operand: operand?.info.id, colnames }); } return result; } function mapDataFrameRowBind(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = args.find(arg => (0, arguments_1.isDataFrameArgument)(arg, inference)); if (dataFrame === undefined) { return; } else if (args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } const result = []; let operand = dataFrame.value; let rows = 0; for (const arg of args) { if (arg !== dataFrame && arg !== r_function_call_1.EmptyArgument) { const otherDataFrame = inference.getAbstractValue(arg.value); if (otherDataFrame !== undefined) { result.push({ operation: 'concatRows', operand: operand?.info.id, other: otherDataFrame }); operand = undefined; // number of added rows is top if arguments cannot be resolved to constant (vector-like) value } else if ((0, resolve_args_1.resolveIdToArgValue)(arg, info) !== undefined) { rows = rows !== undefined ? rows + 1 : undefined; } else { rows = undefined; } } } if (rows === undefined || rows > 0) { result.push({ operation: 'addRows', operand: operand?.info.id, rows }); } return result; } function mapDataFrameHeadTail(args, params, inference, info) { const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } const result = []; const amount = (0, arguments_1.getArgumentValue)(args, params.amount, info); let rows = undefined; let cols = undefined; if (typeof amount === 'number') { rows = amount; } else if (Array.isArray(amount) && amount.length <= 2 && amount.every(value => typeof value === 'number')) { rows = amount[0]; cols = amount[1]; } result.push({ operation: rows === undefined || rows >= 0 ? 'subsetRows' : 'removeRows', operand: dataFrame.value.info.id, rows: rows !== undefined ? Math.abs(rows) : undefined }); if (cols !== undefined) { result.push({ operation: cols >= 0 ? 'subsetCols' : 'removeCols', operand: undefined, colnames: Array(Math.abs(cols)).fill(undefined) }); } return result; } function mapDataFrameSubset(args, params, inference, info) { const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } else if (args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } const result = []; let operand = dataFrame.value; const filterArg = (0, arguments_1.getFunctionArgument)(args, params.subset, info); const filterValue = (0, resolve_args_1.resolveIdToArgValue)(filterArg, info); const selectArg = (0, arguments_1.getFunctionArgument)(args, params.select, info); const dropArg = (0, arguments_1.getFunctionArgument)(args, params.drop, info); const condition = typeof filterValue === 'boolean' ? filterValue : undefined; const filterNames = (0, arguments_1.getUnresolvedSymbolsInExpression)(filterArg, info.graph); const { selectedCols, unselectedCols } = getSelectedColumns([selectArg], info); const accessedCols = [...filterNames, ...selectedCols ?? [], ...unselectedCols ?? []]; const mixedAccess = accessedCols.some(col => typeof col === 'string') && accessedCols.some(col => typeof col === 'number'); const duplicateCols = accessedCols.some((col, index, list) => col !== undefined && list.indexOf(col) !== index); if (accessedCols.some(col => typeof col === 'string')) { result.push({ operation: 'accessCols', operand: operand?.info.id, columns: accessedCols.filter(col => typeof col === 'string') }); } if (accessedCols.some(col => typeof col === 'number')) { result.push({ operation: 'accessCols', operand: operand?.info.id, columns: accessedCols.filter(col => typeof col === 'number').map(Math.abs) }); } if (filterArg !== undefined && filterArg !== r_function_call_1.EmptyArgument) { result.push({ operation: 'filterRows', operand: operand?.info.id, condition: condition }); operand = undefined; } if (!dropArg || accessedCols.length > 1) { if (unselectedCols === undefined || unselectedCols.length > 0) { result.push({ operation: 'removeCols', operand: operand?.info.id, colnames: unselectedCols?.map(col => typeof col === 'string' ? col : undefined) }); operand = undefined; } if (selectedCols === undefined || selectedCols.length > 0) { result.push({ operation: 'subsetCols', operand: operand?.info.id, colnames: selectedCols?.map(col => typeof col === 'string' ? col : undefined), ...(duplicateCols || mixedAccess ? { options: { duplicateCols: true } } : {}) }); operand = undefined; } } return result; } function mapDataFrameFilter(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } else if (args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } const result = []; const filterArgs = args.filter(arg => arg !== dataFrame); const filterValues = filterArgs.map(arg => (0, resolve_args_1.resolveIdToArgValue)(arg, info)); const accessedNames = filterArgs.flatMap(arg => (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph).map(identifier_1.Identifier.getName)); const condition = filterValues.every(value => typeof value === 'boolean') ? filterValues.every(cond => cond) : undefined; if (accessedNames.length > 0) { result.push({ operation: 'accessCols', operand: dataFrame.value.info.id, columns: accessedNames }); } result.push({ operation: 'filterRows', operand: dataFrame.value.info.id, condition: condition }); return result; } function mapDataFrameSelect(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } const result = []; let operand = dataFrame.value; const selectArgs = args.filter(arg => arg !== dataFrame); let { selectedCols, unselectedCols } = getSelectedColumns(selectArgs, info); const accessedCols = [...selectedCols ?? [], ...unselectedCols ?? []]; const mixedAccess = accessedCols.some(col => typeof col === 'string') && accessedCols.some(col => typeof col === 'number'); const duplicateAccess = accessedCols.some((col, _, list) => col !== undefined && list.filter(other => other === col).length > 1); const renamedCols = selectArgs.some(arguments_1.isNamedArgument); // map to top if columns are selected mixed by string and number, or are selected duplicate if (mixedAccess || duplicateAccess) { selectedCols = undefined; unselectedCols = []; } if (accessedCols.some(col => typeof col === 'string')) { result.push({ operation: 'accessCols', operand: operand?.info.id, columns: accessedCols.filter(col => typeof col === 'string') }); } if (accessedCols.some(col => typeof col === 'number')) { result.push({ operation: 'accessCols', operand: operand?.info.id, columns: accessedCols.filter(col => typeof col === 'number').map(Math.abs) }); } if (unselectedCols === undefined || unselectedCols.length > 0) { result.push({ operation: 'removeCols', operand: operand?.info.id, colnames: unselectedCols?.map(col => typeof col === 'string' ? col : undefined) }); operand = undefined; } if (selectedCols === undefined || selectedCols.length > 0 || unselectedCols?.length === 0) { result.push({ operation: 'subsetCols', operand: operand?.info.id, colnames: selectedCols?.map(col => typeof col === 'string' ? col : undefined), ...(renamedCols ? { options: { renamedCols: true } } : {}) }); operand = undefined; } return result; } function mapDataFrameMutate(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } else if (args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } const result = []; let operand = dataFrame.value; const mutateArgs = args.filter(arg => arg !== dataFrame); let deletedCols = mutateArgs .filter(arguments_1.isRNull) .map(arg => (0, resolve_args_1.resolveIdToArgName)(arg, info)); let mutatedCols = mutateArgs .filter(arg => !(0, arguments_1.isRNull)(arg)) .map(arg => (0, resolve_args_1.resolveIdToArgName)(arg, info)); // only column names that are not created by mutation are preconditions on the operand const accessedNames = mutateArgs .flatMap(arg => (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph).map(identifier_1.Identifier.toString)) .filter(arg => !mutatedCols?.includes(arg)); deletedCols = (0, arguments_1.filterValidNames)(deletedCols, params.checkNames, params.noDupNames, undefined, true); mutatedCols = (0, arguments_1.filterValidNames)(mutatedCols, params.checkNames, params.noDupNames, undefined, true); if (accessedNames.length > 0) { result.push({ operation: 'accessCols', operand: operand?.info.id, columns: accessedNames }); } if (mutatedCols === undefined || mutatedCols.length > 0 || deletedCols?.length === 0) { result.push({ operation: 'mutateCols', operand: operand?.info.id, colnames: mutatedCols }); operand = undefined; } if (deletedCols === undefined || deletedCols.length > 0) { result.push({ operation: 'removeCols', operand: operand?.info.id, colnames: deletedCols, options: { maybe: true } }); operand = undefined; } return result; } function mapDataFrameGroupBy(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } else if (args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } const result = []; const byArgs = args.filter(arg => arg !== dataFrame); const accessedNames = byArgs.flatMap(arg => (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph)).map(identifier_1.Identifier.toString); const byNames = byArgs.map(arg => (0, arguments_1.isNamedArgument)(arg) ? (0, resolve_args_1.resolveIdToArgName)(arg, info) : (0, resolve_args_1.resolveIdToArgValueSymbolName)(arg, info)); const mutatedCols = byArgs.some(arguments_1.isNamedArgument) || byNames.some(assert_1.isUndefined); if (accessedNames.length > 0) { result.push({ operation: 'accessCols', operand: dataFrame.value.info.id, columns: accessedNames }); } result.push({ operation: 'groupBy', operand: dataFrame.value.info.id, by: byNames, ...(mutatedCols ? { options: { mutatedCols: true } } : {}) }); return result; } function mapDataFrameSummarize(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } const result = []; const summarizeArgs = args.filter(arg => arg !== dataFrame); const summarizedCols = summarizeArgs.map(arg => (0, resolve_args_1.resolveIdToArgName)(arg, info)); // only column names that are not created by summarize are preconditions on the operand const accessedNames = summarizeArgs .flatMap(arg => (0, arguments_1.getUnresolvedSymbolsInExpression)(arg, info.graph).map(identifier_1.Identifier.toString)) .filter(arg => !summarizedCols.includes(arg)); if (accessedNames.length > 0) { result.push({ operation: 'accessCols', operand: dataFrame.value.info.id, columns: accessedNames }); } result.push({ operation: 'summarize', operand: dataFrame.value.info.id, colnames: summarizedCols }); return result; } function mapDataFrameJoin(args, params, inference, info) { const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); const joinAll = (0, arguments_1.getArgumentValue)(args, params.joinAll, info); const joinLeft = (0, arguments_1.getArgumentValue)(args, params.joinLeft, info); const joinRight = (0, arguments_1.getArgumentValue)(args, params.joinRight, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } else if (args.length === 1) { return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } else if (typeof joinAll !== 'boolean' || typeof joinLeft !== 'boolean' || typeof joinRight !== 'boolean') { return [{ operation: 'unknown', operand: dataFrame.value.info.id }]; } const result = []; const otherArg = (0, arguments_1.getFunctionArgument)(args, params.otherDataFrame, info); const byArg = (0, arguments_1.getFunctionArgument)(args, params.by, info); const otherDataFrame = inference.getAbstractValue(otherArg) ?? dataframe_domain_1.DataFrameDomain.top(info.ctx.config.abstractInterpretation.dataFrame.maxColNames); let byCols; const joinType = getJoinType(joinAll, joinLeft, joinRight); if (byArg !== undefined) { const byValue = (0, resolve_args_1.resolveIdToArgValue)(byArg, info); if (typeof byValue === 'string' || typeof byValue === 'number') { byCols = [byValue]; } else if (Array.isArray(byValue) && (byValue.every(by => typeof by === 'string') || byValue.every(by => typeof by === 'number'))) { byCols = byValue; } } if (byCols?.some(by => typeof by === 'string')) { result.push({ operation: 'accessCols', operand: dataFrame.value.info.id, columns: byCols.filter(by => typeof by === 'string') }); } if (byCols?.some(by => typeof by === 'number')) { result.push({ operation: 'accessCols', operand: dataFrame.value.info.id, columns: byCols.filter(by => typeof by === 'number') }); } result.push({ operation: 'join', operand: dataFrame.value.info.id, other: otherDataFrame, by: byCols?.map(by => typeof by === 'string' ? by : undefined), options: { join: joinType, natural: byArg === undefined } }); return result; } function mapDataFrameIdentity(args, params, inference, info) { args = (0, arguments_1.getEffectiveArgs)(args, params.special); const dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } else if (params.disallowNamedArgs && args.some(arguments_1.isNamedArgument)) { return [{ operation: 'unknown', operand: dataFrame.value.info.id }]; } return [{ operation: 'identity', operand: dataFrame.value.info.id }]; } function mapDataFrameUnknown(args, params, inference, info) { let dataFrame; if (params.dataFrame !== undefined) { dataFrame = (0, arguments_1.getFunctionArgument)(args, params.dataFrame, info); } else { dataFrame = args.find(arg => (0, arguments_1.isDataFrameArgument)(arg, inference)); } if (!(0, arguments_1.isDataFrameArgument)(dataFrame, inference)) { return; } return [{ operation: 'unknown', operand: dataFrame.value.info.id, ...(params.constraintType !== undefined ? { type: params.constraintType } : {}) }]; } function getRequestFromRead(fileNameArg, textArg, params, info) { let source; let request; if (fileNameArg !== undefined && fileNameArg !== r_function_call_1.EmptyArgument) { const fileName = (0, resolve_args_1.resolveIdToArgValue)(fileNameArg, info); if (typeof fileName === 'string') { const text = (0, resolve_args_1.unescapeSpecialChars)(fileName); source = fileName; const referenceChain = fileNameArg.info.file ? [fileNameArg.info.file] : []; const sources = (0, built_in_source_1.findSource)(info.ctx.config.solver.resolveSource, fileName, { referenceChain, ctx: info.ctx }); if (sources?.length === 1) { source = sources[0]; // create request from resolved source file path request = { request: 'file', content: sources[0] }; } else if (params.text === undefined && text.includes('\n')) { // create request from string if file name argument contains newline request = (0, retriever_1.requestFromInput)(text); } } } else if (textArg !== undefined && textArg !== r_function_call_1.EmptyArgument) { const text = (0, resolve_args_1.resolveIdToArgValue)(textArg, info); if (typeof text === 'string') { source = text; request = (0, retriever_1.requestFromInput)((0, resolve_args_1.unescapeSpecialChars)(text)); } } request = request ? info.ctx.files.resolveRequest(request).r : undefined; return { source, request }; } /** * Gets all entries from a line of a CSV file u