data-profile
Version: 
A minimal TypeScript package that profiles arrays of records (data-frame-like) for LLM context.
691 lines (668 loc) • 19 kB
Markdown
# data-profile
A minimal TypeScript package that profiles arrays of records (data-frame-like data) and produces a compact summary suitable for use as LLM context.
## Installation
```bash
npm install data-profile
```
## Usage
```typescript
import profile from 'data-profile';
const data = [
  { id: 1, name: 'Alice', age: 30, active: true },
  { id: 2, name: 'Bob', age: 25, active: false },
  { id: 3, name: 'Charlie', age: 35, active: true }
];
// Basic profiling
const summary = profile(data);
// With advanced features
const detailed = profile(data, {
  sampleSize: 10,
  associationMatrix: true,
  keysDependencies: true,
  missingnessPatterns: true,
  outliers: true,
  categoricalEntropy: true
});
```
## Features
- **Basic Stats**: Row count, column types, missing values
- **Numeric Analysis**: Min/max, mean, median, quartiles, 10th/90th percentiles, sample standard deviation
- **Categorical Analysis**: Unique value count, top 10 most frequent values, Herfindahl–Hirschman concentration index (HHI, on a 0–10,000 scale)
- **Associations**: Pearson correlation (numeric–numeric), Cramér's V (categorical–categorical), eta-squared (numeric–categorical)
- **Data Quality**: Primary key candidates, functional dependencies, outlier counts (Tukey fences and z-score)
- **Missing Patterns**: Per-column missing rates and the column pairs most often missing together
- **Entropy**: Shannon entropy (in bits) of categorical variables, plus the share of values falling outside the top 10
- **Customizable Sampling**: Control number of sample rows returned (default: 5)
## Output Structure
```typescript
type ProfileSummary = {
  rowCount: number;
  columns: string[];
  columnStats: Record<string, {
    present: number;
    missing: number;
    types: string[];
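    numericStats?: {
      min: number;
      max: number;
      mean: number;
      median: number;
      quartiles: [number, number, number]; // Q1, median, Q3
      stddev: number;
      percentiles: Record<string, number>; // keyed by percentile, e.g. "10", "90"
    };
    categoricalStats?: {
      unique: number;
      top10: Array<{ value: unknown; count: number }>;
      hhi: number;
    };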
  }>;
  sampleRows: Array<Record<string, unknown>>;
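  // Optional advanced sections, one per enabled option:
  // associationMatrix, keysAndDependencies, missingnessPatterns, outliers, categoricalEntropy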
};
```
## Real Dataset Example
Here's an example using the Titanic dataset:
```typescript
const basicProfile = profile(titanicSample);
console.log(JSON.stringify(basicProfile, null, 2));
```
```json
{
    "rowCount": 891,
    "columns": [
        "Age",
        "Cabin",
        "Embarked",
        "Fare",
        "Name",
        "Parch",
        "PassengerId",
        "Pclass",
        "Sex",
        "SibSp",
        "Survived",
        "Ticket"
    ],
    "columnStats": {
        "Age": {
            "present": 714,
            "missing": 177,
            "types": [
                "number"
            ],
            "numericStats": {
                "min": 0.42,
                "max": 80,
                "mean": 29.69911764705882,
                "median": 28,
                "quartiles": [
                    20.125,
                    28,
                    38
                ],
                "stddev": 14.526497332334044,
                "percentiles": {
                    "10": 14,
                    "90": 50
                }
            }
        },
        "Cabin": {
            "present": 204,
            "missing": 687,
            "types": [
                "string"
            ],
            "categoricalStats": {
                "unique": 147,
                "top10": [
                    {
                        "value": "G6",
                        "count": 4
                    },
                    {
                        "value": "C23 C25 C27",
                        "count": 4
                    },
                    {
                        "value": "B96 B98",
                        "count": 4
                    },
                    {
                        "value": "F33",
                        "count": 3
                    },
                    {
                        "value": "E101",
                        "count": 3
                    },
                    {
                        "value": "F2",
                        "count": 3
                    },
                    {
                        "value": "D",
                        "count": 3
                    },
                    {
                        "value": "C22 C26",
                        "count": 3
                    },
                    {
                        "value": "C123",
                        "count": 2
                    },
                    {
                        "value": "D33",
                        "count": 2
                    }
                ],
                "hhi": 83.14109957708554
            }
        },
        "Embarked": {
            "present": 889,
            "missing": 2,
            "types": [
                "string"
            ],
            "categoricalStats": {
                "unique": 3,
                "top10": [
                    {
                        "value": "S",
                        "count": 644
                    },
                    {
                        "value": "C",
                        "count": 168
                    },
                    {
                        "value": "Q",
                        "count": 77
                    }
                ],
                "hhi": 5679.8313596627195
            }
        },
        "Fare": {
            "present": 891,
            "missing": 0,
            "types": [
                "number"
            ],
            "numericStats": {
                "min": 0,
                "max": 512.3292,
                "mean": 32.2042079685746,
                "median": 14.4542,
                "quartiles": [
                    7.9104,
                    14.4542,
                    31
                ],
                "stddev": 49.693428597180905,
                "percentiles": {
                    "10": 7.55,
                    "90": 77.9583
                }
            }
        },
        "Name": {
            "present": 891,
            "missing": 0,
            "types": [
                "string"
            ],
            "categoricalStats": {
                "unique": 891,
                "top10": [
                    {
                        "value": "Braund, Mr. Owen Harris",
                        "count": 1
                    },
                    {
                        "value": "Cumings, Mrs. John Bradley (Florence Briggs Thayer)",
                        "count": 1
                    },
                    {
                        "value": "Heikkinen, Miss. Laina",
                        "count": 1
                    },
                    {
                        "value": "Futrelle, Mrs. Jacques Heath (Lily May Peel)",
                        "count": 1
                    },
                    {
                        "value": "Allen, Mr. William Henry",
                        "count": 1
                    },
                    {
                        "value": "Moran, Mr. James",
                        "count": 1
                    },
                    {
                        "value": "McCarthy, Mr. Timothy J",
                        "count": 1
                    },
                    {
                        "value": "Palsson, Master. Gosta Leonard",
                        "count": 1
                    },
                    {
                        "value": "Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",
                        "count": 1
                    },
                    {
                        "value": "Nasser, Mrs. Nicholas (Adele Achem)",
                        "count": 1
                    }
                ],
                "hhi": 11.223344556677977
            }
        },
        "Parch": {
            "present": 891,
            "missing": 0,
            "types": [
                "number"
            ],
            "numericStats": {
                "min": 0,
                "max": 6,
                "mean": 0.38159371492704824,
                "median": 0,
                "quartiles": [
                    0,
                    0,
                    0
                ],
                "stddev": 0.8060572211299559,
                "percentiles": {
                    "10": 0,
                    "90": 2
                }
            }
        },
        "PassengerId": {
            "present": 891,
            "missing": 0,
            "types": [
                "number"
            ],
            "numericStats": {
                "min": 1,
                "max": 891,
                "mean": 446,
                "median": 446,
                "quartiles": [
                    223.5,
                    446,
                    668.5
                ],
                "stddev": 257.3538420152301,
                "percentiles": {
                    "10": 90,
                    "90": 802
                }
            }
        },
        "Pclass": {
            "present": 891,
            "missing": 0,
            "types": [
                "number"
            ],
            "numericStats": {
                "min": 1,
                "max": 3,
                "mean": 2.308641975308642,
                "median": 3,
                "quartiles": [
                    2,
                    3,
                    3
                ],
                "stddev": 0.8360712409770513,
                "percentiles": {
                    "10": 1,
                    "90": 3
                }
            }
        },
        "Sex": {
            "present": 891,
            "missing": 0,
            "types": [
                "string"
            ],
            "categoricalStats": {
                "unique": 2,
                "top10": [
                    {
                        "value": "male",
                        "count": 577
                    },
                    {
                        "value": "female",
                        "count": 314
                    }
                ],
                "hhi": 5435.638338743466
            }
        },
        "SibSp": {
            "present": 891,
            "missing": 0,
            "types": [
                "number"
            ],
            "numericStats": {
                "min": 0,
                "max": 8,
                "mean": 0.5230078563411896,
                "median": 0,
                "quartiles": [
                    0,
                    0,
                    1
                ],
                "stddev": 1.1027434322934275,
                "percentiles": {
                    "10": 0,
                    "90": 1
                }
            }
        },
        "Survived": {
            "present": 891,
            "missing": 0,
            "types": [
                "number"
            ],
            "numericStats": {
                "min": 0,
                "max": 1,
                "mean": 0.3838383838383838,
                "median": 0,
                "quartiles": [
                    0,
                    0,
                    1
                ],
                "stddev": 0.4865924542648585,
                "percentiles": {
                    "10": 0,
                    "90": 1
                }
            }
        },
        "Ticket": {
            "present": 891,
            "missing": 0,
            "types": [
                "string",
                "number"
            ]
        }
    },
    "sampleRows": [
        {
            "Age": 22,
            "Cabin": null,
            "Embarked": "S",
            "Fare": 7.25,
            "Name": "Braund, Mr. Owen Harris",
            "Parch": 0,
            "PassengerId": 1,
            "Pclass": 3,
            "Sex": "male",
            "SibSp": 1,
            "Survived": 0,
            "Ticket": "A/5 21171"
        }
        ...
    ]
}
```
**Advanced Features Output:**
```typescript
const advancedProfile = profile(titanicSample, {
  associationMatrix: true,
  keysDependencies: true,
  missingnessPatterns: true,
  outliers: true,
  categoricalEntropy: true
});
```
```json
{
    "associationMatrix": {
        "Age": {
            "Cabin": 0.84487746358188,
            "Embarked": 0.0017926615874587443,
            "Fare": 0.09606669176903883,
            "Name": 1,
            "Parch": -0.18911926263203518,
            "PassengerId": 0.03684719786132784,
            "Pclass": -0.36922601531551574,
            "Sex": 0.008696229596377726,
            "SibSp": -0.3082467589236574,
            "Survived": -0.07722109457217737
        },
        "Cabin": {
            "Age": 0.84487746358188,
            "Embarked": 0.9492663573697424,
            "Fare": 0.8853684889490768,
            "Name": 0.9999999999998898,
            "Parch": 0.9406806543707776,
            "PassengerId": 0.8050749169584301,
            "Pclass": 1.0000000000000016,
            "Sex": 0.85902989670101,
            "SibSp": 0.9294261294261293,
            "Survived": 0.7904411764705864
        },
        "Embarked": {
            "Age": 0.0017926615874587443,
            "Cabin": 0.9492663573697424,
            "Fare": 0.07927065096791176,
            "Name": 0.9999999999999969,
            "Parch": 0.00723127712686622,
            "PassengerId": 0.0011747820348445024,
            "Pclass": 0.09501742839465667,
            "Sex": 0.12256919037251322,
            "SibSp": 0.004906413368023388,
            "Survived": 0.029796568998017418
        },
        "Fare": {
            "Age": 0.09606669176903883,
            "Cabin": 0.8853684889490768,
            "Embarked": 0.07927065096791176,
            "Name": 1,
            "Parch": 0.21622494477076254,
            "PassengerId": 0.012658219287491229,
            "Pclass": -0.5494996199439062,
            "Sex": 0.033245262282584974,
            "SibSp": 0.15965104324216106,
            "Survived": 0.2573065223849618
        },
        "Name": {
            "Age": 1,
            "Cabin": 0.9999999999998898,
            "Embarked": 0.9999999999999969,
            "Fare": 1,
            "Parch": 1,
            "PassengerId": 1,
            "Pclass": 1,
            "Sex": 0.9999999999999973,
            "SibSp": 1,
            "Survived": 1
        },
        "Parch": {
            "Age": -0.18911926263203518,
            "Cabin": 0.9406806543707776,
            "Embarked": 0.00723127712686622,
            "Fare": 0.21622494477076254,
            "Name": 1,
            "PassengerId": -0.0016520124027188283,
            "Pclass": 0.01844267131074835,
            "Sex": 0.06026482952641173,
            "SibSp": 0.41483769862015263,
            "Survived": 0.08162940708348221
        },
        "PassengerId": {
            "Age": 0.03684719786132784,
            "Cabin": 0.8050749169584301,
            "Embarked": 0.0011747820348445024,
            "Fare": 0.012658219287491229,
            "Name": 1,
            "Parch": -0.0016520124027188283,
            "Pclass": -0.03514399403037966,
            "Sex": 0.0018437474224206054,
            "SibSp": -0.057526833784441705,
            "Survived": -0.005006660767066476
        },
        "Pclass": {
            "Age": -0.36922601531551574,
            "Cabin": 1.0000000000000016,
            "Embarked": 0.09501742839465667,
            "Fare": -0.5494996199439062,
            "Name": 1,
            "Parch": 0.01844267131074835,
            "PassengerId": -0.03514399403037966,
            "Sex": 0.017397739421770773,
            "SibSp": 0.08308136284568661,
            "Survived": -0.3384810359610158
        },
        "Sex": {
            "Age": 0.008696229596377726,
            "Cabin": 0.85902989670101,
            "Embarked": 0.12256919037251322,
            "Fare": 0.033245262282584974,
            "Name": 0.9999999999999973,
            "Parch": 0.06026482952641173,
            "PassengerId": 0.0018437474224206054,
            "Pclass": 0.017397739421770773,
            "SibSp": 0.013140222690535797,
            "Survived": 0.29523072286268753
        },
        "SibSp": {
            "Age": -0.3082467589236574,
            "Cabin": 0.9294261294261293,
            "Embarked": 0.004906413368023388,
            "Fare": 0.15965104324216106,
            "Name": 1,
            "Parch": 0.41483769862015263,
            "PassengerId": -0.057526833784441705,
            "Pclass": 0.08308136284568661,
            "Sex": 0.013140222690535797,
            "Survived": -0.03532249888573588
        },
        "Survived": {
            "Age": -0.07722109457217737,
            "Cabin": 0.7904411764705864,
            "Embarked": 0.029796568998017418,
            "Fare": 0.2573065223849618,
            "Name": 1,
            "Parch": 0.08162940708348221,
            "PassengerId": -0.005006660767066476,
            "Pclass": -0.3384810359610158,
            "Sex": 0.29523072286268753,
            "SibSp": -0.03532249888573588
        },
        "Ticket": {}
    },
    "keysAndDependencies": {
        "candidatePrimaryKeys": [
            [
                "Name"
            ],
            [
                "PassengerId"
            ]
        ],
        "functionalDependencies": [
            {
                "from": [
                    "Cabin"
                ],
                "to": "Pclass"
            },
            {
                "from": [
                    "Ticket"
                ],
                "to": "Pclass"
            }
        ]
    },
    "missingnessPatterns": {
        "perColumnRates": {
            "Age": 0.19865319865319866,
            "Cabin": 0.7710437710437711,
            "Embarked": 0.002244668911335578,
            "Fare": 0,
            "Name": 0,
            "Parch": 0,
            "PassengerId": 0,
            "Pclass": 0,
            "Sex": 0,
            "SibSp": 0,
            "Survived": 0,
            "Ticket": 0
        },
        "topCoMissingPairs": [
            {
                "pair": [
                    "Age",
                    "Cabin"
                ],
                "count": 158
            }
        ]
    },
    "outliers": {
        "Age": {
            "tukeyCount": 11,
            "zscoreCount": 2
        },
        "Fare": {
            "tukeyCount": 116,
            "zscoreCount": 20
        },
        "Parch": {
            "tukeyCount": 213,
            "zscoreCount": 15
        },
        "PassengerId": {
            "tukeyCount": 0,
            "zscoreCount": 0
        },
        "Pclass": {
            "tukeyCount": 0,
            "zscoreCount": 0
        },
        "SibSp": {
            "tukeyCount": 46,
            "zscoreCount": 30
        },
        "Survived": {
            "tukeyCount": 0,
            "zscoreCount": 0
        }
    },
    "categoricalEntropy": {
        "Cabin": {
            "entropy": 7.065687903192981,
            "tailShareOutsideTop10": 0.8480392156862745
        },
        "Embarked": {
            "entropy": 1.0968693499252113,
            "tailShareOutsideTop10": 0
        },
        "Name": {
            "entropy": 9.79928162152199,
            "tailShareOutsideTop10": 0.9887766554433222
        },
        "Sex": {
            "entropy": 0.9362046432498521,
            "tailShareOutsideTop10": 0
        }
    }
}
```
## Performance
- Association matrix: Capped at 50 columns
- Key detection: Capped at 20 columns
- Missing patterns: Optimized for large datasets
## License
MIT