nodejs-polars
Polars: Blazingly fast DataFrames in Rust, Python, Node.js, R and SQL
import { type DataFrame, type JoinSchemas, type Schema } from "../dataframe";
import type { Series } from "../series";
import type { Deserialize, GroupByOps, Serialize } from "../shared_traits";
import type { CsvWriterOptions, LazyCrossJoinOptions, LazyDifferentNameColumnJoinOptions, LazyOptions, LazySameNameColumnJoinOptions, SinkIpcOptions, SinkJsonOptions, SinkParquetOptions } from "../types";
import { type ColumnSelection, type ColumnsOrExpr, type ExprOrString, type Simplify, type ValueOrArray } from "../utils";
import { Expr } from "./expr";
import { type LazyGroupBy } from "./groupby";
declare const inspect: unique symbol;
/**
* Representation of a Lazy computation graph / query.
*/
export interface LazyDataFrame<S extends Schema = any> extends Serialize, GroupByOps<LazyGroupBy> {
/** @ignore */
_ldf: any;
[inspect](): string;
[Symbol.toStringTag]: string;
get columns(): string[];
/**
* Cache the result once the execution of the physical plan hits this node.
*/
cache(): LazyDataFrame<S>;
clone(): LazyDataFrame<S>;
/**
*
* Collect into a DataFrame.
* Note: use `fetch` if you want to run this query on the first `n` rows only.
* This can be a huge time saver in debugging queries.
 * @param opts.typeCoercion - Do type coercion optimization.
* @param opts.predicatePushdown - Do predicate pushdown optimization.
* @param opts.projectionPushdown - Do projection pushdown optimization.
* @param opts.simplifyExpression - Run simplify expressions optimization.
* @param opts.noOptimization - Turn off optimizations.
* @param opts.commSubplanElim - Will try to cache branching subplans that occur on self-joins or unions.
* @param opts.commSubexprElim - Common subexpressions will be cached and reused.
* @param opts.streaming - Process the query in batches to handle larger-than-memory data.
If set to `false` (default), the entire query is processed in a single
batch.
.. warning::
Streaming mode is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
* @return DataFrame
*
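 * @example
 * A minimal usage sketch (the column name `foo` is illustrative):
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, 2, 3] }).lazy();
 * >>> const df = await lf.select("foo").collect();
 * >>> // or synchronously:
 * >>> const dfSync = lf.select("foo").collectSync();
 * ```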
*/
collect(opts?: LazyOptions): Promise<DataFrame<S>>;
collectSync(opts?: LazyOptions): DataFrame<S>;
/**
* A string representation of the optimized query plan.
*/
describeOptimizedPlan(opts?: LazyOptions): string;
/**
* A string representation of the unoptimized query plan.
*/
describePlan(): string;
/**
* Remove one or multiple columns from a DataFrame.
* @param name - column or list of columns to be removed
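 * @example
 * A minimal sketch dropping one column from a hypothetical frame:
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, 2], bar: ["a", "b"] }).lazy();
 * >>> lf.drop("bar").collectSync(); // keeps only `foo`
 * ```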
*/
drop<U extends string>(name: U): LazyDataFrame<Simplify<Omit<S, U>>>;
drop<const U extends string[]>(names: U): LazyDataFrame<Simplify<Omit<S, U[number]>>>;
drop<U extends string, const V extends string[]>(name: U, ...names: V): LazyDataFrame<Simplify<Omit<S, U | V[number]>>>;
/**
* Drop rows with null values from this DataFrame.
* This method only drops nulls row-wise if any single value of the row is null.
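 * @example
 * A minimal sketch (hypothetical columns); only nulls in the listed column are considered:
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, null, 3], bar: ["a", "b", null] }).lazy();
 * >>> lf.dropNulls("foo").collectSync(); // drops only the row where `foo` is null
 * ```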
*/
dropNulls(column: string): LazyDataFrame<S>;
dropNulls(columns: string[]): LazyDataFrame<S>;
dropNulls(...columns: string[]): LazyDataFrame<S>;
/**
* Explode lists to long format.
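 * @example
 * A minimal sketch, assuming a frame with a hypothetical list column `nums`:
 * ```
 * >>> const lf = pl.DataFrame({ letters: ["a", "b"], nums: [[1, 2], [3, 4]] }).lazy();
 * >>> lf.explode("nums").collectSync(); // one row per list element
 * ```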
*/
explode(column: ExprOrString): LazyDataFrame;
explode(columns: ExprOrString[]): LazyDataFrame;
explode(column: ExprOrString, ...columns: ExprOrString[]): LazyDataFrame;
/**
* Fetch is like a collect operation, but it overwrites the number of rows read by every scan
* Note that the fetch does not guarantee the final number of rows in the DataFrame.
* Filter, join operations and a lower number of rows available in the scanned file influence the final number of rows.
* @deprecated *since 0.23.0* use `LazyFrame.collect` instead, in conjunction with a call to `head`
* @param numRows - collect 'n' number of rows from data source
 * @param opts.typeCoercion - Do type coercion optimization.
* @param opts.predicatePushdown - Do predicate pushdown optimization.
* @param opts.projectionPushdown - Do projection pushdown optimization.
* @param opts.simplifyExpression - Run simplify expressions optimization.
* @param opts.commSubplanElim - Will try to cache branching subplans that occur on self-joins or unions.
* @param opts.commSubexprElim - Common subexpressions will be cached and reused.
* @param opts.streaming - Process the query in batches to handle larger-than-memory data.
If set to `false` (default), the entire query is processed in a single
batch.
.. warning::
Streaming mode is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
*
*/
fetch(numRows: number, opts: LazyOptions): Promise<DataFrame<S>>;
fetch(numRows?: number): Promise<DataFrame<S>>;
/** Behaves the same as fetch, but will perform the actions synchronously */
fetchSync(numRows?: number): DataFrame<S>;
fetchSync(numRows: number, opts: LazyOptions): DataFrame<S>;
/**
* Fill missing values
* @param fillValue value to fill the missing values with
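 * @example
 * A minimal sketch filling nulls with a literal; an expression such as `pl.col("foo").mean()` also works:
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, null, 3] }).lazy();
 * >>> lf.fillNull(0).collectSync();
 * ```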
*/
fillNull(fillValue: string | number | Expr): LazyDataFrame<S>;
/**
* Filter the rows in the DataFrame based on a predicate expression.
* @param predicate - Expression that evaluates to a boolean Series.
* @example
* ```
* > lf = pl.DataFrame({
* > "foo": [1, 2, 3],
* > "bar": [6, 7, 8],
* > "ham": ['a', 'b', 'c']
* > }).lazy()
* > // Filter on one condition
* > lf.filter(pl.col("foo").lt(3)).collect()
* shape: (2, 3)
* ┌─────┬─────┬─────┐
* │ foo ┆ bar ┆ ham │
* │ --- ┆ --- ┆ --- │
* │ i64 ┆ i64 ┆ str │
* ╞═════╪═════╪═════╡
* │ 1 ┆ 6 ┆ a │
* ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ 7 ┆ b │
* └─────┴─────┴─────┘
* ```
*/
filter(predicate: Expr | string): LazyDataFrame<S>;
/**
* Get the first row of the DataFrame.
*/
first(): DataFrame<S>;
/**
* Start a groupby operation.
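 * @example
 * A minimal sketch grouping on a hypothetical key column and aggregating with an expression:
 * ```
 * >>> const lf = pl.DataFrame({ key: ["a", "a", "b"], value: [1, 2, 3] }).lazy();
 * >>> lf.groupBy("key").agg(pl.col("value").sum()).collectSync();
 * ```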
*/
groupBy(by: ColumnsOrExpr, maintainOrder?: boolean): LazyGroupBy;
groupBy(by: ColumnsOrExpr, opts: {
maintainOrder: boolean;
}): LazyGroupBy;
/**
* Gets the first `n` rows of the DataFrame. You probably don't want to use this!
*
* Consider using the `fetch` operation.
 * The `fetch` operation will truly load the first `n` rows lazily.
*/
head(length?: number): LazyDataFrame<S>;
inner(): any;
/**
* __SQL like joins.__
* @param other - DataFrame to join with.
* @param joinOptions.on - Name(s) of the join columns in both DataFrames.
* @param joinOptions.how - Join strategy
* @param joinOptions.suffix - Suffix to append to columns with a duplicate name.
* @param joinOptions.coalesce - Coalescing behavior (merging of join columns).
* @param joinOptions.allowParallel - Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel.
* @param joinOptions.forceParallel - Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel.
* @see {@link LazyJoinOptions}
* @example
* ```
* >>> const df = pl.DataFrame({
* >>> foo: [1, 2, 3],
* >>> bar: [6.0, 7.0, 8.0],
* >>> ham: ['a', 'b', 'c'],
* >>> }).lazy()
* >>>
* >>> const otherDF = pl.DataFrame({
* >>> apple: ['x', 'y', 'z'],
* >>> ham: ['a', 'b', 'd'],
* >>> }).lazy();
* >>> const result = await df.join(otherDF, { on: 'ham', how: 'inner' }).collect();
* shape: (2, 4)
* ╭─────┬─────┬─────┬───────╮
* │ foo ┆ bar ┆ ham ┆ apple │
* │ --- ┆ --- ┆ --- ┆ --- │
* │ i64 ┆ f64 ┆ str ┆ str │
* ╞═════╪═════╪═════╪═══════╡
* │ 1 ┆ 6 ┆ "a" ┆ "x" │
* ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
* │ 2 ┆ 7 ┆ "b" ┆ "y" │
* ╰─────┴─────┴─────┴───────╯
* ```
*/
join<S2 extends Schema, const Opts extends LazySameNameColumnJoinOptions<Extract<keyof S, string>, Extract<keyof S2, string>>>(other: LazyDataFrame<S2>, joinOptions: Opts & LazySameNameColumnJoinOptions): LazyDataFrame<JoinSchemas<S, S2, Opts>>;
/**
* __SQL like joins with different names for left and right dataframes.__
* @param other - DataFrame to join with.
* @param joinOptions.leftOn - Name(s) of the left join column(s).
* @param joinOptions.rightOn - Name(s) of the right join column(s).
* @param joinOptions.how - Join strategy
* @param joinOptions.suffix - Suffix to append to columns with a duplicate name.
* @param joinOptions.coalesce - Coalescing behavior (merging of join columns).
* @param joinOptions.allowParallel - Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel.
* @param joinOptions.forceParallel - Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel.
* @see {@link LazyJoinOptions}
* @example
* ```
* >>> const df = pl.DataFrame({
* >>> foo: [1, 2, 3],
* >>> bar: [6.0, 7.0, 8.0],
* >>> ham: ['a', 'b', 'c'],
* >>> }).lazy()
* >>>
* >>> const otherDF = pl.DataFrame({
* >>> apple: ['x', 'y', 'z'],
* >>> ham: ['a', 'b', 'd'],
* >>> }).lazy();
* >>> const result = await df.join(otherDF, { leftOn: 'ham', rightOn: 'ham', how: 'inner' }).collect();
* shape: (2, 4)
* ╭─────┬─────┬─────┬───────╮
* │ foo ┆ bar ┆ ham ┆ apple │
* │ --- ┆ --- ┆ --- ┆ --- │
* │ i64 ┆ f64 ┆ str ┆ str │
* ╞═════╪═════╪═════╪═══════╡
* │ 1 ┆ 6 ┆ "a" ┆ "x" │
* ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
* │ 2 ┆ 7 ┆ "b" ┆ "y" │
* ╰─────┴─────┴─────┴───────╯
* ```
*/
join<S2 extends Schema, const Opts extends LazyDifferentNameColumnJoinOptions<Extract<keyof S, string>, Extract<keyof S2, string>>>(other: LazyDataFrame<S2>, joinOptions: Opts & LazyDifferentNameColumnJoinOptions): LazyDataFrame<JoinSchemas<S, S2, Opts>>;
/**
* __SQL like cross joins.__
* @param other - DataFrame to join with.
* @param joinOptions.how - Join strategy {'inner', 'left', 'right', 'full', 'semi', 'anti', 'cross'}
* @param joinOptions.suffix - Suffix to append to columns with a duplicate name.
* @param joinOptions.coalesce - Coalescing behavior (merging of join columns). default: undefined
* - **undefined** - *(Default)* Coalesce unless `how='full'` is specified.
* - **true** - Always coalesce join columns.
* - **false** - Never coalesce join columns.
* @param joinOptions.validate - Checks if join is of specified type. default: m:m
* valid options: {'m:m', 'm:1', '1:m', '1:1'}
 * - **m:m** - *(Default)* Many-to-many. Does not result in checks.
* - **1:1** - One-to-one. Checks if join keys are unique in both left and right datasets.
* - **1:m** - One-to-many. Checks if join keys are unique in left dataset.
 * - **m:1** - Many-to-one. Checks if join keys are unique in right dataset.
* @param joinOptions.allowParallel - Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel.
* @param joinOptions.forceParallel - Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel.
* @see {@link LazyJoinOptions}
* @example
* ```
* >>> const df = pl.DataFrame({
* >>> foo: [1, 2],
* >>> bar: [6.0, 7.0],
* >>> ham: ['a', 'b'],
* >>> }).lazy()
* >>>
* >>> const otherDF = pl.DataFrame({
* >>> apple: ['x', 'y'],
* >>> ham: ['a', 'b'],
* >>> }).lazy();
* >>> const result = await df.join(otherDF, { how: 'cross' }).collect();
* shape: (4, 5)
* ╭─────┬─────┬─────┬───────┬───────────╮
* │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
* │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
* │ f64 ┆ f64 ┆ str ┆ str ┆ str │
* ╞═════╪═════╪═════╪═══════╪═══════════╡
* │ 1.0 ┆ 6.0 ┆ a ┆ x ┆ a │
* │ 1.0 ┆ 6.0 ┆ a ┆ y ┆ b │
* │ 2.0 ┆ 7.0 ┆ b ┆ x ┆ a │
* │ 2.0 ┆ 7.0 ┆ b ┆ y ┆ b │
* ╰─────┴─────┴─────┴───────┴───────────╯
* ```
*/
join<S2 extends Schema, const Opts extends LazyCrossJoinOptions>(other: LazyDataFrame<S2>, joinOptions: Opts & LazyCrossJoinOptions): LazyDataFrame<JoinSchemas<S, S2, Opts>>;
/**
* Perform an asof join. This is similar to a left-join except that we
* match on nearest key rather than equal keys.
*
* Both DataFrames must be sorted by the asof_join key.
*
For each row in the left DataFrame:
- A "backward" search selects the last row in the right DataFrame whose
'on' key is less than or equal to the left's key.
- A "forward" search selects the first row in the right DataFrame whose
'on' key is greater than or equal to the left's key.
- A "nearest" search selects the last row in the right DataFrame whose value
is nearest to the left's key. String keys are not currently supported for a
nearest search.
The default is "backward".
Parameters
----------
@param other DataFrame to join with.
@param options.leftOn Join column of the left DataFrame.
@param options.rightOn Join column of the right DataFrame.
@param options.on Join column of both DataFrames. If set, `leftOn` and `rightOn` should be undefined.
@param options.byLeft Join on these columns before doing the asof join.
@param options.byRight Join on these columns before doing the asof join.
@param options.strategy One of {'forward', 'backward', 'nearest'}
@param options.suffix Suffix to append to columns with a duplicate name.
@param options.tolerance
Numeric tolerance. By setting this the join will only be done if the near keys are within this distance.
If an asof join is done on columns of dtype "Date" or "Datetime", you
can use the following string language:
- 1ns *(1 nanosecond)*
- 1us *(1 microsecond)*
- 1ms *(1 millisecond)*
- 1s *(1 second)*
- 1m *(1 minute)*
- 1h *(1 hour)*
- 1d *(1 day)*
- 1w *(1 week)*
- 1mo *(1 calendar month)*
- 1y *(1 calendar year)*
- 1i *(1 index count)*
Or combine them:
- "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
@param options.allowParallel Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel.
@param options.forceParallel Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel.
@param options.checkSortedness
Check the sortedness of the asof keys. If the keys are not sorted, Polars
raises an error, or, when the 'by' argument is used, a warning. This might become
a hard error in the future.
@example
```
>const gdp = pl.DataFrame({
... date: [
... new Date('2016-01-01'),
... new Date('2017-01-01'),
... new Date('2018-01-01'),
... new Date('2019-01-01'),
... ], // note record date: Jan 1st (sorted!)
... gdp: [4164, 4411, 4566, 4696],
... })
>const population = pl.DataFrame({
... date: [
... new Date('2016-05-12'),
... new Date('2017-05-12'),
... new Date('2018-05-12'),
... new Date('2019-05-12'),
... ], // note record date: May 12th (sorted!)
... "population": [82.19, 82.66, 83.12, 83.52],
... })
>population.joinAsof(
... gdp,
... {leftOn:"date", rightOn:"date", strategy:"backward"}
... )
shape: (4, 3)
┌─────────────────────┬────────────┬──────┐
│ date ┆ population ┆ gdp │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ f64 ┆ i64 │
╞═════════════════════╪════════════╪══════╡
│ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
└─────────────────────┴────────────┴──────┘
```
*/
joinAsof(other: LazyDataFrame, options: {
leftOn?: string;
rightOn?: string;
on?: string;
byLeft?: string | string[];
byRight?: string | string[];
by?: string | string[];
strategy?: "backward" | "forward" | "nearest";
suffix?: string;
tolerance?: number | string;
allowParallel?: boolean;
forceParallel?: boolean;
checkSortedness?: boolean;
}): LazyDataFrame;
/**
* Get the last row of the DataFrame.
*/
last(): LazyDataFrame<S>;
/**
* @see {@link head}
*/
limit(n?: number): LazyDataFrame<S>;
/**
* @see {@link DataFrame.max}
*/
max(): LazyDataFrame<S>;
/**
* @see {@link DataFrame.mean}
*/
mean(): LazyDataFrame<S>;
/**
* @see {@link DataFrame.median}
*/
median(): LazyDataFrame<S>;
/**
* @see {@link DataFrame.unpivot}
* @deprecated use `LazyFrame.unpivot` instead
*/
melt(idVars: ColumnSelection, valueVars: ColumnSelection): LazyDataFrame;
/**
* @see {@link DataFrame.unpivot}
*/
unpivot(idVars: ColumnSelection, valueVars: ColumnSelection, options?: {
variableName?: string | null;
valueName?: string | null;
}): LazyDataFrame;
/**
* @see {@link DataFrame.min}
*/
min(): LazyDataFrame<S>;
/**
* @see {@link DataFrame.quantile}
*/
quantile(quantile: number): LazyDataFrame<S>;
/**
* @see {@link DataFrame.rename}
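 * @example
 * A minimal sketch, renaming a hypothetical column via an old-name to new-name mapping:
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, 2] }).lazy();
 * >>> lf.rename({ foo: "apple" }).collectSync();
 * ```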
*/
rename<const U extends Partial<Record<keyof S, string>>>(mapping: U): LazyDataFrame<{
[K in keyof S as U[K] extends string ? U[K] : K]: S[K];
}>;
rename(mapping: Record<string, string>): LazyDataFrame;
/**
* Reverse the DataFrame.
*/
reverse(): LazyDataFrame<S>;
/**
* @see {@link DataFrame.select}
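 * @example
 * A minimal sketch selecting by name and by expression (hypothetical columns):
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, 2], bar: [3, 4] }).lazy();
 * >>> lf.select("foo", pl.col("bar").mul(2).alias("bar2")).collectSync();
 * ```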
*/
select<U extends keyof S>(...columns: U[]): LazyDataFrame<{
[P in U]: S[P];
}>;
select(column: ExprOrString | Series): LazyDataFrame;
select(columns: (ExprOrString | Series)[]): LazyDataFrame;
select(...columns: (ExprOrString | Series)[]): LazyDataFrame;
/**
* @see {@link DataFrame.shift}
*/
shift(periods: number): LazyDataFrame<S>;
shift(opts: {
periods: number;
}): LazyDataFrame<S>;
/**
* @see {@link DataFrame.shiftAndFill}
*/
shiftAndFill(n: number, fillValue: number): LazyDataFrame<S>;
shiftAndFill(opts: {
n: number;
fillValue: number;
}): LazyDataFrame<S>;
/**
* @see {@link DataFrame.slice}
*/
slice(offset: number, length: number): LazyDataFrame<S>;
slice(opts: {
offset: number;
length: number;
}): LazyDataFrame<S>;
/**
* @see {@link DataFrame.sort}
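 * @example
 * A minimal sketch sorting by a hypothetical column in descending order:
 * ```
 * >>> const lf = pl.DataFrame({ foo: [2, 1, 3] }).lazy();
 * >>> lf.sort("foo", true).collectSync();
 * >>> // equivalent: lf.sort({ by: "foo", descending: true }).collectSync();
 * ```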
*/
sort(by: ColumnsOrExpr, descending?: ValueOrArray<boolean>, nullsLast?: boolean, maintainOrder?: boolean): LazyDataFrame<S>;
sort(opts: {
by: ColumnsOrExpr;
descending?: ValueOrArray<boolean>;
nullsLast?: boolean;
maintainOrder?: boolean;
}): LazyDataFrame<S>;
/**
* @see {@link DataFrame.std}
*/
std(): LazyDataFrame<S>;
/**
* Aggregate the columns in the DataFrame to their sum value.
*/
sum(): LazyDataFrame<S>;
/**
* Get the last `n` rows of the DataFrame.
* @see {@link DataFrame.tail}
*/
tail(length?: number): LazyDataFrame<S>;
/**
* compatibility with `JSON.stringify`
*/
toJSON(): string;
/**
* Drop duplicate rows from this DataFrame.
* Note that this fails if there is a column of type `List` in the DataFrame.
 * @param subset Column name(s) or selector(s) to consider when identifying duplicate rows. If not provided (default), all columns are considered.
* @param keep : 'first', 'last', 'any', 'none'
* Which of the duplicate rows to keep.
 * 'any': Default; does not give any guarantee of which row is kept. This allows more optimizations.
* 'none': Don't keep duplicate rows.
* 'first': Keep the first unique row.
* 'last': Keep the last unique row.
* @param maintainOrder Keep the same order as the original DataFrame. This is more expensive to compute. Default: false
* @returns LazyDataFrame with unique rows.
* @example
* const ldf = pl.DataFrame({
foo: [1, 2, 2, 3],
bar: [1, 2, 2, 4],
ham: ["a", "d", "d", "c"],
}).lazy();
By default, all columns are considered when determining which rows are unique:
> ldf.unique().collectSync();
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str │
╞═════╪═════╪═════╡
│ 3.0 ┆ 4.0 ┆ c │
│ 1.0 ┆ 1.0 ┆ a │
│ 2.0 ┆ 2.0 ┆ d │
└─────┴─────┴─────┘
> ldf.unique("foo").collectSync();
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str │
╞═════╪═════╪═════╡
│ 3.0 ┆ 4.0 ┆ c │
│ 1.0 ┆ 1.0 ┆ a │
│ 2.0 ┆ 2.0 ┆ d │
└─────┴─────┴─────┘
> ldf.unique(["foo", "ham"], "first", true).collectSync();
// or equivalently:
> ldf.unique({ subset: ["foo", "ham"], keep: "first", maintainOrder: true }).collectSync();
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str │
╞═════╪═════╪═════╡
│ 1.0 ┆ 1.0 ┆ a │
│ 2.0 ┆ 2.0 ┆ d │
│ 3.0 ┆ 4.0 ┆ c │
└─────┴─────┴─────┘
*/
unique(subset?: ColumnSelection, keep?: "first" | "last" | "any" | "none", maintainOrder?: boolean): LazyDataFrame<S>;
unique(opts: {
subset?: ColumnSelection;
keep?: "first" | "last" | "any" | "none";
maintainOrder?: boolean;
}): LazyDataFrame<S>;
/**
* Aggregate the columns in the DataFrame to their variance value.
*/
var(): LazyDataFrame<S>;
/**
* Add or overwrite column in a DataFrame.
* @param expr - Expression that evaluates to column.
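 * @example
 * A minimal sketch adding a derived column to a hypothetical frame:
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, 2, 3] }).lazy();
 * >>> lf.withColumn(pl.col("foo").mul(10).alias("foo10")).collectSync();
 * ```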
*/
withColumn(expr: Expr | Series): LazyDataFrame;
/**
* Add or overwrite multiple columns in a DataFrame.
* @param exprs - List of Expressions that evaluate to columns.
*
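 * @example
 * A minimal sketch adding several columns at once (hypothetical names):
 * ```
 * >>> const lf = pl.DataFrame({ foo: [1, 2] }).lazy();
 * >>> lf.withColumns(
 * >>>   pl.col("foo").mul(2).alias("fooTimes2"),
 * >>>   pl.lit("x").alias("tag"),
 * >>> ).collectSync();
 * ```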
*/
withColumns(...exprs: (Expr | Series)[]): LazyDataFrame;
withColumnRenamed<Existing extends keyof S, New extends string>(existing: Existing, replacement: New): LazyDataFrame<{
[K in keyof S as K extends Existing ? New : K]: S[K];
}>;
withColumnRenamed(existing: string, replacement: string): LazyDataFrame;
/**
* Add a column at index 0 that counts the rows.
* @see {@link DataFrame.withRowCount}
 * @deprecated *since 0.23.0* use `withRowIndex` instead
*/
withRowCount(): LazyDataFrame;
/**
* Add a row index as the first column in the DataFrame.
* @param name Name of the index column.
* @param offset Start the index at this offset. Cannot be negative.
* @example
*
* >>> ldf = pl.DataFrame(
... {
... "a": [1, 3, 5],
... "b": [2, 4, 6],
... }
... ).lazy();
>>> ldf.withRowIndex().collectSync();
shape: (3, 3)
┌───────┬─────┬─────┐
│ index ┆ a ┆ b │
│ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 │
╞═══════╪═════╪═════╡
│ 0 ┆ 1 ┆ 2 │
│ 1 ┆ 3 ┆ 4 │
│ 2 ┆ 5 ┆ 6 │
└───────┴─────┴─────┘
>>> ldf.withRowIndex("id", 1000).collectSync();
shape: (3, 3)
┌──────┬─────┬─────┐
│ id ┆ a ┆ b │
│ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 │
╞══════╪═════╪═════╡
│ 1000 ┆ 1 ┆ 2 │
│ 1001 ┆ 3 ┆ 4 │
│ 1002 ┆ 5 ┆ 6 │
└──────┴─────┴─────┘
*/
withRowIndex(name?: string, offset?: number): LazyDataFrame;
/***
*
* Evaluate the query in streaming mode and write to a CSV file.
.. warning::
Streaming mode is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
This allows streaming results that are larger than RAM to be written to disk.
Parameters
----------
@param path - File path to which the file should be written.
@param options.includeBom - Whether to include UTF-8 BOM in the CSV output.
@param options.includeHeader - Whether to include header in the CSV output.
@param options.separator - Separate CSV fields with this symbol.
@param options.lineTerminator - String used to end each row.
@param options.quoteChar - Byte to use as quoting character. Default: '"'
@param options.batchSize - Number of rows that will be processed per thread. Default - 1024
@param options.datetimeFormat - A format string, with the specifiers defined by the
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
Rust crate. If no format is specified, the default fractional-second
precision is inferred from the maximum timeunit found in the frame's
Datetime cols (if any).
@param options.dateFormat - A format string, with the specifiers defined by the
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
Rust crate.
@param options.timeFormat A format string, with the specifiers defined by the
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
Rust crate.
@param options.floatPrecision - Number of decimal places to write, applied to both `Float32` and `Float64` datatypes.
@param options.nullValue - A string representing null values (defaulting to the empty string).
@param options.maintainOrder - Maintain the order in which data is processed.
Setting this to `false` will be slightly faster.
@return DataFrame
Examples
--------
>>> const lf = pl.scanCSV("/path/to/my_larger_than_ram_file.csv")
>>> lf.sinkCSV("out.csv").collect()
*/
sinkCSV(path: string, options?: CsvWriterOptions): LazyDataFrame;
/***
*
* Evaluate the query in streaming mode and write to a Parquet file.
This allows streaming results that are larger than RAM to be written to disk.
Parameters
----------
@param path - File path to which the file should be written.
@param options.compression : {'lz4', 'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'zstd'}
Choose "zstd" (default) for good compression performance.
Choose "lz4" for fast compression/decompression.
Choose "snappy" for more backwards compatibility guarantees
when you deal with older parquet readers.
@param options.compressionLevel - The level of compression to use. Higher compression means smaller files on disk.
- "gzip" : min-level: 0, max-level: 10.
- "brotli" : min-level: 0, max-level: 11.
- "zstd" : min-level: 1, max-level: 22.
@param options.statistics - Write statistics to the parquet headers. This requires extra compute. Default - false
@param options.rowGroupSize - Size of the row groups in number of rows.
If not set (default), the chunks of the `DataFrame` are
used. Writing in smaller chunks may reduce memory pressure and improve
writing speeds.
@param options.dataPagesizeLimit - Size limit of individual data pages.
If not set, defaults to 1024 * 1024 bytes.
@param options.maintainOrder - Maintain the order in which data is processed. Default -> true
Setting this to `false` will be slightly faster.
@param options.typeCoercion - Do type coercion optimization. Default -> true
@param options.predicatePushdown - Do predicate pushdown optimization. Default -> true
@param options.projectionPushdown - Do projection pushdown optimization. Default -> true
@param options.simplifyExpression - Run simplify expressions optimization. Default -> true
@param options.slicePushdown - Slice pushdown optimization. Default -> true
@param options.noOptimization - Turn off (certain) optimizations. Default -> false
@param options.cloudOptions - Options that indicate how to connect to a cloud provider.
If the cloud provider is not supported by Polars, the storage options are passed to `fsspec.open()`.
The cloud providers currently supported are AWS, GCP, and Azure.
See supported keys here:
* `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
* `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
* `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
If `cloudOptions` is not provided, Polars will try to infer the information from environment variables.
@return DataFrame
Examples
--------
>>> const lf = pl.scanCSV("/path/to/my_larger_than_ram_file.csv")
>>> lf.sinkParquet("out.parquet").collect()
*/
sinkParquet(path: string, options?: SinkParquetOptions): LazyDataFrame;
/**
*
* Evaluate the query in streaming mode and write to an NDJSON file.
* This allows streaming results that are larger than RAM to be written to disk.
*
* Parameters
@param path - File path to which the file should be written.
@param options.maintainOrder - Maintain the order in which data is processed. Default -> true
Setting this to `false` will be slightly faster.
@param options.mkdir - Recursively create all the directories in the path. Default -> false
@param options.retries - Number of retries if accessing a cloud instance fails. Default = 2
@param options.syncOnClose - { None, 'data', 'all' } Default -> 'all'
Sync to disk before closing the file.
* `None` does not sync.
* `data` syncs the file contents.
* `all` syncs the file contents and metadata.
@param options.cloudOptions - Options that indicate how to connect to a cloud provider.
If the cloud provider is not supported by Polars, the storage options are passed to `fsspec.open()`.
The cloud providers currently supported are AWS, GCP, and Azure.
See supported keys here:
* `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
* `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
* `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
If `cloudOptions` is not provided, Polars will try to infer the information from environment variables.
@return DataFrame
Examples
--------
>>> const lf = pl.scanCSV("/path/to/my_larger_than_ram_file.csv")
>>> lf.sinkNdJson("out.ndjson").collect()
*/
sinkNdJson(path: string, options?: SinkJsonOptions): LazyDataFrame;
/**
*
* Evaluate the query in streaming mode and write to an IPC file.
* This allows streaming results that are larger than RAM to be written to disk.
*
* Parameters
@param path - File path to which the file should be written.
@param options.compression : {'uncompressed', 'lz4', 'zstd'}
Choose "zstd" for good compression performance.
Choose "lz4" for fast compression/decompression.
@param options.compatLevel : { 'newest', 'oldest' } Default -> newest
Use a specific compatibility level when exporting Polars' internal data structures.
@param options.maintainOrder - Maintain the order in which data is processed. Default -> true
Setting this to `false` will be slightly faster.
@param options.mkdir - Recursively create all the directories in the path. Default -> false
@param options.retries - Number of retries if accessing a cloud instance fails. Default = 2
@param options.syncOnClose - { None, 'data', 'all' } Default -> 'all'
Sync to disk before closing the file.
* `None` does not sync.
* `data` syncs the file contents.
* `all` syncs the file contents and metadata.
@param options.cloudOptions - Options that indicate how to connect to a cloud provider.
If the cloud provider is not supported by Polars, the storage options are passed to `fsspec.open()`.
The cloud providers currently supported are AWS, GCP, and Azure.
See supported keys here:
* `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
* `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
* `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
If `cloudOptions` is not provided, Polars will try to infer the information from environment variables.
@return DataFrame
Examples
--------
>>> const lf = pl.scanCSV("/path/to/my_larger_than_ram_file.csv")
>>> lf.sinkIpc("out.arrow").collect()
*/
sinkIpc(path: string, options?: SinkIpcOptions): LazyDataFrame;
}
/** @ignore */
export declare const _LazyDataFrame: (_ldf: any) => LazyDataFrame;
export interface LazyDataFrameConstructor extends Deserialize<LazyDataFrame> {
fromExternal(external: any): LazyDataFrame;
isLazyDataFrame(arg: any): arg is LazyDataFrame;
}
/** @ignore */
export declare const LazyDataFrame: LazyDataFrameConstructor;
export {};