import polars as pl
import polars.selectors as cs
import polars_ds as pds
import numpy as np
data_dict = {
'group': ['a']*4 + ['b']*4,
'x': np.arange(1,9,1),
'y': np.arange(8,0,-1),
'p': np.arange(1,9,1)/10
}
df = pl.DataFrame(data_dict)
df.glimpse()
Rows: 8
Columns: 4
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ x <i64> 1, 2, 3, 4, 5, 6, 7, 8
$ y <i64> 8, 7, 6, 5, 4, 3, 2, 1
$ p <f64> 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8
Rows: 8
Columns: 4
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ x <i64> 1, 2, 3, 4, 5, 6, 7, 8
$ y <i64> 8, 7, 6, 5, 4, 3, 2, 1
$ p <f64> 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8
Modularizing:
- map_columns()
- pipe()
Applying custom:
- map_batches()
- map_elements()
Aggregations:
- map_groups()
- partition_by()
Extension packages:
- polars_ds
Modularizing Polars Logic
def cap(c: pl.Expr, ceil: int = 5) -> pl.Expr:
    """Return an expression that limits ``c`` to at most ``ceil``.

    Values of ``c`` greater than ``ceil`` are replaced by ``ceil``; all
    other values pass through unchanged.

    The resulting expression is not named after ``c``; callers that need
    the original column name can restore it (e.g. with ``.name.keep()``).
    """
    exceeds_ceiling = c > ceil
    return pl.when(exceeds_ceiling).then(ceil).otherwise(c)
df.with_columns( cap( pl.col('x') ))
shape: (8, 5)
| group | x | y | p | literal |
|---|---|---|---|---|
| str | i64 | i64 | f64 | i64 |
| "a" | 1 | 8 | 0.1 | 1 |
| "a" | 2 | 7 | 0.2 | 2 |
| "a" | 3 | 6 | 0.3 | 3 |
| "a" | 4 | 5 | 0.4 | 4 |
| "b" | 5 | 4 | 0.5 | 5 |
| "b" | 6 | 3 | 0.6 | 5 |
| "b" | 7 | 2 | 0.7 | 5 |
| "b" | 8 | 1 | 0.8 | 5 |
df.map_columns( cs.numeric(), cap)
shape: (8, 4)
| group | x | y | p |
|---|---|---|---|
| str | i64 | i64 | f64 |
| "a" | 1 | 5 | 0.1 |
| "a" | 2 | 5 | 0.2 |
| "a" | 3 | 5 | 0.3 |
| "a" | 4 | 5 | 0.4 |
| "b" | 5 | 4 | 0.5 |
| "b" | 5 | 3 | 0.6 |
| "b" | 5 | 2 | 0.7 |
| "b" | 5 | 1 | 0.8 |
def cap(c: pl.Expr, ceil: int = 5) -> pl.Expr:
    """Return an expression that limits ``c`` to at most ``ceil``.

    Values of ``c`` greater than ``ceil`` are replaced by ``ceil``; all
    other values pass through unchanged.
    """
    exceeds_ceiling = c > ceil
    return pl.when(exceeds_ceiling).then(ceil).otherwise(c)
df.with_columns( cs.numeric().pipe(cap).name.keep() )
shape: (8, 4)
| group | x | y | p |
|---|---|---|---|
| str | i64 | i64 | f64 |
| "a" | 1 | 5 | 0.1 |
| "a" | 2 | 5 | 0.2 |
| "a" | 3 | 5 | 0.3 |
| "a" | 4 | 5 | 0.4 |
| "b" | 5 | 4 | 0.5 |
| "b" | 5 | 3 | 0.6 |
| "b" | 5 | 2 | 0.7 |
| "b" | 5 | 1 | 0.8 |
def calc_stats(df: pl.DataFrame, threshhold: int = 5) -> pl.DataFrame:
    """Append absolute-difference statistics for columns ``x`` and ``y``.

    Two columns are added to ``df``:

    * ``abs``      - the absolute value of ``x - y``
    * ``abs_gt_t`` - boolean, true where that absolute difference is
      strictly greater than ``threshhold``

    Designed to be used with ``DataFrame.pipe`` so extra arguments such as
    ``threshhold`` can be forwarded by keyword.
    """
    # Build the shared sub-expression once and reuse it for both outputs.
    abs_diff = (pl.col('x') - pl.col('y')).abs()
    return df.with_columns(
        abs=abs_diff,
        abs_gt_t=abs_diff > threshhold,
    )
df.pipe(calc_stats)
shape: (8, 6)
| group | x | y | p | abs | abs_gt_t |
|---|---|---|---|---|---|
| str | i64 | i64 | f64 | i64 | bool |
| "a" | 1 | 8 | 0.1 | 7 | true |
| "a" | 2 | 7 | 0.2 | 5 | false |
| "a" | 3 | 6 | 0.3 | 3 | false |
| "a" | 4 | 5 | 0.4 | 1 | false |
| "b" | 5 | 4 | 0.5 | 1 | false |
| "b" | 6 | 3 | 0.6 | 3 | false |
| "b" | 7 | 2 | 0.7 | 5 | false |
| "b" | 8 | 1 | 0.8 | 7 | true |
df.pipe(calc_stats, threshhold = 3)
shape: (8, 6)
| group | x | y | p | abs | abs_gt_t |
|---|---|---|---|---|---|
| str | i64 | i64 | f64 | i64 | bool |
| "a" | 1 | 8 | 0.1 | 7 | true |
| "a" | 2 | 7 | 0.2 | 5 | true |
| "a" | 3 | 6 | 0.3 | 3 | false |
| "a" | 4 | 5 | 0.4 | 1 | false |
| "b" | 5 | 4 | 0.5 | 1 | false |
| "b" | 6 | 3 | 0.6 | 3 | false |
| "b" | 7 | 2 | 0.7 | 5 | true |
| "b" | 8 | 1 | 0.8 | 7 | true |
Custom Function
from numpy.random import binomial
# one column in, one value out
df.with_columns(
coin_flip = pl.col('p').map_batches(function = lambda p: binomial(n = 1, p = p), returns_scalar = True, return_dtype = pl.UInt16)
)
# two columns in, one value out - with exprs
df.with_columns(
coin_flip = pl.map_batches(exprs = ['x', 'p'],
function = lambda z: binomial(n = z[0], p = z[1]),
returns_scalar = True, return_dtype = pl.UInt16)
)
# two columns in, one value out - with struct
df.with_columns(
coin_flip = pl.struct('x','p').map_batches(
function = lambda z: binomial(n = z.struct.field('x'), p = z.struct.field('p')),
returns_scalar = True, return_dtype = pl.UInt16)
)
# two columns in, one value out - with struct
df.with_columns(
coin_flip = pl.struct('x','p').map_batches(
function = lambda z: binomial(n = z.struct.field('x'), p = z.struct.field('p')),
returns_scalar = True, return_dtype = pl.UInt16)
)
# many values out - collected into one fixed-size Array column
df.with_columns(
coin_flip = pl.struct('x','p').map_batches(
function = lambda z: binomial(n = z.struct['x'],
p = z.struct['p'],
size = (100,z.shape[0])
).transpose(),
return_dtype = pl.Array(pl.UInt16, 100)
)
).with_columns(
avg_outcome = pl.col('coin_flip').arr.mean(),
exp_value = pl.col('x') * pl.col('p')
)
shape: (8, 7)
| group | x | y | p | coin_flip | avg_outcome | exp_value |
|---|---|---|---|---|---|---|
| str | i64 | i64 | f64 | array[u16, 100] | f64 | f64 |
| "a" | 1 | 8 | 0.1 | [0, 0, … 0] | 0.13 | 0.1 |
| "a" | 2 | 7 | 0.2 | [1, 0, … 1] | 0.5 | 0.4 |
| "a" | 3 | 6 | 0.3 | [2, 2, … 0] | 0.93 | 0.9 |
| "a" | 4 | 5 | 0.4 | [2, 2, … 1] | 1.53 | 1.6 |
| "b" | 5 | 4 | 0.5 | [2, 3, … 3] | 2.49 | 2.5 |
| "b" | 6 | 3 | 0.6 | [3, 5, … 4] | 3.6 | 3.6 |
| "b" | 7 | 2 | 0.7 | [4, 4, … 4] | 4.9 | 4.9 |
| "b" | 8 | 1 | 0.8 | [6, 6, … 6] | 6.47 | 6.4 |
Custom Aggregation
data_dict = {
'group': ['a']*4 + ['b']*4,
'truth': [1,1,0,0]*2,
'mod_bad': [0.25,0.25,0.75,0.75]*2,
'mod_bst': [0.99,0.75,0.25,0.01]*2,
'mod_rnd': [0.5]*8,
'mod_mix': [0.99,0.75,0.25,0.01]+[0.5]*3+[0.6]
}
df = pl.DataFrame(data_dict)
df.glimpse()
Rows: 8
Columns: 6
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ truth <i64> 1, 1, 0, 0, 1, 1, 0, 0
$ mod_bad <f64> 0.25, 0.25, 0.75, 0.75, 0.25, 0.25, 0.75, 0.75
$ mod_bst <f64> 0.99, 0.75, 0.25, 0.01, 0.99, 0.75, 0.25, 0.01
$ mod_rnd <f64> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5
$ mod_mix <f64> 0.99, 0.75, 0.25, 0.01, 0.5, 0.5, 0.5, 0.6
Rows: 8
Columns: 6
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ truth <i64> 1, 1, 0, 0, 1, 1, 0, 0
$ mod_bad <f64> 0.25, 0.25, 0.75, 0.75, 0.25, 0.25, 0.75, 0.75
$ mod_bst <f64> 0.99, 0.75, 0.25, 0.01, 0.99, 0.75, 0.25, 0.01
$ mod_rnd <f64> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5
$ mod_mix <f64> 0.99, 0.75, 0.25, 0.01, 0.5, 0.5, 0.5, 0.6
from sklearn.metrics import roc_auc_score
df.group_by('group').agg (
pl.map_groups(
exprs = ['truth', 'mod_mix'],
function = lambda x: roc_auc_score(x[0], x[1]),
return_dtype = pl.Float64,
returns_scalar = True
)
)
shape: (2, 2)
| group | truth |
|---|---|
| str | f64 |
| "b" | 0.25 |
| "a" | 1.0 |
df.group_by('group').agg (
auroc = pds.query_roc_auc('truth', 'mod_bst')
)
shape: (2, 2)
| group | auroc |
|---|---|
| str | f64 |
| "b" | 1.0 |
| "a" | 1.0 |
def auroc_expressions(models):
    """Yield one ``pds.query_roc_auc`` expression per entry in ``models``.

    Each expression pairs the ``'truth'`` column with the given model
    column and is aliased to that column's name, so the generator can be
    handed straight to ``group_by(...).agg(...)``.
    """
    yield from (
        pds.query_roc_auc('truth', model_col).alias(model_col)
        for model_col in models
    )
mods = cs.expand_selector(df, cs.starts_with('mod_')) # could also do: [c for c in df.columns if c[:4] == 'mod_']
df.group_by('group').agg( auroc_expressions( mods ))
shape: (2, 5)
| group | mod_bad | mod_bst | mod_rnd | mod_mix |
|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 |
| "b" | -0.0 | 1.0 | 0.5 | 0.25 |
| "a" | -0.0 | 1.0 | 0.5 | 1.0 |
Custom Partitions
import statsmodels.api as sm
(
df.group_by('group').agg (
mod = pl.map_groups(
exprs = ['mod_bst', 'mod_mix'],
function = lambda x: sm.OLS( x[0].to_numpy(), sm.add_constant( x[1] )).fit() ,
return_dtype = pl.Object,
returns_scalar = True
)
)
.with_columns(
params = pl.col('mod').map_elements(lambda x: x.params, return_dtype = pl.List(pl.Float64)),
r_sq = pl.col('mod').map_elements(lambda x: x.rsquared, return_dtype = pl.Float64)
)
)
shape: (2, 4)
| group | mod | params | r_sq |
|---|---|---|---|
| str | object | list[f64] | f64 |
| "b" | <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x0000026EA086D350> | [3.93, -6.533333] | 0.528971 |
| "a" | <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x0000026EA1591E50> | [-1.7347e-16, 1.0] | 1.0 |
# Split the frame into one sub-frame per 'group' value; with as_dict=True the
# result is a dict of sub-frames (the keys print as 1-tuples, e.g. ('a',)).
dfs = df.partition_by('group', as_dict=True, include_key=True)

# Fit one OLS model per partition: mod_bst regressed on mod_mix plus a constant.
# Only the sub-frames are needed here, so iterate the dict's values directly;
# dict ordering keeps `mods` aligned with `dfs.keys()`.
mods = [
    sm.OLS(
        part['mod_bst'].to_numpy(),
        sm.add_constant(part['mod_mix'].to_numpy()),
    ).fit()
    for part in dfs.values()
]

# params[0] is the added constant, params[1] the coefficient on mod_mix.
coef = [m.params[1] for m in mods]
dict(zip( dfs.keys(), coef))
{('a',): np.float64(1.0000000000000004),
('b',): np.float64(-6.533333333333337)}