?meta:title | Emily Riederer

import polars as pl
import polars.selectors as cs
import polars_ds as pds
import numpy as np

# Toy data: 8 rows split into two groups of four.
data_dict = {
  'group': ['a']*4 + ['b']*4,
  # x counts up 1..8 while y counts down 8..1
  'x': np.arange(1,9,1),
  'y': np.arange(8,0,-1), 
  # p = x / 10, i.e. 0.1 .. 0.8
  'p': np.arange(1,9,1)/10
}
df = pl.DataFrame(data_dict)
# Print a compact one-line-per-column summary of the frame.
df.glimpse()

Rows: 8
Columns: 4
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ x     <i64> 1, 2, 3, 4, 5, 6, 7, 8
$ y     <i64> 8, 7, 6, 5, 4, 3, 2, 1
$ p     <f64> 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8

Rows: 8
Columns: 4
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ x     <i64> 1, 2, 3, 4, 5, 6, 7, 8
$ y     <i64> 8, 7, 6, 5, 4, 3, 2, 1
$ p     <f64> 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8

Modularizing:

map_columns()
pipe()

Applying custom:

map_batches()
map_elements()

Aggregations:

map_groups()
partition_by()

Extension packages:

polars_ds

Modularizing Polars Logic

def cap(c: pl.Expr, ceil: int = 5) -> pl.Expr:
    """Clamp an expression from above.

    Values of ``c`` greater than ``ceil`` are replaced by ``ceil``; every
    other value is passed through unchanged.
    """
    exceeds_ceiling = c > ceil
    return pl.when(exceeds_ceiling).then(ceil).otherwise(c)

# Apply the cap to x. NOTE: per the output below, the capped values are added as
# a new column named `literal` rather than overwriting `x`.
df.with_columns( cap( pl.col('x') ))

shape: (8, 5)

group	x	y	p	literal
str	i64	i64	f64	i64
"a"	1	8	0.1	1
"a"	2	7	0.2	2
"a"	3	6	0.3	3
"a"	4	5	0.4	4
"b"	5	4	0.5	5
"b"	6	3	0.6	5
"b"	7	2	0.7	5
"b"	8	1	0.8	5

df.map_columns( cs.numeric(), cap)

shape: (8, 4)

group	x	y	p
str	i64	i64	f64
"a"	1	5	0.1
"a"	2	5	0.2
"a"	3	5	0.3
"a"	4	5	0.4
"b"	5	4	0.5
"b"	5	3	0.6
"b"	5	2	0.7
"b"	5	1	0.8

def cap(c: pl.Expr, ceil: int = 5) -> pl.Expr:
    """Clamp an expression from above.

    Values of ``c`` greater than ``ceil`` are replaced by ``ceil``; every
    other value is passed through unchanged.
    """
    exceeds_ceiling = c > ceil
    return pl.when(exceeds_ceiling).then(ceil).otherwise(c)

df.with_columns( cs.numeric().pipe(cap).name.keep() )

shape: (8, 4)

group	x	y	p
str	i64	i64	f64
"a"	1	5	0.1
"a"	2	5	0.2
"a"	3	5	0.3
"a"	4	5	0.4
"b"	5	4	0.5
"b"	5	3	0.6
"b"	5	2	0.7
"b"	5	1	0.8

def calc_stats(df: pl.DataFrame, threshhold: int = 5) -> pl.DataFrame:
    """Return ``df`` with absolute-difference statistics for ``x`` and ``y``.

    Two columns are added:

    * ``abs`` -- the absolute value of ``x - y``
    * ``abs_gt_t`` -- whether that absolute difference exceeds ``threshhold``
    """
    # Build the expression once and reuse it for both derived columns.
    abs_diff = (pl.col('x') - pl.col('y')).abs()
    return df.with_columns(
        abs=abs_diff,
        abs_gt_t=abs_diff > threshhold,
    )

df.pipe(calc_stats)

shape: (8, 6)

group	x	y	p	abs	abs_gt_t
str	i64	i64	f64	i64	bool
"a"	1	8	0.1	7	true
"a"	2	7	0.2	5	false
"a"	3	6	0.3	3	false
"a"	4	5	0.4	1	false
"b"	5	4	0.5	1	false
"b"	6	3	0.6	3	false
"b"	7	2	0.7	5	false
"b"	8	1	0.8	7	true

df.pipe(calc_stats, threshhold = 3)

shape: (8, 6)

group	x	y	p	abs	abs_gt_t
str	i64	i64	f64	i64	bool
"a"	1	8	0.1	7	true
"a"	2	7	0.2	5	true
"a"	3	6	0.3	3	false
"a"	4	5	0.4	1	false
"b"	5	4	0.5	1	false
"b"	6	3	0.6	3	false
"b"	7	2	0.7	5	true
"b"	8	1	0.8	7	true

Custom Functions

from numpy.random import binomial

# one column in, one value out
df.with_columns(
    coin_flip = pl.col('p').map_batches(function = lambda p: binomial(n = 1, p = p), returns_scalar = True, return_dtype = pl.UInt16)
)

# two columns in, one value out - with exprs
df.with_columns(
    coin_flip = pl.map_batches(exprs = ['x', 'p'],
                               function = lambda z: binomial(n = z[0], p = z[1]), 
                               returns_scalar = True, return_dtype = pl.UInt16)
)

# two columns in, one value out - with struct
df.with_columns(
    coin_flip = pl.struct('x','p').map_batches(
                               function = lambda z: binomial(n = z.struct.field('x'), p = z.struct.field('p')), 
                               returns_scalar = True, return_dtype = pl.UInt16)
)

# two columns in, one value out - with struct
df.with_columns(
    coin_flip = pl.struct('x','p').map_batches(
                               function = lambda z: binomial(n = z.struct.field('x'), p = z.struct.field('p')), 
                               returns_scalar = True, return_dtype = pl.UInt16)
)

# many values out: 100 binomial draws per row, stored in a single fixed-width array column.
# binomial is called with size = (100, n_rows); the transpose turns that into one
# row of 100 draws per input row, matching return_dtype = Array(UInt16, 100).
df.with_columns(
    coin_flip = pl.struct('x','p').map_batches(
                               function = lambda z: binomial(n = z.struct['x'], 
                                                             p = z.struct['p'],
                                                             size = (100,z.shape[0])
                                                             ).transpose(), 
                                return_dtype = pl.Array(pl.UInt16, 100) 
                                )
).with_columns( 
    # empirical mean of the 100 draws, shown next to the binomial mean n * p (here x * p)
    avg_outcome = pl.col('coin_flip').arr.mean(),
    exp_value = pl.col('x') * pl.col('p')
)

shape: (8, 7)

group	x	y	p	coin_flip	avg_outcome	exp_value
str	i64	i64	f64	array[u16, 100]	f64	f64
"a"	1	8	0.1	[0, 0, … 0]	0.13	0.1
"a"	2	7	0.2	[1, 0, … 1]	0.5	0.4
"a"	3	6	0.3	[2, 2, … 0]	0.93	0.9
"a"	4	5	0.4	[2, 2, … 1]	1.53	1.6
"b"	5	4	0.5	[2, 3, … 3]	2.49	2.5
"b"	6	3	0.6	[3, 5, … 4]	3.6	3.6
"b"	7	2	0.7	[4, 4, … 4]	4.9	4.9
"b"	8	1	0.8	[6, 6, … 6]	6.47	6.4

Custom Aggregation

# Toy classification data: binary labels plus score columns for four models.
data_dict = {
  'group': ['a']*4 + ['b']*4,
  'truth': [1,1,0,0]*2,
  # scores are higher for the 0 labels than for the 1 labels
  'mod_bad': [0.25,0.25,0.75,0.75]*2, 
  # scores are higher for the 1 labels than for the 0 labels
  'mod_bst': [0.99,0.75,0.25,0.01]*2,
  # the same score for every row
  'mod_rnd': [0.5]*8,
  # identical to mod_bst for group 'a'; nearly constant for group 'b'
  'mod_mix': [0.99,0.75,0.25,0.01]+[0.5]*3+[0.6]
}
df = pl.DataFrame(data_dict)
# Print a compact one-line-per-column summary of the frame.
df.glimpse()

Rows: 8
Columns: 6
$ group   <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ truth   <i64> 1, 1, 0, 0, 1, 1, 0, 0
$ mod_bad <f64> 0.25, 0.25, 0.75, 0.75, 0.25, 0.25, 0.75, 0.75
$ mod_bst <f64> 0.99, 0.75, 0.25, 0.01, 0.99, 0.75, 0.25, 0.01
$ mod_rnd <f64> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5
$ mod_mix <f64> 0.99, 0.75, 0.25, 0.01, 0.5, 0.5, 0.5, 0.6

Rows: 8
Columns: 6
$ group   <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ truth   <i64> 1, 1, 0, 0, 1, 1, 0, 0
$ mod_bad <f64> 0.25, 0.25, 0.75, 0.75, 0.25, 0.25, 0.75, 0.75
$ mod_bst <f64> 0.99, 0.75, 0.25, 0.01, 0.99, 0.75, 0.25, 0.01
$ mod_rnd <f64> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5
$ mod_mix <f64> 0.99, 0.75, 0.25, 0.01, 0.5, 0.5, 0.5, 0.6

from sklearn.metrics import roc_auc_score

# Per-group AUROC computed with scikit-learn. The function receives the columns
# named in `exprs`, indexed in that order: x[0] is truth, x[1] is mod_mix.
# NOTE: no alias is given, so the result column is named `truth` (see output below).
df.group_by('group').agg (
    pl.map_groups(
    exprs = ['truth', 'mod_mix'],
    function = lambda x: roc_auc_score(x[0], x[1]),
    return_dtype = pl.Float64,
    # roc_auc_score yields a single float for each group
    returns_scalar = True
    )
)

shape: (2, 2)

group	truth
str	f64
"b"	0.25
"a"	1.0

df.group_by('group').agg (
    auroc = pds.query_roc_auc('truth', 'mod_bst')
)

shape: (2, 2)

group	auroc
str	f64
"b"	1.0
"a"	1.0

def auroc_expressions(models):
    """Lazily yield one ROC-AUC expression per model score column.

    Each column name in ``models`` is scored against the ``truth`` column with
    ``pds.query_roc_auc``, and the resulting expression is aliased to that
    same column name.
    """
    yield from (pds.query_roc_auc('truth', name).alias(name) for name in models)

mods = cs.expand_selector(df, cs.starts_with('mod_')) # could also do: [c for c in df.columns if c[:4] == 'mod_']
df.group_by('group').agg( auroc_expressions( mods ))

shape: (2, 5)

group	mod_bad	mod_bst	mod_rnd	mod_mix
str	f64	f64	f64	f64
"b"	-0.0	1.0	0.5	0.25
"a"	-0.0	1.0	0.5	1.0

Custom Partitions

import statsmodels.api as sm

# Fit one OLS model per group (mod_bst regressed on a constant plus mod_mix) and
# keep each fitted results object in an Object-typed column named `mod`.
(
df.group_by('group').agg (
    mod = pl.map_groups(
    exprs = ['mod_bst', 'mod_mix'],
    function = lambda x: sm.OLS( x[0].to_numpy(), sm.add_constant( x[1] )).fit() ,
    return_dtype = pl.Object,
    returns_scalar = True
    )
)
# Pull plain values out of each stored results object, one element at a time.
.with_columns(
    params = pl.col('mod').map_elements(lambda x: x.params, return_dtype = pl.List(pl.Float64)),
    r_sq  = pl.col('mod').map_elements(lambda x: x.rsquared, return_dtype = pl.Float64)
)
)

shape: (2, 4)

group	mod	params	r_sq
str	object	list[f64]	f64
"b"	<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x0000026EA086D350>	[3.93, -6.533333]	0.528971
"a"	<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x0000026EA1591E50>	[-1.7347e-16, 1.0]	1.0

# Split the frame into one sub-frame per group, keyed by the group value.
dfs = df.partition_by('group', as_dict=True, include_key=True)

# Fit mod_bst on a constant plus mod_mix separately within each partition.
mods = [
    sm.OLS(
        part['mod_bst'].to_numpy(),
        sm.add_constant(part['mod_mix'].to_numpy()),
    ).fit()
    for part in dfs.values()
]

# Keep the second fitted parameter of each model and pair it with its group key.
coef = [fit.params[1] for fit in mods]
{key: slope for key, slope in zip(dfs, coef)}

{('a',): np.float64(1.0000000000000004),
 ('b',): np.float64(-6.533333333333337)}