import polars as pl
import polars.selectors as cs
import polars_ds as pds
import numpy as np
# Toy dataset: two groups of four rows each, with increasing x,
# decreasing y, and a probability column p in (0, 1).
data_dict = {
    'group': ['a'] * 4 + ['b'] * 4,
    'x': np.arange(1, 9),
    'y': np.arange(8, 0, -1),
    'p': np.arange(1, 9) / 10,
}
df = pl.DataFrame(data_dict)
df.glimpse()
Rows: 8
Columns: 4
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ x     <i64> 1, 2, 3, 4, 5, 6, 7, 8
$ y     <i64> 8, 7, 6, 5, 4, 3, 2, 1
$ p     <f64> 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8

Rows: 8
Columns: 4
$ group <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ x     <i64> 1, 2, 3, 4, 5, 6, 7, 8
$ y     <i64> 8, 7, 6, 5, 4, 3, 2, 1
$ p     <f64> 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8

Modularizing:

Applying custom:

Aggregations:

Extension packages:

Modularizing Polars Logic

def cap(c: pl.Expr, ceil: int = 5) -> pl.Expr:
    """Clamp expression *c* so that no value exceeds *ceil* (default 5)."""
    capped = pl.when(c > ceil).then(ceil).otherwise(c)
    return capped

df.with_columns( cap( pl.col('x') ))
shape: (8, 5)
group x y p literal
str i64 i64 f64 i64
"a" 1 8 0.1 1
"a" 2 7 0.2 2
"a" 3 6 0.3 3
"a" 4 5 0.4 4
"b" 5 4 0.5 5
"b" 6 3 0.6 5
"b" 7 2 0.7 5
"b" 8 1 0.8 5
df.map_columns( cs.numeric(), cap)
shape: (8, 4)
group x y p
str i64 i64 f64
"a" 1 5 0.1
"a" 2 5 0.2
"a" 3 5 0.3
"a" 4 5 0.4
"b" 5 4 0.5
"b" 5 3 0.6
"b" 5 2 0.7
"b" 5 1 0.8
# Re-definition of `cap`, identical to the earlier one; repeated so this
# section of the walkthrough stands on its own.
def cap(c:pl.Expr, ceil:int = 5) -> pl.Expr: return pl.when( c > ceil).then(ceil).otherwise( c )

df.with_columns( cs.numeric().pipe(cap).name.keep() )
shape: (8, 4)
group x y p
str i64 i64 f64
"a" 1 5 0.1
"a" 2 5 0.2
"a" 3 5 0.3
"a" 4 5 0.4
"b" 5 4 0.5
"b" 5 3 0.6
"b" 5 2 0.7
"b" 5 1 0.8
def calc_stats(df: pl.DataFrame, threshhold: int = 5) -> pl.DataFrame:
    """Append absolute-difference columns to *df*.

    Adds two columns:
      * ``abs``      -- |x - y|
      * ``abs_gt_t`` -- True where |x - y| exceeds ``threshhold``

    NOTE(review): the parameter is spelled ``threshhold`` (sic); callers in
    this file pass it by keyword with that spelling, so the name is kept
    for backward compatibility.
    """
    # Build the |x - y| expression once and reuse it for both columns
    # (the original computed it twice).
    abs_diff = (pl.col('x') - pl.col('y')).abs()
    df_out = df.with_columns(
        abs = abs_diff,
        abs_gt_t = abs_diff > threshhold,
    )
    return df_out

df.pipe(calc_stats)
shape: (8, 6)
group x y p abs abs_gt_t
str i64 i64 f64 i64 bool
"a" 1 8 0.1 7 true
"a" 2 7 0.2 5 false
"a" 3 6 0.3 3 false
"a" 4 5 0.4 1 false
"b" 5 4 0.5 1 false
"b" 6 3 0.6 3 false
"b" 7 2 0.7 5 false
"b" 8 1 0.8 7 true
df.pipe(calc_stats, threshhold = 3)
shape: (8, 6)
group x y p abs abs_gt_t
str i64 i64 f64 i64 bool
"a" 1 8 0.1 7 true
"a" 2 7 0.2 5 true
"a" 3 6 0.3 3 false
"a" 4 5 0.4 1 false
"b" 5 4 0.5 1 false
"b" 6 3 0.6 3 false
"b" 7 2 0.7 5 true
"b" 8 1 0.8 7 true

Custom Function

from numpy.random import binomial

# one column in, one value out: each row's p drives one Bernoulli draw
# (binomial with n=1), returned as an unsigned 16-bit column
df.with_columns(
    coin_flip = pl.col('p').map_batches(function = lambda p: binomial(n = 1, p = p), returns_scalar = True, return_dtype = pl.UInt16)
)

# two columns in, one value out - with exprs: pl.map_batches receives the
# listed expressions as a list z, in order (z[0]=x, z[1]=p)
df.with_columns(
    coin_flip = pl.map_batches(exprs = ['x', 'p'],
                               function = lambda z: binomial(n = z[0], p = z[1]), 
                               returns_scalar = True, return_dtype = pl.UInt16)
)

# two columns in, one value out - with struct
# NOTE(review): this snippet is byte-identical to the next one; the
# original label said "with exprs", which looks like a copy/paste slip --
# the expression-list variant is the pl.map_batches(exprs=...) example
# above.
df.with_columns(
    coin_flip = pl.struct('x','p').map_batches(
                               function = lambda z: binomial(n = z.struct.field('x'), p = z.struct.field('p')), 
                               returns_scalar = True, return_dtype = pl.UInt16)
)

# two column in, one value out - with struct
df.with_columns(
    coin_flip = pl.struct('x','p').map_batches(
                               function = lambda z: binomial(n = z.struct.field('x'), p = z.struct.field('p')), 
                               returns_scalar = True, return_dtype = pl.UInt16)
)

# many columns out: draw 100 binomial samples per row and keep them all
# as a fixed-width Array column, then summarise each row's draws
df.with_columns(
    coin_flip = pl.struct('x','p').map_batches(
                               function = lambda z: binomial(n = z.struct['x'], 
                                                             p = z.struct['p'],
                                                             size = (100,z.shape[0])
                                                             ).transpose(), 
                                # numpy returns shape (100, n_rows);
                                # transpose -> (n_rows, 100) so each frame
                                # row gets its own 100 draws
                                return_dtype = pl.Array(pl.UInt16, 100) 
                                )
).with_columns( 
    # per-row mean of the 100 simulated draws vs. the analytic mean n*p
    avg_outcome = pl.col('coin_flip').arr.mean(),
    exp_value = pl.col('x') * pl.col('p')
)
shape: (8, 7)
group x y p coin_flip avg_outcome exp_value
str i64 i64 f64 array[u16, 100] f64 f64
"a" 1 8 0.1 [0, 0, … 0] 0.13 0.1
"a" 2 7 0.2 [1, 0, … 1] 0.5 0.4
"a" 3 6 0.3 [2, 2, … 0] 0.93 0.9
"a" 4 5 0.4 [2, 2, … 1] 1.53 1.6
"b" 5 4 0.5 [2, 3, … 3] 2.49 2.5
"b" 6 3 0.6 [3, 5, … 4] 3.6 3.6
"b" 7 2 0.7 [4, 4, … 4] 4.9 4.9
"b" 8 1 0.8 [6, 6, … 6] 6.47 6.4

Custom Aggregation

# Evaluation dataset: binary truth labels plus four model-score columns --
# mod_bad scores are inverted relative to truth, mod_bst ranks perfectly,
# mod_rnd is a constant 0.5, and mod_mix has mod_bst's scores for the
# first four rows (group 'a') and near-constant scores for group 'b'.
data_dict = {
  'group': ['a']*4 + ['b']*4,
  'truth': [1,1,0,0]*2,
  'mod_bad': [0.25,0.25,0.75,0.75]*2, 
  'mod_bst': [0.99,0.75,0.25,0.01]*2,
  'mod_rnd': [0.5]*8,
  'mod_mix': [0.99,0.75,0.25,0.01]+[0.5]*3+[0.6]
}
df = pl.DataFrame(data_dict)
df.glimpse()
Rows: 8
Columns: 6
$ group   <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ truth   <i64> 1, 1, 0, 0, 1, 1, 0, 0
$ mod_bad <f64> 0.25, 0.25, 0.75, 0.75, 0.25, 0.25, 0.75, 0.75
$ mod_bst <f64> 0.99, 0.75, 0.25, 0.01, 0.99, 0.75, 0.25, 0.01
$ mod_rnd <f64> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5
$ mod_mix <f64> 0.99, 0.75, 0.25, 0.01, 0.5, 0.5, 0.5, 0.6

Rows: 8
Columns: 6
$ group   <str> 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'
$ truth   <i64> 1, 1, 0, 0, 1, 1, 0, 0
$ mod_bad <f64> 0.25, 0.25, 0.75, 0.75, 0.25, 0.25, 0.75, 0.75
$ mod_bst <f64> 0.99, 0.75, 0.25, 0.01, 0.99, 0.75, 0.25, 0.01
$ mod_rnd <f64> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5
$ mod_mix <f64> 0.99, 0.75, 0.25, 0.01, 0.5, 0.5, 0.5, 0.6
from sklearn.metrics import roc_auc_score

# Per-group ROC AUC via a Python UDF: map_groups hands each group's
# truth/mod_mix Series (in exprs order) to sklearn.  The result column
# inherits the first expression's name ('truth') since no alias is given.
df.group_by('group').agg (
    pl.map_groups(
    exprs = ['truth', 'mod_mix'],
    function = lambda x: roc_auc_score(x[0], x[1]),
    return_dtype = pl.Float64,
    returns_scalar = True
    )
)
shape: (2, 2)
group truth
str f64
"b" 0.25
"a" 1.0
# Same per-group AUC using polars_ds's built-in expression -- no Python
# UDF round-trip needed, and the column can be aliased directly.
df.group_by('group').agg (
    auroc = pds.query_roc_auc('truth', 'mod_bst')
)
shape: (2, 2)
group auroc
str f64
"b" 1.0
"a" 1.0
def auroc_expressions(models):
    """Lazily produce one ROC-AUC aggregation expression per model column.

    Each expression scores the named column against `truth` and is
    aliased back to the model column's own name.
    """
    return (pds.query_roc_auc('truth', name).alias(name) for name in models)

# expand_selector resolves the selector against df's schema into a tuple
# of concrete column names, which feeds the expression generator above
mods = cs.expand_selector(df, cs.starts_with('mod_')) # could also do: [c for c in df.columns if c[:4] == 'mod_']
df.group_by('group').agg( auroc_expressions( mods ))
shape: (2, 5)
group mod_bad mod_bst mod_rnd mod_mix
str f64 f64 f64 f64
"b" -0.0 1.0 0.5 0.25
"a" -0.0 1.0 0.5 1.0

Custom Partitions

import statsmodels.api as sm

# Fit one OLS model per group and store the fitted results object itself
# in an Object-dtype column, then pull scalar summaries back out of it.
(
df.group_by('group').agg (
    mod = pl.map_groups(
    exprs = ['mod_bst', 'mod_mix'],
    # x is a list of Series in exprs order: x[0] = mod_bst (response),
    # x[1] = mod_mix (predictor; add_constant prepends an intercept)
    function = lambda x: sm.OLS( x[0].to_numpy(), sm.add_constant( x[1] )).fit() ,
    return_dtype = pl.Object,
    returns_scalar = True
    )
)
.with_columns(
    # map_elements unpacks each stored results object into plain columns
    params = pl.col('mod').map_elements(lambda x: x.params, return_dtype = pl.List(pl.Float64)),
    r_sq  = pl.col('mod').map_elements(lambda x: x.rsquared, return_dtype = pl.Float64)
)
)
shape: (2, 4)
group mod params r_sq
str object list[f64] f64
"b" <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x0000026EA086D350> [3.93, -6.533333] 0.528971
"a" <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x0000026EA1591E50> [-1.7347e-16, 1.0] 1.0
# Same per-group fits without a polars UDF: partition into a
# {group_key: sub-frame} dict and loop in plain Python.
dfs = df.partition_by('group', as_dict = True, include_key = True)
# Only the sub-frames are needed in the fit loop, so iterate .values();
# the keys are re-attached below via dfs.keys(), which iterates in the
# same insertion order.
mods = [sm.OLS(d['mod_bst'].to_numpy(),
               sm.add_constant(d['mod_mix'].to_numpy())
               ).fit() for d in dfs.values()]
coef = [m.params[1] for m in mods]  # params[1] = slope on mod_mix (params[0] is the intercept)
dict(zip(dfs.keys(), coef))
{('a',): np.float64(1.0000000000000004),
 ('b',): np.float64(-6.533333333333337)}