25  Aggregate

import polars as pl
import numpy as np
import re
iris = pl.read_csv("/Users/egenn/icloud/Data/iris.csv", 
    dtypes = {"Species": pl.Categorical})
iris
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56360/1124065206.py:1: DeprecationWarning:

The argument `dtypes` for `read_csv` is deprecated. It has been renamed to `schema_overrides`.
shape: (150, 5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
f64 f64 f64 f64 cat
5.1 3.5 1.4 0.2 "setosa"
4.9 3.0 1.4 0.2 "setosa"
4.7 3.2 1.3 0.2 "setosa"
4.6 3.1 1.5 0.2 "setosa"
5.0 3.6 1.4 0.2 "setosa"
6.7 3.0 5.2 2.3 "virginica"
6.3 2.5 5.0 1.9 "virginica"
6.5 3.0 5.2 2.0 "virginica"
6.2 3.4 5.4 2.3 "virginica"
5.9 3.0 5.1 1.8 "virginica"

Clean column names

iris.columns = [re.sub("\.", "_", col) for col in iris.columns]
iris.columns
<>:1: SyntaxWarning:

invalid escape sequence '\.'

<>:1: SyntaxWarning:

invalid escape sequence '\.'

/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56360/3413779961.py:1: SyntaxWarning:

invalid escape sequence '\.'
['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

25.1 Groupby + Mean

Get mean Sepal_Length by Species

iris.group_by('Species').agg(
    pl.col('Sepal_Length').mean().alias('Sepal Length (mean)'))
shape: (3, 2)
Species Sepal Length (mean)
cat f64
"virginica" 6.588
"setosa" 5.006
"versicolor" 5.936

To perform the above with lazy evaluation, just add df.lazy() and end with df.collect():

iris.lazy().group_by('Species').agg(
    pl.col('Sepal_Length').mean().alias('Sepal Length (mean)')).collect()
shape: (3, 2)
Species Sepal Length (mean)
cat f64
"virginica" 6.588
"versicolor" 5.936
"setosa" 5.006
q = (
    iris
    .lazy()
    .group_by("Species")
    .agg(
        pl.col("Sepal_Length").mean().alias("Sepal Length (mean)")
    )
)
q.collect()
shape: (3, 2)
Species Sepal Length (mean)
cat f64
"virginica" 6.588
"setosa" 5.006
"versicolor" 5.936

25.2 Groupby + multiple metrics

You can get multiple summary measure and sort results:

iris.group_by("Species").agg(
    [
        pl.col("Sepal_Length").mean().alias("Sepal_Length (Mean)"),
        pl.col("Petal_Length").mean().alias("Petal_Length (Mean)")
    ]
).sort('Species')
shape: (3, 3)
Species Sepal_Length (Mean) Petal_Length (Mean)
cat f64 f64
"setosa" 5.006 1.462
"versicolor" 5.936 4.26
"virginica" 6.588 5.552

with lazy evaluation:

iris.lazy().group_by('Species').agg(
    [
        pl.col("Sepal_Length").mean().alias("Sepal_Length (Mean)"),
        pl.col("Petal_Length").mean().alias("Petal_Length (Mean)")
    ]
).sort('Species')

NAIVE QUERY PLAN

run LazyFrame.show_graph() to see the optimized version

polars_query p1 SORT BY [col("Species")] p2 AGG [col("Sepal_Length").mean().alias("Sepal_Length (Mean)"), col("Petal_Length").mean().alias("Petal_Length (Mean)")] BY [col("Species")] p1--p2 p3 TABLE π */5 p2--p3

or build a query and collect() it:

q = (
    iris
    .lazy()
    .group_by("Species")
    .agg(
        [
        pl.col("Sepal_Length").mean().alias("Sepal_Length (Mean)"),
        pl.col("Petal_Length").mean().alias("Petal_Length (Mean)")
        ]
    )
    .sort("Species")
)
q.collect()
shape: (3, 3)
Species Sepal_Length (Mean) Petal_Length (Mean)
cat f64 f64
"setosa" 5.006 1.462
"versicolor" 5.936 4.26
"virginica" 6.588 5.552

25.3 Groupby counts

iris.group_by("Species").agg(pl.count())
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56360/3201567798.py:1: DeprecationWarning:

`pl.count()` is deprecated. Please use `pl.len()` instead.
shape: (3, 2)
Species count
cat u32
"setosa" 50
"virginica" 50
"versicolor" 50

Remove duplicates and repeat

iris = iris.unique()
iris.group_by("Species").agg(pl.count())
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56360/3201567798.py:1: DeprecationWarning:

`pl.count()` is deprecated. Please use `pl.len()` instead.
shape: (3, 2)
Species count
cat u32
"setosa" 50
"virginica" 49
"versicolor" 50

One virginica row was removed

iris.group_by("Species").agg(
    [
        pl.count().alias("Count"),
        pl.max('Sepal_Length').alias("Sepal Length (max)"),
        pl.min('Sepal_Length').alias("Sepal Length (min)")
    ]
)
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56360/635172873.py:3: DeprecationWarning:

`pl.count()` is deprecated. Please use `pl.len()` instead.
shape: (3, 4)
Species Count Sepal Length (max) Sepal Length (min)
cat u32 f64 f64
"versicolor" 50 7.0 4.9
"virginica" 49 7.9 4.9
"setosa" 50 5.8 4.3

the same with lazy evaluation

iris.lazy().group_by("Species").agg(
    [
        pl.count().alias("Count"),
        pl.max('Sepal_Length').alias("Sepal Length (max)"),
        pl.min('Sepal_Length').alias("Sepal Length (min)")
    ]
).collect()
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56360/359113352.py:3: DeprecationWarning:

`pl.count()` is deprecated. Please use `pl.len()` instead.
shape: (3, 4)
Species Count Sepal Length (max) Sepal Length (min)
cat u32 f64 f64
"versicolor" 50 7.0 4.9
"setosa" 50 5.8 4.3
"virginica" 49 7.9 4.9

or

q = (
    iris
    .lazy()
    .group_by('Species')
    .agg(
        [
            pl.count().alias("Count"),
            pl.max('Sepal_Length').alias("Sepal Length (max)"),
            pl.min('Sepal_Length').alias("Sepal Length (min)")
        ]
    )
)
q.collect()
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56360/3670571711.py:7: DeprecationWarning:

`pl.count()` is deprecated. Please use `pl.len()` instead.
shape: (3, 4)
Species Count Sepal Length (max) Sepal Length (min)
cat u32 f64 f64
"virginica" 49 7.9 4.9
"versicolor" 50 7.0 4.9
"setosa" 50 5.8 4.3

25.4 Columnwise operations

iris.select(pl.col(iris.columns[0:4]))
shape: (149, 4)
Sepal_Length Sepal_Width Petal_Length Petal_Width
f64 f64 f64 f64
5.8 2.6 4.0 1.2
4.3 3.0 1.1 0.1
6.7 3.1 5.6 2.4
6.9 3.2 5.7 2.3
5.4 3.4 1.7 0.2
6.7 3.0 5.2 2.3
5.6 2.9 3.6 1.3
4.6 3.6 1.0 0.2
4.4 3.2 1.3 0.2
4.5 2.3 1.3 0.3

25.5 Resources