import polars as pl
import re
21 Basics
21.1 Construct DataFrame by column
You can create a polars DataFrame from a dictionary of arrays/lists, i.e. by specifying the data column by column, just like in pandas:
dat1 = pl.DataFrame({"Fruit":["mango", "banana", "tangerine"],
"Rating":[8, 9, 7],
"Cost":[5, 2, 3]})
dat1
shape: (3, 3)
| Fruit | Rating | Cost |
|---|---|---|
| str | i64 | i64 |
| "mango" | 8 | 5 |
| "banana" | 9 | 2 |
| "tangerine" | 7 | 3 |
type(dat1)
polars.dataframe.frame.DataFrame
21.2 Read csv
dat = pl.read_csv("/Users/egenn/icloud/Data/iris.csv",
schema_overrides={'Species': pl.Categorical})
dat
shape: (150, 5)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | cat |
| 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
| 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
| 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
| 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
| 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
| … | … | … | … | … |
| 6.7 | 3.0 | 5.2 | 2.3 | "virginica" |
| 6.3 | 2.5 | 5.0 | 1.9 | "virginica" |
| 6.5 | 3.0 | 5.2 | 2.0 | "virginica" |
| 6.2 | 3.4 | 5.4 | 2.3 | "virginica" |
| 5.9 | 3.0 | 5.1 | 1.8 | "virginica" |
21.3 Get dimensions: shape
dat.shape
(150, 5)
21.4 Show first n rows: head()
head() returns the first 5 rows by default:
dat.head()
shape: (5, 5)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | cat |
| 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
| 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
| 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
| 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
| 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
dat.head(3)
shape: (3, 5)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | cat |
| 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
| 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
| 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
21.5 Get & set column names: df.columns
dat.columns
['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']
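Note: the pattern "\." below is written as a regular (non-raw) string, which is why Python emits the SyntaxWarning shown in the output; writing it as the raw string r"\." avoids the warning.
dat.columns = [re.sub("\.", "_", col) for col in list(dat.columns)]
dat.columns
<>:1: SyntaxWarning: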
invalid escape sequence '\.'
<>:1: SyntaxWarning:
invalid escape sequence '\.'
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_51394/1385048352.py:1: SyntaxWarning:
invalid escape sequence '\.'
['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']
21.6 Get column data types: df.dtypes
dat.dtypes
[Float64, Float64, Float64, Float64, Categorical(ordering='physical')]
21.7 Get names and types: df.schema
dat.schema
Schema([('Sepal_Length', Float64),
('Sepal_Width', Float64),
('Petal_Length', Float64),
('Petal_Width', Float64),
('Species', Categorical(ordering='physical'))])
21.8 Indexing
Polars favors using df.select() and df.filter() for indexing columns and rows. Square bracket indexing is also available, but may be removed in the future.
21.9 Select
dat.select("Species")
shape: (150, 1)
| Species |
|---|
| cat |
| "setosa" |
| "setosa" |
| "setosa" |
| "setosa" |
| "setosa" |
| … |
| "virginica" |
| "virginica" |
| "virginica" |
| "virginica" |
| "virginica" |
dat.select(["Sepal_Length", "Species"])
shape: (150, 2)
| Sepal_Length | Species |
|---|---|
| f64 | cat |
| 5.1 | "setosa" |
| 4.9 | "setosa" |
| 4.7 | "setosa" |
| 4.6 | "setosa" |
| 5.0 | "setosa" |
| … | … |
| 6.7 | "virginica" |
| 6.3 | "virginica" |
| 6.5 | "virginica" |
| 6.2 | "virginica" |
| 5.9 | "virginica" |
You can (for now?) also index using brackets:
dat[10:15, "Sepal_Length":"Petal_Length"]
shape: (5, 3)
| Sepal_Length | Sepal_Width | Petal_Length |
|---|---|---|
| f64 | f64 | f64 |
| 5.4 | 3.7 | 1.5 |
| 4.8 | 3.4 | 1.6 |
| 4.8 | 3.0 | 1.4 |
| 4.3 | 3.0 | 1.1 |
| 5.8 | 4.0 | 1.2 |
dat[0:10, 2:5]
shape: (10, 3)
| Petal_Length | Petal_Width | Species |
|---|---|---|
| f64 | f64 | cat |
| 1.4 | 0.2 | "setosa" |
| 1.4 | 0.2 | "setosa" |
| 1.3 | 0.2 | "setosa" |
| 1.5 | 0.2 | "setosa" |
| 1.4 | 0.2 | "setosa" |
| 1.7 | 0.4 | "setosa" |
| 1.4 | 0.3 | "setosa" |
| 1.5 | 0.2 | "setosa" |
| 1.4 | 0.2 | "setosa" |
| 1.5 | 0.1 | "setosa" |
21.10 Filter
dat.filter(pl.col("Species") == "versicolor")
shape: (50, 5)
| Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | cat |
| 7.0 | 3.2 | 4.7 | 1.4 | "versicolor" |
| 6.4 | 3.2 | 4.5 | 1.5 | "versicolor" |
| 6.9 | 3.1 | 4.9 | 1.5 | "versicolor" |
| 5.5 | 2.3 | 4.0 | 1.3 | "versicolor" |
| 6.5 | 2.8 | 4.6 | 1.5 | "versicolor" |
| … | … | … | … | … |
| 5.7 | 3.0 | 4.2 | 1.2 | "versicolor" |
| 5.7 | 2.9 | 4.2 | 1.3 | "versicolor" |
| 6.2 | 2.9 | 4.3 | 1.3 | "versicolor" |
| 5.1 | 2.5 | 3.0 | 1.1 | "versicolor" |
| 5.7 | 2.8 | 4.1 | 1.3 | "versicolor" |
dat.filter(pl.col("Species").cast(pl.Utf8).is_in(['versicolor', 'virginica']))
shape: (100, 5)
| Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | cat |
| 7.0 | 3.2 | 4.7 | 1.4 | "versicolor" |
| 6.4 | 3.2 | 4.5 | 1.5 | "versicolor" |
| 6.9 | 3.1 | 4.9 | 1.5 | "versicolor" |
| 5.5 | 2.3 | 4.0 | 1.3 | "versicolor" |
| 6.5 | 2.8 | 4.6 | 1.5 | "versicolor" |
| … | … | … | … | … |
| 6.7 | 3.0 | 5.2 | 2.3 | "virginica" |
| 6.3 | 2.5 | 5.0 | 1.9 | "virginica" |
| 6.5 | 3.0 | 5.2 | 2.0 | "virginica" |
| 6.2 | 3.4 | 5.4 | 2.3 | "virginica" |
| 5.9 | 3.0 | 5.1 | 1.8 | "virginica" |
dat.filter(pl.col("Sepal_Length") < 4.5)
shape: (4, 5)
| Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | cat |
| 4.4 | 2.9 | 1.4 | 0.2 | "setosa" |
| 4.3 | 3.0 | 1.1 | 0.1 | "setosa" |
| 4.4 | 3.0 | 1.3 | 0.2 | "setosa" |
| 4.4 | 3.2 | 1.3 | 0.2 | "setosa" |
dat.filter(pl.col("Petal_Length") < dat["Petal_Length"].mean())
shape: (57, 5)
| Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | cat |
| 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
| 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
| 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
| 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
| 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
| … | … | … | … | … |
| 5.6 | 2.9 | 3.6 | 1.3 | "versicolor" |
| 5.7 | 2.6 | 3.5 | 1.0 | "versicolor" |
| 5.5 | 2.4 | 3.7 | 1.0 | "versicolor" |
| 5.0 | 2.3 | 3.3 | 1.0 | "versicolor" |
| 5.1 | 2.5 | 3.0 | 1.1 | "versicolor" |
df = pl.DataFrame({
'ID': [1, 3, 5, 7 ],
'Age': [45, 43, 23, 76]
})
df
shape: (4, 2)
| ID | Age |
|---|---|
| i64 | i64 |
| 1 | 45 |
| 3 | 43 |
| 5 | 23 |
| 7 | 76 |
df.filter(pl.col("ID").is_in([3, 5]))
shape: (2, 2)
| ID | Age |
|---|---|
| i64 | i64 |
| 3 | 43 |
| 5 | 23 |