import polars as pl23 Reshape
23.1 Long to wide
Example 1:
dat_long = pl.DataFrame({"ID":list(range(1,4))*3,
"Fruit":["mango", "mango", "mango",
"banana", "banana", "banana",
"tangerine", "tangerine", "tangerine"],
"Score":[1.1, 2.1, 3.1, 1.2, 2.2, 3.2, 1.3, 2.3, 3.3]})
dat_long
shape: (9, 3)
| ID | Fruit | Score |
|---|---|---|
| i64 | str | f64 |
| 1 | "mango" | 1.1 |
| 2 | "mango" | 2.1 |
| 3 | "mango" | 3.1 |
| 1 | "banana" | 1.2 |
| 2 | "banana" | 2.2 |
| 3 | "banana" | 3.2 |
| 1 | "tangerine" | 1.3 |
| 2 | "tangerine" | 2.3 |
| 3 | "tangerine" | 3.3 |
dat_long.pivot(index = "ID", columns = "Fruit", values = "Score")/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56096/1831660072.py:1: DeprecationWarning:
The argument `columns` for `DataFrame.pivot` is deprecated. It has been renamed to `on`.
shape: (3, 4)
| ID | mango | banana | tangerine |
|---|---|---|---|
| i64 | f64 | f64 | f64 |
| 1 | 1.1 | 1.2 | 1.3 |
| 2 | 2.1 | 2.2 | 2.3 |
| 3 | 3.1 | 3.2 | 3.3 |
Example 2:
dat_long = pl.DataFrame({
"Account_ID": [8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004,
8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004],
"Age": [67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671, 67.8017038366664, 42.9198507293701,
46.2301756642422, 39.665983196671, 67.8017038366664,
42.9198507293701, 46.2301756642422, 39.665983196671,
67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671],
"Admission": ["ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED", "ED", "Planned", "Planned", "ED", "ED",
"Planned", "Planned", "ED"],
"Lab_key": ["RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC",
"Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit",
"Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"],
"Lab_value": [4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793, 8374.22887757195, 7612.37380499927,
8759.27855519425, 6972.28096216548, 36.272693147236,
40.5716317809522, 39.9888624177955, 39.8786884058422,
12.6188444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238]
})
dat_long
shape: (16, 5)
| Account_ID | Age | Admission | Lab_key | Lab_value |
|---|---|---|---|---|
| i64 | f64 | str | str | f64 |
| 8001 | 67.801704 | "ED" | "RBC" | 4.634493 |
| 8002 | 42.919851 | "Planned" | "RBC" | 3.349686 |
| 8003 | 46.230176 | "Planned" | "RBC" | 4.270372 |
| 8004 | 39.665983 | "ED" | "RBC" | 4.938977 |
| 8001 | 67.801704 | "ED" | "WBC" | 8374.228878 |
| … | … | … | … | … |
| 8004 | 39.665983 | "ED" | "Hematocrit" | 39.878688 |
| 8001 | 67.801704 | "ED" | "Hemoglobin" | 12.618844 |
| 8002 | 42.919851 | "Planned" | "Hemoglobin" | 12.173975 |
| 8003 | 46.230176 | "Planned" | "Hemoglobin" | 15.129343 |
| 8004 | 39.665983 | "ED" | "Hemoglobin" | 14.88857 |
Usage: df.pivot(value, index, columns)
Unfortunately, including Age in the index currently leads to an error (not executed)
dat_long.pivot(values = "Lab_value",
index = ["Account_ID", "Age", "Admission"],
columns = "Lab_key")Works if Age is cast to UTF-8 (and back to Float64)
dat_long = dat_long.with_columns(
pl.col("Age").cast(pl.Utf8)
)
dat_wide = dat_long.pivot(values = "Lab_value",
index = ["Account_ID", "Age", "Admission"],
columns = "Lab_key")
dat_wide = dat_wide.with_columns(
pl.col("Age").cast(pl.Float64)
)
dat_wide/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56096/1948555039.py:4: DeprecationWarning:
The argument `columns` for `DataFrame.pivot` is deprecated. It has been renamed to `on`.
shape: (4, 7)
| Account_ID | Age | Admission | RBC | WBC | Hematocrit | Hemoglobin |
|---|---|---|---|---|---|---|
| i64 | f64 | str | f64 | f64 | f64 | f64 |
| 8001 | 67.801704 | "ED" | 4.634493 | 8374.228878 | 36.272693 | 12.618844 |
| 8002 | 42.919851 | "Planned" | 3.349686 | 7612.373805 | 40.571632 | 12.173975 |
| 8003 | 46.230176 | "Planned" | 4.270372 | 8759.278555 | 39.988862 | 15.129343 |
| 8004 | 39.665983 | "ED" | 4.938977 | 6972.280962 | 39.878688 | 14.88857 |
Excluding Age, the pivot works:
dat_long.pivot(values = "Lab_value",
index = ["Account_ID", "Admission"],
columns = "Lab_key")/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56096/1152422138.py:1: DeprecationWarning:
The argument `columns` for `DataFrame.pivot` is deprecated. It has been renamed to `on`.
shape: (4, 6)
| Account_ID | Admission | RBC | WBC | Hematocrit | Hemoglobin |
|---|---|---|---|---|---|
| i64 | str | f64 | f64 | f64 | f64 |
| 8001 | "ED" | 4.634493 | 8374.228878 | 36.272693 | 12.618844 |
| 8002 | "Planned" | 3.349686 | 7612.373805 | 40.571632 | 12.173975 |
| 8003 | "Planned" | 4.270372 | 8759.278555 | 39.988862 | 15.129343 |
| 8004 | "ED" | 4.938977 | 6972.280962 | 39.878688 | 14.88857 |
23.2 Wide to Long
Example 1:
dat_wide = pl.DataFrame({"ID":[1, 2, 3],
"mango":[1.1, 2.1, 3.1],
"banana":[1.2, 2.2, 3.2],
"tangerine":[1.3, 2.3, 3.3]})
dat_wide
shape: (3, 4)
| ID | mango | banana | tangerine |
|---|---|---|---|
| i64 | f64 | f64 | f64 |
| 1 | 1.1 | 1.2 | 1.3 |
| 2 | 2.1 | 2.2 | 2.3 |
| 3 | 3.1 | 3.2 | 3.3 |
Usage: df.metl(id_vars, value_vars, variable_name, value_name)
dat_wide.melt("ID", value_vars = ['mango', "banana", "tangerine"])/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56096/687839568.py:1: DeprecationWarning:
`DataFrame.melt` is deprecated. Use `unpivot` instead, with `index` instead of `id_vars` and `on` instead of `value_vars`
shape: (9, 3)
| ID | variable | value |
|---|---|---|
| i64 | str | f64 |
| 1 | "mango" | 1.1 |
| 2 | "mango" | 2.1 |
| 3 | "mango" | 3.1 |
| 1 | "banana" | 1.2 |
| 2 | "banana" | 2.2 |
| 3 | "banana" | 3.2 |
| 1 | "tangerine" | 1.3 |
| 2 | "tangerine" | 2.3 |
| 3 | "tangerine" | 3.3 |
In the above example, value_vars can be ommitted: all columns other than those defined as ids will be used.
Example 2:
dat_wide = pl.DataFrame({
"Account_ID": [8001, 8002, 8003, 8004],
"Age": [67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671],
"RBC": [4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793],
"WBC": [8374.22887757195, 7612.37380499927, 8759.27855519425,
6972.28096216548],
"Hematocrit": [36.272693147236, 40.5716317809522, 39.9888624177955,
39.8786884058422],
"Hemoglobin": [12.618444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238],
"Admission": ["ED", "Planned", "Planned", "Planned"]
})
dat_wide
shape: (4, 7)
| Account_ID | Age | RBC | WBC | Hematocrit | Hemoglobin | Admission |
|---|---|---|---|---|---|---|
| i64 | f64 | f64 | f64 | f64 | f64 | str |
| 8001 | 67.801704 | 4.634493 | 8374.228878 | 36.272693 | 12.618445 | "ED" |
| 8002 | 42.919851 | 3.349686 | 7612.373805 | 40.571632 | 12.173975 | "Planned" |
| 8003 | 46.230176 | 4.270372 | 8759.278555 | 39.988862 | 15.129343 | "Planned" |
| 8004 | 39.665983 | 4.938977 | 6972.280962 | 39.878688 | 14.88857 | "Planned" |
Usage: df.metl(id_vars, value_vars, variable_name, value_name)
dat_wide.melt(id_vars = ["Account_ID", "Age", "Admission"],
value_vars = ["RBC", "WBC", "Hematocrit", "Hemoglobin"],
variable_name = "Lab",
value_name = "Value")/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_56096/2589903787.py:1: DeprecationWarning:
`DataFrame.melt` is deprecated. Use `unpivot` instead, with `index` instead of `id_vars` and `on` instead of `value_vars`
shape: (16, 5)
| Account_ID | Age | Admission | Lab | Value |
|---|---|---|---|---|
| i64 | f64 | str | str | f64 |
| 8001 | 67.801704 | "ED" | "RBC" | 4.634493 |
| 8002 | 42.919851 | "Planned" | "RBC" | 3.349686 |
| 8003 | 46.230176 | "Planned" | "RBC" | 4.270372 |
| 8004 | 39.665983 | "Planned" | "RBC" | 4.938977 |
| 8001 | 67.801704 | "ED" | "WBC" | 8374.228878 |
| … | … | … | … | … |
| 8004 | 39.665983 | "Planned" | "Hematocrit" | 39.878688 |
| 8001 | 67.801704 | "ED" | "Hemoglobin" | 12.618445 |
| 8002 | 42.919851 | "Planned" | "Hemoglobin" | 12.173975 |
| 8003 | 46.230176 | "Planned" | "Hemoglobin" | 15.129343 |
| 8004 | 39.665983 | "Planned" | "Hemoglobin" | 14.88857 |
Note: value_vars above can be omitted and will default to remaining columns