import dask.dataframe as dd
import pandas as pd18 Reshape
18.1 Long to wide
Example 1:
dat_long = pd.DataFrame({'ID':[1, 2, 3, 1, 2, 3, 1, 2, 3],
'Fruit':['mango', 'mango', 'mango',
'banana', 'banana', 'banana',
'tangerine', 'tangerine', 'tangerine'],
'Score':[1.1, 2.1, 3.1, 1.2, 2.2, 3.2, 1.3, 2.3, 3.3]})
dat_long['Fruit'] = dat_long['Fruit'].astype('category')
dat_long = dd.from_pandas(dat_long, npartitions=1)
dat_longDask DataFrame Structure:
| ID | Fruit | Score | |
|---|---|---|---|
| npartitions=1 | |||
| 0 | int64 | category[known] | float64 |
| 8 | ... | ... | ... |
Dask Name: frompandas, 1 expression
Usage: dd.pivot(df, index, columns, values)
dat_long.pivot_table(
index = 'ID',
columns = 'Fruit',
values = 'Score').compute()| Fruit | banana | mango | tangerine |
|---|---|---|---|
| ID | |||
| 1 | 1.2 | 1.1 | 1.3 |
| 2 | 2.2 | 2.1 | 2.3 |
| 3 | 3.2 | 3.1 | 3.3 |
Example 2:
dat_long = pd.DataFrame({
'Account_ID': [8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004,
8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004],
'Age': [67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671, 67.8017038366664, 42.9198507293701,
46.2301756642422, 39.665983196671, 67.8017038366664,
42.9198507293701, 46.2301756642422, 39.665983196671,
67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671],
'Admission': ['ED', 'Planned', 'Planned', 'ED', 'ED', 'Planned',
'Planned', 'ED', 'ED', 'Planned', 'Planned', 'ED', 'ED',
'Planned', 'Planned', 'ED'],
'Lab_key': ['RBC', 'RBC', 'RBC', 'RBC', 'WBC', 'WBC', 'WBC', 'WBC',
'Hematocrit', 'Hematocrit', 'Hematocrit', 'Hematocrit',
'Hemoglobin', 'Hemoglobin', 'Hemoglobin', 'Hemoglobin'],
'Lab_value': [4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793, 8374.22887757195, 7612.37380499927,
8759.27855519425, 6972.28096216548, 36.272693147236,
40.5716317809522, 39.9888624177955, 39.8786884058422,
12.6188444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238]
})
dat_long.Lab_key = dat_long.Lab_key.astype('category')
dat_long = dd.from_pandas(dat_long, npartitions=1)
dat_longDask DataFrame Structure:
| Account_ID | Age | Admission | Lab_key | Lab_value | |
|---|---|---|---|---|---|
| npartitions=1 | |||||
| 0 | int64 | float64 | string | category[known] | float64 |
| 15 | ... | ... | ... | ... | ... |
Dask Name: frompandas, 1 expression
Usage: df.pivot_table(index, columns, values)
Unlike Pandas, the index needs to be a single column of type ‘category’
dat_long.pivot_table(
index = 'Account_ID',
columns = 'Lab_key',
values = 'Lab_value')Dask DataFrame Structure:
| Hematocrit | Hemoglobin | RBC | WBC | |
|---|---|---|---|---|
| npartitions=1 | ||||
| float64 | float64 | float64 | float64 | |
| ... | ... | ... | ... |
Dask Name: pivottable, 2 expressions
18.2 Wide to Long
dat_wide = pd.DataFrame({'ID':[1, 2, 3],
'mango':[1.1, 2.1, 3.1],
'banana':[1.2, 2.2, 3.2],
'tangerine':[1.3, 2.3, 3.3]})
dat_wide = dd.from_pandas(dat_wide, npartitions=1)
dat_wide.compute()| ID | mango | banana | tangerine | |
|---|---|---|---|---|
| 0 | 1 | 1.1 | 1.2 | 1.3 |
| 1 | 2 | 2.1 | 2.2 | 2.3 |
| 2 | 3 | 3.1 | 3.2 | 3.3 |
Usage: pdd.melt(id_vars, value_vars, var_name, value_name)
dat_wide.melt(
id_vars = 'ID',
value_vars = ['mango', 'banana', 'tangerine'],
var_name = 'Fruit',
value_name = 'Value').compute()| ID | Fruit | Value | |
|---|---|---|---|
| 0 | 1 | mango | 1.1 |
| 1 | 2 | mango | 2.1 |
| 2 | 3 | mango | 3.1 |
| 3 | 1 | banana | 1.2 |
| 4 | 2 | banana | 2.2 |
| 5 | 3 | banana | 3.2 |
| 6 | 1 | tangerine | 1.3 |
| 7 | 2 | tangerine | 2.3 |
| 8 | 3 | tangerine | 3.3 |
Sam as in Pandas, in the above example, value_vars can be ommitted: all columns other than those defined as ids will be used.
Example 2:
dat_wide = pd.DataFrame({
'Account_ID': [8001, 8002, 8003, 8004],
'Age': [67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671],
'RBC': [4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793],
'WBC': [8374.22887757195, 7612.37380499927, 8759.27855519425,
6972.28096216548],
'Hematocrit': [36.272693147236, 40.5716317809522, 39.9888624177955,
39.8786884058422],
'Hemoglobin': [12.618444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238],
'Admission': ['ED', 'Planned', 'Planned', 'Planned']
})
dat_wide = dd.from_pandas(dat_wide, npartitions=1)
dat_wide.compute()| Account_ID | Age | RBC | WBC | Hematocrit | Hemoglobin | Admission | |
|---|---|---|---|---|---|---|---|
| 0 | 8001 | 67.801704 | 4.634493 | 8374.228878 | 36.272693 | 12.618445 | ED |
| 1 | 8002 | 42.919851 | 3.349686 | 7612.373805 | 40.571632 | 12.173975 | Planned |
| 2 | 8003 | 46.230176 | 4.270372 | 8759.278555 | 39.988862 | 15.129343 | Planned |
| 3 | 8004 | 39.665983 | 4.938977 | 6972.280962 | 39.878688 | 14.888570 | Planned |
Usage: pd.melt(df, id_vars, value_vars, var_name, value_name)
dat_wide.melt(
id_vars = ['Account_ID', 'Age', 'Admission'],
value_vars = ['RBC', 'WBC', 'Hematocrit', 'Hemoglobin'],
var_name = 'Lab',
value_name = 'Value').compute()| Account_ID | Age | Admission | Lab | Value | |
|---|---|---|---|---|---|
| 0 | 8001 | 67.801704 | ED | RBC | 4.634493 |
| 1 | 8002 | 42.919851 | Planned | RBC | 3.349686 |
| 2 | 8003 | 46.230176 | Planned | RBC | 4.270372 |
| 3 | 8004 | 39.665983 | Planned | RBC | 4.938977 |
| 4 | 8001 | 67.801704 | ED | WBC | 8374.228878 |
| 5 | 8002 | 42.919851 | Planned | WBC | 7612.373805 |
| 6 | 8003 | 46.230176 | Planned | WBC | 8759.278555 |
| 7 | 8004 | 39.665983 | Planned | WBC | 6972.280962 |
| 8 | 8001 | 67.801704 | ED | Hematocrit | 36.272693 |
| 9 | 8002 | 42.919851 | Planned | Hematocrit | 40.571632 |
| 10 | 8003 | 46.230176 | Planned | Hematocrit | 39.988862 |
| 11 | 8004 | 39.665983 | Planned | Hematocrit | 39.878688 |
| 12 | 8001 | 67.801704 | ED | Hemoglobin | 12.618445 |
| 13 | 8002 | 42.919851 | Planned | Hemoglobin | 12.173975 |
| 14 | 8003 | 46.230176 | Planned | Hemoglobin | 15.129343 |
| 15 | 8004 | 39.665983 | Planned | Hemoglobin | 14.888570 |