import dask.dataframe as dd
import pandas as pd19 Joins
We are using very small data frames as an example. Such operations would normally be done in dask instead of pandas for much bigger datasets.
a = pd.DataFrame({
"PID": list(range(1, 10)),
"Hospital": ["UCSF", "HUP", "Stanford",
"Stanford", "UCSF", "HUP",
"HUP", "Stanford", "UCSF"],
"Age": [22, 34, 41, 19, 53, 21, 63, 22, 19],
"Sex": [1, 1, 0, 1, 0, 0, 1, 0, 0]
})
a.set_index("PID")
a = dd.from_pandas(a, npartitions=1)
aDask DataFrame Structure:
| PID | Hospital | Age | Sex | |
|---|---|---|---|---|
| npartitions=1 | ||||
| 0 | int64 | string | int64 | int64 |
| 8 | ... | ... | ... | ... |
Dask Name: frompandas, 1 expression
b = pd.DataFrame({
"PID": list(range(6, 13)),
"V1": [153, 89, 112, 228, 91, 190, 101],
"Department": ["Neurology", "Radiology",
"Emergency", "Cardiology",
"Surgery", "Neurology", "Psychiatry"]
})
# Optional, but supposed to help in large datasets
b.set_index("PID")
b = dd.from_pandas(b, npartitions=1)
bDask DataFrame Structure:
| PID | V1 | Department | |
|---|---|---|---|
| npartitions=1 | |||
| 0 | int64 | int64 | string |
| 6 | ... | ... | ... |
Dask Name: frompandas, 1 expression
19.1 Inner join
Similar to Pandas
dd.merge(a, b, on='PID', how='inner').compute()| PID | Hospital | Age | Sex | V1 | Department | |
|---|---|---|---|---|---|---|
| 0 | 6 | HUP | 21 | 0 | 153 | Neurology |
| 1 | 7 | HUP | 63 | 1 | 89 | Radiology |
| 2 | 8 | Stanford | 22 | 0 | 112 | Emergency |
| 3 | 9 | UCSF | 19 | 0 | 228 | Cardiology |
or:
a.merge(b, on='PID', how='inner').compute()| PID | Hospital | Age | Sex | V1 | Department | |
|---|---|---|---|---|---|---|
| 0 | 6 | HUP | 21 | 0 | 153 | Neurology |
| 1 | 7 | HUP | 63 | 1 | 89 | Radiology |
| 2 | 8 | Stanford | 22 | 0 | 112 | Emergency |
| 3 | 9 | UCSF | 19 | 0 | 228 | Cardiology |
Since we have set an index on both DataFrames, we can skip ‘on’:
dd.merge(a, b, how = "inner").compute()| PID | Hospital | Age | Sex | V1 | Department | |
|---|---|---|---|---|---|---|
| 0 | 6 | HUP | 21 | 0 | 153 | Neurology |
| 1 | 7 | HUP | 63 | 1 | 89 | Radiology |
| 2 | 8 | Stanford | 22 | 0 | 112 | Emergency |
| 3 | 9 | UCSF | 19 | 0 | 228 | Cardiology |
Note: The pd.join syntax is simialr to polars, but unlike polars, it only works correctly under specific conditions and will often give wrong results with no error or warning - avoid.
19.2 Outer join
dd.merge(a, b, on = "PID", how = "outer").compute()| PID | Hospital | Age | Sex | V1 | Department | |
|---|---|---|---|---|---|---|
| 0 | 1 | UCSF | 22.0 | 1.0 | NaN | <NA> |
| 1 | 2 | HUP | 34.0 | 1.0 | NaN | <NA> |
| 2 | 3 | Stanford | 41.0 | 0.0 | NaN | <NA> |
| 3 | 4 | Stanford | 19.0 | 1.0 | NaN | <NA> |
| 4 | 5 | UCSF | 53.0 | 0.0 | NaN | <NA> |
| 5 | 6 | HUP | 21.0 | 0.0 | 153.0 | Neurology |
| 6 | 7 | HUP | 63.0 | 1.0 | 89.0 | Radiology |
| 7 | 8 | Stanford | 22.0 | 0.0 | 112.0 | Emergency |
| 8 | 9 | UCSF | 19.0 | 0.0 | 228.0 | Cardiology |
| 9 | 10 | <NA> | NaN | NaN | 91.0 | Surgery |
| 10 | 11 | <NA> | NaN | NaN | 190.0 | Neurology |
| 11 | 12 | <NA> | NaN | NaN | 101.0 | Psychiatry |
19.3 Left outer join
dd.merge(a, b, on = "PID", how = "left").compute()| PID | Hospital | Age | Sex | V1 | Department | |
|---|---|---|---|---|---|---|
| 0 | 1 | UCSF | 22 | 1 | NaN | <NA> |
| 1 | 2 | HUP | 34 | 1 | NaN | <NA> |
| 2 | 3 | Stanford | 41 | 0 | NaN | <NA> |
| 3 | 4 | Stanford | 19 | 1 | NaN | <NA> |
| 4 | 5 | UCSF | 53 | 0 | NaN | <NA> |
| 5 | 6 | HUP | 21 | 0 | 153.0 | Neurology |
| 6 | 7 | HUP | 63 | 1 | 89.0 | Radiology |
| 7 | 8 | Stanford | 22 | 0 | 112.0 | Emergency |
| 8 | 9 | UCSF | 19 | 0 | 228.0 | Cardiology |
19.4 Right outer join
dd.merge(a, b, on = "PID", how = "right").compute()| PID | Hospital | Age | Sex | V1 | Department | |
|---|---|---|---|---|---|---|
| 0 | 6 | HUP | 21.0 | 0.0 | 153 | Neurology |
| 1 | 7 | HUP | 63.0 | 1.0 | 89 | Radiology |
| 2 | 8 | Stanford | 22.0 | 0.0 | 112 | Emergency |
| 3 | 9 | UCSF | 19.0 | 0.0 | 228 | Cardiology |
| 4 | 10 | <NA> | NaN | NaN | 91 | Surgery |
| 5 | 11 | <NA> | NaN | NaN | 190 | Neurology |
| 6 | 12 | <NA> | NaN | NaN | 101 | Psychiatry |