26  DuckDB

import duckdb
import polars as pl

26.1 Setup Connection

By default, connect()’s database argument defaults to :memory:, creating a non-persistent, in-memory object.

con = duckdb.connect()

26.2 Settings

con.execute(
    """
    PRAGMA enable_progress_bar;
    """
)
<duckdb.duckdb.DuckDBPyConnection at 0x111d0b370>

26.3 Read CSV

con.execute(
    "SELECT * FROM read_csv_auto('~/icloud/Data/iris.csv');"
)
<duckdb.duckdb.DuckDBPyConnection at 0x111d0b370>
con.execute(
    "CREATE TABLE iris AS SELECT * FROM '~/icloud/Data/iris.csv';"
)
<duckdb.duckdb.DuckDBPyConnection at 0x111d0b370>
con.execute(
    """
    CREATE TABLE ir AS SELECT * FROM 
    read_csv_auto('/Users/egenn/icloud/Data/iris.csv', 
    delim=',', header=True);
    """
)
<duckdb.duckdb.DuckDBPyConnection at 0x111d0b370>

26.3.1 Fetch data to pandas DataFrame:

iris = con.execute(
    "SELECT * from iris"
).fetchdf()
iris.head()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
type(iris)
pandas.core.frame.DataFrame

26.3.2 Fetch data to polars DataFrame:

iris = pl.from_arrow(
    con.execute(
        "SELECT * from iris"
    ).fetch_arrow_table()
)
iris.head()
shape: (5, 5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
f64 f64 f64 f64 str
5.1 3.5 1.4 0.2 "setosa"
4.9 3.0 1.4 0.2 "setosa"
4.7 3.2 1.3 0.2 "setosa"
4.6 3.1 1.5 0.2 "setosa"
5.0 3.6 1.4 0.2 "setosa"
type(iris)
polars.dataframe.frame.DataFrame

26.4 Read CSV specifying schema

con.execute(
    """
    CREATE TABLE iris2(
        "Sepal.Length" DOUBLE,
        "Sepal.Width" DOUBLE, 
        "Petal.Length" DOUBLE,
        "Petal.Width" DOUBLE, 
        Species VARCHAR);
    COPY iris2 FROM '/Users/egenn/icloud/Data/iris.csv' (AUTO_DETECT TRUE);
    """
)
con.execute("SELECT * FROM iris2").df().head()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

26.4.1 DuckDBPyRelation

ir = duckdb.from_csv_auto('/Users/egenn/icloud/Data/iris.csv')
ir
┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
│ Sepal.Length │ Sepal.Width │ Petal.Length │ Petal.Width │  Species  │
│    double    │   double    │    double    │   double    │  varchar  │
├──────────────┼─────────────┼──────────────┼─────────────┼───────────┤
│          5.1 │         3.5 │          1.4 │         0.2 │ setosa    │
│          4.9 │         3.0 │          1.4 │         0.2 │ setosa    │
│          4.7 │         3.2 │          1.3 │         0.2 │ setosa    │
│          4.6 │         3.1 │          1.5 │         0.2 │ setosa    │
│          5.0 │         3.6 │          1.4 │         0.2 │ setosa    │
│          5.4 │         3.9 │          1.7 │         0.4 │ setosa    │
│          4.6 │         3.4 │          1.4 │         0.3 │ setosa    │
│          5.0 │         3.4 │          1.5 │         0.2 │ setosa    │
│          4.4 │         2.9 │          1.4 │         0.2 │ setosa    │
│          4.9 │         3.1 │          1.5 │         0.1 │ setosa    │
│           ·  │          ·  │           ·  │          ·  │   ·       │
│           ·  │          ·  │           ·  │          ·  │   ·       │
│           ·  │          ·  │           ·  │          ·  │   ·       │
│          6.7 │         3.1 │          5.6 │         2.4 │ virginica │
│          6.9 │         3.1 │          5.1 │         2.3 │ virginica │
│          5.8 │         2.7 │          5.1 │         1.9 │ virginica │
│          6.8 │         3.2 │          5.9 │         2.3 │ virginica │
│          6.7 │         3.3 │          5.7 │         2.5 │ virginica │
│          6.7 │         3.0 │          5.2 │         2.3 │ virginica │
│          6.3 │         2.5 │          5.0 │         1.9 │ virginica │
│          6.5 │         3.0 │          5.2 │         2.0 │ virginica │
│          6.2 │         3.4 │          5.4 │         2.3 │ virginica │
│          5.9 │         3.0 │          5.1 │         1.8 │ virginica │
├──────────────┴─────────────┴──────────────┴─────────────┴───────────┤
│ 150 rows (20 shown)                                       5 columns │
└─────────────────────────────────────────────────────────────────────┘
type(ir)
duckdb.duckdb.DuckDBPyRelation

26.5 Select at read

Note: Sepal.Length below must have double quotes, while Species may have double or none:

iris = pl.from_arrow(
    con.execute(
        """
        SELECT "Sepal.Length", Species
        FROM read_csv_auto('/Users/egenn/icloud/Data/iris.csv');
        """
    ).fetch_arrow_table()
)
iris.head()
shape: (5, 2)
Sepal.Length Species
f64 str
5.1 "setosa"
4.9 "setosa"
4.7 "setosa"
4.6 "setosa"
5.0 "setosa"

Concatenate a string to select multiple columns, instead of passing a list or tuple:

cols = tuple(["Sepal.Length", "Species"])
cols = '"' + '", "'.join(cols) + '"'
iris = pl.from_arrow(
    con.execute(
        f"""
        SELECT {cols}
        FROM read_csv_auto('/Users/egenn/icloud/Data/iris.csv');
        """
    ).fetch_arrow_table()
)
iris.head()
shape: (5, 2)
Sepal.Length Species
f64 str
5.1 "setosa"
4.9 "setosa"
4.7 "setosa"
4.6 "setosa"
5.0 "setosa"

26.6 Filter at read

26.6.1 Read entire table

ids = tuple([21, 22, 23, 24, 25])
con.execute(
    f"""
    CREATE TABLE synth1 AS
    SELECT * FROM read_csv_auto('/Users/egenn/icloud/Data/synth1.csv')
    """
)
con.execute("SELECT * FROM synth1").df()
ID Age Group
0 1 49.001420 D
1 2 28.266542 B
2 3 31.025146 A
3 4 25.554986 C
4 5 36.689864 B
... ... ... ...
495 496 36.633847 A
496 497 34.457815 C
497 498 49.096331 A
498 499 31.708578 A
499 500 53.850253 A

500 rows × 3 columns

26.6.2 Read only specific IDs

ids = tuple([21, 22, 23, 24, 25])
con.execute(
    f"""
    CREATE TABLE synth1f AS
    SELECT * FROM read_csv_auto('/Users/egenn/icloud/Data/synth1.csv')
    WHERE ID in {ids};
    """
)
con.execute("SELECT * FROM synth1f").df()
ID Age Group
0 21 43.644608 A
1 22 43.836510 A
2 23 51.134057 C
3 24 52.115098 B
4 25 36.516745 C

26.7 Using Queries with the Python API

ids = tuple([21, 22, 23, 24, 25])
sq = duckdb.query(
    f"""
    SELECT * FROM read_csv_auto('/Users/egenn/icloud/Data/synth1.csv')
    WHERE ID in {ids};
    """
)
sq
┌───────┬──────────────────┬─────────┐
│  ID   │       Age        │  Group  │
│ int64 │      double      │ varchar │
├───────┼──────────────────┼─────────┤
│    21 │ 43.6446078379911 │ A       │
│    22 │ 43.8365100056002 │ A       │
│    23 │ 51.1340571729247 │ C       │
│    24 │ 52.1150978723465 │ B       │
│    25 │  36.516745383723 │ C       │
└───────┴──────────────────┴─────────┘

You can execute the query at any time:

sq.execute().df()
ID Age Group
0 21 43.644608 A
1 22 43.836510 A
2 23 51.134057 C
3 24 52.115098 B
4 25 36.516745 C

26.8 Compose queries

How to specify path, select, and filter programmatically

26.8.1 Select

distinct = "DISTINCT" # set to "" to not filter unique rows
sep = ","
path = "/Users/egenn/icloud/Data/synth1.csv"
cols = tuple(["ID", "Age"])
cols = '"' + '", "'.join(cols) + '"'
ignore_errors = True
# cols = "*" # to select all programmatically
sq = duckdb.query(
    f"""
    SELECT {distinct + cols} FROM 
    read_csv_auto('{path}', sep = '{sep}', ignore_errors = {ignore_errors});
    """
)
sq.execute().df()
ID Age
0 2 28.266542
1 31 41.177789
2 70 44.229731
3 92 42.604247
4 115 40.441800
... ... ...
495 445 43.608565
496 450 41.649456
497 454 41.697756
498 475 40.230046
499 484 52.954887

500 rows × 2 columns

26.8.2 Filter

Suppose you have a list of IDs in a list, convert to tuple first

ids = list(range(80, 91))
filter_col = "ID"
sq = sq.filter(f"{filter_col} in {tuple(ids)}")
sq.execute().df()
ID Age
0 90 39.540553
1 81 29.840911
2 87 53.437334
3 80 48.873065
4 82 58.708691
5 89 67.469268
6 84 29.255924
7 85 18.044240
8 86 45.345446
9 88 53.850035
10 83 50.761984

26.9 Resources