Files
training.python.datascience/workshop/pandas-shoes/dask_shoes.py
2025-07-04 19:58:11 +02:00

35 lines
1.2 KiB
Python

from contexttimer import Timer
from dask import dataframe as dd
from dask.distributed import Client
import pandas as pd
def _time_sorts(sort_once, column: str, loops: int) -> None:
    """Time ``loops`` sorts, printing each first value and the elapsed time.

    Args:
        sort_once: Callable that takes a single ``ascending`` bool and
            returns the sorted frame (already computed, for lazy backends).
        column: Name of the column whose first value is printed after
            every sort.
        loops: Number of sorts to run inside the timed region.
    """
    with Timer() as timer:
        for i in range(loops):
            # Flip the direction on every pass (descending first, because
            # ``bool(0 % 2)`` is False).
            result = sort_once(bool(i % 2))
            print(result[column].iloc[0])
    print(timer.elapsed)


if __name__ == '__main__':
    loops: int = 10
    csv_path = "data/womens-shoes.csv.xz"
    column = "prices.amountMin"

    # Use the client as a context manager so it is closed when the
    # benchmarks finish instead of lingering until interpreter teardown.
    # NOTE(review): the worker count is tied to the loop count here;
    # confirm that coupling is intended.
    with Client(n_workers=loops):
        df: dd.DataFrame = dd.read_csv(
            csv_path,
            low_memory=False,
            dtype={
                "asins": "object",
                "prices.offer": "object",
                "upc": "object",
                "weight": "object",
            },
        )
        pf = pd.read_csv(csv_path)
        # NOTE(review): engine="pyarrow" selects the PyArrow CSV parser;
        # if Arrow-backed columns are wanted as well, dtype_backend="pyarrow"
        # would be needed too - confirm the intent.
        af = pd.read_csv(csv_path, engine="pyarrow")

        # Dask Dataframe (±4800ms)
        _time_sorts(
            lambda ascending: df.sort_values(column, ascending=ascending).compute(),
            column,
            loops,
        )

        # Pandas Dataframe (±35ms)
        _time_sorts(
            lambda ascending: pf.sort_values(column, ascending=ascending),
            column,
            loops,
        )

        # Pandas PyArrow Dataframe (±29ms)
        _time_sorts(
            lambda ascending: af.sort_values(column, ascending=ascending),
            column,
            loops,
        )