"""Benchmark repeated ``sort_values`` calls on one CSV with Dask and pandas.

The script loads ``data/womens-shoes.csv.xz`` three ways (Dask, pandas with
its default parser, pandas with ``engine="pyarrow"``), sorts each frame by
``prices.amountMin`` ``loops`` times with alternating direction, and prints
the first sorted value of every iteration followed by the elapsed time
reported by ``contexttimer.Timer``.
"""
from contexttimer import Timer
from dask import dataframe as dd
from dask.distributed import Client
import pandas as pd

# Column every benchmark sorts by.
_SORT_COLUMN = "prices.amountMin"


def _time_sorts(sort_once, loops):
    """Call ``sort_once`` ``loops`` times under one timer and print the result.

    Args:
        sort_once: Callable taking a single ``ascending`` bool and returning
            an in-memory frame that can be indexed by ``_SORT_COLUMN``.
        loops: Number of sorts to run. The direction alternates each
            iteration, starting descending because ``bool(0 % 2)`` is False.
    """
    with Timer() as timer:
        for i in range(loops):
            result = sort_once(bool(i % 2))
            # Printing the first value forces the sorted result to be used.
            print(result[_SORT_COLUMN].iloc[0])
    print(timer.elapsed)


if __name__ == '__main__':
    loops: int = 10

    # Use the client as a context manager so it is closed (and the workers it
    # started are released) even if a benchmark raises; the original never
    # closed it. All benchmarks stay inside the block so the pandas runs are
    # measured under the same conditions as before (client still alive).
    with Client(n_workers=loops):
        df: dd.DataFrame = dd.read_csv(
            "data/womens-shoes.csv.xz",
            low_memory=False,
            dtype={
                "asins": "object",
                "prices.offer": "object",
                "upc": "object",
                "weight": "object",
            },
        )
        pf = pd.read_csv("data/womens-shoes.csv.xz")
        # NOTE(review): engine="pyarrow" selects the CSV parser only; if
        # Arrow-backed columns were intended, dtype_backend="pyarrow" would
        # presumably be needed as well - confirm against the pandas docs.
        af = pd.read_csv("data/womens-shoes.csv.xz", engine="pyarrow")

        # Dask Dataframe (recorded as ±4800ms in the original notes).
        # .compute() materialises the sorted result so the work is timed.
        _time_sorts(
            lambda ascending: df.sort_values(
                _SORT_COLUMN, ascending=ascending
            ).compute(),
            loops,
        )

        # Pandas Dataframe (recorded as ±35ms in the original notes).
        _time_sorts(
            lambda ascending: pf.sort_values(_SORT_COLUMN, ascending=ascending),
            loops,
        )

        # Pandas PyArrow Dataframe (recorded as ±29ms in the original notes).
        _time_sorts(
            lambda ascending: af.sort_values(_SORT_COLUMN, ascending=ascending),
            loops,
        )