# Python source file: 35 lines, 1.2 KiB.
from contexttimer import Timer
|
|
from dask import dataframe as dd
|
|
from dask.distributed import Client
|
|
import pandas as pd
|
|
|
|
def _time_alternating_sorts(sort_once, loops: int) -> float:
    """Run ``loops`` sorts with alternating direction and return the elapsed time.

    Args:
        sort_once: Callable that takes a single ``ascending`` bool and returns
            the sorted, fully materialised frame (something indexable by
            column name whose column supports ``.iloc``).
        loops: Number of sorts to perform inside the timed region.

    Returns:
        The value of ``Timer.elapsed`` once the timed region has finished.
    """
    with Timer() as timer:
        for i in range(loops):
            # bool(i % 2): even iterations sort descending, odd ones ascending,
            # so two consecutive runs never request the same ordering.
            ordered = sort_once(bool(i % 2))
            # Print the first value so each sort's result is visibly used.
            print(ordered["prices.amountMin"].iloc[0])
    return timer.elapsed


if __name__ == '__main__':
    # Number of timed sorts per backend.
    loops: int = 10
    # Size of the local Dask cluster; kept separate from ``loops`` so changing
    # the iteration count does not silently resize the cluster.
    n_workers: int = 10

    csv_path = "data/womens-shoes.csv.xz"
    sort_column = "prices.amountMin"

    # Using the Client as a context manager guarantees that it is closed (and
    # the cluster it started is shut down) even if a benchmark raises.
    # Creating the Client also makes it the scheduler used by ``.compute()``.
    with Client(n_workers=n_workers):
        # Explicit object dtypes for the columns listed below; every other
        # column keeps the dtype that dask infers.
        df: dd.DataFrame = dd.read_csv(
            csv_path,
            low_memory=False,
            dtype={
                "asins": "object",
                "prices.offer": "object",
                "upc": "object",
                "weight": "object",
            },
        )
        pf = pd.read_csv(csv_path)
        af = pd.read_csv(csv_path, engine="pyarrow")

        # NOTE(review): the figures in the headings below are the original
        # author's recorded timings; they are quoted in ms while Timer.elapsed
        # is presumably reported in seconds -- confirm before comparing.

        # Dask Dataframe (±4800ms)
        print(
            _time_alternating_sorts(
                lambda ascending: df.sort_values(
                    sort_column, ascending=ascending
                ).compute(),
                loops,
            )
        )

        # Pandas Dataframe (±35ms)
        print(
            _time_alternating_sorts(
                lambda ascending: pf.sort_values(sort_column, ascending=ascending),
                loops,
            )
        )

        # Pandas PyArrow Dataframe (±29ms)
        print(
            _time_alternating_sorts(
                lambda ascending: af.sort_values(sort_column, ascending=ascending),
                loops,
            )
        )
|
|
|
|
|