# Test de performance Dask

Test d'un tri sur un `DateFrame` de 10 000 lignes et 34 colonnes.

In [8]:
from contexttimer import Timer
from dask import dataframe as dd


loops: int = 10
df: dd.DataFrame = dd.read_csv("data/womens-shoes.csv.xz", low_memory=False, dtype={"asins": "object", "prices.offer": "object", "upc": "object", "weight": "object"})

# Dask Dataframe (±4800ms)
with Timer() as timer:
 for i in range(loops):
 df2 = df.sort_values("prices.amountMin", ascending=bool(i % 2)).compute()
print(f"Dask Dataframe: {timer.elapsed:.4f} seconds")


Dask Dataframe: 3.4439 seconds


# Test de performance Pandas

Test d'un tri sur un `DateFrame` de 10 000 lignes et 34 colonnes :

- Avec des types `numpy`
- Avec des types `pyarrow`

In [11]:
from contexttimer import Timer
import pandas as pd

loops: int = 10
pf = pd.read_csv("data/womens-shoes.csv.xz")
af = pd.read_csv("data/womens-shoes.csv.xz", engine="pyarrow")

# Pandas Dataframe (±35ms)
with Timer() as timer:
 for i in range(loops):
 df2 = pf.sort_values("prices.amountMin", ascending=bool(i % 2))
print(f"Pandas Dataframe: {timer.elapsed:.4f} seconds")

# Pandas PyArrow Dataframe (±29ms)
with Timer() as timer:
 for i in range(loops):
 df2 = af.sort_values("prices.amountMin", ascending=bool(i % 2))
print(f"Pandas PyArrow Dataframe: {timer.elapsed:.4f} seconds")




Pandas Dataframe: 0.0489 seconds
Pandas PyArrow Dataframe: 0.0401 seconds


# Test de performance Polars

Test d'un tri sur un `DateFrame` de 10 000 lignes et 34 colonnes 

In [9]:
import polars as pl
from contexttimer import Timer


loops: int = 10
pf = pl.read_csv("data/womens-shoes.csv", infer_schema_length=2 ** 30)

# Polars Dataframe (±9ms)
with Timer() as timer:
 for i in range(loops):
 df2 = pf.sort("prices.amountMin", descending=bool(i % 2))
print(f"Polars Dataframe: {timer.elapsed:.4f} seconds")



Polars Dataframe: 0.0092 seconds


# Test de performance DuckDB

Test d'un tri sur un `DateFrame` de 10 000 lignes et 34 colonnes 

In [2]:
import pandas as pd
import duckdb as duck
from contexttimer import Timer


loops: int = 10
df = pd.read_csv("data/womens-shoes.csv")

# DuckDB Dataframe (±1000ms)
with Timer() as timer:
 for i in range(loops):
 asc = "ASC" if i % 2 == 0 else "DESC"
 df2 = duck.query(f"""SELECT * FROM df ORDER BY "prices.amountMin" {asc}""").to_df()
print(f"DuckDB on Pandas Dataframe: {timer.elapsed:.4f} seconds")


DuckDB on Pandas Dataframe: 1.2534 seconds
