Files
training.python.datascience/workshop/pandas-shoes/performance-test.ipynb
2025-07-04 19:58:11 +02:00

218 lines
5.4 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance Dask\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes."
],
"id": "b623744e3d523007"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-06-30T14:02:48.704914Z",
"start_time": "2024-06-30T14:02:45.257624Z"
}
},
"source": [
"from contexttimer import Timer\n",
"from dask import dataframe as dd\n",
"\n",
"\n",
"loops: int = 10\n",
"# Four columns are read as `object` instead of letting Dask infer their dtype.\n",
"df: dd.DataFrame = dd.read_csv(\n",
"    \"data/womens-shoes.csv.xz\",\n",
"    low_memory=False,\n",
"    dtype={\"asins\": \"object\", \"prices.offer\": \"object\", \"upc\": \"object\", \"weight\": \"object\"},\n",
")\n",
"# Dask is lazy: without `persist()` every `compute()` below would decompress and\n",
"# parse the CSV again, so the timer would measure file loading rather than sorting.\n",
"df = df.persist()\n",
"\n",
"# Dask Dataframe: time `loops` sorts, alternating descending/ascending order.\n",
"with Timer() as timer:\n",
"    for i in range(loops):\n",
"        df2 = df.sort_values(\"prices.amountMin\", ascending=bool(i % 2)).compute()\n",
"print(f\"Dask Dataframe: {timer.elapsed:.4f} seconds\")\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dask Dataframe: 3.4439 seconds\n"
]
}
],
"execution_count": 8
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance Pandas\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes:\n",
"\n",
"- Avec des types `numpy`\n",
"- Avec des types `pyarrow`"
],
"id": "2d1c8a87edd35ebd"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-30T15:55:33.485090Z",
"start_time": "2024-06-30T15:55:32.684991Z"
}
},
"cell_type": "code",
"source": [
"from contexttimer import Timer\n",
"import pandas as pd\n",
"\n",
"loops: int = 10\n",
"# NumPy-backed frame (pandas default dtypes).\n",
"pf = pd.read_csv(\"data/womens-shoes.csv.xz\")\n",
"# Arrow-backed frame: `engine=\"pyarrow\"` alone only swaps the CSV parser and still\n",
"# yields NumPy dtypes; `dtype_backend=\"pyarrow\"` (pandas >= 2.0) is what makes the\n",
"# columns Arrow-typed, which is the comparison this cell is meant to run.\n",
"af = pd.read_csv(\"data/womens-shoes.csv.xz\", engine=\"pyarrow\", dtype_backend=\"pyarrow\")\n",
"\n",
"# Pandas Dataframe with NumPy dtypes (±35ms)\n",
"with Timer() as timer:\n",
"    for i in range(loops):\n",
"        df2 = pf.sort_values(\"prices.amountMin\", ascending=bool(i % 2))\n",
"print(f\"Pandas Dataframe: {timer.elapsed:.4f} seconds\")\n",
"\n",
"# Pandas Dataframe with PyArrow dtypes (re-run to refresh the recorded timing)\n",
"with Timer() as timer:\n",
"    for i in range(loops):\n",
"        df2 = af.sort_values(\"prices.amountMin\", ascending=bool(i % 2))\n",
"print(f\"Pandas PyArrow Dataframe: {timer.elapsed:.4f} seconds\")\n"
],
"id": "53cce38938154ec2",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pandas Dataframe: 0.0489 seconds\n",
"Pandas PyArrow Dataframe: 0.0401 seconds\n"
]
}
],
"execution_count": 11
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance Polars\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes"
],
"id": "c9fb11f939e169b7"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-30T15:55:16.739324Z",
"start_time": "2024-06-30T15:55:16.185126Z"
}
},
"cell_type": "code",
"source": [
"import polars as pl\n",
"from contexttimer import Timer\n",
"\n",
"\n",
"loops: int = 10\n",
"# A very large `infer_schema_length` lets Polars look at up to 2**30 rows\n",
"# when inferring the column types.\n",
"pf = pl.read_csv(\n",
"    \"data/womens-shoes.csv\",\n",
"    infer_schema_length=2 ** 30,\n",
")\n",
"\n",
"# Polars Dataframe (±9ms): time `loops` sorts, flipping the sort direction on each run.\n",
"with Timer() as timer:\n",
"    for run in range(loops):\n",
"        reverse = run % 2 == 1\n",
"        df2 = pf.sort(\"prices.amountMin\", descending=reverse)\n",
"print(f\"Polars Dataframe: {timer.elapsed:.4f} seconds\")\n"
],
"id": "58d61771c0970c52",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Polars Dataframe: 0.0092 seconds\n"
]
}
],
"execution_count": 9
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance DuckDB\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes"
],
"id": "7a152865db550d98"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-30T15:54:26.130463Z",
"start_time": "2024-06-30T15:54:24.404916Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import duckdb as duck\n",
"from contexttimer import Timer\n",
"\n",
"\n",
"loops: int = 10\n",
"# The SQL below refers to this pandas frame by its Python variable name, `df`.\n",
"df = pd.read_csv(\"data/womens-shoes.csv\")\n",
"\n",
"# DuckDB Dataframe (±1000ms): each run queries the frame and converts the result back to pandas.\n",
"with Timer() as timer:\n",
"    for run in range(loops):\n",
"        direction = (\"ASC\", \"DESC\")[run % 2]\n",
"        sql = f'SELECT * FROM df ORDER BY \"prices.amountMin\" {direction}'\n",
"        df2 = duck.query(sql).to_df()\n",
"print(f\"DuckDB on Pandas Dataframe: {timer.elapsed:.4f} seconds\")\n"
],
"id": "1f92c50608a41220",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DuckDB on Pandas Dataframe: 1.2534 seconds\n"
]
}
],
"execution_count": 2
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}