{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Test de performance Dask\n",
    "\n",
    "Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes."
   ],
   "id": "b623744e3d523007"
  },
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-06-30T14:02:48.704914Z",
     "start_time": "2024-06-30T14:02:45.257624Z"
    }
   },
   "source": [
    "from contexttimer import Timer\n",
    "from dask import dataframe as dd\n",
    "\n",
    "\n",
    "loops: int = 10\n",
    "df: dd.DataFrame = dd.read_csv(\"data/womens-shoes.csv.xz\", low_memory=False, dtype={\"asins\": \"object\", \"prices.offer\": \"object\", \"upc\": \"object\", \"weight\": \"object\"})\n",
    "\n",
    "# Dask Dataframe (±4800ms)\n",
    "# Times `loops` sorts in a row; the sort direction alternates on every iteration\n",
    "# and .compute() is called inside the timed block for each sort.\n",
    "with Timer() as timer:\n",
    "    for i in range(loops):\n",
    "        df2 = df.sort_values(\"prices.amountMin\", ascending=bool(i % 2)).compute()\n",
    "print(f\"Dask Dataframe: {timer.elapsed:.4f} seconds\")\n"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dask Dataframe: 3.4439 seconds\n"
     ]
    }
   ],
   "execution_count": 8
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Test de performance Pandas\n",
    "\n",
    "Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes :\n",
    "\n",
    "- Avec des types `numpy`\n",
    "- Avec des types `pyarrow`"
   ],
   "id": "2d1c8a87edd35ebd"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-30T15:55:33.485090Z",
     "start_time": "2024-06-30T15:55:32.684991Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from contexttimer import Timer\n",
    "import pandas as pd\n",
    "\n",
    "loops: int = 10\n",
    "pf = pd.read_csv(\"data/womens-shoes.csv.xz\")\n",
    "af = pd.read_csv(\"data/womens-shoes.csv.xz\", engine=\"pyarrow\")\n",
    "\n",
    "# Pandas Dataframe (±35ms)\n",
    "# Times `loops` sorts in a row; the sort direction alternates on every iteration.\n",
    "with Timer() as timer:\n",
    "    for i in range(loops):\n",
    "        df2 = pf.sort_values(\"prices.amountMin\", ascending=bool(i % 2))\n",
    "print(f\"Pandas Dataframe: {timer.elapsed:.4f} seconds\")\n",
    "\n",
    "# Pandas PyArrow Dataframe (±29ms)\n",
    "# Same loop on the frame that was read with engine=\"pyarrow\".\n",
    "with Timer() as timer:\n",
    "    for i in range(loops):\n",
    "        df2 = af.sort_values(\"prices.amountMin\", ascending=bool(i % 2))\n",
    "print(f\"Pandas PyArrow Dataframe: {timer.elapsed:.4f} seconds\")\n",
    "\n",
    "\n"
   ],
   "id": "53cce38938154ec2",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pandas Dataframe: 0.0489 seconds\n",
      "Pandas PyArrow Dataframe: 0.0401 seconds\n"
     ]
    }
   ],
   "execution_count": 11
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Test de performance Polars\n",
    "\n",
    "Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes."
   ],
   "id": "c9fb11f939e169b7"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-30T15:55:16.739324Z",
     "start_time": "2024-06-30T15:55:16.185126Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import polars as pl\n",
    "from contexttimer import Timer\n",
    "\n",
    "\n",
    "loops: int = 10\n",
    "pf = pl.read_csv(\"data/womens-shoes.csv\", infer_schema_length=2 ** 30)\n",
    "\n",
    "# Polars Dataframe (±9ms)\n",
    "# Times `loops` sorts in a row; the sort direction alternates on every iteration.\n",
    "with Timer() as timer:\n",
    "    for i in range(loops):\n",
    "        df2 = pf.sort(\"prices.amountMin\", descending=bool(i % 2))\n",
    "print(f\"Polars Dataframe: {timer.elapsed:.4f} seconds\")\n",
    "\n"
   ],
   "id": "58d61771c0970c52",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Polars Dataframe: 0.0092 seconds\n"
     ]
    }
   ],
   "execution_count": 9
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Test de performance DuckDB\n",
    "\n",
    "Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes."
   ],
   "id": "7a152865db550d98"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-30T15:54:26.130463Z",
     "start_time": "2024-06-30T15:54:24.404916Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "import duckdb as duck\n",
    "from contexttimer import Timer\n",
    "\n",
    "\n",
    "loops: int = 10\n",
    "df = pd.read_csv(\"data/womens-shoes.csv\")\n",
    "\n",
    "# DuckDB Dataframe (±1000ms)\n",
    "# The SQL text refers to `df` by name; DuckDB looks up the pandas DataFrame of\n",
    "# that name in the Python scope. The timed block also includes .to_df().\n",
    "with Timer() as timer:\n",
    "    for i in range(loops):\n",
    "        asc = \"ASC\" if i % 2 == 0 else \"DESC\"\n",
    "        df2 = duck.query(f\"\"\"SELECT * FROM df ORDER BY \"prices.amountMin\" {asc}\"\"\").to_df()\n",
    "print(f\"DuckDB on Pandas Dataframe: {timer.elapsed:.4f} seconds\")\n"
   ],
   "id": "1f92c50608a41220",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DuckDB on Pandas Dataframe: 1.2534 seconds\n"
     ]
    }
   ],
   "execution_count": 2
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}