Initial commit

This commit is contained in:
2025-07-04 19:58:11 +02:00
commit 7d9352968d
101 changed files with 12643 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,10 @@
import pandas as pd
import sqlite3
# Open a SQLite3 connection that pandas can query directly.
connection = sqlite3.connect("sakila-master.db")
try:
    # Read every row of the `actor` table, asking pandas to cast the
    # `last_update` column to a datetime dtype while loading.
    df = pd.read_sql("SELECT * FROM actor", connection, dtype={"last_update": "datetime64[ns]"})
finally:
    # Release the database handle even when the query raises.
    connection.close()
print(df)
print(df.dtypes)

Binary file not shown.

View File

@ -0,0 +1,34 @@
from contexttimer import Timer
from dask import dataframe as dd
from dask.distributed import Client
import pandas as pd
def _time_alternating_sorts(sort_once, loops):
    """Run ``sort_once`` ``loops`` times inside a single timer and report the result.

    Args:
        sort_once: Callable accepting one ``ascending`` bool and returning the
            sorted frame; the returned object must support
            ``frame["prices.amountMin"].iloc[0]``.
        loops: Number of sorts executed inside the timed region.

    Prints the first "prices.amountMin" value of the last sorted frame (when at
    least one sort ran), then the timer's elapsed value. Both prints happen
    after the timed region so console I/O is not part of the measurement.
    """
    last_sorted = None
    with Timer() as timer:
        for i in range(loops):
            # Alternate the direction on every pass: descending for even i,
            # ascending for odd i.
            last_sorted = sort_once(bool(i % 2))
    if last_sorted is not None:
        print(last_sorted["prices.amountMin"].iloc[0])
    print(timer.elapsed)


if __name__ == '__main__':
    loops: int = 10
    csv_path = "data/womens-shoes.csv.xz"

    # Dask Dataframe (±4800ms)
    # The client is used as a context manager so the local cluster it starts
    # is shut down once the Dask measurement is finished.
    with Client(n_workers=loops):
        df: dd.DataFrame = dd.read_csv(
            csv_path,
            low_memory=False,
            dtype={"asins": "object", "prices.offer": "object", "upc": "object", "weight": "object"},
        )
        _time_alternating_sorts(
            lambda ascending: df.sort_values("prices.amountMin", ascending=ascending).compute(),
            loops,
        )

    pf = pd.read_csv(csv_path)
    af = pd.read_csv(csv_path, engine="pyarrow")

    # Pandas Dataframe (±35ms)
    _time_alternating_sorts(
        lambda ascending: pf.sort_values("prices.amountMin", ascending=ascending),
        loops,
    )

    # Pandas PyArrow Dataframe (±29ms)
    _time_alternating_sorts(
        lambda ascending: af.sort_values("prices.amountMin", ascending=ascending),
        loops,
    )

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,217 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance Dask\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes."
],
"id": "b623744e3d523007"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-06-30T14:02:48.704914Z",
"start_time": "2024-06-30T14:02:45.257624Z"
}
},
"source": [
"from contexttimer import Timer\n",
"from dask import dataframe as dd\n",
"\n",
"\n",
"loops: int = 10\n",
"df: dd.DataFrame = dd.read_csv(\"data/womens-shoes.csv.xz\", low_memory=False, dtype={\"asins\": \"object\", \"prices.offer\": \"object\", \"upc\": \"object\", \"weight\": \"object\"})\n",
"\n",
"# Dask Dataframe (±4800ms)\n",
"with Timer() as timer:\n",
" for i in range(loops):\n",
" df2 = df.sort_values(\"prices.amountMin\", ascending=bool(i % 2)).compute()\n",
"print(f\"Dask Dataframe: {timer.elapsed:.4f} seconds\")\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dask Dataframe: 3.4439 seconds\n"
]
}
],
"execution_count": 8
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance Pandas\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes:\n",
"\n",
"- Avec des types `numpy`\n",
"- Avec des types `pyarrow`"
],
"id": "2d1c8a87edd35ebd"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-30T15:55:33.485090Z",
"start_time": "2024-06-30T15:55:32.684991Z"
}
},
"cell_type": "code",
"source": [
"from contexttimer import Timer\n",
"import pandas as pd\n",
"\n",
"loops: int = 10\n",
"pf = pd.read_csv(\"data/womens-shoes.csv.xz\")\n",
"af = pd.read_csv(\"data/womens-shoes.csv.xz\", engine=\"pyarrow\")\n",
"\n",
"# Pandas Dataframe (±35ms)\n",
"with Timer() as timer:\n",
" for i in range(loops):\n",
" df2 = pf.sort_values(\"prices.amountMin\", ascending=bool(i % 2))\n",
"print(f\"Pandas Dataframe: {timer.elapsed:.4f} seconds\")\n",
"\n",
"# Pandas PyArrow Dataframe (±29ms)\n",
"with Timer() as timer:\n",
" for i in range(loops):\n",
" df2 = af.sort_values(\"prices.amountMin\", ascending=bool(i % 2))\n",
"print(f\"Pandas PyArrow Dataframe: {timer.elapsed:.4f} seconds\")\n",
"\n",
"\n"
],
"id": "53cce38938154ec2",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pandas Dataframe: 0.0489 seconds\n",
"Pandas PyArrow Dataframe: 0.0401 seconds\n"
]
}
],
"execution_count": 11
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance Polars\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes"
],
"id": "c9fb11f939e169b7"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-30T15:55:16.739324Z",
"start_time": "2024-06-30T15:55:16.185126Z"
}
},
"cell_type": "code",
"source": [
"import polars as pl\n",
"from contexttimer import Timer\n",
"\n",
"\n",
"loops: int = 10\n",
"pf = pl.read_csv(\"data/womens-shoes.csv\", infer_schema_length=2 ** 30)\n",
"\n",
"# Polars Dataframe (±9ms)\n",
"with Timer() as timer:\n",
" for i in range(loops):\n",
" df2 = pf.sort(\"prices.amountMin\", descending=bool(i % 2))\n",
"print(f\"Polars Dataframe: {timer.elapsed:.4f} seconds\")\n",
"\n"
],
"id": "58d61771c0970c52",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Polars Dataframe: 0.0092 seconds\n"
]
}
],
"execution_count": 9
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Test de performance DuckDB\n",
"\n",
"Test d'un tri sur un `DataFrame` de 10 000 lignes et 34 colonnes"
],
"id": "7a152865db550d98"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-30T15:54:26.130463Z",
"start_time": "2024-06-30T15:54:24.404916Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import duckdb as duck\n",
"from contexttimer import Timer\n",
"\n",
"\n",
"loops: int = 10\n",
"df = pd.read_csv(\"data/womens-shoes.csv\")\n",
"\n",
"# DuckDB Dataframe (±1000ms)\n",
"with Timer() as timer:\n",
" for i in range(loops):\n",
" asc = \"ASC\" if i % 2 == 0 else \"DESC\"\n",
" df2 = duck.query(f\"\"\"SELECT * FROM df ORDER BY \"prices.amountMin\" {asc}\"\"\").to_df()\n",
"print(f\"DuckDB on Pandas Dataframe: {timer.elapsed:.4f} seconds\")\n"
],
"id": "1f92c50608a41220",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DuckDB on Pandas Dataframe: 1.2534 seconds\n"
]
}
],
"execution_count": 2
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,25 @@
import polars as pl
import pandas as pd
import duckdb as duck
from contexttimer import Timer
if __name__ == '__main__':
    loops: int = 10
    pf = pl.read_csv("data/womens-shoes.csv", infer_schema_length=2 ** 30)
    # NOTE: keep this variable named `df` — the SQL text below refers to it by
    # name and DuckDB resolves that identifier to this pandas frame.
    df = pd.read_csv("data/womens-shoes.csv")

    # Polars Dataframe (±9ms)
    polars_sorted = None
    with Timer() as timer:
        for i in range(loops):
            # Alternate the direction on every pass: ascending for even i,
            # descending for odd i.
            polars_sorted = pf.sort("prices.amountMin", descending=bool(i % 2))
    # Results are printed after the timed region so console I/O is not measured.
    if polars_sorted is not None:
        print(polars_sorted["prices.amountMin"][0])
    print(timer.elapsed)

    # DuckDB Dataframe (±1000ms)
    duck_sorted = None
    with Timer() as timer:
        for i in range(loops):
            asc = "ASC" if i % 2 == 0 else "DESC"
            # The measurement deliberately includes the conversion of the
            # query result back into a pandas frame via `.to_df()`.
            duck_sorted = duck.query(f"""SELECT * FROM df ORDER BY "prices.amountMin" {asc}""").to_df()
    if duck_sorted is not None:
        print(duck_sorted["prices.amountMin"].iloc[0])
    print(timer.elapsed)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,302 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Exercice de traitement de données avec Pandas\n",
"\n",
"Tiré du site de Guillaume Dueymes : https://www.guillaumedueymes.com/courses/formation_python/8-pandas-exercice/\n",
"\n",
"Nous allons analyser un data set contenant des informations sur 10 000 paires de chaussures \n",
"vendues sur le site Amazon, avec de nombreuses caractéristiques comme le prix minimal observé, \n",
"le prix maximal observé, les couleurs disponibles, les tailles disponibles, le poids des articles, \n",
"la marque…\n"
],
"metadata": {
"collapsed": false
},
"id": "ca962ad386449748"
},
{
"cell_type": "markdown",
"source": [
"## Découverte du data set\n",
"\n",
"1. À l'aide de la fonction `read_csv()`, importez entièrement le data set et enregistrez-le dans une variable `shoes`.\n",
"2. Utilisez la méthode `.head()` pour afficher les premières lignes du `DataFrame`.\n",
"3. Il y a plus de 4 colonnes, beaucoup ne sont pas visibles. Afin de toutes les voir lors de l'affichage, utilisez la fonction `pandas.set_option()` pour que `.head()` affiche toutes les colonnes du `DataFrame`. (consultez la documentation de `set_option()`)\n",
"4. On va garder uniquement les colonnes intéressantes. Grâce à la syntaxe de filtrage par colonnes, créez une variable `shoes_light`, comprenant uniquement les colonnes suivantes : `id`, `name`, `brand`, `dateUpdated`, `colors`, `prices.amountMax`, `prices.amountMin` et `prices.merchant`. Affichez le `head()` de `shoes_light`.\n"
],
"metadata": {
"collapsed": false
},
"id": "2f0fb688fd2b801e"
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in /home/steve/Code/python/.venv/datascience/lib/python3.11/site-packages (2.1.1)\r\n",
"Requirement already satisfied: numpy>=1.23.2 in /home/steve/Code/python/.venv/datascience/lib/python3.11/site-packages (from pandas) (1.26.0)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /home/steve/Code/python/.venv/datascience/lib/python3.11/site-packages (from pandas) (2.8.2)\r\n",
"Requirement already satisfied: pytz>=2020.1 in /home/steve/Code/python/.venv/datascience/lib/python3.11/site-packages (from pandas) (2023.3.post1)\r\n",
"Requirement already satisfied: tzdata>=2022.1 in /home/steve/Code/python/.venv/datascience/lib/python3.11/site-packages (from pandas) (2023.3)\r\n",
"Requirement already satisfied: six>=1.5 in /home/steve/Code/python/.venv/datascience/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n",
"\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.3.2\u001B[0m\r\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n"
]
}
],
"source": [
"!pip install pandas\n",
"import pandas as pd # noqa"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-12-19T09:59:24.292192547Z",
"start_time": "2023-12-19T09:59:21.699160182Z"
}
},
"id": "fb6b33145b46036b"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 1\n",
"shoes = pd.read_csv(\"womens-shoes.csv.xz\")"
],
"metadata": {
"collapsed": false
},
"id": "1b86b0cd93c36795"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 2"
],
"metadata": {
"collapsed": false
},
"id": "2efcdad9ec4b7583"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 3"
],
"metadata": {
"collapsed": false
},
"id": "2961d6e42459dfa9"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 4"
],
"metadata": {
"collapsed": false
},
"id": "6ee0a6019e90db1b"
},
{
"cell_type": "markdown",
"source": [
"## Data Cleaning\n",
"\n",
"1. À l'aide de l'attribut `.dtypes` du dataframe, regardez attentivement les types de chaque colonne. Certaines ont un type qui n'est pas celui attendu. Lesquelles ?\n",
"2. À l'aide des méthodes `.isnull()` (ou `.isna()`), `.sum()` et `len()`, calculez pour chaque colonne le pourcentage de valeurs non renseignées. Notez quelque part celles qui ont un non remplissage supérieur à 10%. La méthode `sum()` employée sur une série de booléens fait l'addition en considérant que `False == 0` et `True == 1`.\n",
"3. Supprimez du dataframe `shoes_light` les colonnes que vous avez notées dans la question précédente, elles ont trop de valeurs non renseignées.\n",
"4. À l'aide de la fonction `pandas.to_datetime()`, convertissez le type de la colonne `dateUpdated`.\n"
],
"metadata": {
"collapsed": false
},
"id": "865055e156e3a14c"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 1"
],
"metadata": {
"collapsed": false
},
"id": "5856f3288e1b837c"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 2"
],
"metadata": {
"collapsed": false
},
"id": "8b3be0e49e5edd4e"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 3"
],
"metadata": {
"collapsed": false
},
"id": "3d11033148a17412"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 4"
],
"metadata": {
"collapsed": false
},
"id": "6bc78f0eab8a090e"
},
{
"cell_type": "markdown",
"source": [
"## Features Modeling\n",
"\n",
"1. Ajoutez au dataframe une nouvelle colonne `prices.amountAverage` calculant la moyenne des colonnes `prices.amountMax` et `prices.amountMin` (via une addition et une division par 2).\n",
"2. Grâce à l'attribut `Series.dt.weekday`, ajoutez au dataframe une nouvelle colonne `dayOfweekUpdated`, extrayant depuis la colonne `dateUpdated` le jour de la semaine où les produits sont mis à jour (un nombre entre 0 et 6).\n"
],
"metadata": {
"collapsed": false
},
"id": "97a23a2b56e659b3"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 1"
],
"metadata": {
"collapsed": false
},
"id": "c1c2d9b73cf09b13"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 2"
],
"metadata": {
"collapsed": false
},
"id": "f79a989bd03e4f61"
},
{
"cell_type": "markdown",
"source": [
"## Data Analyse\n",
"\n",
"1. Affichez le prix moyen, écart type, etc. des chaussures avec la méthode `.describe()`.\n",
"2. Y a-t-il de grandes différences de prix en fonction de la marque ? À l'aide des méthodes `groupby()`, `mean()` et `sort_values()`, créez une variable `luxury` contenant les 10 marques les plus chères, puis une variable `low_cost` contenant les 10 marques les moins chères.\n",
"3. Grâce à la méthode `value_counts()`, déterminez le jour de la semaine où les produits sont le plus souvent mis à jour.\n",
"4. **(Optionnel)** Donnez le prix moyen des produits de la marque `easy street` mis à jour un jeudi (jour 3).\n"
],
"metadata": {
"collapsed": false
},
"id": "1980ec43bdbabe8"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 1"
],
"metadata": {
"collapsed": false
},
"id": "73df484243c117c5"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 2"
],
"metadata": {
"collapsed": false
},
"id": "7f1abc9d9f255936"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 3"
],
"metadata": {
"collapsed": false
},
"id": "bae81f6190201b34"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Question 4"
],
"metadata": {
"collapsed": false
},
"id": "f46f052425a9711f"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}