From 2e48027fef45c17a01b7e40b2b0cc1a8f6962eca Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Mon, 4 Nov 2024 18:16:27 +0100 Subject: [PATCH 01/13] =?UTF-8?q?Suppression=20des=20lignes=20dupliqu?= =?UTF-8?q?=C3=A9es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- macantine/etl/analysis.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/macantine/etl/analysis.py b/macantine/etl/analysis.py index 17ffc6076..3b507b8c0 100644 --- a/macantine/etl/analysis.py +++ b/macantine/etl/analysis.py @@ -192,6 +192,11 @@ def load_dataset(self): """ self.warehouse.insert_dataframe(self.df, self.extracted_table_name) + def _clean_dataset(self): + self.df = self.df.loc[:, ~self.df.columns.duplicated()] + self.df = utils.filter_dataframe_with_schema_cols(self.df, self.schema) + self.df = self.df.drop_duplicates(subset=["id"]) + class ETL_ANALYSIS_TD(ETL_ANALYSIS): """ @@ -338,7 +343,9 @@ def transform_dataset(self): # Extract the sector names and categories logger.info("Canteens : Extract sectors and SPE...") self.df = utils.extract_sectors(self.df, extract_spe=True, split_category_and_sector=True, only_one_value=True) - self.df = self.df.rename(columns={"categories": "categorie"}) + self.df = self.df.rename(columns={"categories": "categorie"}) self.df = self.df.rename(columns=self.columns_mapper) - self.df = utils.filter_dataframe_with_schema_cols(self.df, self.schema) + + logger.info("Canteens : Clean dataset") + self._clean_dataset() From ff88a37b2c657424a147c4af9a10a204cdd53526 Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Mon, 4 Nov 2024 18:16:55 +0100 Subject: [PATCH 02/13] Renommage des tables --- macantine/etl/analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macantine/etl/analysis.py b/macantine/etl/analysis.py index 3b507b8c0..a5f541f87 100644 --- a/macantine/etl/analysis.py +++ b/macantine/etl/analysis.py @@ -209,7 +209,7 @@ class 
ETL_ANALYSIS_TD(ETL_ANALYSIS): def __init__(self): self.df = None self.years = utils.CAMPAIGN_DATES.keys() - self.extracted_table_name = "teledeclarations_extracted" + self.extracted_table_name = "teledeclarations" self.warehouse = DataWareHouse() self.schema = json.load(open("data/schemas/schema_analysis.json")) @@ -307,7 +307,7 @@ class ETL_ANALYSIS_CANTEEN(ETL_ANALYSIS): def __init__(self): self.df = None - self.extracted_table_name = "canteens_extracted" + self.extracted_table_name = "canteens" self.warehouse = DataWareHouse() self.schema = json.load(open("data/schemas/schema_analysis_cantines.json")) # The following mapper is used for renaming columns and for selecting the columns to extract from db From d6a38747e49cfe3e225d5354b041100344710c14 Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Tue, 12 Nov 2024 11:29:25 +0100 Subject: [PATCH 03/13] =?UTF-8?q?Ajout=20de=20filtres=20suppl=C3=A9menaire?= =?UTF-8?q?s=20pour=20=C3=AAtre=20en=20adequation=20avec=20les=20TD=20du?= =?UTF-8?q?=20rapport?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- macantine/etl/analysis.py | 8 ++++++-- macantine/etl/utils.py | 40 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/macantine/etl/analysis.py b/macantine/etl/analysis.py index a5f541f87..de39b1fa5 100644 --- a/macantine/etl/analysis.py +++ b/macantine/etl/analysis.py @@ -190,6 +190,7 @@ def load_dataset(self): """ Load in database """ + logger.info(f"Loading {len(self.df)} objects in db") self.warehouse.insert_dataframe(self.df, self.extracted_table_name) def _clean_dataset(self): @@ -230,6 +231,9 @@ def transform_dataset(self): # Aggregate columns for complete TD - Must occur before other transformations self.df = aggregate(self.df) + # Add additionnal filters (that couldn't be processed at queryset) + self.df = utils.filter_teledeclarations(self.df) + self.compute_miscellaneous_columns() # Convert types @@ -243,14 
+247,14 @@ def transform_dataset(self): self.fill_geo_names(prefix="canteen.") # Fill campaign participation - logger.info("Canteens : Fill campaign participations...") + logger.info("TD : Fill campaign participations...") for year in utils.CAMPAIGN_DATES.keys(): campaign_participation = utils.map_canteens_td(year) col_name_campaign = f"declaration_{year}" self.df[col_name_campaign] = self.df["id"].apply(lambda x: x in campaign_participation) # Extract the sector names and categories - logger.info("Canteens : Extract sectors...") + logger.info("TD : Extract sectors...") self.df[["secteur", "catégorie"]] = self.df.apply( lambda x: utils.format_td_sector_column(x, "canteen.sectors"), axis=1, result_type="expand" ) diff --git a/macantine/etl/utils.py b/macantine/etl/utils.py index a3cbefc78..e648ccc92 100644 --- a/macantine/etl/utils.py +++ b/macantine/etl/utils.py @@ -225,6 +225,33 @@ def map_sectors(): return sectors_mapper +def filter_empty_values(df: pd.DataFrame, col_name) -> pd.DataFrame: + """ + Filtering out the teledeclarations for wich a certain field is empty + """ + return df.dropna(subset=col_name) + + +def filter_aberrant_td(df: pd.DataFrame) -> pd.DataFrame: + """ + Filtering out the teledeclarations that : + * products > 1 million € + AND + * an avg meal cost > 20 € + """ + mask = (df["teledeclaration.value_total_ht"] > 1000000) & ( + df["teledeclaration.value_total_ht"] / df["canteen.yearly_meal_count"] > 20 + ) + return df[~mask] + + +def filter_teledeclarations(df: pd.DataFrame): + df = filter_empty_values(df, col_name="teledeclaration.value_total_ht") + df = filter_empty_values(df, col_name="teledeclaration.value_bio_ht") + df = filter_aberrant_td(df) + return df + + def fetch_teledeclarations(years: list) -> pd.DataFrame: df = pd.DataFrame() for year in years: @@ -238,7 +265,18 @@ def fetch_teledeclarations(years: list) -> pd.DataFrame: ), status=Teledeclaration.TeledeclarationStatus.SUBMITTED, canteen_id__isnull=False, - ).values() + 
canteen__siret__isnull=False, + canteen__siret__length_gt=14, + diagnostic__value_total_ht__isnull=False, + diagnostic__value_bio_ht__isnull=False, + ) + .exclude( + canteen__deletion_date__range=( + CAMPAIGN_DATES[year]["start_date"], + CAMPAIGN_DATES[year]["end_date"], + ), + ) + .values() ) df = pd.concat([df, df_year]) else: From 992b74cc4f00e33d546e64ea01cb76ef14e140f2 Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Tue, 12 Nov 2024 17:18:07 +0100 Subject: [PATCH 04/13] Remplacer le filtre sur les siret vides --- macantine/etl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macantine/etl/utils.py b/macantine/etl/utils.py index e648ccc92..5049d9a28 100644 --- a/macantine/etl/utils.py +++ b/macantine/etl/utils.py @@ -266,7 +266,6 @@ def fetch_teledeclarations(years: list) -> pd.DataFrame: status=Teledeclaration.TeledeclarationStatus.SUBMITTED, canteen_id__isnull=False, canteen__siret__isnull=False, - canteen__siret__length_gt=14, diagnostic__value_total_ht__isnull=False, diagnostic__value_bio_ht__isnull=False, ) @@ -275,6 +274,7 @@ def fetch_teledeclarations(years: list) -> pd.DataFrame: CAMPAIGN_DATES[year]["start_date"], CAMPAIGN_DATES[year]["end_date"], ), + canteen__siret="", ) .values() ) From 0bba66802b12b92dfd5a714b39d30897d80a39bf Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Tue, 12 Nov 2024 17:18:49 +0100 Subject: [PATCH 05/13] =?UTF-8?q?Mise=20=C3=A0=20jour=20des=20filtres=20en?= =?UTF-8?q?=20ajoutant=20un=20SIRET=20aux=20cantines=20afin=20que=20les=20?= =?UTF-8?q?TD=20soient=20prises=20en=20compte?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- macantine/tests/test_etl_analysis.py | 2 +- macantine/tests/test_etl_open_data.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/macantine/tests/test_etl_analysis.py b/macantine/tests/test_etl_analysis.py index a55b046be..8b14ccf07 100644 --- a/macantine/tests/test_etl_analysis.py +++ 
b/macantine/tests/test_etl_analysis.py @@ -54,7 +54,7 @@ def test_extraction_teledeclaration(self): """ Only teledeclarations that occurred during teledeclaration campaigns should be extracted """ - canteen = CanteenFactory.create() + canteen = CanteenFactory.create(siret="98648424243607") applicant = UserFactory.create() with freeze_time("1991-01-14"): # Faking time to mock creation_date diagnostic_1990 = DiagnosticFactory.create(canteen=canteen, year=1990, diagnostic_type=None) diff --git a/macantine/tests/test_etl_open_data.py b/macantine/tests/test_etl_open_data.py index 675c85552..68d345fa2 100644 --- a/macantine/tests/test_etl_open_data.py +++ b/macantine/tests/test_etl_open_data.py @@ -21,7 +21,7 @@ def test_td_range_years(self, mock): """ Only teledeclarations that occurred during one specific teledeclaration campaign should be extracted """ - canteen = CanteenFactory.create() + canteen = CanteenFactory.create(siret="98648424243607") applicant = UserFactory.create() test_cases = [ {"name": "Ignore years out of range", "year": 1990, "expected_outcome": 0}, @@ -37,7 +37,7 @@ def test_td_range_years(self, mock): @freeze_time("2023-05-14") # Faking time to mock creation_date def test_ignore_cancelled_tds(self, mock): - canteen = CanteenFactory.create() + canteen = CanteenFactory.create(siret="98648424243607") applicant = UserFactory.create() diagnostic = DiagnosticFactory.create(canteen=canteen, year=2022, diagnostic_type=None) teledeclaration = Teledeclaration.create_from_diagnostic(diagnostic, applicant) @@ -119,10 +119,10 @@ def test_extraction_canteen(self, mock): self.assertEqual(etl_canteen.len_dataset(), 0, "There shoud be an empty dataframe") # Adding data in the db - canteen_1 = CanteenFactory.create() + canteen_1 = CanteenFactory.create(siret="98648424243607") canteen_1.managers.add(UserFactory.create()) - canteen_2 = CanteenFactory.create() # Another canteen, but without a manager + canteen_2 = CanteenFactory.create(siret="98648424243607") # Another 
canteen, but without a manager canteen_2.managers.clear() etl_canteen.extract_dataset() From 8ad9c0bb8b249a941746b7bda2e8553234bd7182 Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Tue, 12 Nov 2024 17:39:05 +0100 Subject: [PATCH 06/13] Utiliser le champs canteen_siret directement pour le filtre au lieu de canteen__siret --- macantine/etl/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/macantine/etl/utils.py b/macantine/etl/utils.py index 5049d9a28..fc489a312 100644 --- a/macantine/etl/utils.py +++ b/macantine/etl/utils.py @@ -265,7 +265,7 @@ def fetch_teledeclarations(years: list) -> pd.DataFrame: ), status=Teledeclaration.TeledeclarationStatus.SUBMITTED, canteen_id__isnull=False, - canteen__siret__isnull=False, + canteen_siret__isnull=False, diagnostic__value_total_ht__isnull=False, diagnostic__value_bio_ht__isnull=False, ) @@ -274,7 +274,6 @@ def fetch_teledeclarations(years: list) -> pd.DataFrame: CAMPAIGN_DATES[year]["start_date"], CAMPAIGN_DATES[year]["end_date"], ), - canteen__siret="", ) .values() ) From a4365788803b00f05691cecb2f6265cd11c84e36 Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Wed, 13 Nov 2024 15:28:18 +0100 Subject: [PATCH 07/13] Ajout test sur filtre date --- macantine/tests/test_etl_analysis.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/macantine/tests/test_etl_analysis.py b/macantine/tests/test_etl_analysis.py index 8b14ccf07..0a51d33e0 100644 --- a/macantine/tests/test_etl_analysis.py +++ b/macantine/tests/test_etl_analysis.py @@ -68,6 +68,15 @@ def test_extraction_teledeclaration(self): diagnostic_2023 = DiagnosticFactory.create(canteen=canteen, year=2023, diagnostic_type=None) td_2023 = Teledeclaration.create_from_diagnostic(diagnostic_2023, applicant) + # This TD should not be extracted as its canteen has been deleted during the campaign + with freeze_time("2024-02-14"): # Faking time to mock creation_date + canteen_deleted_during_campaign = 
CanteenFactory.create(siret="28838740672960") + diagnostic_out_of_range = DiagnosticFactory.create( + canteen=canteen_deleted_during_campaign, year=2023, diagnostic_type=None + ) + Teledeclaration.create_from_diagnostic(diagnostic_out_of_range, applicant) + canteen_deleted_during_campaign.delete() + etl_stats = ETL_ANALYSIS_TD() etl_stats.extract_dataset() From bc7df65210d491fb55d46446b24d69aef080ee5c Mon Sep 17 00:00:00 2001 From: Quentin Loridant Date: Fri, 15 Nov 2024 09:54:06 +0100 Subject: [PATCH 08/13] Ajout commande export dataset dans le launch --- .vscode/launch.json | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 733aa741f..f88a7a7ce 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -25,7 +25,10 @@ "program": "${workspaceFolder}/manage.py", "args": [ "test", - "--noinput" + "--noinput", + "--failfast", + "--keepdb", + "macantine.tests.test_etl_analysis" ], "justMyCode": false, "django": true @@ -66,5 +69,18 @@ "justMyCode": false, "django": true }, + { + "name": "Export Dataset", + "type": "debugpy", + "request": "launch", + "stopOnEntry": false, + "python": "${workspaceFolder}/venv/bin/python", + "program": "${workspaceFolder}/manage.py", + "args": [ + "export_dataset", + ], + "justMyCode": false, + "django": true + }, ] } From cd13936c3074c1f331699a8231b351fa73065960 Mon Sep 17 00:00:00 2001 From: qloridant Date: Fri, 15 Nov 2024 18:11:52 +0100 Subject: [PATCH 09/13] Refacto test en utilisant une liste de test cases --- macantine/etl/analysis.py | 13 +++++- macantine/tests/test_etl_analysis.py | 64 +++++++++++++++------------- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/macantine/etl/analysis.py b/macantine/etl/analysis.py index de39b1fa5..a6d348ba1 100644 --- a/macantine/etl/analysis.py +++ b/macantine/etl/analysis.py @@ -213,13 +213,24 @@ def __init__(self): self.extracted_table_name = "teledeclarations" self.warehouse = DataWareHouse() 
self.schema = json.load(open("data/schemas/schema_analysis.json")) + self.columns = [field["name"] for field in self.schema["fields"]] def extract_dataset(self): # Load teledeclarations from prod database into the Data Warehouse self.df = utils.fetch_teledeclarations(self.years) - self.df.index = self.df.id + + if self.df.empty: + logger.warning("Dataset is empty. Creating an empty dataframe with columns from the schema") + self.df = pd.DataFrame(columns=self.columns) def transform_dataset(self): + if self.df.empty: + logger.warning("Dataset is empty. Skipping transformation") + return + + # Use id as index + self.df.index = self.df.id + # Flatten json 'declared_data' column df_json = pd.json_normalize(self.df["declared_data"]) del df_json["year"] diff --git a/macantine/tests/test_etl_analysis.py b/macantine/tests/test_etl_analysis.py index 0a51d33e0..d96dedca2 100644 --- a/macantine/tests/test_etl_analysis.py +++ b/macantine/tests/test_etl_analysis.py @@ -56,37 +56,43 @@ def test_extraction_teledeclaration(self): """ canteen = CanteenFactory.create(siret="98648424243607") applicant = UserFactory.create() - with freeze_time("1991-01-14"): # Faking time to mock creation_date - diagnostic_1990 = DiagnosticFactory.create(canteen=canteen, year=1990, diagnostic_type=None) - _ = Teledeclaration.create_from_diagnostic(diagnostic_1990, applicant) - - with freeze_time("2023-05-14"): # Faking time to mock creation_date - diagnostic_2022 = DiagnosticFactory.create(canteen=canteen, year=2022, diagnostic_type=None) - td_2022 = Teledeclaration.create_from_diagnostic(diagnostic_2022, applicant) - - with freeze_time("2024-02-14"): # Faking time to mock creation_date - diagnostic_2023 = DiagnosticFactory.create(canteen=canteen, year=2023, diagnostic_type=None) - td_2023 = Teledeclaration.create_from_diagnostic(diagnostic_2023, applicant) - - # This TD should not be extracted as its canteen has been deleted during the campaign - with freeze_time("2024-02-14"): # Faking time to mock 
creation_date - canteen_deleted_during_campaign = CanteenFactory.create(siret="28838740672960") - diagnostic_out_of_range = DiagnosticFactory.create( - canteen=canteen_deleted_during_campaign, year=2023, diagnostic_type=None - ) - Teledeclaration.create_from_diagnostic(diagnostic_out_of_range, applicant) - canteen_deleted_during_campaign.delete() - etl_stats = ETL_ANALYSIS_TD() - etl_stats.extract_dataset() - self.assertEqual( - len(etl_stats.df), - 2, - "There should be two teledeclaration. None for 1990 (no campaign). One for 2022 and one for 2023", - ) - self.assertEqual(etl_stats.df[etl_stats.df.id == td_2022.id].year.iloc[0], 2022) - self.assertEqual(etl_stats.df[etl_stats.df.id == td_2023.id].year.iloc[0], 2023) + test_cases = [ + { + "date_mocked": "1991-01-14", + "year": 1990, + "canteen": canteen, + "delete_canteen": False, + "expected_outcome": "no_extraction", + }, + { + "date_mocked": "2023-05-14", + "year": 2022, + "canteen": canteen, + "delete_canteen": False, + "expected_outcome": "extraction", + }, + { + "date_mocked": "2024-02-14", + "year": 2023, + "canteen": canteen, + "delete_canteen": True, + "expected_outcome": "no_extraction", + }, + ] + for tc in test_cases: + with freeze_time(tc["date_mocked"]): # Faking time to mock creation_date + diag = DiagnosticFactory.create(canteen=tc["canteen"], year=tc["year"], diagnostic_type=None) + td = Teledeclaration.create_from_diagnostic(diag, applicant) + if tc["delete_canteen"]: + tc["canteen"].delete() + + etl_stats.extract_dataset() + if tc["expected_outcome"] == "extraction": + self.assertEqual(len(etl_stats.df[etl_stats.df.id == td.id]), 1) + else: + self.assertEqual(len(etl_stats.df[etl_stats.df.id == td.id]), 0) def test_get_egalim_hors_bio(self): data = { From fc5e3255603c68d4a908205b6d8df18078329dbc Mon Sep 17 00:00:00 2001 From: qloridant Date: Fri, 15 Nov 2024 18:17:20 +0100 Subject: [PATCH 10/13] Ajout test pour les cantines sans SIRET --- macantine/tests/test_etl_analysis.py | 12 ++++++++++++ 1 
file changed, 12 insertions(+) diff --git a/macantine/tests/test_etl_analysis.py b/macantine/tests/test_etl_analysis.py index d96dedca2..5a2ab15da 100644 --- a/macantine/tests/test_etl_analysis.py +++ b/macantine/tests/test_etl_analysis.py @@ -55,6 +55,7 @@ def test_extraction_teledeclaration(self): Only teledeclarations that occurred during teledeclaration campaigns should be extracted """ canteen = CanteenFactory.create(siret="98648424243607") + canteen_no_siret = CanteenFactory.create() applicant = UserFactory.create() etl_stats = ETL_ANALYSIS_TD() @@ -65,6 +66,7 @@ def test_extraction_teledeclaration(self): "canteen": canteen, "delete_canteen": False, "expected_outcome": "no_extraction", + "msg": "Outside any campaign date", }, { "date_mocked": "2023-05-14", @@ -72,6 +74,7 @@ def test_extraction_teledeclaration(self): "canteen": canteen, "delete_canteen": False, "expected_outcome": "extraction", + "msg": "Valid", }, { "date_mocked": "2024-02-14", @@ -79,6 +82,15 @@ def test_extraction_teledeclaration(self): "canteen": canteen, "delete_canteen": True, "expected_outcome": "no_extraction", + "msg": "Canteen deleted during campaign", + }, + { + "date_mocked": "2024-02-14", + "year": 2023, + "canteen": canteen_no_siret, + "delete_canteen": False, + "expected_outcome": "no_extraction", + "msg": "Canteen without a siret", }, ] for tc in test_cases: From b32edb706a113d93b0c1ef062f4a7e988c9d859b Mon Sep 17 00:00:00 2001 From: qloridant Date: Tue, 19 Nov 2024 14:52:47 +0100 Subject: [PATCH 11/13] Sortir le filtre sur le siret dans un nouveau exclude (or au lieu de and) --- macantine/etl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/macantine/etl/utils.py b/macantine/etl/utils.py index fc489a312..0ea562be1 100644 --- a/macantine/etl/utils.py +++ b/macantine/etl/utils.py @@ -273,8 +273,9 @@ def fetch_teledeclarations(years: list) -> pd.DataFrame: canteen__deletion_date__range=( CAMPAIGN_DATES[year]["start_date"], 
CAMPAIGN_DATES[year]["end_date"], - ), + ) ) + .exclude(canteen_siret="") .values() ) df = pd.concat([df, df_year]) From 301c4f0cffe8f48b0bc921dd2cd61cde6db7a06c Mon Sep 17 00:00:00 2001 From: qloridant Date: Tue, 19 Nov 2024 14:55:45 +0100 Subject: [PATCH 12/13] Nettoyage launch.json --- .vscode/launch.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index f88a7a7ce..c4461f04a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -26,9 +26,6 @@ "args": [ "test", "--noinput", - "--failfast", - "--keepdb", - "macantine.tests.test_etl_analysis" ], "justMyCode": false, "django": true From 2fd35a153e2263df9cdb1e3f496c58567c6760de Mon Sep 17 00:00:00 2001 From: qloridant Date: Tue, 19 Nov 2024 14:57:40 +0100 Subject: [PATCH 13/13] Nettoyage launch.json --- .vscode/launch.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index c4461f04a..0b265f675 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -25,7 +25,7 @@ "program": "${workspaceFolder}/manage.py", "args": [ "test", - "--noinput", + "--noinput" ], "justMyCode": false, "django": true