From 1351e7d992f5b462ca9df752c167dfe081c48c90 Mon Sep 17 00:00:00 2001 From: Linlang Date: Tue, 7 Jan 2025 18:05:26 +0800 Subject: [PATCH] format with black --- scripts/check_data_health.py | 66 +++++++++++++++--------------------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/scripts/check_data_health.py b/scripts/check_data_health.py index bfde576403..a8a557572e 100644 --- a/scripts/check_data_health.py +++ b/scripts/check_data_health.py @@ -34,9 +34,7 @@ def __init__( missing_data_num=0, ): assert csv_path or qlib_dir, "One of csv_path or qlib_dir should be provided." - assert not ( - csv_path and qlib_dir - ), "Only one of csv_path or qlib_dir should be provided." + assert not (csv_path and qlib_dir), "Only one of csv_path or qlib_dir should be provided." self.data = {} self.problems = {} @@ -63,12 +61,12 @@ def load_qlib_data(self): df = D.features([instrument], required_fields, freq="day") df.rename( columns={ - '$open': 'open', - '$close': 'close', - '$low': 'low', - '$high': 'high', - '$volume': 'volume', - '$factor': 'factor', + "$open": "open", + "$close": "close", + "$low": "low", + "$high": "high", + "$volume": "volume", + "$factor": "factor", }, inplace=True, ) @@ -85,9 +83,7 @@ def check_missing_data(self) -> Optional[Tuple[DataProblem, List[str]]]: "volume": [], } for filename, df in self.data.items(): - missing_data_columns = ( - df.isnull().sum()[df.isnull().sum() > self.missing_data_num].index.tolist() - ) + missing_data_columns = df.isnull().sum()[df.isnull().sum() > self.missing_data_num].index.tolist() if len(missing_data_columns) > 0: result_dict["instruments"].append(filename) result_dict["open"].append(df.isnull().sum()["open"]) @@ -115,11 +111,7 @@ def check_large_step_changes(self) -> Optional[Tuple[DataProblem, List[str]]]: for col in ["open", "high", "low", "close", "volume"]: if col in df.columns: pct_change = df[col].pct_change(fill_method=None).abs() - threshold = ( - self.large_step_threshold_volume - if col == "volume" - else self.large_step_threshold_price - ) + threshold = self.large_step_threshold_volume if col == "volume" else self.large_step_threshold_price if pct_change.max() > threshold: large_steps = pct_change[pct_change > threshold] result_dict["instruments"].append(filename) @@ -143,9 +135,7 @@ def check_required_columns(self) -> Optional[Tuple[DataProblem, List[str]]]: } for filename, df in self.data.items(): if not all(column in df.columns for column in required_columns): - missing_required_columns = [ - column for column in required_columns if column not in df.columns - ] + missing_required_columns = [column for column in required_columns if column not in df.columns] result_dict["instruments"].append(filename) result_dict["missing_col"] += missing_required_columns @@ -181,24 +171,24 @@ def check_missing_factor(self) -> Optional[Tuple[DataProblem, List[str]]]: logger.info(f"✅ The `factor` column already exists and is not empty.") def check_data(self): - check_missing_data_result = self.check_missing_data() - check_large_step_changes_result = self.check_large_step_changes() - check_required_columns_result = self.check_required_columns() - check_missing_factor_result = self.check_missing_factor() - print(f"\nSummary of data health check ({len(self.data)} files checked):") - print("-------------------------------------------------") - if isinstance(check_missing_data_result, pd.DataFrame): - logger.warning(f"There is missing data.") - print(check_missing_data_result) - if isinstance(check_large_step_changes_result, pd.DataFrame): - logger.warning(f"The OHLCV column has large step changes.") - print(check_large_step_changes_result) - if isinstance(check_required_columns_result, pd.DataFrame): - logger.warning(f"Columns (OLHCV) are missing.") - print(check_required_columns_result) - if isinstance(check_missing_factor_result, pd.DataFrame): - logger.warning(f"The factor column does not exist or is empty") - print(check_missing_factor_result) + check_missing_data_result = self.check_missing_data() + check_large_step_changes_result = self.check_large_step_changes() + check_required_columns_result = self.check_required_columns() + check_missing_factor_result = self.check_missing_factor() + print(f"\nSummary of data health check ({len(self.data)} files checked):") + print("-------------------------------------------------") + if isinstance(check_missing_data_result, pd.DataFrame): + logger.warning(f"There is missing data.") + print(check_missing_data_result) + if isinstance(check_large_step_changes_result, pd.DataFrame): + logger.warning(f"The OHLCV column has large step changes.") + print(check_large_step_changes_result) + if isinstance(check_required_columns_result, pd.DataFrame): + logger.warning(f"Columns (OLHCV) are missing.") + print(check_required_columns_result) + if isinstance(check_missing_factor_result, pd.DataFrame): + logger.warning(f"The factor column does not exist or is empty") + print(check_missing_factor_result) if __name__ == "__main__":