diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index f6ca04765..5f5f63a04 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -5,7 +5,7 @@ name: Python package
 
 on:
   push:
-    branches: [ master, V0.9.46 ]
+    branches: [ master, V0.9.47 ]
   pull_request:
     branches: [ master ]
 
diff --git a/czsc/__init__.py b/czsc/__init__.py
index 436f9c744..e82661194 100644
--- a/czsc/__init__.py
+++ b/czsc/__init__.py
@@ -154,12 +154,13 @@
     rolling_slope,
     rolling_tanh,
     feature_adjust,
+    normalize_corr,
 )
 
-__version__ = "0.9.46"
+__version__ = "0.9.47"
 __author__ = "zengbin93"
 __email__ = "zeng_bin8888@163.com"
-__date__ = "20240318"
+__date__ = "20240328"
 
 
 def welcome():
diff --git a/czsc/connectors/cooperation.py b/czsc/connectors/cooperation.py
index 79b56247d..1ce98eca0 100644
--- a/czsc/connectors/cooperation.py
+++ b/czsc/connectors/cooperation.py
@@ -82,6 +82,11 @@ def get_symbols(name, **kwargs):
         kline = dc.future_klines(trade_date="20231101")
         return kline['code'].unique().tolist()
 
+    if name.upper() == "ALL":
+        symbols = get_symbols("股票") + get_symbols("ETF")
+        symbols += get_symbols("A股指数") + get_symbols("南华指数") + get_symbols("期货主力")
+        return symbols
+
     raise ValueError(f"{name} 分组无法识别,获取标的列表失败!")
 
 
diff --git a/czsc/features/utils.py b/czsc/features/utils.py
index 1d69826cb..3d7b4bd78 100644
--- a/czsc/features/utils.py
+++ b/czsc/features/utils.py
@@ -256,6 +256,60 @@ def __lr_slope(x):
     return df
 
 
+def normalize_corr(df: pd.DataFrame, fcol, ycol=None, **kwargs):
+    """Flip the sign of a factor so that it correlates positively with forward returns.
+
+    For every symbol the factor is multiplied by the sign of its correlation
+    with the next-bar return, which is computed from the ``price`` column.
+
+    **Notes:**
+
+    1. ``simple`` mode uses the full-sample correlation, so it leaks future
+       information; be careful when using it in a backtest.
+    2. ``rolling`` mode depends on ``window``; the adjusted factor may still
+       end up negatively correlated with returns.
+
+    :param df: pd.DataFrame, must contain dt, symbol and price columns plus the factor column
+    :param fcol: str, name of the factor column
+    :param ycol: currently unused; the target is always the next-bar return derived from price
+    :param kwargs: dict
+
+        - window: int, rolling window size, default 1000
+        - min_periods: int, minimum number of observations, default 5
+        - mode: str, ``rolling`` applies the lagged rolling correlation sign,
+          ``simple`` applies the full-sample correlation sign, default ``rolling``
+        - copy: bool, whether to copy df first, default False
+
+    :return: pd.DataFrame sorted by symbol and dt with a fresh index
+    :raises ValueError: if mode is neither ``rolling`` nor ``simple``
+    """
+    window = kwargs.get("window", 1000)
+    min_periods = kwargs.get("min_periods", 5)
+    mode = kwargs.get("mode", "rolling")
+    if kwargs.get("copy", False):
+        df = df.copy()
+
+    df = df.sort_values(['symbol', 'dt'], ascending=True).reset_index(drop=True)
+    for symbol, dfg in df.groupby("symbol"):
+        # assign() returns a new frame, so the writes below never touch a groupby slice
+        dfg = dfg.assign(ycol=dfg['price'].pct_change().shift(-1))
+
+        if mode.lower() == "rolling":
+            dfg['corr_sign'] = np.sign(dfg[fcol].rolling(window=window, min_periods=min_periods).corr(dfg['ycol']))
+            # lag the sign by 3 bars; rows without a sign yet get a factor of 0
+            dfg[fcol] = (dfg['corr_sign'].shift(3) * dfg[fcol]).fillna(0)
+
+        elif mode.lower() == "simple":
+            corr_sign = np.sign(dfg[fcol].corr(dfg['ycol']))
+            dfg[fcol] = corr_sign * dfg[fcol]
+
+        else:
+            raise ValueError(f"Unknown mode: {mode}")
+
+        df.loc[df['symbol'] == symbol, fcol] = dfg[fcol]
+    return df
+
+
 def feature_adjust_V230101(df: pd.DataFrame, fcol, **kwargs):
     """特征调整函数:对特征进行调整,使其符合持仓权重的定义
 
@@ -312,6 +366,7 @@ def feature_adjust(df: pd.DataFrame, fcol, method, **kwargs):
     :param fcol: str, 因子列名
     :param method: str, 调整方法
 
+        - KEEP: 直接使用原始因子值作为权重
         - V230101: 对因子进行滚动相关系数计算,然后对因子值用 maxabs_scale 进行归一化,最后乘以滚动相关系数的符号
         - V240323: 对因子进行滚动相关系数计算,然后对因子值用 scale + tanh 进行归一化,最后乘以滚动相关系数的符号
 
@@ -322,6 +377,10 @@ def feature_adjust(df: pd.DataFrame, fcol, method, **kwargs):
 
     :return: pd.DataFrame, 新增 weight 列
     """
+    if method == "KEEP":
+        df["weight"] = df[fcol]
+        return df
+
     if method == "V230101":
         return feature_adjust_V230101(df, fcol, **kwargs)
     elif method == "V240323":
diff --git a/czsc/utils/st_components.py b/czsc/utils/st_components.py
index ff53a0a4a..f46214c41 100644
--- a/czsc/utils/st_components.py
+++ b/czsc/utils/st_components.py
@@ -762,6 +762,7 @@ def show_out_in_compare(df, ret_col, mid_dt, **kwargs):
 
     df = df[[ret_col]].copy().fillna(0)
     df.sort_index(inplace=True, ascending=True)
+    mid_dt = pd.to_datetime(mid_dt)
     dfi = df[df.index < mid_dt].copy()
     dfo = df[df.index >= mid_dt].copy()
 
@@ -807,7 +808,7 @@ def show_out_in_compare(df, ret_col, mid_dt, **kwargs):
             '新高占比': '{:.2%}',
         }
     )
-    st.dataframe(df_stats, use_container_width=True)
+    st.dataframe(df_stats, use_container_width=True, hide_index=True)
 
 
 def show_optuna_study(study: optuna.Study, **kwargs):
diff --git a/test/test_features.py b/test/test_features.py
index c7aed09b6..1deed1999 100644
--- a/test/test_features.py
+++ b/test/test_features.py
@@ -37,3 +37,37 @@ def test_rolling_tanh():
     result_df = rolling_tanh(df, 'col1', new_col='col1_tanh3', window=100, min_periods=50)
     assert 'col1_tanh3' in result_df.columns
     assert result_df['col1_tanh3'].between(-1, 1).all()
+
+
+def test_normalize_corr():
+    from czsc.features.utils import normalize_corr
+
+    np.random.seed(123)
+    # Create a fake DataFrame
+    df = pd.DataFrame({
+        'dt': pd.date_range(start='1/1/2021', periods=3000),
+        'symbol': ['AAPL'] * 3000,
+        'price': np.random.rand(3000),
+        'factor': np.random.rand(3000),
+    })
+
+    df['n1b'] = df['price'].shift(-1) / df['price'] - 1
+    raw_corr = df['n1b'].corr(df['factor'])
+
+    # Call the function with the fake DataFrame
+    result = normalize_corr(df, fcol='factor', copy=True, mode='rolling', window=600)
+    corr1 = result['n1b'].corr(result['factor'])
+    assert result.shape == df.shape and np.sign(corr1) == -np.sign(raw_corr)
+
+    # Call the function with the fake DataFrame
+    result = normalize_corr(df, fcol='factor', copy=True, mode='rolling', window=300)
+    corr1 = result['n1b'].corr(result['factor'])
+    assert result.shape == df.shape and np.sign(corr1) == np.sign(raw_corr)
+
+    result = normalize_corr(df, fcol='factor', copy=True, mode='rolling', window=2000)
+    corr1 = result['n1b'].corr(result['factor'])
+    assert result.shape == df.shape and np.sign(corr1) == -np.sign(raw_corr)
+
+    result = normalize_corr(df, fcol='factor', copy=True, mode='simple')
+    corr2 = result['n1b'].corr(result['factor'])
+    assert result.shape == df.shape and np.isclose(corr2, -raw_corr)