diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index 4cc842b223..4b9fa7c34d 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -45,6 +45,9 @@ jobs: - name: Qlib installation test run: | + # 2024-05-30 scs has released a new version: 3.2.4.post2, + # This will cause the CI to fail, so we have limited the version of scs for now. + python -m pip install "scs<=3.2.4" python -m pip install pyqlib - name: Install Lightgbm for MacOS diff --git a/README.md b/README.md index 65c4420e6b..773eeaf39b 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Recent released features Features released before 2021 are not listed here.

- +

Qlib is an open-source, AI-oriented quantitative investment platform that aims to realize the potential, empower research, and create value using AI technologies in quantitative investment, from exploring ideas to implementing productions. Qlib supports diverse machine learning modeling paradigms, including supervised learning, market dynamics modeling, and reinforcement learning. @@ -166,7 +166,7 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor * Clone the repository and install ``Qlib`` as follows. ```bash git clone https://github.com/microsoft/qlib.git && cd qlib - pip install . + pip install . # `pip install -e .[dev]` is recommended for development; see docs/developer/code_standard_and_dev_guide.rst for details ``` **Note**: You can install Qlib with `python setup.py install` as well. But it is not the recommended approach. It will skip `pip` and cause obscure problems. For example, **only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, while the command ``python setup.py install`` **can't**. 
diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index f19dfe08fa..688cde99af 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -9,7 +9,7 @@ from qlib.data.dataset import DataHandler -def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int: +def get_level_index(df: pd.DataFrame, level: Union[str, int]) -> int: """ get the level index of `df` given `level` diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index 2517e9bce8..2efc2feadc 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -301,6 +301,7 @@ def _executor(self, file_path: Path): na_values={col: symbol_na if col == self._symbol_field_name else default_na for col in columns}, ) + # NOTE: It has been reported that there may be some problems here, and the specific issues will be dealt with when they are identified. df = self._normalize_obj.normalize(df) if df is not None and not df.empty: if self._end_date is not None: diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 596eae60ef..feec170bb1 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd -from lxml import etree from loguru import logger from yahooquery import Ticker from tqdm import tqdm @@ -190,17 +189,43 @@ def get_hs_stock_symbols() -> list: global _HS_SYMBOLS # pylint: disable=W0603 def _get_symbol(): - _res = set() - for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")): - resp = requests.get(HS_SYMBOLS_URL.format(s_type=_k), timeout=None) - _res |= set( - map( - lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v), # pylint: disable=W0640 - etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"), # pylint: disable=I1101 - ) - ) - time.sleep(3) - return _res + """ + Get the stock pool from a web page and process it into the format required by yahooquery. 
+ Format of data retrieved from the web page: 600519, 000001 + The data format required by yahooquery: 600519.ss, 000001.sz + + Returns + ------- + set: Returns the set of symbol codes. + + Examples: + ------- + {600000.ss, 600001.ss, 600002.ss, 600003.ss, ...} + """ + url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12" + try: + resp = requests.get(url, timeout=None) + resp.raise_for_status() + except requests.exceptions.HTTPError as e: + raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e + + try: + _symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]] + except Exception as e: + logger.warning("An error occurred while extracting data from the response.") + raise + + if len(_symbols) < 3900: + raise ValueError("The complete list of stocks is not available.") + + # Add suffix after the stock code to conform to yahooquery standard, otherwise the data will not be fetched. + _symbols = [ + _symbol + ".ss" if _symbol.startswith("6") else _symbol + ".sz" if _symbol.startswith(("0", "3")) else None + for _symbol in _symbols + ] + _symbols = [_symbol for _symbol in _symbols if _symbol is not None] + + return set(_symbols) if _HS_SYMBOLS is None: symbols = set() diff --git a/setup.py b/setup.py index adafefd614..1feabd30c1 100644 --- a/setup.py +++ b/setup.py @@ -166,6 +166,9 @@ def get_version(rel_path: str) -> str: "lxml", "baostock", "yahooquery", + # 2024-05-30 scs has released a new version: 3.2.4.post2, + # this version, causes qlib installation to fail, so we've limited the scs version a bit for now. + "scs<=3.2.4", "beautifulsoup4", # In version 0.4.11 of tianshou, the code: # logits, hidden = self.actor(batch.obs, state=state, info=batch.info)