diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 5d88b2959a..9310cd5e97 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -51,8 +51,8 @@ jobs: python setup.py bdist_wheel - name: Build and publish env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | twine upload dist/* @@ -72,10 +72,10 @@ jobs: python-version: 3.7 - name: Install dependencies run: | - pip install twine + pip install twine - name: Build and publish env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | twine upload dist/pyqlib-*-manylinux*.whl diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml index 113a4f009f..7eefec5801 100644 --- a/.github/workflows/release-drafter.yml +++ b/.github/workflows/release-drafter.yml @@ -6,8 +6,14 @@ on: branches: - main +permissions: + contents: read + jobs: update_release_draft: + permissions: + contents: write + pull-requests: read runs-on: ubuntu-latest steps: # Drafts your next Release notes as Pull Requests are merged into "master" diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index bde41d8026..4cc842b223 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -13,7 +13,10 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-latest] + # Since macos-latest changed from 12.7.4 to 14.4.1, + # the minimum python version that matches a 14.4.1 version of macos is 3.10, + # so we limit the macos version to macos-12. 
+ os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 9205a13641..38f32da8ed 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -14,7 +14,10 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-latest] + # Since macos-latest changed from 12.7.4 to 14.4.1, + # the minimum python version that matches a 14.4.1 version of macos is 3.10, + # so we limit the macos version to macos-12. + os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index caab6f444e..8725d4fe03 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -14,7 +14,10 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-latest] + # Since macos-latest changed from 12.7.4 to 14.4.1, + # the minimum python version that matches a 14.4.1 version of macos is 3.10, + # so we limit the macos version to macos-12. 
+ os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/.gitignore b/.gitignore index 8854c25e99..29ea1cd5e3 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,4 @@ tags *.swp ./pretrain -.idea/ +.idea/ \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yaml similarity index 80% rename from .readthedocs.yml rename to .readthedocs.yaml index 7d4cb854ae..71b29a2279 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yaml @@ -5,6 +5,12 @@ # Required version: 2 +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.7" + # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py @@ -14,7 +20,6 @@ formats: all # Optionally set the version of Python and requirements required to build your docs python: - version: 3.7 install: - requirements: docs/requirements.txt - method: pip diff --git a/README.md b/README.md index a9d5e4cc23..65c4420e6b 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,20 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor **Tips for Mac**: If you are using Mac with M1, you might encounter issues in building the wheel for LightGBM, which is due to missing dependencies from OpenMP. To solve the problem, install openmp first with ``brew install libomp`` and then run ``pip install .`` to build it successfully. ## Data Preparation +❗ Due to a stricter data security policy, the official dataset is disabled temporarily. You can try [this data source](https://github.com/chenditc/investment_data/releases) contributed by the community. +Here is an example to download the data updated on 20220720.
+```bash +wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz +mkdir -p ~/.qlib/qlib_data/cn_data +tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2 +rm -f qlib_bin.tar.gz +``` + +The official dataset below will resume in the near future. + + +---- + Load and prepare data by running the following code: ### Get with module diff --git a/docs/requirements.txt b/docs/requirements.txt index c10a86d4ee..9444c55737 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ scipy scikit-learn pandas tianshou +sphinx_rtd_theme diff --git a/examples/orderbook_data/README.md b/examples/orderbook_data/README.md index 890e11f41e..53fd523d7f 100644 --- a/examples/orderbook_data/README.md +++ b/examples/orderbook_data/README.md @@ -16,7 +16,7 @@ Current version of script with default value tries to connect localhost **via de Run following command to install necessary libraries ``` -pip install pytest coverage +pip install pytest coverage gdown pip install arctic # NOTE: pip may fail to resolve the right package dependency !!! Please make sure the dependency are satisfied. ``` @@ -27,7 +27,8 @@ pip install arctic # NOTE: pip may fail to resolve the right package dependency 2. Please follow following steps to download example data ```bash cd examples/orderbook_data/ -python ../../scripts/get_data.py download_data --target_dir . --file_name highfreq_orderbook_example_data.zip +gdown https://drive.google.com/uc?id=15nZF7tFT_eKVZAcMFL1qPS4jGyJflH7e # Proxies may be necessary here. +python ../../scripts/get_data.py _unzip --file_path highfreq_orderbook_example_data.zip --target_dir . ``` 3. Please import the example data to your mongo db diff --git a/qlib/__init__.py b/qlib/__init__.py index ed95f589e4..fca74e4567 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License.
from pathlib import Path -__version__ = "0.9.3.99" +__version__ = "0.9.5.99" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union diff --git a/qlib/data/data.py b/qlib/data/data.py index 116827f232..1b1353ee4e 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -536,7 +536,6 @@ def get_column_names(fields): """ if len(fields) == 0: raise ValueError("fields cannot be empty") - fields = fields.copy() column_names = [str(f) for f in fields] return column_names diff --git a/qlib/tests/data.py b/qlib/tests/data.py index f6bd780905..2fa76855b5 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -12,15 +12,11 @@ from tqdm import tqdm from pathlib import Path from loguru import logger -from cryptography.fernet import Fernet from qlib.utils import exists_qlib_data class GetData: - REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" - # "?" is not included in the token. - TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" - KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" + REMOTE_URL = "https://github.com/SunsetWolf/qlib_dataset/releases/download" def __init__(self, delete_zip_file=False): """ @@ -33,9 +29,45 @@ def __init__(self, delete_zip_file=False): self.delete_zip_file = delete_zip_file def merge_remote_url(self, file_name: str): - fernet = Fernet(self.KEY) - token = fernet.decrypt(self.TOKEN).decode() - return f"{self.REMOTE_URL}/{file_name}?{token}" + """ + Generate download links. + + Parameters + ---------- + file_name: str + The name of the file to be downloaded. 
+ The file name can be accompanied by a version number, (e.g.: v2/qlib_data_simple_cn_1d_latest.zip), + if no version number is attached, it will be downloaded from v0 by default. + """ + return f"{self.REMOTE_URL}/{file_name}" if "/" in file_name else f"{self.REMOTE_URL}/v0/{file_name}" + + def download(self, url: str, target_path: [Path, str]): + """ + Download a file from the specified url. + + Parameters + ---------- + url: str + The url of the data. + target_path: str + The location where the data is saved, including the file name. + """ + file_name = str(target_path).rsplit("/", maxsplit=1)[-1] + resp = requests.get(url, stream=True, timeout=60) + resp.raise_for_status() + if resp.status_code != 200: + raise requests.exceptions.HTTPError() + + chunk_size = 1024 + logger.warning( + f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)" + ) + logger.info(f"{os.path.basename(file_name)} downloading......") + with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: + with target_path.open("wb") as fp: + for chunk in resp.iter_content(chunk_size=chunk_size): + fp.write(chunk) + p_bar.update(chunk_size) def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): """ @@ -70,21 +102,7 @@ def download_data(self, file_name: str, target_dir: [Path, str], delete_old: boo target_path = target_dir.joinpath(_target_file_name) url = self.merge_remote_url(file_name) - resp = requests.get(url, stream=True, timeout=60) - resp.raise_for_status() - if resp.status_code != 200: - raise requests.exceptions.HTTPError() - - chunk_size = 1024 - logger.warning( - f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. 
(You can refer to the original data source: https://finance.yahoo.com/lookup.)" - ) - logger.info(f"{os.path.basename(file_name)} downloading......") - with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: - with target_path.open("wb") as fp: - for chunk in resp.iter_content(chunk_size=chunk_size): - fp.write(chunk) - p_bar.update(chunk_size) + self.download(url=url, target_path=target_path) self._unzip(target_path, target_dir, delete_old) if self.delete_zip_file: @@ -99,7 +117,9 @@ def check_dataset(self, file_name: str): return status @staticmethod - def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True): + def _unzip(file_path: [Path, str], target_dir: [Path, str], delete_old: bool = True): + file_path = Path(file_path) + target_dir = Path(target_dir) if delete_old: logger.warning( f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}" diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 9e63c104a1..732638b236 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -25,7 +25,12 @@ from pathlib import Path from typing import List, Union, Optional, Callable from packaging import version -from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer +from .file import ( + get_or_create_path, + save_multiple_parts_file, + unpack_archive_with_buffer, + get_tmp_file_with_buffer, +) from ..config import C from ..log import get_module_logger, set_log_with_config @@ -37,7 +42,12 @@ #################### Server #################### def get_redis_connection(): """get redis connection instance.""" - return redis.StrictRedis(host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password) + return redis.StrictRedis( + host=C.redis_host, + port=C.redis_port, + db=C.redis_task_db, + password=C.redis_password, + ) #################### Data #################### @@ -96,7 +106,14 @@ 
def get_period_offset(first_year, period, quarterly): return offset -def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): +def read_period_data( + index_path, + data_path, + period, + cur_date_int: int, + quarterly, + last_period_index: int = None, +): """ At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). Only the updating info before cur_date or at cur_date will be used. @@ -273,7 +290,10 @@ def parse_field(field): # \uff09 -> ) chinese_punctuation_regex = r"\u3001\uff1a\uff08\uff09" for pattern, new in [ - (rf"\$\$([\w{chinese_punctuation_regex}]+)", r'PFeature("\1")'), # $$ must be before $ + ( + rf"\$\$([\w{chinese_punctuation_regex}]+)", + r'PFeature("\1")', + ), # $$ must be before $ (rf"\$([\w{chinese_punctuation_regex}]+)", r'Feature("\1")'), (r"(\w+\s*)\(", r"Operators.\1("), ]: # Features # Operators @@ -383,7 +403,14 @@ def get_date_range(trading_date, left_shift=0, right_shift=0, future=False): return calendar -def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None): +def get_date_by_shift( + trading_date, + shift, + future=False, + clip_shift=True, + freq="day", + align: Optional[str] = None, +): """get trading date with shift bias will cur_date e.g. 
: shift == 1, return next trading date shift == -1, return previous trading date @@ -569,7 +596,38 @@ def exists_qlib_data(qlib_dir): # check instruments code_names = set(map(lambda x: fname_to_code(x.name.lower()), features_dir.iterdir())) _instrument = instruments_dir.joinpath("all.txt") - miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names) + # Removed two possible ticker names "NA" and "NULL" from the default na_values list for column 0 + miss_code = set( + pd.read_csv( + _instrument, + sep="\t", + header=None, + keep_default_na=False, + na_values={ + 0: [ + " ", + "#N/A", + "#N/A N/A", + "#NA", + "-1.#IND", + "-1.#QNAN", + "-NaN", + "-nan", + "1.#IND", + "1.#QNAN", + "", + "N/A", + "NaN", + "None", + "n/a", + "nan", + "null ", + ] + }, + ) + .loc[:, 0] + .apply(str.lower) + ) - set(code_names) if miss_code and any(map(lambda x: "sht" not in x, miss_code)): return False diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py index 96f68ef9cd..237df6fe87 100644 --- a/scripts/data_collector/cn_index/collector.py +++ b/scripts/data_collector/cn_index/collector.py @@ -396,14 +396,7 @@ def get_history_companies(self) -> pd.DataFrame: today = pd.Timestamp.now() date_range = pd.DataFrame(pd.date_range(start="2007-01-15", end=today, freq="7D"))[0].dt.date ret_list = [] - col = ["date", "symbol", "code_name"] for date in tqdm(date_range, desc="Download CSI500"): - rs = bs.query_zz500_stocks(date=str(date)) - zz500_stocks = [] - while (rs.error_code == "0") & rs.next(): - zz500_stocks.append(rs.get_row_data()) - result = pd.DataFrame(zz500_stocks, columns=col) - result["symbol"] = result["symbol"].apply(lambda x: x.replace(".", "").upper()) result = self.get_data_from_baostock(date) ret_list.append(result[["date", "symbol"]]) bs.logout() diff --git a/scripts/data_collector/cn_index/requirements.txt b/scripts/data_collector/cn_index/requirements.txt index 
bff59525cd..87933e9d9d 100644 --- a/scripts/data_collector/cn_index/requirements.txt +++ b/scripts/data_collector/cn_index/requirements.txt @@ -5,3 +5,5 @@ pandas lxml loguru tqdm +yahooquery +openpyxl diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py index 1ca9cfc942..806bbd0cc9 100644 --- a/scripts/dump_pit.py +++ b/scripts/dump_pit.py @@ -3,7 +3,7 @@ """ TODO: - A more well-designed PIT database is required. - - seperated insert, delete, update, query operations are required. + - separated insert, delete, update, query operations are required. """ import shutil