From 61c31ca8a480e68606c588ae49503e866838654a Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 26 Oct 2023 11:25:56 +0800 Subject: [PATCH] improve docstring --- qlib/data/storage/file_storage.py | 12 +++- qlib/data/storage/storage.py | 102 ++++++++++++++++++++---------- tests/test_pit.py | 62 +++++++++--------- 3 files changed, 111 insertions(+), 65 deletions(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 2d36fe3bef..a36d1cca60 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -490,7 +490,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: Args: data_array: Structured arrays contains date, period, value and next - index: _description_. Defaults to None. + index: target index to start writing. Defaults to None. """ if len(data_array) == 0: @@ -509,6 +509,8 @@ def write(self, data_array: np.ndarray, index: int = None) -> None: with self.uri.open("wb") as fp: data_array.tofile(self.uri) else: + if index is None or index > self.end_index: + index = self.end_index + 1 with self.uri.open("rb+") as fp: fp.seek(index * self.itemsize) data_array.tofile(fp) @@ -525,6 +527,14 @@ def end_index(self) -> Union[int, None]: return self.start_index + len(self) - 1 def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + """return numpy structured array + + Args: + i: index or slice. Defaults to None. 
+ + Returns: + np.ndarray + """ if not self.uri.exists(): if isinstance(i, int): return None, None diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index 0d0ee0e7eb..acd6172ab5 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -495,10 +495,36 @@ def __len__(self) -> int: class PITStorage(FeatureStorage): + """PIT data is a special case of Feature data. It looks like: + + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + It is sorted by [date, period]. + + The `_next` field is currently unused; it is kept only for forward compatibility. + """ + @property def storage_name(self) -> str: return "financial" # for compatibility + def np_data(self, i: Union[int, slice] = None) -> np.ndarray: + """return numpy structured array + + Args: + i: index or slice. Defaults to None. + + Returns: + np.ndarray + """ + + raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") + @property def data(self) -> pd.DataFrame: """get all data @@ -511,7 +537,7 @@ def data(self) -> pd.DataFrame: """ raise NotImplementedError("Subclass of FeatureStorage must implement `data` method") - def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): + def write(self, data_array: np.ndarray, index: int = None): """Write data_array to FeatureStorage starting from index. Notes @@ -526,42 +552,24 @@ def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): --------- .. 
code-block:: - feature: - 3 4 - 4 5 - 5 6 + pit data: + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 - >>> self.write([6, 7], index=6) + >>> s.write(np.array([(20070917, 200703, 0.239330, 0)], dtype=s.raw_dtype), 1) feature: - 3 4 - 4 5 - 5 6 - 6 6 - 7 7 - - >>> self.write([8], index=9) - - feature: - 3 4 - 4 5 - 5 6 - 6 6 - 7 7 - 8 np.nan - 9 8 - - >>> self.write([1, np.nan], index=3) - - feature: - 3 1 - 4 np.nan - 5 6 - 6 6 - 7 7 - 8 np.nan - 9 8 + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070917 200703 0.239330 0 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 """ raise NotImplementedError("Subclass of FeatureStorage must implement `write` method") @@ -579,6 +587,34 @@ def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): self.clear() self.write(data, index) + def update(self, data_array: np.ndarray) -> None: + """Update data in storage, replacing current data from start_date to end_date with the given data_array + + Args: + data_array: Structured array containing date, period, value and next, with the same dtype as self.raw_dtype + + Examples + --------- .. 
code-block:: + + pit data: + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20071023 200703 0.245863 4294967295 + 3 20080301 200704 0.347900 80 + 4 20080313 200704 0.395989 4294967295 + + >>> s.update(np.array([(20070917, 200703, 0.111111, 0), (20100314, 200703, 0.111111, 0)], dtype=s.raw_dtype)) + date period value _next + 0 20070428 200701 0.090219 4294967295 + 1 20070817 200702 0.139330 4294967295 + 2 20070917 200703 0.111111 0 + 3 20100314 200703 0.111111 0 + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `update` method") + @overload def __getitem__(self, s: slice) -> pd.Series: """x.__getitem__(slice(start: int, stop: int, step: int)) <==> x[start:stop:step] diff --git a/tests/test_pit.py b/tests/test_pit.py index 26655b85ab..359be618dd 100644 --- a/tests/test_pit.py +++ b/tests/test_pit.py @@ -35,37 +35,37 @@ class TestPIT(unittest.TestCase): - # @classmethod - # def tearDownClass(cls) -> None: - # shutil.rmtree(str(DATA_DIR.resolve())) - - # @classmethod - # def setUpClass(cls) -> None: - # cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) - # pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) - # pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) - # GetData().qlib_data( - # name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True - # ) - # GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) - - # # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. 
- # # bs.login() - # # Run( - # # source_dir=pit_dir, - # # interval="quarterly", - # # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") - # # bs.logout() - - # Run( - # source_dir=pit_dir, - # normalize_dir=pit_normalized_dir, - # interval="quarterly", - # ).normalize_data() - # DumpPitData( - # csv_path=pit_normalized_dir, - # qlib_dir=cn_data_dir, - # ).dump(interval="quarterly") + @classmethod + def tearDownClass(cls) -> None: + shutil.rmtree(str(DATA_DIR.resolve())) + + @classmethod + def setUpClass(cls) -> None: + cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve()) + pit_dir = str(SOURCE_DIR.joinpath("pit").resolve()) + pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve()) + GetData().qlib_data( + name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True + ) + GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True) + + # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data. + # bs.login() + # Run( + # source_dir=pit_dir, + # interval="quarterly", + # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*") + # bs.logout() + + Run( + source_dir=pit_dir, + normalize_dir=pit_normalized_dir, + interval="quarterly", + ).normalize_data() + DumpPitData( + csv_path=pit_normalized_dir, + qlib_dir=cn_data_dir, + ).dump(interval="quarterly") def setUp(self): # qlib.init(kernels=1) # NOTE: set kernel to 1 to make it debug easier