From 61c31ca8a480e68606c588ae49503e866838654a Mon Sep 17 00:00:00 2001
From: John Lyu <lvjunhong@citics.com>
Date: Thu, 26 Oct 2023 11:25:56 +0800
Subject: [PATCH] improve docstring

---
 qlib/data/storage/file_storage.py |  12 +++-
 qlib/data/storage/storage.py      | 102 ++++++++++++++++++++----------
 tests/test_pit.py                 |  62 +++++++++---------
 3 files changed, 111 insertions(+), 65 deletions(-)

diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py
index 2d36fe3bef..a36d1cca60 100644
--- a/qlib/data/storage/file_storage.py
+++ b/qlib/data/storage/file_storage.py
@@ -490,7 +490,7 @@ def write(self, data_array: np.ndarray, index: int = None) -> None:
 
         Args:
             data_array: Structured arrays contains date, period, value and next
-            index: _description_. Defaults to None.
+            index: target index to start writing. Defaults to None.
         """
 
         if len(data_array) == 0:
@@ -509,6 +509,8 @@ def write(self, data_array: np.ndarray, index: int = None) -> None:
             with self.uri.open("wb") as fp:
                 data_array.tofile(self.uri)
         else:
+            if index is None or index > self.end_index:
+                index = self.end_index + 1
             with self.uri.open("rb+") as fp:
                 fp.seek(index * self.itemsize)
                 data_array.tofile(fp)
@@ -525,6 +527,14 @@ def end_index(self) -> Union[int, None]:
         return self.start_index + len(self) - 1
 
     def np_data(self, i: Union[int, slice] = None) -> np.ndarray:
+        """return numpy structured array
+
+        Args:
+            i: index or slice. Defaults to None.
+
+        Returns:
+            np.ndarray
+        """
         if not self.uri.exists():
             if isinstance(i, int):
                 return None, None
diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py
index 0d0ee0e7eb..acd6172ab5 100644
--- a/qlib/data/storage/storage.py
+++ b/qlib/data/storage/storage.py
@@ -495,10 +495,36 @@ def __len__(self) -> int:
 
 
 class PITStorage(FeatureStorage):
+    """PIT data is a special case of feature data. It looks like:
+
+                date  period     value       _next
+            0  20070428  200701  0.090219  4294967295
+            1  20070817  200702  0.139330  4294967295
+            2  20071023  200703  0.245863  4294967295
+            3  20080301  200704  0.347900          80
+            4  20080313  200704  0.395989  4294967295
+
+    It is sorted by [date, period].
+
+    The `_next` field is currently unused; it is kept only for forward compatibility.
+    """
+
     @property
     def storage_name(self) -> str:
         return "financial"  # for compatibility
 
+    def np_data(self, i: Union[int, slice] = None) -> np.ndarray:
+        """Return a numpy structured array.
+
+        Args:
+            i: index or slice. Defaults to None.
+
+        Returns:
+            np.ndarray
+        """
+
+        raise NotImplementedError("Subclass of FeatureStorage must implement `np_data` method")
+
     @property
     def data(self) -> pd.DataFrame:
         """get all data
@@ -511,7 +537,7 @@ def data(self) -> pd.DataFrame:
         """
         raise NotImplementedError("Subclass of FeatureStorage must implement `data` method")
 
-    def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None):
+    def write(self, data_array: np.ndarray, index: int = None):
         """Write data_array to FeatureStorage starting from index.
 
         Notes
@@ -526,42 +552,24 @@ def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None):
         ---------
             .. code-block::
 
-                feature:
-                    3   4
-                    4   5
-                    5   6
+                pit data:
+                    date  period     value       _next
+                0  20070428  200701  0.090219  4294967295
+                1  20070817  200702  0.139330  4294967295
+                2  20071023  200703  0.245863  4294967295
+                3  20080301  200704  0.347900          80
+                4  20080313  200704  0.395989  4294967295
 
 
-            >>> self.write([6, 7], index=6)
+            >>> s.write(np.array([(20070917, 200703, 0.239330, 0)], dtype=s.raw_dtype), 1)
 
                 feature:
-                    3   4
-                    4   5
-                    5   6
-                    6   6
-                    7   7
-
-            >>> self.write([8], index=9)
-
-                feature:
-                    3   4
-                    4   5
-                    5   6
-                    6   6
-                    7   7
-                    8   np.nan
-                    9   8
-
-            >>> self.write([1, np.nan], index=3)
-
-                feature:
-                    3   1
-                    4   np.nan
-                    5   6
-                    6   6
-                    7   7
-                    8   np.nan
-                    9   8
+                    date  period     value       _next
+                0  20070428  200701  0.090219  4294967295
+                1  20070917  200703  0.239330           0
+                2  20071023  200703  0.245863  4294967295
+                3  20080301  200704  0.347900          80
+                4  20080313  200704  0.395989  4294967295
 
         """
         raise NotImplementedError("Subclass of FeatureStorage must implement `write` method")
@@ -579,6 +587,34 @@ def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int):
         self.clear()
         self.write(data, index)
 
+    def update(self, data_array: np.ndarray) -> None:
+        """Update storage by replacing the current data between start_date and end_date (presumably data_array's first and last date) with data_array.
+
+        Args:
+            data_array: structured array with date, period, value and _next fields; same dtype as self.raw_dtype
+
+        Examples
+        ---------
+            .. code-block::
+
+                pit data:
+                    date  period     value       _next
+                0  20070428  200701  0.090219  4294967295
+                1  20070817  200702  0.139330  4294967295
+                2  20071023  200703  0.245863  4294967295
+                3  20080301  200704  0.347900          80
+                4  20080313  200704  0.395989  4294967295
+
+            >>> s.update(np.array([(20070917, 200703, 0.111111, 0), (20100314, 200703, 0.111111, 0)], dtype=s.raw_dtype))
+                    date  period     value       _next
+                0  20070428  200701  0.090219  4294967295
+                1  20070817  200702  0.139330  4294967295
+                2  20070917  200703  0.111111           0
+                3  20100314  200703  0.111111           0
+
+        """
+        raise NotImplementedError("Subclass of FeatureStorage must implement `update` method")
+
     @overload
     def __getitem__(self, s: slice) -> pd.Series:
         """x.__getitem__(slice(start: int, stop: int, step: int)) <==> x[start:stop:step]
diff --git a/tests/test_pit.py b/tests/test_pit.py
index 26655b85ab..359be618dd 100644
--- a/tests/test_pit.py
+++ b/tests/test_pit.py
@@ -35,37 +35,37 @@
 
 
 class TestPIT(unittest.TestCase):
-    # @classmethod
-    # def tearDownClass(cls) -> None:
-    #     shutil.rmtree(str(DATA_DIR.resolve()))
-
-    # @classmethod
-    # def setUpClass(cls) -> None:
-    #     cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve())
-    #     pit_dir = str(SOURCE_DIR.joinpath("pit").resolve())
-    #     pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve())
-    #     GetData().qlib_data(
-    #         name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True
-    #     )
-    #     GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True)
-
-    #     # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data.
-    #     # bs.login()
-    #     # Run(
-    #     #     source_dir=pit_dir,
-    #     #     interval="quarterly",
-    #     # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*")
-    #     # bs.logout()
-
-    #     Run(
-    #         source_dir=pit_dir,
-    #         normalize_dir=pit_normalized_dir,
-    #         interval="quarterly",
-    #     ).normalize_data()
-    #     DumpPitData(
-    #         csv_path=pit_normalized_dir,
-    #         qlib_dir=cn_data_dir,
-    #     ).dump(interval="quarterly")
+    @classmethod
+    def tearDownClass(cls) -> None:
+        shutil.rmtree(str(DATA_DIR.resolve()))
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        cn_data_dir = str(QLIB_DIR.joinpath("cn_data").resolve())
+        pit_dir = str(SOURCE_DIR.joinpath("pit").resolve())
+        pit_normalized_dir = str(SOURCE_DIR.joinpath("pit_normalized").resolve())
+        GetData().qlib_data(
+            name="qlib_data_simple", target_dir=cn_data_dir, region="cn", delete_old=False, exists_skip=True
+        )
+        GetData().qlib_data(name="qlib_data", target_dir=pit_dir, region="pit", delete_old=False, exists_skip=True)
+
+        # NOTE: This code does the same thing as line 43, but since baostock is not stable in downloading data, we have chosen to download offline data.
+        # bs.login()
+        # Run(
+        #     source_dir=pit_dir,
+        #     interval="quarterly",
+        # ).download_data(start="2000-01-01", end="2020-01-01", symbol_regex="^(600519|000725).*")
+        # bs.logout()
+
+        Run(
+            source_dir=pit_dir,
+            normalize_dir=pit_normalized_dir,
+            interval="quarterly",
+        ).normalize_data()
+        DumpPitData(
+            csv_path=pit_normalized_dir,
+            qlib_dir=cn_data_dir,
+        ).dump(interval="quarterly")
 
     def setUp(self):
         # qlib.init(kernels=1)  # NOTE: set kernel to 1 to make it debug easier