kedro-org · ravi-kumar-pilla · Aug 14, 2023 · Jul 31, 2023 · Jul 31, 2023 · Aug 1, 2023
@@ -192,4 +192,28 @@ describe('Flowchart DAG', () => {
       .should('exist')
       .and('have.text', `Oops, there's nothing to see here`);
   });
+
+  it('verifies that users can open and see the dataset statistics in the metadata panel for datasets. #TC-51', () => {
+    const dataNodeText = 'Companies';
+    // Shared by the before/after assertions so both query the same element.
+    const statsLabelSelector = '[data-label="Dataset statistics:"]';
+
+    // Assert before action: the statistics label must not be rendered
+    // until a dataset node has been selected.
+    cy.get(statsLabelSelector).should('not.exist');
+
+    // Action: select the dataset node by its visible label.
+    cy.get('.pipeline-node > .pipeline-node__text')
+      .contains(dataNodeText)
+      .click({ force: true });
+
+    // Assert after action: the label is shown and the rendered values
+    // match the expected rows / columns / file size for this dataset.
+    cy.get(statsLabelSelector).should('exist');
+    cy.get('[data-test=profiler-value-rows]')
+      .invoke('text')
+      // Explicit radix so the rendered text is always parsed as base 10.
+      .should((rowsValue) => expect(parseInt(rowsValue, 10)).to.be.eq(77096));
+    cy.get('[data-test=profiler-value-columns]')
+      .invoke('text')
+      .should((colsValue) => expect(parseInt(colsValue, 10)).to.be.eq(5));
+    cy.get('[data-test=profiler-value-file_size]')
+      .invoke('text')
+      .should((fileSizeValue) => expect(fileSizeValue).to.be.eq('1.8MB'));
+  });
 });
@@ -0,0 +1,17 @@
+{
+  "companies": { "rows": 77096, "columns": 5 },
+  "reviews": { "rows": 77096, "columns": 10 },
+  "shuttles": { "rows": 77096, "columns": 13 },
+  "ingestion.int_typed_companies": { "rows": 77096, "columns": 5 },
+  "ingestion.int_typed_shuttles@pandas2": { "rows": 77096, "columns": 13 },
+  "ingestion.prm_agg_companies": { "rows": 50098, "columns": 5 },
+  "ingestion.int_typed_reviews": { "rows": 55790, "columns": 11 },
+  "prm_spine_table": { "rows": 29768, "columns": 3 },
+  "prm_shuttle_company_reviews": { "rows": 29768, "columns": 27 },
+  "feature_engineering.feat_static_features": { "rows": 29768, "columns": 12 },
+  "feature_engineering.feat_derived_features": { "rows": 29768, "columns": 3 },
+  "feature_importance_output": { "rows": 15, "columns": 2 },
+  "model_input_table": { "rows": 29768, "columns": 12 },
+  "X_train": { "rows": 23814, "columns": 11 },
+  "X_test": { "rows": 5954, "columns": 11 }
+}
@@ -114,6 +114,7 @@ class DataNodeMetadataAPIResponse(BaseAPIResponse):
     tracking_data: Optional[Dict]
     run_command: Optional[str]
     preview: Optional[Dict]
+    profiler: Optional[Dict]
 
     class Config:
         schema_extra = {
@@ -130,6 +131,7 @@ class TranscodedDataNodeMetadataAPIReponse(BaseAPIResponse):
     original_type: str
     transcoded_types: List[str]
     run_command: Optional[str]
+    profiler: Optional[Dict]
 
 
 class ParametersNodeMetadataAPIResponse(BaseAPIResponse):

@@ -49,10 +49,12 @@ async def get_single_node_metadata(node_id: str):
         return TaskNodeMetadata(node)
 
     if isinstance(node, DataNode):
-        return DataNodeMetadata(node)
+        dataset_stats = data_access_manager.get_dataset_stats(node)
+        return DataNodeMetadata(node, dataset_stats if dataset_stats else {})
 
     if isinstance(node, TranscodedDataNode):
-        return TranscodedDataNodeMetadata(node)
+        dataset_stats = data_access_manager.get_dataset_stats(node)
+        return TranscodedDataNodeMetadata(node, dataset_stats if dataset_stats else {})
 
     return ParametersNodeMetadata(node)
 

@@ -62,6 +62,7 @@ def __init__(self):
         )
         self.runs = RunsRepository()
         self.tracking_datasets = TrackingDatasetsRepository()
+        self.dataset_stats = {}
 
     def set_db_session(self, db_session_class: sessionmaker):
         """Set db session on repositories that need it."""
@@ -91,6 +92,32 @@ def add_pipelines(self, pipelines: Dict[str, KedroPipeline]):
             # Add the registered pipeline and its components to their repositories
             self.add_pipeline(registered_pipeline_id, pipeline)
 
+    def add_dataset_stats(
+        self, stats_dict: Union[Dict[str, Dict[str, int]], None]
+    ):
+        """Add dataset statistics (eg. rows, columns) as a dictionary.
+        This will help in showing the relevant stats in the metadata panel
+
+        Args:
+            stats_dict: A dictionary object loaded from stats.json file in the
+                kedro project, keyed by dataset name. ``None`` is stored as an
+                empty dictionary so the attribute is always a mapping.
+        """
+        # Keep `dataset_stats` a dict even when no stats were loaded.
+        self.dataset_stats = stats_dict if stats_dict is not None else {}
+
+    def get_dataset_stats(
+        self, data_node: Union[DataNode, TranscodedDataNode]
+    ) -> Union[Dict[str, int], None]:
+        """Look up the statistics stored for a data node.
+
+        Args:
+            data_node: The data node for which we need the statistics.
+
+        Returns:
+            The entry stored under ``data_node.name``, or None when the node
+            is falsy, no statistics are held, or the name has no entry.
+        """
+        known_stats = self.dataset_stats
+        if not (data_node and known_stats):
+            return None
+
+        # `.get` yields None for a missing name, matching the explicit check.
+        return known_stats.get(data_node.name)
+
     def add_pipeline(self, registered_pipeline_id: str, pipeline: KedroPipeline):
         """Iterate through all the nodes and datasets in a "registered" pipeline
         and add them to relevant repositories. Take care of extracting other relevant information

@@ -54,11 +54,27 @@ def _bootstrap(project_path: Path):
         return
 
 
+def get_dataset_stats(project_path: Path) -> Optional[Dict[str, Any]]:
+    """Return the stats saved at stats.json
+
+    Args:
+        project_path: directory in which to look for the ``stats.json`` file.
+
+    Returns:
+        The parsed JSON content of ``stats.json``, or None when the file
+        does not exist.
+    """
+    import json as json_lib  # pylint: disable=import-outside-toplevel
+
+    stats_file_path = project_path / "stats.json"
+
+    # A missing file is not an error: there are simply no stats to report.
+    if not stats_file_path.exists():
+        return None
+
+    with open(stats_file_path, encoding="utf8") as stats_file:
+        return json_lib.load(stats_file)
+
+
 def load_data(
     project_path: Path,
     env: Optional[str] = None,
     extra_params: Optional[Dict[str, Any]] = None,
-) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore]:
+) -> Tuple[
+    DataCatalog, Dict[str, Pipeline], BaseSessionStore, Optional[Dict[str, int]]
+]:
     """Load data from a Kedro project.
     Args:
         project_path: the path whether the Kedro project is located.
@@ -91,7 +107,13 @@ def load_data(
             # in case user doesn't have an active session down the line when it's first accessed.
             # Useful for users who have `get_current_session` in their `register_pipelines()`.
             pipelines_dict = dict(pipelines)
-        return catalog, pipelines_dict, session_store
+            stats_dict = (
+                dict(get_dataset_stats(project_path))
+                if get_dataset_stats(project_path) is not None
+                else {}
+            )
+
+        return catalog, pipelines_dict, session_store, stats_dict
     elif KEDRO_VERSION.match(">=0.17.1"):
         from kedro.framework.session import KedroSession
 
@@ -103,8 +125,13 @@ def load_data(
         ) as session:
             context = session.load_context()
             session_store = session._store
+            stats_dict = (
+                dict(get_dataset_stats(project_path))
+                if get_dataset_stats(project_path) is not None
+                else {}
+            )
 
-        return context.catalog, context.pipelines, session_store
+        return context.catalog, context.pipelines, session_store, stats_dict
     else:
         # Since Viz is only compatible with kedro>=0.17.0, this just matches 0.17.0
         from kedro.framework.session import KedroSession
@@ -120,8 +147,13 @@ def load_data(
         ) as session:
             context = session.load_context()
             session_store = session._store
+            stats_dict = (
+                dict(get_dataset_stats(project_path))
+                if get_dataset_stats(project_path) is not None
+                else {}
+            )
 
-        return context.catalog, context.pipelines, session_store
+        return context.catalog, context.pipelines, session_store, stats_dict
 
 
 # The dataset type is available as an attribute if and only if the import from kedro

@@ -0,0 +1,66 @@
+# pylint: disable=broad-exception-caught
+"""`kedro_viz.integrations.kedro.hooks` defines hooks to add additional
+functionalities for a kedro run."""
+
+import logging
+from collections import defaultdict
+from typing import Any
+
+from kedro.framework.hooks import hook_impl
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetStatsHook:
+    """Hook to collect dataset statistics during a kedro run
+    and save it to a JSON file"""
+
+    def __init__(self):
+        # Maps dataset name -> {"rows": <int>, "columns": <int>}.
+        self._stats = defaultdict(dict)
+
+    @hook_impl
+    def after_dataset_loaded(self, dataset_name: str, data: Any):
+        """Hook to be invoked after a dataset is loaded from the catalog.
+        Once the dataset is loaded, extract the required dataset statistics
+
+        Only pandas DataFrames are profiled; any other data is ignored.
+        Failures are logged and never propagated to the caller.
+
+        Args:
+            dataset_name: name of the dataset that was loaded from the catalog.
+            data: the actual data that was loaded from the catalog.
+        """
+        try:
+            # Imported lazily so a missing pandas only produces a warning.
+            import pandas as pd  # pylint: disable=import-outside-toplevel
+
+            if isinstance(data, pd.DataFrame):
+                # Build the entry in one assignment so a failure cannot leave
+                # a half-filled record behind.
+                self._stats[dataset_name] = {
+                    "rows": int(data.shape[0]),
+                    "columns": int(data.shape[1]),
+                }
+
+        except ImportError as exc:  # pragma: no cover
+            logger.warning("%s : %s", exc.__class__.__name__, exc.msg)
+
+        except Exception as exc:  # pragma: no cover
+            logger.error(
+                "Error creating the stats for the dataset %s : %s", dataset_name, exc
+            )
+
+    @hook_impl
+    def after_pipeline_run(self):
+        """Hook to be invoked after a pipeline runs.
+        Once the pipeline run completes, write the dataset
+        statistics to stats.json file
+
+        Failures while writing are logged and never propagated.
+        """
+        try:
+            import json as json_lib  # pylint: disable=import-outside-toplevel
+
+            # NOTE(review): "stats.json" is resolved against the current
+            # working directory - confirm this matches where it is read from.
+            with open("stats.json", "w", encoding="utf8") as file:
+                json_lib.dump(self._stats, file)
+
+        except Exception as exc:  # pragma: no cover
+            logger.error("Error writing the stats for the pipeline: %s", exc)
+
+
+# Module-level instance of the hook, created at import time.
+# NOTE(review): presumably this is the object registered with kedro - confirm
+# where it is referenced.
+dataset_stats_hook = DatasetStatsHook()
@@ -13,7 +13,7 @@
 from kedro.pipeline.node import Node as KedroNode
 from kedro.pipeline.pipeline import TRANSCODING_SEPARATOR, _strip_transcoding
 
-from .utils import get_dataset_type
+from .utils import get_dataset_type, get_file_size
 
 try:
     # kedro 0.18.11 onwards
@@ -541,6 +541,7 @@ class DataNodeMetadata(GraphNodeMetadata):
 
     # the underlying data node to which this metadata belongs
     data_node: InitVar[DataNode]
+    dataset_stats: InitVar[Dict]
 
     # the optional plot data if the underlying dataset has a plot.
     # currently only applicable for PlotlyDataSet
@@ -557,12 +558,28 @@ class DataNodeMetadata(GraphNodeMetadata):
 
     preview: Optional[Dict] = field(init=False, default=None)
 
+    profiler: Optional[Dict] = field(init=False, default=None)
+
     # TODO: improve this scheme.
-    def __post_init__(self, data_node: DataNode):
+    def __post_init__(self, data_node: DataNode, dataset_stats: Dict):
         self.type = data_node.dataset_type
         dataset = cast(AbstractDataset, data_node.kedro_obj)
         dataset_description = dataset._describe()
         self.filepath = _parse_filepath(dataset_description)
+        self.profiler = dataset_stats
+
+        # TODO: Can we use _describe method of kedro-dataset plugin to get the file size
+        # by adding a file_size key to the return dict
+        # self.profiler["file_size"] = (
+        #     dataset_description["file_size"]
+        #     if "file_size" in dataset_description
+        #     else 0
+        # )
+
+        # TODO: This will help to read the file size if the file path is present.
+        # If not, this will return 0. Not sure if this works for remote files
+        # Use fsspec to get the file size
+        self.profiler["file_size"] = get_file_size(self.filepath)
 
         # Run command is only available if a node is an output, i.e. not a free input
         if not data_node.is_free_input:
@@ -615,10 +632,15 @@ class TranscodedDataNodeMetadata(GraphNodeMetadata):
 
     transcoded_types: List[str] = field(init=False)
 
+    profiler: Optional[Dict] = field(init=False, default=None)
+
     # the underlying data node to which this metadata belongs
     transcoded_data_node: InitVar[TranscodedDataNode]
+    dataset_stats: InitVar[Dict]
 
-    def __post_init__(self, transcoded_data_node: TranscodedDataNode):
+    def __post_init__(
+        self, transcoded_data_node: TranscodedDataNode, dataset_stats: Dict
+    ):
         original_version = transcoded_data_node.original_version
 
         self.original_type = get_dataset_type(original_version)
@@ -629,6 +651,20 @@ def __post_init__(self, transcoded_data_node: TranscodedDataNode):
 
         dataset_description = original_version._describe()
         self.filepath = _parse_filepath(dataset_description)
+        self.profiler = dataset_stats
+
+        # TODO: Can we use _describe method of kedro-dataset plugin to get the file size
+        # by adding a file_size key to the return dict
+        # self.profiler["file_size"] = (
+        #     dataset_description["file_size"]
+        #     if "file_size" in dataset_description
+        #     else 0
+        # )
+
+        # TODO: This will help to read the file size if the file path is present.
+        # If not, this will return 0. Not sure if this works for remote files
+        # Use fsspec to get the file size
+        self.profiler["file_size"] = get_file_size(self.filepath)
 
         if not transcoded_data_node.is_free_input:
             self.run_command = (