kedro-org · ravi-kumar-pilla · Aug 14, 2023 · Jul 31, 2023 · Jul 31, 2023 · Aug 1, 2023
@@ -192,4 +192,28 @@ describe('Flowchart DAG', () => {
       .should('exist')
       .and('have.text', `Oops, there's nothing to see here`);
   });
+
+  it('verifies that users can open and see the dataset statistics in the metadata panel for datasets. #TC-51', () => {
+    const dataNodeText = 'Companies';
+
+    // Assert before action
+    cy.get('[data-label="Dataset statistics:]').should('not.exist');
+
+    // Action
+    cy.get('.pipeline-node > .pipeline-node__text')
+      .contains(dataNodeText)
+      .click({ force: true });
+
+    // Assert after action
+    cy.get('[data-label="Dataset statistics:"]').should('exist');
+    cy.get('[data-test=stats-value-rows]')
+      .invoke('text')
+      .should((rowsValue) => expect(rowsValue).to.be.eq('77,096'));
+    cy.get('[data-test=stats-value-columns]')
+      .invoke('text')
+      .should((colsValue) => expect(parseInt(colsValue)).to.be.eq(5));
+    cy.get('[data-test=stats-value-file_size]')
+      .invoke('text')
+      .should((fileSizeValue) => expect(fileSizeValue).to.be.eq('1.8MB'));
+  });
 });
@@ -0,0 +1,64 @@
+{
+  "companies": { "rows": 77096, "columns": 5, "file_size": 1810602 },
+  "reviews": { "rows": 77096, "columns": 10, "file_size": 2937144 },
+  "shuttles": { "rows": 77096, "columns": 13, "file_size": 4195290 },
+  "ingestion.int_typed_companies": {
+    "rows": 77096,
+    "columns": 5,
+    "file_size": 550616
+  },
+  "ingestion.int_typed_shuttles@pandas1": { "file_size": 1235685 },
+  "ingestion.int_typed_shuttles@pandas2": {
+    "rows": 77096,
+    "columns": 13,
+    "file_size": 1235685
+  },
+  "ingestion.int_typed_reviews": {
+    "rows": 55790,
+    "columns": 11,
+    "file_size": 1335600
+  },
+  "prm_shuttle_company_reviews": {
+    "rows": 29768,
+    "columns": 27,
+    "file_size": 1020356
+  },
+  "prm_spine_table": { "rows": 29768, "columns": 3, "file_size": 655994 },
+  "feature_engineering.feat_weighting_metrics": { "file_size": 0 },
+  "feature_engineering.feat_scaling_metrics": { "file_size": 0 },
+  "feature_importance_output": { "rows": 15, "columns": 2, "file_size": 455 },
+  "model_input_table": { "rows": 29768, "columns": 12, "file_size": 787351 },
+  "train_evaluation.linear_regression.regressor": { "file_size": 843 },
+  "train_evaluation.random_forest.regressor": { "file_size": 175421422 },
+  "reporting.cancellation_policy_breakdown": { "file_size": 8744 },
+  "reporting.price_histogram": { "file_size": 216598 },
+  "reporting.feature_importance": { "file_size": 8553 },
+  "reporting.cancellation_policy_grid": { "file_size": 3116 },
+  "reporting.confusion_matrix": { "file_size": 14748 },
+  "train_evaluation.linear_regression.r2_score": { "file_size": 37 },
+  "train_evaluation.random_forest.r2_score": { "file_size": 36 },
+  "train_evaluation.linear_regression.experiment_params": { "file_size": 102 },
+  "train_evaluation.random_forest.experiment_params": { "file_size": 338 },
+  "params:ingestion.typing.reviews.columns_as_floats": {
+    "rows": 1,
+    "file_size": 88
+  },
+  "ingestion.prm_agg_companies": { "rows": 50098, "columns": 5 },
+  "params:feature_engineering.feature.derived": { "rows": 3, "file_size": 88 },
+  "params:feature_engineering.feature.static": { "rows": 9, "file_size": 184 },
+  "feature_engineering.feat_static_features": { "rows": 29768, "columns": 12 },
+  "feature_engineering.feat_derived_features": { "rows": 29768, "columns": 3 },
+  "params:split_options": { "rows": 3, "file_size": 232 },
+  "X_train": { "rows": 23814, "columns": 11 },
+  "y_train": { "rows": 23814, "file_size": 381024 },
+  "params:train_evaluation.model_options.linear_regression": {
+    "rows": 3,
+    "file_size": 232
+  },
+  "params:train_evaluation.model_options.random_forest": {
+    "rows": 3,
+    "file_size": 232
+  },
+  "X_test": { "rows": 5954, "columns": 11 },
+  "y_test": { "rows": 5954, "file_size": 95264 }
+}
@@ -114,6 +114,7 @@ class DataNodeMetadataAPIResponse(BaseAPIResponse):
     tracking_data: Optional[Dict]
     run_command: Optional[str]
     preview: Optional[Dict]
+    stats: Optional[Dict]
 
     class Config:
         schema_extra = {
@@ -130,6 +131,7 @@ class TranscodedDataNodeMetadataAPIReponse(BaseAPIResponse):
     original_type: str
     transcoded_types: List[str]
     run_command: Optional[str]
+    stats: Optional[Dict]
 
 
 class ParametersNodeMetadataAPIResponse(BaseAPIResponse):

@@ -49,10 +49,12 @@ async def get_single_node_metadata(node_id: str):
         return TaskNodeMetadata(node)
 
     if isinstance(node, DataNode):
-        return DataNodeMetadata(node)
+        dataset_stats = data_access_manager.get_dataset_stats(node)
+        return DataNodeMetadata(node, dataset_stats)
 
     if isinstance(node, TranscodedDataNode):
-        return TranscodedDataNodeMetadata(node)
+        dataset_stats = data_access_manager.get_dataset_stats(node)
+        return TranscodedDataNodeMetadata(node, dataset_stats)
 
     return ParametersNodeMetadata(node)
 

@@ -62,6 +62,7 @@ def __init__(self):
         )
         self.runs = RunsRepository()
         self.tracking_datasets = TrackingDatasetsRepository()
+        self.dataset_stats = {}
 
     def set_db_session(self, db_session_class: sessionmaker):
         """Set db session on repositories that need it."""
@@ -91,6 +92,25 @@ def add_pipelines(self, pipelines: Dict[str, KedroPipeline]):
             # Add the registered pipeline and its components to their repositories
             self.add_pipeline(registered_pipeline_id, pipeline)
 
+    def add_dataset_stats(self, stats_dict: Dict):
+        """Add dataset statistics (e.g. rows, columns, file_size) as a dictionary.
+        This will help in showing the relevant stats in the metadata panel.
+
+        The given dictionary replaces any previously stored statistics and is
+        kept by reference (no copy is made).
+
+        Args:
+            stats_dict: A dictionary object loaded from stats.json file in the kedro project
+        """
+
+        self.dataset_stats = stats_dict
+
+    def get_dataset_stats(self, data_node: Union[DataNode, TranscodedDataNode]) -> Dict:
+        """Returns the dataset statistics for the data node if found,
+        else returns an empty dictionary.
+
+        Args:
+            data_node: The data node for which we need the statistics;
+                its ``name`` is used as the lookup key.
+
+        Returns:
+            The statistics stored for ``data_node.name``, or ``{}`` when none exist.
+        """
+
+        return self.dataset_stats.get(data_node.name, {})
+
     def add_pipeline(self, registered_pipeline_id: str, pipeline: KedroPipeline):
         """Iterate through all the nodes and datasets in a "registered" pipeline
         and add them to relevant repositories. Take care of extracting other relevant information

@@ -6,6 +6,8 @@
 # pylint: disable=missing-function-docstring, no-else-return
 
 import base64
+import json as json_lib
+import logging
 from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
 
@@ -26,11 +28,13 @@
         plotly,
         tracking,
     )
+
 from kedro.io import DataCatalog
 from kedro.io.core import get_filepath_str
 from kedro.pipeline import Pipeline
 from semver import VersionInfo
 
+logger = logging.getLogger(__name__)
 KEDRO_VERSION = VersionInfo.parse(__version__)
 
 
@@ -54,11 +58,34 @@ def _bootstrap(project_path: Path):
         return
 
 
+def get_dataset_stats(project_path: Path):
-def get_dataset_stats(project_path: Path):
+def get_dataset_stats(project_path: Path) -> dict:
-def get_dataset_stats(project_path: Path):
+def get_dataset_stats(project_path: Path) -> dict:
+    """Return the stats saved at stats.json
+
+    Args:
+        project_path: the path where the Kedro project is located.
+    """
+    try:
+        stats_file_path = project_path / "stats.json"
+
+        if not stats_file_path.exists():
+            return {}
+
+        with open(stats_file_path, encoding="utf8") as stats_file:
+            stats = json_lib.load(stats_file)
+            return stats
+
+    except Exception as exc:  # pylint: disable=broad-exception-caught
+        logger.warning(
+            "Unable to get dataset stats from project path %s : %s", project_path, exc
+        )
+        return {}
+
+
 def load_data(
     project_path: Path,
     env: Optional[str] = None,
     extra_params: Optional[Dict[str, Any]] = None,
-) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore]:
+) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
     """Load data from a Kedro project.
     Args:
         project_path: the path where the Kedro project is located.
@@ -91,7 +118,9 @@ def load_data(
             # in case user doesn't have an active session down the line when it's first accessed.
             # Useful for users who have `get_current_session` in their `register_pipelines()`.
             pipelines_dict = dict(pipelines)
-        return catalog, pipelines_dict, session_store
+            stats_dict = dict(get_dataset_stats(project_path))
+
+        return catalog, pipelines_dict, session_store, stats_dict
     elif KEDRO_VERSION.match(">=0.17.1"):
         from kedro.framework.session import KedroSession
 
@@ -103,8 +132,9 @@ def load_data(
         ) as session:
             context = session.load_context()
             session_store = session._store
+            stats_dict = dict(get_dataset_stats(project_path))
 
-        return context.catalog, context.pipelines, session_store
+        return context.catalog, context.pipelines, session_store, stats_dict
     else:
         # Since Viz is only compatible with kedro>=0.17.0, this just matches 0.17.0
         from kedro.framework.session import KedroSession
@@ -120,8 +150,9 @@ def load_data(
         ) as session:
             context = session.load_context()
             session_store = session._store
+            stats_dict = dict(get_dataset_stats(project_path))
 
-        return context.catalog, context.pipelines, session_store
+        return context.catalog, context.pipelines, session_store, stats_dict
 
 
 # The dataset type is available as an attribute if and only if the import from kedro

@@ -0,0 +1,106 @@
+# pylint: disable=broad-exception-caught
+# pylint: disable=protected-access
+"""`kedro_viz.integrations.kedro.hooks` defines hooks to add additional
+functionalities for a kedro run."""
+
+import json as json_lib
+import logging
+import sys
+from collections import defaultdict
+from typing import Any
+
+import pandas as pd
+from kedro.framework.hooks import hook_impl
+from kedro.io import DataCatalog
+
+from kedro_viz.integrations.kedro.utils import stats_order
+from kedro_viz.models.utils import get_file_size
+
+try:
+    # kedro 0.18.11 onwards
+    from kedro.io import MemoryDataset
+except ImportError:  # pragma: no cover
+    # older versions
+    from kedro.io import MemoryDataSet as MemoryDataset
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetStatsHook:
+    """Class to collect dataset statistics during a kedro run
+    and save it to a JSON file. The class currently supports
+    (pd.DataFrame, list, dict and pd.core.series.Series) dataset instances"""
+
+    def __init__(self):
+        self._stats = defaultdict(dict)
+
+    @hook_impl
+    def after_catalog_created(self, catalog: DataCatalog):
+        """Hook to be invoked after a data catalog is created.
+        Use this hook and get the file_size for the dataset if it has filepath.
+
+        Args:
+            catalog: The catalog that was created.
+        """
+        try:
+            datasets = catalog._data_sets
+
+            for dataset_name, dataset in datasets.items():
+                if not isinstance(dataset, MemoryDataset):
+                    file_path = dataset._filepath  # noqa: no-member
+                    self._stats[dataset_name]["file_size"] = get_file_size(file_path)
+
+        except Exception as exc:  # pragma: no cover
+            logger.warning(
+                "Unable to process file_size stat for the dataset %s : %s",
+                dataset_name,
+                exc,
+            )
+
+    @hook_impl
+    def after_dataset_loaded(self, dataset_name: str, data: Any):
+        """Hook to be invoked after a dataset is loaded from the catalog.
+        Once the dataset is loaded, extract the required dataset statistics.
+        The hook currently supports (pd.DataFrame, list, dict and pd.core.series.Series)
+        dataset instances
+
+        Args:
+            dataset_name: name of the dataset that was saved to the catalog.
+            data: the actual data that was saved to the catalog.
+        """
+        try:
+            if isinstance(data, pd.DataFrame):
+                self._stats[dataset_name]["rows"] = int(data.shape[0])
+                self._stats[dataset_name]["columns"] = int(data.shape[1])
+            elif isinstance(data, (list, dict)):
+                self._stats[dataset_name]["rows"] = int(len(data))
+                self._stats[dataset_name]["file_size"] = sys.getsizeof(data)
+            elif isinstance(data, pd.core.series.Series):
+                self._stats[dataset_name]["rows"] = int(len(data))
+                self._stats[dataset_name]["file_size"] = data.memory_usage(deep=True)
+
+        except Exception as exc:  # pragma: no cover
+            logger.warning(
+                "Error creating the stats for the dataset %s : %s", dataset_name, exc
+            )
+
+    @hook_impl
+    def after_pipeline_run(self):
+        """Hook to be invoked after a pipeline runs.
+        Once the pipeline run completes, write the dataset
+        statistics to stats.json file
+
+        """
+        try:
+            with open("stats.json", "w", encoding="utf8") as file:
+                sorted_stats_data = {
+                    dataset_name: stats_order(stats)
+                    for dataset_name, stats in self._stats.items()
+                }
+                json_lib.dump(sorted_stats_data, file)
+
+        except Exception as exc:  # pragma: no cover
+            logger.warning("Error writing the stats for the pipeline: %s", exc)
+
+
+# Module-level instance of DatasetStatsHook, created at import time.
+# NOTE(review): presumably this is the object registered with kedro's hook
+# system - confirm at the registration site.
+dataset_stats_hook = DatasetStatsHook()
diff --git a/package/kedro_viz/integrations/kedro/utils.py b/package/kedro_viz/integrations/kedro/utils.py
@@ -0,0 +1,17 @@
+"""`kedro_viz.integrations.kedro.utils` contains utility
+functions used in the `kedro_viz.integrations.kedro` package"""
+
+from typing import Dict
+
+
+def stats_order(stats: Dict):
+    """Sort the stats extracted from the datasets using the sort order
+
+    Args:
+        stats: A dictionary of statistics for a dataset
+
+    Returns: A sorted dictionary based on the sort_order
+    """
+    # Custom sort order
+    sort_order = ["rows", "columns", "file_size"]
+    return {stat: stats.get(stat) for stat in sort_order if stat in stats}