diff --git a/etl/helpers.py b/etl/helpers.py index 74af9bff9b8..946a884f097 100644 --- a/etl/helpers.py +++ b/etl/helpers.py @@ -8,6 +8,7 @@ import tempfile from collections.abc import Generator from contextlib import contextmanager +from functools import cache from pathlib import Path from typing import Any, Dict, Iterable, Iterator, List, Literal, Optional, Union, cast from urllib.parse import urljoin @@ -377,6 +378,10 @@ class WrongStepName(ExceptionFromDocstring): """Wrong step name. If this step was in the dag, it should be corrected.""" +# loading DAG can take up to 1 second, so cache it +load_dag_cached = cache(load_dag) + + class PathFinder: """Helper object with naming conventions. It uses your module path (__file__) and extracts from it commonly used attributes like channel / namespace / version / short_name or @@ -414,9 +419,9 @@ def dag(self): """Lazy loading of DAG.""" if self._dag is None: if "/archive/" in str(self.f): - self._dag = load_dag(paths.DAG_ARCHIVE_FILE) + self._dag = load_dag_cached(paths.DAG_ARCHIVE_FILE) else: - self._dag = load_dag() + self._dag = load_dag_cached() return self._dag @property diff --git a/etl/steps/__init__.py b/etl/steps/__init__.py index c4416ea8d2a..711a36042f9 100644 --- a/etl/steps/__init__.py +++ b/etl/steps/__init__.py @@ -14,7 +14,6 @@ from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field -from functools import cache from glob import glob from importlib import import_module from pathlib import Path @@ -131,7 +130,6 @@ def traverse(graph: Graph, nodes: Set[str]) -> Graph: return dict(reachable) -@cache def load_dag(filename: Union[str, Path] = paths.DEFAULT_DAG_FILE) -> Dict[str, Any]: return _load_dag(filename, {}) @@ -517,8 +515,7 @@ def _output_dataset(self) -> catalog.Dataset: return catalog.Dataset(self._dest_dir.as_posix()) def checksum_output(self) -> str: - # output checksum is checksum of all ingredients - return self.checksum_input() + return self._output_dataset.checksum() def _step_files(self) -> List[str]: "Return a list of code files defining this step." @@ -714,7 +711,12 @@ def has_existing_data(self) -> bool: return True def checksum_output(self) -> str: - return Snapshot(self.path).m.outs[0]["md5"] + # NOTE: we could use the checksum from `_dvc_path` to + # speed this up. Test the performance on + # time poetry run etl run garden --dry-run + # Make sure that the checksum below is the same as DVC checksum! It + # looks like it might be different for some reason + return files.checksum_file(self._dvc_path) @property def _dvc_path(self) -> str: