Improve aggregation and status performance (no conditions). (#818)
* Use cached_statepoint.

* Require signac 2.2.0 for cached_statepoint.

* Cache the list of job ids while buffered.

This allows faster `job in project` tests and iteration over jobs.

Also remove some expensive `open_job` calls and `job in project` checks that are
not needed while registering aggregates.

* Do not iterate over all jobs for labels when there are no labels defined.

This saves a small amount of absolute time in projects with no labels. It also gives
the *appearance* of faster status checks, as the user sees only one progress bar.

Also, hide the "labels" section of the status output when there are no labels to show.

* Run pre-commit.

* Suggest cached_statepoint usage in pre conditions (see the sketch below).

* Update change log.

* Fix typo.

---------

Co-authored-by: Corwin Kerr <[email protected]>
joaander and cbkerr authored Feb 15, 2024
1 parent 85b3b9b commit a5648af
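The precondition suggestion in the commit message can be illustrated with a minimal sketch. The project class, operation name, statepoint key, and threshold below are hypothetical examples, not part of this commit:

```python
from flow import FlowProject


class Project(FlowProject):
    pass


# job.cached_statepoint reads from signac's project-wide statepoint cache,
# so evaluating this condition does not open each job's statepoint file.
@Project.pre(lambda job: job.cached_statepoint["temperature"] > 0)
@Project.operation
def simulate(job):
    print(f"simulating {job.id}")
```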
Showing 6 changed files with 69 additions and 34 deletions.
2 changes: 2 additions & 0 deletions changelog.txt
@@ -25,6 +25,8 @@ Changed
 +++++++

 - Move "Submit command" comment to end of pretend output (#805).
+- Improve aggregate registration performance (#818).
+- Hide empty "Labels" section in status output when there are no labels (#818).

 Removed
 +++++++
31 changes: 11 additions & 20 deletions flow/aggregates.py
@@ -235,19 +235,19 @@ def foo(*jobs):
             if default is None:

                 def keyfunction(job):
-                    return job.statepoint[key]
+                    return job.cached_statepoint[key]

             else:

                 def keyfunction(job):
-                    return job.statepoint.get(key, default)
+                    return job.cached_statepoint.get(key, default)

         elif isinstance(key, Iterable):
             keys = list(key)
             if default is None:

                 def keyfunction(job):
-                    return [job.statepoint[key] for key in keys]
+                    return [job.cached_statepoint[key] for key in keys]

             else:
                 if isinstance(default, Iterable):
@@ -264,7 +264,7 @@ def keyfunction(job):

                 def keyfunction(job):
                     return [
-                        job.statepoint.get(key, default_value)
+                        job.cached_statepoint.get(key, default_value)
                         for key, default_value in zip(keys, default)
                     ]

@@ -430,11 +430,6 @@ def _register_aggregates(self):
         # Initialize the internal mapping from id to aggregate
         self._aggregates_by_id = {}
         for aggregate in self._generate_aggregates():
-            for job in aggregate:
-                if job not in self._project:
-                    raise LookupError(
-                        f"The signac job {job.id} not found in {self._project}"
-                    )
             try:
                 stored_aggregate = tuple(aggregate)
             except TypeError:  # aggregate is not iterable
@@ -456,7 +451,7 @@ def _generate_aggregates(self):
         else:

             def sort_function(job):
-                return job.statepoint[self._aggregator._sort_by]
+                return job.cached_statepoint[self._aggregator._sort_by]

         jobs = sorted(
             jobs,
@@ -517,14 +512,7 @@ def __contains__(self, id):
             The job id.

         """
-        try:
-            self._project.open_job(id=id)
-        except KeyError:
-            return False
-        except LookupError:
-            raise
-        else:
-            return True
+        return self._project._contains_job_id(job_id=id)

     def __len__(self):
         return len(self._project)
@@ -538,8 +526,11 @@ def __hash__(self):
         return hash(self._project_repr)

     def keys(self):
-        for job in self._project:
-            yield job.id
+        if self._project._is_buffered:
+            return self._project._jobs_cursor._ids
+        else:
+            for job in self._project:
+                yield job.id

     def values(self):
         for job in self._project:
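The `keys()` change above returns the cursor's precomputed list of ids while the project is buffered. A simplified standalone sketch of why a cached id collection makes membership tests and iteration cheap (an illustrative model, not flow's actual classes):

```python
# Illustrative model of a buffered id cache (hypothetical class, not flow's
# aggregate store): a precomputed set gives O(1) membership tests, while the
# unbuffered path must consult the project on disk for every check.
class BufferedIdView:
    def __init__(self, job_ids):
        self._ids = list(job_ids)  # preserves iteration order
        self._id_set = set(job_ids)  # constant-time membership tests

    def __contains__(self, job_id):
        return job_id in self._id_set

    def __iter__(self):
        return iter(self._ids)


view = BufferedIdView(["0d32543f785d3459f27b8746f2053824"])
print("0d32543f785d3459f27b8746f2053824" in view)  # True, without disk access
```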
60 changes: 50 additions & 10 deletions flow/project.py
@@ -1273,6 +1273,11 @@ def hi_all(*jobs):
             are used by :meth:`~.detect_operation_graph` when comparing
             conditions for equality. The tag defaults to the bytecode of the
             function.
+
+        .. tip::
+
+            Use ``job.cached_statepoint`` for the best performance in preconditions
+            that depend on the job's statepoint.
         """

         _parent_class = parent_class
@@ -1746,6 +1751,9 @@ def __init__(self, path=None, environment=None, entrypoint=None):
             format_checker=jsonschema.Draft7Validator.FORMAT_CHECKER,
         )

+        self._is_buffered = False
+        self._jobs_cursor = None
+
         # Associate this class with a compute environment.
         self._environment = environment or get_environment()
@@ -1777,6 +1785,27 @@ def __init__(self, path=None, environment=None, entrypoint=None):
         self._group_to_aggregate_store = _bidict()
         self._register_groups()

+    def __iter__(self):
+        """Provide a cached view of jobs while in a buffered state."""
+        if self._is_buffered:
+            return iter(self._jobs_cursor)
+        else:
+            return super().__iter__()
+
+    def __len__(self):
+        """Provide a cached view of jobs while in a buffered state."""
+        if self._is_buffered:
+            return len(self._jobs_cursor._ids)
+        else:
+            return super().__len__()
+
+    def _contains_job_id(self, job_id):
+        """Provide a cached view of jobs while in a buffered state."""
+        if self._is_buffered:
+            return job_id in self._jobs_cursor._id_set
+        else:
+            return super()._contains_job_id(job_id)
+
     def _setup_template_environment(self):
         """Set up the jinja2 template environment.
@@ -2762,14 +2791,17 @@ def compute_status(data):
             self._get_job_labels,
             ignore_errors=ignore_errors,
         )
-        job_labels = list(
-            parallel_executor(
-                compute_labels,
-                individual_jobs,
-                desc="Fetching labels",
-                file=err,
+        if len(self._label_functions) > 0:
+            job_labels = list(
+                parallel_executor(
+                    compute_labels,
+                    individual_jobs,
+                    desc="Fetching labels",
+                    file=err,
+                )
             )
-        )
+        else:
+            job_labels = []

         def combine_group_and_operation_status(aggregate_status_results):
             group_statuses = {}
@@ -3113,10 +3145,10 @@ def display_group_name(group_name):
                 {
                     key
                     for job in individual_jobs
-                    for key in job.statepoint.keys()
+                    for key in job.cached_statepoint.keys()
                     if len(
                         {
-                            _to_hashable(job.statepoint().get(key))
+                            _to_hashable(job.cached_statepoint.get(key))
                             for job in individual_jobs
                         }
                     )
@@ -3156,7 +3188,7 @@ def dotted_get(mapping, key):
                 else:
                     parameter_name = parameter
                 if statepoint is None:
-                    statepoint = job.statepoint()
+                    statepoint = job.cached_statepoint
                 status["parameters"][parameter] = shorten(
                     str(self._alias(dotted_get(statepoint, parameter_name))),
                     param_max_width,
@@ -3981,10 +4013,18 @@ def _convert_jobs_to_aggregates(self, jobs):
     def _buffered(self):
         """Enable the use of buffered mode for certain functions."""
         logger.debug("Entering buffered mode.")
+
+        self._jobs_cursor = self.find_jobs()
+        self._is_buffered = True
+
         with signac.buffered():
             yield
+
         logger.debug("Exiting buffered mode.")
+
+        self._is_buffered = False
+        self._jobs_cursor = None

     def _generate_submit_script(
         self, _id, operations, template, show_template_help, **kwargs
     ):
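The `_buffered` context manager above computes a jobs cursor once and reuses it for the duration of the block. A minimal standalone sketch of the same pattern (a hypothetical class; the `try`/`finally` is an extra safety measure this sketch adds, not something shown in the diff):

```python
import contextlib


class Cache:
    """Toy stand-in for a project that can cache an expensive query."""

    def __init__(self):
        self._is_buffered = False
        self._cursor = None

    def _find_items(self):
        return ["a", "b", "c"]  # stand-in for an expensive lookup

    @contextlib.contextmanager
    def buffered(self):
        # Compute the expensive view once and flag the buffered state.
        self._cursor = self._find_items()
        self._is_buffered = True
        try:
            yield
        finally:
            # Drop the cache even if the block raises.
            self._is_buffered = False
            self._cursor = None


cache = Cache()
with cache.buffered():
    print(cache._cursor)  # consumers reuse the cached view here
```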
6 changes: 4 additions & 2 deletions flow/templates/base_status.jinja
@@ -18,11 +18,13 @@
 Overview: {{ total_num_jobs_or_aggregates }} jobs/aggregates, {{ total_num_eligible_jobs_or_aggregates }} jobs/aggregates with eligible operations.

 {% block progress %}
+{% if progress_sorted|length > 0 %}
 | label | ratio |
 | ----- | ----- |
-{% for label in progress_sorted %}
+  {% for label in progress_sorted %}
 | {{ label[0] }} | {{ label[1]|draw_progress_bar(total_num_job_labels, '\\') }} |
-{% endfor %}
+  {% endfor %}
+{% endif %}
 {% endblock progress %}

 {% block operation_summary %}
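The template guard above hides the label table when `progress_sorted` is empty. A small standalone jinja2 sketch of the same idea (a simplified template, not the real base_status.jinja):

```python
import jinja2

template = jinja2.Template(
    "{% if labels|length > 0 %}"
    "| label | ratio |\n"
    "| ----- | ----- |\n"
    "{% for name, ratio in labels %}"
    "| {{ name }} | {{ ratio }} |\n"
    "{% endfor %}"
    "{% endif %}"
)

print(template.render(labels=[]))  # renders nothing: the section is hidden
print(template.render(labels=[("done", "50%")]))  # renders the table
```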
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -31,7 +31,7 @@ classifiers = [
 ]
 dependencies = [
     # The core package.
-    "signac>=2.0.0",
+    "signac>=2.2.0",
     # For the templated generation of (submission) scripts.
     "jinja2>=3.0.0",
     # To enable the parallelized execution of operations across processes.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-signac>=2.0.0
+signac>=2.2.0
 jinja2>=3.0.0
 cloudpickle>=1.6.0
 deprecation>=2.0.0
