From 292722913845e82a66a0554a115176f0e8e287b6 Mon Sep 17 00:00:00 2001 From: Jan-Thorsten Peter Date: Tue, 12 Jul 2022 15:47:13 +0200 Subject: [PATCH 1/4] Path sis_hash is now cached and used for __lt__, and __eq__. --- sisyphus/job_path.py | 51 +++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/sisyphus/job_path.py b/sisyphus/job_path.py index e155f74..fd7b27e 100644 --- a/sisyphus/job_path.py +++ b/sisyphus/job_path.py @@ -70,6 +70,7 @@ def __init__(self, path, creator=None, cached=False, hash_overwrite=None, tags=N self._tags = tags self._available = available + self._sis_hash_cache = None def keep_value(self, value): if self.creator: @@ -103,22 +104,24 @@ def add_user(self, user): self.users.add(user) def _sis_hash(self): - if self.hash_overwrite is None: - creator = self.creator - path = self.path - else: - overwrite = self.hash_overwrite - assert_msg = "sis_hash for path must be str or tuple of length 2" - if isinstance(overwrite, tuple): - assert len(overwrite) == 2, assert_msg - creator, path = overwrite + if not self._sis_hash_cache: + if self.hash_overwrite is None: + creator = self.creator + path = self.path else: - assert isinstance(overwrite, str), assert_msg - creator = None - path = overwrite - if hasattr(creator, '_sis_id'): - creator = os.path.join(creator._sis_id(), gs.JOB_OUTPUT) - return b'(Path, ' + tools.sis_hash_helper((creator, path)) + b')' + overwrite = self.hash_overwrite + assert_msg = "sis_hash for path must be str or tuple of length 2" + if isinstance(overwrite, tuple): + assert len(overwrite) == 2, assert_msg + creator, path = overwrite + else: + assert isinstance(overwrite, str), assert_msg + creator = None + path = overwrite + if hasattr(creator, '_sis_id'): + creator = os.path.join(creator._sis_id(), gs.JOB_OUTPUT) + self._sis_hash_cache = b'(Path, ' + tools.sis_hash_helper((creator, path)) + b')' + return self._sis_hash_cache @finished_results_cache.caching(get_key=lambda self, debug_info=None: ('available', self.rel_path())) def available(self, debug_info=None): @@ -192,19 +195,7 @@ def __lt__(self, other): if not isinstance(other, AbstractPath): assert False, "Cannot compare path to none path" - def creator_to_str(c): - if isinstance(c, str): - return c - elif hasattr(c, '_sis_id'): - return c._sis_id() - elif c is None: - return str(c) - else: - assert False, "User of path is not a job" - - s = "%s %s" % (creator_to_str(self.creator), self.path) - o = "%s %s" % (creator_to_str(other.creator), other.path) - return s < o + return self._sis_hash() < other._sis_hash() def __eq__(self, other): if type(self) != type(other): @@ -214,9 +205,7 @@ def __eq__(self, other): if len(self.__dict__) == len(other.__dict__) == 0: return True - creator_equal = self.creator == other.creator - path_equal = self.path == other.path - return creator_equal and path_equal + return self._sis_hash() == other._sis_hash() def __hash__(self): # TODO Check how uninitialized object should behave here From a7ab09df5df3651423cdb1769f2260dff11de0b5 Mon Sep 17 00:00:00 2001 From: Jan-Thorsten Peter Date: Tue, 12 Jul 2022 16:09:09 +0200 Subject: [PATCH 2/4] path __hash__ now uses _sis_hash if available --- sisyphus/job_path.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sisyphus/job_path.py b/sisyphus/job_path.py index fd7b27e..40fdfb1 100644 --- a/sisyphus/job_path.py +++ b/sisyphus/job_path.py @@ -208,9 +208,11 @@ def __eq__(self, other): return self._sis_hash() == other._sis_hash() def __hash__(self): - # TODO Check how uninitialized object should behave here - return hash((self.__dict__.get('creator'), - self.__dict__.get('path'))) + if hasattr(self, '_sis_hash_cache'): + # Add prefix to avoid collision with sis_hash string + return hash(b'HASH # ' + self._sis_hash()) + else: + return super().__hash__() def __getstate__(self): """ Skips exporting users From d855200c394419d7ba9ff297c304e7b4f310d0c3 Mon Sep 17 00:00:00 2001 From: Jan-Thorsten Peter Date: Thu, 14 Jul 2022 09:10:27 +0200 Subject: [PATCH 3/4] Remove sis_hash_cache from Path again --- sisyphus/job_path.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/sisyphus/job_path.py b/sisyphus/job_path.py index 40fdfb1..2f06be8 100644 --- a/sisyphus/job_path.py +++ b/sisyphus/job_path.py @@ -70,7 +70,6 @@ def __init__(self, path, creator=None, cached=False, hash_overwrite=None, tags=N self._tags = tags self._available = available - self._sis_hash_cache = None def keep_value(self, value): if self.creator: @@ -104,24 +103,22 @@ def add_user(self, user): self.users.add(user) def _sis_hash(self): - if not self._sis_hash_cache: - if self.hash_overwrite is None: - creator = self.creator - path = self.path + if self.hash_overwrite is None: + creator = self.creator + path = self.path + else: + overwrite = self.hash_overwrite + assert_msg = "sis_hash for path must be str or tuple of length 2" + if isinstance(overwrite, tuple): + assert len(overwrite) == 2, assert_msg + creator, path = overwrite else: - overwrite = self.hash_overwrite - assert_msg = "sis_hash for path must be str or tuple of length 2" - if isinstance(overwrite, tuple): - assert len(overwrite) == 2, assert_msg - creator, path = overwrite - else: - assert isinstance(overwrite, str), assert_msg - creator = None - path = overwrite - if hasattr(creator, '_sis_id'): - creator = os.path.join(creator._sis_id(), gs.JOB_OUTPUT) - self._sis_hash_cache = b'(Path, ' + tools.sis_hash_helper((creator, path)) + b')' - return self._sis_hash_cache + assert isinstance(overwrite, str), assert_msg + creator = None + path = overwrite + if hasattr(creator, '_sis_id'): + creator = os.path.join(creator._sis_id(), gs.JOB_OUTPUT) + return b'(Path, ' + tools.sis_hash_helper((creator, path)) + b')' @finished_results_cache.caching(get_key=lambda self, debug_info=None: ('available', self.rel_path())) def available(self, debug_info=None): From 204c2d3d6ec4c164c4b0e2af085e96b4eb884356 Mon Sep 17 00:00:00 2001 From: Jan-Thorsten Peter Date: Thu, 14 Jul 2022 09:31:38 +0200 Subject: [PATCH 4/4] Make using sis_hash for comparison an option --- sisyphus/global_settings.py | 4 ++++ sisyphus/job_path.py | 35 +++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/sisyphus/global_settings.py b/sisyphus/global_settings.py index 19ae820..adf18f3 100644 --- a/sisyphus/global_settings.py +++ b/sisyphus/global_settings.py @@ -163,6 +163,10 @@ def file_caching(path): #: Default function to hash jobs and objects SIS_HASH = sisyphus.hash.short_hash +#: Use the sis hash to compare Paths and to compute __hash__. The original behavior was not based on sis_hash +#: which might lead to Paths with the same sis_hash to not be considered equal. +USE_SIS_HASH_FOR_PATH_COMPARISON = False + #: List of paths searched for loading config and recipe files. The module name should be part of the path e.g.: #: adding 'config' will cause Sisyphus to the current directory for a folder named config to load modules starting #: with config, other python files in the current directory will be ignored. diff --git a/sisyphus/job_path.py b/sisyphus/job_path.py index 2f06be8..4de6d70 100644 --- a/sisyphus/job_path.py +++ b/sisyphus/job_path.py @@ -192,7 +192,22 @@ def __lt__(self, other): if not isinstance(other, AbstractPath): assert False, "Cannot compare path to none path" - return self._sis_hash() < other._sis_hash() + if gs.USE_SIS_HASH_FOR_PATH_COMPARISON: + return self._sis_hash() < other._sis_hash() + + def creator_to_str(c): + if isinstance(c, str): + return c + elif hasattr(c, '_sis_id'): + return c._sis_id() + elif c is None: + return str(c) + else: + assert False, "User of path is not a job" + + s = "%s %s" % (creator_to_str(self.creator), self.path) + o = "%s %s" % (creator_to_str(other.creator), other.path) + return s < o def __eq__(self, other): if type(self) != type(other): @@ -202,14 +217,22 @@ def __eq__(self, other): if len(self.__dict__) == len(other.__dict__) == 0: return True - return self._sis_hash() == other._sis_hash() + if gs.USE_SIS_HASH_FOR_PATH_COMPARISON: + return self._sis_hash() == other._sis_hash() + + creator_equal = self.creator == other.creator + path_equal = self.path == other.path + return creator_equal and path_equal def __hash__(self): - if hasattr(self, '_sis_hash_cache'): - # Add prefix to avoid collision with sis_hash string - return hash(b'HASH # ' + self._sis_hash()) + if gs.USE_SIS_HASH_FOR_PATH_COMPARISON: + if hasattr(self, 'creator'): # uninitialized object have no creator and calling sis_hash would fail + # Add prefix to avoid collision with sis_hash string + return hash(b'HASH # ' + self._sis_hash()) + else: + return super().__hash__() else: - return super().__hash__() + return hash((self.__dict__.get('creator'), self.__dict__.get('path'))) def __getstate__(self): """ Skips exporting users