From ca6512030c6e51cab7fed8a8ddfdb95ef74c7699 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 15:53:37 -0600 Subject: [PATCH 01/15] Add, test unordered_hash --- pytools/__init__.py | 36 ++++++++++++++++++++++++++++++++++++ test/test_pytools.py | 20 ++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/pytools/__init__.py b/pytools/__init__.py index ec878b57..e7d2306a 100644 --- a/pytools/__init__.py +++ b/pytools/__init__.py @@ -169,6 +169,11 @@ .. autofunction:: resolve_name +Hashing +------- + +.. autofunction:: unordered_hash + Type Variables Used ------------------- @@ -2605,6 +2610,37 @@ def resolve_name(name): # }}} +# {{{ unordered_hash + +def unordered_hash(hash_constructor, iterable): + """Using a hash algorithm given by the parameter-less constructor + *hash_constructor*, return a hash object whose internal state + depends on the entries of *iterable*, but not their order. If *hash* + is the instance returned by evaluating ``hash_constructor()``, then + the each entry *i* of the iterable must permit ``hash.upate(i)`` to + succeed. An example of *hash_constructor* is :class:`hashlib.sha256`. + ``hash.digest_size`` must also be defined. + + .. warning:: + + The construction used in this function is likely not cryptographically + secure. Do not use this function in a security-relevant context. + + .. 
versionadded:: 2021.2 + """ + h_int = 0 + for i in iterable: + h_i = hash_constructor() + h_i.update(i) + h_int = h_int ^ int.from_bytes(h_i.digest(), sys.byteorder) + + h = hash_constructor() + h.update(h_int.to_bytes(h.digest_size, sys.byteorder)) + return h + +# }}} + + def _test(): import doctest doctest.testmod() diff --git a/test/test_pytools.py b/test/test_pytools.py index 87b5d3a7..59cfc92b 100644 --- a/test/test_pytools.py +++ b/test/test_pytools.py @@ -370,6 +370,26 @@ class BestInClassRibbon(FairRibbon, UniqueTag): t4.without_tags(red_ribbon) +def test_unordered_hash(): + import random + import hashlib + + lst = [random.randbytes(20) for _ in range(200)] + lorig = lst[:] + random.shuffle(lst) + + from pytools import unordered_hash + assert (unordered_hash(hashlib.sha256, lorig).digest() + == unordered_hash(hashlib.sha256, lst).digest()) + assert (unordered_hash(hashlib.sha256, lorig).digest() + == unordered_hash(hashlib.sha256, lorig).digest()) + assert (unordered_hash(hashlib.sha256, lorig).digest() + != unordered_hash(hashlib.sha256, lorig[:-1]).digest()) + lst[0] = b"aksdjfla;sdfjafd" + assert (unordered_hash(hashlib.sha256, lorig).digest() + != unordered_hash(hashlib.sha256, lst).digest()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From 03ae1f08425912feb0d626038ede68392f90741f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 15:53:51 -0600 Subject: [PATCH 02/15] Add license header to test_pytools --- test/test_pytools.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/test_pytools.py b/test/test_pytools.py index 59cfc92b..f9b33a8a 100644 --- a/test/test_pytools.py +++ b/test/test_pytools.py @@ -1,3 +1,26 @@ +__copyright__ = "Copyright (C) 2009-2021 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without 
restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + import sys import pytest From 773dd380ca36096549f327306f176fa49863ed86 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 15:55:10 -0600 Subject: [PATCH 03/15] Add persistent_dict.KeyBuilder.new_hash for hash alg customization --- pytools/persistent_dict.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index c33f2404..c559299e 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -173,6 +173,10 @@ def error_clean_up(self): # {{{ key generation class KeyBuilder: + # this exists so that we can (conceivably) switch algorithms at some point + # down the road + new_hash = hashlib.sha256 + def rec(self, key_hash, key): digest = None @@ -187,7 +191,7 @@ def rec(self, key_hash, key): except AttributeError: pass else: - inner_key_hash = hashlib.sha256() + inner_key_hash = self.new_hash() method(inner_key_hash, self) digest = inner_key_hash.digest() @@ -205,7 +209,7 @@ def rec(self, key_hash, key): method = self.update_for_specific_dtype if method is not None: - inner_key_hash = hashlib.sha256() + 
inner_key_hash = self.new_hash() method(inner_key_hash, key) digest = inner_key_hash.digest() @@ -224,7 +228,7 @@ def rec(self, key_hash, key): key_hash.update(digest) def __call__(self, key): - key_hash = hashlib.sha256() + key_hash = self.new_hash() self.rec(key_hash, key) return key_hash.hexdigest() From 5b4aa1e29a42a132d497ca7cdb7a6a6640ce2561 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 15:56:12 -0600 Subject: [PATCH 04/15] Document persistent_dict.KeyBuilder --- pytools/persistent_dict.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index c559299e..0047aa93 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -173,6 +173,22 @@ def error_clean_up(self): # {{{ key generation class KeyBuilder: + """A (stateless) object that computes hashes of objects fed to it. Subclassing + this class permits customizing the computation of hash keys. + + .. automethod:: __call__ + .. automethod:: rec + .. staticmethod:: new_hash() + + Return a new hash instance following the protocol of the ones + from :mod:`hashlib`. This will permit switching to different + hash algorithms in the future. Subclasses are expected to use + this to create new hashes. Not doing so is deprecated and + may stop working as early as 2022. + + .. 
versionadded:: 2021.1.3 + """ + # this exists so that we can (conceivably) switch algorithms at some point # down the road new_hash = hashlib.sha256 From 48c9204145e01340d1a41504c54b65032a0d5cc8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 15:56:44 -0600 Subject: [PATCH 05/15] KeyBuilder.rec: return updated key_hash --- pytools/persistent_dict.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index 0047aa93..9e961a33 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -194,6 +194,16 @@ class KeyBuilder: new_hash = hashlib.sha256 def rec(self, key_hash, key): + """ + :arg key_hash: the hash object to be updated with the hash of *key*. + :arg key: the (immutable) Python object to be hashed. + :returns: the updated *key_hash* + + .. versionchanged:: 2021.2 + + Now returns the updated *key_hash*. + """ + digest = None try: @@ -242,6 +252,7 @@ def rec(self, key_hash, key): pass key_hash.update(digest) + return key_hash def __call__(self, key): key_hash = self.new_hash() From 877bc5c50132e662860102abcb00be0646819780 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 16:05:34 -0600 Subject: [PATCH 06/15] Use (hopefully) faster hash functions in KeyBuilder for int, float --- pytools/persistent_dict.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index 9e961a33..7836b44e 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -263,14 +263,21 @@ def __call__(self, key): @staticmethod def update_for_int(key_hash, key): - key_hash.update(str(key).encode("utf8")) + sz = 8 + while True: + try: + key_hash.update(key.to_bytes(sz, byteorder="little", signed=True)) + return + except OverflowError: + sz *= 2 - update_for_long = update_for_int - update_for_bool = update_for_int + @staticmethod + def update_for_bool(key_hash, key): + 
key_hash.update(str(key).encode("utf8")) @staticmethod def update_for_float(key_hash, key): - key_hash.update(repr(key).encode("utf8")) + key_hash.update(key.hex().encode("utf8")) @staticmethod def update_for_str(key_hash, key): From 487362e29ca6a7cbbbb9e1d5e1aa1859ec8e92cf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 16:06:16 -0600 Subject: [PATCH 07/15] Use unordered_hash in KeyBuilder hashing frozenset --- pytools/persistent_dict.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index 7836b44e..f4d3bf20 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -292,8 +292,13 @@ def update_for_tuple(self, key_hash, key): self.rec(key_hash, obj_i) def update_for_frozenset(self, key_hash, key): - for set_key in sorted(key): - self.rec(key_hash, set_key) + from pytools import unordered_hash + + self.rec(key_hash, + unordered_hash( + self.new_hash, + (self.rec(self.new_hash(), key_i).digest() for key_i in key) + ).digest()) @staticmethod def update_for_NoneType(key_hash, key): # noqa From cf2d2779cc783d83943efda3102002eff45f29f0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 16:06:41 -0600 Subject: [PATCH 08/15] Bump persistent_dict version --- pytools/persistent_dict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index f4d3bf20..df9f2f5b 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -469,7 +469,7 @@ def __init__(self, identifier, key_builder=None, container_dir=None): import appdirs container_dir = join( appdirs.user_cache_dir("pytools", "pytools"), - "pdict-v3-{}-py{}".format( + "pdict-v4-{}-py{}".format( identifier, ".".join(str(i) for i in sys.version_info))) From ae733c2160e0e1cac032ade1e3279ee9d4f24156 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 16:06:55 -0600 Subject: [PATCH 
09/15] Test persistent_dict on negative integers, frozensets --- test/test_persistent_dict.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_persistent_dict.py b/test/test_persistent_dict.py index bf1fd543..050d9d95 100644 --- a/test/test_persistent_dict.py +++ b/test/test_persistent_dict.py @@ -58,7 +58,8 @@ def rand_str(n=20): for i in range(n)) keys = [ - (randrange(2000), rand_str(), None, SomeTag(rand_str())) + (randrange(2000)-1000, rand_str(), None, SomeTag(rand_str()), + frozenset({"abc", 123})) for i in range(20)] values = [randrange(2000) for i in range(20)] From b45524f23b513c04ff8e6f249e0c2fa502eb66b1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 16:07:17 -0600 Subject: [PATCH 10/15] Bump version --- pytools/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytools/version.py b/pytools/version.py index 8b36d4f9..b431cafd 100644 --- a/pytools/version.py +++ b/pytools/version.py @@ -1,3 +1,3 @@ -VERSION = (2021, 1, 2) +VERSION = (2021, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS From 4b22ac110c311decdce3e262ba6c9f4ba32e10b1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 16:17:01 -0600 Subject: [PATCH 11/15] test_unordered_hash: randbytes is Py3.9+, replace --- test/test_pytools.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_pytools.py b/test/test_pytools.py index f9b33a8a..f3231c6f 100644 --- a/test/test_pytools.py +++ b/test/test_pytools.py @@ -397,7 +397,9 @@ def test_unordered_hash(): import random import hashlib - lst = [random.randbytes(20) for _ in range(200)] + # FIXME: Use randbytes once >=3.9 is OK + lst = [bytes([random.randrange(256) for _ in range(20)]) + for _ in range(200)] lorig = lst[:] random.shuffle(lst) From 6b3adea9a3a206538094a82a9ef2a96a16c39db6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 16:20:21 -0600 Subject: [PATCH
12/15] Fix doc references in unordered_hash --- pytools/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytools/__init__.py b/pytools/__init__.py index e7d2306a..6330869c 100644 --- a/pytools/__init__.py +++ b/pytools/__init__.py @@ -2618,8 +2618,8 @@ def unordered_hash(hash_constructor, iterable): depends on the entries of *iterable*, but not their order. If *hash* is the instance returned by evaluating ``hash_constructor()``, then the each entry *i* of the iterable must permit ``hash.upate(i)`` to - succeed. An example of *hash_constructor* is :class:`hashlib.sha256`. - ``hash.digest_size`` must also be defined. + succeed. An example of *hash_constructor* is ``hashlib.sha256`` + from :mod:`hashlib`. ``hash.digest_size`` must also be defined. .. warning:: From 74595b24e187d0c643c63639588eaba8578069eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 10 Mar 2021 15:41:10 -0600 Subject: [PATCH 13/15] unordered_hash: Explain why sys-independent despite use of sys.byteorder --- pytools/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pytools/__init__.py b/pytools/__init__.py index 6330869c..ae06dd17 100644 --- a/pytools/__init__.py +++ b/pytools/__init__.py @@ -2632,6 +2632,11 @@ def unordered_hash(hash_constructor, iterable): for i in iterable: h_i = hash_constructor() h_i.update(i) + # Using sys.byteorder (for efficiency) makes the intermediate integer + # system-dependent, but not the resulting hash: XOR combines bits + # position by position, so the byte order chosen here is exactly + # undone by the to_bytes conversion below (the XOR operations do not + # mix adjacent bits).
h_int = h_int ^ int.from_bytes(h_i.digest(), sys.byteorder) h = hash_constructor() From 99b513796a7d02b506909f5e8266e9956a73365d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 10 Mar 2021 15:42:16 -0600 Subject: [PATCH 14/15] Fix versionadded for KeyBuilder.new_hash --- pytools/persistent_dict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index df9f2f5b..f48027fd 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -186,7 +186,7 @@ class KeyBuilder: this to create new hashes. Not doing so is deprecated and may stop working as early as 2022. - .. versionadded:: 2021.1.3 + .. versionadded:: 2021.2 """ # this exists so that we can (conceivably) switch algorithms at some point From d9d009b981a75f888b4bcac32a4e0c8637ef1294 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 10 Mar 2021 16:17:28 -0600 Subject: [PATCH 15/15] unordered_hash: update a hash instance, instead of forcing creation of a new one --- pytools/__init__.py | 17 +++++++++++++---- pytools/persistent_dict.py | 8 +++----- test/test_pytools.py | 16 ++++++++-------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/pytools/__init__.py b/pytools/__init__.py index c8cfa90b..f1f1ebf3 100644 --- a/pytools/__init__.py +++ b/pytools/__init__.py @@ -2623,7 +2623,7 @@ def resolve_name(name): # {{{ unordered_hash -def unordered_hash(hash_constructor, iterable): +def unordered_hash(hash_instance, iterable, hash_constructor=None): """Using a hash algorithm given by the parameter-less constructor *hash_constructor*, return a hash object whose internal state depends on the entries of *iterable*, but not their order. If *hash* @@ -2631,6 +2631,10 @@ def unordered_hash(hash_constructor, iterable): the each entry *i* of the iterable must permit ``hash.upate(i)`` to succeed. An example of *hash_constructor* is ``hashlib.sha256`` from :mod:`hashlib`. 
``hash.digest_size`` must also be defined. + If *hash_constructor* is not provided, ``hash_instance.name`` is + used to deduce it. + + :returns: the updated *hash_instance*. .. warning:: @@ -2639,6 +2643,12 @@ def unordered_hash(hash_constructor, iterable): .. versionadded:: 2021.2 """ + + if hash_constructor is None: + from functools import partial + import hashlib + hash_constructor = partial(hashlib.new, hash_instance.name) + h_int = 0 for i in iterable: h_i = hash_constructor() @@ -2650,9 +2660,8 @@ def unordered_hash(hash_constructor, iterable): # mix adjacent bits). h_int = h_int ^ int.from_bytes(h_i.digest(), sys.byteorder) - h = hash_constructor() - h.update(h_int.to_bytes(h.digest_size, sys.byteorder)) - return h + hash_instance.update(h_int.to_bytes(hash_instance.digest_size, sys.byteorder)) + return hash_instance # }}} diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index f48027fd..c3a7b1da 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -294,11 +294,9 @@ def update_for_tuple(self, key_hash, key): def update_for_frozenset(self, key_hash, key): from pytools import unordered_hash - self.rec(key_hash, - unordered_hash( - self.new_hash, - (self.rec(self.new_hash(), key_i).digest() for key_i in key) - ).digest()) + unordered_hash( + key_hash, + (self.rec(self.new_hash(), key_i).digest() for key_i in key)) @staticmethod def update_for_NoneType(key_hash, key): # noqa diff --git a/test/test_pytools.py b/test/test_pytools.py index 49a4390d..356e47c3 100644 --- a/test/test_pytools.py +++ b/test/test_pytools.py @@ -446,15 +446,15 @@ def test_unordered_hash(): random.shuffle(lst) from pytools import unordered_hash - assert (unordered_hash(hashlib.sha256, lorig).digest() - == unordered_hash(hashlib.sha256, lst).digest()) - assert (unordered_hash(hashlib.sha256, lorig).digest() - == unordered_hash(hashlib.sha256, lorig).digest()) - assert (unordered_hash(hashlib.sha256, lorig).digest() - != 
unordered_hash(hashlib.sha256, lorig[:-1]).digest()) + assert (unordered_hash(hashlib.sha256(), lorig).digest() + == unordered_hash(hashlib.sha256(), lst).digest()) + assert (unordered_hash(hashlib.sha256(), lorig).digest() + == unordered_hash(hashlib.sha256(), lorig).digest()) + assert (unordered_hash(hashlib.sha256(), lorig).digest() + != unordered_hash(hashlib.sha256(), lorig[:-1]).digest()) lst[0] = b"aksdjfla;sdfjafd" - assert (unordered_hash(hashlib.sha256, lorig).digest() - != unordered_hash(hashlib.sha256, lst).digest()) + assert (unordered_hash(hashlib.sha256(), lorig).digest() + != unordered_hash(hashlib.sha256(), lst).digest()) if __name__ == "__main__":