Issue #8188: Introduce a new scheme for computing hashes of numbers

(instances of int, float, complex, decimal.Decimal and fractions.Fraction) that makes it easy to maintain the invariant that hash(x) == hash(y) whenever x and y have equal value.
python · May 23, 2010 · dc787d2 · dc787d2
1 parent 0372113
commit dc787d2
Show file tree

Hide file tree

Showing 14 changed files with 569 additions and 140 deletions.
diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
@@ -595,6 +595,109 @@ hexadecimal string representing the same number::
    '0x1.d380000000000p+11'
 
 
+.. _numeric-hash:
+
+Hashing of numeric types
+------------------------
+
+For numbers ``x`` and ``y``, possibly of different types, it's a requirement
+that ``hash(x) == hash(y)`` whenever ``x == y`` (see the :meth:`__hash__`
+method documentation for more details).  For ease of implementation and
+efficiency across a variety of numeric types (including :class:`int`,
+:class:`float`, :class:`decimal.Decimal` and :class:`fractions.Fraction`)
+Python's hash for numeric types is based on a single mathematical function
+that's defined for any rational number, and hence applies to all instances of
+:class:`int` and :class:`fraction.Fraction`, and all finite instances of
+:class:`float` and :class:`decimal.Decimal`.  Essentially, this function is
+given by reduction modulo ``P`` for a fixed prime ``P``.  The value of ``P`` is
+made available to Python as the :attr:`modulus` attribute of
+:data:`sys.hash_info`.
+
+.. impl-detail::
+
+   Currently, the prime used is ``P = 2**31 - 1`` on machines with 32-bit C
+   longs and ``P = 2**61 - 1`` on machines with 64-bit C longs.
+
+Here are the rules in detail:
+
+ - If ``x = m / n`` is a nonnegative rational number and ``n`` is not divisible
+   by ``P``, define ``hash(x)`` as ``m * invmod(n, P) % P``, where ``invmod(n,
+   P)`` gives the inverse of ``n`` modulo ``P``.
+
+ - If ``x = m / n`` is a nonnegative rational number and ``n`` is
+   divisible by ``P`` (but ``m`` is not) then ``n`` has no inverse
+   modulo ``P`` and the rule above doesn't apply; in this case define
+   ``hash(x)`` to be the constant value ``sys.hash_info.inf``.
+
+ - If ``x = m / n`` is a negative rational number define ``hash(x)``
+   as ``-hash(-x)``.  If the resulting hash is ``-1``, replace it with
+   ``-2``.
+
+ - The particular values ``sys.hash_info.inf``, ``-sys.hash_info.inf``
+   and ``sys.hash_info.nan`` are used as hash values for positive
+   infinity, negative infinity, or nans (respectively).  (All hashable
+   nans have the same hash value.)
+
+ - For a :class:`complex` number ``z``, the hash values of the real
+   and imaginary parts are combined by computing ``hash(z.real) +
+   sys.hash_info.imag * hash(z.imag)``, reduced modulo
+   ``2**sys.hash_info.width`` so that it lies in
+   ``range(-2**(sys.hash_info.width - 1), 2**(sys.hash_info.width -
+   1))``.  Again, if the result is ``-1``, it's replaced with ``-2``.
+
+
+To clarify the above rules, here's some example Python code,
+equivalent to the builtin hash, for computing the hash of a rational
+number, :class:`float`, or :class:`complex`::
+
+
+   import sys, math
+
+   def hash_fraction(m, n):
+       """Compute the hash of a rational number m / n.
+
+       Assumes m and n are integers, with n positive.
+       Equivalent to hash(fractions.Fraction(m, n)).
+
+       """
+       P = sys.hash_info.modulus
+       # Remove common factors of P.  (Unnecessary if m and n already coprime.)
+       while m % P == n % P == 0:
+           m, n = m // P, n // P
+
+       if n % P == 0:
+           hash_ = sys.hash_info.inf
+       else:
+           # Fermat's Little Theorem: pow(n, P-1, P) is 1, so
+           # pow(n, P-2, P) gives the inverse of n modulo P.
+           hash_ = (abs(m) % P) * pow(n, P - 2, P) % P
+       if m < 0:
+           hash_ = -hash_
+       if hash_ == -1:
+           hash_ = -2
+       return hash_
+
+   def hash_float(x):
+       """Compute the hash of a float x."""
+
+       if math.isnan(x):
+           return sys.hash_info.nan
+       elif math.isinf(x):
+           return sys.hash_info.inf if x > 0 else -sys.hash_info.inf
+       else:
+           return hash_fraction(*x.as_integer_ratio())
+
+   def hash_complex(z):
+       """Compute the hash of a complex number z."""
+
+       hash_ = hash_float(z.real) + sys.hash_info.imag * hash_float(z.imag)
+       # do a signed reduction modulo 2**sys.hash_info.width
+       M = 2**(sys.hash_info.width - 1)
+       hash_ = (hash_ & (M - 1)) - (hash & M)
+       if hash_ == -1:
+           hash_ == -2
+       return hash_
+
 .. _typeiter:
 
 Iterator Types

diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst
@@ -446,6 +446,30 @@ always available.
       Changed to a named tuple and added *service_pack_minor*,
       *service_pack_major*, *suite_mask*, and *product_type*.
 
+
+.. data:: hash_info
+
+   A structseq giving parameters of the numeric hash implementation.  For
+   more details about hashing of numeric types, see :ref:`numeric-hash`.
+
+   +---------------------+--------------------------------------------------+
+   | attribute           | explanation                                      |
+   +=====================+==================================================+
+   | :const:`width`      | width in bits used for hash values               |
+   +---------------------+--------------------------------------------------+
+   | :const:`modulus`    | prime modulus P used for numeric hash scheme     |
+   +---------------------+--------------------------------------------------+
+   | :const:`inf`        | hash value returned for a positive infinity      |
+   +---------------------+--------------------------------------------------+
+   | :const:`nan`        | hash value returned for a nan                    |
+   +---------------------+--------------------------------------------------+
+   | :const:`imag`       | multiplier used for the imaginary part of a      |
+   |                     | complex number                                   |
+   +---------------------+--------------------------------------------------+
+
+   .. versionadded:: 3.2
+
+
 .. data:: hexversion
 
    The version number encoded as a single integer.  This is guaranteed to increase

diff --git a/Include/pyport.h b/Include/pyport.h
@@ -126,6 +126,20 @@ Used in:  PY_LONG_LONG
 #endif
 #endif
 
+/* Parameters used for the numeric hash implementation.  See notes for
+   _PyHash_Double in Objects/object.c.  Numeric hashes are based on
+   reduction modulo the prime 2**_PyHASH_BITS - 1. */
+
+#if SIZEOF_LONG >= 8
+#define _PyHASH_BITS 61
+#else
+#define _PyHASH_BITS 31
+#endif
+#define _PyHASH_MODULUS ((1UL << _PyHASH_BITS) - 1)
+#define _PyHASH_INF 314159
+#define _PyHASH_NAN 0
+#define _PyHASH_IMAG 1000003UL
+
 /* uintptr_t is the C9X name for an unsigned integral type such that a
  * legitimate void* can be cast to uintptr_t and then back to void* again
  * without loss of information.  Similarly for intptr_t, wrt a signed

diff --git a/Lib/decimal.py b/Lib/decimal.py
@@ -862,15 +862,15 @@ def _cmp(self, other):
     # that specified by IEEE 754.
 
     def __eq__(self, other, context=None):
-        other = _convert_other(other, allow_float=True)
+        other = _convert_other(other, allow_float = True)
         if other is NotImplemented:
             return other
         if self._check_nans(other, context):
             return False
         return self._cmp(other) == 0
 
     def __ne__(self, other, context=None):
-        other = _convert_other(other, allow_float=True)
+        other = _convert_other(other, allow_float = True)
         if other is NotImplemented:
             return other
         if self._check_nans(other, context):
@@ -879,7 +879,7 @@ def __ne__(self, other, context=None):
 
 
     def __lt__(self, other, context=None):
-        other = _convert_other(other, allow_float=True)
+        other = _convert_other(other, allow_float = True)
         if other is NotImplemented:
             return other
         ans = self._compare_check_nans(other, context)
@@ -888,7 +888,7 @@ def __lt__(self, other, context=None):
         return self._cmp(other) < 0
 
     def __le__(self, other, context=None):
-        other = _convert_other(other, allow_float=True)
+        other = _convert_other(other, allow_float = True)
         if other is NotImplemented:
             return other
         ans = self._compare_check_nans(other, context)
@@ -897,7 +897,7 @@ def __le__(self, other, context=None):
         return self._cmp(other) <= 0
 
     def __gt__(self, other, context=None):
-        other = _convert_other(other, allow_float=True)
+        other = _convert_other(other, allow_float = True)
         if other is NotImplemented:
             return other
         ans = self._compare_check_nans(other, context)
@@ -906,7 +906,7 @@ def __gt__(self, other, context=None):
         return self._cmp(other) > 0
 
     def __ge__(self, other, context=None):
-        other = _convert_other(other, allow_float=True)
+        other = _convert_other(other, allow_float = True)
         if other is NotImplemented:
             return other
         ans = self._compare_check_nans(other, context)
@@ -935,55 +935,28 @@ def compare(self, other, context=None):
 
     def __hash__(self):
         """x.__hash__() <==> hash(x)"""
-        # Decimal integers must hash the same as the ints
-        #
-        # The hash of a nonspecial noninteger Decimal must depend only
-        # on the value of that Decimal, and not on its representation.
-        # For example: hash(Decimal('100E-1')) == hash(Decimal('10')).
-
-        # Equality comparisons involving signaling nans can raise an
-        # exception; since equality checks are implicitly and
-        # unpredictably used when checking set and dict membership, we
-        # prevent signaling nans from being used as set elements or
-        # dict keys by making __hash__ raise an exception.
+
+        # In order to make sure that the hash of a Decimal instance
+        # agrees with the hash of a numerically equal integer, float
+        # or Fraction, we follow the rules for numeric hashes outlined
+        # in the documentation.  (See library docs, 'Built-in Types').
         if self._is_special:
             if self.is_snan():
                 raise TypeError('Cannot hash a signaling NaN value.')
             elif self.is_nan():
-                # 0 to match hash(float('nan'))
-                return 0
+                return _PyHASH_NAN
             else:
-                # values chosen to match hash(float('inf')) and
-                # hash(float('-inf')).
                 if self._sign:
-                    return -271828
+                    return -_PyHASH_INF
                 else:
-                    return 314159
-
-        # In Python 2.7, we're allowing comparisons (but not
-        # arithmetic operations) between floats and Decimals;  so if
-        # a Decimal instance is exactly representable as a float then
-        # its hash should match that of the float.
-        self_as_float = float(self)
-        if Decimal.from_float(self_as_float) == self:
-            return hash(self_as_float)
-
-        if self._isinteger():
-            op = _WorkRep(self.to_integral_value())
-            # to make computation feasible for Decimals with large
-            # exponent, we use the fact that hash(n) == hash(m) for
-            # any two nonzero integers n and m such that (i) n and m
-            # have the same sign, and (ii) n is congruent to m modulo
-            # 2**64-1.  So we can replace hash((-1)**s*c*10**e) with
-            # hash((-1)**s*c*pow(10, e, 2**64-1).
-            return hash((-1)**op.sign*op.int*pow(10, op.exp, 2**64-1))
-        # The value of a nonzero nonspecial Decimal instance is
-        # faithfully represented by the triple consisting of its sign,
-        # its adjusted exponent, and its coefficient with trailing
-        # zeros removed.
-        return hash((self._sign,
-                     self._exp+len(self._int),
-                     self._int.rstrip('0')))
+                    return _PyHASH_INF
+
+        if self._exp >= 0:
+            exp_hash = pow(10, self._exp, _PyHASH_MODULUS)
+        else:
+            exp_hash = pow(_PyHASH_10INV, -self._exp, _PyHASH_MODULUS)
+        hash_ = int(self._int) * exp_hash % _PyHASH_MODULUS
+        return hash_ if self >= 0 else -hash_
 
     def as_tuple(self):
         """Represents the number as a triple tuple.
@@ -6218,6 +6191,17 @@ def _format_number(is_negative, intpart, fracpart, exp, spec):
 # _SignedInfinity[sign] is infinity w/ that sign
 _SignedInfinity = (_Infinity, _NegativeInfinity)
 
+# Constants related to the hash implementation;  hash(x) is based
+# on the reduction of x modulo _PyHASH_MODULUS
+import sys
+_PyHASH_MODULUS = sys.hash_info.modulus
+# hash values to use for positive and negative infinities, and nans
+_PyHASH_INF = sys.hash_info.inf
+_PyHASH_NAN = sys.hash_info.nan
+del sys
+
+# _PyHASH_10INV is the inverse of 10 modulo the prime _PyHASH_MODULUS
+_PyHASH_10INV = pow(10, _PyHASH_MODULUS - 2, _PyHASH_MODULUS)
 
 
 if __name__ == '__main__':

diff --git a/Lib/fractions.py b/Lib/fractions.py
@@ -8,6 +8,7 @@
 import numbers
 import operator
 import re
+import sys
 
 __all__ = ['Fraction', 'gcd']
 
@@ -23,6 +24,12 @@ def gcd(a, b):
         a, b = b, a%b
     return a
 
+# Constants related to the hash implementation;  hash(x) is based
+# on the reduction of x modulo the prime _PyHASH_MODULUS.
+_PyHASH_MODULUS = sys.hash_info.modulus
+# Value to be used for rationals that reduce to infinity modulo
+# _PyHASH_MODULUS.
+_PyHASH_INF = sys.hash_info.inf
 
 _RATIONAL_FORMAT = re.compile(r"""
     \A\s*                      # optional whitespace at the start, then
@@ -528,16 +535,22 @@ def __hash__(self):
 
         """
         # XXX since this method is expensive, consider caching the result
-        if self._denominator == 1:
-            # Get integers right.
-            return hash(self._numerator)
-        # Expensive check, but definitely correct.
-        if self == float(self):
-            return hash(float(self))
+
+        # In order to make sure that the hash of a Fraction agrees
+        # with the hash of a numerically equal integer, float or
+        # Decimal instance, we follow the rules for numeric hashes
+        # outlined in the documentation.  (See library docs, 'Built-in
+        # Types').
+
+        # dinv is the inverse of self._denominator modulo the prime
+        # _PyHASH_MODULUS, or 0 if self._denominator is divisible by
+        # _PyHASH_MODULUS.
+        dinv = pow(self._denominator, _PyHASH_MODULUS - 2, _PyHASH_MODULUS)
+        if not dinv:
+            hash_ = _PyHASH_INF
         else:
-            # Use tuple's hash to avoid a high collision rate on
-            # simple fractions.
-            return hash((self._numerator, self._denominator))
+            hash_ = abs(self._numerator) * dinv % _PyHASH_MODULUS
+        return hash_ if self >= 0 else -hash_
 
     def __eq__(a, b):
         """a == b"""

diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py
@@ -914,15 +914,6 @@ def notest_float_inf(self):
         self.assertFalse(NAN.is_inf())
         self.assertFalse((0.).is_inf())
 
-    def test_hash_inf(self):
-        # the actual values here should be regarded as an
-        # implementation detail, but they need to be
-        # identical to those used in the Decimal module.
-        self.assertEqual(hash(float('inf')), 314159)
-        self.assertEqual(hash(float('-inf')), -271828)
-        self.assertEqual(hash(float('nan')), 0)
-
-
 fromHex = float.fromhex
 toHex = float.hex
 class HexFloatTestCase(unittest.TestCase):