maphash

package standard library
master (9936a78) Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 16, 2026 License: BSD-3-Clause Imports: 4 Imported by: 939

Documentation

Overview

Package maphash provides hash functions on byte sequences and comparable values. It also defines Hasher, the interface between a hash function and a hash table.

These hash functions are intended to be used to implement hash tables, Bloom filters, and other data structures that need to map arbitrary strings or byte sequences to a uniform distribution on unsigned 64-bit integers.

Each different instance of a hash table or data structure should use its own Seed.

The hash functions are not cryptographically secure. (See crypto/sha256 and crypto/sha512 for cryptographic use.)

Example
package main

import (
	"fmt"
	"hash/maphash"
)

func main() {
	// The zero Hash value is valid and ready to use; setting an
	// initial seed is not necessary.
	var h maphash.Hash

	// Add a string to the hash, and print the current hash value.
	h.WriteString("hello, ")
	fmt.Printf("%#x\n", h.Sum64())

	// Append additional data (in the form of a byte array).
	h.Write([]byte{'w', 'o', 'r', 'l', 'd'})
	fmt.Printf("%#x\n", h.Sum64())

	// Reset discards all data previously added to the Hash, without
	// changing its seed.
	h.Reset()

	// Use SetSeed to create a new Hash h2 which will behave
	// identically to h.
	var h2 maphash.Hash
	h2.SetSeed(h.Seed())

	h.WriteString("same")
	h2.WriteString("same")
	fmt.Printf("%#x == %#x\n", h.Sum64(), h2.Sum64())
}
Example (BloomFilter)
package main

// This file demonstrates a Bloom filter.

import (
	"fmt"
	"hash/maphash"
	"math"
	"math/bits"
)

// BloomFilter is a Bloom filter, a probabilistic space-efficient
// representation of a set of values of type V based on hashing.
//
// It provides two operations: Insert inserts an element and Contains
// queries whether a value is a member of the set.
//
// However, unlike a typical set, the size is independent of the
// number of elements. The catch: Contains is permitted to report
// true even for some elements that are not present. The trade-off
// between space and accuracy is determined by parameters at
// construction.
type BloomFilter[V any] struct {
	hasher maphash.Hasher[V]
	seeds  []maphash.Seed // each seed determines a hash function
	bytes  []byte         // bit vector
}

// NewBloomFilterComparable returns a new BloomFilter for the
// specified elements using their natural comparison.
func NewComparableBloomFilter[V comparable](n int, fpProb float64) *BloomFilter[V] {
	return NewBloomFilter(maphash.ComparableHasher[V]{}, n, fpProb)
}

// NewBloomFilter constructs a new BloomFilter capable of holding n
// elements with the specified probability of false positive results,
// assuming a well dispersed hash function.
func NewBloomFilter[V any](hasher maphash.Hasher[V], n int, fpProb float64) *BloomFilter[V] {
	nbytes, nseeds := calibrate(n, fpProb)

	seeds := make([]maphash.Seed, nseeds)
	for i := range nseeds {
		seeds[i] = maphash.MakeSeed()
	}

	return &BloomFilter[V]{
		hasher: hasher,
		bytes:  make([]byte, nbytes),
		seeds:  seeds,
	}
}

func (f *BloomFilter[V]) Contains(v V) bool {
	for _, seed := range f.seeds {
		index, bit := f.locate(seed, v)
		if f.bytes[index]&bit == 0 {
			return false
		}
	}
	return true
}

func (f *BloomFilter[V]) Insert(v V) {
	for _, seed := range f.seeds {
		index, bit := f.locate(seed, v)
		f.bytes[index] |= bit
	}
}

func (f *BloomFilter[V]) locate(seed maphash.Seed, v V) (uint64, byte) {
	// Optimization note: the dynamic call to hasher.Hash causes h
	// to escape. You can use a sync.Pool can help mitigate the
	// allocation cost.
	//
	// Alternatively, you could copy and specialize the filter logic
	// for a specific implementation of maphash.Hasher, allowing
	// the compiler's escape analysis to determine that the
	// hasher.Hash call does not in fact cause h to escape.
	var h maphash.Hash
	h.SetSeed(seed)
	f.hasher.Hash(&h, v)
	hash := h.Sum64()

	index := reduce(hash, uint64(len(f.bytes)))
	mask := byte(1 << (hash % 8))
	return index, mask
}

// reduce maps hash to a value in the interval [0, n).
// See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction
func reduce(hash, n uint64) uint64 {
	if n > 1<<32-1 {
		hi, _ := bits.Mul64(hash, n)
		return hi
	}
	return hash >> 32 * n >> 32
}

func calibrate(n int, fpProb float64) (bytes, seeds int) {
	// Following https://en.wikipedia.org/wiki/Bloom_filter:
	// - k is the number of hash functions,
	// - m is the size of the bit field;
	// - n is the number of set bits.

	if n == 0 {
		return 1, 1
	}

	logFpProb := math.Log(fpProb)
	m := -(float64(n) * logFpProb) / (math.Ln2 * math.Ln2)

	// Round up to a byte.
	// TODO(adonovan): opt: round up to the next allocation size
	// class (see bytes.growSlice) and then compute the possibly
	// smaller number of needed seeds based on this higher number.
	bytes = int(m) / 8
	if float64(bytes*8) < m {
		bytes++
	}

	k := -logFpProb / math.Ln2
	seeds = max(int(math.Round(k)), 1)
	return bytes, seeds
}

func main() {
	// Create a Bloom filter optimized for 2 elements with
	// a one-in-a-billion false-positive rate.
	//
	// (This low rate demands a lot of space: 88 bits and
	// 30 hash functions. More typical rates are 1-5%;
	// at 5%, we need only 16 bits and 4 hash functions.)
	f := NewComparableBloomFilter[string](2, 1e-9)

	// Insert two elements.
	f.Insert("apple")
	f.Insert("banana")

	// Check whether elements are present.
	//
	// "cherry" was not inserted, but Contains is probabilistic, so
	// this test will spuriously report Contains("cherry") = true
	// about once every billion runs.
	for _, fruit := range []string{"apple", "banana", "cherry"} {
		fmt.Printf("Contains(%q) = %v\n", fruit, f.Contains(fruit))
	}

}
Output:

Contains("apple") = true
Contains("banana") = true
Contains("cherry") = false

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func Bytes added in go1.19

func Bytes(seed Seed, b []byte) uint64

Bytes returns the hash of b with the given seed.

Bytes is equivalent to, but more convenient and efficient than:

var h Hash
h.SetSeed(seed)
h.Write(b)
return h.Sum64()

func Comparable added in go1.24.0

func Comparable[T comparable](seed Seed, v T) uint64

Comparable returns the hash of comparable value v with the given seed such that Comparable(s, v1) == Comparable(s, v2) if v1 == v2. If v != v, then the resulting hash is randomly distributed.

func String added in go1.19

func String(seed Seed, s string) uint64

String returns the hash of s with the given seed.

String is equivalent to, but more convenient and efficient than:

var h Hash
h.SetSeed(seed)
h.WriteString(s)
return h.Sum64()

func WriteComparable added in go1.24.0

func WriteComparable[T comparable](h *Hash, x T)

WriteComparable adds x to the data hashed by h.

Types

type ComparableHasher

type ComparableHasher[T comparable] struct {
	// contains filtered or unexported fields
}

ComparableHasher is an implementation of Hasher whose Equal(x, y) method is consistent with x == y.

ComparableHasher is defined only for comparable types. The type system will not prevent you from instantiating a type such as ComparableHasher[any]; nonetheless you must not pass non-comparable argument values to its Hash or Equal methods.

func (ComparableHasher[T]) Equal

func (ComparableHasher[T]) Equal(x, y T) bool

func (ComparableHasher[T]) Hash

func (ComparableHasher[T]) Hash(h *Hash, v T)

type Hash

type Hash struct {
	// contains filtered or unexported fields
}

A Hash computes a seeded hash of a byte sequence.

The zero Hash is a valid Hash ready to use. A zero Hash chooses a random seed for itself during the first call to a Reset, Write, Seed, Clone, or Sum64 method. For control over the seed, use SetSeed.

The computed hash values depend only on the initial seed and the sequence of bytes provided to the Hash object, not on the way in which the bytes are provided. For example, the three sequences

h.Write([]byte{'f','o','o'})
h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
h.WriteString("foo")

all have the same effect.

Hashes are intended to be collision-resistant, even for situations where an adversary controls the byte sequences being hashed.

A Hash is not safe for concurrent use by multiple goroutines, but a Seed is. If multiple goroutines must compute the same seeded hash, each can declare its own Hash and call SetSeed with a common Seed.

func (*Hash) BlockSize

func (h *Hash) BlockSize() int

BlockSize returns h's block size.

func (*Hash) Clone added in go1.25.0

func (h *Hash) Clone() (hash.Cloner, error)

Clone implements hash.Cloner.

func (*Hash) Reset

func (h *Hash) Reset()

Reset discards all bytes added to h. (The seed remains the same.)

func (*Hash) Seed

func (h *Hash) Seed() Seed

Seed returns h's seed value.

func (*Hash) SetSeed

func (h *Hash) SetSeed(seed Seed)

SetSeed sets h to use seed, which must have been returned by MakeSeed or by another Hash.Seed method. Two Hash objects with the same seed behave identically. Two Hash objects with different seeds will very likely behave differently. Any bytes added to h before this call will be discarded.

func (*Hash) Size

func (h *Hash) Size() int

Size returns h's hash value size, 8 bytes.

func (*Hash) Sum

func (h *Hash) Sum(b []byte) []byte

Sum appends the hash's current 64-bit value to b. It exists for implementing hash.Hash. For direct calls, it is more efficient to use Hash.Sum64.

func (*Hash) Sum64

func (h *Hash) Sum64() uint64

Sum64 returns h's current 64-bit value, which depends on h's seed and the sequence of bytes added to h since the last call to Hash.Reset or Hash.SetSeed.

All bits of the Sum64 result are close to uniformly and independently distributed, so it can be safely reduced by using bit masking, shifting, or modular arithmetic.

func (*Hash) Write

func (h *Hash) Write(b []byte) (int, error)

Write adds b to the sequence of bytes hashed by h. It always writes all of b and never fails; the count and error result are for implementing io.Writer.

func (*Hash) WriteByte

func (h *Hash) WriteByte(b byte) error

WriteByte adds b to the sequence of bytes hashed by h. It never fails; the error result is for implementing io.ByteWriter.

func (*Hash) WriteString

func (h *Hash) WriteString(s string) (int, error)

WriteString adds the bytes of s to the sequence of bytes hashed by h. It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.

type Hasher

type Hasher[T any] interface {
	Hash(*Hash, T)
	Equal(x, y T) bool
}

A Hasher defines the interface between a hash-based container and its elements. It provides a hash function and an equivalence relation over values of type T, enabling those values to be inserted in hash tables and similar data structures.

Of course, comparable types can already be used as keys of Go's built-in map type, but a Hasher enables non-comparable types to be used as keys of a suitable hash table too. Hashers may be useful even for comparable types, to define an equivalence relation that differs from the usual one (==), such as a field-based comparison for a pointer-to-struct type, or a case-insensitive comparison for strings, as in this example:

// CaseInsensitive is a Hasher[string] whose
// equivalence relation ignores letter case.
type CaseInsensitive struct{}

func (CaseInsensitive) Hash(h *Hash, s string) {
	h.WriteString(strings.ToLower(s))
}

func (CaseInsensitive) Equal(x, y string) bool {
	// (We avoid strings.EqualFold as it is not
	// consistent with ToLower for all values.)
	return strings.ToLower(x) == strings.ToLower(y)
}

A Hasher also permits values to be used with other hash-based data structures such as a Bloom filter. The ComparableHasher type makes it convenient to enable comparable types to be used in such data structures under their usual (==) equivalence relation.

Hash invariants

If two values are equal as defined by Equal(x, y), then they must have the same hash as defined by the effects of Hash(h, x) on h.

Hashers must be logically stateless: the behavior of the Hash and Equal methods depends only on the arguments.

Writing a good function

When defining a hash function and equivalence relation for a data type, it may help to first define a canonical encoding for values of that type as a sequence of elements, each being a number, string, boolean, or pointer. An encoding is canonical if two values that are logically equal have the same encoding, even if they are represented differently. For example, a canonical case-insensitive encoding of a string is strings.ToLower.

Once you have defined the encoding, the Hasher's Hash method should encode a value into the Hash using a sequence of calls to Hash.Write for byte slices, Hash.WriteString for strings, Hash.WriteByte for bytes, and WriteComparable for elements of other types. The Hasher's Equal method should compute the encodings of two values, then compare their corresponding elements, returning false at the first mismatch.

A Hash method may discard information so long as it remains consistent with the Equal method as defined above. For example, valid implementations of CaseInsensitive.Hash might inspect only the first letter of the string, or even use a constant value. However, the lossier the hash function, the more frequent the hash collisions and the slower the hash table.

Some data types, such as sets, are inherently unordered: the set {a, b, c} is equal to the set {c, b, a}. In some cases it is possible to define a canonical encoding for a set by sorting the elements into some order. In other cases this may inefficient, since it may require allocating memory, or infeasible, as when there is no convenient order. Another way to hash an unordered set is to compute the hash for each element separately, then combine all the element hashes using a commutative (order-independent) operator such as + or ^.

The Hash method below, for a hypothetical Set type, illustrates this approach:

type Set[T comparable] struct{ ... }

type setHasher[T comparable] struct{}

func (setHasher[T]) Hash(hash *maphash.Hash, set *Set[T]) {
	var accum uint64
	for elem := range set.Elements() {
		// Initialize a hasher for the element,
		// using same seed as the outer hash.
		var sub maphash.Hash
		sub.SetSeed(hash.Seed())

		// Hash the element.
		maphash.WriteComparable(&sub, elem)

		// Mix the element's hash into the set's hash.
		accum ^= sub.Sum64()
	}
	maphash.WriteComparable(hash, accum)
}

In many languages, a data type's hash operation simply returns an integer value. However, that makes it possible for an adversary to systematically construct a large number of values that all have the same hash, degrading the asymptotic performance of hash tables in a denial-of-service attack known as "hash flooding". By contrast, computing hashes as a sequence of values emitted into a Hash with an unpredictable Seed that varies from one hash table to another mitigates this attack.

In effect, the Seed chooses one of 2⁶⁴ different hash functions. The code example above calls SetSeed on the element's sub-Hasher so that it uses the same hash function as for the Set itself, and not a random one.

type Seed

type Seed struct {
	// contains filtered or unexported fields
}

A Seed is a random value that selects the specific hash function computed by a Hash. If two Hashes use the same Seeds, they will compute the same hash values for any given input. If two Hashes use different Seeds, they are very likely to compute distinct hash values for any given input.

A Seed must be initialized by calling MakeSeed. The zero seed is uninitialized and not valid for use with Hash's SetSeed method.

Each Seed value is local to a single process and cannot be serialized or otherwise recreated in a different process.

func MakeSeed

func MakeSeed() Seed

MakeSeed returns a new random seed.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL