From b762417b80d842baa7ca216f38f577320dda1b7c Mon Sep 17 00:00:00 2001 From: "M.V. Hutz" Date: Mon, 16 Mar 2026 21:10:08 -0400 Subject: [PATCH] chore: move from `tools/dsa` Moved the implementation of this hash table from `tools/dsa` #1. --- .gitea/workflows/lint.yml | 62 ++++++++++ .golangci.yml | 235 ++++++++++++++++++++++++++++++++++++++ .gremlins.yaml | 7 ++ Makefile | 12 ++ bucket.go | 73 ++++++++++++ compare.go | 17 +++ compare_example_test.go | 37 ++++++ cuckoo_fuzz_test.go | 49 ++++++++ cuckoo_internal_test.go | 30 +++++ cuckoo_test.go | 130 +++++++++++++++++++++ doc.go | 9 ++ doc_example_test.go | 32 ++++++ go.mod | 11 ++ go.sum | 9 ++ hash.go | 27 +++++ hash_example_test.go | 34 ++++++ settings.go | 45 ++++++++ table.go | 203 ++++++++++++++++++++++++++++++++ 18 files changed, 1022 insertions(+) create mode 100644 .gitea/workflows/lint.yml create mode 100644 .golangci.yml create mode 100644 .gremlins.yaml create mode 100644 Makefile create mode 100644 bucket.go create mode 100644 compare.go create mode 100644 compare_example_test.go create mode 100644 cuckoo_fuzz_test.go create mode 100644 cuckoo_internal_test.go create mode 100644 cuckoo_test.go create mode 100644 doc.go create mode 100644 doc_example_test.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 hash.go create mode 100644 hash_example_test.go create mode 100644 settings.go create mode 100644 table.go diff --git a/.gitea/workflows/lint.yml b/.gitea/workflows/lint.yml new file mode 100644 index 0000000..f40f322 --- /dev/null +++ b/.gitea/workflows/lint.yml @@ -0,0 +1,62 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - uses: golangci/golangci-lint-action@v7 + with: + version: latest + + unit-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + 
go-version-file: go.mod + + - name: Run unit tests + run: go test ./... -cover -v + + fuzz-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Run fuzz tests + run: | + for func in $(grep -r --include='*_test.go' -oh 'func Fuzz\w*' . | sed 's/func //'); do + go test ./... -fuzz="^${func}$" -fuzztime=30s + done + + mutation-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Install gremlins + run: go install github.com/go-gremlins/gremlins/cmd/gremlins@latest + + - name: Run mutation tests + run: gremlins unleash diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..1ec3d80 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,235 @@ +--- +# golangci-lint configuration file made by @ccoVeille +# Source: https://github.com/ccoVeille/golangci-lint-config-examples/ +# Author: @ccoVeille +# License: MIT +# Variant: 03-safe +# Version: v2.0.0 +# +version: "2" + +formatters: + enable: + # format the code + - gofmt + # format the block of imports + - gci + + settings: + # format the code with Go standard library + gofmt: + # simplify the code + # https://pkg.go.dev/cmd/gofmt#hdr-The_simplify_command + simplify: true + rewrite-rules: + # replace `interface{}` with `any` in the code on format + - pattern: 'interface{}' + replacement: 'any' + + # make sure imports are always in a deterministic order + # https://github.com/daixiang0/gci/ + gci: # define the section orders for imports + sections: + # Standard section: captures all standard packages. 
+ - standard + # Default section: catchall that is not standard or custom + - default + # linters that related to local tool, so they should be separated + - localmodule + +linters: + exclusions: + # these presets where present in the v1 version of golangci-lint + # it's interesting to keep them when migrating, but removing them should be the goal + presets: + # exclude check on comments format in godoc + # These are common false positives in poor code + # you should not use this on recent code you write from scratch + # More information: https://golangci-lint.run/usage/false-positives/#comments + # + # Please uncomment the following line if your code is not using the godoc format + # - comments + + # Common false positives + # feel free to remove this if you don't have any false positives + # More information: https://golangci-lint.run/usage/false-positives/#common-false-positives + - common-false-positives + + # Legacy preset is not recommended anymore + # More information: https://golangci-lint.run/usage/false-positives/#legacy + - legacy + + # std-error-handling is a set of rules that avoid reporting unhandled errors on common functions/methods + # More information: https://golangci-lint.run/usage/false-positives/#std-error-handling + - std-error-handling + + # some linters are enabled by default + # https://golangci-lint.run/usage/linters/ + # + # enable some extra linters + enable: + # Errcheck is a program for checking for unchecked errors in Go code. + - errcheck + + # Vet examines Go source code and reports suspicious constructs. + - govet + + # Detects when assignments to existing variables are not used. + - ineffassign + + # It's a set of rules from staticcheck. See https://staticcheck.io/ + - staticcheck + + # Checks Go code for unused constants, variables, functions and types. + - unused + + # Fast, configurable, extensible, flexible, and beautiful linter for Go. + # Drop-in replacement of golint. 
+ - revive + + # make sure to use t.Helper() when needed + - thelper + + # mirror suggests rewrites to avoid unnecessary []byte/string conversion + - mirror + + # detect the possibility to use variables/constants from the Go standard library. + - usestdlibvars + + # Finds commonly misspelled English words. + - misspell + + # Checks for duplicate words in the source code. + - dupword + + # linter to detect errors invalid key values count + - loggercheck + + # detect when a package or method could be replaced by one from the standard library + - exptostd + + # detects nested contexts in loops or function literals + - fatcontext + + # Reports uses of functions with replacement inside the testing package. + - usetesting + + settings: + revive: + rules: + # these are the default revive rules + # you can remove the whole "rules" node if you want + # BUT + # ! /!\ they all need to be present when you want to add more rules than the default ones + # otherwise, you won't have the default rules, but only the ones you define in the "rules" node + + # Blank import should be only in a main or test package, or have a comment justifying it. + - name: blank-imports + + # Packages should have comments of the form "Package x ...". + - name: package-comments + + # context.Context() should be the first parameter of a function when provided as argument. + - name: context-as-argument + arguments: + - allowTypesBefore: "*testing.T" + + # Basic types should not be used as a key in `context.WithValue` + - name: context-keys-type + + # Importing with `.` makes the programs much harder to understand + - name: dot-imports + + # Empty blocks make code less readable and could be a symptom of a bug or unfinished refactoring. + - name: empty-block + + # for better readability, variables of type `error` must be named with the prefix `err`. + - name: error-naming + + # for better readability, the errors should be last in the list of returned values by a function. 
+ - name: error-return + + # for better readability, error messages should not be capitalized or end with punctuation or a newline. + - name: error-strings + + # report when replacing `errors.New(fmt.Sprintf())` with `fmt.Errorf()` is possible + - name: errorf + + # check naming and commenting conventions on exported symbols. + - name: exported + arguments: + # make error messages clearer + - "sayRepetitiveInsteadOfStutters" + # require comments on public interface methods + - "checkPublicInterface" + + # incrementing an integer variable by 1 is recommended to be done using the `++` operator + - name: increment-decrement + + # highlights redundant else-blocks that can be eliminated from the code + # - name: indent-error-flow + + # This rule suggests a shorter way of writing ranges that do not use the second value. + - name: range + + # receiver names in a method should reflect the struct name (p for Person, for example) + - name: receiver-naming + + # redefining built in names (true, false, append, make) can lead to bugs very difficult to detect. + - name: redefines-builtin-id + + # redundant else-blocks that can be eliminated from the code. + # - name: superfluous-else + + # prevent confusing name for variables when using `time` package + - name: time-naming + + # warns when an exported function or method returns a value of an un-exported type. + - name: unexported-return + + # spots and proposes to remove unreachable code. also helps to spot errors + - name: unreachable-code + + # Functions or methods with unused parameters can be a symptom of an unfinished refactoring or a bug. + - name: unused-parameter + + # report when a variable declaration can be simplified + - name: var-declaration + + # warns when initialism, variable or package naming conventions are not followed. + - name: var-naming + + misspell: + # Correct spellings using locale preferences for US or UK. + # Setting locale to US will correct the British spelling of 'colour' to 'color'. 
+ # Default ("") is to use a neutral variety of English. + locale: US + + # List of words to ignore + # among the one defined in https://github.com/golangci/misspell/blob/master/words.go + ignore-rules: [] + # - valor + # - and + + # Extra word corrections. + extra-words: [] + # - typo: "whattever" + # correction: "whatever" + +output: + # Order to use when sorting results. + # Possible values: `file`, `linter`, and `severity`. + # + # If the severity values are inside the following list, they are ordered in this order: + # 1. error + # 2. warning + # 3. high + # 4. medium + # 5. low + # Either they are sorted alphabetically. + # + # Default: ["file"] + sort-order: + - linter + - severity + - file # filepath, line, and column. \ No newline at end of file diff --git a/.gremlins.yaml b/.gremlins.yaml new file mode 100644 index 0000000..7a1872f --- /dev/null +++ b/.gremlins.yaml @@ -0,0 +1,7 @@ +# yaml-language-server: $schema=https://gremlins.dev/0.6/schema/configuration.json + +unleash: + timeout-coefficient: 50 + + workers: 4 + dry-run: false \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d6095e6 --- /dev/null +++ b/Makefile @@ -0,0 +1,12 @@ +unit: + go test ./... -cover -v + +mutation: + gremlins unleash + +fuzz: + go test ./... 
-fuzz=$(FN) + +docs: + @echo ">>> Visit: http://localhost:6060/pkg/git.maximhutz.com/tools/go-cuckoo/" + godoc -http=:6060 \ No newline at end of file diff --git a/bucket.go b/bucket.go new file mode 100644 index 0000000..63a6681 --- /dev/null +++ b/bucket.go @@ -0,0 +1,73 @@ +package cuckoo + +type entry[K, V any] struct { + key K + value V +} + +type slot[K, V any] struct { + entry[K, V] + occupied bool +} + +type bucket[K, V any] struct { + hash Hash[K] + slots []slot[K, V] + capacity, size uint64 + compare EqualFunc[K] +} + +func (b bucket[K, V]) location(key K) uint64 { + return b.hash(key) % b.capacity +} + +func (b bucket[K, V]) get(key K) (value V, found bool) { + slot := b.slots[b.location(key)] + return slot.value, slot.occupied && b.compare(slot.key, key) +} + +func (b *bucket[K, V]) resize(capacity uint64) { + b.slots = make([]slot[K, V], capacity) + b.capacity = capacity + b.size = 0 +} + +func (b bucket[K, V]) update(key K, value V) (updated bool) { + slot := &b.slots[b.location(key)] + + if slot.occupied && b.compare(slot.key, key) { + slot.value = value + return true + } + + return false +} + +func (b *bucket[K, V]) evict(insertion entry[K, V]) (evicted entry[K, V], eviction bool) { + slot := &b.slots[b.location(insertion.key)] + + if !slot.occupied { + slot.entry = insertion + slot.occupied = true + b.size++ + return + } + + if b.compare(slot.key, insertion.key) { + slot.value = insertion.value + return + } + + insertion, slot.entry = slot.entry, insertion + return insertion, true +} + +func newBucket[K, V any](capacity uint64, hash Hash[K], compare EqualFunc[K]) bucket[K, V] { + return bucket[K, V]{ + hash: hash, + capacity: capacity, + compare: compare, + size: 0, + slots: make([]slot[K, V], capacity), + } +} diff --git a/compare.go b/compare.go new file mode 100644 index 0000000..57f8b47 --- /dev/null +++ b/compare.go @@ -0,0 +1,17 @@ +package cuckoo + +// An EqualFunc determines whether two keys are 'equal'.
Keys that are 'equal' +// are treated as the same by the [Table]. A good EqualFunc is pure, +// deterministic, and fast. By default, [NewTable] uses [DefaultEqualFunc]. +// +// This function MUST NOT return true if the [Hash] digests of two keys +// are different: the [Table] will not work. +type EqualFunc[K any] = func(a, b K) bool + +// DefaultEqualFunc compares two keys by strict equality. Returns true if the +// keys have [equal values]. +// +// [equal values]: https://go.dev/ref/spec#Comparison_operators +func DefaultEqualFunc[K comparable](a, b K) bool { + return a == b +} diff --git a/compare_example_test.go b/compare_example_test.go new file mode 100644 index 0000000..363bab7 --- /dev/null +++ b/compare_example_test.go @@ -0,0 +1,37 @@ +package cuckoo_test + +import ( + "fmt" + + "git.maximhutz.com/tools/go-cuckoo" +) + +// This example demonstrates what happens when EqualFunc and Hash disagree on +// equality. Although 'isEqual' only compares user IDs, the hashes use the +// entire 'User' object. So, two objects with the same ID but different names +// hash to different slots, so the table cannot find them. +func ExampleEqualFunc_badEqualFunc() { + type User struct{ ID, Name string } + + // Two users with the same ID are equal. + isEqual := func(a, b User) bool { return a.ID == b.ID } + + hashA, hashB := cuckoo.NewDefaultHash[User](), cuckoo.NewDefaultHash[User]() + userbase := cuckoo.NewCustomTable[User, bool](hashA, hashB, isEqual) + + (userbase.Put(User{"1", "Robert Doe"}, true)) + + fmt.Println("Has Robert?", userbase.Has(User{"1", "Robert Doe"})) + fmt.Println("Has Johanna?", userbase.Has(User{"2", "Johanna Smith"})) + + // The hashes are different, so even though the equal function returns true, + // the table does not recognize it. + fmt.Println("Equal?", isEqual(User{"1", "Rob Doe"}, User{"1", "Robert Doe"})) + fmt.Println("Has Rob?", userbase.Has(User{"1", "Rob Doe"})) + + // Output: + // Has Robert? true + // Has Johanna? false + // Equal?
true + // Has Rob? false +} diff --git a/cuckoo_fuzz_test.go b/cuckoo_fuzz_test.go new file mode 100644 index 0000000..22cf07b --- /dev/null +++ b/cuckoo_fuzz_test.go @@ -0,0 +1,49 @@ +package cuckoo_test + +import ( + "bytes" + "encoding/binary" + "testing" + + "github.com/stretchr/testify/assert" + + "git.maximhutz.com/tools/go-cuckoo" +) + +func offsetHash(seed uint32) cuckoo.Hash[uint32] { + return func(x uint32) uint64 { + v := uint64(x) ^ uint64(seed) + v = (v ^ (v >> 30)) * 0xbf58476d1ce4e5b9 + v = (v ^ (v >> 27)) * 0x94d049bb133111eb + return v ^ (v >> 31) + } +} + +func FuzzInsertLookup(f *testing.F) { + f.Fuzz(func(t *testing.T, data []byte, seedA, seedB uint32) { + assert := assert.New(t) + + table := cuckoo.NewCustomTable[uint32, uint32]( + offsetHash(seedA), + offsetHash(seedB), + func(a, b uint32) bool { return a == b }, + ) + + if seedA == seedB { + return + } + + r := bytes.NewReader(data) + var key, value uint32 + for binary.Read(r, binary.LittleEndian, &key) == nil && + binary.Read(r, binary.LittleEndian, &value) == nil { + + err := table.Put(key, value) + assert.NoError(err) + + found, err := table.Get(key) + assert.NoError(err) + assert.Equal(value, found) + } + }) +} diff --git a/cuckoo_internal_test.go b/cuckoo_internal_test.go new file mode 100644 index 0000000..fde14ee --- /dev/null +++ b/cuckoo_internal_test.go @@ -0,0 +1,30 @@ +package cuckoo + +import ( + "math" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMaxEvictions(t *testing.T) { + assert := assert.New(t) + + for i := 16; i < 116; i++ { + table := NewTable[int, bool](Capacity(i / 2)) + expectedEvictions := 3 * math.Floor(math.Log2(float64(i))) + + assert.Equal(table.maxEvictions(), int(expectedEvictions)) + } +} + +func TestLoad(t *testing.T) { + assert := assert.New(t) + table := NewTable[int, bool](Capacity(8)) + + for i := range 16 { + err := table.Put(i, true) + assert.NoError(err) + assert.Equal(float64(table.Size())/float64(table.Capacity()), 
table.load()) + } +} diff --git a/cuckoo_test.go b/cuckoo_test.go new file mode 100644 index 0000000..3f9c7ec --- /dev/null +++ b/cuckoo_test.go @@ -0,0 +1,130 @@ +package cuckoo_test + +import ( + "maps" + "math/rand/v2" + "testing" + + "github.com/stretchr/testify/assert" + + "git.maximhutz.com/tools/go-cuckoo" +) + +func TestNewTable(t *testing.T) { + assert := assert.New(t) + + table := cuckoo.NewTable[int, bool]() + + assert.NotNil(table) + assert.Zero(table.Size()) +} + +func TestAddItem(t *testing.T) { + assert := assert.New(t) + key, value := 0, true + table := cuckoo.NewTable[int, bool]() + + err := table.Put(key, value) + + assert.NoError(err) + assert.Equal(1, table.Size()) + assert.True(table.Has(key)) +} + +func TestPutOverwrite(t *testing.T) { + assert := assert.New(t) + key, value, newValue := 0, 1, 2 + table := cuckoo.NewTable[int, int]() + (table.Put(key, value)) + + err := table.Put(key, newValue) + + assert.NoError(err) + assert.Equal(1, table.Size()) + assert.True(table.Has(key)) + found, _ := table.Get(key) + assert.Equal(newValue, found) +} + +func TestSameHash(t *testing.T) { + assert := assert.New(t) + hash := func(int) uint64 { return 0 } + table := cuckoo.NewCustomTable[int, bool](hash, hash, cuckoo.DefaultEqualFunc[int]) + + errA := table.Put(0, true) + errB := table.Put(1, true) + errC := table.Put(2, true) + + assert.NoError(errA) + assert.NoError(errB) + assert.ErrorContains(errC, "bad hash") +} + +func TestStartingCapacity(t *testing.T) { + assert := assert.New(t) + table := cuckoo.NewTable[int, bool](cuckoo.Capacity(64)) + + assert.Equal(uint64(128), table.Capacity()) +} + +func TestResizeCapacity(t *testing.T) { + assert := assert.New(t) + table := cuckoo.NewTable[int, bool]( + cuckoo.Capacity(8), + cuckoo.GrowthFactor(2), + ) + + for table.Capacity() == 16 { + err := table.Put(rand.Int(), true) + assert.NoError(err) + } + + assert.Equal(uint64(32), table.Capacity()) +} + +func TestPutMany(t *testing.T) { + assert := assert.New(t) + 
expected, actual := map[int]bool{}, cuckoo.NewTable[int, bool]() + + for i := range 1_000 { + expected[i] = true + err := actual.Put(i, true) + + assert.NoError(err) + } + + assert.Equal(maps.Collect(actual.Entries()), expected) + assert.Equal(len(expected), actual.Size()) +} + +func TestGetMany(t *testing.T) { + assert := assert.New(t) + table := cuckoo.NewTable[int, bool]() + + for i := range 1_000 { + err := table.Put(i, true) + assert.NoError(err) + } + + for i := range 2_000 { + value, err := table.Get(i) + if i < 1_000 { + assert.NoError(err) + assert.Equal(value, true) + } else { + assert.Error(err) + } + } +} + +func TestRemove(t *testing.T) { + assert := assert.New(t) + table := cuckoo.NewTable[int, bool]() + + assert.False(table.Has(0)) + + err := table.Put(0, true) + assert.NoError(err) + + assert.True(table.Has(0)) +} diff --git a/doc.go b/doc.go new file mode 100644 index 0000000..11f8cda --- /dev/null +++ b/doc.go @@ -0,0 +1,9 @@ +// Package cuckoo provides a hash table that uses cuckoo hashing to achieve +// a worst-case O(1) lookup time. +// +// While a [NewTable] only supports comparable keys by default, you can create +// a table with any key type using [NewCustomTable]. Custom [Hash] functions and +// key comparison are also supported. 
+// +// See more: https://en.wikipedia.org/wiki/Cuckoo_hashing +package cuckoo diff --git a/doc_example_test.go b/doc_example_test.go new file mode 100644 index 0000000..1fad4ff --- /dev/null +++ b/doc_example_test.go @@ -0,0 +1,32 @@ +// This example +package cuckoo_test + +import ( + "fmt" + + "git.maximhutz.com/tools/go-cuckoo" +) + +func Example_basic() { + table := cuckoo.NewTable[int, string]() + + if err := table.Put(1, "Hello, World!"); err != nil { + fmt.Println("Put error:", err) + } + + if item, err := table.Get(1); err != nil { + fmt.Println("Error:", err) + } else { + fmt.Println("Found 1:", item) + } + + if item, err := table.Get(0); err != nil { + fmt.Println("Error:", err) + } else { + fmt.Println("Found 0:", item) + } + + // Output: + // Found 1: Hello, World! + // Error: key '0' not found +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..492bb0f --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module git.maximhutz.com/tools/go-cuckoo + +go 1.25.6 + +require github.com/stretchr/testify v1.11.1 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..cc8b3f4 --- /dev/null +++ b/go.sum @@ -0,0 +1,9 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 
h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/hash.go b/hash.go new file mode 100644 index 0000000..a83fe55 --- /dev/null +++ b/hash.go @@ -0,0 +1,27 @@ +package cuckoo + +import ( + "hash/maphash" +) + +// A Hash function maps any data to a fixed-length value (in this case, a +// [uint64]). +// +// It is used by the [Table] to evenly distribute values +// amongst its slots. A good hash function is uniform, [chaotic], and +// deterministic. [Table] uses [NewDefaultHash] by default, which is built on +// [maphash.Comparable]. +// +// [chaotic]: https://en.wikipedia.org/wiki/Avalanche_effect +type Hash[K any] = func(key K) (digest uint64) + +// NewDefaultHash returns a new [Hash] which uses [maphash.Comparable]. +// +// Each hash has a random seed, so calling this function again will return a new +// hash. Do not use this for testing. +func NewDefaultHash[K comparable]() Hash[K] { + seed := maphash.MakeSeed() + return func(key K) (digest uint64) { + return maphash.Comparable(seed, key) + } +} diff --git a/hash_example_test.go b/hash_example_test.go new file mode 100644 index 0000000..03c5e2e --- /dev/null +++ b/hash_example_test.go @@ -0,0 +1,34 @@ +package cuckoo_test + +import ( + "crypto/hmac" + "crypto/sha256" + "encoding/binary" + "fmt" + "io" + + "git.maximhutz.com/tools/go-cuckoo" +) + +func NewStringHash(seed uint64) cuckoo.Hash[string] { + key := binary.LittleEndian.AppendUint64(nil, seed) + hash := hmac.New(sha256.New, key) + + return func(key string) uint64 { + hash.Reset() + // This will never return an error, as part of the [hash.Hash] contract. We + // can safely ignore it. 
+ (io.WriteString(hash, key)) + return binary.LittleEndian.Uint64(hash.Sum(nil)) + } +} + +func ExampleHash_stringHash() { + hash := NewStringHash(1) + fmt.Printf("Digest 'Hello, world!': %x\n", hash("Hello, world!")) + fmt.Printf("Digest 'Hello, world?': %x\n", hash("Hello, world?")) + + // Output: + // Digest 'Hello, world!': dc6602d6edcdf549 + // Digest 'Hello, world?': 432c2ddc1ae9f14b +} diff --git a/settings.go b/settings.go new file mode 100644 index 0000000..c96ee45 --- /dev/null +++ b/settings.go @@ -0,0 +1,45 @@ +package cuckoo + +// DefaultCapacity is the initial capacity of each of the two buckets of a +// [Table]. It is inspired by Java's [HashMap], which also uses 16. +// +// [HashMap]: https://docs.oracle.com/javase/8/docs/api/java/util/HashMap.html#HashMap-- +const DefaultCapacity uint64 = 16 + +// DefaultGrowthFactor is the standard resize multiplier for a [Table]. Most +// hash table implementations use 2. +const DefaultGrowthFactor uint64 = 2 + +// DefaultMinimumLoad is the default lowest acceptable occupancy of a [Table]. +// The value of 5% is taken from [libcuckoo]. +// +// [libcuckoo]: https://github.com/efficient/libcuckoo/blob/656714705a055df2b7a605eb3c71586d9da1e119/libcuckoo/cuckoohash_config.hh#L21 +const DefaultMinimumLoad float64 = 0.05 + +type settings struct { + growthFactor uint64 + minLoadFactor float64 + bucketSize uint64 +} + +// An Option modifies the settings of a [Table]. It is used in its constructors +// like [NewTable], for example. +type Option func(*settings) + +// Capacity modifies the starting capacity of each bucket of the [Table]. The +// value must be greater than 0. +func Capacity(value int) Option { + return func(s *settings) { s.bucketSize = uint64(value) } +} + +// MinimumLoad modifies the [DefaultMinimumLoad] of the [Table]. The value must +// be between 0.00 and 1.00.
+func MinimumLoad(value float64) Option { + return func(s *settings) { s.minLoadFactor = value } +} + +// GrowthFactor controls how much the capacity of the [Table] multiplies when +// it must resize. The value must be greater than 1. +func GrowthFactor(value int) Option { + return func(s *settings) { s.growthFactor = uint64(value) } +} diff --git a/table.go b/table.go new file mode 100644 index 0000000..78ef2ae --- /dev/null +++ b/table.go @@ -0,0 +1,203 @@ +package cuckoo + +import ( + "fmt" + "iter" + "math/bits" + "strings" +) + +// A Table is a hash table that uses cuckoo hashing to resolve collisions. +// Create one with [NewTable]. Or if you want more granularity, use +// [NewTableBy] or [NewCustomTable]. +type Table[K, V any] struct { + bucketA, bucketB bucket[K, V] + growthFactor uint64 + minLoadFactor float64 +} + +// Capacity returns the number of slots allocated for the [Table]. To get the +// number of slots filled, look at [Table.Size]. +func (t Table[K, V]) Capacity() uint64 { + return t.bucketA.capacity + t.bucketB.capacity +} + +// Size returns how many slots are filled in the [Table]. +func (t Table[K, V]) Size() int { + return int(t.bucketA.size + t.bucketB.size) +} + +func log2(n uint64) (m int) { + return bits.Len64(n) - 1 +} + +func (t Table[K, V]) maxEvictions() int { + return 3 * log2(t.Capacity()) +} + +func (t Table[K, V]) load() float64 { + return float64(t.Size()) / float64(t.Capacity()) +} + +func (t *Table[K, V]) resize() error { + entries := make([]entry[K, V], 0, t.Size()) + for k, v := range t.Entries() { + entries = append(entries, entry[K, V]{k, v}) + } + + t.bucketA.resize(t.growthFactor * t.bucketA.capacity) + t.bucketB.resize(t.growthFactor * t.bucketB.capacity) + + for _, entry := range entries { + if err := t.Put(entry.key, entry.value); err != nil { + return err + } + } + + return nil +} + +// Get fetches the value for a key in the [Table]. Returns an error if no value +// is found.
+func (t Table[K, V]) Get(key K) (value V, err error) { + if item, ok := t.bucketA.get(key); ok { + return item, nil + } + + if item, ok := t.bucketB.get(key); ok { + return item, nil + } + + return value, fmt.Errorf("key '%v' not found", key) +} + +// Has returns true if a key has a value in the table. +func (t Table[K, V]) Has(key K) (exists bool) { + _, err := t.Get(key) + return err == nil +} + +// Put sets the value for a key. Returns an error if its value cannot be set. +func (t *Table[K, V]) Put(key K, value V) (err error) { + if t.bucketA.update(key, value) { + return nil + } + + if t.bucketB.update(key, value) { + return nil + } + + entry, eviction := entry[K, V]{key, value}, false + for range t.maxEvictions() { + if entry, eviction = t.bucketA.evict(entry); !eviction { + return nil + } + + if entry, eviction = t.bucketB.evict(entry); !eviction { + return nil + } + } + + if t.load() < t.minLoadFactor { + return fmt.Errorf("bad hash: resize on load %d/%d = %f", t.Size(), t.Capacity(), t.load()) + } + + if err := t.resize(); err != nil { + return err + } + + return t.Put(entry.key, entry.value) +} + +// Drop is intended to remove the value for a key in the table. It is not +// implemented and always panics. +// +// Deprecated: Do not use. +func (t Table[K, V]) Drop(_ K) { + panic("Not implemented") +} + +// Entries returns an unordered sequence of all key-value pairs in the table. +func (t Table[K, V]) Entries() iter.Seq2[K, V] { + return func(yield func(K, V) bool) { + for _, slot := range t.bucketA.slots { + if slot.occupied { + if !yield(slot.key, slot.value) { + return + } + } + } + + for _, slot := range t.bucketB.slots { + if slot.occupied { + if !yield(slot.key, slot.value) { + return + } + } + } + } +} + +// String returns the entries of the table as a string in the format: +// "table[k1:v1 k2:v2 ...]".
+func (t Table[K, V]) String() string { + var sb strings.Builder + sb.WriteString("table[") + + first := true + for k, v := range t.Entries() { + if !first { + sb.WriteString(" ") + } + + fmt.Fprintf(&sb, "%v:%v", k, v) + first = false + } + + sb.WriteString("]") + return sb.String() +} + +// NewCustomTable creates a [Table] with custom [Hash] and [EqualFunc] +// functions, along with any [Option] the user provides. +func NewCustomTable[K, V any](hashA, hashB Hash[K], compare EqualFunc[K], options ...Option) *Table[K, V] { + settings := &settings{ + growthFactor: DefaultGrowthFactor, + bucketSize: DefaultCapacity, + minLoadFactor: DefaultMinimumLoad, + } + + for _, option := range options { + option(settings) + } + + return &Table[K, V]{ + growthFactor: settings.growthFactor, + minLoadFactor: settings.minLoadFactor, + bucketA: newBucket[K, V](settings.bucketSize, hashA, compare), + bucketB: newBucket[K, V](settings.bucketSize, hashB, compare), + } +} + +func pipe[X, Y, Z any](a func(X) Y, b func(Y) Z) func(X) Z { + return func(x X) Z { return b(a(x)) } +} + +// NewTableBy creates a [Table] for any key type by using keyFunc to derive a +// comparable key. Two keys with the same derived key are treated as equal. +func NewTableBy[K, V any, C comparable](keyFunc func(K) C, options ...Option) *Table[K, V] { + return NewCustomTable[K, V]( + pipe(keyFunc, NewDefaultHash[C]()), + pipe(keyFunc, NewDefaultHash[C]()), + func(a, b K) bool { return keyFunc(a) == keyFunc(b) }, + options..., + ) +} + +// NewTable creates a [Table] using the default [Hash] and [EqualFunc]. Use +// the [Option] functions to configure its behavior. Note that this constructor +// is only provided for comparable keys. For arbitrary keys, consider +// [NewTableBy] or [NewCustomTable]. +func NewTable[K comparable, V any](options ...Option) *Table[K, V] { + return NewCustomTable[K, V](NewDefaultHash[K](), NewDefaultHash[K](), DefaultEqualFunc[K], options...) +} -- 2.49.1