diff --git a/cuckoo_fuzz_test.go b/cuckoo_fuzz_test.go index 89e4b48..a6a5672 100644 --- a/cuckoo_fuzz_test.go +++ b/cuckoo_fuzz_test.go @@ -68,12 +68,13 @@ func FuzzInsertLookup(f *testing.F) { for _, step := range scenario.steps { if step.drop { - err := actual.Drop(step.key) - assert.NoError(err) + ok := actual.Drop(step.key) + _, has := expected[step.key] + assert.Equal(ok, has) delete(expected, step.key) - _, ok := actual.Get(step.key) + _, ok = actual.Get(step.key) assert.False(ok) } else { err := actual.Put(step.key, step.value) diff --git a/cuckoo_test.go b/cuckoo_test.go index e9df9f9..08394ad 100644 --- a/cuckoo_test.go +++ b/cuckoo_test.go @@ -124,9 +124,9 @@ func TestDropExistingItem(t *testing.T) { table := cuckoo.New[int, bool]() (table.Put(key, value)) - err := table.Drop(key) + had := table.Drop(key) - assert.NoError(err) + assert.True(had) assert.Equal(0, table.Size()) assert.False(table.Has(key)) } @@ -136,9 +136,9 @@ func TestDropNoItem(t *testing.T) { key := 0 table := cuckoo.New[int, bool]() - err := table.Drop(key) + had := table.Drop(key) - assert.NoError(err) + assert.False(had) assert.Equal(0, table.Size()) assert.False(table.Has(key)) } @@ -152,10 +152,9 @@ func TestDropItemCapacity(t *testing.T) { ) startingCapacity := table.TotalCapacity() - err := table.Drop(key) + table.Drop(key) endingCapacity := table.TotalCapacity() - assert.NoError(err) assert.Equal(0, table.Size()) assert.Equal(uint64(128), startingCapacity) assert.Equal(uint64(64), endingCapacity) @@ -203,9 +202,9 @@ func TestDropResizeCapacity(t *testing.T) { err1 := table.Put(0, true) err2 := table.Put(1, true) - err3 := table.Drop(1) + table.Drop(1) - assert.NoError(errors.Join(err1, err2, err3)) + assert.NoError(errors.Join(err1, err2)) assert.Equal(uint64(20), table.TotalCapacity()) } diff --git a/doc.go b/doc.go index 3be8bfc..2d1f62f 100644 --- a/doc.go +++ b/doc.go @@ -5,5 +5,8 @@ // a table with any key type using [NewCustom]. 
Custom [Hash] functions and // key comparison are also supported. // +// NOTE: The [Table] is a look-up structure, and not a source of truth. If +// [ErrBadHash] occurs, the data cannot be restored. +// // See more: https://en.wikipedia.org/wiki/Cuckoo_hashing package cuckoo diff --git a/settings.go b/settings.go index 97c6e0a..a527e76 100644 --- a/settings.go +++ b/settings.go @@ -19,6 +19,11 @@ const DefaultGrowthFactor uint64 = 2 // [libcuckoo]: https://github.com/efficient/libcuckoo/blob/656714705a055df2b7a605eb3c71586d9da1e119/libcuckoo/cuckoohash_config.hh#L21 const defaultMinimumLoad float64 = 0.05 +// defaultGrowthLimit is the maximum number of times a [Table] can grow in a +// single [Table.Put], before the library infers it will lead to a stack +// overflow. The value of '64' was chosen arbitrarily. +const defaultGrowthLimit uint64 = 64 + type settings struct { growthFactor uint64 minLoadFactor float64 diff --git a/subtable.go b/subtable.go index 343fe43..e6d591a 100644 --- a/subtable.go +++ b/subtable.go @@ -1,5 +1,6 @@ package cuckoo +// An entry is a key-value pair. 
type entry[K, V any] struct { key K value V @@ -48,10 +49,13 @@ func (t *subtable[K, V]) drop(key K) (occupied bool) { return false } -func (t *subtable[K, V]) resize(capacity uint64) { - t.slots = make([]slot[K, V], capacity) - t.capacity = capacity - t.size = 0 +func (t *subtable[K, V]) resized(capacity uint64) *subtable[K, V] { + return &subtable[K, V]{ + slots: make([]slot[K, V], capacity), + capacity: capacity, + hash: t.hash, + compare: t.compare, + } } func (t *subtable[K, V]) update(key K, value V) (updated bool) { @@ -69,7 +73,7 @@ func (t *subtable[K, V]) update(key K, value V) (updated bool) { return false } -func (t *subtable[K, V]) evict(insertion entry[K, V]) (evicted entry[K, V], eviction bool) { +func (t *subtable[K, V]) insert(insertion entry[K, V]) (evicted entry[K, V], eviction bool) { if t.capacity == 0 { return insertion, true } @@ -92,8 +96,8 @@ func (t *subtable[K, V]) evict(insertion entry[K, V]) (evicted entry[K, V], evic return insertion, true } -func newSubtable[K, V any](capacity uint64, hash Hash[K], compare EqualFunc[K]) subtable[K, V] { - return subtable[K, V]{ +func newSubtable[K, V any](capacity uint64, hash Hash[K], compare EqualFunc[K]) *subtable[K, V] { + return &subtable[K, V]{ hash: hash, capacity: capacity, compare: compare, diff --git a/table.go b/table.go index c9dcf9f..b0c6d78 100644 --- a/table.go +++ b/table.go @@ -9,7 +9,7 @@ import ( ) // ErrBadHash occurs when the hashes given to a [Table] cause too many key -// collisions. Try rebuilding the table using: +// collisions. Discard the old table, rebuild it from your source data, and try: // // 1. Different hash seeds. Equal seeds produce equal hash functions, which // always cycle. @@ -20,7 +20,7 @@ var ErrBadHash = errors.New("bad hash") // one with [New]. Or if you want more granularity, use [NewBy] or // [NewCustom]. 
type Table[K, V any] struct { - tableA, tableB subtable[K, V] + tableA, tableB *subtable[K, V] growthFactor uint64 minLoadFactor float64 } @@ -54,30 +54,61 @@ func (t *Table[K, V]) load() float64 { return float64(t.Size()) / float64(t.TotalCapacity()) } -// resize clears all tables, changes the sizes of them to a specific capacity, -// and fills them back up again. It is a helper function for [Table.grow] and -// [Table.shrink]; use them instead. -func (t *Table[K, V]) resize(capacity uint64) error { - entries := make([]entry[K, V], 0, t.Size()) - for k, v := range t.Entries() { - entries = append(entries, entry[K, V]{k, v}) +// insert attempts to put/update an entry in the table, without modifying the +// size of the table. Returns a displaced entry and 'homeless = true' if an +// entry could not be placed after exhausting evictions. +func (t *Table[K, V]) insert(entry entry[K, V]) (displaced entry[K, V], homeless bool) { + if t.tableA.update(entry.key, entry.value) { + return } - t.tableA.resize(capacity) - t.tableB.resize(capacity) + if t.tableB.update(entry.key, entry.value) { + return + } - for _, entry := range entries { - if err := t.Put(entry.key, entry.value); err != nil { - return err + for range t.maxEvictions() { + if entry, homeless = t.tableA.insert(entry); !homeless { + return + } + + if entry, homeless = t.tableB.insert(entry); !homeless { + return } } - return nil + return entry, true +} + +// resized creates an empty copy of the table, with a new capacity for each +// bucket. +func (t *Table[K, V]) resized(capacity uint64) *Table[K, V] { + return &Table[K, V]{ + growthFactor: t.growthFactor, + minLoadFactor: t.minLoadFactor, + tableA: t.tableA.resized(capacity), + tableB: t.tableB.resized(capacity), + } +} + +// resize creates a new [Table.resized] with 'capacity', inserts all items into +// the array, and replaces the current table. It is a helper function for +// [Table.grow] and [Table.shrink]; use them instead. 
+func (t *Table[K, V]) resize(capacity uint64) bool { + updated := t.resized(capacity) + + for k, v := range t.Entries() { + if _, failed := updated.insert(entry[K, V]{k, v}); failed { + return false + } + } + + *t = *updated + return true } // grow increases the table's capacity by the growth factor. If the // capacity is 0, it increases it to 1. -func (t *Table[K, V]) grow() error { +func (t *Table[K, V]) grow() bool { var newCapacity uint64 if t.TotalCapacity() == 0 { @@ -91,7 +122,7 @@ func (t *Table[K, V]) grow() error { // shrink reduces the table's capacity by the growth factor. It may // reduce it down to 0. -func (t *Table[K, V]) shrink() error { +func (t *Table[K, V]) shrink() bool { return t.resize(t.tableA.capacity / t.growthFactor) } @@ -122,49 +153,48 @@ func (t *Table[K, V]) Has(key K) (exists bool) { return } -// Put sets the value for a key. Returns error if its value cannot be set. +// Put sets the value for a key. If it cannot be set, an error is returned. func (t *Table[K, V]) Put(key K, value V) (err error) { - if t.tableA.update(key, value) { - return nil - } + var ( + entry = entry[K, V]{key, value} + homeless bool + ) - if t.tableB.update(key, value) { - return nil - } - - entry, eviction := entry[K, V]{key, value}, false - for range t.maxEvictions() { - if entry, eviction = t.tableA.evict(entry); !eviction { - return nil + for range defaultGrowthLimit { + if entry, homeless = t.insert(entry); !homeless { + return } - if entry, eviction = t.tableB.evict(entry); !eviction { - return nil + // Both this and the growth limit are necessary: this catches bad hashes + // early when the table is sparse, while the latter catches cases where + // growing never helps. + if t.load() < t.minLoadFactor { + return fmt.Errorf("hash functions produced a cycle at load %d/%d: %w", t.Size(), t.TotalCapacity(), ErrBadHash) + } + + // It is theoretically possible to have a table with a larger capacity + // that is valid. 
But this chance is astronomically small, so we ignore + // it in this implementation. + if grew := t.grow(); !grew { + return fmt.Errorf("could not redistribute entries into larger table: %w", ErrBadHash) } } - if t.load() < t.minLoadFactor { - return fmt.Errorf("hash functions produced a cycle at load %d/%d: %w", t.Size(), t.TotalCapacity(), ErrBadHash) - } - - if err := t.grow(); err != nil { - return err - } - - return t.Put(entry.key, entry.value) + return fmt.Errorf("could not place entry after %d resizes: %w", defaultGrowthLimit, ErrBadHash) } -// Drop removes a value for a key in the table. Returns an error if its value -// cannot be removed. -func (t *Table[K, V]) Drop(key K) (err error) { - t.tableA.drop(key) - t.tableB.drop(key) +// Drop removes a value for a key in the table. Returns whether the key had +// existed. +func (t *Table[K, V]) Drop(key K) bool { + occupied := t.tableA.drop(key) || t.tableB.drop(key) if t.load() < t.minLoadFactor { - return t.shrink() + // The result is ignored here, because table-shrinking is an internal + // optimization. + t.shrink() } - return nil + return occupied } // Entries returns an unordered sequence of all key-value pairs in the table.