Efficient Key-Value Storage: A Comprehensive Guide to Hash Tables in C

Hash tables are one of the most fundamental and powerful data structures in computer science, providing average O(1) time complexity for insertions, deletions, and lookups. In C, implementing hash tables requires careful attention to memory management, collision handling, and performance optimization. This comprehensive guide covers everything from basic concepts to advanced implementations.

What is a Hash Table?

A hash table is a data structure that maps keys to values using a hash function. The hash function computes an index into an array of buckets, from which the desired value can be found.

Key: "John" → [Hash Function] → Index: 3 → [Bucket 3] → Value: 25
Key: "Alice" → [Hash Function] → Index: 7 → [Bucket 7] → Value: 30

Core Components

  1. Hash Function: Converts a key into an array index
  2. Array (Buckets): Stores the key-value pairs
  3. Collision Resolution: Handles when two keys hash to the same index
  4. Load Factor: Ratio of elements to bucket count

Basic Hash Table Structure

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
// Entry structure for key-value pair
typedef struct HashEntry {
char* key;
int value;
struct HashEntry* next;  // For chaining
} HashEntry;
// Main hash table structure
typedef struct {
HashEntry** buckets;
int size;          // Number of buckets
int count;         // Number of entries
float load_factor; // Current load factor
} HashTable;

Hash Functions

1. Simple String Hash (djb2 by Dan Bernstein)

// djb2 hash function - simple and effective for strings
unsigned long hash_djb2(const char* str) {
unsigned long hash = 5381;
int c;
while ((c = *str++)) {
hash = ((hash << 5) + hash) + c; // hash * 33 + c
}
return hash;
}
// sdbm hash function - good for strings
unsigned long hash_sdbm(const char* str) {
unsigned long hash = 0;
int c;
while ((c = *str++)) {
hash = c + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
// FNV-1a hash - good for general use
unsigned long hash_fnv1a(const char* str) {
unsigned long hash = 2166136261u;
int c;
while ((c = *str++)) {
hash ^= c;
hash *= 16777619;
}
return hash;
}

2. Integer Hash Functions

// Thomas Wang's integer hash
unsigned int hash_int(unsigned int key) {
key = ~key + (key << 15);
key = key ^ (key >> 12);
key = key + (key << 2);
key = key ^ (key >> 4);
key = key * 2057;
key = key ^ (key >> 16);
return key;
}
// Simple modulo hash for integers
unsigned int hash_int_simple(int key, int table_size) {
return abs(key) % table_size;
}
// Knuth's multiplicative hash
unsigned int hash_int_knuth(int key, int table_size) {
const double A = 0.6180339887; // (sqrt(5)-1)/2
double fractional = (key * A) - (int)(key * A);
return (int)(fractional * table_size);
}

3. Generic Hash Function Wrapper

typedef enum {
HASH_DJB2,
HASH_SDBM,
HASH_FNV1A,
HASH_INT
} HashType;
unsigned long hash_generic(const void* key, HashType type, int size) {
switch (type) {
case HASH_DJB2:
return hash_djb2((const char*)key) % size;
case HASH_SDBM:
return hash_sdbm((const char*)key) % size;
case HASH_FNV1A:
return hash_fnv1a((const char*)key) % size;
case HASH_INT:
return hash_int(*(int*)key) % size;
default:
return 0;
}
}

Collision Resolution: Chaining

Chaining uses linked lists to handle collisions:

// Create a new hash entry
HashEntry* create_entry(const char* key, int value) {
HashEntry* entry = (HashEntry*)malloc(sizeof(HashEntry));
entry->key = (char*)malloc(strlen(key) + 1);
strcpy(entry->key, key);
entry->value = value;
entry->next = NULL;
return entry;
}
// Initialize hash table
HashTable* create_table(int size) {
HashTable* table = (HashTable*)malloc(sizeof(HashTable));
table->size = size;
table->count = 0;
table->load_factor = 0;
table->buckets = (HashEntry**)calloc(size, sizeof(HashEntry*));
// Initialize all buckets to NULL
for (int i = 0; i < size; i++) {
table->buckets[i] = NULL;
}
return table;
}
// Insert with chaining
void insert_chaining(HashTable* table, const char* key, int value) {
unsigned long index = hash_djb2(key) % table->size;
// Check if key already exists
HashEntry* current = table->buckets[index];
while (current != NULL) {
if (strcmp(current->key, key) == 0) {
current->value = value; // Update existing
return;
}
current = current->next;
}
// Create new entry and insert at beginning
HashEntry* new_entry = create_entry(key, value);
new_entry->next = table->buckets[index];
table->buckets[index] = new_entry;
table->count++;
// Update load factor
table->load_factor = (float)table->count / table->size;
}
// Search with chaining
int search_chaining(HashTable* table, const char* key, int* found) {
unsigned long index = hash_djb2(key) % table->size;
HashEntry* current = table->buckets[index];
while (current != NULL) {
if (strcmp(current->key, key) == 0) {
*found = 1;
return current->value;
}
current = current->next;
}
*found = 0;
return 0;
}
// Delete with chaining
int delete_chaining(HashTable* table, const char* key) {
unsigned long index = hash_djb2(key) % table->size;
HashEntry* current = table->buckets[index];
HashEntry* prev = NULL;
while (current != NULL) {
if (strcmp(current->key, key) == 0) {
if (prev == NULL) {
// First node in bucket
table->buckets[index] = current->next;
} else {
prev->next = current->next;
}
free(current->key);
free(current);
table->count--;
table->load_factor = (float)table->count / table->size;
return 1; // Success
}
prev = current;
current = current->next;
}
return 0; // Not found
}

Collision Resolution: Open Addressing

1. Linear Probing

typedef struct {
char* key;
int value;
int occupied;  // 1 if occupied, 0 if empty
int deleted;   // 1 if deleted (tombstone)
} OpenEntry;
typedef struct {
OpenEntry* entries;
int size;
int count;
int deleted_count;
} OpenHashTable;
OpenHashTable* create_open_table(int size) {
OpenHashTable* table = (OpenHashTable*)malloc(sizeof(OpenHashTable));
table->size = size;
table->count = 0;
table->deleted_count = 0;
table->entries = (OpenEntry*)calloc(size, sizeof(OpenEntry));
for (int i = 0; i < size; i++) {
table->entries[i].occupied = 0;
table->entries[i].deleted = 0;
table->entries[i].key = NULL;
}
return table;
}
// Linear probing insertion
void insert_linear(OpenHashTable* table, const char* key, int value) {
unsigned long index = hash_djb2(key) % table->size;
unsigned long original_index = index;
// Linear probing
while (table->entries[index].occupied && 
!table->entries[index].deleted &&
strcmp(table->entries[index].key, key) != 0) {
index = (index + 1) % table->size;
if (index == original_index) {
printf("Table is full!\n");
return;
}
}
// If updating existing key
if (table->entries[index].occupied && 
strcmp(table->entries[index].key, key) == 0) {
table->entries[index].value = value;
return;
}
// Insert new entry
if (table->entries[index].occupied) {
free(table->entries[index].key);
}
table->entries[index].key = (char*)malloc(strlen(key) + 1);
strcpy(table->entries[index].key, key);
table->entries[index].value = value;
table->entries[index].occupied = 1;
table->entries[index].deleted = 0;
table->count++;
}
// Linear probing search
int search_linear(OpenHashTable* table, const char* key, int* found) {
unsigned long index = hash_djb2(key) % table->size;
unsigned long original_index = index;
while (table->entries[index].occupied || table->entries[index].deleted) {
if (table->entries[index].occupied && 
!table->entries[index].deleted &&
strcmp(table->entries[index].key, key) == 0) {
*found = 1;
return table->entries[index].value;
}
index = (index + 1) % table->size;
if (index == original_index) {
break;
}
}
*found = 0;
return 0;
}
// Linear probing deletion (with tombstone)
int delete_linear(OpenHashTable* table, const char* key) {
unsigned long index = hash_djb2(key) % table->size;
unsigned long original_index = index;
while (table->entries[index].occupied || table->entries[index].deleted) {
if (table->entries[index].occupied && 
!table->entries[index].deleted &&
strcmp(table->entries[index].key, key) == 0) {
table->entries[index].deleted = 1;
table->count--;
table->deleted_count++;
return 1;
}
index = (index + 1) % table->size;
if (index == original_index) {
break;
}
}
return 0;
}

2. Quadratic Probing

// Quadratic probing insertion
void insert_quadratic(OpenHashTable* table, const char* key, int value) {
unsigned long index = hash_djb2(key) % table->size;
unsigned long original_index = index;
int i = 1;
// Quadratic probing: index + i^2
while (table->entries[index].occupied && 
!table->entries[index].deleted &&
strcmp(table->entries[index].key, key) != 0) {
index = (original_index + i * i) % table->size;
i++;
if (i > table->size) {
printf("Table is full!\n");
return;
}
}
// Similar insertion logic as linear probing
if (table->entries[index].occupied && 
strcmp(table->entries[index].key, key) == 0) {
table->entries[index].value = value;
return;
}
if (table->entries[index].occupied) {
free(table->entries[index].key);
}
table->entries[index].key = (char*)malloc(strlen(key) + 1);
strcpy(table->entries[index].key, key);
table->entries[index].value = value;
table->entries[index].occupied = 1;
table->entries[index].deleted = 0;
table->count++;
}

3. Double Hashing

// Second hash function for double hashing
unsigned long hash2(const char* key, int size) {
// Must be non-zero and relatively prime to size
return 1 + (hash_djb2(key) % (size - 1));
}
// Double hashing insertion
void insert_double(OpenHashTable* table, const char* key, int value) {
unsigned long h1 = hash_djb2(key) % table->size;
unsigned long h2 = hash2(key, table->size);
unsigned long index = h1;
int i = 1;
while (table->entries[index].occupied && 
!table->entries[index].deleted &&
strcmp(table->entries[index].key, key) != 0) {
index = (h1 + i * h2) % table->size;
i++;
if (i > table->size) {
printf("Table is full!\n");
return;
}
}
// Insertion logic (similar to linear probing)
if (table->entries[index].occupied && 
strcmp(table->entries[index].key, key) == 0) {
table->entries[index].value = value;
return;
}
if (table->entries[index].occupied) {
free(table->entries[index].key);
}
table->entries[index].key = (char*)malloc(strlen(key) + 1);
strcpy(table->entries[index].key, key);
table->entries[index].value = value;
table->entries[index].occupied = 1;
table->entries[index].deleted = 0;
table->count++;
}

Dynamic Resizing

// Rehash function for dynamic resizing
void rehash(HashTable* table, int new_size) {
HashEntry** old_buckets = table->buckets;
int old_size = table->size;
// Create new bucket array
table->buckets = (HashEntry**)calloc(new_size, sizeof(HashEntry*));
table->size = new_size;
table->count = 0;
// Reinsert all entries
for (int i = 0; i < old_size; i++) {
HashEntry* entry = old_buckets[i];
while (entry != NULL) {
HashEntry* next = entry->next;
// Rehash into new table
unsigned long new_index = hash_djb2(entry->key) % new_size;
entry->next = table->buckets[new_index];
table->buckets[new_index] = entry;
table->count++;
entry = next;
}
}
free(old_buckets);
table->load_factor = (float)table->count / table->size;
}
// Insert with automatic resizing
void insert_with_resize(HashTable* table, const char* key, int value) {
// Resize if load factor exceeds threshold
if (table->load_factor > 0.75) {
int new_size = table->size * 2;
rehash(table, new_size);
printf("Resized table to %d buckets\n", new_size);
}
// Normal insertion
insert_chaining(table, key, value);
}

Complete Hash Table Implementation

Here's a complete, production-ready hash table implementation:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
// Entry structure
typedef struct HashEntry {
char* key;
int value;
struct HashEntry* next;
} HashEntry;
// Iterator structure
typedef struct {
HashTable* table;
int bucket_index;
HashEntry* current_entry;
} HashIterator;
// Main hash table structure
typedef struct {
HashEntry** buckets;
int size;
int count;
float load_factor;
// Function pointers for customization
unsigned long (*hash_func)(const char*);
void (*free_key)(void*);
} HashTable;
// Default hash function
unsigned long default_hash(const char* key) {
unsigned long hash = 5381;
int c;
while ((c = *key++)) {
hash = ((hash << 5) + hash) + c;
}
return hash;
}
// Create hash table
HashTable* ht_create(int size) {
HashTable* table = (HashTable*)malloc(sizeof(HashTable));
table->size = size;
table->count = 0;
table->load_factor = 0;
table->hash_func = default_hash;
table->free_key = free;
table->buckets = (HashEntry**)calloc(size, sizeof(HashEntry*));
return table;
}
// Create entry
HashEntry* ht_create_entry(const char* key, int value) {
HashEntry* entry = (HashEntry*)malloc(sizeof(HashEntry));
entry->key = (char*)malloc(strlen(key) + 1);
strcpy(entry->key, key);
entry->value = value;
entry->next = NULL;
return entry;
}
// Insert key-value pair
void ht_insert(HashTable* table, const char* key, int value) {
unsigned long index = table->hash_func(key) % table->size;
// Check if key exists
HashEntry* current = table->buckets[index];
while (current) {
if (strcmp(current->key, key) == 0) {
current->value = value;
return;
}
current = current->next;
}
// Insert new entry at beginning
HashEntry* new_entry = ht_create_entry(key, value);
new_entry->next = table->buckets[index];
table->buckets[index] = new_entry;
table->count++;
table->load_factor = (float)table->count / table->size;
}
// Get value by key
bool ht_get(HashTable* table, const char* key, int* value) {
unsigned long index = table->hash_func(key) % table->size;
HashEntry* current = table->buckets[index];
while (current) {
if (strcmp(current->key, key) == 0) {
*value = current->value;
return true;
}
current = current->next;
}
return false;
}
// Check if key exists
bool ht_contains(HashTable* table, const char* key) {
unsigned long index = table->hash_func(key) % table->size;
HashEntry* current = table->buckets[index];
while (current) {
if (strcmp(current->key, key) == 0) {
return true;
}
current = current->next;
}
return false;
}
// Delete key
bool ht_delete(HashTable* table, const char* key) {
unsigned long index = table->hash_func(key) % table->size;
HashEntry* current = table->buckets[index];
HashEntry* prev = NULL;
while (current) {
if (strcmp(current->key, key) == 0) {
if (prev == NULL) {
table->buckets[index] = current->next;
} else {
prev->next = current->next;
}
table->free_key(current->key);
free(current);
table->count--;
table->load_factor = (float)table->count / table->size;
return true;
}
prev = current;
current = current->next;
}
return false;
}
// Get number of entries
int ht_size(HashTable* table) {
return table->count;
}
// Check if table is empty
bool ht_empty(HashTable* table) {
return table->count == 0;
}
// Clear all entries
void ht_clear(HashTable* table) {
for (int i = 0; i < table->size; i++) {
HashEntry* current = table->buckets[i];
while (current) {
HashEntry* next = current->next;
table->free_key(current->key);
free(current);
current = next;
}
table->buckets[i] = NULL;
}
table->count = 0;
table->load_factor = 0;
}
// Destroy table
void ht_destroy(HashTable* table) {
ht_clear(table);
free(table->buckets);
free(table);
}
// Create iterator
HashIterator* ht_iterator_create(HashTable* table) {
HashIterator* it = (HashIterator*)malloc(sizeof(HashIterator));
it->table = table;
it->bucket_index = -1;
it->current_entry = NULL;
return it;
}
// Get next key-value pair
bool ht_iterator_next(HashIterator* it, char** key, int* value) {
// Find next non-empty bucket
if (it->current_entry != NULL) {
it->current_entry = it->current_entry->next;
}
while (it->current_entry == NULL) {
it->bucket_index++;
if (it->bucket_index >= it->table->size) {
return false;
}
it->current_entry = it->table->buckets[it->bucket_index];
}
*key = it->current_entry->key;
*value = it->current_entry->value;
return true;
}
// Destroy iterator
void ht_iterator_destroy(HashIterator* it) {
free(it);
}
// Print table statistics
void ht_print_stats(HashTable* table) {
printf("Hash Table Statistics:\n");
printf("  Size: %d\n", table->size);
printf("  Count: %d\n", table->count);
printf("  Load Factor: %.2f\n", table->load_factor);
// Calculate bucket distribution
int empty_buckets = 0;
int max_chain = 0;
float avg_chain = 0;
for (int i = 0; i < table->size; i++) {
int chain_len = 0;
HashEntry* current = table->buckets[i];
while (current) {
chain_len++;
current = current->next;
}
if (chain_len == 0) {
empty_buckets++;
}
if (chain_len > max_chain) {
max_chain = chain_len;
}
avg_chain += chain_len;
}
avg_chain /= table->size;
printf("  Empty Buckets: %d (%.2f%%)\n", 
empty_buckets, (float)empty_buckets / table->size * 100);
printf("  Max Chain Length: %d\n", max_chain);
printf("  Average Chain Length: %.2f\n", avg_chain);
}

Example Usage

int main() {
// Create hash table
HashTable* table = ht_create(10);
// Insert some values
printf("Inserting values...\n");
ht_insert(table, "John", 25);
ht_insert(table, "Alice", 30);
ht_insert(table, "Bob", 22);
ht_insert(table, "Charlie", 35);
ht_insert(table, "Diana", 28);
// Print stats
ht_print_stats(table);
// Retrieve values
printf("\nRetrieving values:\n");
int value;
if (ht_get(table, "Alice", &value)) {
printf("  Alice: %d\n", value);
}
if (ht_get(table, "Eve", &value)) {
printf("  Eve: %d\n", value);
} else {
printf("  Eve: not found\n");
}
// Update value
printf("\nUpdating John's age...\n");
ht_insert(table, "John", 26);
ht_get(table, "John", &value);
printf("  John: %d\n", value);
// Iterate through all entries
printf("\nAll entries:\n");
HashIterator* it = ht_iterator_create(table);
char* key;
while (ht_iterator_next(it, &key, &value)) {
printf("  %s: %d\n", key, value);
}
ht_iterator_destroy(it);
// Delete an entry
printf("\nDeleting Bob...\n");
if (ht_delete(table, "Bob")) {
printf("  Bob deleted\n");
}
// Check if exists
printf("  Bob exists: %s\n", 
ht_contains(table, "Bob") ? "yes" : "no");
// Final stats
printf("\nFinal statistics:\n");
ht_print_stats(table);
// Cleanup
ht_destroy(table);
return 0;
}

Advanced Features

1. Thread-Safe Hash Table

#include <pthread.h>
typedef struct {
HashTable* table;
pthread_mutex_t* locks;
int num_locks;
} ThreadSafeHashTable;
ThreadSafeHashTable* ts_create(int size, int num_locks) {
ThreadSafeHashTable* ts = malloc(sizeof(ThreadSafeHashTable));
ts->table = ht_create(size);
ts->num_locks = num_locks;
ts->locks = malloc(num_locks * sizeof(pthread_mutex_t));
for (int i = 0; i < num_locks; i++) {
pthread_mutex_init(&ts->locks[i], NULL);
}
return ts;
}
void ts_insert(ThreadSafeHashTable* ts, const char* key, int value) {
unsigned long hash = default_hash(key);
int lock_idx = hash % ts->num_locks;
pthread_mutex_lock(&ts->locks[lock_idx]);
ht_insert(ts->table, key, value);
pthread_mutex_unlock(&ts->locks[lock_idx]);
}
bool ts_get(ThreadSafeHashTable* ts, const char* key, int* value) {
unsigned long hash = default_hash(key);
int lock_idx = hash % ts->num_locks;
pthread_mutex_lock(&ts->locks[lock_idx]);
bool result = ht_get(ts->table, key, value);
pthread_mutex_unlock(&ts->locks[lock_idx]);
return result;
}

2. Custom Key Types

// Generic key support
typedef struct {
void* data;
size_t size;
unsigned long (*hash)(const void*);
int (*equals)(const void*, const void*);
} GenericKey;
unsigned long hash_int_key(const void* key) {
return *(int*)key;
}
unsigned long hash_string_key(const void* key) {
return hash_djb2((const char*)key);
}
int equals_int(const void* a, const void* b) {
return *(int*)a == *(int*)b;
}
int equals_string(const void* a, const void* b) {
return strcmp((const char*)a, (const char*)b) == 0;
}

3. Value with Reference Counting

typedef struct {
void* data;
int refcount;
} RefCountedValue;
RefCountedValue* rc_create(void* data) {
RefCountedValue* rc = malloc(sizeof(RefCountedValue));
rc->data = data;
rc->refcount = 1;
return rc;
}
void rc_retain(RefCountedValue* rc) {
rc->refcount++;
}
void rc_release(RefCountedValue* rc) {
rc->refcount--;
if (rc->refcount == 0) {
free(rc->data);
free(rc);
}
}

Performance Testing

#include <time.h>
#include <stdio.h>
void benchmark_hash_table() {
const int NUM_OPERATIONS = 100000;
const int TABLE_SIZE = 10000;
HashTable* table = ht_create(TABLE_SIZE);
// Measure insertions
clock_t start = clock();
for (int i = 0; i < NUM_OPERATIONS; i++) {
char key[20];
sprintf(key, "key_%d", i);
ht_insert(table, key, i);
}
clock_t end = clock();
double insert_time = (double)(end - start) / CLOCKS_PER_SEC;
// Measure lookups
start = clock();
for (int i = 0; i < NUM_OPERATIONS; i++) {
char key[20];
sprintf(key, "key_%d", i);
int value;
ht_get(table, key, &value);
}
end = clock();
double lookup_time = (double)(end - start) / CLOCKS_PER_SEC;
printf("Performance Results (%d operations):\n", NUM_OPERATIONS);
printf("  Insertions: %.3f seconds (%.0f ops/sec)\n", 
insert_time, NUM_OPERATIONS / insert_time);
printf("  Lookups: %.3f seconds (%.0f ops/sec)\n", 
lookup_time, NUM_OPERATIONS / lookup_time);
ht_destroy(table);
}

Common Pitfalls and Best Practices

  1. Choose good hash functions - Avoid simple modulo with non-prime sizes
  2. Monitor load factor - Resize before performance degrades
  3. Handle collisions properly - Use chaining or open addressing appropriately
  4. Free memory correctly - Avoid leaks, especially with string keys
  5. Consider thread safety - Use mutexes for concurrent access
  6. Test edge cases - Empty table, full table, many collisions
  7. Profile performance - Measure and optimize bottlenecks

Conclusion

Hash tables are essential data structures that provide efficient key-value storage and retrieval. By understanding the underlying concepts—hash functions, collision resolution, and dynamic resizing—you can implement robust hash tables tailored to your specific needs.

The implementations in this guide provide a solid foundation for building production-ready hash tables in C. Whether you need a simple in-memory cache, a symbol table for a compiler, or a database index, these patterns and techniques will serve you well. Remember to always consider your specific requirements for performance, memory usage, and thread safety when designing your hash table implementation.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper