The Need for Speed: A Complete Guide to Performance Optimization in C

Performance optimization is both an art and a science. In C, where you have direct control over memory and hardware, optimization opportunities abound. However, premature optimization can lead to complex, unmaintainable code. This comprehensive guide covers systematic approaches to identifying bottlenecks, applying optimization techniques, and writing high-performance C code that balances speed with maintainability.

The Optimization Mindset

Before diving into techniques, understand these principles:

  1. Measure, don't guess: Profile before optimizing
  2. Optimize the hot path: Focus on frequently executed code
  3. Maintain correctness: Optimizations shouldn't change behavior
  4. Consider the big picture: Algorithm choice matters more than micro-optimizations
  5. Document optimizations: Future maintainers need to understand why
Keep in mind the 80/20 rule: roughly 80% of execution time is spent in 20% of the code, so profile to find that 20% before optimizing anything.

Part 1: Profiling and Measurement

Timing Utilities

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
// Wall-clock time in microseconds since the Unix epoch.
// NOTE: gettimeofday() reports wall time, which can jump (NTP, DST);
// prefer get_time_ns()/CLOCK_MONOTONIC for interval measurement.
long long get_time_us(void) {   // fixed: (void) -- empty () declares an unspecified parameter list in C
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (long long)tv.tv_sec * 1000000 + tv.tv_usec;
}
// Monotonic time in nanoseconds; immune to wall-clock adjustments, so
// suitable for benchmarking intervals.
long long get_time_ns(void) {   // fixed: (void) -- empty () declares an unspecified parameter list in C
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long)ts.tv_sec * 1000000000 + ts.tv_nsec;
}
// Times `code` over `iterations` runs and prints the mean ns/iteration.
// CAVEATS (review notes):
//  - `code` and `iterations` are textually pasted: pass side-effect-free
//    expressions only (`iterations` is evaluated every loop trip).
//  - At -O2 the optimizer may hoist or delete `code` if its result is
//    unused; sink results into a volatile to keep the measurement honest.
//  - Comments cannot be placed inside the macro: line splicing happens
//    before comment removal, so a // before a backslash eats the next line.
#define BENCHMARK(code, iterations) do { \
long long start = get_time_ns(); \
for (long long _i = 0; _i < iterations; _i++) { \
code; \
} \
long long end = get_time_ns(); \
printf("%s: %.3f ns per iteration\n", #code, \
(double)(end - start) / iterations); \
} while(0)
// Signature every benchmarkable routine must match: no args, no result.
typedef void (*bench_func)(void);

// Calls `func` `iterations` times and prints the mean cost per call in ns.
// The function-pointer indirection also stops cross-call inlining, so the
// measurement includes real call overhead.
void run_benchmark(const char *name, bench_func func, int iterations) {
    const long long t0 = get_time_ns();
    for (int iter = 0; iter < iterations; iter++) {
        func();
    }
    const long long t1 = get_time_ns();
    printf("%s: %.3f ns per call\n", name, (double)(t1 - t0) / iterations);
}

Using perf (Linux Performance Profiler)

# Install perf
sudo apt-get install linux-tools-common linux-tools-generic
# Profile a program
perf record ./my_program
perf report
# Profile with call graph
perf record -g ./my_program
perf report -g graph
# Count specific events
perf stat -e cycles,instructions,cache-misses ./my_program

Using Valgrind/Callgrind

# Install valgrind
sudo apt-get install valgrind
# Profile with callgrind
valgrind --tool=callgrind ./my_program
# View results with kcachegrind
kcachegrind callgrind.out.*
# Cache simulation
valgrind --tool=cachegrind ./my_program

Simple Profiling Implementation

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// One accumulator per profiled name, kept in a singly linked list
// (newest entry at the head).
typedef struct ProfilerEntry {
    const char *name;           // heap-owned copy of the profile-point name
    long long total_time;       // accumulated duration, in whatever unit callers pass (ns here)
    long long call_count;       // number of samples recorded
    struct ProfilerEntry *next;
} ProfilerEntry;

static ProfilerEntry *profiler_head = NULL;

// Records one sample of `duration` under `name`. Linear scan is fine for a
// handful of profile points.
// fixed: allocation failures are now handled instead of dereferencing NULL,
// and the POSIX-only strdup() was replaced with portable malloc+memcpy.
void profiler_record(const char *name, long long duration) {
    for (ProfilerEntry *e = profiler_head; e; e = e->next) {
        if (strcmp(e->name, name) == 0) {
            e->total_time += duration;
            e->call_count++;
            return;
        }
    }
    // First sample for this name: push a fresh entry onto the list head.
    ProfilerEntry *e = malloc(sizeof *e);
    if (!e) return;                       // drop the sample rather than crash
    size_t len = strlen(name) + 1;
    char *copy = malloc(len);
    if (!copy) { free(e); return; }
    memcpy(copy, name, len);
    e->name = copy;
    e->total_time = duration;
    e->call_count = 1;
    e->next = profiler_head;
    profiler_head = e;
}
// Dumps one line per profile point: call count, total and mean time.
// The /1000 conversions print microseconds, which is correct only because
// profiler_record() is fed get_time_ns() durations everywhere in this file.
// Division by call_count is safe: entries are created with call_count = 1.
void profiler_print() {
printf("\n=== Profiler Results ===\n");
ProfilerEntry *entry = profiler_head;
while (entry) {
printf("%-30s calls: %-8lld total: %-12lld us avg: %.3f us\n",
entry->name, entry->call_count, entry->total_time / 1000,
(double)entry->total_time / entry->call_count / 1000);
entry = entry->next;
}
}
// Frees every entry and its owned name string.
// fixed: profiler_head was left dangling, so any profiler_record() or
// profiler_print() after cleanup walked freed memory; the head is now
// reset so the profiler can be reused safely.
void profiler_cleanup() {
    ProfilerEntry *entry = profiler_head;
    while (entry) {
        ProfilerEntry *next = entry->next;
        free((void*)entry->name);
        free(entry);
        entry = next;
    }
    profiler_head = NULL;
}
// Scoped timing helpers: PROFILE_START/PROFILE_END bracket a region and
// feed its duration (ns) to profiler_record() under the region's name.
// The ##name token paste gives each region a uniquely named local, so
// differently named regions may nest. Statement context only -- the pair
// must appear in the same block so _start_##name is in scope at END.
#define PROFILE_START(name) \
long long _start_##name = get_time_ns();
#define PROFILE_END(name) \
profiler_record(#name, get_time_ns() - _start_##name);
// Usage example
void expensive_function() {
PROFILE_START(expensive_function)
// ... expensive work ...
PROFILE_END(expensive_function)
}

Part 2: Algorithm Optimization

Choosing the Right Algorithm

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// O(n) - Linear scan (the original comment mislabeled this as O(n²)).
// Returns the index of the first match, or -1 if `target` is absent.
int linear_search(int arr[], int n, int target) {
for (int i = 0; i < n; i++) {
if (arr[i] == target) return i;
}
return -1;
}
// O(log n) lookup in an ascending sorted array; returns the index of
// `target`, or -1 when it is not present.
int binary_search(int arr[], int n, int target) {
    int lo = 0;
    int hi = n - 1;
    while (lo <= hi) {
        // Overflow-safe midpoint (avoids (lo + hi) / 2 wrapping).
        int mid = lo + (hi - lo) / 2;
        if (arr[mid] == target) {
            return mid;
        } else if (arr[mid] < target) {
            lo = mid + 1;
        } else {
            hi = mid - 1;
        }
    }
    return -1;
}
// Benchmarks linear vs. binary search for the worst case (last element)
// on a sorted 10M-element array.
// fixed: malloc result was used unchecked; search results were discarded,
// letting the optimizer delete the calls entirely at -O2 (volatile sinks
// keep the measurement honest).
void compare_search_algorithms() {
    const int SIZE = 10000000;
    int *arr = malloc((size_t)SIZE * sizeof *arr);
    if (!arr) {
        perror("malloc");
        return;
    }
    for (int i = 0; i < SIZE; i++) arr[i] = i;
    printf("Searching for element %d\n", SIZE - 1);
    long long start = get_time_ns();
    volatile int r1 = linear_search(arr, SIZE, SIZE - 1);
    long long linear_time = get_time_ns() - start;
    start = get_time_ns();
    volatile int r2 = binary_search(arr, SIZE, SIZE - 1);
    long long binary_time = get_time_ns() - start;
    (void)r1; (void)r2;
    printf("Linear search: %.3f ms\n", linear_time / 1e6);
    printf("Binary search: %.3f ms\n", binary_time / 1e6);
    printf("Speedup: %.0fx\n", (double)linear_time / binary_time);
    free(arr);
}

Optimizing Sort for Specific Data

// Sorts arr[left..right] (inclusive) ascending.
// Small ranges (<= 32 elements) use insertion sort, which beats quicksort's
// partition overhead there; larger ranges use Lomuto-partition quicksort
// with the last element as pivot, recursing on both sides.
void hybrid_sort(int arr[], int left, int right) {
    const int CUTOFF = 32;
    if (right - left + 1 <= CUTOFF) {
        // Insertion sort: shift larger elements right, drop the key in place.
        for (int i = left + 1; i <= right; i++) {
            int key = arr[i];
            int j = i;
            while (j > left && arr[j - 1] > key) {
                arr[j] = arr[j - 1];
                j--;
            }
            arr[j] = key;
        }
        return;
    }
    // Lomuto partition around arr[right]; `store` ends up at the pivot slot.
    int pivot = arr[right];
    int store = left;
    for (int scan = left; scan < right; scan++) {
        if (arr[scan] <= pivot) {
            int t = arr[store]; arr[store] = arr[scan]; arr[scan] = t;
            store++;
        }
    }
    int t = arr[store]; arr[store] = arr[right]; arr[right] = t;
    hybrid_sort(arr, left, store - 1);
    hybrid_sort(arr, store + 1, right);
}

Part 3: Memory Optimization

Cache-Friendly Data Structures

#include <stdio.h>
#include <stdlib.h>
// Poor: Linked list (pointer chasing, poor locality)
typedef struct ListNode {
int data;
struct ListNode *next;
} ListNode;
// Good: Array-based list (contiguous memory)
typedef struct {
int *data;
size_t size;
size_t capacity;
} ArrayList;
// Array-of-Structs element (AoS): an array of PointPoor interleaves x,y,z,
// so iterating over a single field strides by sizeof(struct).
// (The original label said "Struct of arrays" -- the two labels were swapped.)
typedef struct {
int x;
int y;
int z;
} PointPoor;
// Structure-of-Arrays (SoA): each field lives in its own contiguous array.
// (The original label said "Array of structs" -- the two labels were swapped.)
typedef struct {
int *x;
int *y;
int *z;
} PointGood;  // Better for SIMD over one field; worse when x,y,z of one point are used together
// SoA with an explicit element count -- the preferred layout for vector math
typedef struct {
float *x;
float *y;
float *z;
size_t count;
} Vector3Array;
// C = A * B (n x n, VLA parameters) using the cache-friendly i-k-j loop
// order: the innermost loop walks C[row][*] and B[k][*] sequentially, so
// every cache line fetched is fully used.
void matrix_multiply(int n, double A[n][n], double B[n][n], double C[n][n]) {
    // Zero the destination first; the accumulation below assumes it.
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            C[row][col] = 0;
        }
    }
    for (int row = 0; row < n; row++) {
        for (int k = 0; k < n; k++) {
            const double a_rk = A[row][k];  // hoisted: invariant in the inner loop
            for (int col = 0; col < n; col++) {
                C[row][col] += a_rk * B[k][col];
            }
        }
    }
}
// Anti-example kept for contrast: the i-j-k order makes the innermost
// loop stride down a COLUMN of B (B[k][j]), touching a new cache line on
// every iteration, and re-reads/re-writes C[i][j] each step.
void matrix_multiply_bad(int n, double A[n][n], double B[n][n], double C[n][n]) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
C[i][j] = 0;
for (int k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];  // Poor locality
}
}
}
}

Memory Alignment

#include <stddef.h>
#include <stdlib.h>
// Packed struct: no padding, so `i` sits at offset 1 and is misaligned.
// Misaligned loads are slower on x86 and can fault on stricter ISAs.
typedef struct __attribute__((packed)) {
char c;      // 1 byte
int i;       // 4 bytes, at offset 1 -- misaligned
short s;     // 2 bytes
} UnalignedStruct;
// Members ordered largest-first so natural alignment needs no interior padding.
typedef struct {
int i;       // 4 bytes
short s;     // 2 bytes
char c;      // 1 byte
char pad;    // explicit trailing pad (the compiler would insert one anyway)
} AlignedStruct;  // Size: 8 bytes
// Force 64-byte alignment -- one cache line on most x86 CPUs (GCC/Clang attribute)
typedef struct {
int i;
char c;
} __attribute__((aligned(64))) CacheAligned;  // Aligned to cache line
// Returns memory aligned to `alignment` (must be a nonzero power of two),
// or NULL on failure. The raw malloc() pointer is stashed one void* before
// the aligned address so aligned_free() can recover it.
// NOTE: requires <stdint.h> for uintptr_t (missing from the original snippet).
void* aligned_malloc(size_t size, size_t alignment) {
    // fixed: reject alignments the mask arithmetic below cannot handle
    if (alignment == 0 || (alignment & (alignment - 1)) != 0) return NULL;
    void *raw = malloc(size + alignment - 1 + sizeof(void*));
    if (!raw) return NULL;
    uintptr_t base = (uintptr_t)raw + sizeof(void*);   // leave room for the stash
    uintptr_t aligned = (base + alignment - 1) & ~(uintptr_t)(alignment - 1);
    ((void**)aligned)[-1] = raw;                       // remember what to free
    return (void*)aligned;
}

// Releases memory obtained from aligned_malloc(); NULL is a no-op.
void aligned_free(void *ptr) {
    if (ptr) {
        free(((void**)ptr)[-1]);
    }
}

Prefetching

#include <x86intrin.h>  // GCC/Clang x86 intrinsics (not needed for __builtin_prefetch itself)
// Software prefetch 64 ELEMENTS (256 bytes) ahead of the element being
// processed. Arguments: (address, 0 = read, 3 = keep in all cache levels).
// Prefetching past the end of `data` is harmless: prefetch never faults.
// NOTE(review): process() is not defined in this article; the prefetch
// distance (64) should be tuned to how long process() takes per element.
void prefetch_example(int *data, size_t n) {
for (size_t i = 0; i < n; i++) {
// Prefetch next element
__builtin_prefetch(&data[i + 64], 0, 3);
// Process current element
process(data[i]);
}
}
// Sums n ints into *result while prefetching 64 elements ahead of the
// read position (read hint, high temporal locality). Prefetching past
// the array end is safe -- prefetch never faults.
void sum_with_prefetch(const int *arr, size_t n, int *result) {
    *result = 0;
    for (size_t i = 0; i < n; i++) {
        __builtin_prefetch(arr + i + 64, 0, 3);
        *result += arr[i];
    }
}

Part 4: Compiler Optimizations

Optimization Flags

# GCC optimization levels
-O0     # No optimization (default)
-O1     # Basic optimizations
-O2     # More aggressive (recommended for most code)
-O3     # Very aggressive (may increase code size)
-Ofast  # O3 + fast math (may break standards compliance)
-Os     # Optimize for size
# Architecture-specific
-march=native    # Optimize for current CPU
-mtune=native    # Tune for current CPU
# Link-time optimization
-flto            # Link-time optimization
-flto=auto       # Automatic parallel LTO
# Profile-guided optimization
-fprofile-generate  # Generate profile data
-fprofile-use       # Use profile data

Function Attributes

// All attributes below are GCC/Clang extensions, not standard C.
// Force inlining even at -O0 (the plain `inline` keyword is only a hint)
static inline __attribute__((always_inline)) int fast_add(int a, int b) {
return a + b;
}
// Prevent inlining (keeps the symbol visible to profilers)
__attribute__((noinline)) void cold_function() {
// Rarely called code
}
// Hot hint: optimize harder and group with other hot code for locality
__attribute__((hot)) void frequently_called() {
// Frequently executed code
}
// Cold hint: optimize for size, move out of the hot code path
__attribute__((cold)) void error_handler() {
// Error handling code
}
// pure: no side effects; result depends only on arguments and on reads of
// global memory, so duplicate calls with unchanged state can be merged
__attribute__((pure)) int square(int x) {
return x * x;
}
// const: stricter than pure -- may not read global memory at all
// (the "no pointer arguments" wording is a simplification: it must not
// dereference pointers or touch globals)
__attribute__((const)) int cube(int x) {
return x * x * x;
}
// Alias: both symbols resolve to the same code; the target must be
// defined in this same translation unit
void original_func() { }
void alias_func() __attribute__((alias("original_func")));
// Deprecated: callers get a compile-time warning with this message
void old_func() __attribute__((deprecated("Use new_func instead")));

Loop Optimizations

// Tell compiler loops don't have dependencies
#pragma GCC ivdep
for (int i = 0; i < n; i++) {
a[i] = b[i] + c[i];
}
// Loop unrolling
#pragma GCC unroll 4
for (int i = 0; i < n; i++) {
process(a[i]);
}
// Element-wise c[i] = a[i] + b[i]. The `restrict` qualifiers promise the
// three arrays do not overlap, letting the compiler vectorize without
// emitting runtime aliasing checks.
void vector_add(int *restrict a, int *restrict b, int *restrict c, int n) {
    int i = 0;
    while (i < n) {
        c[i] = a[i] + b[i];
        i++;
    }
}

Part 5: Micro-Optimizations

Branch Prediction

#include <stdbool.h>
#include <stdlib.h>
// NOTE(review): fprintf() below also needs <stdio.h>, missing from this snippet.
// __builtin_expect tells GCC/Clang which way a branch usually goes, so the
// expected path is laid out as the straight-line fall-through.
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Error path marked unlikely: the common error_code == 0 case stays hot.
void handle_error(int error_code) {
if (unlikely(error_code != 0)) {
// Error handling - rarely taken
fprintf(stderr, "Error: %d\n", error_code);
}
}
// Absolute value with an explicit branch (compilers usually turn this
// into a conditional move anyway).
int abs_branch(int x) {
    if (x < 0) return -x;
    return x;
}
// Branch-free absolute value: `sign` is all-ones when x is negative and
// zero otherwise, so (x + sign) ^ sign negates exactly the negative case.
// NOTE: relies on arithmetic right shift of negative ints (implementation-
// defined but universal on mainstream compilers); INT_MIN has no
// representable absolute value, same as the branching version.
int abs_branchless(int x) {
    int sign = x >> (sizeof(int) * 8 - 1);
    return (x + sign) ^ sign;
}
// Minimum via a plain comparison (branch or cmov, compiler's choice).
int min_branch(int a, int b) {
    if (a < b) return a;
    return b;
}
// Branch-free minimum: -(a < b) is all-ones when a < b and zero otherwise,
// so the XOR mix selects a in the first case and b in the second.
int min_branchless(int a, int b) {
    int take_a = -(a < b);              // 0x0 or 0xFFFFFFFF
    return b ^ ((a ^ b) & take_a);
}
// A bool already promotes to 0 or 1 when converted to int; no conditional
// is needed and none is generated. (Use -(int)b instead if you want the
// 0 / -1 mask form.)
int bool_to_int(bool b) {
    return (int)b;
}

Bit Twiddling

// True iff exactly one bit of x is set. x & (x - 1) clears the lowest set
// bit, so the result is zero only for powers of two; the explicit zero
// check rejects x == 0 (which would otherwise pass the mask test).
bool is_power_of_two(unsigned int x) {
    if (x == 0) return false;
    return (x & (x - 1)) == 0;
}
// Rounds x up to the nearest power of two (x itself if already one).
// The shift cascade smears the highest set bit of x-1 into every lower
// position, producing a run of ones; +1 then carries into a single bit.
// Caveats preserved from the original: x == 0 yields 0, and any x above
// 2^31 wraps around to 0 in 32-bit unsigned arithmetic.
unsigned int next_power_of_two(unsigned int x) {
    x -= 1;
    for (int shift = 1; shift <= 16; shift <<= 1) {
        x |= x >> shift;                // shift = 1, 2, 4, 8, 16
    }
    return x + 1;
}
// SWAR population count: fold bits into 2-bit sums, then 4-bit, 8-bit,
// 16-bit, and finally a single 32-bit total. Constant time, no table.
int popcount(unsigned int x) {
    x -= (x >> 1) & 0x55555555;                      // 2-bit partial sums
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  // 4-bit partial sums
    x = (x + (x >> 4)) & 0x0F0F0F0F;                 // 8-bit partial sums
    x += x >> 8;                                     // 16-bit partial sums
    x += x >> 16;                                    // 32-bit total
    return (int)(x & 0x3F);                          // count fits in 6 bits (max 32)
}
// Index (counting from bit 0) of the least significant set bit.
// Precondition: x != 0 -- __builtin_ctz(0) is undefined behavior.
int lsb_index(unsigned int x) {
return __builtin_ctz(x);  // Compiler builtin
}
// Index of the most significant set bit; likewise undefined for x == 0.
int msb_index(unsigned int x) {
return 31 - __builtin_clz(x);
}
// Reverses the 32 bits of x by swapping progressively larger groups:
// adjacent bits, then 2-bit pairs, nibbles, bytes, and 16-bit halves.
// Five constant-time steps instead of a 32-iteration loop.
unsigned int reverse_bits(unsigned int x) {
    x = ((x & 0x55555555) << 1) | ((x >> 1) & 0x55555555);
    x = ((x & 0x33333333) << 2) | ((x >> 2) & 0x33333333);
    x = ((x & 0x0F0F0F0F) << 4) | ((x >> 4) & 0x0F0F0F0F);
    x = ((x & 0x00FF00FF) << 8) | ((x >> 8) & 0x00FF00FF);
    return (x << 16) | (x >> 16);
}

String Optimization

#include <string.h>
// NOTE(review): bool below also needs <stdbool.h> (included earlier in the article).
// Hoist strlen() out of the loop: C strings do not store their length, so
// strlen is O(n) and must not be recomputed per iteration.
void process_string(const char *str) {
size_t len = strlen(str);  // Compute once
for (size_t i = 0; i < len; i++) {
// Process each character
}
}
// When both lengths are already known, memcmp can compare word-at-a-time.
// CAVEAT: unlike strncmp, memcmp does NOT stop at '\0' -- `len` must be
// valid for both buffers.
bool strings_equal_len(const char *a, const char *b, size_t len) {
return memcmp(a, b, len) == 0;  // Faster than strncmp
}
// memcpy for known sizes; the regions must not overlap (else use memmove)
void copy_buffer(const char *src, char *dst, size_t n) {
memcpy(dst, src, n);  // Use memcpy when possible
}
// Anti-example: strlen() in the loop condition re-scans the string each
// iteration, turning a linear walk quadratic
void bad_string_loop(const char *str) {
for (size_t i = 0; i < strlen(str); i++) {  // O(n²)!
putchar(str[i]);
}
}
// Idiomatic O(n): stop at the NUL terminator directly
void good_string_loop(const char *str) {
for (size_t i = 0; str[i]; i++) {  // O(n)
putchar(str[i]);
}
}

Part 6: SIMD (Single Instruction Multiple Data)

Using Compiler Auto-vectorization

// Element-wise float add, c[i] = a[i] + b[i]. The loop is simple enough
// (unit stride, no dependencies across iterations) for the compiler's
// auto-vectorizer at -O3 -march=native.
void add_arrays(float *a, float *b, float *c, int n) {
    for (int idx = 0; idx < n; ++idx) {
        c[idx] = a[idx] + b[idx];
    }
}
// `omp simd` requests vectorization of this loop without spawning threads.
// NOTE(review): the pragma only takes effect with -fopenmp or
// -fopenmp-simd; otherwise it is ignored (harmlessly, the loop stays scalar).
void add_arrays_aligned(float *a, float *b, float *c, int n) {
#pragma omp simd
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i];
}
}

Intel Intrinsics (x86/SSE/AVX)

#include <immintrin.h>
// SSE: adds 4 floats per iteration using unaligned loads/stores (baseline
// on x86-64, no extra compiler flags needed).
// fixed: the original assumed n was a multiple of 4 and read/wrote past
// the ends of the arrays otherwise; a scalar loop now handles the tail.
void add_arrays_sse(float *a, float *b, float *c, int n) {
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);
        __m128 vb = _mm_loadu_ps(&b[i]);
        _mm_storeu_ps(&c[i], _mm_add_ps(va, vb));
    }
    for (; i < n; i++) {          // scalar tail: the n % 4 leftovers
        c[i] = a[i] + b[i];
    }
}
// AVX: adds 8 floats per iteration (requires compiling with -mavx or
// -march=native on an AVX-capable CPU).
// fixed: scalar tail so n need not be a multiple of 8 -- the original
// read and wrote past the ends of the arrays otherwise.
void add_arrays_avx(float *a, float *b, float *c, int n) {
    int i = 0;
    for (; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        _mm256_storeu_ps(&c[i], _mm256_add_ps(va, vb));
    }
    for (; i < n; i++) {          // scalar tail: the n % 8 leftovers
        c[i] = a[i] + b[i];
    }
}
// Dot product with AVX: 8-wide multiply-accumulate, then a horizontal sum
// of the vector lanes, then the scalar tail.
// fixed: the original assumed n was a multiple of 8 and read past both
// arrays otherwise. NOTE: float addition order differs from a plain scalar
// loop, so results can differ in the last bits -- inherent to SIMD reductions.
float dot_product_avx(float *a, float *b, int n) {
    __m256 acc = _mm256_setzero_ps();
    int i = 0;
    for (; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(va, vb));
    }
    float lanes[8];
    _mm256_storeu_ps(lanes, acc);
    float sum = lanes[0] + lanes[1] + lanes[2] + lanes[3] +
                lanes[4] + lanes[5] + lanes[6] + lanes[7];
    for (; i < n; i++) {          // scalar tail: the n % 8 leftovers
        sum += a[i] * b[i];
    }
    return sum;
}

Part 7: Concurrency Optimization

Parallel Processing with OpenMP

#include <omp.h>
// Element-wise add. With -fopenmp the iterations are divided among a team
// of threads; without it the pragma is ignored and the loop runs serially
// -- the result is identical either way.
void parallel_add(int *a, int *b, int *c, int n) {
    #pragma omp parallel for
    for (int idx = 0; idx < n; idx++) {
        c[idx] = a[idx] + b[idx];
    }
}
// Sum with an OpenMP reduction: each thread accumulates a private partial
// sum, and the partials are combined with + at the join. Serial semantics
// are identical when OpenMP is disabled.
int parallel_sum(int *arr, int n) {
    int total = 0;
    #pragma omp parallel for reduction(+:total)
    for (int i = 0; i < n; i++) {
        total += arr[i];
    }
    return total;
}
// Task parallelism: one thread (single) creates two independent tasks,
// any thread in the team may execute them concurrently, and taskwait
// blocks until both complete.
// NOTE(review): task1()/task2() are not defined in this article.
void parallel_tasks() {
#pragma omp parallel
{
#pragma omp single
{
#pragma omp task
{ task1(); }
#pragma omp task
{ task2(); }
#pragma omp taskwait
}
}
}

Thread Affinity

#define _GNU_SOURCE   // must precede all includes to expose pthread_setaffinity_np
#include <sched.h>
#include <pthread.h>
// Pins the calling thread to one core: steadier benchmark timings and
// warm per-core caches. Linux-specific (the _np suffix = non-portable).
// NOTE(review): the return value is ignored; the call fails (e.g. EINVAL)
// when core_id is >= the number of online cores.
void pin_thread_to_core(int core_id) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core_id, &cpuset);
pthread_t current_thread = pthread_self();
pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
}
// Core the thread is currently running on (Linux-specific); the value can
// change at any moment unless affinity has been pinned.
int get_current_cpu() {
return sched_getcpu();
}

Part 8: I/O Optimization

Buffered I/O

#include <stdio.h>
// Anti-example: one libc call per character. (stdio still buffers stdout,
// so it is one syscall per char only for unbuffered streams -- but the
// per-character call overhead alone makes this slow.)
void bad_io_write(const char *str) {
while (*str) {
putchar(*str++);  // one library call per character
}
}
// Good: hand the whole string to stdio's buffer in a single call
void good_io_write(const char *str) {
fputs(str, stdout);  // one call; syscalls happen when the buffer flushes
}
// Shared staging buffer for buffered_write()/buffered_flush().
// fixed: the original declared a SEPARATE static buffer inside each
// function, so buffered_flush() could never flush the bytes staged by
// buffered_write() -- data under 8 KiB was silently lost. One file-scope
// buffer restores the intended behavior.
// NOTE: a single global buffer means this is per-process (not per-fd) and
// not thread-safe, matching the original design. Requires <unistd.h> for
// write(), which the original snippet omitted.
static char io_buf[8192];
static size_t io_buf_pos = 0;

// Stages `len` bytes for `fd`, issuing a write() whenever a full 8 KiB
// chunk accumulates. write() results are best-effort, as in the original.
void buffered_write(int fd, const char *data, size_t len) {
    for (size_t i = 0; i < len; i++) {
        io_buf[io_buf_pos++] = data[i];
        if (io_buf_pos == sizeof io_buf) {
            write(fd, io_buf, io_buf_pos);
            io_buf_pos = 0;
        }
    }
}

// Writes out any bytes still staged in the shared buffer.
void buffered_flush(int fd) {
    if (io_buf_pos > 0) {
        write(fd, io_buf, io_buf_pos);
        io_buf_pos = 0;
    }
}

Memory-Mapped Files

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
// Maps an entire file read-only and processes it in place: no read()
// copies, pages are faulted in on demand by the kernel.
// NOTE(review): process_data() is not defined in this article.
// NOTE(review): mmap of a zero-length file fails with EINVAL; the
// MAP_FAILED check below covers that case by returning early.
void process_large_file(const char *filename) {
int fd = open(filename, O_RDONLY);
if (fd == -1) return;
off_t size = lseek(fd, 0, SEEK_END);
if (size == -1) {
close(fd);
return;
}
char *data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED) {
close(fd);
return;
}
// Process data directly in memory
process_data(data, size);
munmap(data, size);
close(fd);
}

Part 9: Complete Optimization Example

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <x86intrin.h>
// Blocked matrix multiply: gathers one column of B at a time into a
// contiguous scratch vector so the dot-product loop is unit-stride over
// both operands. The `omp simd` pragma needs -fopenmp(-simd); otherwise
// it is harmlessly ignored.
// fixed: the scratch allocation is now checked before use (the original
// dereferenced a potentially NULL pointer on OOM).
void matrix_multiply_optimized(int n, double A[n][n], double B[n][n], double C[n][n]) {
    const int BLOCK_SIZE = 64;
    double *b_col = malloc((size_t)n * sizeof *b_col);
    if (!b_col) return;
    // Zero the destination; accumulation below assumes it.
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            C[i][j] = 0.0;
        }
    }
    for (int jj = 0; jj < n; jj += BLOCK_SIZE) {
        int j_end = (jj + BLOCK_SIZE < n) ? jj + BLOCK_SIZE : n;
        for (int j = jj; j < j_end; j++) {
            // Column j of B, made contiguous for the inner loop.
            for (int k = 0; k < n; k++) {
                b_col[k] = B[k][j];
            }
            for (int i = 0; i < n; i++) {
                double sum = 0.0;
                #pragma omp simd reduction(+:sum)
                for (int k = 0; k < n; k++) {
                    sum += A[i][k] * b_col[k];
                }
                C[i][j] = sum;
            }
        }
    }
    free(b_col);
}
// 2D convolution over an 8-bit grayscale image. Out-of-bounds taps are
// skipped (treated as zero) rather than clamped or mirrored. The common
// 3x3 kernel gets a special-cased path with fixed-bound loops the
// compiler can fully unroll; other kernel sizes take the generic path.
// NOTE(review): the result is rounded by adding 0.5f and truncating in
// the unsigned char cast -- a negative `sum` (possible with signed
// kernels, e.g. edge detectors) wraps instead of clamping to 0. Confirm
// with callers whether clamping to [0,255] is intended.
void convolution_optimized(const unsigned char *input, unsigned char *output,
int width, int height, const float *kernel, int ksize) {
const int half = ksize / 2;
const int stride = width;
// Prefetch hints
__builtin_prefetch(input, 0, 3);
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
float sum = 0.0f;
int k = 0;  // running index into the row-major kernel
// Unrolled inner loops for 3x3 kernel
if (ksize == 3) {
for (int ky = -1; ky <= 1; ky++) {
int yi = y + ky;
if (yi >= 0 && yi < height) {
const unsigned char *row = input + yi * stride;
for (int kx = -1; kx <= 1; kx++) {
int xi = x + kx;
if (xi >= 0 && xi < width) {
sum += row[xi] * kernel[k];
}
k++;
}
} else {
k += 3;  // Skip kernel values for out-of-bounds row, keep k in sync
}
}
} else {
// Generic convolution
for (int ky = -half; ky <= half; ky++) {
int yi = y + ky;
if (yi >= 0 && yi < height) {
const unsigned char *row = input + yi * stride;
for (int kx = -half; kx <= half; kx++) {
int xi = x + kx;
if (xi >= 0 && xi < width) {
sum += row[xi] * kernel[k];
}
k++;
}
} else {
k += ksize;  // Skip kernel values, keep k in sync
}
}
}
output[y * stride + x] = (unsigned char)(sum + 0.5f);
}
// Prefetch next row
__builtin_prefetch(input + (y + 1) * stride, 0, 3);
}
}

Part 10: Optimization Checklist

Before Optimization

  • [ ] Profile to identify bottlenecks
  • [ ] Establish baseline metrics
  • [ ] Define performance goals
  • [ ] Consider algorithm improvements first

During Implementation

  • [ ] Use appropriate data structures
  • [ ] Optimize memory access patterns
  • [ ] Enable compiler optimizations
  • [ ] Use restrict for pointers
  • [ ] Prefer stack allocation over heap
  • [ ] Use bit operations when appropriate
  • [ ] Minimize branches in hot paths

After Implementation

  • [ ] Profile again to verify improvements
  • [ ] Check for correctness
  • [ ] Document optimizations
  • [ ] Consider maintainability trade-offs

Performance Anti-Patterns to Avoid

// 1. Repeated function calls in loops
for (int i = 0; i < strlen(str); i++) { }  // BAD
size_t len = strlen(str);                   // GOOD
for (size_t i = 0; i < len; i++) { }
// 2. Unnecessary memory allocation
while (condition) {
char *buf = malloc(1024);  // BAD
// ... use buf
free(buf);
}
char *buf = malloc(1024);      // GOOD
while (condition) {
// ... use buf
}
free(buf);
// 3. Virtual function calls in hot loops (C++)
// 4. Unaligned memory access
// 5. Cache thrashing
// 6. False sharing in parallel code
// 7. Branch mispredictions
// 8. Poor I/O buffering

Conclusion

Performance optimization in C requires a systematic approach:

  1. Measure first: Profile to find real bottlenecks
  2. Choose the right algorithm: Better algorithms beat micro-optimizations
  3. Optimize memory: Leverage cache hierarchies and locality
  4. Use compiler help: Enable optimizations and use attributes
  5. Consider architecture: SIMD, prefetching, and branch prediction
  6. Parallelize wisely: OpenMP and threading for compute-bound tasks
  7. Profile again: Verify improvements and check for regressions

Remember the golden rule: Make it work, then make it fast. Optimize only when necessary, and always maintain correctness. The most optimized code is worthless if it produces wrong results or is impossible to maintain.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper