Performance Optimization Mastery: A Complete Guide to Code Profiling in C

Code profiling is the systematic analysis of program performance to identify bottlenecks, optimize resource usage, and improve overall efficiency. For C programmers, profiling is essential because C is often chosen for performance-critical applications. This comprehensive guide covers everything from basic timing measurements to advanced profiling tools and techniques.

Why Profile C Code?

C is used in systems where performance matters: databases, game engines, embedded systems, and high-frequency trading platforms. Profiling helps answer critical questions:

  • Which functions consume the most CPU time?
  • Where are the memory bottlenecks?
  • What causes cache misses?
  • Why is I/O slowing down the application?
  • Where can optimizations have the most impact?

Types of Profiling

  1. Instrumentation Profiling: Inserting code to measure execution (e.g., gprof)
  2. Sampling Profiling: Interrupting program at intervals to record state (e.g., perf)
  3. Tracing: Recording detailed event sequences
  4. Memory Profiling: Tracking allocation/deallocation patterns
  5. Cache Profiling: Analyzing memory access patterns

Manual Timing with Standard Libraries

1. Basic Time Measurement

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void slow_function() {
// Simulate slow computation
volatile long long sum = 0;
for (long long i = 0; i < 100000000; i++) {
sum += i;
}
}
void fast_function() {
volatile long long sum = 0;
for (long long i = 0; i < 10000000; i++) {
sum += i;
}
}
int main() {
clock_t start, end;
double cpu_time_used;
// Measure slow function
start = clock();
slow_function();
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("slow_function took %f seconds\n", cpu_time_used);
// Measure fast function
start = clock();
fast_function();
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("fast_function took %f seconds\n", cpu_time_used);
return 0;
}

2. High-Resolution Timing with clock_gettime()

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
// Convert timespec to nanoseconds
long long timespec_to_ns(struct timespec *ts) {
return (long long)ts->tv_sec * 1000000000 + ts->tv_nsec;
}
// Measure execution time with nanosecond precision
void measure_function(void (*func)(void)) {
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
func();
clock_gettime(CLOCK_MONOTONIC, &end);
long long elapsed_ns = timespec_to_ns(&end) - timespec_to_ns(&start);
printf("Execution time: %lld ns (%.3f ms)\n", 
elapsed_ns, elapsed_ns / 1000000.0);
}
void test_function() {
// Simulate work
volatile long long sum = 0;
for (long long i = 0; i < 100000000; i++) {
sum += i;
}
}
int main() {
printf("Measuring function performance...\n");
measure_function(test_function);
return 0;
}

3. Timer Utility Library

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
typedef struct {
char name[64];
long long total_time;
long long count;
long long min_time;
long long max_time;
struct timespec start_time;
int active;
} Profiler;
Profiler profilers[256];
int profiler_count = 0;
void profiler_start(const char *name) {
for (int i = 0; i < profiler_count; i++) {
if (strcmp(profilers[i].name, name) == 0 && !profilers[i].active) {
profilers[i].active = 1;
clock_gettime(CLOCK_MONOTONIC, &profilers[i].start_time);
return;
}
}
// Create new profiler
Profiler *p = &profilers[profiler_count++];
strncpy(p->name, name, sizeof(p->name) - 1);
p->total_time = 0;
p->count = 0;
p->min_time = 1LL << 62;
p->max_time = 0;
p->active = 1;
clock_gettime(CLOCK_MONOTONIC, &p->start_time);
}
void profiler_stop(const char *name) {
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
for (int i = 0; i < profiler_count; i++) {
if (strcmp(profilers[i].name, name) == 0 && profilers[i].active) {
long long elapsed = (end_time.tv_sec - profilers[i].start_time.tv_sec) * 1000000000LL +
(end_time.tv_nsec - profilers[i].start_time.tv_nsec);
profilers[i].total_time += elapsed;
profilers[i].count++;
if (elapsed < profilers[i].min_time) profilers[i].min_time = elapsed;
if (elapsed > profilers[i].max_time) profilers[i].max_time = elapsed;
profilers[i].active = 0;
return;
}
}
fprintf(stderr, "Profiler '%s' not found\n", name);
}
void profiler_report(void) {
printf("\n=== Profiler Report ===\n");
printf("%-20s %12s %12s %12s %12s %12s\n", 
"Function", "Count", "Total (ms)", "Avg (us)", "Min (us)", "Max (us)");
printf("%-20s %12s %12s %12s %12s %12s\n",
"--------", "-----", "----------", "-------", "-------", "-------");
for (int i = 0; i < profiler_count; i++) {
Profiler *p = &profilers[i];
double total_ms = p->total_time / 1000000.0;
double avg_us = (p->total_time / p->count) / 1000.0;
double min_us = p->min_time / 1000.0;
double max_us = p->max_time / 1000.0;
printf("%-20s %12lld %12.3f %12.2f %12.2f %12.2f\n",
p->name, p->count, total_ms, avg_us, min_us, max_us);
}
}
// Example usage
void expensive_function() {
profiler_start("expensive_function");
volatile long long sum = 0;
for (long long i = 0; i < 50000000; i++) {
sum += i;
}
profiler_stop("expensive_function");
}
void medium_function() {
profiler_start("medium_function");
volatile long long sum = 0;
for (long long i = 0; i < 10000000; i++) {
sum += i;
}
profiler_stop("medium_function");
}
int main() {
for (int i = 0; i < 5; i++) {
expensive_function();
medium_function();
}
profiler_report();
return 0;
}

Using GNU Profiler (gprof)

1. Compiling for gprof

# Compile with profiling flags
gcc -pg -g -o program program.c
# Run program to generate gmon.out
./program
# Analyze with gprof
gprof program gmon.out > analysis.txt
gprof program gmon.out | less

2. Example Program for gprof

#include <stdio.h>
#include <stdlib.h>
// Function that does heavy computation
void compute_primes(int limit) {
int *is_prime = malloc((limit + 1) * sizeof(int));
for (int i = 2; i <= limit; i++) {
is_prime[i] = 1;
}
for (int i = 2; i * i <= limit; i++) {
if (is_prime[i]) {
for (int j = i * i; j <= limit; j += i) {
is_prime[j] = 0;
}
}
}
int count = 0;
for (int i = 2; i <= limit; i++) {
if (is_prime[i]) {
count++;
}
}
printf("Found %d primes up to %d\n", count, limit);
free(is_prime);
}
// Function that does array processing
void process_array(int size) {
int *arr = malloc(size * sizeof(int));
for (int i = 0; i < size; i++) {
arr[i] = rand() % 1000;
}
// Bubble sort (inefficient on purpose)
for (int i = 0; i < size - 1; i++) {
for (int j = 0; j < size - i - 1; j++) {
if (arr[j] > arr[j + 1]) {
int temp = arr[j];
arr[j] = arr[j + 1];
arr[j + 1] = temp;
}
}
}
free(arr);
}
int main() {
printf("Starting performance test...\n");
for (int i = 0; i < 5; i++) {
compute_primes(100000);
}
for (int i = 0; i < 10; i++) {
process_array(5000);
}
return 0;
}

3. Understanding gprof Output

Flat profile:
Each sample counts as 0.01 seconds.
%   cumulative   self              self     total
time   seconds   seconds    calls  ms/call  ms/call  name
65.22      0.45     0.45       50     9.00    18.00  compute_primes
30.43      0.66     0.21       10    21.00    21.00  process_array
4.35      0.69     0.03                             main
Call graph (profile):
granularity: each sample hit covers 2 byte(s) for 1.45% of 0.69 seconds
index % time    self  children    called     name
0.45    0.18       5/5           main [1]
[2]     91.3    0.45    0.18       5         compute_primes [2]
0.18    0.00       5/5           prime_operations [3]

Using perf (Linux Performance Profiler)

1. Basic perf Commands

# Record performance data
perf record -g ./program
# View report
perf report
# Record with call graph
perf record -g --call-graph dwarf ./program
# Record specific events
perf record -e cycles -e instructions -e cache-misses ./program
# Generate flame graph data
perf script > out.perf

2. Example with perf

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Memory-intensive operation
void matrix_multiply(int n) {
double **a = malloc(n * sizeof(double*));
double **b = malloc(n * sizeof(double*));
double **c = malloc(n * sizeof(double*));
for (int i = 0; i < n; i++) {
a[i] = malloc(n * sizeof(double));
b[i] = malloc(n * sizeof(double));
c[i] = malloc(n * sizeof(double));
for (int j = 0; j < n; j++) {
a[i][j] = (double)rand() / RAND_MAX;
b[i][j] = (double)rand() / RAND_MAX;
c[i][j] = 0;
}
}
// Naive matrix multiplication
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++) {
c[i][j] += a[i][k] * b[k][j];
}
}
}
// Cleanup
for (int i = 0; i < n; i++) {
free(a[i]);
free(b[i]);
free(c[i]);
}
free(a);
free(b);
free(c);
}
// CPU-intensive operation
void fibonacci(int n) {
long long fib[3] = {0, 1, 0};
for (int i = 2; i <= n; i++) {
fib[2] = fib[0] + fib[1];
fib[0] = fib[1];
fib[1] = fib[2];
}
}
int main() {
printf("Running matrix multiplication...\n");
matrix_multiply(500);
printf("Running Fibonacci...\n");
for (int i = 0; i < 10000000; i++) {
fibonacci(100);
}
return 0;
}

Using Valgrind/Callgrind

1. Basic Callgrind Usage

# Run with callgrind
valgrind --tool=callgrind ./program
# View with kcachegrind
kcachegrind callgrind.out.*
# With detailed instruction counts
valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes ./program
# Run only specific functions
valgrind --tool=callgrind --toggle-collect=function_name ./program

2. Example Program for Callgrind

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct Node {
int data;
struct Node *next;
} Node;
// Linked list operations
Node* list_create(int size) {
Node *head = NULL;
Node *tail = NULL;
for (int i = 0; i < size; i++) {
Node *node = malloc(sizeof(Node));
node->data = i;
node->next = NULL;
if (!head) {
head = node;
tail = node;
} else {
tail->next = node;
tail = node;
}
}
return head;
}
void list_free(Node *head) {
while (head) {
Node *next = head->next;
free(head);
head = next;
}
}
int list_search(Node *head, int value) {
int comparisons = 0;
while (head) {
comparisons++;
if (head->data == value) {
return comparisons;
}
head = head->next;
}
return -comparisons;
}
// Binary search on array
int binary_search(int *arr, int size, int target) {
int left = 0, right = size - 1;
int comparisons = 0;
while (left <= right) {
comparisons++;
int mid = left + (right - left) / 2;
if (arr[mid] == target) {
return comparisons;
} else if (arr[mid] < target) {
left = mid + 1;
} else {
right = mid - 1;
}
}
return -comparisons;
}
int main() {
const int SIZE = 10000;
// Create array for binary search
int *arr = malloc(SIZE * sizeof(int));
for (int i = 0; i < SIZE; i++) {
arr[i] = i;
}
// Create linked list
Node *list = list_create(SIZE);
// Perform searches
int searches = 1000;
int found = 0;
for (int i = 0; i < searches; i++) {
int target = rand() % SIZE;
int binary_comps = binary_search(arr, SIZE, target);
if (binary_comps > 0) found++;
int list_comps = list_search(list, target);
if (list_comps > 0) found++;
}
printf("Searches completed, found %d matches\n", found);
free(arr);
list_free(list);
return 0;
}

Memory Profiling with Valgrind/Massif

1. Using Massif for Heap Profiling

# Heap profiling
valgrind --tool=massif ./program
# View results
ms_print massif.out.*
# With detailed allocation information
valgrind --tool=massif --detailed-freq=1 ./program

2. Memory Profiling Example

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct {
int *data;
size_t size;
} Vector;
Vector* vector_create(size_t size) {
Vector *v = malloc(sizeof(Vector));
v->data = malloc(size * sizeof(int));
v->size = size;
for (size_t i = 0; i < size; i++) {
v->data[i] = 0;
}
return v;
}
void vector_resize(Vector *v, size_t new_size) {
v->data = realloc(v->data, new_size * sizeof(int));
if (new_size > v->size) {
for (size_t i = v->size; i < new_size; i++) {
v->data[i] = 0;
}
}
v->size = new_size;
}
void vector_free(Vector *v) {
free(v->data);
free(v);
}
// Memory leak demonstration
void memory_leak_example() {
char *leak = malloc(1024);
strcpy(leak, "This memory will leak");
// No free!
}
int main() {
printf("Creating vectors...\n");
// Create and resize vectors
Vector *v1 = vector_create(1000);
Vector *v2 = vector_create(2000);
vector_resize(v1, 5000);
vector_resize(v2, 10000);
// Some operations
for (size_t i = 0; i < v1->size; i++) {
v1->data[i] = i;
}
// Clean up
vector_free(v1);
vector_free(v2);
// This will show as memory leak
memory_leak_example();
return 0;
}

Cache Profiling with perf

#include <stdio.h>
#include <stdlib.h>
// Cache-unfriendly access pattern
void cache_unfriendly(int *arr, int size) {
// Jump stride that causes cache misses
int stride = 1024;  // Large enough to miss cache lines
long long sum = 0;
for (int i = 0; i < size; i += stride) {
sum += arr[i % size];
}
}
// Cache-friendly access pattern
void cache_friendly(int *arr, int size) {
long long sum = 0;
// Sequential access - good cache locality
for (int i = 0; i < size; i++) {
sum += arr[i];
}
}
// Matrix multiplication with different cache behaviors
void matrix_multiply_cached(int n, double **a, double **b, double **c) {
// Cache-friendly loop ordering
for (int i = 0; i < n; i++) {
for (int k = 0; k < n; k++) {
double aik = a[i][k];
for (int j = 0; j < n; j++) {
c[i][j] += aik * b[k][j];
}
}
}
}
void matrix_multiply_uncached(int n, double **a, double **b, double **c) {
// Cache-unfriendly loop ordering
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++) {
c[i][j] += a[i][k] * b[k][j];
}
}
}
}
int main() {
const int SIZE = 1000000;
int *arr = malloc(SIZE * sizeof(int));
for (int i = 0; i < SIZE; i++) {
arr[i] = i;
}
printf("Running cache benchmarks...\n");
// These will have different cache miss rates
cache_unfriendly(arr, SIZE);
cache_friendly(arr, SIZE);
free(arr);
return 0;
}

Instrumentation with GCC's -finstrument-functions

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
// Function instrumentation hooks
void __cyg_profile_func_enter(void *this_fn, void *call_site) 
__attribute__((no_instrument_function));
void __cyg_profile_func_exit(void *this_fn, void *call_site) 
__attribute__((no_instrument_function));
static FILE *log_file = NULL;
void __cyg_profile_func_enter(void *this_fn, void *call_site) {
if (!log_file) {
log_file = fopen("instrument.log", "w");
}
struct timeval tv;
gettimeofday(&tv, NULL);
fprintf(log_file, "E %ld.%06ld %p\n", tv.tv_sec, tv.tv_usec, this_fn);
fflush(log_file);
}
void __cyg_profile_func_exit(void *this_fn, void *call_site) {
if (!log_file) return;
struct timeval tv;
gettimeofday(&tv, NULL);
fprintf(log_file, "X %ld.%06ld %p\n", tv.tv_sec, tv.tv_usec, this_fn);
fflush(log_file);
}
// Example functions
void function_a() {
volatile int sum = 0;
for (int i = 0; i < 1000000; i++) {
sum += i;
}
}
void function_b() {
volatile long long product = 1;
for (int i = 1; i <= 1000; i++) {
product *= i;
}
}
int main() {
function_a();
function_b();
if (log_file) fclose(log_file);
return 0;
}

Compile with:

gcc -finstrument-functions -g -o program program.c

Real-Time Profiling with Timer Interrupts

#include <stdio.h>
#include <signal.h>
#include <sys/time.h>
#include <unistd.h>
#include <stdlib.h>
#include <execinfo.h>
#define MAX_STACK_DEPTH 20
#define SAMPLE_INTERVAL_US 10000  // 10 ms
typedef struct {
void *addresses[MAX_STACK_DEPTH];
int depth;
long long count;
} StackSample;
StackSample samples[10000];
int sample_count = 0;
void sample_handler(int signum) {
if (sample_count >= 10000) return;
StackSample *sample = &samples[sample_count++];
sample->depth = backtrace(sample->addresses, MAX_STACK_DEPTH);
sample->count = 1;
}
void setup_timer() {
struct sigaction sa;
struct itimerval timer;
// Set up signal handler
sa.sa_handler = sample_handler;
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_RESTART;
sigaction(SIGPROF, &sa, NULL);
// Configure timer
timer.it_value.tv_sec = 0;
timer.it_value.tv_usec = SAMPLE_INTERVAL_US;
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = SAMPLE_INTERVAL_US;
setitimer(ITIMER_PROF, &timer, NULL);
}
void report_samples() {
printf("\n=== Profiling Report ===\n");
printf("Total samples: %d\n", sample_count);
// Simple aggregation by top address
// In practice, you'd want to aggregate by function name
}
// Test functions
void busy_work() {
volatile long long sum = 0;
for (long long i = 0; i < 10000000; i++) {
sum += i;
}
}
void medium_work() {
volatile long long sum = 0;
for (long long i = 0; i < 5000000; i++) {
sum += i;
}
}
int main() {
setup_timer();
printf("Starting workload...\n");
for (int i = 0; i < 5; i++) {
busy_work();
medium_work();
usleep(10000);
}
report_samples();
return 0;
}

Custom Call Graph Profiler

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
typedef struct CallNode {
char name[64];
long long total_time;
long long self_time;
long long call_count;
struct CallNode *parent;
struct CallNode *children[64];
int child_count;
} CallNode;
CallNode *root = NULL;
CallNode *current = NULL;
CallNode* find_child(CallNode *parent, const char *name) {
for (int i = 0; i < parent->child_count; i++) {
if (strcmp(parent->children[i]->name, name) == 0) {
return parent->children[i];
}
}
return NULL;
}
CallNode* create_node(const char *name, CallNode *parent) {
CallNode *node = malloc(sizeof(CallNode));
strncpy(node->name, name, sizeof(node->name) - 1);
node->total_time = 0;
node->self_time = 0;
node->call_count = 0;
node->parent = parent;
node->child_count = 0;
return node;
}
void profile_enter(const char *name) {
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
long long now_ns = now.tv_sec * 1000000000LL + now.tv_nsec;
if (!root) {
root = create_node(name, NULL);
current = root;
} else {
CallNode *child = find_child(current, name);
if (!child) {
child = create_node(name, current);
current->children[current->child_count++] = child;
}
current = child;
}
current->call_count++;
current->start_time = now_ns;
}
void profile_exit(const char *name) {
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
long long now_ns = now.tv_sec * 1000000000LL + now.tv_nsec;
long long elapsed = now_ns - current->start_time;
current->total_time += elapsed;
current->self_time += elapsed;
// Subtract from parent's self time
if (current->parent) {
current->parent->self_time -= elapsed;
}
current = current->parent;
}
void print_tree(CallNode *node, int depth) {
if (!node) return;
for (int i = 0; i < depth; i++) printf("  ");
printf("%s: calls=%lld, total=%.3fms, self=%.3fms\n",
node->name,
node->call_count,
node->total_time / 1000000.0,
node->self_time / 1000000.0);
for (int i = 0; i < node->child_count; i++) {
print_tree(node->children[i], depth + 1);
}
}
// Example usage
#define PROFILE_FUNC() \
void __attribute__((constructor)) profile_enter_##__func__() { \
profile_enter(__func__); \
} \
void __attribute__((destructor)) profile_exit_##__func__() { \
profile_exit(__func__); \
}
void function_a() {
PROFILE_FUNC();
volatile long long sum = 0;
for (int i = 0; i < 10000000; i++) sum += i;
}
void function_b() {
PROFILE_FUNC();
volatile long long sum = 0;
for (int i = 0; i < 5000000; i++) sum += i;
function_a();
}
void function_c() {
PROFILE_FUNC();
function_b();
function_a();
}
int main() {
for (int i = 0; i < 3; i++) {
function_c();
function_b();
}
print_tree(root, 0);
return 0;
}

Best Practices for Code Profiling

  1. Profile with realistic data: Use production-like workloads
  2. Profile optimized builds: -O2 or -O3 for accurate results
  3. Include debug symbols: -g for meaningful output
  4. Multiple runs: Average results to account for variance
  5. Profile in target environment: Similar hardware and OS
  6. Start with high-level profiling: Identify bottlenecks before micro-optimizing
  7. Profile different scenarios: Various input sizes and configurations
  8. Understand measurement overhead: Instrumentation affects results
  9. Use multiple tools: Different tools provide different insights
  10. Profile regularly: Integrate into CI/CD pipeline

Common Pitfalls

// 1. Measuring I/O without warming up
void bad_io_measurement() {
clock_t start = clock();
FILE *f = fopen("large_file.txt", "r");
// First read includes disk seek and caching
clock_t end = clock();  // Highly variable!
}
// Better: warm up first
void good_io_measurement() {
// Warm up file cache
FILE *f = fopen("large_file.txt", "r");
fclose(f);
clock_t start = clock();
f = fopen("large_file.txt", "r");
// Now measuring after caching
clock_t end = clock();
fclose(f);
}
// 2. Not preventing compiler optimization
void bad_prevention() {
int sum = 0;
for (int i = 0; i < 1000000; i++) {
sum += i;  // Compiler might optimize away!
}
}
void good_prevention() {
volatile int sum = 0;  // Prevents optimization
for (int i = 0; i < 1000000; i++) {
sum += i;
}
}

Conclusion

Code profiling in C is an essential skill for performance-critical applications. From simple manual timing to sophisticated tools like perf, gprof, and Valgrind, the C ecosystem offers a rich set of profiling capabilities.

Key takeaways:

  • Choose the right tool: perf for system-level, gprof for function-level, Valgrind for memory
  • Profile with purpose: Focus on bottlenecks that matter
  • Measure accurately: Account for variance and overhead
  • Profile continuously: Make profiling part of development workflow
  • Combine approaches: Use multiple tools for comprehensive insights

By mastering profiling techniques, you can transform slow, inefficient C programs into well-optimized, performant applications that fully utilize system resources. Remember: premature optimization is the root of all evil, but informed optimization based on profiling data is the path to performance excellence.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper