Parallel Data Processing: A Complete Guide to SIMD Programming in C

Single Instruction, Multiple Data (SIMD) is a parallel computing technique that allows a single instruction to operate on multiple data elements simultaneously. Modern processors feature SIMD instruction sets like SSE, AVX, and NEON that can dramatically accelerate data-parallel workloads. This comprehensive guide explores SIMD programming in C, from basic concepts to advanced optimization techniques.

What is SIMD?

SIMD enables processors to perform the same operation on multiple data points in one instruction. Instead of processing array elements one by one, SIMD processes them in chunks:

Scalar (4 instructions):      a0 + b0 → c0
                              a1 + b1 → c1
                              a2 + b2 → c2
                              a3 + b3 → c3

SIMD, 4-wide (1 instruction): [a0,a1,a2,a3] + [b0,b1,b2,b3] → [c0,c1,c2,c3]

SIMD Instruction Sets

Instruction Set    Register Width   Elements       Processors
MMX                64-bit           2 x 32-bit     Legacy Intel
SSE                128-bit          4 x 32-bit     Intel/AMD (since Pentium III)
SSE2               128-bit          2 x 64-bit     Intel/AMD
SSE3/SSE4          128-bit          Various        Intel/AMD
AVX                256-bit          8 x 32-bit     Intel Sandy Bridge+
AVX2               256-bit          8 x 32-bit     Intel Haswell+
AVX-512            512-bit          16 x 32-bit    Intel Skylake-X+
NEON               128-bit          4 x 32-bit     ARM Cortex-A
SVE                Scalable         Variable       ARMv8.2+
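
Which of these extensions the running CPU actually supports can be checked at runtime. A minimal sketch, assuming GCC or Clang on x86, using the __builtin_cpu_supports builtin (other compilers would query CPUID directly):

#include <stdio.h>

// Report the widest SIMD extension the running CPU supports (GCC/Clang, x86 only)
int main(void) {
    __builtin_cpu_init();  // initialize the CPU model data used by the checks below
    if (__builtin_cpu_supports("avx512f"))
        printf("AVX-512F available\n");
    else if (__builtin_cpu_supports("avx2"))
        printf("AVX2 available\n");
    else if (__builtin_cpu_supports("sse4.2"))
        printf("SSE4.2 available\n");
    else
        printf("Baseline SSE2 only\n");
    return 0;
}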

SIMD Programming Approaches

1. Compiler Auto-Vectorization

Modern compilers can automatically generate SIMD code from scalar loops:

#include <stdio.h>
#include <stdlib.h>

// Compiler may auto-vectorize this loop
void vector_add(float *a, float *b, float *c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// Compile with: gcc -O3 -march=native -ftree-vectorize -fopt-info-vec
// To help the compiler:
// - Use restrict to indicate no pointer aliasing
// - Use aligned memory
// - Use loop counts that are multiples of the vector size
void vector_add_optimized(float *restrict a, float *restrict b,
                          float *restrict c, int n) {
    #pragma omp simd
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

2. Intrinsics (Compiler-Specific)

Intrinsics provide C functions that map directly to SIMD instructions:

#include <immintrin.h>  // SSE, AVX intrinsics
#include <arm_neon.h>   // ARM NEON intrinsics
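
The right header and code path are usually chosen at compile time from the feature macros the compiler predefines when a target feature is enabled (for example with -mavx2 or -march=native). A small sketch of that pattern:

// Select an implementation at compile time from the target-feature macros
#if defined(__AVX2__)
#include <immintrin.h>
#define SIMD_WIDTH 8
#elif defined(__SSE2__)
#include <emmintrin.h>
#define SIMD_WIDTH 4
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#define SIMD_WIDTH 4
#else
#define SIMD_WIDTH 1   // scalar fallback
#endif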

SSE Intrinsics (x86-64)

1. Basic SSE Operations

#include <immintrin.h>
#include <stdio.h>

// 128-bit SIMD operations on 4 floats
void sse_basic_example(void) {
    // Load 4 floats into registers (note: _mm_set_ps takes arguments in reverse order!)
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
    // _mm_setr_ps takes arguments in memory order
    __m128 c = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);

    // Arithmetic operations (c, diff, prod, quot are shown for illustration)
    __m128 sum  = _mm_add_ps(a, b);     // a + b
    __m128 diff = _mm_sub_ps(a, b);     // a - b
    __m128 prod = _mm_mul_ps(a, b);     // a * b
    __m128 quot = _mm_div_ps(a, b);     // a / b

    // Store results (unaligned store: a plain local array is not guaranteed 16-byte aligned)
    float result[4];
    _mm_storeu_ps(result, sum);
    printf("Sum: %f, %f, %f, %f\n", result[0], result[1], result[2], result[3]);
}

// Vector addition of arrays
void sse_vector_add(float *a, float *b, float *c, int n) {
    int i = 0;
    // Process 4 elements at a time
    for (; i + 3 < n; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);  // Unaligned load
        __m128 vb = _mm_loadu_ps(&b[i]);
        __m128 vc = _mm_add_ps(va, vb);
        _mm_storeu_ps(&c[i], vc);
    }
    // Handle remaining elements
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

2. Aligned Memory Access

#include <stdlib.h>
#include <immintrin.h>

// Aligned memory allocation
float* aligned_alloc_float(size_t n, size_t alignment) {
    void *ptr;
    if (posix_memalign(&ptr, alignment, n * sizeof(float)) != 0) {
        return NULL;
    }
    return (float*)ptr;
}

// Vector addition with aligned memory
void sse_vector_add_aligned(float *a, float *b, float *c, int n) {
    int i = 0;
    // Process 4 elements at a time with aligned loads
    for (; i + 3 < n; i += 4) {
        __m128 va = _mm_load_ps(&a[i]);   // Requires 16-byte alignment
        __m128 vb = _mm_load_ps(&b[i]);
        __m128 vc = _mm_add_ps(va, vb);
        _mm_store_ps(&c[i], vc);
    }
    // Handle remainder
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
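
posix_memalign is POSIX-only. As an alternative sketch (helper names here are illustrative), the intrinsics headers also provide _mm_malloc/_mm_free, and C11 offers aligned_alloc, which requires the requested size to be a multiple of the alignment:

#include <stdlib.h>
#include <immintrin.h>

// 32-byte-aligned buffer via the intrinsics-header allocator; release with _mm_free()
float* alloc_floats_mm(size_t n) {
    return (float*)_mm_malloc(n * sizeof(float), 32);
}

// 32-byte-aligned buffer via C11 aligned_alloc; release with free()
float* alloc_floats_c11(size_t n) {
    size_t bytes = (n * sizeof(float) + 31) & ~(size_t)31;  // round size up to a multiple of 32
    return (float*)aligned_alloc(32, bytes);
}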

3. SSE Math Functions

#include <immintrin.h>
#include <math.h>

// Vectorized square root
void sse_sqrt(float *a, float *b, int n) {
    int i = 0;
    for (; i + 3 < n; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);
        __m128 vb = _mm_sqrt_ps(va);      // Square root of 4 floats
        _mm_storeu_ps(&b[i], vb);
    }
    for (; i < n; i++) {
        b[i] = sqrtf(a[i]);
    }
}

// Vectorized reciprocal approximation (faster than division, but only about 12 bits of precision)
void sse_reciprocal(float *a, float *b, int n) {
    int i = 0;
    for (; i + 3 < n; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);
        __m128 vb = _mm_rcp_ps(va);       // Approximate reciprocal
        _mm_storeu_ps(&b[i], vb);
    }
    for (; i < n; i++) {
        b[i] = 1.0f / a[i];
    }
}
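
When _mm_rcp_ps is not accurate enough, one Newton-Raphson step, x1 = x0 * (2 - a * x0), roughly doubles the precision of the estimate while staying much cheaper than a full divide. A minimal sketch:

#include <immintrin.h>

// Refine the rcpps estimate with a single Newton-Raphson iteration
static inline __m128 rcp_refined_ps(__m128 a) {
    __m128 x0  = _mm_rcp_ps(a);                              // initial ~12-bit estimate
    __m128 two = _mm_set1_ps(2.0f);
    return _mm_mul_ps(x0, _mm_sub_ps(two, _mm_mul_ps(a, x0)));  // x0 * (2 - a*x0)
}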

4. SSE Integer Operations

#include <immintrin.h>

// Vectorized integer addition (16-bit integers)
void sse_short_add(short *a, short *b, short *c, int n) {
    int i = 0;
    // Process 8 shorts at a time (128 bits / 16 bits = 8)
    for (; i + 7 < n; i += 8) {
        __m128i va = _mm_loadu_si128((__m128i*)&a[i]);
        __m128i vb = _mm_loadu_si128((__m128i*)&b[i]);
        __m128i vc = _mm_add_epi16(va, vb);
        _mm_storeu_si128((__m128i*)&c[i], vc);
    }
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// Vectorized dot product (32-bit integers)
// Note: _mm_mullo_epi32 requires SSE4.1 (compile with -msse4.1 or -march=native)
int sse_dot_product(int *a, int *b, int n) {
    __m128i sum = _mm_setzero_si128();
    int i = 0;
    for (; i + 3 < n; i += 4) {
        __m128i va = _mm_loadu_si128((__m128i*)&a[i]);
        __m128i vb = _mm_loadu_si128((__m128i*)&b[i]);
        __m128i prod = _mm_mullo_epi32(va, vb);
        sum = _mm_add_epi32(sum, prod);
    }
    // Horizontal sum of the 4 components
    int result[4];
    _mm_storeu_si128((__m128i*)result, sum);
    int total = result[0] + result[1] + result[2] + result[3];
    // Handle remaining elements
    for (; i < n; i++) {
        total += a[i] * b[i];
    }
    return total;
}
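
The store-then-add reduction above works, but the horizontal sum can also stay in registers. A sketch using plain SSE2 shuffles:

#include <immintrin.h>

// Horizontal sum of the four 32-bit lanes of an __m128i without going through memory
static inline int hsum_epi32(__m128i v) {
    __m128i hi = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2));  // swap upper/lower 64-bit halves
    v = _mm_add_epi32(v, hi);                                    // lanes now hold pairwise sums
    hi = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1));          // swap adjacent lanes
    v = _mm_add_epi32(v, hi);                                    // every lane holds the full sum
    return _mm_cvtsi128_si32(v);                                 // extract lane 0
}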

AVX/AVX2 Intrinsics (256-bit)

#include <immintrin.h>

// AVX vector addition (8 floats at once)
void avx_vector_add(float *a, float *b, float *c, int n) {
    int i = 0;
    // Process 8 elements at a time
    for (; i + 7 < n; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        __m256 vc = _mm256_add_ps(va, vb);
        _mm256_storeu_ps(&c[i], vc);
    }
    // Handle remainder
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// AVX2 integer operations
void avx2_int_add(int *a, int *b, int *c, int n) {
    int i = 0;
    // Process 8 ints at a time
    for (; i + 7 < n; i += 8) {
        __m256i va = _mm256_loadu_si256((__m256i*)&a[i]);
        __m256i vb = _mm256_loadu_si256((__m256i*)&b[i]);
        __m256i vc = _mm256_add_epi32(va, vb);
        _mm256_storeu_si256((__m256i*)&c[i], vc);
    }
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// Fused multiply-add: requires the FMA extension (shipped alongside AVX2 on Haswell+);
// compile with -mfma or -march=native
void avx_fma(float *a, float *b, float *c, float *d, int n) {
    int i = 0;
    // d[i] = a[i] * b[i] + c[i]
    for (; i + 7 < n; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        __m256 vc = _mm256_loadu_ps(&c[i]);
        __m256 vd = _mm256_fmadd_ps(va, vb, vc);
        _mm256_storeu_ps(&d[i], vd);
    }
    for (; i < n; i++) {
        d[i] = a[i] * b[i] + c[i];
    }
}

AVX-512 Intrinsics (512-bit)

#include <immintrin.h>

// AVX-512 vector addition (16 floats at once)
void avx512_vector_add(float *a, float *b, float *c, int n) {
    int i = 0;
    // Process 16 elements at a time
    for (; i + 15 < n; i += 16) {
        __m512 va = _mm512_loadu_ps(&a[i]);
        __m512 vb = _mm512_loadu_ps(&b[i]);
        __m512 vc = _mm512_add_ps(va, vb);
        _mm512_storeu_ps(&c[i], vc);
    }
    // Handle the remainder with a mask instead of a scalar loop
    if (i < n) {
        int remaining = n - i;
        __mmask16 mask = (__mmask16)((1u << remaining) - 1);
        __m512 va = _mm512_maskz_loadu_ps(mask, &a[i]);
        __m512 vb = _mm512_maskz_loadu_ps(mask, &b[i]);
        __m512 vc = _mm512_add_ps(va, vb);
        _mm512_mask_storeu_ps(&c[i], mask, vc);
    }
}

// Gather: load elements from scattered indices
void avx512_gather_scatter(float *src, float *dst, int *indices, int n) {
    int i = 0;
    for (; i + 15 < n; i += 16) {
        __m512i idx = _mm512_loadu_si512(&indices[i]);
        __m512 gathered = _mm512_i32gather_ps(idx, src, 4);  // scale = 4 bytes per float
        _mm512_storeu_ps(&dst[i], gathered);
    }
    for (; i < n; i++) {
        dst[i] = src[indices[i]];
    }
}

ARM NEON Intrinsics

#include <arm_neon.h>
#include <stdio.h>

// NEON vector addition (4 floats)
void neon_vector_add(float *a, float *b, float *c, int n) {
    int i = 0;
    // Process 4 elements at a time
    for (; i + 3 < n; i += 4) {
        float32x4_t va = vld1q_f32(&a[i]);
        float32x4_t vb = vld1q_f32(&b[i]);
        float32x4_t vc = vaddq_f32(va, vb);
        vst1q_f32(&c[i], vc);
    }
    // Handle remainder
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// NEON integer operations (8-bit)
void neon_uint8_add(uint8_t *a, uint8_t *b, uint8_t *c, int n) {
    int i = 0;
    // Process 16 bytes at a time
    for (; i + 15 < n; i += 16) {
        uint8x16_t va = vld1q_u8(&a[i]);
        uint8x16_t vb = vld1q_u8(&b[i]);
        uint8x16_t vc = vaddq_u8(va, vb);
        vst1q_u8(&c[i], vc);
    }
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
// NEON 4x4 matrix multiplication: c = a * b (row-major)
void neon_matrix_multiply(float a[4][4], float b[4][4], float c[4][4]) {
    // Load the rows of B; each row of C is a weighted sum of the rows of B
    float32x4_t b0 = vld1q_f32(&b[0][0]);
    float32x4_t b1 = vld1q_f32(&b[1][0]);
    float32x4_t b2 = vld1q_f32(&b[2][0]);
    float32x4_t b3 = vld1q_f32(&b[3][0]);
    for (int i = 0; i < 4; i++) {
        float32x4_t row = vld1q_f32(&a[i][0]);
        // Scale each row of B by the matching element of A's row
        float32x4_t r0 = vmulq_n_f32(b0, vgetq_lane_f32(row, 0));
        float32x4_t r1 = vmulq_n_f32(b1, vgetq_lane_f32(row, 1));
        float32x4_t r2 = vmulq_n_f32(b2, vgetq_lane_f32(row, 2));
        float32x4_t r3 = vmulq_n_f32(b3, vgetq_lane_f32(row, 3));
        // Sum the partial products
        float32x4_t sum = vaddq_f32(r0, vaddq_f32(r1, vaddq_f32(r2, r3)));
        vst1q_f32(&c[i][0], sum);
    }
}

Practical Examples

1. Image Processing: Grayscale Conversion

#include <stdint.h>
#include <immintrin.h>

// Convert RGB to grayscale: Y = 0.299 R + 0.587 G + 0.114 B
// This version works on planar data (separate R, G, B arrays); interleaved RGB would first
// have to be de-interleaved (e.g. with _mm_shuffle_epi8), which is omitted here.
// Coefficients as 8.8 fixed point: 0.299*256 ≈ 76, 0.587*256 ≈ 150, 0.114*256 ≈ 29.
void rgb_to_grayscale_sse(const uint8_t *r, const uint8_t *g, const uint8_t *b,
                          uint8_t *gray, int width, int height) {
    int pixels = width * height;
    int i = 0;
    __m128i coeff_r = _mm_set1_epi16(76);
    __m128i coeff_g = _mm_set1_epi16(150);
    __m128i coeff_b = _mm_set1_epi16(29);
    __m128i zero = _mm_setzero_si128();
    for (; i + 7 < pixels; i += 8) {
        // Load 8 bytes from each channel and widen to 16-bit
        __m128i r16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)&r[i]), zero);
        __m128i g16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)&g[i]), zero);
        __m128i b16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)&b[i]), zero);
        // Multiply by the fixed-point coefficients and accumulate
        __m128i y16 = _mm_mullo_epi16(r16, coeff_r);
        y16 = _mm_add_epi16(y16, _mm_mullo_epi16(g16, coeff_g));
        y16 = _mm_add_epi16(y16, _mm_mullo_epi16(b16, coeff_b));
        // Divide by 256 and pack back to 8-bit with saturation
        y16 = _mm_srli_epi16(y16, 8);
        __m128i y8 = _mm_packus_epi16(y16, y16);
        _mm_storel_epi64((__m128i*)&gray[i], y8);
    }
    // Handle remaining pixels with scalar code
    for (; i < pixels; i++) {
        gray[i] = (uint8_t)((r[i] * 76 + g[i] * 150 + b[i] * 29) >> 8);
    }
}

2. Convolution (3x3 Kernel)

#include <immintrin.h>

void convolution_3x3_sse(float *input, float *output, float *kernel,
                         int width, int height) {
    // Broadcast each kernel coefficient into its own vector
    __m128 k0 = _mm_set1_ps(kernel[0]);
    __m128 k1 = _mm_set1_ps(kernel[1]);
    __m128 k2 = _mm_set1_ps(kernel[2]);
    __m128 k3 = _mm_set1_ps(kernel[3]);
    __m128 k4 = _mm_set1_ps(kernel[4]);
    __m128 k5 = _mm_set1_ps(kernel[5]);
    __m128 k6 = _mm_set1_ps(kernel[6]);
    __m128 k7 = _mm_set1_ps(kernel[7]);
    __m128 k8 = _mm_set1_ps(kernel[8]);
    for (int y = 1; y < height - 1; y++) {
        int x = 1;
        // Compute 4 output positions at a time
        for (; x + 3 < width - 1; x += 4) {
            int idx = y * width + x;
            // Top row
            __m128 t0 = _mm_loadu_ps(&input[(y-1)*width + x-1]);
            __m128 t1 = _mm_loadu_ps(&input[(y-1)*width + x]);
            __m128 t2 = _mm_loadu_ps(&input[(y-1)*width + x+1]);
            // Middle row
            __m128 m0 = _mm_loadu_ps(&input[y*width + x-1]);
            __m128 m1 = _mm_loadu_ps(&input[y*width + x]);
            __m128 m2 = _mm_loadu_ps(&input[y*width + x+1]);
            // Bottom row
            __m128 b0 = _mm_loadu_ps(&input[(y+1)*width + x-1]);
            __m128 b1 = _mm_loadu_ps(&input[(y+1)*width + x]);
            __m128 b2 = _mm_loadu_ps(&input[(y+1)*width + x+1]);
            // Apply kernel
            __m128 sum = _mm_mul_ps(t0, k0);
            sum = _mm_add_ps(sum, _mm_mul_ps(t1, k1));
            sum = _mm_add_ps(sum, _mm_mul_ps(t2, k2));
            sum = _mm_add_ps(sum, _mm_mul_ps(m0, k3));
            sum = _mm_add_ps(sum, _mm_mul_ps(m1, k4));
            sum = _mm_add_ps(sum, _mm_mul_ps(m2, k5));
            sum = _mm_add_ps(sum, _mm_mul_ps(b0, k6));
            sum = _mm_add_ps(sum, _mm_mul_ps(b1, k7));
            sum = _mm_add_ps(sum, _mm_mul_ps(b2, k8));
            _mm_storeu_ps(&output[idx], sum);
        }
        // Scalar remainder for the last few columns of the row
        for (; x < width - 1; x++) {
            float s = 0.0f;
            for (int ky = -1; ky <= 1; ky++)
                for (int kx = -1; kx <= 1; kx++)
                    s += input[(y + ky) * width + (x + kx)] * kernel[(ky + 1) * 3 + (kx + 1)];
            output[y * width + x] = s;
        }
    }
}

3. Mandelbrot Set Calculation

#include <immintrin.h>

// Calculate Mandelbrot iteration counts for 4 horizontally adjacent points at once
__m128i mandelbrot_sse(float x0, float y0, float x_step, int max_iter) {
    __m128 x = _mm_set_ps(x0 + x_step*3, x0 + x_step*2, x0 + x_step, x0);
    __m128 y = _mm_set1_ps(y0);
    __m128 zx = _mm_setzero_ps();
    __m128 zy = _mm_setzero_ps();
    __m128i iter = _mm_setzero_si128();
    __m128i one = _mm_set1_epi32(1);
    for (int i = 0; i < max_iter; i++) {
        __m128 zx2 = _mm_mul_ps(zx, zx);
        __m128 zy2 = _mm_mul_ps(zy, zy);
        __m128 zxzy = _mm_mul_ps(zx, zy);
        __m128 len2 = _mm_add_ps(zx2, zy2);
        // Lanes with |z|^2 <= 4 are still iterating
        __m128 cmp = _mm_cmple_ps(len2, _mm_set1_ps(4.0f));
        if (_mm_movemask_ps(cmp) == 0) {
            break;  // all 4 points have escaped
        }
        zx = _mm_add_ps(_mm_sub_ps(zx2, zy2), x);
        zy = _mm_add_ps(_mm_add_ps(zxzy, zxzy), y);
        // Increment the iteration count only for lanes that have not escaped
        __m128i inc = _mm_and_si128(_mm_castps_si128(cmp), one);
        iter = _mm_add_epi32(iter, inc);
    }
    return iter;
}

4. Matrix Multiplication (Blocked)

#include <immintrin.h>

// Blocked matrix multiplication using AVX and FMA (compile with -mavx2 -mfma or -march=native)
void matrix_multiply_avx(float *a, float *b, float *c, int n) {
    // Block size for better cache locality
    const int BLOCK = 64;
    for (int i0 = 0; i0 < n; i0 += BLOCK) {
        for (int j0 = 0; j0 < n; j0 += BLOCK) {
            for (int k0 = 0; k0 < n; k0 += BLOCK) {
                for (int i = i0; i < i0 + BLOCK && i < n; i++) {
                    for (int k = k0; k < k0 + BLOCK && k < n; k++) {
                        __m256 aik = _mm256_set1_ps(a[i * n + k]);
                        int j = j0;
                        for (; j + 7 < j0 + BLOCK && j + 7 < n; j += 8) {
                            __m256 bkj = _mm256_loadu_ps(&b[k * n + j]);
                            __m256 cij = _mm256_loadu_ps(&c[i * n + j]);
                            cij = _mm256_fmadd_ps(aik, bkj, cij);
                            _mm256_storeu_ps(&c[i * n + j], cij);
                        }
                        for (; j < j0 + BLOCK && j < n; j++) {
                            c[i * n + j] += a[i * n + k] * b[k * n + j];
                        }
                    }
                }
            }
        }
    }
}

SIMD Optimization Techniques

1. Data Alignment

#include <stdlib.h>
#include <immintrin.h>

// Proper alignment for maximum performance.
// Note: do not name this helper aligned_alloc -- that clashes with the C11 function in <stdlib.h>.
float* alloc_aligned_floats(size_t n, size_t alignment) {
    void *ptr;
    if (posix_memalign(&ptr, alignment, n * sizeof(float)) != 0) {
        return NULL;
    }
    return (float*)ptr;
}

// Use aligned loads/stores when possible
void aligned_process(float *a, float *b, float *c, int n) {
    // Assumes a, b, c are 32-byte aligned and n is a multiple of 8
    for (int i = 0; i < n; i += 8) {
        __m256 va = _mm256_load_ps(&a[i]);   // Aligned load
        __m256 vb = _mm256_load_ps(&b[i]);
        __m256 vc = _mm256_add_ps(va, vb);
        _mm256_store_ps(&c[i], vc);          // Aligned store
    }
}

2. Loop Unrolling

#include <immintrin.h>

// Unrolled AVX loop for better instruction-level parallelism
void avx_add_unrolled(float *a, float *b, float *c, int n) {
    int i = 0;
    for (; i + 15 < n; i += 16) {
        __m256 va0 = _mm256_loadu_ps(&a[i]);
        __m256 va1 = _mm256_loadu_ps(&a[i+8]);
        __m256 vb0 = _mm256_loadu_ps(&b[i]);
        __m256 vb1 = _mm256_loadu_ps(&b[i+8]);
        __m256 vc0 = _mm256_add_ps(va0, vb0);
        __m256 vc1 = _mm256_add_ps(va1, vb1);
        _mm256_storeu_ps(&c[i], vc0);
        _mm256_storeu_ps(&c[i+8], vc1);
    }
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

3. Prefetching

#include <immintrin.h>

void prefetch_example(float *a, float *b, float *c, int n) {
    int i = 0;
    for (; i + 31 < n; i += 32) {
        // Prefetch the next iteration's data into the L1 cache
        _mm_prefetch((const char*)&a[i + 32], _MM_HINT_T0);
        _mm_prefetch((const char*)&b[i + 32], _MM_HINT_T0);
        // Process the current chunk of 32 floats
        __m256 va0 = _mm256_loadu_ps(&a[i]);
        __m256 va1 = _mm256_loadu_ps(&a[i+8]);
        __m256 va2 = _mm256_loadu_ps(&a[i+16]);
        __m256 va3 = _mm256_loadu_ps(&a[i+24]);
        __m256 vb0 = _mm256_loadu_ps(&b[i]);
        __m256 vb1 = _mm256_loadu_ps(&b[i+8]);
        __m256 vb2 = _mm256_loadu_ps(&b[i+16]);
        __m256 vb3 = _mm256_loadu_ps(&b[i+24]);
        __m256 vc0 = _mm256_add_ps(va0, vb0);
        __m256 vc1 = _mm256_add_ps(va1, vb1);
        __m256 vc2 = _mm256_add_ps(va2, vb2);
        __m256 vc3 = _mm256_add_ps(va3, vb3);
        _mm256_storeu_ps(&c[i], vc0);
        _mm256_storeu_ps(&c[i+8], vc1);
        _mm256_storeu_ps(&c[i+16], vc2);
        _mm256_storeu_ps(&c[i+24], vc3);
    }
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

Compiler-Specific Optimizations

// GCC/Clang vector extensions (simpler than intrinsics)
typedef float v8sf __attribute__((vector_size(32)));

void vector_extension_example(float *a, float *b, float *c, int n) {
    // Assumes 32-byte-aligned arrays and n a multiple of 8
    for (int i = 0; i < n; i += 8) {
        v8sf va = *(v8sf*)&a[i];
        v8sf vb = *(v8sf*)&b[i];
        v8sf vc = va + vb;  // operator syntax, compiled to vector instructions
        *(v8sf*)&c[i] = vc;
    }
}

// OpenMP SIMD directives (compile with -fopenmp or -fopenmp-simd)
#pragma omp declare simd
float my_function(float x) {
    return x * x + x;
}

void openmp_simd_example(float *a, float *b, int n) {
    #pragma omp simd
    for (int i = 0; i < n; i++) {
        b[i] = my_function(a[i]);
    }
}

Performance Measurement

#include <time.h>
#include <stdio.h>
#include <stdlib.h>

double get_time(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

void benchmark(const char *name, void (*func)(float*, float*, float*, int),
               float *a, float *b, float *c, int n, int iterations) {
    double start = get_time();
    for (int i = 0; i < iterations; i++) {
        func(a, b, c, n);
    }
    double elapsed = get_time() - start;
    double ops = (double)n * iterations / elapsed;
    printf("%s: %.3f seconds, %.2f M elements/sec\n", name, elapsed, ops / 1e6);
}

// Usage (vector_add, sse_vector_add, and avx_vector_add are the versions defined earlier)
int main(void) {
    int n = 10000000;
    float *a = aligned_alloc_float(n, 32);
    float *b = aligned_alloc_float(n, 32);
    float *c = aligned_alloc_float(n, 32);
    // Initialize arrays...
    benchmark("Scalar", vector_add, a, b, c, n, 100);
    benchmark("SSE", sse_vector_add, a, b, c, n, 100);
    benchmark("AVX", avx_vector_add, a, b, c, n, 100);
    free(a); free(b); free(c);
    return 0;
}

Best Practices

  1. Use compiler auto-vectorization first: Let the compiler do the work
  2. Align data: 16-byte for SSE, 32-byte for AVX (see the alignment sketch after this list)
  3. Use intrinsics for critical loops: When auto-vectorization fails
  4. Consider cache behavior: Block algorithms for large datasets
  5. Profile and measure: Verify performance gains
  6. Provide scalar fallbacks: For portability
  7. Use appropriate data types: float packs twice as many lanes per vector as double
  8. Handle remainder elements: Don't forget trailing elements
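
Alignment can also be requested statically. A minimal sketch using C11 alignas and the GCC/Clang __builtin_assume_aligned hint (buffer and function names here are illustrative):

#include <stdalign.h>
#include <immintrin.h>

// Statically 32-byte-aligned buffers, suitable for _mm256_load_ps/_mm256_store_ps
alignas(32) static float buf_a[1024];
alignas(32) static float buf_b[1024];

void add_aligned_hint(float *a, float *b, float *c, int n) {
    // Promise the compiler that the pointers are 32-byte aligned so it can
    // emit aligned loads/stores even when it cannot prove the alignment itself
    float *pa = __builtin_assume_aligned(a, 32);
    float *pb = __builtin_assume_aligned(b, 32);
    float *pc = __builtin_assume_aligned(c, 32);
    for (int i = 0; i < n; i++) {
        pc[i] = pa[i] + pb[i];
    }
}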

Common Pitfalls

// 1. Using aligned loads/stores on unaligned data
__m128 va = _mm_load_ps(&a[i]);  // Faults if &a[i] is not 16-byte aligned; use _mm_loadu_ps instead
// 2. Hard-coding assumptions about vector width
// Keep the elements-per-vector count in one place (SSE: 4 floats, AVX: 8) and broadcast
// constants with _mm_set1_ps rather than scattering scalar magic numbers through the code
// 3. Not handling the remainder
for (int i = 0; i < n; i += 4) {  // Reads/writes past the end if n is not a multiple of 4
    // ...
}
// 4. Mixing data types without conversion
__m128i va = _mm_loadu_si128((__m128i*)a);  // Wrong if a is float*; use _mm_loadu_ps instead
// 5. Forgetting to clear the upper YMM halves when mixing AVX and SSE code
// Use vzeroupper to avoid AVX-SSE transition penalties
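
For pitfall 5, a minimal illustration (function name is illustrative): compilers normally insert vzeroupper automatically when they generate AVX code, but hand-written transitions can issue it explicitly via the intrinsic:

#include <immintrin.h>

void avx_work_then_legacy(float *a, float *b) {
    __m256 v = _mm256_loadu_ps(a);   // 256-bit AVX work
    _mm256_storeu_ps(b, v);
    _mm256_zeroupper();              // clear the upper halves of the YMM registers
    // ... legacy SSE code called after this point avoids AVX-SSE transition penalties
}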

Conclusion

SIMD programming in C offers substantial performance gains for data-parallel workloads. Key takeaways:

  • Start with compiler auto-vectorization: The simplest approach
  • Use intrinsics for critical paths: When manual control is needed
  • Align data: Essential for maximum performance
  • Consider architecture: Different SIMD extensions for different CPUs
  • Provide fallbacks: Maintain scalar versions for portability

With modern processors offering wide SIMD capabilities (AVX-512 up to 16 floats per instruction), mastering SIMD programming is essential for high-performance computing, image processing, scientific computing, and game development. The techniques in this guide provide a foundation for writing efficient, vectorized code in C.
