C is often described as "portable assembly language," but there are times when you need to step down to actual assembly code. Whether for performance-critical sections, accessing processor-specific features, or implementing hardware-level operations, integrating assembly with C is an essential skill for systems programmers. This comprehensive guide covers everything from inline assembly to separate assembly modules.
Why Integrate Assembly with C?
- Performance optimization: Hand-tuned critical loops
- Hardware access: Special instructions not available in C
- Interrupt handling: Low-level interrupt service routines
- Boot code: Initialization before C runtime is ready
- Processor features: SIMD, cryptography extensions
- Real-time constraints: Precise timing control
Inline Assembly Basics
1. GCC Inline Assembly Syntax
#include <stdio.h>
// Basic inline assembly
/* Demonstrates the minimal GCC extended-asm form: one output, two inputs.
 * The "0"(b) matching constraint pre-loads result's register with b, so
 * the single ADD computes a + b (30).  x86/AT&T syntax only. */
void basic_asm_example() {
    int a = 10, b = 20, result;
    /* result starts as b (matching constraint), then a is added */
    __asm__ volatile (
        "add %1, %0"        // %0 += %1 (AT&T: source first, destination last)
        : "=r"(result)      // Output operands: any general register
        : "r"(a), "0"(b)    // Input operands: a in any reg; b tied to operand 0
        :                   // Clobbered registers: none listed -- NOTE(review):
                            // ADD rewrites the flags, "cc" would be safer here
    );
    printf("Result: %d\n", result);
}
2. GCC Extended ASM Format
// Extended inline assembly format
__asm__ volatile (
"assembly code"
: output_operands
: input_operands
: clobbered_registers
);
// Example: Atomic exchange
/* Atomically swaps *ptr with new_val and returns the previous value.
 * XCHG with a memory operand is implicitly LOCKed on x86, so no explicit
 * "lock" prefix is needed.  The "memory" clobber prevents the compiler
 * from caching *ptr in a register across the asm. */
int atomic_exchange(int *ptr, int new_val) {
    int old_val;
    __asm__ volatile (
        "xchg %0, %1"
        : "=r"(old_val), "+m"(*ptr)   // register <-> memory swap
        : "0"(new_val)                // register seeded with new_val
        : "memory"
    );
    return old_val;
}
Register Constraints and Operands
#include <stdint.h>
// Common register constraints
/* Illustrates the common x86 constraint letters.  Computes a + b + 5
 * into "result" purely as a demonstration; the value is never used. */
void register_constraints_example() {
    int a = 10;
    int b = 20;
    int result;
    __asm__ volatile (
        "add %2, %0\n\t"   // result += b
        "add %3, %0"       // result += 5 ("i" expands to the immediate $5)
        : "=r"(result)     // Output: any register
        : "0"(a),          // Input: same register as output 0 (seeds result = a)
        "r"(b),            // Input: any register
        "i"(5)             // Input: immediate constant
        : "cc"             // Clobbers condition codes
    );
}
// Constraint types
// "r" - Any register
// "m" - Memory operand
// "i" - Immediate integer
// "g" - Any register, memory, or immediate
// "a" - %eax/%rax
// "b" - %ebx/%rbx
// "c" - %ecx/%rcx
// "d" - %edx/%rdx
// "S" - %esi/%rsi
// "D" - %edi/%rdi
// "q" - Register with an addressable low byte (any integer register on x86_64; only a/b/c/d on 32-bit x86)
Common Inline Assembly Patterns
1. Bit Manipulation
#include <stdint.h>
// Rotate left
/* Rotate a 32-bit value left by `shift` bits (x86).
 *
 * A variable-count ROL must take its count in CL, which is why the shift
 * operand uses the "c" (ECX) constraint.  The original template used
 * "%b1", the low byte of operand 1 -- but operand 1 is tied to the value
 * x itself, so x was rotated by its own low byte instead of by `shift`.
 * The hardware masks the count to 5 bits, so shift values 0..31 behave
 * as expected. */
uint32_t rotl32(uint32_t x, int shift) {
    __asm__ volatile (
        "rol %%cl, %0"      /* rotate %0 left by CL (the shift operand) */
        : "=r"(x)
        : "0"(x), "c"(shift)
        : "cc"
    );
    return x;
}
// Bit test and set
/* Tests bit `bit` of *addr and sets it.  Returns 0 if the bit was
 * previously clear, -1 (all ones, produced by SBB from the carry flag)
 * if it was previously set -- callers should treat the result as a
 * boolean.  NOTE(review): there is no LOCK prefix here, so this is NOT
 * atomic against other CPUs; add "lock" for true SMP atomicity. */
int test_and_set_bit(volatile uint32_t *addr, int bit) {
    int old;
    __asm__ volatile (
        "bts %2, %1\n\t"   // CF = old value of the bit; bit is set in memory
        "sbb %0, %0"       // old = 0 - CF  ->  0 or 0xFFFFFFFF
        : "=r"(old), "+m"(*addr)
        : "r"(bit)
        : "cc"
    );
    return old;
}
// Count leading zeros (CLZ)
/* Count leading zero bits of a 32-bit value (x86).
 *
 * The original used the ARM "clz" mnemonic, which does not exist on x86
 * and fails to assemble there.  BSR -- available on every x86 -- returns
 * the index of the highest set bit, so clz(x) = 31 - bsr(x).  BSR's
 * destination is undefined for a zero input, hence the explicit guard,
 * which also makes clz(0) == 32, matching ARM CLZ / x86 LZCNT semantics. */
int count_leading_zeros(uint32_t x) {
    int high_bit;
    if (x == 0)
        return 32;          /* BSR is undefined for 0 */
    __asm__ volatile (
        "bsr %1, %0"
        : "=r"(high_bit)
        : "r"(x)
        : "cc"
    );
    return 31 - high_bit;
}
2. Atomic Operations
#include <stdint.h>
// Atomic increment
/* Atomically increments *ptr (SMP-safe thanks to the LOCK prefix).
 * "+m" makes the memory location both input and output; the "memory"
 * clobber orders the increment against surrounding loads and stores. */
void atomic_increment(volatile int *ptr) {
    __asm__ volatile (
        "lock incl %0"
        : "+m"(*ptr)
        :
        : "memory"
    );
}
// Compare and swap
/* Atomic compare-and-swap: if *ptr == old_val, store new_val into *ptr.
 * CMPXCHG compares EAX (seeded with old_val via the "0" matching
 * constraint) against *ptr and leaves the value actually found in EAX,
 * so the function returns the previous contents of *ptr; the swap
 * succeeded iff the return value equals old_val. */
int compare_and_swap(volatile int *ptr, int old_val, int new_val) {
    int result;
    __asm__ volatile (
        "lock cmpxchg %2, %1"
        : "=a"(result), "+m"(*ptr)    // EAX holds old_val in, found-value out
        : "r"(new_val), "0"(old_val)
        : "memory", "cc"
    );
    return result;
}
// Atomic add and fetch
/* Atomically adds val to *ptr and returns the NEW value (add-and-fetch).
 *
 * LOCK XADD leaves the pre-add value of *ptr in the register operand, so
 * the new value is simply old + val, computed in C afterwards.  The
 * original contained a stray "add %0, %0" that doubled the old value
 * inside the asm, making the function return 2*old + val instead. */
int atomic_add_fetch(volatile int *ptr, int val) {
    int old;
    __asm__ volatile (
        "lock xadd %0, %1"
        : "=r"(old), "+m"(*ptr)   /* register: val in, previous *ptr out */
        : "0"(val)
        : "memory", "cc"
    );
    return old + val;
}
3. Memory Barriers
#include <stdint.h>
// Compiler barrier
/* Compiler-only barrier: forbids compile-time reordering of memory
 * accesses across this point; emits no instruction. */
#define barrier() __asm__ volatile("" ::: "memory")
/* Full hardware memory barrier (x86). */
#define mb() __asm__ volatile("mfence" ::: "memory")
/* Read (load-load) barrier. */
#define rmb() __asm__ volatile("lfence" ::: "memory")
/* Write (store-store) barrier. */
#define wmb() __asm__ volatile("sfence" ::: "memory")

/* Capacity of the ring; was a magic 256 repeated in struct and code. */
enum { RING_BUFFER_SLOTS = 256 };

/* Single-producer/single-consumer lock-free ring buffer.
 * head = next slot to write, tail = next slot to read.  One slot is
 * always left empty so "full" and "empty" remain distinguishable. */
typedef struct {
    volatile uint32_t head;
    volatile uint32_t tail;
    void *buffer[RING_BUFFER_SLOTS];
} RingBuffer;

/* Pushes item; returns 0 on success, -1 if the buffer is full.
 * Safe for exactly one producer thread (single consumer reads tail). */
int ring_buffer_push(RingBuffer *rb, void *item) {
    uint32_t head = rb->head;
    uint32_t next = (head + 1) % RING_BUFFER_SLOTS;
    if (next == rb->tail) {
        return -1; /* Full (one slot deliberately wasted) */
    }
    rb->buffer[head] = item;
    wmb(); /* Publish the payload before the consumer can observe new head */
    rb->head = next;
    return 0;
}
ARM Architecture Integration
#include <stdint.h>
// ARM Cortex-M specific operations
// Enter sleep mode
/* Puts the core to sleep until an interrupt arrives (ARM WFI).
 * volatile plus the "memory" clobber keep the instruction from being
 * removed or moved relative to surrounding memory accesses. */
void enter_sleep(void) {
    __asm__ volatile (
        "wfi"
        :
        :
        : "memory"
    );
}
// Set stack pointer
/* Loads a new stack pointer (ARM).  DANGER: once SP changes, any spilled
 * locals or saved registers of the current frame are unreachable -- only
 * safe from startup or context-switch code, ideally a naked function.
 * NOTE(review): modern GCC rejects "sp" in a clobber list on ARM --
 * confirm the target toolchain accepts this before relying on it. */
void set_stack_pointer(uint32_t sp) {
    __asm__ volatile (
        "mov sp, %0"
        :
        : "r"(sp)
        : "sp"
    );
}
// Enable interrupts
/* Globally enables IRQs on ARM Cortex-M (CPSIE I clears PRIMASK).
 * "memory" keeps the compiler from moving memory accesses into the
 * still-disabled region. */
void enable_interrupts(void) {
    __asm__ volatile (
        "cpsie i"
        :
        :
        : "memory"
    );
}
// Disable interrupts
/* Globally disables IRQs on ARM Cortex-M (CPSID I sets PRIMASK).
 * "memory" keeps the compiler from hoisting memory accesses out of the
 * critical section this opens. */
void disable_interrupts(void) {
    __asm__ volatile (
        "cpsid i"
        :
        :
        : "memory"
    );
}
// Read program counter
/* Returns (approximately) the current program counter (ARM).
 * NOTE(review): reading PC on ARM yields the address of the MOV plus a
 * pipeline offset (+4 in Thumb, +8 in ARM state), not the exact
 * instruction address -- confirm this is acceptable for the caller. */
uint32_t get_pc(void) {
    uint32_t pc;
    __asm__ volatile (
        "mov %0, pc"
        : "=r"(pc)
    );
    return pc;
}
// Read link register (return address)
/* Returns the link register, i.e. this function's return address --
 * provided the compiler has not already spilled or repurposed LR in its
 * prologue.  NOTE(review): only reliable at -O0 or inside a naked
 * function; __builtin_return_address(0) is the robust alternative. */
uint32_t get_lr(void) {
    uint32_t lr;
    __asm__ volatile (
        "mov %0, lr"
        : "=r"(lr)
    );
    return lr;
}
// ARM NEON SIMD example
/* c[i] = a[i] + b[i] for i in [0, n).
 *
 * On ARM targets with NEON (32-bit NEON syntax), 4-lane SIMD handles the
 * bulk of the work.  The original looped in steps of 4 regardless of n
 * and over-ran all three buffers whenever n was not a multiple of 4; the
 * SIMD loop now stops at the last full group and a scalar tail finishes
 * the remainder.  The "!" post-increment in the NEON loads/stores
 * advances the (local copies of the) pointers, which the tail reuses.
 * Non-NEON builds fall back to plain C. */
void vector_add(float *a, float *b, float *c, int n) {
#ifdef __ARM_NEON
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        __asm__ volatile (
            "vld1.32 {q0}, [%1]!\n\t"
            "vld1.32 {q1}, [%2]!\n\t"
            "vadd.f32 q0, q0, q1\n\t"
            "vst1.32 {q0}, [%0]!"
            : "+r"(c), "+r"(a), "+r"(b)
            :
            : "q0", "q1", "memory"
        );
    }
    /* Scalar tail: the pointers were advanced by the NEON loop above. */
    for (; i < n; i++) {
        *c++ = *a++ + *b++;
    }
#else
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
#endif
}
x86/x86_64 Architecture Integration
#include <stdint.h>
#include <cpuid.h>
// Read Time Stamp Counter (TSC)
/* Reads the CPU's 64-bit time-stamp counter (x86 RDTSC).
 * RDTSC delivers the low half in EAX and the high half in EDX; the two
 * halves are glued back together in C.  NOTE(review): RDTSC is not a
 * serializing instruction -- for precise measurements pair it with
 * LFENCE or use RDTSCP so earlier instructions cannot drift past it. */
uint64_t rdtsc(void) {
    uint32_t lo, hi;
    __asm__ volatile (
        "rdtsc"
        : "=a"(lo), "=d"(hi)
    );
    return ((uint64_t)hi << 32) | lo;
}
// CPUID instruction
/* Executes CPUID for the given leaf and returns EAX/EBX/ECX/EDX.
 *
 * CPUID also takes a sub-leaf in ECX for several leaves (4, 7, 0xB, ...).
 * The original left ECX uninitialized, so those leaves returned data for
 * whatever sub-leaf happened to be in the register; ECX is now seeded
 * with sub-leaf 0, which is also harmless for leaves that ignore it. */
void cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
           uint32_t *ecx, uint32_t *edx) {
    __asm__ volatile (
        "cpuid"
        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
        : "a"(leaf), "c"(0)    /* leaf in EAX, sub-leaf 0 in ECX */
    );
}
// SSE vector operations
/* c[i] = a[i] + b[i], using SSE 4-lane adds where available.
 *
 * Two bugs fixed from the original: (1) the pointer operands were never
 * advanced inside or between iterations, so every pass re-processed
 * elements 0..3 and the rest of the arrays were never written; (2) an n
 * that was not a multiple of 4 over-ran the buffers.  Each SIMD
 * iteration now addresses a+i / b+i / c+i directly and a scalar tail
 * loop finishes the remainder. */
void sse_add(float *a, float *b, float *c, int n) {
#ifdef __SSE__
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        __asm__ volatile (
            "movups (%1), %%xmm0\n\t"
            "movups (%2), %%xmm1\n\t"
            "addps %%xmm1, %%xmm0\n\t"
            "movups %%xmm0, (%0)"
            :
            : "r"(c + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
    for (; i < n; i++) {   /* scalar tail for n % 4 leftovers */
        c[i] = a[i] + b[i];
    }
#else
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
#endif
}
// AVX2 operations
/* c[i] = a[i] * b[i], using AVX2 8-lane multiplies where available.
 *
 * Same fixes as sse_add: the original never advanced the pointer
 * operands (each pass re-multiplied elements 0..7) and over-ran the
 * buffers when n was not a multiple of 8.  Each SIMD iteration now
 * addresses a+i / b+i / c+i and a scalar tail handles the remainder. */
void avx2_multiply(float *a, float *b, float *c, int n) {
#ifdef __AVX2__
    int i = 0;
    for (; i + 8 <= n; i += 8) {
        __asm__ volatile (
            "vmovups (%1), %%ymm0\n\t"
            "vmovups (%2), %%ymm1\n\t"
            "vmulps %%ymm1, %%ymm0, %%ymm0\n\t"
            "vmovups %%ymm0, (%0)"
            :
            : "r"(c + i), "r"(a + i), "r"(b + i)
            : "ymm0", "ymm1", "memory"
        );
    }
    for (; i < n; i++) {   /* scalar tail for n % 8 leftovers */
        c[i] = a[i] * b[i];
    }
#else
    for (int i = 0; i < n; i++) {
        c[i] = a[i] * b[i];
    }
#endif
}
Separate Assembly Modules
1. Assembly File (functions.s)
; x86_64 assembly functions section .text global add_ints global multiply_ints global dot_product global memcpy_asm ; int add_ints(int a, int b) add_ints: mov eax, edi ; First argument add eax, esi ; Second argument ret ; int multiply_ints(int a, int b) multiply_ints: mov eax, edi imul eax, esi ret ; int dot_product(int *a, int *b, int n) dot_product: push rbx xor eax, eax ; sum = 0 xor ecx, ecx ; i = 0 .loop: cmp ecx, edx ; i < n? jge .done mov ebx, [rdi + rcx*4] ; a[i] imul ebx, [rsi + rcx*4] ; a[i] * b[i] add eax, ebx ; sum += product inc ecx jmp .loop .done: pop rbx ret ; void memcpy_asm(void *dest, const void *src, size_t n) memcpy_asm: mov rcx, rdx ; Count rep movsb ; Copy bytes ret
2. C Header (functions.h)
#ifndef FUNCTIONS_H #define FUNCTIONS_H #include <stddef.h> // Assembly function declarations int add_ints(int a, int b); int multiply_ints(int a, int b); int dot_product(const int *a, const int *b, int n); void memcpy_asm(void *dest, const void *src, size_t n); #endif
3. C Main Program
#include <stdio.h>
#include "functions.h"
/* Exercises each of the external assembly routines in turn. */
int main() {
    int x = 10;
    int y = 20;

    printf("add: %d\n", add_ints(x, y));
    printf("multiply: %d\n", multiply_ints(x, y));

    int lhs[] = {1, 2, 3, 4, 5};
    int rhs[] = {5, 4, 3, 2, 1};
    printf("dot product: %d\n", dot_product(lhs, rhs, 5));

    char source[] = "Hello, Assembly!";
    char target[32];
    memcpy_asm(target, source, sizeof(source));
    printf("copied: %s\n", target);
    return 0;
}
4. Build Commands
# Assemble with NASM, then compile and link: nasm -f elf64 functions.s -o functions.o gcc -c main.c -o main.o gcc main.o functions.o -o program # Note: "gcc -o program main.c functions.s" also works, but only if functions.s is written in GAS/AT&T syntax -- gcc cannot assemble the NASM-syntax file shown above
Thumb Mode (ARM) Assembly
; ARM Thumb mode assembly for Cortex-M .syntax unified .thumb .text .global enable_irq .global disable_irq .global get_primask .global set_primask ; void enable_irq(void) enable_irq: cpsie i bx lr ; void disable_irq(void) disable_irq: cpsid i bx lr ; uint32_t get_primask(void) get_primask: mrs r0, primask bx lr ; void set_primask(uint32_t mask) set_primask: msr primask, r0 bx lr ; void hard_fault_handler(void) .thumb_func .global HardFault_Handler HardFault_Handler: ; Save registers mrs r0, psp mrs r1, msp mov r2, lr ; Call C handler bl hard_fault_c_handler ; Infinite loop b .
Interrupt Service Routines in Assembly
@ ARM Cortex-M timer interrupt handler.  ('@' starts a comment in GNU as
@ for ARM; the original used ';', which GNU as treats as a statement
@ separator rather than a comment.)
.section .text
.global TIM2_IRQHandler
.type TIM2_IRQHandler, %function
TIM2_IRQHandler:
@ Save caller-saved registers plus LR (the BLs below overwrite LR)
push {r0-r3, lr}
@ Clear the peripheral's interrupt flag (C function)
bl timer_clear_flag
@ Run the actual handler logic (C function)
bl timer_interrupt_handler
@ Restore context; LR again holds the EXC_RETURN value for the exception return
pop {r0-r3, lr}
bx lr
; x86 (32-bit protected mode) timer interrupt stub, NASM syntax.
; Saves all general and segment registers, switches the data segments to
; the kernel's, calls the C handler, then restores everything and IRETs.
section .text
global timer_interrupt
extern timer_handler
timer_interrupt:
pusha ; Save all general-purpose registers (PUSHA/POPA are 32-bit only)
push ds
push es
push fs
push gs
mov ax, 0x10 ; Selector 0x10 = kernel data segment in the GDT
mov ds, ax
mov es, ax
call timer_handler ; C handler
pop gs
pop fs
pop es
pop ds
popa
iret ; Return from interrupt (restores EIP/CS/EFLAGS)
Optimization Techniques
1. Loop Unrolling with Assembly
// C implementation
/* Reference scalar implementation: c[i] = a[i] + b[i] for i in [0, n). */
void vector_add_c(float *a, float *b, float *c, int n) {
    int idx = 0;
    while (idx < n) {
        c[idx] = a[idx] + b[idx];
        idx++;
    }
}
// Assembly-optimized with loop unrolling
/* SSE version of vector_add_c: processes 4 floats per iteration.
 *
 * Fixes over the original: the template referenced a non-existent
 * operand ("test %4, %4" with only %0..%3 declared -- a compile error);
 * "sub $4" never reached zero when n was not a positive multiple of 4,
 * looping forever and over-running the buffers; and the "cc" clobber was
 * missing even though SUB rewrites the flags.  The asm now runs once per
 * full 4-float group and a scalar tail (using the pointers the asm has
 * already advanced) finishes the remaining n % 4 elements. */
void vector_add_asm(float *a, float *b, float *c, int n) {
    int chunks = n / 4;    /* number of full 4-float groups */
    if (chunks > 0) {
        __asm__ volatile (
            "1:\n\t"
            "movups (%0), %%xmm0\n\t"
            "movups (%1), %%xmm1\n\t"
            "addps %%xmm1, %%xmm0\n\t"
            "movups %%xmm0, (%2)\n\t"
            "add $16, %0\n\t"
            "add $16, %1\n\t"
            "add $16, %2\n\t"
            "dec %3\n\t"
            "jnz 1b"
            : "+r"(a), "+r"(b), "+r"(c), "+r"(chunks)
            :
            : "xmm0", "xmm1", "cc", "memory"
        );
    }
    /* Scalar tail; a/b/c were advanced past the SIMD-processed prefix. */
    for (int i = 0; i < n % 4; i++) {
        c[i] = a[i] + b[i];
    }
}
2. Prefetching
// Assembly with prefetch instructions
/* Walks `n` consecutive 64-byte cache lines of `array`, prefetching each
 * line into L1 before touching it.  Note `n` counts cache lines, not
 * array elements; the caller must guarantee n*64 readable bytes.
 *
 * Fixes over the original: with DEC/JNZ and no guard, n <= 0 looped
 * roughly 2^32 times while reading far past the array -- a guard now
 * makes that a no-op.  "cc" is listed because DEC rewrites the flags. */
void prefetch_example(int *array, int n) {
    if (n <= 0)
        return;
    __asm__ volatile (
        "1:\n\t"
        "prefetcht0 (%0)\n\t"   // Prefetch the line into L1 cache
        "mov (%0), %%eax\n\t"   // Touch the data
        "add $64, %0\n\t"       // Advance to the next cache line
        "dec %1\n\t"
        "jnz 1b"
        : "+r"(array), "+r"(n)
        :
        : "eax", "cc", "memory"
    );
}
Advanced Techniques
1. Function Prologue/Epilogue Control
// Custom prologue/epilogue for interrupt handlers
/* Interrupt handler with a hand-written prologue/epilogue (ARM).
 * "naked" tells GCC to emit no prologue/epilogue of its own, so the asm
 * body is responsible for ALL register saving and for the return.
 * NOTE(review): only asm statements are defined inside a naked function;
 * anything else is undefined behavior per the GCC docs. */
__attribute__((naked)) void interrupt_handler(void) {
    __asm__ volatile (
        "push {r0-r3, lr}\n\t"    // Save context (BL below clobbers LR)
        "bl handle_interrupt\n\t" // C handler
        "pop {r0-r3, lr}\n\t"     // Restore context
        "bx lr"                   // Return via the restored LR
    );
}
2. Thunk Functions
// Thunk for calling C++ methods from assembly
/* A "thunk": an (object, method) pair bound together for later invocation. */
typedef struct {
    void *object;             /* argument handed to the method            */
    void (*method)(void*);    /* function to invoke with `object`         */
} Thunk;

/* Invokes t->method(t->object).
 *
 * The original hand-rolled this call in 32-bit x86 asm with hard-coded
 * 4-byte struct offsets (wrong on 64-bit targets) and an unescaped
 * "%eax" in the template, which is a compile error in GCC extended asm
 * ("%e" is parsed as an operand modifier).  An indirect call through a
 * function pointer is exactly what the compiler already generates
 * correctly for every target and ABI, so the asm is replaced with the
 * portable C call. */
void call_thunk(Thunk *t) {
    t->method(t->object);
}
3. Dynamic Code Generation
#include <sys/mman.h>
#include <string.h>
// Generate simple assembly code at runtime
/* Signature of the generated function: int f(int a, int b). */
typedef int (*add_func)(int, int);

/* JIT-emits a tiny add function and returns a pointer to it.
 *
 * Writes the x86_64 SysV machine code for "mov eax, edi; add eax, esi;
 * ret" into an anonymous mapping.  Fixes over the original: mmap failure
 * is now detected (the original wrote through MAP_FAILED), and the page
 * is flipped from read+write to read+execute once the code is in place
 * (W^X hygiene; some hardened kernels refuse simultaneous W+X anyway).
 * Returns NULL on any failure.  The caller owns the mapping and may
 * release it with munmap. */
add_func create_add_function(void) {
    unsigned char code[] = {
        0x89, 0xf8,  /* mov eax, edi */
        0x01, 0xf0,  /* add eax, esi */
        0xc3         /* ret          */
    };
    void *mem = mmap(NULL, sizeof(code),
                     PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED)
        return NULL;
    memcpy(mem, code, sizeof(code));
    /* Drop write permission before handing out an executable pointer. */
    if (mprotect(mem, sizeof(code), PROT_READ | PROT_EXEC) != 0) {
        munmap(mem, sizeof(code));
        return NULL;
    }
    return (add_func)mem;
}
Error Handling and Debugging
#include <stdio.h>
// Assembly error checking macro
/* Wraps an asm snippet in local labels and records its byte length.
 * NOTE(review): this emits a raw .long directly into .debug_info, which
 * will corrupt real DWARF data when compiling with -g -- treat it as a
 * debugging curiosity, not production code; a custom named section would
 * be safer. */
#define CHECK_ASM(expr) \
    do { \
        __asm__ volatile ( \
            "1:\n\t" \
            expr "\n\t" \
            "2:\n\t" \
            ".section .debug_info\n\t" \
            ".long 2b-1b\n\t" \
            ".previous" \
        ); \
    } while(0)
// Exception handling with assembly
/* Divides a by b, storing the quotient in *result.
 * Returns 0 on success, 1 if b is zero (in which case *result is left
 * unchanged).
 *
 * Fix over the original: IDIV divides the 64-bit pair EDX:EAX, so EDX
 * must hold the SIGN EXTENSION of EAX.  The original zeroed EDX with
 * XOR, which raises #DE (and crashes) for negative dividends whose
 * zero-extended quotient does not fit in 32 bits -- e.g. -10 / 2.
 * CLTD (AT&T name for CDQ) performs the correct sign extension. */
int divide_safe(int a, int b, int *result) {
    int error = 0;
    __asm__ volatile (
        "mov %2, %%eax\n\t"
        "mov %3, %%ecx\n\t"
        "test %%ecx, %%ecx\n\t"
        "jz 1f\n\t"            /* divisor == 0 -> error path */
        "cltd\n\t"             /* sign-extend EAX into EDX:EAX */
        "idiv %%ecx\n\t"
        "mov %%eax, %0\n\t"
        "xor %1, %1\n\t"       /* error = 0 */
        "jmp 2f\n\t"
        "1:\n\t"
        "mov $1, %1\n\t"       /* error = 1 */
        "2:"
        : "=m"(*result), "=r"(error)
        : "r"(a), "r"(b)
        : "eax", "ecx", "edx", "cc"
    );
    return error;
}
Compiler-Specific Extensions
// GCC extended asm with C expression operands
/* GCC statement-expression macro: adds b to a and folds the x86 carry
 * flag into a second result.
 *
 * NOTE(review): as written this computes sum = a + b (the incoming
 * `carry` is NOT added into the sum) and new_carry = carry + CF -- it is
 * not a full add-with-carry; confirm the intended semantics before use.
 * Also, the value of the expression is a compound literal of an
 * anonymous struct type declared inside the ({ }), which callers cannot
 * name -- they can only receive it via typeof/__auto_type. */
#define ADD_WITH_CARRY(a, b, carry) \
    ({ \
        unsigned int sum, new_carry; \
        __asm__ volatile ( \
            "add %2, %0\n\t" \
            "adc $0, %1" \
            : "=r"(sum), "=r"(new_carry) \
            : "r"(b), "0"(a), "1"(carry) \
            : "cc" \
        ); \
        (struct { unsigned int sum; unsigned int carry; }){sum, new_carry}; \
    })
// MSVC inline assembly
#ifdef _MSC_VER
/* MSVC's inline assembler: Intel syntax, direct use of C variable names,
 * no constraint strings.  NOTE: MSVC only supports __asm blocks for
 * 32-bit x86 -- on x64 use intrinsics or separate .asm files instead. */
void msvc_asm_example() {
    int a = 10, b = 20, result;
    /* result = a + b, computed through EAX */
    __asm {
        mov eax, a
        add eax, b
        mov result, eax
    }
    printf("Result: %d\n", result);
}
#endif
Best Practices
- Use volatile: Prevent compiler from optimizing away assembly
- Document register usage: Clarify which registers are clobbered
- Keep assembly minimal: Profile to verify performance gains
- Provide C fallbacks: For portability across architectures
- Use macros for portability: Abstract architecture-specific code
- Test thoroughly: Assembly bugs can be subtle and hard to debug
- Understand calling conventions: Match your C compiler's ABI
- Consider maintenance: Assembly is harder to maintain than C
- Use constraints correctly: Let compiler handle register allocation
- Test with optimization: Assembly may behave differently with -O2
Common Pitfalls
// 1. Missing clobber list
__asm__ ("add %0, %0" : "+r"(x) : : "cc"); // Need "cc" if condition codes affected
// 2. Assuming register values persist
__asm__ ("mov $5, %%eax" : : : "eax"); // eax value lost after asm
// 3. Incorrect constraint types
__asm__ ("add %1, %0" : "=r"(result) : "m"(x)); // Using memory where register needed
// 4. Not marking volatile when needed
__asm__ ("nop"); // Might be optimized away, use __asm__ volatile
// 5. Ignoring memory clobber
__asm__ ("mov %1, %0" : "=m"(*ptr) : "r"(value)); // Missing "memory" if other locations affected (also note AT&T order: source first, destination second)
Conclusion
Assembly integration in C is a powerful technique that combines the portability of C with the low-level control of assembly. Key takeaways:
- Inline assembly: Best for small, architecture-specific operations
- Separate modules: Better for larger assembly functions
- Understand constraints: Let compiler manage register allocation
- Provide fallbacks: Maintain C versions for portability
- Profile before optimizing: Ensure assembly is actually needed
When used judiciously, assembly can provide significant performance gains, access to processor features, and precise hardware control. However, it should be viewed as a tool of last resort, applied only when C cannot achieve the required performance or functionality. The combination of C's high-level abstraction and assembly's low-level control creates a powerful synergy for systems programming.