Bridging Worlds: A Complete Guide to Assembly Integration in C

C is often described as "portable assembly language," but there are times when you need to step down to actual assembly code. Whether for performance-critical sections, accessing processor-specific features, or implementing hardware-level operations, integrating assembly with C is an essential skill for systems programmers. This comprehensive guide covers everything from inline assembly to separate assembly modules.

Why Integrate Assembly with C?

  • Performance optimization: Hand-tuned critical loops
  • Hardware access: Special instructions not available in C
  • Interrupt handling: Low-level interrupt service routines
  • Boot code: Initialization before C runtime is ready
  • Processor features: SIMD, cryptography extensions
  • Real-time constraints: Precise timing control

Inline Assembly Basics

1. GCC Inline Assembly Syntax

#include <stdio.h>
// Basic inline assembly: computes result = a + b and prints it.
void basic_asm_example() {
    int a = 10, b = 20, result;
    // AT&T syntax: "add src, dst".  %0 starts out holding b (the "0"
    // constraint ties that input to output operand 0), then a (%1) is
    // added into it.
    __asm__ volatile (
        "add %1, %0"            // Assembly instruction
        : "=r"(result)          // Output operands
        : "r"(a), "0"(b)        // Input operands
        : "cc"                  // add modifies the condition codes, so
                                // they must be declared clobbered
    );
    printf("Result: %d\n", result);
}

2. GCC Extended ASM Format

// Extended inline assembly format
__asm__ volatile (
"assembly code"
: output_operands
: input_operands
: clobbered_registers
);
// Atomically swap *ptr with new_val and return the value previously
// stored there.  On x86, xchg with a memory operand carries an implicit
// LOCK prefix, so no explicit lock is needed.
int atomic_exchange(int *ptr, int new_val) {
    int previous;
    __asm__ volatile (
        "xchg %0, %1"
        : "=r"(previous), "+m"(*ptr)  // previous starts as new_val ("0")
        : "0"(new_val)
        : "memory"                    // full compiler barrier
    );
    return previous;
}

Register Constraints and Operands

#include <stdint.h>
// Common register constraints
// Illustrates the main constraint kinds in a single statement.
// Computes result = a + b + 5; the value is purely illustrative and is
// discarded (expect an unused-variable warning from the compiler).
void register_constraints_example() {
int a = 10;
int b = 20;
int result;
// Operand numbering: %0 = result, %1 = a (tied to %0), %2 = b, %3 = 5.
__asm__ volatile (
"add %2, %0\n\t"
"add %3, %0"
: "=r"(result)           // Output: any register
: "0"(a),                // Input: same as output 0
"r"(b),                // Input: any register
"i"(5)                 // Input: immediate constant
: "cc"                   // Clobbers condition codes
);
}
// Constraint types
// "r"  - Any register
// "m"  - Memory operand
// "i"  - Immediate integer
// "g"  - Any register, memory, or immediate
// "a"  - %eax/%rax
// "b"  - %ebx/%rbx
// "c"  - %ecx/%rcx
// "d"  - %edx/%rdx
// "S"  - %esi/%rsi
// "D"  - %edi/%rdi
// "q"  - Register with an addressable low byte (a/b/c/d in 32-bit mode; any general register on x86_64)

Common Inline Assembly Patterns

1. Bit Manipulation

#include <stdint.h>
// Rotate a 32-bit value left by `shift` bits using the x86 `rol`
// instruction.
uint32_t rotl32(uint32_t x, int shift) {
    // On x86 a variable rotate count must live in CL: the "c" constraint
    // pins `shift` to ecx/rcx, and %%cl names its low byte.  (A form like
    // "rol %b1, %0" would take the count from the operand tied to %0 and
    // does not assemble: reg,reg rotates do not exist.)
    __asm__ (
        "rol %%cl, %0"
        : "=r"(x)
        : "0"(x), "c"(shift)
        : "cc"
    );
    return x;
}
// Atomically set bit `bit` of *addr and report its previous state.
//
// Returns 0 if the bit was previously clear, nonzero (-1, not 1) if it
// was already set: `sbb %0, %0` computes 0 - CF across the whole
// register, and `bts` leaves the old bit value in CF.
//
// The `lock` prefix is required for the read-modify-write to be atomic
// on SMP systems — without it, `bts` alone is only atomic with respect
// to the current CPU.
int test_and_set_bit(volatile uint32_t *addr, int bit) {
    int old;
    __asm__ volatile (
        "lock bts %2, %1\n\t"
        "sbb %0, %0"
        : "=r"(old), "+m"(*addr)
        : "r"(bit)
        : "cc", "memory"
    );
    return old;
}
// Count leading zeros (CLZ)
// Returns the number of leading zero bits in x; 32 when x == 0.
//
// The inline-asm version used the ARM `clz` mnemonic, which does not
// assemble on x86 (the surrounding examples target x86).  GCC/Clang's
// __builtin_clz lowers to the native instruction on every target
// (ARM clz, x86 bsr/lzcnt), so it is both correct and portable here.
int count_leading_zeros(uint32_t x) {
    // __builtin_clz(0) is undefined behavior; ARM clz defines
    // clz(0) == 32, so preserve that contract explicitly.
    return x ? __builtin_clz(x) : 32;
}

2. Atomic Operations

#include <stdint.h>
// Atomically add one to *ptr using x86 `lock incl`; the "memory"
// clobber makes the statement a full compiler barrier as well.
void atomic_increment(volatile int *ptr) {
    __asm__ volatile ("lock incl %0" : "+m"(*ptr) : : "memory");
}
// Atomic compare-and-swap: if (*ptr == old_val) store new_val into *ptr.
// Returns the value actually observed in *ptr — equal to old_val exactly
// when the swap happened.  cmpxchg compares against eax ("0"(old_val))
// and leaves the observed value there ("=a"(observed)).
int compare_and_swap(volatile int *ptr, int old_val, int new_val) {
    int observed;
    __asm__ volatile (
        "lock cmpxchg %2, %1"
        : "=a"(observed), "+m"(*ptr)
        : "r"(new_val), "0"(old_val)
        : "memory", "cc"
    );
    return observed;
}
// Atomic add-and-fetch: *ptr += val, returning the NEW value (the same
// contract as GCC's __sync_add_and_fetch).
int atomic_add_fetch(volatile int *ptr, int val) {
    int old;
    // lock xadd atomically exchanges %0 with *ptr and stores their sum
    // in *ptr, leaving the PREVIOUS value of *ptr in %0.  (A follow-up
    // "add %0, %0" here would double the fetched value and corrupt the
    // return — the only arithmetic needed afterwards is old + val.)
    __asm__ volatile (
        "lock xadd %0, %1"
        : "=r"(old), "+m"(*ptr)
        : "0"(val)
        : "memory", "cc"
    );
    // new value = previous contents + addend
    return old + val;
}

3. Memory Barriers

#include <stdint.h>
// Compiler-only barrier: prevents the compiler from reordering memory
// accesses across this point; emits no machine instruction.
#define barrier() __asm__ volatile("" ::: "memory")
// Full hardware memory barrier (x86 mfence).
#define mb() __asm__ volatile("mfence" ::: "memory")
// Load-ordering barrier (x86 lfence).
#define rmb() __asm__ volatile("lfence" ::: "memory")
// Store-ordering barrier (x86 sfence).
#define wmb() __asm__ volatile("sfence" ::: "memory")
// Example: Lock-free queue
// Fixed-capacity ring buffer.
// NOTE(review): the push below is only safe for a single producer and a
// single consumer thread — head/tail updates are plain stores, not
// atomic RMW operations; confirm usage matches that model.
typedef struct {
volatile uint32_t head;
volatile uint32_t tail;
void *buffer[256];
} RingBuffer;
// Push `item`; returns 0 on success, -1 when the ring is full.
// One slot is deliberately left unused so that "full" (head+1 == tail)
// is distinguishable from "empty" (head == tail).
int ring_buffer_push(RingBuffer *rb, void *item) {
uint32_t head = rb->head;
uint32_t next = (head + 1) % 256;
if (next == rb->tail) {
return -1;  // Full
}
rb->buffer[head] = item;
wmb();  // Ensure write completes before updating head
rb->head = next;
return 0;
}

ARM Architecture Integration

#include <stdint.h>
// ARM Cortex-M specific operations
// Enter sleep mode: halt the core until the next interrupt (ARM WFI).
// The "memory" clobber stops the compiler caching values across the
// sleep, since an ISR may have modified memory by the time we wake.
void enter_sleep(void) {
__asm__ volatile (
"wfi"
:
:
: "memory"
);
}
// Set stack pointer (e.g. during early boot, before the C runtime owns
// the stack).
// NOTE(review): modern GCC rejects "sp" in a clobber list, and any
// spilled locals of the caller are unreachable once sp changes — this
// must only be called from code with no live stack state; verify against
// the target toolchain.
void set_stack_pointer(uint32_t sp) {
__asm__ volatile (
"mov sp, %0"
:
: "r"(sp)
: "sp"
);
}
// Enable interrupts: clears PRIMASK on Cortex-M via CPSIE I.
// "memory" orders the asm against surrounding memory accesses so device
// setup is not reordered past the unmask.
void enable_interrupts(void) {
__asm__ volatile (
"cpsie i"
:
:
: "memory"
);
}
// Disable interrupts: sets PRIMASK on Cortex-M via CPSID I.
// "memory" keeps critical-section accesses from leaking above the mask.
void disable_interrupts(void) {
__asm__ volatile (
"cpsid i"
:
:
: "memory"
);
}
// Read program counter.
// NOTE(review): on ARM, reading pc yields the instruction address plus a
// pipeline offset (+8 in ARM state, +4 in Thumb), and `mov rd, pc` is
// not encodable in all Thumb variants — confirm on the target ISA.
uint32_t get_pc(void) {
uint32_t pc;
__asm__ volatile (
"mov %0, pc"
: "=r"(pc)
);
return pc;
}
// Read link register (return address) — useful for crude backtraces.
// NOTE(review): lr is only guaranteed to hold the caller's return
// address at function entry; the compiler is free to reuse it later,
// so inlining or optimization may change what this observes.
uint32_t get_lr(void) {
uint32_t lr;
__asm__ volatile (
"mov %0, lr"
: "=r"(lr)
);
return lr;
}
// ARM NEON SIMD example
// Element-wise c[i] = a[i] + b[i] for i in [0, n).
// The NEON path processes four floats at a time and a scalar tail
// handles the leftover 1-3 elements, so n need not be a multiple of 4
// (the original vector loop would read and write past the arrays for
// such n).
void vector_add(float *a, float *b, float *c, int n) {
#ifdef __ARM_NEON
    int i = 0;
    // Post-increment addressing ("[%1]!") advances a/b/c by 16 bytes
    // per iteration, which the "+r" constraints make visible to C.
    for (; i + 4 <= n; i += 4) {
        __asm__ volatile (
            "vld1.32 {q0}, [%1]!\n\t"
            "vld1.32 {q1}, [%2]!\n\t"
            "vadd.f32 q0, q0, q1\n\t"
            "vst1.32 {q0}, [%0]!"
            : "+r"(c), "+r"(a), "+r"(b)
            :
            : "q0", "q1", "memory"
        );
    }
    // Scalar tail: the pointers already sit past the vectorized prefix.
    for (; i < n; i++) {
        *c++ = *a++ + *b++;
    }
#else
    // Portable fallback for non-NEON targets.
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
#endif
}

x86/x86_64 Architecture Integration

#include <stdint.h>
#include <cpuid.h>
// Read Time Stamp Counter (TSC): CPU cycle counter since reset.
// NOTE(review): rdtsc is not a serializing instruction — out-of-order
// execution can move it relative to the code being timed; use rdtscp or
// a preceding lfence when precise ordering matters.
uint64_t rdtsc(void) {
uint32_t lo, hi;
// rdtsc returns the 64-bit counter split across edx:eax, which the
// "=a"/"=d" constraints capture directly.
__asm__ volatile (
"rdtsc"
: "=a"(lo), "=d"(hi)
);
return ((uint64_t)hi << 32) | lo;
}
// CPUID instruction
// Execute CPUID for the given leaf and return eax/ebx/ecx/edx.
//
// CPUID also takes a sub-leaf in ecx for several leaves (4, 7, 0xB,
// 0xD, ...).  Leaving ecx uninitialized makes those queries return
// nondeterministic data, so it is pinned to sub-leaf 0 here; extend the
// interface if other sub-leaves are needed.
void cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
           uint32_t *ecx, uint32_t *edx) {
    __asm__ volatile (
        "cpuid"
        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
        : "a"(leaf), "c"(0u)
    );
}
// SSE vector operations
// Element-wise c[i] = a[i] + b[i] for i in [0, n).
//
// The SSE path indexes the current chunk explicitly (c + i, a + i,
// b + i): without advancing the addresses each iteration, the loop
// would reload and rewrite the same first four elements every time and
// never touch the rest.  A scalar tail covers n % 4 leftovers.
void sse_add(float *a, float *b, float *c, int n) {
#ifdef __SSE__
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        __asm__ volatile (
            "movups (%1), %%xmm0\n\t"   /* load a[i..i+3] (unaligned ok) */
            "movups (%2), %%xmm1\n\t"   /* load b[i..i+3] */
            "addps %%xmm1, %%xmm0\n\t"
            "movups %%xmm0, (%0)"       /* store into c[i..i+3] */
            :
            : "r"(c + i), "r"(a + i), "r"(b + i)
            : "xmm0", "xmm1", "memory"
        );
    }
    // Scalar tail for the remaining 0-3 elements.
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
#else
    // Portable fallback for non-SSE targets.
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
#endif
}
// AVX2 operations
// Element-wise c[i] = a[i] * b[i] for i in [0, n).
//
// As with the SSE example, the addresses must advance with the loop
// index (c + i, a + i, b + i) — a fixed address would multiply the same
// first eight elements repeatedly.  A scalar tail covers n % 8
// leftovers so n need not be a multiple of 8.
void avx2_multiply(float *a, float *b, float *c, int n) {
#ifdef __AVX2__
    int i = 0;
    for (; i + 8 <= n; i += 8) {
        __asm__ volatile (
            "vmovups (%1), %%ymm0\n\t"          /* a[i..i+7] */
            "vmovups (%2), %%ymm1\n\t"          /* b[i..i+7] */
            "vmulps %%ymm1, %%ymm0, %%ymm0\n\t"
            "vmovups %%ymm0, (%0)"              /* c[i..i+7] */
            :
            : "r"(c + i), "r"(a + i), "r"(b + i)
            : "ymm0", "ymm1", "memory"
        );
    }
    // Scalar tail for the remaining 0-7 elements.
    for (; i < n; i++) {
        c[i] = a[i] * b[i];
    }
#else
    // Portable fallback when AVX2 is unavailable.
    for (int i = 0; i < n; i++) {
        c[i] = a[i] * b[i];
    }
#endif
}

Separate Assembly Modules

1. Assembly File (functions.s)

; x86_64 assembly functions (NASM syntax, System V AMD64 ABI:
; integer args in rdi, rsi, rdx, rcx, r8, r9; return value in rax)
section .text
global add_ints
global multiply_ints
global dot_product
global memcpy_asm
; int add_ints(int a, int b)
add_ints:
mov eax, edi    ; First argument
add eax, esi    ; Second argument
ret
; int multiply_ints(int a, int b)
multiply_ints:
mov eax, edi    ; First argument
imul eax, esi   ; Signed multiply by second argument
ret
; int dot_product(int *a, int *b, int n)
; rdi = a, rsi = b, edx = n; sum accumulated in eax
dot_product:
push rbx            ; rbx is callee-saved, so preserve it
xor eax, eax        ; sum = 0
xor ecx, ecx        ; i = 0 (also zeroes rcx for the scaled index below)
.loop:
cmp ecx, edx        ; i < n?
jge .done
mov ebx, [rdi + rcx*4]  ; a[i]
imul ebx, [rsi + rcx*4] ; a[i] * b[i]
add eax, ebx        ; sum += product
inc ecx
jmp .loop
.done:
pop rbx
ret
; void memcpy_asm(void *dest, const void *src, size_t n)
; rdi/rsi already hold dest/src exactly as movsb expects, and the
; direction flag is clear on function entry per the SysV ABI.
; Regions must not overlap (forward copy only).
memcpy_asm:
mov rcx, rdx        ; Count
rep movsb           ; Copy bytes
ret

2. C Header (functions.h)

#ifndef FUNCTIONS_H
#define FUNCTIONS_H
#include <stddef.h>
// Declarations for the routines implemented in assembly (functions.s).
// Returns a + b.
int add_ints(int a, int b);
// Returns a * b (truncated to int).
int multiply_ints(int a, int b);
// Returns the sum of a[i] * b[i] for i in [0, n).
int dot_product(const int *a, const int *b, int n);
// Copies n bytes from src to dest.
// NOTE(review): forward byte copy (rep movsb) — no overlap handling;
// use only for disjoint buffers, unlike memmove.
void memcpy_asm(void *dest, const void *src, size_t n);
#endif

3. C Main Program

#include <stdio.h>
#include "functions.h"
// Exercise each assembly routine from functions.s in turn.
int main() {
    int a = 10, b = 20;

    // Scalar helpers.
    printf("add: %d\n", add_ints(a, b));
    printf("multiply: %d\n", multiply_ints(a, b));

    // Dot product of two five-element vectors.
    int arr1[] = {1, 2, 3, 4, 5};
    int arr2[] = {5, 4, 3, 2, 1};
    int result = dot_product(arr1, arr2, 5);
    printf("dot product: %d\n", result);

    // Copy includes the trailing NUL, so dest prints as a C string.
    char src[] = "Hello, Assembly!";
    char dest[32];
    memcpy_asm(dest, src, sizeof(src));
    printf("copied: %s\n", dest);
    return 0;
}

4. Build Commands

# Assemble and compile
nasm -f elf64 functions.s -o functions.o
gcc -c main.c -o main.o
gcc main.o functions.o -o program
# Or in a single gcc invocation — note this only works if functions.s is
# written in GAS/AT&T syntax, because gcc hands .s files to GNU as; the
# NASM-syntax listing above must be assembled with nasm as shown.
gcc -o program main.c functions.s

Thumb Mode (ARM) Assembly

@ ARM Thumb mode assembly for Cortex-M (GNU as).
@ Note: in ARM GNU as, '@' starts a comment — ';' is a statement
@ separator, so ';'-style comment lines would be assembled as code.
.syntax unified
.thumb
.text
.global enable_irq
.global disable_irq
.global get_primask
.global set_primask
@ void enable_irq(void)
enable_irq:
cpsie i
bx lr
@ void disable_irq(void)
disable_irq:
cpsid i
bx lr
@ uint32_t get_primask(void)
get_primask:
mrs r0, primask
bx lr
@ void set_primask(uint32_t mask)
set_primask:
msr primask, r0
bx lr
@ void HardFault_Handler(void)
.thumb_func
.global HardFault_Handler
HardFault_Handler:
@ Capture both stack pointers and the EXC_RETURN value for the C handler
mrs r0, psp
mrs r1, msp
mov r2, lr
@ Call C handler
bl hard_fault_c_handler
@ Hang if the handler ever returns
b .

Interrupt Service Routines in Assembly

@ ARM Cortex-M interrupt handler (GNU as; '@' is the ARM comment char)
.section .text
.global TIM2_IRQHandler
.type TIM2_IRQHandler, %function
TIM2_IRQHandler:
@ Save the caller-saved registers plus lr, which the bl calls below clobber
push {r0-r3, lr}
@ Clear interrupt flag (C function)
bl timer_clear_flag
@ Handle interrupt
bl timer_interrupt_handler
@ Restore context and return
pop {r0-r3, lr}
bx lr
; x86 interrupt handler (NASM syntax; 32-bit protected mode —
; pusha/popa do not exist on x86_64)
section .text
global timer_interrupt
extern timer_handler
timer_interrupt:
pusha               ; Save all general registers
push ds
push es
push fs
push gs
mov ax, 0x10        ; Kernel data segment
mov ds, ax
mov es, ax
call timer_handler  ; C handler
pop gs
pop fs
pop es
pop ds
popa
iret                ; Return from interrupt

Optimization Techniques

1. Loop Unrolling with Assembly

// Reference C implementation: element-wise c[i] = a[i] + b[i].
void vector_add_c(float *a, float *b, float *c, int n) {
    int idx = 0;
    while (idx < n) {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}
// Assembly-optimized vector add: c[i] = a[i] + b[i] for i in [0, n).
//
// The SSE loop consumes whole chunks of four floats; the loop counter
// is the chunk count, so only operands %0-%3 exist (a count expression
// like "%4" would reference a nonexistent operand and fail to compile).
// A scalar tail handles n % 4 leftovers, so n need not be a multiple
// of 4.
void vector_add_asm(float *a, float *b, float *c, int n) {
    int chunks = n / 4;
    if (chunks > 0) {
        __asm__ volatile (
            "1:\n\t"
            "movups (%0), %%xmm0\n\t"
            "movups (%1), %%xmm1\n\t"
            "addps %%xmm1, %%xmm0\n\t"
            "movups %%xmm0, (%2)\n\t"
            "add $16, %0\n\t"          /* advance all three pointers */
            "add $16, %1\n\t"
            "add $16, %2\n\t"
            "dec %3\n\t"
            "jnz 1b"
            : "+r"(a), "+r"(b), "+r"(c), "+r"(chunks)
            :
            : "xmm0", "xmm1", "memory", "cc"
        );
    }
    // Scalar tail: a/b/c were advanced past the vectorized prefix by
    // the asm above, so index from 0 here.
    for (int i = 0; i < n % 4; i++) {
        c[i] = a[i] + b[i];
    }
}

2. Prefetching

// Assembly with prefetch instructions: walk memory one 64-byte cache
// line at a time, issuing a prefetch for each line before loading it.
// NOTE(review): the pointer advances 64 bytes per iteration while `n`
// decrements once per iteration, so n counts CACHE LINES, not ints, and
// the walk reads n*64 bytes — confirm callers size the array that way.
void prefetch_example(int *array, int n) {
__asm__ volatile (
"1:\n\t"
"prefetcht0 (%0)\n\t"    // Prefetch into L1 cache
"mov (%0), %%eax\n\t"     // Load data (value discarded — the point is the access)
"add $64, %0\n\t"         // Next cache line
"dec %1\n\t"
"jnz 1b"
: "+r"(array), "+r"(n)
:
: "eax", "memory"
);
}

Advanced Techniques

1. Function Prologue/Epilogue Control

// Custom prologue/epilogue for interrupt handlers (ARM target).
// `naked` tells GCC to emit no prologue/epilogue of its own, so the asm
// body below IS the entire function — nothing else (not even local
// variables) may appear in a naked function.
__attribute__((naked)) void interrupt_handler(void) {
__asm__ volatile (
"push {r0-r3, lr}\n\t"   // Save context
"bl handle_interrupt\n\t" // C handler
"pop {r0-r3, lr}\n\t"    // Restore context
"bx lr"
);
}

2. Thunk Functions

// Thunk: a callable bound to the single argument it should receive.
typedef struct {
    void *object;
    void (*method)(void*);
} Thunk;
// Invoke the thunk: call t->method with t->object as its argument.
//
// The previous inline-asm version hard-coded 32-bit struct offsets and
// register names (and `4(%eax)` does not even compile in extended asm,
// where a literal register requires `%%`).  A plain indirect call is
// exactly what that asm was trying to express, and the compiler emits
// the correct ABI-conformant sequence for every target.
void call_thunk(Thunk *t) {
    t->method(t->object);
}

3. Dynamic Code Generation

#include <sys/mman.h>
#include <string.h>
// Generate simple assembly code at runtime.
// Signature of the generated function.
typedef int (*add_func)(int, int);
// JIT a tiny x86_64 function equivalent to `return a + b;`.
// Returns NULL if the mapping cannot be created.
add_func create_add_function(void) {
    // x86_64 machine code for "mov eax, edi; add eax, esi; ret"
    unsigned char code[] = {
        0x89, 0xf8,        // mov eax, edi
        0x01, 0xf0,        // add eax, esi
        0xc3               // ret
    };
    // Map writable first — never writable+executable at once (W^X);
    // many hardened kernels reject PROT_WRITE|PROT_EXEC outright.
    void *mem = mmap(NULL, sizeof(code),
                     PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED) {
        return NULL;  // mmap failure must not be dereferenced
    }
    memcpy(mem, code, sizeof(code));
    // Flip to executable only after the code is in place.
    if (mprotect(mem, sizeof(code), PROT_READ | PROT_EXEC) != 0) {
        munmap(mem, sizeof(code));
        return NULL;
    }
    return (add_func)mem;
}

Error Handling and Debugging

#include <stdio.h>
// Wrap a single asm statement between local labels and record its
// encoded byte length (2b-1b) as a .long in the .debug_info section.
// NOTE(review): appending raw words to .debug_info will corrupt the
// DWARF data that debuggers parse — a dedicated custom section (e.g.
// ".asm_sizes") would be safer; confirm intent before using this.
#define CHECK_ASM(expr) \
do { \
__asm__ volatile ( \
"1:\n\t" \
expr "\n\t" \
"2:\n\t" \
".section .debug_info\n\t" \
".long 2b-1b\n\t" \
".previous" \
); \
} while(0)
#include <limits.h>  /* INT_MIN for the overflow guard below */
// Exception handling with assembly
// Divide a by b, storing the quotient in *result.
// Returns 0 on success, nonzero on error.
//
// Two inputs make x86 `idiv` fault with #DE (SIGFPE): division by zero
// and INT_MIN / -1 (the quotient does not fit in 32 bits).  A function
// named "safe" must reject both, so they are checked in C before the
// instruction is ever reached.
int divide_safe(int a, int b, int *result) {
    if (b == 0 || (a == INT_MIN && b == -1)) {
        return 1;
    }
    int quotient;
    __asm__ (
        "cltd\n\t"        /* sign-extend eax into edx:eax */
        "idivl %2"        /* edx:eax / b -> quotient in eax, rem in edx */
        : "=a"(quotient)
        : "0"(a), "r"(b)  /* "0" ties a to eax; b gets any other register */
        : "edx", "cc"     /* cltd/idiv write edx; flags are clobbered */
    );
    *result = quotient;
    return 0;
}

Compiler-Specific Extensions

// GCC extended asm with C expression operands.
// Statement-expression macro yielding an anonymous struct
// { sum, carry } (GNU C extension).
// As written: sum = a + b, and new_carry = carry + CF (the carry-out of
// the addition folded into the incoming carry).
// NOTE(review): the incoming `carry` is NOT added into `sum` — if true
// carry-chain addition (sum = a + b + carry_in) is intended, the first
// instruction must consume the carry as well; confirm intended semantics.
#define ADD_WITH_CARRY(a, b, carry) \
({ \
unsigned int sum, new_carry; \
__asm__ volatile ( \
"add %2, %0\n\t" \
"adc $0, %1" \
: "=r"(sum), "=r"(new_carry) \
: "r"(b), "0"(a), "1"(carry) \
: "cc" \
); \
(struct { unsigned int sum; unsigned int carry; }){sum, new_carry}; \
})
// MSVC inline assembly: the __asm block syntax references C variables
// by name directly, with no constraint syntax.
// NOTE(review): supported by 32-bit MSVC only — the 64-bit MSVC
// compiler rejects __asm blocks entirely (use intrinsics or separate
// .asm files there).
#ifdef _MSC_VER
void msvc_asm_example() {
int a = 10, b = 20, result;
__asm {
mov eax, a
add eax, b
mov result, eax
}
printf("Result: %d\n", result);
}
#endif

Best Practices

  1. Use volatile: Prevent compiler from optimizing away assembly
  2. Document register usage: Clarify which registers are clobbered
  3. Keep assembly minimal: Profile to verify performance gains
  4. Provide C fallbacks: For portability across architectures
  5. Use macros for portability: Abstract architecture-specific code
  6. Test thoroughly: Assembly bugs can be subtle and hard to debug
  7. Understand calling conventions: Match your C compiler's ABI
  8. Consider maintenance: Assembly is harder to maintain than C
  9. Use constraints correctly: Let compiler handle register allocation
  10. Test with optimization: Assembly may behave differently with -O2

Common Pitfalls

// 1. Missing clobber list
__asm__ ("add %0, %0" : "+r"(x) : : "cc");  // Need "cc" if condition codes affected
// 2. Assuming register values persist
__asm__ ("mov $5, %%eax" : : : "eax");  // eax value lost after asm
// 3. Incorrect constraint types
__asm__ ("add %1, %0" : "=r"(result) : "m"(x));  // Using memory where register needed
// 4. Not marking volatile when needed
__asm__ ("nop");  // Might be optimized away, use __asm__ volatile
// 5. Ignoring memory clobber
// (AT&T order is "mov src, dst": the input %1 is the source, the "=m"
// output %0 the destination)
__asm__ ("mov %1, %0" : "=m"(*ptr) : "r"(value));  // Missing "memory" if others affected

Conclusion

Assembly integration in C is a powerful technique that combines the portability of C with the low-level control of assembly. Key takeaways:

  • Inline assembly: Best for small, architecture-specific operations
  • Separate modules: Better for larger assembly functions
  • Understand constraints: Let compiler manage register allocation
  • Provide fallbacks: Maintain C versions for portability
  • Profile before optimizing: Ensure assembly is actually needed

When used judiciously, assembly can provide significant performance gains, access to processor features, and precise hardware control. However, it should be viewed as a tool of last resort, applied only when C cannot achieve the required performance or functionality. The combination of C's high-level abstraction and assembly's low-level control creates a powerful synergy for systems programming.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper