Async Profiler Usage in Java

Overview

Async Profiler is a low-overhead sampling profiler for Java that collects performance data with minimal impact on running applications. It's particularly useful for production profiling.

Installation and Setup

1. Download and Installation

# Download the latest version
wget https://github.com/jvm-profiling-tools/async-profiler/releases/download/v2.9/async-profiler-2.9-linux-x64.tar.gz
tar -xzf async-profiler-2.9-linux-x64.tar.gz
cd async-profiler-2.9-linux-x64
# Verify installation
./profiler.sh --version

2. Java Agent Setup

// Add to JVM arguments for continuous profiling
-javaagent:/path/to/async-profiler/build/libasyncProfiler.so=start,event=cpu,file=profile.html
// Or use at runtime without agent
public class ProfilingSetup {
public static void main(String[] args) {
// Your application code
}
}

Basic Usage Examples

1. CPU Profiling

public class CPUIntensiveApplication {
public static void main(String[] args) {
System.out.println("Starting CPU intensive application...");
// Simulate different types of CPU work
Thread computationThread = new Thread(() -> {
while (true) {
heavyComputation();
mediumComputation();
lightComputation();
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
});
computationThread.start();
// Keep application running
try {
Thread.sleep(300000); // 5 minutes
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
computationThread.interrupt();
}
// Heavy computation method
public static void heavyComputation() {
long result = 0;
for (int i = 0; i < 10_000_000; i++) {
result += i * i;
result = result % 1000000;
}
System.out.println("Heavy computation result: " + result);
}
// Medium computation method
public static void mediumComputation() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 100_000; i++) {
sb.append(i).append(",");
}
String result = sb.toString();
System.out.println("Medium computation length: " + result.length());
}
// Light computation method
public static void lightComputation() {
int[] array = new int[1000];
for (int i = 0; i < array.length; i++) {
array[i] = i * 2;
}
System.out.println("Light computation completed");
}
}

Profiling commands:

# Find Java process ID
jps
# Profile CPU for 60 seconds
./profiler.sh -d 60 -f cpu_profile.html <pid>
# Profile with specific settings
./profiler.sh -e cpu -d 30 -i 10ms -o html -f profile_cpu.html <pid>
# Continuous profiling with output every 10 seconds
./profiler.sh start -e cpu -i 10ms -o html -f /tmp/profile_%t.html <pid>

2. Memory Allocation Profiling

public class MemoryIntensiveApplication {
private static final List<byte[]> memoryChunks = new ArrayList<>();
public static void main(String[] args) {
System.out.println("Starting memory intensive application...");
Thread allocationThread = new Thread(() -> {
Random random = new Random();
while (!Thread.currentThread().isInterrupted()) {
// Allocate different sized arrays
allocateSmallObjects();
allocateMediumObjects();
allocateLargeObjects();
// Occasionally clear some memory
if (random.nextInt(10) == 0) {
clearSomeMemory();
}
try {
Thread.sleep(500);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
});
allocationThread.start();
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
System.out.println("Cleaning up memory...");
memoryChunks.clear();
}));
try {
Thread.sleep(300000); // 5 minutes
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
allocationThread.interrupt();
}
private static void allocateSmallObjects() {
for (int i = 0; i < 100; i++) {
byte[] smallArray = new byte[1024]; // 1KB
memoryChunks.add(smallArray);
}
}
private static void allocateMediumObjects() {
for (int i = 0; i < 10; i++) {
byte[] mediumArray = new byte[1024 * 1024]; // 1MB
memoryChunks.add(mediumArray);
}
}
private static void allocateLargeObjects() {
byte[] largeArray = new byte[10 * 1024 * 1024]; // 10MB
memoryChunks.add(largeArray);
}
private static void clearSomeMemory() {
if (!memoryChunks.isEmpty()) {
int toRemove = Math.min(10, memoryChunks.size());
for (int i = 0; i < toRemove; i++) {
memoryChunks.remove(0);
}
System.gc(); // Suggest GC, but don't rely on it
}
}
}

Memory profiling commands:

# Profile allocations
./profiler.sh -e alloc -d 60 -f alloc_profile.html <pid>
# Profile allocations with size filter
./profiler.sh -e alloc -d 60 --alloc 512 -f alloc_large.html <pid>
# Profile live objects only (after GC)
./profiler.sh -e live -d 60 -f live_objects.html <pid>

3. Advanced Profiling Scenarios

import java.util.concurrent.*;
public class ComplexApplication {
private final ExecutorService executor = Executors.newFixedThreadPool(10);
private final BlockingQueue<Task> taskQueue = new LinkedBlockingQueue<>();
private volatile boolean running = true;
public static void main(String[] args) {
ComplexApplication app = new ComplexApplication();
app.start();
// Add shutdown hook for clean profiling
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
System.out.println("Shutting down...");
app.stop();
}));
// Generate workload
app.generateWorkload();
}
public void start() {
// Start worker threads
for (int i = 0; i < 5; i++) {
executor.submit(this::processTasks);
}
// Start monitoring thread
executor.submit(this::monitorPerformance);
}
public void stop() {
running = false;
executor.shutdown();
try {
if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
executor.shutdownNow();
}
} catch (InterruptedException e) {
executor.shutdownNow();
Thread.currentThread().interrupt();
}
}
private void processTasks() {
while (running && !Thread.currentThread().isInterrupted()) {
try {
Task task = taskQueue.poll(100, TimeUnit.MILLISECONDS);
if (task != null) {
processTask(task);
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
}
private void processTask(Task task) {
switch (task.type) {
case CPU_INTENSIVE:
performCPUIntensiveWork(task);
break;
case MEMORY_INTENSIVE:
performMemoryIntensiveWork(task);
break;
case IO_SIMULATION:
performIOSimulation(task);
break;
case MIXED:
performMixedWork(task);
break;
}
}
private void performCPUIntensiveWork(Task task) {
long startTime = System.nanoTime();
double result = 0;
// Simulate CPU-intensive computation
for (int i = 0; i < task.iterations; i++) {
result += Math.sin(i) * Math.cos(i);
result = result % 1000;
}
long duration = System.nanoTime() - startTime;
System.out.printf("CPU task completed in %.2f ms\n", duration / 1_000_000.0);
}
private void performMemoryIntensiveWork(Task task) {
List<byte[]> chunks = new ArrayList<>();
// Allocate memory
for (int i = 0; i < task.memoryChunks; i++) {
byte[] chunk = new byte[task.chunkSize];
// Use the memory to prevent optimization
for (int j = 0; j < chunk.length; j++) {
chunk[j] = (byte) (j % 256);
}
chunks.add(chunk);
}
// Process the data
long total = 0;
for (byte[] chunk : chunks) {
for (byte b : chunk) {
total += b & 0xFF;
}
}
System.out.println("Memory task completed, total: " + total);
}
private void performIOSimulation(Task task) {
try {
// Simulate IO wait
Thread.sleep(task.ioDelay);
// Simulate some processing after IO
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 1000; i++) {
sb.append("IO_").append(i).append(" ");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
private void performMixedWork(Task task) {
// Combination of CPU, memory, and simulated IO
performCPUIntensiveWork(task);
performMemoryIntensiveWork(task);
try {
Thread.sleep(task.ioDelay / 2);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
performCPUIntensiveWork(task);
}
private void monitorPerformance() {
while (running) {
try {
Thread.sleep(5000);
printStatistics();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
}
private void printStatistics() {
System.out.printf("Queue size: %d, Active threads: %d\n",
taskQueue.size(), 
((ThreadPoolExecutor) executor).getActiveCount());
}
private void generateWorkload() {
Random random = new Random();
int taskId = 0;
while (running) {
TaskType type = TaskType.values()[random.nextInt(TaskType.values().length)];
Task task = new Task(taskId++, type);
// Set parameters based on task type
switch (type) {
case CPU_INTENSIVE:
task.iterations = 1_000_000 + random.nextInt(5_000_000);
break;
case MEMORY_INTENSIVE:
task.memoryChunks = 10 + random.nextInt(100);
task.chunkSize = 1024 * (1 + random.nextInt(1024));
break;
case IO_SIMULATION:
task.ioDelay = 10 + random.nextInt(100);
break;
case MIXED:
task.iterations = 500_000 + random.nextInt(1_000_000);
task.memoryChunks = 5 + random.nextInt(20);
task.chunkSize = 1024 * (1 + random.nextInt(512));
task.ioDelay = 5 + random.nextInt(50);
break;
}
taskQueue.offer(task);
try {
Thread.sleep(10 + random.nextInt(100));
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
}
static class Task {
final int id;
final TaskType type;
int iterations;
int memoryChunks;
int chunkSize;
int ioDelay;
Task(int id, TaskType type) {
this.id = id;
this.type = type;
}
}
enum TaskType {
CPU_INTENSIVE, MEMORY_INTENSIVE, IO_SIMULATION, MIXED
}
}

Advanced Profiling Techniques

1. Wall-clock vs CPU Time Profiling

# Wall-clock profiling (includes blocking time)
./profiler.sh -e wall -d 60 -f wall_clock.html <pid>
# CPU time profiling (only active CPU time)
./profiler.sh -e cpu -d 60 -f cpu_time.html <pid>
# Compare both
./profiler.sh -e cpu -d 30 -f cpu.html <pid>
./profiler.sh -e wall -d 30 -f wall.html <pid>

2. Java Method Profiling

public class MethodLevelProfiling {
public void processOrder(Order order) {
validateOrder(order);
calculateTotal(order);
applyDiscounts(order);
processPayment(order);
updateInventory(order);
sendConfirmation(order);
}
private void validateOrder(Order order) {
// Simulate validation logic
try {
Thread.sleep(5);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
if (order.items.isEmpty()) {
throw new IllegalArgumentException("Empty order");
}
}
private void calculateTotal(Order order) {
order.total = order.items.stream()
.mapToDouble(item -> item.price * item.quantity)
.sum();
// Simulate complex calculation
performComplexPricing(order);
}
private void performComplexPricing(Order order) {
// Simulate complex pricing logic
double base = order.total;
for (int i = 0; i < 1000; i++) {
base = Math.log(base + 1) * 100;
}
order.total = base;
}
private void applyDiscounts(Order order) {
// Simulate discount calculation
try {
Thread.sleep(10);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
if (order.total > 100) {
order.total *= 0.9; // 10% discount
}
}
private void processPayment(Order order) {
// Simulate payment processing
try {
Thread.sleep(50); // Payment API call
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
private void updateInventory(Order order) {
// Simulate inventory update
try {
Thread.sleep(20); // Database update
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
private void sendConfirmation(Order order) {
// Simulate email sending
try {
Thread.sleep(30); // Email service call
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
static class Order {
List<OrderItem> items = new ArrayList<>();
double total;
}
static class OrderItem {
String productId;
double price;
int quantity;
}
}

Method-specific profiling:

# Profile specific methods only
./profiler.sh --include "com.example.MethodLevelProfiling.*" -d 60 -f method_profile.html <pid>
# Exclude system methods
./profiler.sh --exclude "java.*" --exclude "sun.*" -d 60 -f app_only.html <pid>

3. Flame Graph Generation

# Generate flame graph for CPU
./profiler.sh -d 60 -o flamegraph -f flamegraph.svg <pid>
# Generate flame graph for allocations
./profiler.sh -e alloc -d 60 -o flamegraph -f alloc_flamegraph.svg <pid>
# Generate folded output for custom processing
./profiler.sh -d 60 -o folded -f profile.folded <pid>

4. Continuous Production Profiling

public class ProductionProfilingSetup {
public static void setupContinuousProfiling() {
// Schedule periodic profiling
ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
scheduler.scheduleAtFixedRate(() -> {
try {
triggerProfilingSession();
} catch (Exception e) {
System.err.println("Profiling failed: " + e.getMessage());
}
}, 5, 300, TimeUnit.SECONDS); // Start after 5 sec, repeat every 5 min
Runtime.getRuntime().addShutdownHook(new Thread(scheduler::shutdown));
}
private static void triggerProfilingSession() {
String pid = getCurrentPid();
String timestamp = String.valueOf(System.currentTimeMillis());
try {
// Profile CPU for 30 seconds
String[] cpuCommand = {
"/path/to/async-profiler/profiler.sh",
"-d", "30",
"-o", "html",
"-f", "/tmp/profiles/cpu_profile_" + timestamp + ".html",
pid
};
Process process = Runtime.getRuntime().exec(cpuCommand);
int exitCode = process.waitFor();
if (exitCode == 0) {
System.out.println("CPU profile saved: cpu_profile_" + timestamp + ".html");
}
// Profile allocations every hour
if (shouldProfileAllocations()) {
String[] allocCommand = {
"/path/to/async-profiler/profiler.sh",
"-e", "alloc",
"-d", "30",
"-o", "html",
"-f", "/tmp/profiles/alloc_profile_" + timestamp + ".html",
pid
};
process = Runtime.getRuntime().exec(allocCommand);
exitCode = process.waitFor();
if (exitCode == 0) {
System.out.println("Allocation profile saved: alloc_profile_" + timestamp + ".html");
}
}
} catch (IOException | InterruptedException e) {
Thread.currentThread().interrupt();
System.err.println("Profiling execution failed: " + e.getMessage());
}
}
private static String getCurrentPid() {
return ManagementFactory.getRuntimeMXBean().getName().split("@")[0];
}
private static boolean shouldProfileAllocations() {
// Profile allocations during low traffic periods
Calendar cal = Calendar.getInstance();
int hour = cal.get(Calendar.HOUR_OF_DAY);
return hour >= 1 && hour <= 4; // 1 AM to 4 AM
}
}

Analysis and Interpretation

1. Understanding Profiler Output

public class ProfilingAnalysisExample {
/**
* Example methods that demonstrate different performance characteristics
*/
// CPU-bound method
public void cpuBoundMethod() {
double result = 0;
for (int i = 0; i < 10_000_000; i++) {
result += Math.sqrt(i) * Math.log(i + 1);
}
}
// Memory-bound method
public void memoryBoundMethod() {
List<int[]> arrays = new ArrayList<>();
for (int i = 0; i < 1000; i++) {
int[] array = new int[10000];
Arrays.fill(array, i);
arrays.add(array);
}
}
// IO-bound method (simulated)
public void ioBoundMethod() {
try {
// Simulate database/network call
Thread.sleep(100);
// Process result
processData(new byte[1024]);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
// Lock contention example
private final Object lock = new Object();
public void methodWithLockContention() {
synchronized (lock) {
try {
// Hold lock for significant time
Thread.sleep(50);
cpuBoundMethod();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
}
// High allocation rate method
public void highAllocationMethod() {
List<String> strings = new ArrayList<>();
for (int i = 0; i < 10000; i++) {
strings.add("String_" + i + "_" + System.currentTimeMillis());
}
}
private void processData(byte[] data) {
// Simulate data processing
for (int i = 0; i < data.length; i++) {
data[i] = (byte) (data[i] ^ 0xFF);
}
}
}

2. Common Performance Issues and Profiling Solutions

public class PerformanceIssues {
// Issue 1: Excessive object creation
public String inefficientStringConcatenation() {
String result = "";
for (int i = 0; i < 10000; i++) {
result += i; // Creates new String object each iteration
}
return result;
}
public String efficientStringConcatenation() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 10000; i++) {
sb.append(i);
}
return sb.toString();
}
// Issue 2: Inefficient algorithms
public boolean inefficientSearch(List<String> list, String target) {
// O(n) search in unsorted list
return list.contains(target);
}
public boolean efficientSearch(Set<String> set, String target) {
// O(1) search in HashSet
return set.contains(target);
}
// Issue 3: Unnecessary synchronization
private final List<String> synchronizedList = Collections.synchronizedList(new ArrayList<>());
public void unnecessarySynchronization(String item) {
synchronized (synchronizedList) {
if (!synchronizedList.contains(item)) {
synchronizedList.add(item);
}
}
}
public void betterSynchronization(String item) {
// Use concurrent collection
ConcurrentHashMap<String, Boolean> map = new ConcurrentHashMap<>();
map.putIfAbsent(item, true);
}
// Issue 4: Memory leaks
private static final Map<String, Object> cache = new HashMap<>();
public void memoryLeak(String key, Object value) {
cache.put(key, value); // Never removed
}
public void fixedMemoryUsage(String key, Object value) {
// Use weak references or size limits
Cache<String, Object> properCache = CacheBuilder.newBuilder()
.maximumSize(1000)
.expireAfterWrite(10, TimeUnit.MINUTES)
.build();
properCache.put(key, value);
}
}

Best Practices

1. Production Profiling Guidelines

public class ProductionProfilingBestPractices {
public static final class ProfilingConfig {
public static final boolean ENABLED = true;
public static final int DURATION_SECONDS = 30;
public static final int INTERVAL_MS = 10;
public static final String OUTPUT_DIR = "/tmp/profiling";
// Events to profile
public static final String[] EVENTS = {"cpu", "alloc", "lock"};
// Safe profiling hours (low traffic)
public static final int START_HOUR = 1; // 1 AM
public static final int END_HOUR = 4;   // 4 AM
}
public static boolean shouldProfile() {
if (!ProfilingConfig.ENABLED) {
return false;
}
Calendar cal = Calendar.getInstance();
int hour = cal.get(Calendar.HOUR_OF_DAY);
return hour >= ProfilingConfig.START_HOUR && 
hour <= ProfilingConfig.END_HOUR;
}
public static void safeProfiling(Runnable operation) {
if (shouldProfile()) {
// Reduce profiling overhead in production
System.setProperty("jdk.attach.allowAttachSelf", "true");
operation.run();
}
}
}

2. Integration with Monitoring

public class ProfilingIntegration {
public static void integrateWithAPM() {
// Example integration with Micrometer metrics
MeterRegistry registry = new SimpleMeterRegistry();
Timer profilingTimer = Timer.builder("profiling.execution")
.description("Time spent in profiling operations")
.register(registry);
Counter profilingCounter = Counter.builder("profiling.sessions")
.description("Number of profiling sessions")
.register(registry);
}
public static void alertOnPerformanceRegression() {
// Set up alerts based on profiling data
// This would typically integrate with your monitoring system
}
}

Troubleshooting Common Issues

# 1. Permission issues
sudo sysctl kernel.perf_event_paranoid=1
sudo sysctl kernel.kptr_restrict=0
# 2. Container profiling
docker run --cap-add perf_event --pid=host -it your-image
# 3. Missing debug symbols
./profiler.sh -d 30 -o collapsed -f /tmp/folded.txt <pid>
# 4. High overhead warning
./profiler.sh -e cpu -d 60 -i 100ms -f profile.html <pid>  # Increase interval

Async Profiler is a powerful tool for understanding Java application performance characteristics with minimal overhead. The key is to use the right profiling type (CPU, allocation, lock, etc.) for your specific performance investigation and to profile under realistic load conditions.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper