Gremlin Attack Scenarios in Java: Chaos Engineering Implementation

Introduction to Chaos Engineering with Gremlin

Chaos Engineering is the discipline of experimenting on a system to build confidence in its capability to withstand turbulent conditions in production. Gremlin provides controlled attacks to test system resilience. This guide covers implementing Gremlin-like attack scenarios in Java applications.

Table of Contents

  1. Resource Attacks (CPU, Memory, Disk)
  2. Network Attacks (Latency, Packet Loss)
  3. State Attacks (Shutdown, Restarts)
  4. Dependency Attacks (Service Disruption)
  5. Security Attacks (Security Testing)
  6. Monitoring and Safety Controls
  7. Integration with Spring Boot
  8. Real-World Scenarios

1. Project Setup and Dependencies

Maven Configuration

<properties>
<spring-boot.version>3.1.0</spring-boot.version>
<micrometer.version>1.11.0</micrometer.version>
</properties>
<dependencies>
<!-- Spring Boot -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<version>${spring-boot.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
<version>${spring-boot.version}</version>
</dependency>
<!-- Monitoring -->
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-core</artifactId>
<version>${micrometer.version}</version>
</dependency>
<!-- Utilities -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.13.0</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.13.0</version>
</dependency>
<!-- Testing -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<version>${spring-boot.version}</version>
<scope>test</scope>
</dependency>
</dependencies>

Application Configuration

# application.yml
# NOTE(review): the nesting below was reconstructed from the key names; confirm the
# placement of `safety-mechanisms`, `allowed-networks` and `attacks` against the
# configuration-properties classes that bind them.
app:
  chaos:
    enabled: true
    safety-mechanisms:
      enabled: true
      max-cpu-usage: 80
      max-memory-usage: 85
      allowed-networks:
        - "127.0.0.1"
        - "192.168.1.0/24"
    attacks:
      default-timeout-sec: 300
      auto-cancel-timeout-sec: 600
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,chaos
  endpoint:
    chaos:
      enabled: true
    health:
      show-details: always
logging:
  level:
    com.example.chaos: DEBUG

2. Core Chaos Engineering Framework

Attack Interface and Base Classes

package com.example.chaos.framework;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
/**
 * Contract for a chaos attack that can be started, observed and cancelled.
 */
public interface ChaosAttack {
    /** Returns the attack's name, as used in log messages and attack results. */
    String getName();

    /** Returns a human-readable explanation of what the attack does. */
    String getDescription();

    /** Returns the category this attack belongs to. */
    AttackCategory getCategory();

    /**
     * Starts the attack with the given parameters.
     *
     * @param parameters the inputs for this run
     * @return a future that yields the attack's result
     */
    CompletableFuture<AttackResult> execute(AttackParameters parameters);

    /** Requests that the attack stop. */
    void cancel();

    /** Returns whether the attack is currently running. */
    boolean isRunning();

    /** Returns the safety check that guards execution of this attack. */
    AttackSafetyCheck safetyCheck();
}
/**
 * Classification of a chaos attack by the kind of failure it injects.
 *
 * <p>The type is {@code public} because attack implementations and services living in other
 * packages return and read it; a package-private declaration would make them fail to compile.
 * Being a public top-level type, it belongs in its own {@code AttackCategory.java} file.
 */
public enum AttackCategory {
    RESOURCE, NETWORK, STATE, DEPENDENCY, SECURITY, CUSTOM
}
/**
 * Inputs for a single attack run: a free-form parameter map, the requested duration and a scope.
 *
 * <p>The type is {@code public} so that attacks and services in other packages can use it; as a
 * public top-level type it belongs in its own {@code AttackParameters.java} file.
 *
 * <p>A {@code null} parameter map is treated as empty and a {@code null} duration as
 * {@link Duration#ZERO}, so the typed getters and {@code getDuration().getSeconds()} never
 * throw {@link NullPointerException}.
 */
public class AttackParameters {
    private final Map<String, Object> parameters;
    private final Duration duration;
    private final String scope;

    /**
     * @param parameters attack-specific settings keyed by name; {@code null} means "none"
     * @param duration   how long the attack should last; {@code null} means zero
     * @param scope      caller-defined scope label, stored as given
     */
    public AttackParameters(Map<String, Object> parameters, Duration duration, String scope) {
        this.parameters = parameters != null ? parameters : Map.of();
        this.duration = duration != null ? duration : Duration.ZERO;
        this.scope = scope;
    }

    /** Returns the parameter map (never {@code null}). */
    public Map<String, Object> getParameters() {
        return parameters;
    }

    /** Returns the requested duration (never {@code null}). */
    public Duration getDuration() {
        return duration;
    }

    /** Returns the scope label passed to the constructor. */
    public String getScope() {
        return scope;
    }

    /**
     * Returns the value stored under {@code key} rendered with {@code toString()}, or
     * {@code defaultValue} when the key is absent or mapped to {@code null}.
     */
    public String getString(String key, String defaultValue) {
        Object value = parameters.get(key);
        return value != null ? value.toString() : defaultValue;
    }

    /**
     * Returns the value stored under {@code key} as an {@code int}.
     *
     * <p>Numbers are narrowed with {@link Number#intValue()}. Strings are parsed as decimal
     * integers, because parameters that arrive as text (for example from a query string) would
     * otherwise silently fall back to the default. Anything else yields {@code defaultValue}.
     */
    public int getInt(String key, int defaultValue) {
        Object value = parameters.get(key);
        if (value instanceof Number number) {
            return number.intValue();
        }
        if (value instanceof String text) {
            try {
                return Integer.parseInt(text.strip());
            } catch (NumberFormatException ignored) {
                // Not a decimal integer: treat it like any other unusable value.
                return defaultValue;
            }
        }
        return defaultValue;
    }

    /**
     * Returns the value stored under {@code key} as a {@code boolean}.
     *
     * <p>Accepts {@link Boolean} values and the strings {@code "true"} / {@code "false"}
     * (case-insensitive); any other value yields {@code defaultValue}.
     */
    public boolean getBoolean(String key, boolean defaultValue) {
        Object value = parameters.get(key);
        if (value instanceof Boolean flag) {
            return flag;
        }
        if (value instanceof String text) {
            String trimmed = text.strip();
            if ("true".equalsIgnoreCase(trimmed)) {
                return true;
            }
            if ("false".equalsIgnoreCase(trimmed)) {
                return false;
            }
        }
        return defaultValue;
    }
}
/**
 * Outcome of one attack run: identity, final status, timing, a message and free-form metrics.
 *
 * <p>The type is {@code public} so that attacks and services in other packages can create and
 * read results; as a public top-level type it belongs in its own {@code AttackResult.java} file.
 */
public class AttackResult {
    private final String attackId;
    private final String attackName;
    private final AttackStatus status;
    private final LocalDateTime startTime;
    private final LocalDateTime endTime;
    private final String message;
    private final Map<String, Object> metrics;

    /**
     * @param attackId   identifier of the attack instance
     * @param attackName name of the attack
     * @param status     final status of the run
     * @param startTime  when the run started; may be {@code null}
     * @param endTime    when the run ended; may be {@code null}
     * @param message    human-readable summary
     * @param metrics    attack-specific measurements; {@code null} is stored as an empty map
     */
    public AttackResult(String attackId, String attackName, AttackStatus status,
            LocalDateTime startTime, LocalDateTime endTime,
            String message, Map<String, Object> metrics) {
        this.attackId = attackId;
        this.attackName = attackName;
        this.status = status;
        this.startTime = startTime;
        this.endTime = endTime;
        this.message = message;
        // Callers iterate and serialise the metrics; never hand them a null map.
        this.metrics = metrics != null ? metrics : Map.of();
    }

    public String getAttackId() {
        return attackId;
    }

    public String getAttackName() {
        return attackName;
    }

    public AttackStatus getStatus() {
        return status;
    }

    public LocalDateTime getStartTime() {
        return startTime;
    }

    public LocalDateTime getEndTime() {
        return endTime;
    }

    public String getMessage() {
        return message;
    }

    /** Returns the metrics map (never {@code null}). */
    public Map<String, Object> getMetrics() {
        return metrics;
    }

    /**
     * Returns the time between start and end, or {@link Duration#ZERO} when either timestamp
     * is missing.
     */
    public Duration getDuration() {
        if (startTime == null || endTime == null) {
            return Duration.ZERO;
        }
        return Duration.between(startTime, endTime);
    }
}
/**
 * Lifecycle state recorded in an attack result.
 *
 * <p>The type is {@code public} because attack implementations in other packages construct
 * results with these constants; a package-private declaration would make them fail to compile.
 * Being a public top-level type, it belongs in its own {@code AttackStatus.java} file.
 */
public enum AttackStatus {
    SCHEDULED, RUNNING, COMPLETED, FAILED, CANCELLED, SAFETY_BLOCKED
}
/**
 * Pre-flight guard consulted before an attack is allowed to run.
 *
 * <p>The type is {@code public} because attacks in other packages implement it and services
 * read its description; as a public top-level type it belongs in its own
 * {@code AttackSafetyCheck.java} file.
 */
public interface AttackSafetyCheck {
    /**
     * Decides whether an attack with the given parameters may run.
     *
     * @param parameters the inputs of the run being requested
     * @return {@code true} if the attack may proceed, {@code false} to block it
     */
    boolean isSafeToExecute(AttackParameters parameters);

    /** Returns a human-readable description of what this check guards against. */
    String getSafetyDescription();
}
/**
 * Skeleton {@link ChaosAttack}: runs the safety check, executes the attack body asynchronously,
 * tracks the running flag and supports cancellation.
 *
 * <p>Subclasses implement {@link #executeAttack(AttackParameters)} and may override
 * {@link #performCleanup()} and {@link #safetyCheck()}. Attack bodies are expected to respond to
 * thread interruption (for example by letting {@link InterruptedException} propagate), because
 * that is how {@link #cancel()} stops them.
 *
 * <p>The type is {@code public} so that attacks in other packages can extend it; as a public
 * top-level type it belongs in its own {@code BaseAttack.java} file.
 */
public abstract class BaseAttack implements ChaosAttack {
    protected final Logger logger = LoggerFactory.getLogger(getClass());
    protected final String attackId;
    protected final AtomicBoolean running;
    protected volatile CompletableFuture<AttackResult> attackFuture;

    /** State of the most recently started run; lets {@link #cancel()} reach its worker thread. */
    private volatile RunState currentRun;

    protected BaseAttack() {
        this.attackId = UUID.randomUUID().toString();
        this.running = new AtomicBoolean(false);
    }

    /**
     * Starts the attack on a background thread.
     *
     * @param parameters inputs for this run; must not be {@code null}
     * @return a future yielding the result; already completed with {@code SAFETY_BLOCKED} when
     *         the safety check rejects the run
     * @throws IllegalStateException if the attack is already running
     * @throws NullPointerException  if {@code parameters} is {@code null}
     */
    @Override
    public CompletableFuture<AttackResult> execute(AttackParameters parameters) {
        if (parameters == null) {
            // Fail before touching the running flag so a bad call cannot leave it stuck.
            throw new NullPointerException("parameters must not be null");
        }
        if (!running.compareAndSet(false, true)) {
            throw new IllegalStateException("Attack is already running");
        }

        AttackSafetyCheck safetyCheck;
        boolean safe;
        try {
            safetyCheck = safetyCheck();
            safe = safetyCheck.isSafeToExecute(parameters);
        } catch (RuntimeException e) {
            // A throwing safety check must not leave the attack permanently "running".
            running.set(false);
            throw e;
        }
        if (!safe) {
            running.set(false);
            return CompletableFuture.completedFuture(
                    new AttackResult(attackId, getName(), AttackStatus.SAFETY_BLOCKED,
                            LocalDateTime.now(), LocalDateTime.now(),
                            "Safety check failed: " + safetyCheck.getSafetyDescription(),
                            Map.of()));
        }

        logger.info("Starting attack: {} with parameters: {}", getName(), parameters.getParameters());
        LocalDateTime startTime = LocalDateTime.now();
        RunState run = new RunState();
        currentRun = run;
        CompletableFuture<AttackResult> future;
        try {
            future = CompletableFuture.supplyAsync(() -> runAttack(run, parameters, startTime));
        } catch (RuntimeException e) {
            running.set(false);
            throw e;
        }
        attackFuture = future;
        return future;
    }

    /**
     * Worker-side body of a run: executes the attack and converts every outcome into an
     * {@link AttackResult}.
     */
    private AttackResult runAttack(RunState run, AttackParameters parameters, LocalDateTime startTime) {
        boolean cancelledBeforeStart;
        synchronized (run) {
            cancelledBeforeStart = run.cancelled;
            if (!cancelledBeforeStart) {
                run.worker = Thread.currentThread();
            }
        }
        if (cancelledBeforeStart) {
            // cancel() already cleared the running flag and ran the cleanup.
            return cancelledResult(startTime);
        }
        try {
            AttackResult result = executeAttack(parameters);
            logger.info("Attack completed: {} - {}", getName(), result.getStatus());
            return result;
        } catch (InterruptedException e) {
            logger.info("Attack interrupted: {}", getName());
            return cancelledResult(startTime);
        } catch (Exception e) {
            logger.error("Attack failed: {}", getName(), e);
            // Map.of rejects null values, and exception messages are frequently null.
            return new AttackResult(attackId, getName(), AttackStatus.FAILED,
                    startTime, LocalDateTime.now(), "Attack failed: " + e.getMessage(),
                    Map.of("error", String.valueOf(e.getMessage())));
        } finally {
            boolean cancelled;
            synchronized (run) {
                run.worker = null;
                cancelled = run.cancelled;
                // The worker is a shared pool thread: do not leak our interrupt into its next task.
                Thread.interrupted();
            }
            if (!cancelled) {
                // When cancelled, cancel() already cleared the flag; clearing it again here
                // could wipe the flag of a newer run started in the meantime.
                running.set(false);
            }
        }
    }

    private AttackResult cancelledResult(LocalDateTime startTime) {
        return new AttackResult(attackId, getName(), AttackStatus.CANCELLED,
                startTime, LocalDateTime.now(), "Attack cancelled", Map.of());
    }

    /**
     * Cancels the running attack, if any: interrupts its worker thread, cancels the future and
     * runs {@link #performCleanup()}.
     *
     * <p>The explicit interrupt is required because {@code CompletableFuture.cancel(true)} does
     * not interrupt the thread computing the result.
     */
    @Override
    public void cancel() {
        if (running.compareAndSet(true, false) && attackFuture != null) {
            RunState run = currentRun;
            if (run != null) {
                synchronized (run) {
                    run.cancelled = true;
                    if (run.worker != null) {
                        run.worker.interrupt();
                    }
                }
            }
            attackFuture.cancel(true);
            performCleanup();
            logger.info("Attack cancelled: {}", getName());
        }
    }

    @Override
    public boolean isRunning() {
        return running.get();
    }

    /** Returns the default check, which allows every run; subclasses override to restrict. */
    @Override
    public AttackSafetyCheck safetyCheck() {
        return new DefaultSafetyCheck();
    }

    /**
     * Performs the attack and blocks until it is finished.
     *
     * @param parameters inputs for this run
     * @return the result of the run
     * @throws Exception on failure; an {@link InterruptedException} is reported as
     *                   {@code CANCELLED}, anything else as {@code FAILED}
     */
    protected abstract AttackResult executeAttack(AttackParameters parameters) throws Exception;

    /**
     * Releases whatever the attack holds. Called by {@link #cancel()} from the cancelling
     * thread, so overrides must tolerate running concurrently with the attack body and being
     * invoked more than once.
     */
    protected void performCleanup() {
        // Default cleanup - override in subclasses if needed
    }

    /** Per-run bookkeeping; guarded by its own monitor. */
    private static final class RunState {
        private Thread worker;
        private boolean cancelled;
    }

    private static class DefaultSafetyCheck implements AttackSafetyCheck {
        @Override
        public boolean isSafeToExecute(AttackParameters parameters) {
            return true; // Default safety check - always safe
        }

        @Override
        public String getSafetyDescription() {
            return "Default safety check passed";
        }
    }
}

3. Resource Attacks (CPU, Memory, Disk)

CPU Attack

package com.example.chaos.attacks.resource;
import com.example.chaos.framework.*;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Resource attack that loads a number of CPU cores to a target percentage by alternating
 * busy-spinning and sleeping in fixed 100 ms cycles.
 *
 * <p>Parameters: {@code cores} (default 1) and {@code loadPercentage} (default 80); the run
 * lasts for the duration carried by the attack parameters.
 */
@Component
public class CPUAttack extends BaseAttack {
    private static final int DEFAULT_CORES = 1;
    private static final int DEFAULT_LOAD_PERCENTAGE = 80;
    /** Length of one busy/sleep cycle; the load percentage is the busy share of it. */
    private static final long CYCLE_MILLIS = 100;
    private static final int MAX_LOAD_PERCENTAGE = 95;

    /** Written by the attack thread, read by whichever thread runs the cleanup. */
    private volatile ExecutorService cpuLoadExecutor;
    /** Number of load workers currently inside {@link #generateCPULoad}. */
    private final AtomicInteger activeThreads;

    public CPUAttack() {
        this.activeThreads = new AtomicInteger(0);
    }

    @Override
    public String getName() {
        return "CPU Attack";
    }

    @Override
    public String getDescription() {
        return "Generates high CPU load by creating busy threads";
    }

    @Override
    public AttackCategory getCategory() {
        return AttackCategory.RESOURCE;
    }

    /**
     * Starts one load worker per requested core, waits for the requested duration and stops the
     * workers again, also when the wait is interrupted or fails.
     *
     * <p>The {@code active_threads} metric is the number of workers started for this run.
     */
    @Override
    protected AttackResult executeAttack(AttackParameters parameters) throws Exception {
        int cores = parameters.getInt("cores", DEFAULT_CORES);
        int loadPercentage = parameters.getInt("loadPercentage", DEFAULT_LOAD_PERCENTAGE);
        long durationSeconds = parameters.getDuration().getSeconds();
        logger.info("Starting CPU attack: cores={}, load={}%, duration={}s",
                cores, loadPercentage, durationSeconds);

        ExecutorService executor = Executors.newFixedThreadPool(cores);
        cpuLoadExecutor = executor;
        LocalDateTime startTime = LocalDateTime.now();
        int startedWorkers = 0;
        try {
            for (int i = 0; i < cores; i++) {
                executor.submit(() -> generateCPULoad(loadPercentage, durationSeconds));
                startedWorkers++;
            }
            Thread.sleep(durationSeconds * 1000);
        } finally {
            // Always stop the workers; otherwise a failed or interrupted run keeps burning CPU.
            performCleanup();
        }

        LocalDateTime endTime = LocalDateTime.now();
        Map<String, Object> metrics = Map.of(
                "cores_used", cores,
                "load_percentage", loadPercentage,
                "active_threads", startedWorkers,
                "duration_seconds", durationSeconds
        );
        return new AttackResult(attackId, getName(), AttackStatus.COMPLETED,
                startTime, endTime, "CPU attack completed successfully", metrics);
    }

    /** Interrupts the load workers. Safe to call repeatedly and from any thread. */
    @Override
    protected void performCleanup() {
        // Work on a local copy: the field can be cleared concurrently by another cleanup call.
        ExecutorService executor = cpuLoadExecutor;
        if (executor != null) {
            cpuLoadExecutor = null;
            executor.shutdownNow();
        }
        // activeThreads is not reset here: each worker decrements it as it exits.
    }

    /**
     * Body of one load worker: repeats busy/sleep cycles until the deadline passes or the
     * thread is interrupted.
     */
    private void generateCPULoad(int loadPercentage, long durationSeconds) {
        activeThreads.incrementAndGet();
        try {
            int load = Math.max(0, Math.min(100, loadPercentage));
            // Integer arithmetic: 100 * (29 / 100.0) truncates to 28 in floating point.
            long busyMillis = CYCLE_MILLIS * load / 100;
            long idleMillis = CYCLE_MILLIS - busyMillis;
            long deadline = System.currentTimeMillis() + durationSeconds * 1000;
            while (System.currentTimeMillis() < deadline && !Thread.currentThread().isInterrupted()) {
                spin(busyMillis);
                if (idleMillis > 0) {
                    Thread.sleep(idleMillis);
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } finally {
            activeThreads.decrementAndGet();
        }
    }

    /** Keeps the calling thread busy with floating-point work for roughly {@code busyMillis}. */
    private static void spin(long busyMillis) {
        long spinUntil = System.nanoTime() + busyMillis * 1_000_000L;
        double sink = 0;
        while (System.nanoTime() - spinUntil < 0 && !Thread.currentThread().isInterrupted()) {
            for (int i = 0; i < 1000; i++) {
                sink += Math.sqrt(ThreadLocalRandom.current().nextDouble())
                        * Math.sin(ThreadLocalRandom.current().nextDouble());
            }
        }
    }

    @Override
    public AttackSafetyCheck safetyCheck() {
        return new CPUSafetyCheck();
    }

    /** Rejects runs that would occupy every core or use an out-of-range load percentage. */
    private static class CPUSafetyCheck implements AttackSafetyCheck {
        @Override
        public boolean isSafeToExecute(AttackParameters parameters) {
            int cores = parameters.getInt("cores", DEFAULT_CORES);
            int availableProcessors = Runtime.getRuntime().availableProcessors();
            // At least one worker, and always leave one core free for the application.
            if (cores < 1 || cores >= availableProcessors) {
                return false;
            }
            int loadPercentage = parameters.getInt("loadPercentage", DEFAULT_LOAD_PERCENTAGE);
            return loadPercentage >= 1 && loadPercentage <= MAX_LOAD_PERCENTAGE;
        }

        @Override
        public String getSafetyDescription() {
            return "Ensures CPU attack doesn't use all available cores or extreme load percentages";
        }
    }
}

Memory Attack

package com.example.chaos.attacks.resource;
import com.example.chaos.framework.*;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Resource attack that holds a requested amount of heap for the duration of the run.
 *
 * <p>Parameter: {@code memoryMB} (default 512). Memory is allocated in 10 MB chunks, each
 * written to once per 4096 bytes, and released when the run ends for any reason.
 */
@Component
public class MemoryAttack extends BaseAttack {
    private static final int DEFAULT_MEMORY_MB = 512;
    private static final int MAX_MEMORY_MB = 2048;
    private static final int BYTES_PER_MB = 1024 * 1024;
    private static final int CHUNK_SIZE_MB = 10;
    private static final int TOUCH_STRIDE_BYTES = 4096;

    /** Chunks currently held; guarded by its own monitor because cleanup may run on another thread. */
    private final List<byte[]> memoryChunks;

    public MemoryAttack() {
        this.memoryChunks = new ArrayList<>();
    }

    @Override
    public String getName() {
        return "Memory Attack";
    }

    @Override
    public String getDescription() {
        return "Consumes specified amount of memory to test memory pressure handling";
    }

    @Override
    public AttackCategory getCategory() {
        return AttackCategory.RESOURCE;
    }

    /**
     * Allocates the requested memory, holds it for the requested duration and releases it.
     *
     * <p>{@code memory_allocated_mb} reports what was actually obtained, which is less than
     * {@code memory_requested_mb} when the heap ran out or the run was cancelled mid-allocation.
     */
    @Override
    protected AttackResult executeAttack(AttackParameters parameters) throws Exception {
        int memoryMB = parameters.getInt("memoryMB", DEFAULT_MEMORY_MB);
        long durationSeconds = parameters.getDuration().getSeconds();
        logger.info("Starting memory attack: {} MB for {} seconds", memoryMB, durationSeconds);

        LocalDateTime startTime = LocalDateTime.now();
        int allocatedMB;
        try {
            allocatedMB = allocateMemory(memoryMB);
            Thread.sleep(durationSeconds * 1000);
        } finally {
            // This component outlives the run: release the chunks even when the run fails
            // or is interrupted, or they stay pinned on the heap.
            performCleanup();
        }

        LocalDateTime endTime = LocalDateTime.now();
        Runtime runtime = Runtime.getRuntime();
        Map<String, Object> metrics = Map.of(
                "memory_allocated_mb", allocatedMB,
                "memory_requested_mb", memoryMB,
                "duration_seconds", durationSeconds,
                "max_memory_mb", runtime.maxMemory() / BYTES_PER_MB,
                "total_memory_mb", runtime.totalMemory() / BYTES_PER_MB,
                "free_memory_mb", runtime.freeMemory() / BYTES_PER_MB
        );
        return new AttackResult(attackId, getName(), AttackStatus.COMPLETED,
                startTime, endTime, "Memory attack completed successfully", metrics);
    }

    /** Drops every held chunk and suggests a garbage collection. Safe to call repeatedly. */
    @Override
    protected void performCleanup() {
        synchronized (memoryChunks) {
            memoryChunks.clear();
        }
        System.gc(); // Suggest garbage collection
        logger.debug("Memory attack cleanup completed");
    }

    /**
     * Allocates up to {@code memoryMB} megabytes in chunks, stopping early on
     * {@link OutOfMemoryError} or when the attack is no longer running.
     *
     * @return the number of megabytes actually allocated
     */
    private int allocateMemory(int memoryMB) {
        int allocatedMB = 0;
        int remainingMB = memoryMB;
        try {
            while (remainingMB > 0 && isRunning()) {
                int sizeMB = Math.min(CHUNK_SIZE_MB, remainingMB);
                byte[] chunk = new byte[sizeMB * BYTES_PER_MB];
                // Write into the chunk at regular intervals so the allocation is actually backed.
                for (int offset = 0; offset < chunk.length; offset += TOUCH_STRIDE_BYTES) {
                    chunk[offset] = 1;
                }
                synchronized (memoryChunks) {
                    memoryChunks.add(chunk);
                }
                allocatedMB += sizeMB;
                remainingMB -= sizeMB;
                logger.debug("Allocated {} MB chunk", sizeMB);
            }
        } catch (OutOfMemoryError e) {
            // Keep what was obtained; asking for more would only fail again.
            logger.warn("OutOfMemoryError while allocating memory chunk");
        }
        logger.info("Total memory allocated: {} MB", allocatedMB);
        return allocatedMB;
    }

    @Override
    public AttackSafetyCheck safetyCheck() {
        return new MemorySafetyCheck();
    }

    /** Rejects non-positive, oversized, or heap-exhausting allocation requests. */
    private static class MemorySafetyCheck implements AttackSafetyCheck {
        @Override
        public boolean isSafeToExecute(AttackParameters parameters) {
            int memoryMB = parameters.getInt("memoryMB", DEFAULT_MEMORY_MB);
            if (memoryMB <= 0 || memoryMB > MAX_MEMORY_MB) { // 2GB max
                return false;
            }
            Runtime runtime = Runtime.getRuntime();
            long maxMB = runtime.maxMemory() / BYTES_PER_MB;
            long usedMB = (runtime.totalMemory() - runtime.freeMemory()) / BYTES_PER_MB;
            // Measure against the heap that is still free, not the nominal maximum: a request
            // below 80% of max can still exhaust a heap that is already mostly in use.
            long headroomMB = maxMB - usedMB;
            return memoryMB <= headroomMB * 0.8;
        }

        @Override
        public String getSafetyDescription() {
            return "Ensures memory allocation doesn't exceed safe limits";
        }
    }
}

Disk I/O Attack

package com.example.chaos.attacks.resource;
import com.example.chaos.framework.*;
import org.apache.commons.io.FileUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Resource attack that generates disk I/O by writing, and optionally reading back, files in a
 * temporary directory that is deleted when the run ends.
 *
 * <p>Parameters: {@code fileSizeMB} (default 100), {@code fileCount} (default 5) and
 * {@code operation}, one of {@code write}, {@code read} or {@code both} (default {@code write}).
 */
@Component
public class DiskIOAttack extends BaseAttack {
    private static final int DEFAULT_FILE_SIZE_MB = 100;
    private static final int DEFAULT_FILE_COUNT = 5;
    private static final int BYTES_PER_MB = 1024 * 1024;

    /** Files written so far; guarded by its own monitor because cleanup may run on another thread. */
    private final List<File> createdFiles;
    private volatile Path tempDirectory;

    public DiskIOAttack() {
        this.createdFiles = new ArrayList<>();
    }

    @Override
    public String getName() {
        return "Disk I/O Attack";
    }

    @Override
    public String getDescription() {
        return "Generates high disk I/O load by creating, writing, and reading files";
    }

    @Override
    public AttackCategory getCategory() {
        return AttackCategory.RESOURCE;
    }

    /**
     * Runs the requested disk operations and removes every file afterwards.
     *
     * <p>The files are always written first: a {@code read} run needs data on disk to read,
     * so it lays the files down before reading them back.
     *
     * @throws IllegalArgumentException if {@code operation} is not one of the supported values
     */
    @Override
    protected AttackResult executeAttack(AttackParameters parameters) throws Exception {
        int fileSizeMB = parameters.getInt("fileSizeMB", DEFAULT_FILE_SIZE_MB);
        int fileCount = parameters.getInt("fileCount", DEFAULT_FILE_COUNT);
        String operation = parameters.getString("operation", "write"); // write, read, both

        boolean writeRequested = "write".equals(operation) || "both".equals(operation);
        boolean readRequested = "read".equals(operation) || "both".equals(operation);
        if (!writeRequested && !readRequested) {
            // Without this an unknown operation would do nothing and still report COMPLETED.
            throw new IllegalArgumentException(
                    "Unsupported operation: " + operation + " (expected write, read or both)");
        }

        logger.info("Starting disk I/O attack: {} files of {} MB, operation: {}",
                fileCount, fileSizeMB, operation);
        LocalDateTime startTime = LocalDateTime.now();
        try {
            Path directory = Files.createTempDirectory("chaos-disk-io");
            tempDirectory = directory;

            performWriteOperations(directory, fileSizeMB, fileCount);
            if (readRequested) {
                performReadOperations();
            }

            LocalDateTime endTime = LocalDateTime.now();
            Map<String, Object> metrics = Map.of(
                    "file_count", fileCount,
                    "file_size_mb", fileSizeMB,
                    "operation", operation,
                    "total_size_mb", fileCount * fileSizeMB,
                    // Fully qualified: java.time.Duration is not among this file's imports.
                    "duration_seconds", java.time.Duration.between(startTime, endTime).getSeconds()
            );
            return new AttackResult(attackId, getName(), AttackStatus.COMPLETED,
                    startTime, endTime, "Disk I/O attack completed successfully", metrics);
        } finally {
            performCleanup();
        }
    }

    /** Deletes the written files and the temporary directory. Safe to call repeatedly. */
    @Override
    protected void performCleanup() {
        List<File> files;
        synchronized (createdFiles) {
            files = new ArrayList<>(createdFiles);
            createdFiles.clear();
        }
        for (File file : files) {
            try {
                Files.deleteIfExists(file.toPath());
            } catch (IOException e) {
                logger.warn("Failed to delete file: {}", file.getAbsolutePath(), e);
            }
        }

        Path directory = tempDirectory;
        if (directory != null) {
            tempDirectory = null;
            try {
                FileUtils.deleteDirectory(directory.toFile());
            } catch (IOException e) {
                logger.warn("Failed to delete temp directory: {}", directory, e);
            }
        }
    }

    /**
     * Writes {@code fileCount} files of {@code fileSizeMB} megabytes each into
     * {@code directory}, stopping early once the attack is no longer running.
     */
    private void performWriteOperations(Path directory, int fileSizeMB, int fileCount) throws IOException {
        byte[] data = new byte[BYTES_PER_MB]; // 1MB chunk
        for (int i = 0; i < data.length; i++) {
            data[i] = (byte) (i % 256);
        }

        for (int i = 0; i < fileCount && isRunning(); i++) {
            File file = new File(directory.toFile(), "chaos-file-" + i + ".dat");
            try (RandomAccessFile raf = new RandomAccessFile(file, "rw")) {
                for (int mb = 0; mb < fileSizeMB && isRunning(); mb++) {
                    raf.write(data);
                }
                raf.getFD().sync(); // Force write to disk
            }
            synchronized (createdFiles) {
                createdFiles.add(file);
            }
            logger.debug("Created file: {} ({} MB)", file.getName(), file.length() / BYTES_PER_MB);
        }
    }

    /** Reads every written file back in 1 MB blocks while the attack is running. */
    private void performReadOperations() throws IOException {
        List<File> files;
        synchronized (createdFiles) {
            // Iterate over a snapshot: a concurrent cleanup clears the shared list.
            files = new ArrayList<>(createdFiles);
        }
        byte[] buffer = new byte[BYTES_PER_MB]; // 1MB buffer
        for (File file : files) {
            try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
                long fileSize = file.length();
                long bytesRead = 0;
                while (bytesRead < fileSize && isRunning()) {
                    int bytes = raf.read(buffer);
                    if (bytes == -1) {
                        break;
                    }
                    bytesRead += bytes;
                    // Simulate some processing
                    int sum = 0;
                    for (int i = 0; i < bytes; i += 1024) {
                        sum += buffer[i];
                    }
                }
            }
            logger.debug("Read file: {} ({} bytes)", file.getName(), file.length());
        }
    }

    @Override
    public AttackSafetyCheck safetyCheck() {
        return new DiskSafetyCheck();
    }

    /** Rejects runs with non-positive sizes, more than 1 GB of data, or over half the free space. */
    private static class DiskSafetyCheck implements AttackSafetyCheck {
        @Override
        public boolean isSafeToExecute(AttackParameters parameters) {
            int fileSizeMB = parameters.getInt("fileSizeMB", DEFAULT_FILE_SIZE_MB);
            int fileCount = parameters.getInt("fileCount", DEFAULT_FILE_COUNT);
            // Two negative values would multiply to a harmless-looking positive total.
            if (fileSizeMB < 1 || fileCount < 1) {
                return false;
            }
            long totalSizeMB = (long) fileSizeMB * fileCount;
            // Don't allow creating more than 1GB of data
            if (totalSizeMB > 1024) {
                return false;
            }
            // Don't use more than 50% of the free space in the temp directory's file system
            File tempDir = new File(System.getProperty("java.io.tmpdir"));
            long freeSpaceMB = tempDir.getFreeSpace() / BYTES_PER_MB;
            return totalSizeMB <= freeSpaceMB * 0.5;
        }

        @Override
        public String getSafetyDescription() {
            return "Ensures disk operations don't exhaust available disk space";
        }
    }
}

4. Network Attacks

Network Latency Attack

package com.example.chaos.attacks.network;
import com.example.chaos.framework.*;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Network attack that simulates added latency towards a target host for the duration of the run.
 *
 * <p>Parameters: {@code targetHost} (default {@code localhost}) and {@code latencyMs}
 * (default 1000). The latency itself is only logged here; no traffic is actually delayed.
 */
@Component
public class NetworkLatencyAttack extends BaseAttack {
    private static final int DEFAULT_LATENCY_MS = 1000;
    private static final int MAX_LATENCY_MS = 30000;
    /** Metric value reported when the baseline latency could not be measured. */
    private static final long UNKNOWN_LATENCY_MS = -1L;

    private final Map<String, OriginalLatency> originalLatencies;

    public NetworkLatencyAttack() {
        this.originalLatencies = new ConcurrentHashMap<>();
    }

    @Override
    public String getName() {
        return "Network Latency Attack";
    }

    @Override
    public String getDescription() {
        return "Simulates network latency by intercepting and delaying network calls";
    }

    @Override
    public AttackCategory getCategory() {
        return AttackCategory.NETWORK;
    }

    /**
     * Records the host's baseline latency, applies the simulated latency, waits for the
     * requested duration and restores the baseline.
     *
     * <p>{@code original_latency_ms} is {@code -1} when the baseline probe failed.
     */
    @Override
    protected AttackResult executeAttack(AttackParameters parameters) throws Exception {
        String targetHost = parameters.getString("targetHost", "localhost");
        int latencyMs = parameters.getInt("latencyMs", DEFAULT_LATENCY_MS);
        long durationSeconds = parameters.getDuration().getSeconds();
        logger.info("Starting network latency attack: {}ms to {} for {}s",
                latencyMs, targetHost, durationSeconds);

        LocalDateTime startTime = LocalDateTime.now();
        try {
            storeOriginalLatency(targetHost);
            // Apply latency (simulated - in real implementation, you might use tools like tc)
            applyLatency(targetHost, latencyMs);
            Thread.sleep(durationSeconds * 1000);

            LocalDateTime endTime = LocalDateTime.now();
            // Map.of rejects null values, and no baseline exists when the probe failed.
            Long baseline = getOriginalLatency(targetHost);
            Map<String, Object> metrics = Map.of(
                    "target_host", targetHost,
                    "latency_ms", latencyMs,
                    "duration_seconds", durationSeconds,
                    "original_latency_ms", baseline != null ? baseline : UNKNOWN_LATENCY_MS
            );
            return new AttackResult(attackId, getName(), AttackStatus.COMPLETED,
                    startTime, endTime, "Network latency attack completed", metrics);
        } finally {
            restoreOriginalLatency(targetHost);
        }
    }

    /** Restores the baseline of every host that still has one recorded. */
    @Override
    protected void performCleanup() {
        for (String host : originalLatencies.keySet()) {
            restoreOriginalLatency(host);
        }
        originalLatencies.clear();
    }

    /**
     * Measures how long a reachability probe to {@code host} takes (5 s timeout) and records it.
     * Nothing is recorded when the host is unknown or the probe fails with an I/O error.
     */
    private void storeOriginalLatency(String host) {
        try {
            long startTime = System.currentTimeMillis();
            boolean reachable = InetAddress.getByName(host).isReachable(5000);
            long latency = System.currentTimeMillis() - startTime;
            originalLatencies.put(host, new OriginalLatency(latency, reachable));
            logger.debug("Stored original latency for {}: {}ms", host, latency);
        } catch (UnknownHostException e) {
            logger.warn("Unknown host: {}", host);
        } catch (IOException e) {
            logger.warn("Host not reachable: {}", host);
        }
    }

    private void applyLatency(String host, int latencyMs) {
        // In a real implementation, this would use system tools like 'tc' on Linux
        // For simulation, we'll just log the intended latency
        logger.info("Simulating {}ms latency to {}", latencyMs, host);
        // Example of how you might implement this with JVM instrumentation:
        // NetworkLatencyInterceptor.setLatency(host, latencyMs);
    }

    private void restoreOriginalLatency(String host) {
        OriginalLatency original = originalLatencies.remove(host);
        if (original != null) {
            logger.info("Restored original latency for {}: {}ms", host, original.latencyMs);
            // NetworkLatencyInterceptor.clearLatency(host);
        }
    }

    /** Returns the recorded baseline for {@code host}, or {@code null} if none was recorded. */
    private Long getOriginalLatency(String host) {
        OriginalLatency original = originalLatencies.get(host);
        return original != null ? original.latencyMs : null;
    }

    /** Baseline probe outcome for one host. */
    private static class OriginalLatency {
        final Long latencyMs;
        final boolean reachable;

        OriginalLatency(Long latencyMs, boolean reachable) {
            this.latencyMs = latencyMs;
            this.reachable = reachable;
        }
    }

    @Override
    public AttackSafetyCheck safetyCheck() {
        return new NetworkSafetyCheck();
    }

    /** Rejects out-of-range latencies and hosts whose name marks them as critical. */
    private static class NetworkSafetyCheck implements AttackSafetyCheck {
        @Override
        public boolean isSafeToExecute(AttackParameters parameters) {
            String targetHost = parameters.getString("targetHost", "localhost");
            int latencyMs = parameters.getInt("latencyMs", DEFAULT_LATENCY_MS);
            // Negative latency is meaningless; above 30 seconds services become unusable.
            if (latencyMs < 0 || latencyMs > MAX_LATENCY_MS) {
                return false;
            }
            // NOTE(review): the default target "localhost" counts as critical below, so a run
            // without an explicit targetHost is always blocked - confirm this is intended.
            return !isCriticalHost(targetHost);
        }

        private boolean isCriticalHost(String host) {
            // Locale.ROOT keeps the match independent of the JVM's default locale.
            String lowerHost = host.toLowerCase(java.util.Locale.ROOT);
            return lowerHost.contains("database")
                    || lowerHost.contains("auth")
                    || lowerHost.contains("config")
                    || lowerHost.contains("localhost")
                    || lowerHost.contains("127.0.0.1");
        }

        @Override
        public String getSafetyDescription() {
            return "Ensures network attacks don't target critical infrastructure or cause extreme disruption";
        }
    }
}

Packet Loss Attack

package com.example.chaos.attacks.network;
import com.example.chaos.framework.*;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
import java.util.Map;
/**
 * Network attack that simulates dropping a percentage of packets towards a target host.
 *
 * <p>Parameters: {@code targetHost} (default {@code localhost}) and {@code lossPercentage}
 * (default 50). The loss is only logged here; no packets are actually dropped.
 */
@Component
public class PacketLossAttack extends BaseAttack {
    private static final int DEFAULT_LOSS_PERCENTAGE = 50;

    @Override
    public String getName() {
        return "Packet Loss Attack";
    }

    @Override
    public String getDescription() {
        return "Simulates network packet loss by randomly dropping packets";
    }

    @Override
    public AttackCategory getCategory() {
        return AttackCategory.NETWORK;
    }

    /**
     * Applies the simulated loss, waits for the requested duration and always removes the
     * loss again, whether the wait finished or was cut short.
     */
    @Override
    protected AttackResult executeAttack(AttackParameters parameters) throws Exception {
        final String host = parameters.getString("targetHost", "localhost");
        final int lossPct = parameters.getInt("lossPercentage", DEFAULT_LOSS_PERCENTAGE);
        final long seconds = parameters.getDuration().getSeconds();
        logger.info("Starting packet loss attack: {}% loss to {} for {}s",
                lossPct, host, seconds);

        final LocalDateTime startedAt = LocalDateTime.now();
        try {
            applyPacketLoss(host, lossPct);
            Thread.sleep(seconds * 1000);
            final LocalDateTime finishedAt = LocalDateTime.now();
            return new AttackResult(attackId, getName(), AttackStatus.COMPLETED,
                    startedAt, finishedAt, "Packet loss attack completed",
                    Map.of(
                            "target_host", host,
                            "loss_percentage", lossPct,
                            "duration_seconds", seconds));
        } finally {
            removePacketLoss(host);
        }
    }

    /** Announces the simulated loss; a real implementation would shell out to tc or iptables. */
    private void applyPacketLoss(String host, int lossPercentage) {
        // e.g. tc qdisc add dev eth0 root netem loss 50%
        logger.info("Simulating {}% packet loss to {}", lossPercentage, host);
    }

    /** Announces removal of the simulated loss. */
    private void removePacketLoss(String host) {
        // e.g. tc qdisc del dev eth0 root netem
        logger.info("Removing packet loss for {}", host);
    }

    @Override
    public AttackSafetyCheck safetyCheck() {
        return new PacketLossSafetyCheck();
    }

    /** Allows only loss percentages of at most 80, which also rules out a full partition. */
    private static class PacketLossSafetyCheck implements AttackSafetyCheck {
        @Override
        public boolean isSafeToExecute(AttackParameters parameters) {
            final int requested = parameters.getInt("lossPercentage", DEFAULT_LOSS_PERCENTAGE);
            final boolean fullPartition = requested >= 100;
            final boolean extremeLoss = requested > 80;
            return !fullPartition && !extremeLoss;
        }

        @Override
        public String getSafetyDescription() {
            return "Ensures packet loss doesn't cause complete network partitions";
        }
    }
}

5. State Attacks

JVM Shutdown Attack

package com.example.chaos.attacks.state;
import com.example.chaos.framework.*;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * State attack that terminates the JVM after a delay, either via {@code System.exit(0)}
 * ({@code graceful}) or {@code Runtime.halt(1)} ({@code forced}).
 *
 * <p>Parameters: {@code shutdownType} (default {@code graceful}) and {@code delaySeconds}
 * (default 30). The attack stays running for the whole delay so that it can still be cancelled
 * before the shutdown fires.
 */
@Component
public class JVMShutdownAttack extends BaseAttack {
    // Both fields are guarded by this object's monitor.
    private ScheduledExecutorService shutdownExecutor;
    private java.util.concurrent.ScheduledFuture<?> pendingShutdown;

    @Override
    public String getName() {
        return "JVM Shutdown Attack";
    }

    @Override
    public String getDescription() {
        return "Simulates JVM shutdown or restart scenarios";
    }

    @Override
    public AttackCategory getCategory() {
        return AttackCategory.STATE;
    }

    /**
     * Schedules the shutdown and blocks until it fires or is cancelled.
     *
     * <p>Returning right after scheduling would mark the attack as finished, and a finished
     * attack can no longer be cancelled - the pending shutdown would be unstoppable.
     *
     * @return a {@code CANCELLED} result when the shutdown was aborted; a {@code FAILED} result
     *         if the shutdown call returned while the process is still alive
     * @throws IllegalArgumentException for an unknown {@code shutdownType} or negative delay
     */
    @Override
    protected AttackResult executeAttack(AttackParameters parameters) throws Exception {
        String shutdownType = parameters.getString("shutdownType", "graceful"); // graceful, forced
        long delaySeconds = parameters.getInt("delaySeconds", 30);
        if (!"graceful".equals(shutdownType) && !"forced".equals(shutdownType)) {
            // An unknown type would otherwise be "scheduled" and then silently do nothing.
            throw new IllegalArgumentException(
                    "Unsupported shutdownType: " + shutdownType + " (expected graceful or forced)");
        }
        if (delaySeconds < 0) {
            throw new IllegalArgumentException("delaySeconds must not be negative: " + delaySeconds);
        }

        logger.warn("Starting JVM shutdown attack: {} shutdown in {} seconds",
                shutdownType, delaySeconds);
        LocalDateTime startTime = LocalDateTime.now();
        Map<String, Object> metrics = Map.of(
                "shutdown_type", shutdownType,
                "delay_seconds", delaySeconds
        );

        java.util.concurrent.ScheduledFuture<?> shutdown;
        synchronized (this) {
            if (!isRunning()) {
                // Cancelled before anything was scheduled: never arm the shutdown.
                return new AttackResult(attackId, getName(), AttackStatus.CANCELLED,
                        startTime, LocalDateTime.now(),
                        "JVM shutdown cancelled before it was scheduled", metrics);
            }
            ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
            shutdownExecutor = executor;
            shutdown = executor.schedule(
                    () -> performShutdown(shutdownType), delaySeconds, TimeUnit.SECONDS);
            pendingShutdown = shutdown;
        }

        try {
            shutdown.get();
            // Only reachable if the shutdown call returned without ending the process.
            return new AttackResult(attackId, getName(), AttackStatus.FAILED,
                    startTime, LocalDateTime.now(),
                    "JVM shutdown was triggered but the process is still running", metrics);
        } catch (java.util.concurrent.CancellationException e) {
            return new AttackResult(attackId, getName(), AttackStatus.CANCELLED,
                    startTime, LocalDateTime.now(),
                    "JVM shutdown cancelled before it was triggered", metrics);
        } finally {
            performCleanup();
        }
    }

    private void performShutdown(String shutdownType) {
        logger.warn("Performing {} JVM shutdown", shutdownType);
        if ("graceful".equals(shutdownType)) {
            // Graceful shutdown
            System.exit(0);
        } else if ("forced".equals(shutdownType)) {
            // Forced shutdown
            Runtime.getRuntime().halt(1);
        }
    }

    /** Cancels the pending shutdown and stops the scheduler. Safe to call repeatedly. */
    @Override
    protected synchronized void performCleanup() {
        if (pendingShutdown != null) {
            // shutdownNow() only drops queued tasks; cancelling the future also wakes the waiter.
            pendingShutdown.cancel(false);
            pendingShutdown = null;
        }
        if (shutdownExecutor != null) {
            shutdownExecutor.shutdownNow();
            shutdownExecutor = null;
        }
    }

    @Override
    public AttackSafetyCheck safetyCheck() {
        return new ShutdownSafetyCheck();
    }

    /**
     * Allows the attack only when the {@code app.environment} system property is set to
     * something other than {@code production} and {@code chaos.shutdown.override} is
     * {@code true}. An unset environment counts as production.
     */
    private static class ShutdownSafetyCheck implements AttackSafetyCheck {
        @Override
        public boolean isSafeToExecute(AttackParameters parameters) {
            String environment = System.getProperty("app.environment", "production");
            if ("production".equals(environment)) {
                return false; // Never allow in production
            }
            // Only allow with explicit override
            return Boolean.parseBoolean(System.getProperty("chaos.shutdown.override", "false"));
        }

        @Override
        public String getSafetyDescription() {
            return "JVM shutdown is only allowed in non-production environments with explicit override";
        }
    }
}

6. Chaos Engineering Service

Chaos Orchestration Service

package com.example.chaos.service;
import com.example.chaos.framework.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
@Service
public class ChaosOrchestrationService {
private static final Logger logger = LoggerFactory.getLogger(ChaosOrchestrationService.class);
private final List<ChaosAttack> availableAttacks;
private final Map<String, ActiveAttack> activeAttacks;
private final ChaosSafetyService safetyService;
/**
 * Creates the service with its own copy of the supplied attacks and an empty registry of
 * active attacks.
 *
 * @param availableAttacks the attacks this service can launch
 * @param safetyService    the service consulted before an attack is started
 */
public ChaosOrchestrationService(List<ChaosAttack> availableAttacks,
        ChaosSafetyService safetyService) {
    this.safetyService = safetyService;
    this.activeAttacks = new ConcurrentHashMap<>();
    this.availableAttacks = new CopyOnWriteArrayList<>(availableAttacks);
}
/**
 * Lists every registered attack with its name, description, category and the description of
 * its safety check.
 *
 * @return one entry per available attack, in registration order
 */
public List<AttackInfo> getAvailableAttacks() {
    return availableAttacks.stream()
            .map(this::describeAttack)
            .toList();
}

/** Builds the summary entry for a single attack. */
private AttackInfo describeAttack(ChaosAttack candidate) {
    return new AttackInfo(
            candidate.getName(),
            candidate.getDescription(),
            candidate.getCategory(),
            candidate.safetyCheck().getSafetyDescription());
}
/**
 * Looks up the named attack, applies the safety rules and starts it.
 *
 * @param attackName name of the attack to start
 * @param parameters inputs for the run
 * @return {@code STARTED} when the attack is now running, or {@code SAFETY_BLOCKED} with the
 *         attack's own explanation when its safety check rejected the run
 * @throws ChaosException if the attack is unknown, blocked by the safety service, already
 *                        running, or could not be started
 */
public AttackExecutionResult executeAttack(String attackName, AttackParameters parameters) {
    ChaosAttack attack = findAttackByName(attackName);
    if (attack == null) {
        throw new ChaosException("Attack not found: " + attackName);
    }
    // Perform safety checks
    if (!safetyService.isAttackAllowed(attack, parameters)) {
        throw new ChaosException("Attack blocked by safety rules: " + attackName);
    }
    if (attack.isRunning()) {
        throw new ChaosException("Attack is already running: " + attackName);
    }
    try {
        CompletableFuture<AttackResult> future = attack.execute(parameters);

        // An attack whose own safety check rejects the run hands back an already-completed
        // future; reporting that as "started successfully" would mislead the caller.
        if (future.isDone() && !future.isCompletedExceptionally()) {
            AttackResult immediate = future.getNow(null);
            // Compared by name because this class also uses "AttackStatus" for its own
            // status view type, which makes the simple enum name ambiguous here.
            if (immediate != null && immediate.getStatus() != null
                    && "SAFETY_BLOCKED".equals(immediate.getStatus().toString())) {
                logger.warn("Attack blocked by its own safety check: {} - {}",
                        attackName, immediate.getMessage());
                return new AttackExecutionResult(attackName, "SAFETY_BLOCKED",
                        immediate.getMessage());
            }
        }

        ActiveAttack activeAttack = new ActiveAttack(attack, future, parameters);
        activeAttacks.put(attackName, activeAttack);
        // Deregister once the run ends, however it ends.
        future.whenComplete((result, throwable) -> {
            activeAttacks.remove(attackName);
            logger.info("Attack completed: {} - {}", attackName,
                    result != null ? result.getStatus() : "FAILED");
        });
        return new AttackExecutionResult(attackName, "STARTED",
                "Attack execution started successfully");
    } catch (Exception e) {
        logger.error("Failed to execute attack: {}", attackName, e);
        throw new ChaosException("Failed to execute attack: " + e.getMessage(), e);
    }
}
public void cancelAttack(String attackName) {
ActiveAttack activeAttack = activeAttacks.get(attackName);
if (activeAttack != null) {
activeAttack.attack().cancel();
activeAttacks.remove(attackName);
logger.info("Cancelled attack: {}", attackName);
} else {
throw new ChaosException("No active attack found: " + attackName);
}
}
public List<ActiveAttackInfo> getActiveAttacks() {
return activeAttacks.values().stream()
.map(active -> new ActiveAttackInfo(
active.attack().getName(),
active.attack().getCategory(),
active.parameters(),
active.startTime(),
active.attack().isRunning() ? "RUNNING" : "COMPLETING"
))
.toList();
}
public AttackStatus getAttackStatus(String attackName) {
ActiveAttack activeAttack = activeAttacks.get(attackName);
if (activeAttack != null) {
CompletableFuture<AttackResult> future = activeAttack.future();
if (future.isDone()) {
try {
AttackResult result = future.get();
return new AttackStatus(attackName, result.getStatus().toString(), 
result.getMessage(), result.getMetrics());
} catch (Exception e) {
return new AttackStatus(attackName, "FAILED", e.getMessage(), Map.of());
}
} else {
return new AttackStatus(attackName, "RUNNING", "Attack is in progress", Map.of());
}
}
return new AttackStatus(attackName, "NOT_FOUND", "No active attack found", Map.of());
}
private ChaosAttack findAttackByName(String name) {
return availableAttacks.stream()
.filter(attack -> attack.getName().equalsIgnoreCase(name))
.findFirst()
.orElse(null);
}
// Data classes
public record AttackInfo(String name, String description, AttackCategory category, String safetyInfo) {}
public record AttackExecutionResult(String attackName, String status, String message) {}
public record ActiveAttackInfo(String attackName, AttackCategory category, 
AttackParameters parameters, LocalDateTime startTime, String status) {}
public record AttackStatus(String attackName, String status, String message, Map<String, Object> metrics) {}
public record ActiveAttack(ChaosAttack attack, CompletableFuture<AttackResult> future, 
AttackParameters parameters, LocalDateTime startTime) {
public ActiveAttack(ChaosAttack attack, CompletableFuture<AttackResult> future, 
AttackParameters parameters) {
this(attack, future, parameters, LocalDateTime.now());
}
}
public static class ChaosException extends RuntimeException {
public ChaosException(String message) {
super(message);
}
public ChaosException(String message, Throwable cause) {
super(message, cause);
}
}
}

Safety Service

package com.example.chaos.service;
import com.example.chaos.framework.AttackParameters;
import com.example.chaos.framework.AttackSafetyCheck;
import com.example.chaos.framework.ChaosAttack;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.util.List;
/**
 * Global gatekeeper deciding whether a chaos attack may run right now.
 *
 * <p>When safety mechanisms are enabled an attack must pass, in order: its own
 * attack-specific safety check, the business-hours window, the environment allow-list
 * and the memory-headroom check.
 */
@Service
public class ChaosSafetyService {
    @Value("${app.chaos.safety-mechanisms.enabled:true}")
    private boolean safetyMechanismsEnabled;

    // NOTE(review): injected but not read anywhere in this class; CPU is not checked yet.
    @Value("${app.chaos.safety-mechanisms.max-cpu-usage:80}")
    private int maxCpuUsage;

    /** Heap usage, in percent of the maximum heap, above which attacks are refused. */
    @Value("${app.chaos.safety-mechanisms.max-memory-usage:85}")
    private int maxMemoryUsage;

    /** Allowed targets: single IP addresses or CIDR ranges such as {@code 10.0.0.0/8}. */
    @Value("${app.chaos.safety-mechanisms.allowed-networks:127.0.0.1}")
    private List<String> allowedNetworks;

    /**
     * Decides whether the given attack may be executed.
     *
     * @param attack     the attack about to run
     * @param parameters parameters passed to the attack-specific safety check
     * @return {@code true} if safety mechanisms are disabled or every check passes
     */
    public boolean isAttackAllowed(ChaosAttack attack, AttackParameters parameters) {
        if (!safetyMechanismsEnabled) {
            return true;
        }
        // Check attack-specific safety
        AttackSafetyCheck safetyCheck = attack.safetyCheck();
        if (!safetyCheck.isSafeToExecute(parameters)) {
            return false;
        }
        // Check global safety rules
        if (!isWithinBusinessHours()) {
            return false;
        }
        if (!isSafeEnvironment()) {
            return false;
        }
        if (!hasSufficientResources()) {
            return false;
        }
        return true;
    }

    /** Returns {@code true} between 09:00 and 17:00 inclusive, by the system clock's local time. */
    private boolean isWithinBusinessHours() {
        LocalTime now = LocalTime.now();
        LocalTime businessStart = LocalTime.of(9, 0);  // 9 AM
        LocalTime businessEnd = LocalTime.of(17, 0);   // 5 PM
        // Only allow during business hours for safety
        return !now.isBefore(businessStart) && !now.isAfter(businessEnd);
    }

    /**
     * Returns {@code true} only for the staging, testing and development environments.
     * The environment comes from the {@code app.environment} system property and defaults
     * to {@code production}, so an unset property blocks all attacks.
     */
    private boolean isSafeEnvironment() {
        String environment = System.getProperty("app.environment", "production");
        // Only allow chaos engineering in pre-production environments
        return "staging".equals(environment) ||
                "testing".equals(environment) ||
                "development".equals(environment);
    }

    /** Returns {@code false} when used heap exceeds {@code maxMemoryUsage} percent of the maximum heap. */
    private boolean hasSufficientResources() {
        Runtime runtime = Runtime.getRuntime();
        // Check memory usage
        long usedMemory = runtime.totalMemory() - runtime.freeMemory();
        long maxMemory = runtime.maxMemory();
        double memoryUsage = (double) usedMemory / maxMemory * 100;
        if (memoryUsage > maxMemoryUsage) {
            return false;
        }
        // In a real implementation, you would check CPU usage and other resources
        return true;
    }

    // NOTE(review): not called from anywhere in this class yet.
    /** Returns {@code true} if {@code targetHost} resolves to an address inside any allowed network. */
    private boolean isNetworkAllowed(String targetHost) {
        try {
            InetAddress targetAddress = InetAddress.getByName(targetHost);
            for (String allowedNetwork : allowedNetworks) {
                if (isInNetwork(targetAddress, allowedNetwork)) {
                    return true;
                }
            }
            return false;
        } catch (UnknownHostException e) {
            return false;
        }
    }

    /**
     * Tests whether {@code address} lies inside {@code network}.
     *
     * <p>{@code network} is either a single address ({@code 127.0.0.1}) or a CIDR range
     * ({@code 10.0.0.0/8}). The comparison is done on the raw address bytes, so
     * {@code 127.0.0.1} no longer admits {@code 127.0.0.10} and a host cannot qualify
     * merely because its reverse-DNS name contains the allowed string.
     *
     * @return {@code false} for blank or malformed entries, an out-of-range prefix length,
     *         or an IPv4/IPv6 family mismatch
     */
    private boolean isInNetwork(InetAddress address, String network) {
        if (network == null || network.isBlank()) {
            return false;
        }
        String spec = network.strip();
        int slash = spec.indexOf('/');
        String base = slash < 0 ? spec : spec.substring(0, slash);
        try {
            byte[] candidate = address.getAddress();
            byte[] networkBytes = InetAddress.getByName(base).getAddress();
            if (candidate.length != networkBytes.length) {
                return false; // IPv4 vs IPv6
            }
            int totalBits = candidate.length * 8;
            // No prefix length means "exactly this address".
            int prefixBits = slash < 0 ? totalBits : Integer.parseInt(spec.substring(slash + 1).strip());
            if (prefixBits < 0 || prefixBits > totalBits) {
                return false;
            }
            int wholeBytes = prefixBits / 8;
            for (int i = 0; i < wholeBytes; i++) {
                if (candidate[i] != networkBytes[i]) {
                    return false;
                }
            }
            int remainingBits = prefixBits % 8;
            if (remainingBits == 0) {
                return true;
            }
            // Compare only the leading bits of the partially covered byte.
            int mask = (0xFF << (8 - remainingBits)) & 0xFF;
            return (candidate[wholeBytes] & mask) == (networkBytes[wholeBytes] & mask);
        } catch (UnknownHostException | NumberFormatException e) {
            return false;
        }
    }
}

7. REST API Controllers

Chaos Engineering Controller

package com.example.chaos.web;
import com.example.chaos.service.ChaosOrchestrationService;
import com.example.chaos.framework.AttackParameters;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import java.time.Duration;
import java.util.Map;
/**
 * REST facade over {@link ChaosOrchestrationService}, mounted at {@code /api/chaos}.
 *
 * <p>Failures raised while starting or cancelling an attack are reported as HTTP 400 with
 * a JSON body of the form {@code {"error": "..."}}.
 */
@RestController
@RequestMapping("/api/chaos")
public class ChaosEngineeringController {
    private final ChaosOrchestrationService chaosService;

    public ChaosEngineeringController(ChaosOrchestrationService chaosService) {
        this.chaosService = chaosService;
    }

    /** Lists every attack that can be executed. */
    @GetMapping("/attacks")
    public ResponseEntity<?> getAvailableAttacks() {
        return ResponseEntity.ok(chaosService.getAvailableAttacks());
    }

    /**
     * Starts the named attack with the parameters, duration and scope from the request body.
     *
     * @return 200 with the execution result, or 400 with an error body on any failure
     */
    @PostMapping("/attacks/{attackName}/execute")
    public ResponseEntity<?> executeAttack(
            @PathVariable String attackName,
            @RequestBody ExecuteAttackRequest request) {
        try {
            AttackParameters parameters = new AttackParameters(
                    request.parameters(),
                    Duration.ofSeconds(request.durationSeconds()),
                    request.scope()
            );
            var result = chaosService.executeAttack(attackName, parameters);
            return ResponseEntity.ok(result);
        } catch (Exception e) {
            return errorResponse(e);
        }
    }

    /**
     * Requests cancellation of the named attack.
     *
     * @return 200 with status {@code CANCELLED}, or 400 with an error body on any failure
     */
    @PostMapping("/attacks/{attackName}/cancel")
    public ResponseEntity<?> cancelAttack(@PathVariable String attackName) {
        try {
            chaosService.cancelAttack(attackName);
            return ResponseEntity.ok(Map.of(
                    "status", "CANCELLED",
                    "message", "Attack cancellation requested"
            ));
        } catch (Exception e) {
            return errorResponse(e);
        }
    }

    /** Lists the attacks that are currently tracked as active. */
    @GetMapping("/attacks/active")
    public ResponseEntity<?> getActiveAttacks() {
        return ResponseEntity.ok(chaosService.getActiveAttacks());
    }

    /** Reports the status of the named attack. */
    @GetMapping("/attacks/{attackName}/status")
    public ResponseEntity<?> getAttackStatus(@PathVariable String attackName) {
        return ResponseEntity.ok(chaosService.getAttackStatus(attackName));
    }

    /**
     * Sets the {@code chaos.safety.override} system property to {@code "true"}.
     *
     * <p>The request body is accepted but not inspected.
     * NOTE(review): nothing visible in this file reads {@code chaos.safety.override}
     * (ChaosSafetyService is driven by {@code app.chaos.safety-mechanisms.enabled}), so the
     * response may overstate the effect. Confirm before relying on it.
     */
    @PostMapping("/safety/override")
    public ResponseEntity<?> overrideSafety(@RequestBody SafetyOverrideRequest request) {
        // This would require proper authentication and authorization
        // For demonstration purposes only
        System.setProperty("chaos.safety.override", "true");
        return ResponseEntity.ok(Map.of(
                "status", "SAFETY_OVERRIDDEN",
                "message", "Safety mechanisms disabled. Use with extreme caution!"
        ));
    }

    /**
     * Builds the 400 error body. {@code Map.of} rejects null values, and exceptions may
     * carry a null message, so the exception's class name is used as a fallback instead
     * of letting the handler itself throw.
     */
    private static ResponseEntity<?> errorResponse(Exception e) {
        String message = e.getMessage();
        return ResponseEntity.badRequest()
                .body(Map.of("error", message != null ? message : e.getClass().getSimpleName()));
    }

    // Request DTOs

    /** Body of an execute request: attack parameters, run time in seconds, and a scope label. */
    public record ExecuteAttackRequest(Map<String, Object> parameters,
                                       long durationSeconds, String scope) {}

    /** Body of a safety-override request. */
    public record SafetyOverrideRequest(String reason, String authorizedBy) {}
}

Actuator Endpoint

package com.example.chaos.actuator;
import com.example.chaos.framework.AttackParameters;
import com.example.chaos.service.ChaosOrchestrationService;
import org.springframework.boot.actuate.endpoint.annotation.*;
import org.springframework.stereotype.Component;
import java.time.Duration;
import java.util.Map;
/**
 * Spring Boot Actuator endpoint with id {@code chaos}: a read operation for summary
 * counts, a write operation to start an attack, and a delete operation to cancel one.
 *
 * <p>Write and delete operations never propagate exceptions; failures are reported as a
 * map with {@code status=ERROR}.
 */
@Component
@Endpoint(id = "chaos")
public class ChaosActuatorEndpoint {
    /** Attack run time, in seconds, used when the request supplies no {@code durationSeconds}. */
    private static final long DEFAULT_DURATION_SECONDS = 60;

    private final ChaosOrchestrationService chaosService;

    public ChaosActuatorEndpoint(ChaosOrchestrationService chaosService) {
        this.chaosService = chaosService;
    }

    /**
     * Summarises the chaos subsystem.
     *
     * @return the number of available and active attacks, plus a {@code safetyEnabled} flag
     */
    @ReadOperation
    public Map<String, Object> chaosInfo() {
        return Map.of(
                "availableAttacks", chaosService.getAvailableAttacks().size(),
                "activeAttacks", chaosService.getActiveAttacks().size(),
                // NOTE(review): hard-coded; does not reflect the configured safety setting.
                "safetyEnabled", true
        );
    }

    /**
     * Starts the named attack with scope {@code "actuator"}.
     *
     * @param attackName name of the attack to start
     * @param parameters attack parameters; an optional {@code durationSeconds} entry sets the
     *                   run time (default 60 seconds). A null map is treated as empty.
     * @return a map with {@code status=SUCCESS} and the attack name and message, or
     *         {@code status=ERROR} and a message on failure
     */
    @WriteOperation
    public Map<String, Object> executeAttack(
            @Selector String attackName,
            Map<String, Object> parameters) {
        try {
            Map<String, Object> attackParams = parameters != null ? parameters : Map.of();
            Object rawDuration = attackParams.get("durationSeconds");
            long durationSeconds = rawDuration != null
                    ? Long.parseLong(rawDuration.toString())
                    : DEFAULT_DURATION_SECONDS;
            var result = chaosService.executeAttack(attackName,
                    new AttackParameters(attackParams, Duration.ofSeconds(durationSeconds), "actuator"));
            return Map.of(
                    "status", "SUCCESS",
                    "attackName", result.attackName(),
                    "message", result.message()
            );
        } catch (Exception e) {
            return errorResult(e);
        }
    }

    /**
     * Requests cancellation of the named attack.
     *
     * @return a map with {@code status=CANCELLED}, or {@code status=ERROR} on failure
     */
    @DeleteOperation
    public Map<String, Object> cancelAttack(@Selector String attackName) {
        try {
            chaosService.cancelAttack(attackName);
            return Map.of(
                    "status", "CANCELLED",
                    "message", "Attack cancellation requested"
            );
        } catch (Exception e) {
            return errorResult(e);
        }
    }

    /**
     * Builds the error payload. {@code Map.of} rejects null values, and exceptions may
     * carry a null message, so the exception's class name is used as a fallback.
     */
    private static Map<String, Object> errorResult(Exception e) {
        String message = e.getMessage();
        return Map.of(
                "status", "ERROR",
                "message", message != null ? message : e.getClass().getSimpleName()
        );
    }
}

8. Spring Boot Application

Main Application Class

package com.example.chaos;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
@SpringBootApplication
public class ChaosEngineeringApplication {
public static void main(String[] args) {
SpringApplication.run(ChaosEngineeringApplication.class, args);
}
}

9. Real-World Scenarios

Scenario: E-commerce Resilience Testing

package com.example.chaos.scenarios;
import com.example.chaos.framework.*;
import com.example.chaos.service.ChaosOrchestrationService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.time.Duration;
import java.util.Map;
/**
 * Pre-defined resilience scenarios for an e-commerce system, built on
 * {@link ChaosOrchestrationService}.
 */
@Component
public class ECommerceResilienceScenario {
    private static final Logger logger = LoggerFactory.getLogger(ECommerceResilienceScenario.class);

    private final ChaosOrchestrationService chaosService;

    // Atomic so that "check whether running, then mark running" is one step even if the
    // scheduled run and a manual call overlap. Fully qualified to avoid a new import.
    private final java.util.concurrent.atomic.AtomicBoolean scenarioRunning =
            new java.util.concurrent.atomic.AtomicBoolean(false);

    public ECommerceResilienceScenario(ChaosOrchestrationService chaosService) {
        this.chaosService = chaosService;
    }

    /**
     * Runs three attack phases back to back (database latency, payment packet loss, CPU
     * load), blocking the calling thread for the duration of each phase.
     *
     * <p>If a run is already in progress the call returns immediately. Any failure,
     * including an attack being rejected, ends the run and is logged, never rethrown.
     *
     * <p>NOTE(review): ChaosSafetyService only admits attacks between 09:00 and 17:00, so
     * with safety mechanisms enabled this 02:00 schedule will be rejected at phase 1.
     * Confirm the intended schedule.
     */
    @Scheduled(cron = "0 0 2 * * MON") // Run every Monday at 2 AM
    public void runWeeklyResilienceTest() {
        if (!scenarioRunning.compareAndSet(false, true)) {
            logger.info("Resilience test already running");
            return;
        }
        logger.info("Starting weekly e-commerce resilience test");
        try {
            runPhase("Phase 1: Simulating database latency",
                    "Network Latency Attack",
                    Map.of("targetHost", "database-server", "latencyMs", 500),
                    Duration.ofMinutes(5), "database");
            runPhase("Phase 2: Simulating payment service issues",
                    "Packet Loss Attack",
                    Map.of("targetHost", "payment-service", "lossPercentage", 30),
                    Duration.ofMinutes(3), "payment");
            runPhase("Phase 3: Simulating high CPU load",
                    "CPU Attack",
                    Map.of("cores", 2, "loadPercentage", 70),
                    Duration.ofMinutes(4), "application");
            logger.info("Weekly resilience test completed successfully");
        } catch (InterruptedException e) {
            // Restore the flag so the owner of this thread can observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("Resilience test failed", e);
        } catch (Exception e) {
            logger.error("Resilience test failed", e);
        } finally {
            scenarioRunning.set(false);
        }
    }

    /**
     * Starts a memory attack and a network-latency attack without waiting between them.
     * If the first attack fails to start, the second is not attempted; the failure is
     * logged and not rethrown.
     */
    public void runBlackFridayScenario() {
        logger.info("Running Black Friday stress test scenario");
        // Simulate extreme load conditions
        try {
            // High memory usage
            chaosService.executeAttack("Memory Attack",
                    new AttackParameters(
                            Map.of("memoryMB", 1024), // 1GB
                            Duration.ofMinutes(10), "stress-test"
                    ));
            // Network issues
            chaosService.executeAttack("Network Latency Attack",
                    new AttackParameters(
                            Map.of("targetHost", "inventory-service", "latencyMs", 1000),
                            Duration.ofMinutes(8), "stress-test"
                    ));
        } catch (Exception e) {
            logger.error("Black Friday scenario failed", e);
        }
    }

    /** Logs the phase banner, starts one attack, then sleeps for the attack's duration. */
    private void runPhase(String banner, String attackName, Map<String, Object> settings,
                          Duration duration, String scope) throws InterruptedException {
        logger.info(banner);
        chaosService.executeAttack(attackName, new AttackParameters(settings, duration, scope));
        Thread.sleep(duration.toMillis());
    }
}

Summary

This comprehensive Gremlin-like chaos engineering implementation provides:

Key Attack Types and Controls:

  1. Resource Attacks: CPU, Memory, Disk I/O stress testing
  2. Network Attacks: Latency, packet loss, network disruption
  3. State Attacks: JVM shutdown, service restarts
  4. Safety Mechanisms: Comprehensive safety checks and controls

Features:

  • REST API: Full control via HTTP endpoints
  • Actuator Integration: Spring Boot actuator support
  • Safety First: Multiple safety layers and environment checks
  • Monitoring: Real-time attack status and metrics
  • Scenario Testing: Pre-defined resilience test scenarios

Safety Considerations:

  • Environment-based restrictions (never in production by default)
  • Resource usage limits
  • Business hours restrictions
  • Critical service protection
  • Manual override requirements for dangerous attacks

Use Cases:

  • Resilience testing of microservices
  • Capacity planning and load testing
  • Disaster recovery validation
  • Team training for incident response
  • Continuous reliability validation

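This implementation provides a foundation for a chaos engineering framework that can safely test and improve your system's resilience against real-world failures. Before using it in production, secure the control endpoints with authentication and authorization, and replace the simplified checks noted in the code (CPU monitoring, network matching) with full implementations.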

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper