Introduction to Chaos Engineering
Chaos Engineering is the discipline of experimenting on a system in order to build confidence in the system's capability to withstand turbulent conditions in production. Gremlin is a leading chaos engineering platform that provides controlled ways to inject failure and test system resilience.
Gremlin Java SDK Setup
Dependencies
<!-- Maven Dependencies -->
<dependencies>
    <dependency>
        <groupId>com.gremlin</groupId>
        <artifactId>gremlin-java-sdk</artifactId>
        <version>3.0.0</version>
    </dependency>
    <dependency>
        <groupId>com.gremlin</groupId>
        <artifactId>gremlin-api-client</artifactId>
        <version>3.0.0</version>
    </dependency>
</dependencies>
Configuration
import com.gremlin.GremlinService;
import com.gremlin.GremlinCoordinates;
import com.gremlin.ApplicationCoordinates;
/**
 * Builds the {@link GremlinService} shared by the examples from environment-provided settings.
 */
public class GremlinConfig {

    /**
     * Creates a Gremlin service for the {@code order-service} application.
     *
     * <p>The team credentials are read from the {@code GREMLIN_TEAM_ID} and
     * {@code GREMLIN_TEAM_SECRET} environment variables; the application environment is read from
     * {@code APP_ENV}.
     *
     * @return a service built from the Gremlin and application coordinates
     * @throws IllegalStateException if either credential variable is unset or blank
     */
    public static GremlinService configureGremlin() {
        // Fail fast on missing credentials instead of handing null to the SDK,
        // where the failure would surface later and far from its cause.
        GremlinCoordinates gremlinCoordinates = new GremlinCoordinates()
                .withTeamId(requireEnv("GREMLIN_TEAM_ID"))
                .withTeamSecret(requireEnv("GREMLIN_TEAM_SECRET"));

        // NOTE(review): APP_ENV is passed through as-is and may be null when unset;
        // how the SDK treats a null field value is not visible here - confirm.
        ApplicationCoordinates appCoordinates = new ApplicationCoordinates()
                .withType("microservice")
                .withField("application", "order-service")
                .withField("environment", System.getenv("APP_ENV"))
                .withField("version", "1.0.0");

        return new GremlinService(gremlinCoordinates, appCoordinates);
    }

    /**
     * Reads a mandatory environment variable.
     *
     * @param name the variable name
     * @return the variable's value, never null or blank
     * @throws IllegalStateException if the variable is unset or blank
     */
    private static String requireEnv(String name) {
        String value = System.getenv(name);
        if (value == null || value.trim().isEmpty()) {
            throw new IllegalStateException("Required environment variable " + name + " is not set");
        }
        return value;
    }

    /** Smoke test: builds the service and reports success. */
    public static void main(String[] args) {
        configureGremlin();
        System.out.println("Gremlin service configured successfully");
    }
}
Resource Attacks
CPU Attack
import com.gremlin.GremlinService;
import com.gremlin.scenarios.*;
import java.util.concurrent.TimeUnit;
/**
 * Launches CPU attacks through a {@link GremlinService} and prints mock health metrics while a
 * fixed-size attack is in flight.
 */
public class CPUChaosAttack {
    private final GremlinService gremlin;

    /**
     * @param gremlin the service used to start and halt attacks
     */
    public CPUChaosAttack(GremlinService gremlin) {
        this.gremlin = gremlin;
    }

    /**
     * Starts a single CPU attack and kicks off background monitoring for its duration.
     * Failures are reported on stderr rather than propagated.
     *
     * @param cores           number of cores passed to the attack
     * @param durationMinutes attack length passed to the attack; also the number of one-minute
     *                        monitoring ticks
     */
    public void executeCpuAttack(int cores, int durationMinutes) {
        try {
            System.out.println("Starting CPU attack on " + cores + " cores for "
                    + durationMinutes + " minutes");
            CpuAttack cpuAttack = new CpuAttack()
                    .withCores(cores)
                    .withLength(durationMinutes);
            String attackId = gremlin.attack(cpuAttack);
            System.out.println("CPU attack started with ID: " + attackId);
            // Monitor attack
            monitorAttack(attackId, durationMinutes);
        } catch (Exception e) {
            System.err.println("Failed to execute CPU attack: " + e.getMessage());
        }
    }

    /**
     * Ramps CPU load one core at a time from {@code minCores} to {@code maxCores}, running each
     * step for {@code stepDuration} minutes and halting it before the next step begins.
     *
     * <p>A failing step is reported and the ramp continues with the next core count. If the
     * calling thread is interrupted, the current attack is halted, the interrupt flag is restored
     * and the ramp stops.
     *
     * @param minCores        first core count to apply
     * @param maxCores        last core count to apply (inclusive)
     * @param durationMinutes currently unused; the total runtime is determined by the number of
     *                        steps times {@code stepDuration}
     * @param stepDuration    minutes to hold each step
     */
    public void executeGradualCpuAttack(int minCores, int maxCores,
                                        int durationMinutes, int stepDuration) {
        System.out.println("Starting gradual CPU attack from " + minCores
                + " to " + maxCores + " cores");
        for (int cores = minCores; cores <= maxCores; cores++) {
            String attackId = null;
            try {
                System.out.println("Setting CPU load to " + cores + " cores");
                CpuAttack attack = new CpuAttack()
                        .withCores(cores)
                        .withLength(stepDuration);
                attackId = gremlin.attack(attack);
                // Wait for step duration
                TimeUnit.MINUTES.sleep(stepDuration);
            } catch (InterruptedException e) {
                // Restore the flag and stop ramping; the finally block halts the live attack.
                Thread.currentThread().interrupt();
                System.err.println("Gradual CPU attack interrupted; stopping ramp");
                return;
            } catch (Exception e) {
                System.err.println("Error in gradual CPU attack: " + e.getMessage());
            } finally {
                // Always stop the step's attack, even when the wait failed or was interrupted.
                haltQuietly(attackId);
            }
        }
    }

    /** Halts the given attack if one was started, reporting (not propagating) any failure. */
    private void haltQuietly(String attackId) {
        if (attackId == null) {
            return;
        }
        try {
            gremlin.haltAttack(attackId);
        } catch (Exception e) {
            System.err.println("Error in gradual CPU attack: " + e.getMessage());
        }
    }

    /**
     * Spawns a thread that prints progress and health metrics once per minute for
     * {@code durationMinutes} minutes. The {@code attackId} is not consulted.
     */
    private void monitorAttack(String attackId, int durationMinutes) {
        // In a real scenario, you'd monitor system metrics and application health
        new Thread(() -> {
            try {
                for (int i = 0; i < durationMinutes; i++) {
                    TimeUnit.MINUTES.sleep(1);
                    System.out.println("Attack in progress... "
                            + (i + 1) + "/" + durationMinutes + " minutes");
                    // Check application health metrics
                    checkApplicationHealth();
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }).start();
    }

    /** Prints the current (mock) metrics and warns on stderr when any threshold is exceeded. */
    private void checkApplicationHealth() {
        double cpuUsage = getCpuUsage();
        double memoryUsage = getMemoryUsage();
        double responseTime = getAverageResponseTime();
        System.out.printf("Health Metrics - CPU: %.2f%%, Memory: %.2f%%, Response Time: %.2fms%n",
                cpuUsage, memoryUsage, responseTime);
        // Alert if thresholds exceeded
        if (cpuUsage > 90.0 || memoryUsage > 85.0 || responseTime > 1000.0) {
            System.err.println("WARNING: Health thresholds exceeded!");
        }
    }

    // Mock implementations: random values standing in for real metric sources.

    private double getCpuUsage() {
        return Math.random() * 100;
    }

    private double getMemoryUsage() {
        return 30.0 + Math.random() * 50;
    }

    private double getAverageResponseTime() {
        return 100.0 + Math.random() * 900;
    }

    /** Demo: one fixed CPU attack, a pause, then a gradual ramp. */
    public static void main(String[] args) throws Exception {
        GremlinService gremlin = GremlinConfig.configureGremlin();
        CPUChaosAttack cpuAttack = new CPUChaosAttack(gremlin);
        // Execute different CPU attacks
        cpuAttack.executeCpuAttack(2, 5); // 2 cores for 5 minutes
        TimeUnit.MINUTES.sleep(10); // Wait between attacks
        cpuAttack.executeGradualCpuAttack(1, 4, 20, 5); // Gradual increase: 4 steps of 5 minutes
    }
}
Memory Attack
import com.gremlin.GremlinService;
import com.gremlin.scenarios.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Launches memory attacks through a {@link GremlinService} and offers an in-process leak
 * simulation plus a heap-usage monitor.
 */
public class MemoryChaosAttack {
    private static final int BYTES_PER_MB = 1024 * 1024;
    /** Largest per-step allocation whose byte count still fits in an {@code int} array length. */
    private static final int MAX_LEAK_RATE_MB = Integer.MAX_VALUE / BYTES_PER_MB;

    private final GremlinService gremlin;
    // Holds the chunks allocated by the leak simulation so they stay reachable.
    // Not synchronized: intended for one leak simulation at a time.
    private final List<byte[]> memoryHog = new ArrayList<>();

    /**
     * @param gremlin the service used to start attacks
     */
    public MemoryChaosAttack(GremlinService gremlin) {
        this.gremlin = gremlin;
    }

    /**
     * Starts a single memory attack. Failures are reported on stderr rather than propagated.
     *
     * @param memoryMB        memory amount passed to the attack
     * @param durationMinutes attack length passed to the attack
     */
    public void executeMemoryAttack(int memoryMB, int durationMinutes) {
        try {
            System.out.println("Starting memory attack: " + memoryMB + "MB for "
                    + durationMinutes + " minutes");
            MemoryAttack memoryAttack = new MemoryAttack()
                    .withMemory(memoryMB)
                    .withLength(durationMinutes);
            String attackId = gremlin.attack(memoryAttack);
            System.out.println("Memory attack started with ID: " + attackId);
        } catch (Exception e) {
            System.err.println("Failed to execute memory attack: " + e.getMessage());
        }
    }

    /**
     * Starts a base memory attack, waits two minutes on the calling thread, then starts a second,
     * larger attack on top of it.
     *
     * <p>If the wait is interrupted the spike is not started and the thread's interrupt flag is
     * restored.
     *
     * @param baseMemoryMB         memory amount for the base attack
     * @param spikeMemoryMB        memory amount for the spike attack
     * @param spikeDurationMinutes length passed to the spike attack
     * @param totalDurationMinutes length passed to the base attack
     */
    public void executeMemorySpikeAttack(int baseMemoryMB, int spikeMemoryMB,
                                         int spikeDurationMinutes, int totalDurationMinutes) {
        System.out.println("Starting memory spike attack");
        try {
            // Base memory load
            MemoryAttack baseAttack = new MemoryAttack()
                    .withMemory(baseMemoryMB)
                    .withLength(totalDurationMinutes);
            gremlin.attack(baseAttack);
            // Wait a bit then add spike
            Thread.sleep(2 * 60 * 1000); // 2 minutes
            MemoryAttack spikeAttack = new MemoryAttack()
                    .withMemory(spikeMemoryMB)
                    .withLength(spikeDurationMinutes);
            gremlin.attack(spikeAttack);
            System.out.println("Memory spike attack started");
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            System.err.println("Failed to execute memory spike attack: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("Failed to execute memory spike attack: " + e.getMessage());
        }
    }

    /**
     * Simulates a leak inside this JVM by allocating {@code leakRateMB} megabytes once per minute
     * on a background thread until {@code durationMinutes} have elapsed, then releasing
     * everything.
     *
     * <p>The retained memory is released on every exit path, including interruption and heap
     * exhaustion.
     *
     * @param leakRateMB      megabytes to allocate per minute; must be between 0 and 2047
     * @param durationMinutes how long to keep allocating
     * @throws IllegalArgumentException if {@code leakRateMB} is negative or too large for a
     *                                  single array allocation
     */
    public void executeMemoryLeakSimulation(int leakRateMB, int durationMinutes) {
        if (leakRateMB < 0 || leakRateMB > MAX_LEAK_RATE_MB) {
            throw new IllegalArgumentException(
                    "leakRateMB must be between 0 and " + MAX_LEAK_RATE_MB + ": " + leakRateMB);
        }
        System.out.println("Simulating memory leak: " + leakRateMB + "MB per minute");
        new Thread(() -> {
            try {
                long startTime = System.currentTimeMillis();
                // Long arithmetic: int math would overflow for durations above ~35,000 minutes.
                long endTime = startTime + durationMinutes * 60_000L;
                while (System.currentTimeMillis() < endTime) {
                    // Allocate memory to simulate leak
                    byte[] chunk = new byte[leakRateMB * BYTES_PER_MB];
                    memoryHog.add(chunk);
                    System.out.println("Allocated " + leakRateMB + "MB. Total: "
                            + memoryHog.size() * leakRateMB + "MB");
                    Thread.sleep(60 * 1000); // Wait 1 minute
                }
                // Clean up
                memoryHog.clear();
                System.gc();
                System.out.println("Memory leak simulation completed");
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } catch (OutOfMemoryError e) {
                // Free the hoard first so the message itself can be produced.
                memoryHog.clear();
                System.err.println("Memory leak simulation aborted: heap exhausted");
            } finally {
                // Never leave the simulated leak behind as a real one.
                memoryHog.clear();
            }
        }).start();
    }

    /**
     * Starts a daemon thread that prints heap usage every five seconds and warns on stderr above
     * 85% of the maximum heap. Being a daemon, it does not keep the JVM alive on its own.
     */
    public void monitorMemoryUsage() {
        Thread monitor = new Thread(() -> {
            try {
                while (true) {
                    Runtime runtime = Runtime.getRuntime();
                    long totalMemory = runtime.totalMemory();
                    long freeMemory = runtime.freeMemory();
                    long usedMemory = totalMemory - freeMemory;
                    long maxMemory = runtime.maxMemory();
                    double usagePercentage = (double) usedMemory / maxMemory * 100;
                    System.out.printf("Memory Usage: %d/%d MB (%.2f%%)%n",
                            usedMemory / BYTES_PER_MB,
                            maxMemory / BYTES_PER_MB,
                            usagePercentage);
                    if (usagePercentage > 85.0) {
                        System.err.println("CRITICAL: High memory usage detected!");
                    }
                    Thread.sleep(5000); // Check every 5 seconds
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        // The loop never ends by itself; a non-daemon thread would block JVM shutdown forever.
        monitor.setDaemon(true);
        monitor.start();
    }

    /** Demo: monitoring, a plain attack, a spike attack, then a leak simulation. */
    public static void main(String[] args) throws Exception {
        GremlinService gremlin = GremlinConfig.configureGremlin();
        MemoryChaosAttack memoryAttack = new MemoryChaosAttack(gremlin);
        // Start memory monitoring
        memoryAttack.monitorMemoryUsage();
        // Execute different memory attacks
        memoryAttack.executeMemoryAttack(512, 10); // 512MB for 10 minutes
        Thread.sleep(5 * 60 * 1000); // Wait 5 minutes
        memoryAttack.executeMemorySpikeAttack(256, 1024, 3, 15);
        Thread.sleep(10 * 60 * 1000); // Wait 10 minutes
        memoryAttack.executeMemoryLeakSimulation(50, 10); // 50MB/min leak for 10 min
    }
}
Network Attacks
Latency Attack
import com.gremlin.GremlinService;
import com.gremlin.scenarios.*;
import java.util.concurrent.TimeUnit;
/**
 * Network-level experiments (latency, packet loss, blackhole, DNS) issued through a
 * {@link GremlinService}. Every launcher reports failures on stderr instead of throwing.
 */
public class NetworkChaosAttack {
    private final GremlinService gremlin;

    /**
     * @param gremlin the service that receives the attacks
     */
    public NetworkChaosAttack(GremlinService gremlin) {
        this.gremlin = gremlin;
    }

    /**
     * Adds a fixed delay to traffic for one host and port.
     *
     * @param latencyMs       delay value handed to the attack
     * @param durationMinutes attack length handed to the attack
     * @param targetHost      host to target
     * @param targetPort      port to target
     */
    public void executeLatencyAttack(int latencyMs, int durationMinutes,
                                     String targetHost, int targetPort) {
        final String endpoint = targetHost + ":" + targetPort;
        try {
            System.out.println("Starting latency attack: " + latencyMs + "ms to " + endpoint);
            final LatencyAttack delayInjection = new LatencyAttack()
                    .withDelay(latencyMs)
                    .withLength(durationMinutes)
                    .withTargetHosts(targetHost)
                    .withTargetPorts(targetPort);
            final String id = gremlin.attack(delayInjection);
            System.out.println("Latency attack started with ID: " + id);
        } catch (Exception failure) {
            System.err.println("Failed to execute latency attack: " + failure.getMessage());
        }
    }

    /**
     * Drops a percentage of packets to one host.
     *
     * @param lossPercentage  percentage handed to the attack
     * @param durationMinutes attack length handed to the attack
     * @param targetHost      host to target
     */
    public void executePacketLossAttack(int lossPercentage, int durationMinutes,
                                        String targetHost) {
        try {
            final StringBuilder banner = new StringBuilder("Starting packet loss attack: ");
            banner.append(lossPercentage).append("% to ").append(targetHost);
            System.out.println(banner);
            final PacketLossAttack dropInjection = new PacketLossAttack()
                    .withPercent(lossPercentage)
                    .withLength(durationMinutes)
                    .withTargetHosts(targetHost);
            final String id = gremlin.attack(dropInjection);
            System.out.println("Packet loss attack started with ID: " + id);
        } catch (Exception failure) {
            System.err.println("Failed to execute packet loss attack: " + failure.getMessage());
        }
    }

    /**
     * Blackholes traffic to the given hosts.
     *
     * @param durationMinutes attack length handed to the attack
     * @param targetHosts     hosts to target
     */
    public void executeBlackholeAttack(int durationMinutes, String... targetHosts) {
        try {
            System.out.println("Starting blackhole attack for " + durationMinutes + " minutes");
            final BlackholeAttack isolation = new BlackholeAttack()
                    .withLength(durationMinutes)
                    .withTargetHosts(targetHosts);
            final String id = gremlin.attack(isolation);
            System.out.println("Blackhole attack started with ID: " + id);
        } catch (Exception failure) {
            System.err.println("Failed to execute blackhole attack: " + failure.getMessage());
        }
    }

    /**
     * Launches a DNS attack against one DNS server.
     *
     * @param durationMinutes attack length handed to the attack
     * @param dnsServer       server to target
     */
    public void executeDnsAttack(int durationMinutes, String dnsServer) {
        try {
            System.out.println("Starting DNS attack on " + dnsServer);
            final DnsAttack resolverOutage = new DnsAttack()
                    .withLength(durationMinutes)
                    .withTargetHosts(dnsServer);
            final String id = gremlin.attack(resolverOutage);
            System.out.println("DNS attack started with ID: " + id);
        } catch (Exception failure) {
            System.err.println("Failed to execute DNS attack: " + failure.getMessage());
        }
    }

    /**
     * Walks the database host through four phases: latency, packet loss, blackhole, and a milder
     * latency-plus-loss phase, pausing two minutes between the first three. Stops early, with
     * the interrupt flag restored, if the calling thread is interrupted during a pause.
     */
    public void executeComplexNetworkScenario() {
        final String database = "database.example.com";
        final int databasePort = 5432;
        final long pauseMinutes = 2;

        System.out.println("Executing complex network failure scenario");
        try {
            // Phase 1: latency only.
            executeLatencyAttack(100, 3, database, databasePort);
            TimeUnit.MINUTES.sleep(pauseMinutes);

            // Phase 2: packet loss on top.
            executePacketLossAttack(10, 3, database);
            TimeUnit.MINUTES.sleep(pauseMinutes);

            // Phase 3: host unreachable.
            executeBlackholeAttack(2, database);
            TimeUnit.MINUTES.sleep(pauseMinutes);

            // Phase 4: milder faults while the system recovers.
            executeLatencyAttack(50, 2, database, databasePort);
            executePacketLossAttack(5, 2, database);
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /** Demo: two standalone attacks ten minutes apart, followed by the multi-phase scenario. */
    public static void main(String[] args) throws Exception {
        final NetworkChaosAttack attacks = new NetworkChaosAttack(GremlinConfig.configureGremlin());

        attacks.executeLatencyAttack(200, 5, "api.payment-service.com", 443);
        TimeUnit.MINUTES.sleep(10);

        attacks.executePacketLossAttack(15, 5, "cache.redis.com");
        TimeUnit.MINUTES.sleep(10);

        attacks.executeComplexNetworkScenario();
    }
}
State Attacks
Shutdown Attack
import com.gremlin.GremlinService;
import com.gremlin.scenarios.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * State-level experiments (shutdown, process kill, time travel, IO) issued through a
 * {@link GremlinService}, plus a scheduled multi-phase degradation scenario.
 *
 * <p>Instances own a scheduler; call {@link #shutdown()} when finished so its threads are
 * released.
 */
public class StateChaosAttack {
    private final GremlinService gremlin;
    private final ScheduledExecutorService scheduler;
    // The degradation scenario mixes in network faults; those launchers live on
    // NetworkChaosAttack, so delegate instead of calling methods this class does not define.
    private final NetworkChaosAttack networkAttack;

    /**
     * @param gremlin the service that receives the attacks
     */
    public StateChaosAttack(GremlinService gremlin) {
        this.gremlin = gremlin;
        this.scheduler = Executors.newScheduledThreadPool(3);
        this.networkAttack = new NetworkChaosAttack(gremlin);
    }

    /**
     * Schedules a shutdown (without reboot) of the target.
     *
     * @param delaySeconds delay handed to the attack
     */
    public void executeShutdownAttack(int delaySeconds) {
        try {
            System.out.println("Scheduling shutdown in " + delaySeconds + " seconds");
            ShutdownAttack shutdownAttack = new ShutdownAttack()
                    .withDelay(delaySeconds)
                    .withReboot(false);
            String attackId = gremlin.attack(shutdownAttack);
            System.out.println("Shutdown attack scheduled with ID: " + attackId);
        } catch (Exception e) {
            System.err.println("Failed to execute shutdown attack: " + e.getMessage());
        }
    }

    /**
     * Launches a process-kill attack.
     *
     * @param processName     process handed to the attack
     * @param durationMinutes attack length handed to the attack
     */
    public void executeProcessKillAttack(String processName, int durationMinutes) {
        try {
            System.out.println("Killing process: " + processName);
            ProcessKillAttack processKill = new ProcessKillAttack()
                    .withProcess(processName)
                    .withLength(durationMinutes);
            String attackId = gremlin.attack(processKill);
            System.out.println("Process kill attack started with ID: " + attackId);
        } catch (Exception e) {
            System.err.println("Failed to execute process kill attack: " + e.getMessage());
        }
    }

    /**
     * Launches a time-travel attack.
     *
     * @param timeShiftSeconds offset handed to the attack
     */
    public void executeTimeTravelAttack(int timeShiftSeconds) {
        try {
            System.out.println("Shifting time by " + timeShiftSeconds + " seconds");
            TimeTravelAttack timeTravel = new TimeTravelAttack()
                    .withOffsetSeconds(timeShiftSeconds);
            String attackId = gremlin.attack(timeTravel);
            System.out.println("Time travel attack started with ID: " + attackId);
        } catch (Exception e) {
            System.err.println("Failed to execute time travel attack: " + e.getMessage());
        }
    }

    /**
     * Launches an IO attack against a directory.
     *
     * @param directory       directory handed to the attack
     * @param blockCount      block count handed to the attack
     * @param durationMinutes attack length handed to the attack
     */
    public void executeIoAttack(String directory, int blockCount, int durationMinutes) {
        try {
            System.out.println("Executing IO attack on directory: " + directory);
            IoAttack ioAttack = new IoAttack()
                    .withDirectory(directory)
                    .withBlockCount(blockCount)
                    .withLength(durationMinutes);
            String attackId = gremlin.attack(ioAttack);
            System.out.println("IO attack started with ID: " + attackId);
        } catch (Exception e) {
            System.err.println("Failed to execute IO attack: " + e.getMessage());
        }
    }

    /**
     * Schedules a four-phase scenario on the internal scheduler and returns immediately:
     * latency at once, packet loss plus a process kill after 2 minutes, a blackhole after
     * 5 minutes, and a recovery placeholder after 7 minutes.
     */
    public void executeGracefulDegradationScenario() {
        System.out.println("Starting graceful degradation scenario");
        scheduler.schedule(() -> {
            // Phase 1: Introduce minor issues
            networkAttack.executeLatencyAttack(100, 5, "primary-db", 5432);
        }, 0, TimeUnit.SECONDS);
        scheduler.schedule(() -> {
            // Phase 2: Increase severity
            networkAttack.executePacketLossAttack(20, 4, "primary-db");
            executeProcessKillAttack("redis-server", 3);
        }, 2, TimeUnit.MINUTES);
        scheduler.schedule(() -> {
            // Phase 3: Critical failure
            networkAttack.executeBlackholeAttack(2, "primary-db");
        }, 5, TimeUnit.MINUTES);
        scheduler.schedule(() -> {
            // Phase 4: Recovery
            System.out.println("Initiating recovery procedures...");
            // Implement recovery logic here
        }, 7, TimeUnit.MINUTES);
    }

    /**
     * Stops the scheduler: waits up to five seconds for running tasks, then cancels whatever is
     * left. Restores the interrupt flag if interrupted while waiting.
     */
    public void shutdown() {
        scheduler.shutdown();
        try {
            if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) {
                scheduler.shutdownNow();
            }
        } catch (InterruptedException e) {
            scheduler.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /** Demo: two standalone attacks, then the scheduled scenario, always releasing the scheduler. */
    public static void main(String[] args) throws Exception {
        GremlinService gremlin = GremlinConfig.configureGremlin();
        StateChaosAttack stateAttack = new StateChaosAttack(gremlin);
        try {
            // Execute individual state attacks
            // NOTE(review): "java" also matches the JVM running this demo if it is a target
            // of the attack - confirm the intended process name before running.
            stateAttack.executeProcessKillAttack("java", 3);
            TimeUnit.MINUTES.sleep(5);
            stateAttack.executeIoAttack("/tmp", 1000, 5);
            TimeUnit.MINUTES.sleep(5);
            // Execute complex scenario
            stateAttack.executeGracefulDegradationScenario();
            // The last phase fires at 7 minutes; wait past it before shutting down.
            TimeUnit.MINUTES.sleep(10);
        } finally {
            stateAttack.shutdown();
        }
    }
}
Application-Level Chaos
Custom Application Chaos
import com.gremlin.GremlinService;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
/**
 * In-process fault injector: wraps operations and, while enabled, randomly adds latency or
 * throws one of the nested chaos exceptions in place of running the operation.
 */
public class ApplicationChaosEngine {
    private final GremlinService gremlin;
    private final AtomicBoolean chaosEnabled = new AtomicBoolean(false);
    private final Random random = new Random();

    // Chaos configuration. Volatile so a reconfiguration from one thread is seen by
    // threads that are executing wrapped operations.
    private volatile double errorRate = 0.1; // 10% error rate
    private volatile double latencyRate = 0.2; // 20% latency rate
    private volatile int maxLatencyMs = 2000;

    /**
     * @param gremlin the Gremlin service; stored but not used by the methods of this class
     */
    public ApplicationChaosEngine(GremlinService gremlin) {
        this.gremlin = gremlin;
    }

    /** Turns fault injection on. */
    public void enableChaos() {
        chaosEnabled.set(true);
        System.out.println("Application chaos enabled");
    }

    /** Turns fault injection off; wrapped operations then run untouched. */
    public void disableChaos() {
        chaosEnabled.set(false);
        System.out.println("Application chaos disabled");
    }

    /**
     * Replaces the injection parameters.
     *
     * <p>Rates are compared against a random value in {@code [0, 1)}: a rate of 0 or less never
     * triggers, a rate of 1 or more always triggers.
     *
     * @param errorRate    probability of throwing instead of running the operation
     * @param latencyRate  probability of sleeping before the operation
     * @param maxLatencyMs exclusive upper bound of the injected sleep in milliseconds; 0 disables
     *                     the sleep
     * @throws IllegalArgumentException if {@code maxLatencyMs} is negative
     */
    public void setChaosConfiguration(double errorRate, double latencyRate, int maxLatencyMs) {
        // Reject here rather than failing later, at random, inside a wrapped operation.
        if (maxLatencyMs < 0) {
            throw new IllegalArgumentException("maxLatencyMs must not be negative: " + maxLatencyMs);
        }
        this.errorRate = errorRate;
        this.latencyRate = latencyRate;
        this.maxLatencyMs = maxLatencyMs;
        System.out.printf("Chaos configuration updated: ErrorRate=%.2f, LatencyRate=%.2f, MaxLatency=%dms%n",
                errorRate, latencyRate, maxLatencyMs);
    }

    /**
     * Runs {@code operation}, possibly after an injected delay, or throws an injected error
     * instead of running it.
     *
     * @param operation the work to run
     * @param <T>       the operation's result type
     * @return the operation's result
     * @throws ChaosTimeoutException  injected failure
     * @throws ChaosNetworkException  injected failure
     * @throws ChaosBusinessException injected failure
     */
    public <T> T executeWithChaos(ChaosAwareOperation<T> operation) {
        if (!chaosEnabled.get()) {
            return operation.execute();
        }
        // Inject latency
        if (random.nextDouble() < latencyRate) {
            injectLatency();
        }
        // Inject error
        if (random.nextDouble() < errorRate) {
            return injectError(operation);
        }
        // Execute normally (with potential latency already injected)
        return operation.execute();
    }

    /** Sleeps for a random time below {@code maxLatencyMs}; a no-op when the bound is 0. */
    private void injectLatency() {
        int bound = maxLatencyMs; // single read so the check and the draw agree
        if (bound <= 0) {
            // Random.nextInt requires a positive bound.
            return;
        }
        int latency = random.nextInt(bound);
        System.out.println("Injecting latency: " + latency + "ms");
        try {
            Thread.sleep(latency);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Always throws: 30% timeout, 30% network, 40% business error. The operation is not run;
     * the generic return type only lets callers write {@code return injectError(op)}.
     */
    private <T> T injectError(ChaosAwareOperation<T> operation) {
        System.out.println("Injecting error into operation");
        // You can throw different types of exceptions based on configuration
        double errorType = random.nextDouble();
        if (errorType < 0.3) {
            throw new ChaosTimeoutException("Simulated timeout");
        } else if (errorType < 0.6) {
            throw new ChaosNetworkException("Simulated network error");
        } else {
            throw new ChaosBusinessException("Simulated business logic error");
        }
    }

    /** Placeholder: logs only, and only while chaos is enabled. */
    public void simulateDatabaseFailure() {
        if (!chaosEnabled.get()) return;
        System.out.println("Simulating database failure");
        // Implement database failure simulation
    }

    /** Placeholder: logs only, and only while chaos is enabled. */
    public void simulateExternalServiceFailure() {
        if (!chaosEnabled.get()) return;
        System.out.println("Simulating external service failure");
        // Implement external service failure simulation
    }

    /** Placeholder: logs only, and only while chaos is enabled. */
    public void simulateCacheFailure() {
        if (!chaosEnabled.get()) return;
        System.out.println("Simulating cache failure");
        // Implement cache failure simulation
    }

    // Custom exceptions for chaos engineering

    /** Injected failure standing in for a timeout. */
    public static class ChaosTimeoutException extends RuntimeException {
        public ChaosTimeoutException(String message) {
            super(message);
        }
    }

    /** Injected failure standing in for a network error. */
    public static class ChaosNetworkException extends RuntimeException {
        public ChaosNetworkException(String message) {
            super(message);
        }
    }

    /** Injected failure standing in for a business-logic error. */
    public static class ChaosBusinessException extends RuntimeException {
        public ChaosBusinessException(String message) {
            super(message);
        }
    }

    /** A unit of work that can be wrapped by {@link #executeWithChaos}. */
    @FunctionalInterface
    public interface ChaosAwareOperation<T> {
        T execute();
    }
}
// Example service using chaos engine
/**
 * Sample service whose operations all run through an {@link ApplicationChaosEngine}, so each
 * call may be delayed or fail while chaos is enabled.
 */
class OrderService {
    private final ApplicationChaosEngine chaosEngine;

    /**
     * @param chaosEngine engine that wraps every operation of this service
     */
    public OrderService(ApplicationChaosEngine chaosEngine) {
        this.chaosEngine = chaosEngine;
    }

    /**
     * Processes an order under chaos. The status lookup it performs is wrapped separately, so
     * faults can be injected at either level.
     *
     * @param orderId the order to process
     * @return a message describing whether the order was processed now or earlier
     */
    public String processOrder(String orderId) {
        return chaosEngine.executeWithChaos(() -> handleOrder(orderId));
    }

    /** Order-processing logic proper: look up the status and pick the matching message. */
    private String handleOrder(String orderId) {
        System.out.println("Processing order: " + orderId);
        boolean stillPending = "PENDING".equals(fetchOrderStatus(orderId));
        return stillPending ? "Order processed successfully" : "Order already processed";
    }

    /** Chaos-wrapped status lookup; the order id does not influence the simulated result. */
    private String fetchOrderStatus(String orderId) {
        return chaosEngine.executeWithChaos(this::queryStatus);
    }

    /** Simulated database call: a 100 ms pause followed by a coin-flip status. */
    private String queryStatus() {
        try {
            Thread.sleep(100); // stand-in for DB latency
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
        }
        if (Math.random() > 0.5) {
            return "PENDING";
        }
        return "COMPLETED";
    }

    /**
     * Computes an order total under chaos.
     *
     * @param orderId the order to price
     * @return a simulated total of at least 100.0 and below 150.0
     */
    public double calculateTotal(String orderId) {
        return chaosEngine.executeWithChaos(() -> priceOrder(orderId));
    }

    /** Simulated pricing: logs the order and returns a random amount. */
    private double priceOrder(String orderId) {
        System.out.println("Calculating total for order: " + orderId);
        double surcharge = Math.random() * 50.0;
        return 100.0 + surcharge;
    }
}
/**
 * Demo driver: pushes ten orders through {@link OrderService} with chaos enabled and reports
 * each outcome.
 */
public class ApplicationChaosExample {
    private static final int ORDER_COUNT = 10;
    private static final long PAUSE_BETWEEN_ORDERS_MS = 1000;

    public static void main(String[] args) throws Exception {
        GremlinService gremlin = GremlinConfig.configureGremlin();
        ApplicationChaosEngine engine = new ApplicationChaosEngine(gremlin);
        OrderService orders = new OrderService(engine);

        // Switch chaos on with 20% errors, 30% latency and a 1500 ms latency bound.
        engine.enableChaos();
        engine.setChaosConfiguration(0.2, 0.3, 1500);

        int attempt = 0;
        while (attempt < ORDER_COUNT) {
            submitOrder(orders, attempt);
            Thread.sleep(PAUSE_BETWEEN_ORDERS_MS);
            attempt++;
        }

        engine.disableChaos();
    }

    /** Processes and prices one order, printing the result or the injected failure. */
    private static void submitOrder(OrderService orders, int sequence) {
        try {
            String orderId = "ORDER-" + sequence;
            String outcome = orders.processOrder(orderId);
            double amount = orders.calculateTotal(orderId);
            System.out.println("Success: " + outcome + ", Total: " + amount);
        } catch (Exception failure) {
            System.err.println("Operation failed: " + failure.getMessage());
        }
    }
}
Automated Chaos Testing Framework
Chaos Test Runner
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Runs registered {@link ChaosTestScenario}s concurrently on a five-thread pool and collects
 * their outcomes. Call {@link #shutdown()} when finished so the pool's threads are released.
 */
public class ChaosTestRunner {
    private final GremlinService gremlin;
    private final ExecutorService executor;
    private final List<ChaosTestScenario> testScenarios;

    /**
     * @param gremlin the service handed to each scenario's chaos step
     */
    public ChaosTestRunner(GremlinService gremlin) {
        this.gremlin = gremlin;
        this.executor = Executors.newFixedThreadPool(5);
        this.testScenarios = new ArrayList<>();
    }

    /**
     * Registers a scenario for the next {@link #runAllTests()} call.
     *
     * @param scenario the scenario to add
     */
    public void addTestScenario(ChaosTestScenario scenario) {
        testScenarios.add(scenario);
    }

    /**
     * Submits every registered scenario and waits up to 30 minutes for each result.
     *
     * <p>Counts are computed per call, so running the suite again does not add to earlier
     * totals. A scenario that times out, crashes or is interrupted is cancelled and recorded as
     * a failed result, so it appears in the summary's failure list.
     *
     * @return the results of this run
     */
    public ChaosTestResults runAllTests() {
        System.out.println("Starting chaos test suite with " + testScenarios.size() + " scenarios");
        // Snapshot so futures and scenario names stay aligned by index.
        List<ChaosTestScenario> scenarios = new ArrayList<>(testScenarios);
        List<Future<ChaosTestResult>> futures = new ArrayList<>(scenarios.size());
        for (ChaosTestScenario scenario : scenarios) {
            futures.add(executor.submit(() -> runTestScenario(scenario)));
        }

        List<ChaosTestResult> results = new ArrayList<>(scenarios.size());
        int passed = 0;
        int failed = 0;
        for (int i = 0; i < futures.size(); i++) {
            Future<ChaosTestResult> future = futures.get(i);
            ChaosTestResult result;
            try {
                result = future.get(30, TimeUnit.MINUTES);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                result = abandon(future, scenarios.get(i), "interrupted while waiting for result");
            } catch (TimeoutException e) {
                result = abandon(future, scenarios.get(i), "timed out after 30 minutes");
            } catch (ExecutionException e) {
                result = abandon(future, scenarios.get(i), String.valueOf(e.getCause()));
            }
            results.add(result);
            if (result.isSuccess()) {
                passed++;
            } else {
                failed++;
            }
        }
        return new ChaosTestResults(results, passed, failed);
    }

    /** Cancels a scenario that produced no result and records it as failed with the reason. */
    private ChaosTestResult abandon(Future<ChaosTestResult> future,
                                    ChaosTestScenario scenario, String reason) {
        System.err.println("Test execution failed: " + reason);
        future.cancel(true);
        return new ChaosTestResult(scenario.getName(), false, 0L, reason);
    }

    /**
     * Runs one scenario through its phases: before, chaos, monitor, validate, after.
     * {@code afterTest()} runs even when an earlier phase throws, so cleanup is never skipped.
     */
    private ChaosTestResult runTestScenario(ChaosTestScenario scenario) {
        System.out.println("Running test scenario: " + scenario.getName());
        long startTime = System.currentTimeMillis();
        boolean success = false;
        String errorMessage = null;
        try {
            // Pre-test validation
            scenario.beforeTest();
            // Execute chaos attack
            scenario.executeChaos(gremlin);
            // Monitor system during attack
            scenario.monitorDuringAttack();
            // Validate system behavior
            success = scenario.validateAfterAttack();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the pool (e.g. after shutdownNow or cancel).
            Thread.currentThread().interrupt();
            errorMessage = e.getMessage();
            success = false;
        } catch (Exception e) {
            errorMessage = e.getMessage();
            success = false;
        } finally {
            // Cleanup
            try {
                scenario.afterTest();
            } catch (Exception e) {
                success = false;
                if (errorMessage == null) {
                    // Keep the first failure; it is the likelier root cause.
                    errorMessage = e.getMessage();
                }
            }
        }
        long duration = System.currentTimeMillis() - startTime;
        ChaosTestResult result = new ChaosTestResult(
                scenario.getName(), success, duration, errorMessage);
        System.out.println("Test scenario " + scenario.getName()
                + " completed: " + (success ? "PASS" : "FAIL"));
        return result;
    }

    /**
     * Stops the pool: waits up to five seconds for running scenarios, then cancels whatever is
     * left. Restores the interrupt flag if interrupted while waiting.
     */
    public void shutdown() {
        executor.shutdown();
        try {
            if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
                executor.shutdownNow();
            }
        } catch (InterruptedException e) {
            executor.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /** Lifecycle of one chaos test; the runner invokes the methods in declaration order. */
    public interface ChaosTestScenario {
        String getName();
        void beforeTest() throws Exception;
        void executeChaos(GremlinService gremlin) throws Exception;
        void monitorDuringAttack() throws Exception;
        boolean validateAfterAttack() throws Exception;
        void afterTest() throws Exception;
    }

    /** Immutable outcome of one scenario. */
    public static class ChaosTestResult {
        private final String scenarioName;
        private final boolean success;
        private final long durationMs;
        private final String errorMessage;

        /**
         * @param scenarioName name of the scenario
         * @param success      whether the scenario passed
         * @param durationMs   wall-clock duration in milliseconds
         * @param errorMessage failure detail; may be null
         */
        public ChaosTestResult(String scenarioName, boolean success,
                               long durationMs, String errorMessage) {
            this.scenarioName = scenarioName;
            this.success = success;
            this.durationMs = durationMs;
            this.errorMessage = errorMessage;
        }

        // Getters
        public String getScenarioName() { return scenarioName; }
        public boolean isSuccess() { return success; }
        public long getDurationMs() { return durationMs; }
        public String getErrorMessage() { return errorMessage; }
    }

    /** Aggregated outcome of one suite run. */
    public static class ChaosTestResults {
        private final List<ChaosTestResult> results;
        private final int totalTests;
        private final int passedTests;
        private final int failedTests;

        /**
         * @param results     individual results; copied, so later changes to the list are ignored
         * @param passedTests number of passed scenarios
         * @param failedTests number of failed scenarios
         */
        public ChaosTestResults(List<ChaosTestResult> results, int passedTests, int failedTests) {
            this.results = Collections.unmodifiableList(new ArrayList<>(results));
            this.totalTests = passedTests + failedTests;
            this.passedTests = passedTests;
            this.failedTests = failedTests;
        }

        /** Prints totals, the success rate and the list of failed scenarios to stdout. */
        public void printSummary() {
            System.out.println("\n=== CHAOS TEST SUMMARY ===");
            System.out.println("Total Tests: " + totalTests);
            System.out.println("Passed: " + passedTests);
            System.out.println("Failed: " + failedTests);
            if (totalTests == 0) {
                // Avoid printing "NaN%" for an empty suite.
                System.out.println("Success Rate: n/a");
            } else {
                System.out.println("Success Rate: " + (passedTests * 100.0 / totalTests) + "%");
            }
            if (failedTests > 0) {
                System.out.println("\nFailed Tests:");
                results.stream()
                        .filter(r -> !r.isSuccess())
                        .forEach(r -> System.out.println(" - " + r.getScenarioName()
                                + ": " + r.getErrorMessage()));
            }
        }
    }
}
// Example test scenario implementation
/**
 * Sample scenario: degrades the network path to the database host and then checks (with stub
 * checks) that the system recovered.
 */
class DatabaseResilienceScenario implements ChaosTestRunner.ChaosTestScenario {
    private static final String DATABASE_HOST = "database-host";
    private static final int DATABASE_PORT = 5432;
    private static final int HEALTH_CHECK_ROUNDS = 3;
    private static final long HEALTH_CHECK_INTERVAL_MS = 60000;

    @Override
    public String getName() {
        return "Database Resilience Test";
    }

    /** Announces setup and pauses two seconds as a stand-in for test-data and connection warm-up. */
    @Override
    public void beforeTest() throws Exception {
        System.out.println("Setting up database resilience test...");
        Thread.sleep(2000);
    }

    /** Starts a latency attack and then a packet-loss attack against the database host. */
    @Override
    public void executeChaos(GremlinService gremlin) throws Exception {
        System.out.println("Executing database chaos attacks...");
        NetworkChaosAttack network = new NetworkChaosAttack(gremlin);
        network.executeLatencyAttack(500, 3, DATABASE_HOST, DATABASE_PORT);
        network.executePacketLossAttack(30, 3, DATABASE_HOST);
    }

    /** Runs three health checks, pausing a minute after each one. */
    @Override
    public void monitorDuringAttack() throws Exception {
        System.out.println("Monitoring system during database chaos...");
        int remaining = HEALTH_CHECK_ROUNDS;
        while (remaining > 0) {
            checkDatabaseHealth();
            Thread.sleep(HEALTH_CHECK_INTERVAL_MS);
            remaining--;
        }
    }

    /**
     * Runs all three recovery checks (none is skipped) and passes only if every one passes.
     */
    @Override
    public boolean validateAfterAttack() throws Exception {
        System.out.println("Validating system after database chaos...");
        // Non-short-circuit &= so each check still runs after an earlier failure.
        boolean recovered = checkDatabaseConnections();
        recovered &= checkQueryPerformance();
        recovered &= verifyDataConsistency();
        return recovered;
    }

    /** Announces cleanup; the actual cleanup is left to the implementer. */
    @Override
    public void afterTest() throws Exception {
        System.out.println("Cleaning up after database resilience test...");
    }

    /** Stub health probe: logs only. */
    private void checkDatabaseHealth() {
        System.out.println("Checking database health...");
    }

    /** Stub: always reports connections as restored. */
    private boolean checkDatabaseConnections() {
        return true;
    }

    /** Stub: always reports query performance as recovered. */
    private boolean checkQueryPerformance() {
        return true;
    }

    /** Stub: always reports data as consistent. */
    private boolean verifyDataConsistency() {
        return true;
    }
}
/**
 * Demo driver: registers the sample scenario, runs the suite and prints the summary, releasing
 * the runner's thread pool whatever happens.
 */
public class AutomatedChaosTesting {
    public static void main(String[] args) throws Exception {
        GremlinService gremlin = GremlinConfig.configureGremlin();
        ChaosTestRunner runner = new ChaosTestRunner(gremlin);
        try {
            // Register scenarios; further ones can be added alongside this one.
            runner.addTestScenario(new DatabaseResilienceScenario());

            // Run the suite and report.
            runner.runAllTests().printSummary();
        } finally {
            runner.shutdown();
        }
    }
}
Best Practices and Safety
Safety Measures
import java.time.Clock;
import java.time.LocalTime;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
/**
 * Gatekeeper for chaos attacks: blocks forbidden targets, attacks outside business hours, and
 * attack types that need manual approval. All checks are bypassed while safety is disabled.
 */
public class ChaosSafetyController {
    private static final int BUSINESS_DAY_START_HOUR = 9;
    /** Exclusive: the business day ends when the clock reaches 17:00. */
    private static final int BUSINESS_DAY_END_HOUR = 17;
    private static final Set<String> DANGEROUS_ATTACK_TYPES = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList("SHUTDOWN", "BLACKHOLE", "PROCESS_KILL")));

    private final AtomicBoolean safetyEnabled = new AtomicBoolean(true);
    private final AtomicBoolean businessHoursOnly = new AtomicBoolean(true);
    private final Set<String> forbiddenTargets = ConcurrentHashMap.newKeySet();
    private final Clock clock;

    /** Creates a controller that evaluates business hours in the system default time zone. */
    public ChaosSafetyController() {
        this(Clock.systemDefaultZone());
    }

    /**
     * Creates a controller that evaluates business hours against the given clock.
     *
     * @param clock source of the current time and zone; lets callers pin a zone or a fixed time
     * @throws IllegalArgumentException if {@code clock} is null
     */
    public ChaosSafetyController(Clock clock) {
        if (clock == null) {
            throw new IllegalArgumentException("clock must not be null");
        }
        this.clock = clock;
        // Add critical systems to forbidden targets
        forbiddenTargets.add("production-database");
        forbiddenTargets.add("payment-gateway");
        forbiddenTargets.add("user-authentication");
    }

    /**
     * Decides whether an attack may run.
     *
     * @param attackType attack type name; matched case-insensitively against the dangerous types
     * @param target     attack target; matched exactly against the forbidden targets
     * @return {@code true} if safety is disabled or every check passes; {@code false} if the
     *         attack is blocked, including when either argument is null
     */
    public boolean canExecuteAttack(String attackType, String target) {
        if (!safetyEnabled.get()) {
            System.err.println("Safety controls are disabled!");
            return true;
        }
        // An attack that cannot be identified cannot be vetted, so block it
        // (a null target would otherwise make the concurrent set throw).
        if (attackType == null || target == null) {
            System.err.println("Attack type and target must both be specified");
            return false;
        }
        // Check if target is forbidden
        if (forbiddenTargets.contains(target)) {
            System.err.println("Target " + target + " is forbidden for chaos attacks");
            return false;
        }
        // Check business hours
        if (businessHoursOnly.get() && !isBusinessHours()) {
            System.err.println("Chaos attacks only allowed during business hours");
            return false;
        }
        // Check attack type restrictions
        if (isDangerousAttack(attackType)) {
            System.err.println("Attack type " + attackType + " requires manual approval");
            return false;
        }
        return true;
    }

    /** Re-enables all safety checks. */
    public void enableSafety() {
        safetyEnabled.set(true);
        System.out.println("Chaos safety controls enabled");
    }

    /** Disables all safety checks; every attack is then allowed. */
    public void disableSafety() {
        safetyEnabled.set(false);
        System.err.println("WARNING: Chaos safety controls disabled!");
    }

    /**
     * @param target target to forbid; must not be null
     */
    public void addForbiddenTarget(String target) {
        forbiddenTargets.add(target);
        System.out.println("Added " + target + " to forbidden targets");
    }

    /**
     * @param target target to allow again; must not be null
     */
    public void removeForbiddenTarget(String target) {
        forbiddenTargets.remove(target);
        System.out.println("Removed " + target + " from forbidden targets");
    }

    /**
     * Returns whether the clock's local time is within 09:00 (inclusive) to 17:00 (exclusive).
     * Only the hour of day is considered; weekdays and holidays are not.
     */
    private boolean isBusinessHours() {
        int hour = LocalTime.now(clock).getHour();
        return hour >= BUSINESS_DAY_START_HOUR && hour < BUSINESS_DAY_END_HOUR;
    }

    /** Returns whether the type needs manual approval, ignoring letter case. */
    private boolean isDangerousAttack(String attackType) {
        // Normalise so "shutdown" cannot slip past a check written for "SHUTDOWN".
        return DANGEROUS_ATTACK_TYPES.contains(attackType.toUpperCase(Locale.ROOT));
    }

    /**
     * Runs {@code attack} on the calling thread if {@link #canExecuteAttack} allows it;
     * otherwise reports the block on stderr and does nothing.
     *
     * @param attack     the attack to run
     * @param attackType attack type name used for the safety check
     * @param target     attack target used for the safety check
     */
    public void executeSafeAttack(Runnable attack, String attackType, String target) {
        if (canExecuteAttack(attackType, target)) {
            System.out.println("Executing safe attack: " + attackType + " on " + target);
            attack.run();
        } else {
            System.err.println("Attack blocked by safety controls: " + attackType + " on " + target);
        }
    }
}
Conclusion
Chaos Engineering with Gremlin in Java enables you to proactively test and improve your system's resilience. Key takeaways:
- Start Small: Begin with simple resource attacks before moving to complex scenarios
- Safety First: Always implement safety controls and run in non-production first
- Monitor Everything: Comprehensive monitoring is essential for understanding impact
- Automate Testing: Integrate chaos tests into your CI/CD pipeline
- Learn and Improve: Use results to identify and fix weaknesses in your system
By systematically introducing failures and observing how your system responds, you can build more robust and reliable applications that can withstand real-world failures.