Pumba for Container Chaos in Java

Overview

Pumba is a chaos testing tool for Docker containers that helps test system resilience by injecting failures like network delays, packet loss, container stops, and resource constraints. This guide shows how to integrate Pumba chaos testing into Java applications.

Architecture

Chaos Testing Components

  1. Pumba Controller: Manages chaos experiments
  2. Chaos Orchestrator: Coordinates chaos tests
  3. Monitoring: Tracks system behavior during chaos
  4. Recovery: Ensures system returns to normal state

Dependencies

<dependencies>
<!-- The Spring Boot starters and micrometer-core below declare no <version>;
     presumably a Spring Boot parent POM or BOM manages them - confirm in the enclosing pom.xml. -->
<!-- Docker Java API -->
<dependency>
<groupId>com.github.docker-java</groupId>
<artifactId>docker-java</artifactId>
<version>3.3.0</version>
</dependency>
<!-- Spring Boot -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Resilience4j for chaos-aware clients -->
<!-- NOTE(review): the @CircuitBreaker/@Retry/@TimeLimiter annotations used by the HTTP client
     are applied through Spring AOP; verify that spring-boot-starter-aop is on the classpath. -->
<dependency>
<groupId>io.github.resilience4j</groupId>
<artifactId>resilience4j-spring-boot2</artifactId>
<version>2.0.2</version>
</dependency>
<!-- Metrics and Monitoring -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-core</artifactId>
</dependency>
<!-- Testing -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

Core Implementation

1. Pumba Service Integration

/**
 * Launches Pumba as a child process to inject network, container and resource chaos
 * into a named Docker container, and records the outcome as Micrometer metrics.
 *
 * <p>Every public {@code inject*}/{@code stop*}/{@code stress*} method blocks the calling
 * thread until the Pumba process exits or its timeout elapses, then returns a
 * {@link ChaosResult}. Failures to launch or wait are reported as an unsuccessful
 * result with exit code {@code -1}; they are not thrown.
 */
@Service
@Slf4j
public class PumbaChaosService {

    /** Extra seconds granted on top of the experiment duration before Pumba is considered hung. */
    private static final int TIMEOUT_GRACE_SEC = 10;

    /** Maximum wait for one-shot commands (kill, rm) that are invoked with a duration of 0. */
    private static final int ONE_SHOT_TIMEOUT_SEC = 30;

    /** How long to wait for a graceful {@link Process#destroy()} before forcing termination. */
    private static final int FORCED_TERMINATION_WAIT_SEC = 5;

    /** Exit code reported when the process did not finish or could not be started. */
    private static final int EXIT_CODE_UNKNOWN = -1;

    private final DockerClient dockerClient;
    private final ChaosConfig chaosConfig;
    private final MeterRegistry meterRegistry;

    /** Pumba processes that are currently running, keyed by experiment id. */
    private final Map<String, Process> activeChaosProcesses;

    public PumbaChaosService(DockerClient dockerClient,
                             ChaosConfig chaosConfig,
                             MeterRegistry meterRegistry) {
        this.dockerClient = dockerClient;
        this.chaosConfig = chaosConfig;
        this.meterRegistry = meterRegistry;
        this.activeChaosProcesses = new ConcurrentHashMap<>();
    }

    // Network Chaos

    /** Adds {@code delayMs} milliseconds of network delay for {@code durationSec} seconds. */
    public ChaosResult injectNetworkDelay(String containerName, int delayMs, int durationSec) {
        return executePumbaCommand(containerName, "netem", "delay", delayMs + "ms", durationSec);
    }

    /** Drops {@code lossPercent} percent of packets for {@code durationSec} seconds. */
    public ChaosResult injectPacketLoss(String containerName, double lossPercent, int durationSec) {
        return executePumbaCommand(containerName, "netem", "loss", lossPercent + "%", durationSec);
    }

    /** Corrupts {@code corruptionPercent} percent of packets for {@code durationSec} seconds. */
    public ChaosResult injectNetworkCorruption(String containerName, double corruptionPercent, int durationSec) {
        return executePumbaCommand(containerName, "netem", "corrupt", corruptionPercent + "%", durationSec);
    }

    /** Limits the network rate to {@code rate} (passed to Pumba verbatim) for {@code durationSec} seconds. */
    public ChaosResult injectNetworkRate(String containerName, String rate, int durationSec) {
        return executePumbaCommand(containerName, "netem", "rate", rate, durationSec);
    }

    // Container Chaos

    /** Stops the container for {@code durationSec} seconds. */
    public ChaosResult stopContainer(String containerName, int durationSec) {
        // Extra time for restart
        return executePumbaCommand(containerName, "stop", "--duration", durationSec + "s", durationSec + 10);
    }

    /** Kills the container. One-shot: no duration is passed to Pumba. */
    public ChaosResult killContainer(String containerName) {
        return executePumbaCommand(containerName, "kill", null, null, 0);
    }

    /** Pauses the container for {@code durationSec} seconds. */
    public ChaosResult pauseContainer(String containerName, int durationSec) {
        return executePumbaCommand(containerName, "pause", "--duration", durationSec + "s", durationSec);
    }

    /** Force-removes the container. One-shot: no duration is passed to Pumba. */
    public ChaosResult removeContainer(String containerName) {
        return executePumbaCommand(containerName, "rm", "-f", null, 0);
    }

    // Resource Chaos

    /** Stresses the CPU with {@code workers} workers for {@code durationSec} seconds. */
    public ChaosResult stressCpu(String containerName, int workers, int durationSec) {
        return executePumbaCommand(containerName, "stress", "--cpu", workers + "", durationSec);
    }

    /** Stresses memory with the given {@code memory} amount (passed verbatim) for {@code durationSec} seconds. */
    public ChaosResult stressMemory(String containerName, String memory, int durationSec) {
        return executePumbaCommand(containerName, "stress", "--mem", memory, durationSec);
    }

    // Advanced Chaos Patterns

    /** Picks one concrete pattern at random and injects it. */
    public ChaosResult injectRandomChaos(String containerName, int durationSec) {
        ChaosPattern pattern = selectRandomChaosPattern();
        return injectChaosPattern(containerName, pattern, durationSec);
    }

    /**
     * Injects the given pattern using this class's built-in default intensities.
     *
     * @param containerName target container
     * @param pattern       pattern to inject; {@link ChaosPattern#RANDOM_CHAOS} delegates to
     *                      {@link #injectRandomChaos(String, int)}
     * @param durationSec   experiment duration in seconds (ignored by {@code CONTAINER_KILL})
     * @return the outcome of the Pumba invocation
     * @throws IllegalArgumentException if {@code pattern} is {@code null}
     */
    public ChaosResult injectChaosPattern(String containerName, ChaosPattern pattern, int durationSec) {
        if (pattern == null) {
            throw new IllegalArgumentException("Chaos pattern must not be null");
        }
        log.info("Injecting chaos pattern: {} for container: {}", pattern, containerName);
        switch (pattern) {
            case NETWORK_DELAY:
                return injectNetworkDelay(containerName, 1000, durationSec);
            case PACKET_LOSS:
                return injectPacketLoss(containerName, 30.0, durationSec);
            case NETWORK_CORRUPTION:
                return injectNetworkCorruption(containerName, 10.0, durationSec);
            case NETWORK_RATE_LIMIT:
                return injectNetworkRate(containerName, "100kbit", durationSec);
            case CONTAINER_STOP:
                return stopContainer(containerName, durationSec);
            case CONTAINER_KILL:
                return killContainer(containerName);
            case CONTAINER_PAUSE:
                return pauseContainer(containerName, durationSec);
            case CPU_STRESS:
                return stressCpu(containerName, 2, durationSec);
            case MEMORY_STRESS:
                return stressMemory(containerName, "256mb", durationSec);
            case RANDOM_CHAOS:
                return injectRandomChaos(containerName, durationSec);
            default:
                throw new IllegalArgumentException("Unknown chaos pattern: " + pattern);
        }
    }

    /**
     * Starts Pumba, waits for it to finish (bounded by a timeout) and converts the outcome
     * into a {@link ChaosResult}.
     *
     * <p>The process is always awaited before its exit code is read: commands with a
     * duration wait {@code durationSec + TIMEOUT_GRACE_SEC} seconds, one-shot commands
     * wait {@code ONE_SHOT_TIMEOUT_SEC} seconds. A process that outlives its timeout is
     * terminated and reported with exit code {@code -1}.
     */
    private ChaosResult executePumbaCommand(String containerName, String command,
                                            String subCommand, String value, int durationSec) {
        String experimentId = UUID.randomUUID().toString();
        long startTime = System.currentTimeMillis();
        Process process = null;
        try {
            List<String> pumbaArgs = buildPumbaCommand(
                    containerName, command, subCommand, value, durationSec);
            // Forward Pumba's output to this JVM's stdout so the child can never block on a
            // full, unread pipe buffer and its diagnostics stay visible.
            ProcessBuilder processBuilder = new ProcessBuilder(pumbaArgs)
                    .redirectErrorStream(true)
                    .redirectOutput(ProcessBuilder.Redirect.INHERIT);
            process = processBuilder.start();
            activeChaosProcesses.put(experimentId, process);

            long timeoutSec = durationSec > 0
                    ? (long) durationSec + TIMEOUT_GRACE_SEC
                    : ONE_SHOT_TIMEOUT_SEC;
            int exitCode;
            if (process.waitFor(timeoutSec, TimeUnit.SECONDS)) {
                exitCode = process.exitValue();
            } else {
                log.warn("Pumba process timed out for experiment: {}", experimentId);
                terminate(process);
                exitCode = EXIT_CODE_UNKNOWN;
            }

            ChaosResult result = new ChaosResult(
                    experimentId,
                    containerName,
                    command,
                    exitCode == 0,
                    exitCode,
                    System.currentTimeMillis() - startTime);
            recordChaosMetrics(result);
            return result;
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller and do not leave an untracked child behind.
            Thread.currentThread().interrupt();
            if (process != null) {
                process.destroy();
            }
            log.error("Failed to execute Pumba command for container: {}", containerName, e);
            return failedResult(experimentId, containerName, command, startTime);
        } catch (Exception e) {
            log.error("Failed to execute Pumba command for container: {}", containerName, e);
            return failedResult(experimentId, containerName, command, startTime);
        } finally {
            // Runs on every path so finished or failed experiments never linger in the map.
            activeChaosProcesses.remove(experimentId);
        }
    }

    /** Asks the process to exit and forces termination if it has not done so shortly after. */
    private void terminate(Process process) throws InterruptedException {
        process.destroy();
        if (!process.waitFor(FORCED_TERMINATION_WAIT_SEC, TimeUnit.SECONDS)) {
            process.destroyForcibly();
        }
    }

    /** Builds the result reported when Pumba could not be started or awaited. */
    private ChaosResult failedResult(String experimentId, String containerName,
                                     String command, long startTime) {
        return new ChaosResult(
                experimentId,
                containerName,
                command,
                false,
                EXIT_CODE_UNKNOWN,
                System.currentTimeMillis() - startTime);
    }

    /**
     * Assembles the argument vector:
     * {@code pumba [--duration Ns] <command> [<subCommand> [<value>]] <containerName>}.
     *
     * <p>NOTE(review): this layout should be checked against the Pumba CLI reference. It
     * places {@code --duration} before the command, repeats it for stop/pause (where the
     * caller also passes it as {@code subCommand}) and passes netem/stress values
     * positionally; Pumba may expect per-command flags instead - TODO confirm.
     */
    private List<String> buildPumbaCommand(String containerName, String command,
                                           String subCommand, String value, int durationSec) {
        List<String> args = new ArrayList<>();
        args.add("pumba");
        if (durationSec > 0) {
            args.add("--duration");
            args.add(durationSec + "s");
        }
        args.add(command);
        if (subCommand != null) {
            args.add(subCommand);
            if (value != null) {
                args.add(value);
            }
        }
        args.add(containerName);
        return args;
    }

    /** Chooses uniformly among all concrete patterns; the RANDOM_CHAOS meta-pattern is excluded. */
    private ChaosPattern selectRandomChaosPattern() {
        ChaosPattern[] all = ChaosPattern.values();
        List<ChaosPattern> candidates = new ArrayList<>(all.length);
        for (ChaosPattern candidate : all) {
            if (candidate != ChaosPattern.RANDOM_CHAOS) {
                candidates.add(candidate);
            }
        }
        return candidates.get(ThreadLocalRandom.current().nextInt(candidates.size()));
    }

    /** Increments the execution counter and records the experiment's wall-clock duration. */
    private void recordChaosMetrics(ChaosResult result) {
        meterRegistry.counter("chaos.experiments.executed",
                "container", result.getContainerName(),
                "command", result.getCommand(),
                "success", String.valueOf(result.isSuccess()))
                .increment();
        meterRegistry.timer("chaos.experiments.duration",
                "container", result.getContainerName(),
                "command", result.getCommand())
                .record(Duration.ofMillis(result.getDurationMs()));
    }

    /** Terminates the Pumba process of a running experiment; unknown ids are ignored. */
    public void stopChaosExperiment(String experimentId) {
        Process process = activeChaosProcesses.remove(experimentId);
        if (process != null) {
            process.destroy();
            log.info("Stopped chaos experiment: {}", experimentId);
        }
    }

    /** Returns a snapshot of the tracked Pumba processes and whether each is still alive. */
    public List<ActiveChaosExperiment> getActiveExperiments() {
        return activeChaosProcesses.entrySet().stream()
                .map(entry -> new ActiveChaosExperiment(
                        entry.getKey(),
                        entry.getValue().isAlive()))
                .collect(Collectors.toList());
    }
}

2. Chaos Configuration and Models

/**
 * Chaos-testing settings bound from configuration properties under the {@code chaos} prefix.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Configuration
@ConfigurationProperties(prefix = "chaos")
@Data
public class ChaosConfig {
// Master switch; the orchestrator refuses to run or schedule experiments while this is false.
private boolean enabled = false;
// Pumba image name. Not read by any code visible in this file.
private String pumbaImage = "gaiaadm/pumba";
// Chance, per scheduler tick, that the orchestrator triggers a random experiment.
private double failureProbability = 0.1;
// Containers the orchestrator picks from when triggering random chaos.
private List<String> targetContainers = new ArrayList<>();
// Named profiles. Not read by any code visible in this file.
private Map<String, ChaosProfile> profiles = new HashMap<>();
private Duration defaultDuration = Duration.ofMinutes(5);
// NOTE(review): no visible code enforces this limit - confirm where it is applied.
private int maxConcurrentExperiments = 3;
}
/**
 * Immutable outcome of one Pumba invocation. Lombok's {@code @Data} generates the getters
 * and a constructor taking the uninitialised final fields in declaration order.
 */
@Data
public class ChaosResult {
private final String experimentId;
private final String containerName;
// Top-level Pumba command that was run (for example "netem" or "stop").
private final String command;
// The service sets this to true only when the process exit code was 0.
private final boolean success;
// Process exit code; the service passes -1 when the command could not be executed.
private final int exitCode;
// Wall-clock time from launching the command to building this result, in milliseconds.
private final long durationMs;
// Captured when this object is constructed, i.e. after the command has finished.
private final Instant timestamp = Instant.now();
}
/**
 * Kinds of chaos an experiment can request. The grouping below follows the constant names:
 * network faults, container lifecycle faults, resource stress, and a random selector.
 */
public enum ChaosPattern {
// Network faults
NETWORK_DELAY,
PACKET_LOSS,
NETWORK_CORRUPTION,
NETWORK_RATE_LIMIT,
// Container lifecycle faults
CONTAINER_STOP,
CONTAINER_KILL,
CONTAINER_PAUSE,
// Resource stress
CPU_STRESS,
MEMORY_STRESS,
// Set by the orchestrator on requests it generates for random chaos.
RANDOM_CHAOS
}
/**
 * A named bundle of chaos patterns with a duration and intensity, configurable under
 * {@code chaos.profiles}. No code visible in this file executes a profile yet.
 */
@Data
public class ChaosProfile {
private String name;
private List<ChaosPattern> patterns;
private Duration duration;
private double intensity; // 0.0 to 1.0
// Free-form key/value settings; their meaning is not defined by any visible code.
private Map<String, String> parameters;
}
/**
 * Snapshot view of a tracked Pumba process: its experiment id and whether the process
 * was alive when the snapshot was taken.
 */
@Data
public class ActiveChaosExperiment {
private final String experimentId;
private final boolean active;
// NOTE: this is the instant this snapshot object was constructed, not the moment the
// underlying experiment was launched.
private final Instant startTime = Instant.now();
}
/**
 * Request describing one chaos experiment; used as the JSON request body of the chaos
 * REST endpoints and built directly by the orchestrator for random chaos.
 */
@Data
public class ChaosExperimentRequest {
private String containerName;
// Pattern to inject; the orchestrator ignores it when random is true.
private ChaosPattern pattern;
// The orchestrator converts this to whole seconds before invoking Pumba.
private Duration duration;
// Not read by any code visible in this file.
private Map<String, String> parameters;
// When true, the orchestrator asks the Pumba service to pick a pattern at random.
private boolean random = false;
}

3. Chaos Orchestrator

@Service
@Slf4j
public class ChaosOrchestrator {
private final PumbaChaosService pumbaService;
private final ChaosConfig chaosConfig;
private final ApplicationHealthService healthService;
private final ScheduledExecutorService scheduler;
private final Map<String, ScheduledFuture<?>> scheduledExperiments;
public ChaosOrchestrator(PumbaChaosService pumbaService,
ChaosConfig chaosConfig,
ApplicationHealthService healthService) {
this.pumbaService = pumbaService;
this.chaosConfig = chaosConfig;
this.healthService = healthService;
this.scheduler = Executors.newScheduledThreadPool(5);
this.scheduledExperiments = new ConcurrentHashMap<>();
}
@PostConstruct
public void initialize() {
if (chaosConfig.isEnabled()) {
log.info("Chaos testing enabled. Starting chaos orchestrator.");
scheduleRandomChaosExperiments();
} else {
log.info("Chaos testing disabled.");
}
}
@PreDestroy
public void cleanup() {
scheduler.shutdown();
scheduledExperiments.values().forEach(future -> future.cancel(true));
}
public ChaosExperimentResponse scheduleExperiment(ChaosExperimentRequest request) {
if (!chaosConfig.isEnabled()) {
return ChaosExperimentResponse.disabled(request.getContainerName());
}
String experimentId = UUID.randomUUID().toString();
ScheduledFuture<?> future = scheduler.schedule(() -> {
executeChaosExperiment(experimentId, request);
}, ThreadLocalRandom.current().nextInt(30, 120), TimeUnit.SECONDS);
scheduledExperiments.put(experimentId, future);
return new ChaosExperimentResponse(
experimentId,
request.getContainerName(),
request.getPattern(),
ChaosExperimentStatus.SCHEDULED,
"Experiment scheduled to run in 30-120 seconds"
);
}
public ChaosExperimentResponse runExperiment(ChaosExperimentRequest request) {
if (!chaosConfig.isEnabled()) {
return ChaosExperimentResponse.disabled(request.getContainerName());
}
String experimentId = UUID.randomUUID().toString();
executeChaosExperiment(experimentId, request);
return new ChaosExperimentResponse(
experimentId,
request.getContainerName(),
request.getPattern(),
ChaosExperimentStatus.COMPLETED,
"Experiment executed immediately"
);
}
public void scheduleRecurringExperiment(String experimentName, 
ChaosExperimentRequest request, 
Duration interval) {
ScheduledFuture<?> future = scheduler.scheduleAtFixedRate(() -> {
String experimentId = UUID.randomUUID().toString();
executeChaosExperiment(experimentId, request);
}, interval.toSeconds(), interval.toSeconds(), TimeUnit.SECONDS);
scheduledExperiments.put(experimentName, future);
}
private void executeChaosExperiment(String experimentId, ChaosExperimentRequest request) {
try {
log.info("Executing chaos experiment {} on container {}", 
experimentId, request.getContainerName());
// Check system health before chaos
if (!healthService.isSystemHealthy()) {
log.warn("System not healthy, skipping chaos experiment");
return;
}
ChaosResult result;
if (request.isRandom()) {
result = pumbaService.injectRandomChaos(
request.getContainerName(),
(int) request.getDuration().getSeconds()
);
} else {
result = pumbaService.injectChaosPattern(
request.getContainerName(),
request.getPattern(),
(int) request.getDuration().getSeconds()
);
}
log.info("Chaos experiment {} completed with success: {}", 
experimentId, result.isSuccess());
// Monitor recovery
monitorRecovery(request.getContainerName(), experimentId);
} catch (Exception e) {
log.error("Chaos experiment {} failed", experimentId, e);
}
}
private void scheduleRandomChaosExperiments() {
if (chaosConfig.getFailureProbability() > 0) {
scheduler.scheduleAtFixedRate(() -> {
if (ThreadLocalRandom.current().nextDouble() < chaosConfig.getFailureProbability()) {
triggerRandomChaos();
}
}, 1, 5, TimeUnit.MINUTES);
}
}
private void triggerRandomChaos() {
if (chaosConfig.getTargetContainers().isEmpty()) {
return;
}
String randomContainer = chaosConfig.getTargetContainers().get(
ThreadLocalRandom.current().nextInt(chaosConfig.getTargetContainers().size())
);
ChaosExperimentRequest request = new ChaosExperimentRequest();
request.setContainerName(randomContainer);
request.setPattern(ChaosPattern.RANDOM_CHAOS);
request.setDuration(Duration.ofMinutes(2));
request.setRandom(true);
runExperiment(request);
}
private void monitorRecovery(String containerName, String experimentId) {
scheduler.schedule(() -> {
boolean recovered = healthService.isContainerHealthy(containerName);
if (recovered) {
log.info("Container {} recovered from chaos experiment {}", 
containerName, experimentId);
} else {
log.warn("Container {} still unhealthy after chaos experiment {}", 
containerName, experimentId);
}
}, 1, TimeUnit.MINUTES);
}
public boolean cancelExperiment(String experimentId) {
ScheduledFuture<?> future = scheduledExperiments.get(experimentId);
if (future != null) {
boolean cancelled = future.cancel(false);
scheduledExperiments.remove(experimentId);
return cancelled;
}
return false;
}
public List<ScheduledExperiment> getScheduledExperiments() {
return scheduledExperiments.entrySet().stream()
.map(entry -> new ScheduledExperiment(
entry.getKey(),
!entry.getValue().isDone(),
entry.getValue().getDelay(TimeUnit.SECONDS)
))
.collect(Collectors.toList());
}
}

Chaos-Aware Application Components

1. Resilient HTTP Client

/**
 * HTTP client wrapper that runs {@link RestTemplate} calls asynchronously behind the
 * Resilience4j "httpClient" retry, circuit breaker and time limiter, and records
 * request metrics.
 */
@Service
@Slf4j
public class ChaosAwareHttpClient {
    private final RestTemplate restTemplate;
    private final CircuitBreakerRegistry circuitBreakerRegistry;
    private final RetryRegistry retryRegistry;
    private final TimeLimiterRegistry timeLimiterRegistry;
    private final MeterRegistry meterRegistry;

    public ChaosAwareHttpClient(RestTemplate restTemplate,
                                CircuitBreakerRegistry circuitBreakerRegistry,
                                RetryRegistry retryRegistry,
                                TimeLimiterRegistry timeLimiterRegistry,
                                MeterRegistry meterRegistry) {
        this.restTemplate = restTemplate;
        this.circuitBreakerRegistry = circuitBreakerRegistry;
        this.retryRegistry = retryRegistry;
        this.timeLimiterRegistry = timeLimiterRegistry;
        this.meterRegistry = meterRegistry;
    }

    /**
     * Executes an HTTP exchange asynchronously with retry, circuit breaking and a time limit.
     *
     * <p>The fallback is attached to {@code @Retry} only. Retry is the outermost
     * Resilience4j aspect, so a fallback on the inner circuit breaker would turn each
     * failure into a normal result before the retry could see it, and no retry would
     * ever happen.
     *
     * <p>NOTE(review): the request runs on {@code CompletableFuture}'s default executor;
     * consider a dedicated executor for blocking I/O.
     *
     * @param serviceName  logical service name used as a metric tag
     * @param method       HTTP method
     * @param url          target URL
     * @param request      request body, may be {@code null}
     * @param responseType expected response body type
     * @return a future completing with the response body, or with the fallback value
     *         once retries are exhausted
     */
    @Retry(name = "httpClient", fallbackMethod = "fallbackAsync")
    @CircuitBreaker(name = "httpClient")
    @TimeLimiter(name = "httpClient")
    public <T> CompletableFuture<T> executeWithResilience(String serviceName,
                                                          HttpMethod method,
                                                          String url,
                                                          Object request,
                                                          Class<T> responseType) {
        return CompletableFuture.supplyAsync(() -> {
            long startTime = System.currentTimeMillis();
            try {
                ResponseEntity<T> response = restTemplate.exchange(
                        url, method, createHttpEntity(request), responseType);
                recordMetrics(serviceName, true, System.currentTimeMillis() - startTime);
                return response.getBody();
            } catch (Exception e) {
                recordMetrics(serviceName, false, System.currentTimeMillis() - startTime);
                throw new ResilientClientException("HTTP request failed", e);
            }
        });
    }

    /**
     * Fallback invoked by Resilience4j. Its return type matches
     * {@link #executeWithResilience}, which Resilience4j requires in order to resolve a
     * fallback method; it delegates to {@link #fallback}.
     */
    public <T> CompletableFuture<T> fallbackAsync(String serviceName, HttpMethod method, String url,
                                                  Object request, Class<T> responseType, Exception e) {
        return CompletableFuture.completedFuture(
                fallback(serviceName, method, url, request, responseType, e));
    }

    /**
     * Logs the failure, counts it and produces the default response.
     *
     * <p>NOTE(review): the raw URL is used as a metric tag, which can create a very large
     * number of time series when URLs contain ids or query strings.
     *
     * @return always {@code null}
     */
    public <T> T fallback(String serviceName, HttpMethod method, String url,
                          Object request, Class<T> responseType, Exception e) {
        log.warn("Fallback triggered for {} request to {}: {}",
                method, url, e.getMessage());
        meterRegistry.counter("http.client.fallback",
                "service", serviceName,
                "url", url)
                .increment();
        // Return default response or throw specific exception
        return null;
    }

    /** Wraps the body in an entity with JSON content type and this client's User-Agent. */
    private HttpEntity<Object> createHttpEntity(Object body) {
        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.APPLICATION_JSON);
        headers.set("User-Agent", "Chaos-Aware-Client/1.0");
        return new HttpEntity<>(body, headers);
    }

    /** Counts the request by outcome and records its duration in milliseconds. */
    private void recordMetrics(String serviceName, boolean success, long duration) {
        meterRegistry.counter("http.client.requests",
                "service", serviceName,
                "success", String.valueOf(success))
                .increment();
        meterRegistry.timer("http.client.duration",
                "service", serviceName)
                .record(Duration.ofMillis(duration));
    }
}

2. Application Health Service

/**
 * Reports the health of the overall system and of individual Docker containers, caching
 * the most recent per-container status.
 */
@Service
@Slf4j
public class ApplicationHealthService {
    private final DockerClient dockerClient;

    /** When each container was last checked successfully. */
    private final Map<String, Instant> containerLastCheck;

    /** Most recent status per container; also the set that the periodic refresh re-checks. */
    private final Map<String, HealthStatus> containerHealth;

    public ApplicationHealthService(DockerClient dockerClient) {
        this.dockerClient = dockerClient;
        this.containerLastCheck = new ConcurrentHashMap<>();
        this.containerHealth = new ConcurrentHashMap<>();
    }

    /** Returns {@code true} when database, message broker and cache all report healthy. */
    public boolean isSystemHealthy() {
        // Check critical services
        return isDatabaseHealthy() && isMessageBrokerHealthy() && isCacheHealthy();
    }

    /**
     * Checks whether the container with exactly this name is running and not flagged
     * unhealthy or paused, and caches the verdict.
     *
     * <p>A container that cannot be found is recorded as {@link HealthStatus#UNHEALTHY};
     * a failure to query Docker is recorded as {@link HealthStatus#UNKNOWN}.
     *
     * @param containerName container name without the leading slash
     * @return {@code true} only for a running, healthy container
     */
    public boolean isContainerHealthy(String containerName) {
        if (containerName == null) {
            // The status maps cannot hold a null key.
            return false;
        }
        try {
            List<Container> containers = dockerClient.listContainersCmd()
                    .withNameFilter(List.of(containerName))
                    .exec();
            // Docker's name filter matches substrings, so pick the exact container.
            Container container = findByExactName(containers, containerName);
            boolean healthy = container != null && isHealthyStatus(container.getStatus());
            containerHealth.put(containerName,
                    healthy ? HealthStatus.HEALTHY : HealthStatus.UNHEALTHY);
            containerLastCheck.put(containerName, Instant.now());
            return healthy;
        } catch (Exception e) {
            log.error("Failed to check container health: {}", containerName, e);
            containerHealth.put(containerName, HealthStatus.UNKNOWN);
            return false;
        }
    }

    /** Returns the container whose name equals {@code containerName}, or {@code null}. */
    private static Container findByExactName(List<Container> containers, String containerName) {
        // The Docker API reports names with a leading slash.
        String slashed = "/" + containerName;
        for (Container candidate : containers) {
            String[] names = candidate.getNames();
            if (names == null) {
                continue;
            }
            for (String name : names) {
                if (slashed.equals(name) || containerName.equals(name)) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Interprets a Docker status line. Only "Up ..." counts as healthy, and not when it is
     * annotated "(unhealthy)" or "(Paused)". Created, Restarting, Exited and Dead
     * containers are therefore unhealthy.
     */
    private static boolean isHealthyStatus(String status) {
        return status != null
                && status.startsWith("Up")
                && !status.contains("(unhealthy)")
                && !status.contains("(Paused)");
    }

    /** Builds a per-service health report; healthy overall only if every service is healthy. */
    public SystemHealth getSystemHealth() {
        Map<String, HealthStatus> services = new HashMap<>();
        services.put("database", checkDatabaseHealth());
        services.put("message-broker", checkMessageBrokerHealth());
        services.put("cache", checkCacheHealth());
        boolean overallHealthy = services.values().stream()
                .allMatch(status -> status == HealthStatus.HEALTHY);
        return new SystemHealth(overallHealthy, services, Instant.now());
    }

    /** Placeholder: always reports healthy until a real database check is implemented. */
    private HealthStatus checkDatabaseHealth() {
        // Implement database health check
        return HealthStatus.HEALTHY;
    }

    /** Placeholder: always reports healthy until a real broker check is implemented. */
    private HealthStatus checkMessageBrokerHealth() {
        // Implement message broker health check
        return HealthStatus.HEALTHY;
    }

    /** Placeholder: always reports healthy until a real cache check is implemented. */
    private HealthStatus checkCacheHealth() {
        // Implement cache health check
        return HealthStatus.HEALTHY;
    }

    private boolean isDatabaseHealthy() {
        return checkDatabaseHealth() == HealthStatus.HEALTHY;
    }

    private boolean isMessageBrokerHealthy() {
        return checkMessageBrokerHealth() == HealthStatus.HEALTHY;
    }

    private boolean isCacheHealthy() {
        return checkCacheHealth() == HealthStatus.HEALTHY;
    }

    /** Re-checks every container that has been checked before. */
    @Scheduled(fixedRate = 30000) // Every 30 seconds
    public void refreshHealthStatus() {
        containerHealth.keySet().forEach(this::isContainerHealthy);
    }
}
/** Health verdict for a service or container. */
public enum HealthStatus {
HEALTHY,
UNHEALTHY,
// Recorded by the health service when the check itself failed with an exception.
UNKNOWN
}
/**
 * Immutable system health report. Lombok's {@code @Data} generates the getters and a
 * constructor taking the fields in declaration order.
 */
@Data
public class SystemHealth {
// The health service sets this to true only when every entry in services is HEALTHY.
private final boolean healthy;
// Status per service name (the health service uses "database", "message-broker", "cache").
private final Map<String, HealthStatus> services;
// When the report was produced.
private final Instant timestamp;
}

REST API for Chaos Management

1. Chaos Controller

/**
 * REST API under {@code /api/chaos} for running, scheduling, cancelling and inspecting
 * chaos experiments.
 *
 * <p>NOTE(review): the direct {@code /network/*} and {@code /container/*} endpoints call the
 * Pumba service without consulting the {@code chaos.enabled} switch that the orchestrator
 * enforces - confirm whether that is intended.
 */
@RestController
@RequestMapping("/api/chaos")
@Slf4j
public class ChaosController {
    private final ChaosOrchestrator chaosOrchestrator;
    private final PumbaChaosService pumbaService;
    private final ApplicationHealthService healthService;

    public ChaosController(ChaosOrchestrator chaosOrchestrator,
                           PumbaChaosService pumbaService,
                           ApplicationHealthService healthService) {
        this.chaosOrchestrator = chaosOrchestrator;
        this.pumbaService = pumbaService;
        this.healthService = healthService;
    }

    /** Runs an experiment synchronously; the response is sent after it has finished. */
    @PostMapping("/experiments")
    public ResponseEntity<ChaosExperimentResponse> createExperiment(
            @RequestBody ChaosExperimentRequest request) {
        ChaosExperimentResponse response = chaosOrchestrator.runExperiment(request);
        return ResponseEntity.status(HttpStatus.ACCEPTED).body(response);
    }

    /** Schedules an experiment to start later and returns its id immediately. */
    @PostMapping("/experiments/scheduled")
    public ResponseEntity<ChaosExperimentResponse> scheduleExperiment(
            @RequestBody ChaosExperimentRequest request) {
        ChaosExperimentResponse response = chaosOrchestrator.scheduleExperiment(request);
        return ResponseEntity.status(HttpStatus.ACCEPTED).body(response);
    }

    /** Cancels a scheduled experiment: 204 when cancelled, 404 otherwise. */
    @DeleteMapping("/experiments/{experimentId}")
    public ResponseEntity<Void> cancelExperiment(@PathVariable String experimentId) {
        boolean cancelled = chaosOrchestrator.cancelExperiment(experimentId);
        if (cancelled) {
            return ResponseEntity.noContent().build();
        } else {
            return ResponseEntity.notFound().build();
        }
    }

    /**
     * Injects network delay. Responds 400 when the container is blank or when
     * {@code delayMs} or {@code durationSec} is not positive.
     */
    @PostMapping("/network/delay")
    public ResponseEntity<ChaosResult> injectNetworkDelay(
            @RequestParam String container,
            @RequestParam(defaultValue = "1000") int delayMs,
            @RequestParam(defaultValue = "60") int durationSec) {
        if (!isValidTarget(container, durationSec) || delayMs <= 0) {
            return ResponseEntity.badRequest().build();
        }
        ChaosResult result = pumbaService.injectNetworkDelay(container, delayMs, durationSec);
        return ResponseEntity.ok(result);
    }

    /**
     * Injects packet loss. Responds 400 when the container is blank, {@code durationSec}
     * is not positive, or {@code lossPercent} is outside (0, 100].
     */
    @PostMapping("/network/packet-loss")
    public ResponseEntity<ChaosResult> injectPacketLoss(
            @RequestParam String container,
            @RequestParam(defaultValue = "30.0") double lossPercent,
            @RequestParam(defaultValue = "60") int durationSec) {
        // Written as a negated range check so that NaN is rejected as well.
        boolean percentInRange = lossPercent > 0 && lossPercent <= 100;
        if (!isValidTarget(container, durationSec) || !percentInRange) {
            return ResponseEntity.badRequest().build();
        }
        ChaosResult result = pumbaService.injectPacketLoss(container, lossPercent, durationSec);
        return ResponseEntity.ok(result);
    }

    /**
     * Stops a container for a while. Responds 400 when the container is blank or
     * {@code durationSec} is not positive.
     */
    @PostMapping("/container/stop")
    public ResponseEntity<ChaosResult> stopContainer(
            @RequestParam String container,
            @RequestParam(defaultValue = "30") int durationSec) {
        if (!isValidTarget(container, durationSec)) {
            return ResponseEntity.badRequest().build();
        }
        ChaosResult result = pumbaService.stopContainer(container, durationSec);
        return ResponseEntity.ok(result);
    }

    /** Returns the current per-service health report. */
    @GetMapping("/health")
    public ResponseEntity<SystemHealth> getSystemHealth() {
        SystemHealth health = healthService.getSystemHealth();
        return ResponseEntity.ok(health);
    }

    /** Lists Pumba processes that are currently tracked as running. */
    @GetMapping("/experiments/active")
    public ResponseEntity<List<ActiveChaosExperiment>> getActiveExperiments() {
        List<ActiveChaosExperiment> experiments = pumbaService.getActiveExperiments();
        return ResponseEntity.ok(experiments);
    }

    /** Lists scheduled and recurring experiments. */
    @GetMapping("/experiments/scheduled")
    public ResponseEntity<List<ScheduledExperiment>> getScheduledExperiments() {
        List<ScheduledExperiment> experiments = chaosOrchestrator.getScheduledExperiments();
        return ResponseEntity.ok(experiments);
    }

    /** Not implemented yet: always responds 501. */
    @PostMapping("/profiles/{profileName}")
    public ResponseEntity<ChaosExperimentResponse> runChaosProfile(
            @PathVariable String profileName,
            @RequestParam String container) {
        // Look up profile and execute
        // Implementation would fetch profile from configuration
        // and execute the defined chaos patterns
        return ResponseEntity.status(HttpStatus.NOT_IMPLEMENTED).build();
    }

    /** A usable target has a non-blank container name and a positive duration. */
    private static boolean isValidTarget(String container, int durationSec) {
        return container != null && !container.trim().isEmpty() && durationSec > 0;
    }
}

Testing Chaos Scenarios

1. Chaos Test Scenarios

/**
 * Integration scenarios that inject chaos through the orchestrator and then check that the
 * application keeps responding or recovers. Chaos is enabled for this context via test
 * properties.
 */
@SpringBootTest
@TestPropertySource(properties = {
        "chaos.enabled=true",
        "chaos.target-containers=app-service,db-service,cache-service"
})
class ChaosTestScenarios {

    @Autowired
    private ChaosOrchestrator chaosOrchestrator;

    @Autowired
    private ApplicationHealthService healthService;

    @Autowired
    private ChaosAwareHttpClient httpClient;

    /** A 30-second network delay on app-service must not prevent a resilient GET from answering. */
    @Test
    void testNetworkDelayResilience() throws Exception {
        chaosOrchestrator.runExperiment(
                patternRequest("app-service", ChaosPattern.NETWORK_DELAY, Duration.ofSeconds(30)));

        String body = httpClient
                .executeWithResilience(
                        "user-service", HttpMethod.GET, "http://user-service/api/users", null, String.class)
                .get(10, TimeUnit.SECONDS);

        assertNotNull(body);
    }

    /** After cache-service is stopped for 45 seconds it must be healthy again within two minutes. */
    @Test
    void testContainerStopRecovery() throws Exception {
        chaosOrchestrator.runExperiment(
                patternRequest("cache-service", ChaosPattern.CONTAINER_STOP, Duration.ofSeconds(45)));

        await().atMost(2, TimeUnit.MINUTES)
                .until(() -> healthService.isContainerHealthy("cache-service"));

        assertTrue(healthService.isContainerHealthy("cache-service"));
    }

    /** Schedules five random two-minute experiments and waits for the system to report healthy. */
    @Test
    void testRandomChaosEndurance() {
        for (int remaining = 5; remaining > 0; remaining--) {
            chaosOrchestrator.scheduleExperiment(
                    randomRequest("app-service", Duration.ofMinutes(2)));
        }

        await().atMost(10, TimeUnit.MINUTES).until(healthService::isSystemHealthy);
    }

    /** Builds a request for one specific chaos pattern. */
    private static ChaosExperimentRequest patternRequest(String container,
                                                         ChaosPattern pattern,
                                                         Duration duration) {
        ChaosExperimentRequest req = new ChaosExperimentRequest();
        req.setContainerName(container);
        req.setPattern(pattern);
        req.setDuration(duration);
        return req;
    }

    /** Builds a request that lets the service pick the pattern at random. */
    private static ChaosExperimentRequest randomRequest(String container, Duration duration) {
        ChaosExperimentRequest req = new ChaosExperimentRequest();
        req.setContainerName(container);
        req.setRandom(true);
        req.setDuration(duration);
        return req;
    }
}

Configuration

application.yml

# Chaos-testing settings (bound to the "chaos" configuration prefix).
chaos:
  enabled: true
  pumba-image: "gaiaadm/pumba:latest"
  failure-probability: 0.05
  target-containers:
    - "app-service"
    - "user-service"
    - "order-service"
    - "payment-service"
  default-duration: 5m
  max-concurrent-experiments: 3
  profiles:
    network-degradation:
      patterns:
        - NETWORK_DELAY
        - PACKET_LOSS
      duration: 10m
      intensity: 0.7
    container-failure:
      patterns:
        - CONTAINER_STOP
      duration: 2m
      intensity: 0.5
# Resilience4j instances named "httpClient", matching the name used on the HTTP client annotations.
resilience4j:
  circuitbreaker:
    instances:
      httpClient:
        sliding-window-size: 10
        failure-rate-threshold: 50
        wait-duration-in-open-state: 10s
  retry:
    instances:
      httpClient:
        max-attempts: 3
        wait-duration: 2s
  timelimiter:
    instances:
      httpClient:
        timeout-duration: 10s
management:
  endpoints:
    web:
      exposure:
        # NOTE(review): "chaos" is listed here, but no actuator endpoint with that id is
        # defined in the code shown - confirm it exists.
        include: health,metrics,chaos
  endpoint:
    health:
      show-details: always

Docker Compose for Testing

version: '3.8'
services:
  app-service:
    image: my-app:latest
    container_name: app-service
    ports:
      - "8080:8080"
    # NOTE(review): "database" and "redis" are not defined in this file; Compose rejects
    # depends_on entries for undefined services - add them or remove these references.
    depends_on:
      - database
      - redis
  user-service:
    image: user-service:latest
    container_name: user-service
    ports:
      - "8081:8080"
  order-service:
    image: order-service:latest
    container_name: order-service
    ports:
      - "8082:8080"
  pumba:
    image: gaiaadm/pumba:latest
    container_name: pumba
    # Placeholder command: replace it with the chaos command this container should run.
    command: --help
    network_mode: host
    volumes:
      # Pumba controls other containers through the host's Docker socket.
      - /var/run/docker.sock:/var/run/docker.sock
    privileged: true
  chaos-dashboard:
    image: my-chaos-dashboard:latest
    container_name: chaos-dashboard
    ports:
      - "3000:3000"
    environment:
      - CHAOS_API_URL=http://app-service:8080/api/chaos

Best Practices

  1. Start Small: Begin with low-intensity chaos experiments
  2. Monitor Closely: Always monitor system behavior during chaos tests
  3. Have Rollback Plans: Ensure you can quickly stop chaos experiments
  4. Test in Staging First: Run chaos experiments in staging first, and move to production only after they behave as expected there
  5. Document Scenarios: Document chaos scenarios and expected behaviors
  6. Automate Recovery: Implement automated recovery procedures
  7. Team Awareness: Ensure the team knows when chaos tests are running
  8. Gradual Intensity: Gradually increase chaos intensity over time

This implementation provides a comprehensive foundation for integrating Pumba chaos testing into Java applications, enabling systematic resilience testing and validation of failure recovery mechanisms.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper