Error Budgets and SLOs in Java Applications

Understanding Error Budgets and SLOs

What are SLOs and Error Budgets?

Service Level Objectives (SLOs) are specific, measurable targets for service reliability that represent the level of service users can expect.

An error budget is the acceptable amount of unreliability over a specific period. It is calculated as 1 - SLO; for example, a 99.9% SLO leaves an error budget of 0.1% of requests.

Implementing SLO Monitoring in Java

1. Basic SLO Configuration Class

/**
 * Immutable settings describing the service level objective (SLO) of one service.
 *
 * <p>The error budget is derived from the target as {@code 1 - sloTarget}.
 */
public class SLOConfig {
    private final String serviceName;
    private final double sloTarget; // e.g., 0.999 for 99.9%
    private final Duration measurementWindow;
    private final Duration budgetRefreshPeriod;

    /**
     * Creates a configuration after validating its arguments.
     *
     * @param serviceName         name of the service the objective applies to
     * @param sloTarget           target success ratio, greater than 0 and at most 1
     * @param measurementWindow   length of the measurement window; must not be null
     * @param budgetRefreshPeriod period stored alongside the target and exposed through its getter
     * @throws IllegalArgumentException if {@code sloTarget} is NaN or outside {@code (0, 1]}
     * @throws NullPointerException     if {@code measurementWindow} is null
     */
    public SLOConfig(String serviceName, double sloTarget,
                     Duration measurementWindow, Duration budgetRefreshPeriod) {
        // Phrased as a negated "in range" test: every comparison with NaN is false, so the
        // previous "out of range" test (sloTarget <= 0 || sloTarget > 1) let NaN through.
        if (!(sloTarget > 0 && sloTarget <= 1)) {
            throw new IllegalArgumentException("SLO target must be between 0 and 1");
        }
        // Fail at construction time instead of on the first comparison against the window.
        if (measurementWindow == null) {
            throw new NullPointerException("measurementWindow must not be null");
        }
        this.serviceName = serviceName;
        this.sloTarget = sloTarget;
        this.measurementWindow = measurementWindow;
        this.budgetRefreshPeriod = budgetRefreshPeriod;
    }

    /**
     * Returns the fraction of requests that may fail without violating the target.
     *
     * @return {@code 1 - sloTarget}
     */
    public double getErrorBudget() {
        return 1 - sloTarget;
    }

    /** @return the service name given at construction */
    public String getServiceName() { return serviceName; }

    /** @return the target success ratio, in {@code (0, 1]} */
    public double getSloTarget() { return sloTarget; }

    /** @return the length of the measurement window */
    public Duration getMeasurementWindow() { return measurementWindow; }

    /** @return the budget refresh period given at construction */
    public Duration getBudgetRefreshPeriod() { return budgetRefreshPeriod; }
}

2. SLO Tracker Implementation

import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Counts request outcomes for one service and derives availability, error rate and
 * error-budget figures from those counts.
 *
 * <p>All counters are cleared once more than the configured measurement window has elapsed
 * since the window started. The elapsed-time check and the reset run under a private lock;
 * the counter increments themselves happen outside that lock.
 */
public class SLOTracker {
    /** Key used in the per-type error map when the caller supplies no error type. */
    private static final String UNKNOWN_ERROR_TYPE = "unknown";

    private final SLOConfig config;
    private final AtomicLong totalRequests = new AtomicLong(0);
    private final AtomicLong successfulRequests = new AtomicLong(0);
    private final AtomicLong errorRequests = new AtomicLong(0);
    private final ConcurrentHashMap<String, AtomicLong> errorCountsByType = new ConcurrentHashMap<>();
    private Instant windowStartTime; // read and written only while holding lock
    private final Object lock = new Object();

    /**
     * Creates a tracker whose first measurement window starts now.
     *
     * @param config SLO settings supplying the error budget and the window length
     */
    public SLOTracker(SLOConfig config) {
        this.config = config;
        this.windowStartTime = Instant.now();
    }

    /** Records one successful request in the current window. */
    public void recordSuccess() {
        maybeResetWindow();
        totalRequests.incrementAndGet();
        successfulRequests.incrementAndGet();
    }

    /**
     * Records one failed request in the current window.
     *
     * @param errorType label under which the failure is counted; {@code null} is counted
     *                  under {@code "unknown"}
     */
    public void recordError(String errorType) {
        // ConcurrentHashMap rejects null keys. Resolving the key up front keeps a null
        // label from throwing after the two counters below were already incremented.
        String typeKey = (errorType != null) ? errorType : UNKNOWN_ERROR_TYPE;
        maybeResetWindow();
        totalRequests.incrementAndGet();
        errorRequests.incrementAndGet();
        errorCountsByType.computeIfAbsent(typeKey, k -> new AtomicLong(0)).incrementAndGet();
    }

    /**
     * Builds a snapshot of the current window.
     *
     * @return a status with availability 1.0, error rate 0.0 and the full budget remaining
     *         when no requests were recorded; otherwise the measured availability and error
     *         rate, with the remaining budget floored at 0
     */
    public SLOStatus calculateCurrentSLO() {
        maybeResetWindow();
        long total = totalRequests.get();
        long errors = errorRequests.get();
        if (total == 0) {
            return new SLOStatus(1.0, 0.0, config.getErrorBudget(), config.getErrorBudget());
        }
        double actualAvailability = (double) (total - errors) / total;
        double errorRate = (double) errors / total;
        // Never negative: overspending the budget is reported as 0 remaining.
        double remainingBudget = Math.max(0, config.getErrorBudget() - errorRate);
        return new SLOStatus(actualAvailability, errorRate,
                config.getErrorBudget(), remainingBudget);
    }

    /**
     * Tells whether the current window has used up its error budget.
     *
     * @return {@code true} when the remaining budget of the current snapshot is not positive
     */
    public boolean isErrorBudgetExhausted() {
        SLOStatus status = calculateCurrentSLO();
        return status.getRemainingErrorBudget() <= 0;
    }

    /**
     * Compares the errors recorded so far with the number the budget allows for the
     * recorded traffic.
     *
     * <p>The allowed number is floored at 1, so the division is always defined; for windows
     * in which the budget allows fewer than one error the result equals the raw error count.
     *
     * @return recorded errors divided by {@code max(errorBudget * totalRequests, 1)}
     */
    public double getErrorBudgetBurnRate() {
        // Roll the window over if it is due, without building a status object that the
        // calculation never reads.
        maybeResetWindow();
        long total = totalRequests.get();
        long errors = errorRequests.get();
        double expectedErrors = config.getErrorBudget() * total;
        return errors / Math.max(expectedErrors, 1);
    }

    /** Clears all counters when more than the measurement window has elapsed. */
    private void maybeResetWindow() {
        synchronized (lock) {
            Instant now = Instant.now();
            if (Duration.between(windowStartTime, now).compareTo(config.getMeasurementWindow()) > 0) {
                resetWindow(now);
            }
        }
    }

    /**
     * Zeroes every counter and starts a new window. Called with {@code lock} held.
     *
     * @param newStartTime start instant of the new window
     */
    private void resetWindow(Instant newStartTime) {
        totalRequests.set(0);
        successfulRequests.set(0);
        errorRequests.set(0);
        errorCountsByType.clear();
        windowStartTime = newStartTime;
    }
}

3. SLO Status Data Class

/**
 * Read-only snapshot of SLO figures, stamped with the instant at which it was created.
 */
public class SLOStatus {
    /** Layout of the text returned by {@link #toString()}. */
    private static final String TEXT_FORMAT =
            "SLOStatus{availability=%.4f, errorRate=%.4f, remainingBudget=%.4f}";

    private final Instant timestamp;
    private final double actualAvailability;
    private final double errorRate;
    private final double totalErrorBudget;
    private final double remainingErrorBudget;

    /**
     * Captures the given figures; the timestamp is taken from {@code Instant.now()}.
     *
     * @param actualAvailability   measured success ratio
     * @param errorRate            measured failure ratio
     * @param totalErrorBudget     failure ratio the objective allows
     * @param remainingErrorBudget part of the allowed failure ratio not yet used
     */
    public SLOStatus(double actualAvailability, double errorRate,
                     double totalErrorBudget, double remainingErrorBudget) {
        this.timestamp = Instant.now();
        this.actualAvailability = actualAvailability;
        this.errorRate = errorRate;
        this.totalErrorBudget = totalErrorBudget;
        this.remainingErrorBudget = remainingErrorBudget;
    }

    /** @return the measured success ratio */
    public double getActualAvailability() {
        return actualAvailability;
    }

    /** @return the measured failure ratio */
    public double getErrorRate() {
        return errorRate;
    }

    /** @return the failure ratio the objective allows */
    public double getTotalErrorBudget() {
        return totalErrorBudget;
    }

    /** @return the unused part of the error budget */
    public double getRemainingErrorBudget() {
        return remainingErrorBudget;
    }

    /** @return the instant at which this snapshot was constructed */
    public Instant getTimestamp() {
        return timestamp;
    }

    /**
     * Checks the measured availability against the objective implied by the total budget.
     *
     * @return {@code true} when availability is at least {@code 1 - totalErrorBudget}
     */
    public boolean isWithinSLO() {
        final double requiredAvailability = 1 - totalErrorBudget;
        return actualAvailability >= requiredAvailability;
    }

    /** Renders availability, error rate and remaining budget with four decimals each. */
    @Override
    public String toString() {
        return String.format(TEXT_FORMAT, actualAvailability, errorRate, remainingErrorBudget);
    }
}

Integration with Spring Boot Applications

4. Spring Boot Configuration

/**
 * Spring configuration that declares the SLO settings bean and the tracker built from it.
 */
@Configuration
public class SLOAutoConfiguration {

    /**
     * Declares the SLO settings for the service: a 99.9% target, a one-hour measurement
     * window and a thirty-day budget refresh period.
     *
     * <p>NOTE(review): property binding on a {@code @Bean} method is understood to go through
     * setters, and {@code SLOConfig} exposes none. The {@code slo.service.*} properties are
     * therefore presumably not applied and the values below win. Confirm against the Spring
     * Boot version in use.
     *
     * @return the settings bean
     */
    @Bean
    @ConfigurationProperties(prefix = "slo.service")
    public SLOConfig serviceSLOConfig() {
        final String serviceName = "api-service";
        final double availabilityTarget = 0.999;
        final Duration window = Duration.ofHours(1);
        final Duration refreshPeriod = Duration.ofDays(30);
        return new SLOConfig(serviceName, availabilityTarget, window, refreshPeriod);
    }

    /**
     * Declares the tracker that counts request outcomes against the given settings.
     *
     * @param sloConfig settings bean injected by the container
     * @return a new tracker bound to {@code sloConfig}
     */
    @Bean
    public SLOTracker serviceSLOTracker(SLOConfig sloConfig) {
        final SLOTracker tracker = new SLOTracker(sloConfig);
        return tracker;
    }
}

5. SLO Monitoring Aspect

/**
 * Around-advice that records the outcome and duration of every method matched by the
 * {@code @annotation(MonitorSLO)} pointcut.
 *
 * <p>Each invocation is reported to the {@link SLOTracker} as a success or an error, and
 * timed under {@code slo.request.duration}; failures also increment {@code slo.errors}.
 */
@Aspect
@Component
public class SLOMonitoringAspect {
    private final SLOTracker sloTracker;
    private final MeterRegistry meterRegistry;

    /**
     * @param sloTracker    tracker that receives the success and error counts
     * @param meterRegistry registry in which the timer and the error counter are registered
     */
    public SLOMonitoringAspect(SLOTracker sloTracker, MeterRegistry meterRegistry) {
        this.sloTracker = sloTracker;
        this.meterRegistry = meterRegistry;
    }

    /**
     * Runs the intercepted method and records how it ended.
     *
     * @param joinPoint the intercepted invocation
     * @return whatever the intercepted method returned
     * @throws Throwable whatever the intercepted method threw, rethrown unchanged after
     *                   being recorded as an error
     */
    @Around("@annotation(MonitorSLO)")
    public Object monitorSLO(ProceedingJoinPoint joinPoint) throws Throwable {
        String methodName = joinPoint.getSignature().getName();
        String className = joinPoint.getTarget().getClass().getSimpleName();
        String operation = className + "." + methodName;
        Timer.Sample sample = Timer.start(meterRegistry);
        boolean success = false;
        try {
            Object result = joinPoint.proceed();
            success = true;
            return result;
        } catch (Throwable failure) {
            // proceed() is declared to throw Throwable. Catching only Exception would let an
            // Error pass through uncounted, so the invocation would be recorded as neither a
            // success nor an error.
            String errorType = failure.getClass().getSimpleName();
            sloTracker.recordError(errorType);
            meterRegistry.counter("slo.errors",
                    "operation", operation,
                    "error_type", errorType
            ).increment();
            throw failure;
        } finally {
            // Stop the timer on both paths; the "success" tag separates the two outcomes.
            sample.stop(Timer.builder("slo.request.duration")
                    .tags("operation", operation, "success", String.valueOf(success))
                    .register(meterRegistry));
            if (success) {
                sloTracker.recordSuccess();
            }
        }
    }
}
/**
 * Marks a method for SLO monitoring.
 *
 * <p>The annotation carries no attributes. It applies to methods only and is retained at
 * runtime, so it can be discovered reflectively.
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface MonitorSLO {
}

6. SLO Management Service

/**
 * Periodically checks the tracker and raises alerts when the error budget burns too fast
 * or is used up.
 *
 * <p>Each alert is remembered per key as {@link AlertState#FIRING} or
 * {@link AlertState#RESOLVED}. An alert is published, and its mitigation hook run, only on
 * the transition into {@code FIRING}; it returns to {@code RESOLVED} on the first check in
 * which its condition no longer holds, so that it can fire again later.
 */
@Service
public class SLOManagementService {
    /** Burn rate above which the high-burn-rate alert fires. */
    private static final double HIGH_BURN_RATE_THRESHOLD = 10.0;
    private static final String HIGH_BURN_RATE_ALERT = "high_burn_rate";
    private static final String BUDGET_EXHAUSTED_ALERT = "error_budget_exhausted";

    private final SLOTracker sloTracker;
    private final ApplicationEventPublisher eventPublisher;
    private final Map<String, AlertState> alertStates = new ConcurrentHashMap<>();

    /**
     * @param sloTracker     source of the SLO status, burn rate and exhaustion flag
     * @param eventPublisher publisher used for status events and alerts
     */
    public SLOManagementService(SLOTracker sloTracker, ApplicationEventPublisher eventPublisher) {
        this.sloTracker = sloTracker;
        this.eventPublisher = eventPublisher;
    }

    /**
     * Evaluates both alert conditions and publishes the current status.
     * Scheduled at a fixed rate of 60000 ms.
     */
    @Scheduled(fixedRate = 60000) // Check every minute
    public void monitorSLOCompliance() {
        SLOStatus status = sloTracker.calculateCurrentSLO();
        double burnRate = sloTracker.getErrorBudgetBurnRate();

        // Previously nothing ever moved an alert back to RESOLVED, so each alert could fire
        // only once per process. Each condition now resolves its alert when it clears.
        if (burnRate > HIGH_BURN_RATE_THRESHOLD) {
            handleHighBurnRate(status, burnRate);
        } else {
            resolveAlert(HIGH_BURN_RATE_ALERT);
        }

        if (sloTracker.isErrorBudgetExhausted()) {
            handleErrorBudgetExhaustion(status);
        } else {
            resolveAlert(BUDGET_EXHAUSTED_ALERT);
        }

        // Publish the snapshot on every run, whether or not an alert fired.
        eventPublisher.publishEvent(new SLOStatusEvent(this, status));
    }

    /** Publishes a high-burn-rate alert and runs mitigation, unless already firing. */
    private void handleHighBurnRate(SLOStatus status, double burnRate) {
        if (startFiring(HIGH_BURN_RATE_ALERT)) {
            eventPublisher.publishEvent(new HighBurnRateAlert(this, status, burnRate));
            // Trigger mitigation actions
            triggerMitigationActions();
        }
    }

    /** Publishes a budget-exhausted alert and runs emergency actions, unless already firing. */
    private void handleErrorBudgetExhaustion(SLOStatus status) {
        if (startFiring(BUDGET_EXHAUSTED_ALERT)) {
            eventPublisher.publishEvent(new ErrorBudgetExhaustedAlert(this, status));
            // Trigger more aggressive mitigation
            triggerEmergencyActions();
        }
    }

    /**
     * Marks the alert as firing.
     *
     * @return {@code true} when the alert was absent or resolved before this call; the single
     *         {@code put} makes the check-and-set one atomic step on the concurrent map
     */
    private boolean startFiring(String alertKey) {
        return alertStates.put(alertKey, AlertState.FIRING) != AlertState.FIRING;
    }

    /** Moves a firing alert back to resolved; leaves absent or resolved alerts untouched. */
    private void resolveAlert(String alertKey) {
        alertStates.replace(alertKey, AlertState.FIRING, AlertState.RESOLVED);
    }

    /**
     * Hook for first-level mitigation. Public because {@code SLOController} invokes it for
     * manual triggering; the body is a placeholder.
     */
    public void triggerMitigationActions() {
        // Implement mitigation strategies:
        // - Reduce non-essential traffic
        // - Enable circuit breakers
        // - Scale up resources
        // - Disable non-critical features
    }

    /** Hook for emergency mitigation; the body is a placeholder. */
    private void triggerEmergencyActions() {
        // More aggressive actions:
        // - Enter maintenance mode
        // - Disable feature flags
        // - Alert on-call engineers
    }

    /** Lifecycle state of an alert key. */
    public enum AlertState {
        FIRING, RESOLVED
    }
}

Error Budget Aware Circuit Breaker

7. SLO-Aware Circuit Breaker

/**
 * Circuit breaker that opens when the tracked error budget is exhausted.
 *
 * <p>State and the time of the last transition are guarded by a private lock. The component
 * is shared between request threads, and without the lock two threads could interleave the
 * read-check-write sequences below.
 */
@Component
public class SLOAwareCircuitBreaker {
    private final SLOTracker sloTracker;
    private final Object stateLock = new Object();
    private CircuitBreakerState state = CircuitBreakerState.CLOSED; // guarded by stateLock
    private Instant lastStateChange = Instant.now();                // guarded by stateLock
    private final Duration openStateTimeout = Duration.ofSeconds(30);

    /**
     * @param sloTracker tracker consulted for error-budget exhaustion
     */
    public SLOAwareCircuitBreaker(SLOTracker sloTracker) {
        this.sloTracker = sloTracker;
    }

    /**
     * Decides whether a request may proceed.
     *
     * @return {@code false} while the breaker is open and the open timeout has not elapsed,
     *         or when this call opens the breaker because the budget is exhausted;
     *         {@code true} otherwise
     */
    public boolean allowRequest() {
        // Ask the tracker before taking the lock so its own locking never nests inside ours.
        boolean budgetExhausted = sloTracker.isErrorBudgetExhausted();
        synchronized (stateLock) {
            Instant now = Instant.now();
            // If error budget is critically low, open circuit breaker
            if (budgetExhausted && state != CircuitBreakerState.OPEN) {
                state = CircuitBreakerState.OPEN;
                lastStateChange = now;
                return false;
            }
            // Normal circuit breaker logic
            switch (state) {
                case CLOSED:
                    return true;
                case OPEN:
                    // After the timeout, let traffic through again on a trial basis.
                    if (Duration.between(lastStateChange, now).compareTo(openStateTimeout) > 0) {
                        state = CircuitBreakerState.HALF_OPEN;
                        lastStateChange = now;
                        return true;
                    }
                    return false;
                case HALF_OPEN:
                    return true;
                default:
                    return false;
            }
        }
    }

    /** Closes the breaker when a trial request succeeds; no effect in other states. */
    public void recordSuccess() {
        synchronized (stateLock) {
            if (state == CircuitBreakerState.HALF_OPEN) {
                state = CircuitBreakerState.CLOSED;
            }
        }
    }

    /** Re-opens the breaker when a trial request fails; no effect in other states. */
    public void recordFailure() {
        synchronized (stateLock) {
            if (state == CircuitBreakerState.HALF_OPEN) {
                state = CircuitBreakerState.OPEN;
                lastStateChange = Instant.now();
            }
        }
    }

    /** States of the breaker. */
    public enum CircuitBreakerState {
        CLOSED, OPEN, HALF_OPEN
    }
}

Configuration Properties

8. Application Configuration

# application.yml
slo:
  service:
    service-name: "user-api"
    slo-target: 0.999
    measurement-window: 1h
    budget-refresh-period: 30d
management:
  endpoints:
    web:
      exposure:
        include: health,metrics,slo
  endpoint:
    slo:
      enabled: true

9. SLO Management Controller

/**
 * REST endpoints under {@code /api/slo} exposing the SLO status, the error-budget figures
 * and a manual mitigation trigger.
 */
@RestController
@RequestMapping("/api/slo")
public class SLOController {
    private final SLOTracker sloTracker;
    private final SLOManagementService sloManagementService;

    /**
     * @param sloTracker           tracker queried for status, burn rate and exhaustion
     * @param sloManagementService service whose mitigation hook the POST endpoint invokes
     */
    public SLOController(SLOTracker sloTracker, SLOManagementService sloManagementService) {
        this.sloTracker = sloTracker;
        this.sloManagementService = sloManagementService;
    }

    /**
     * {@code GET /api/slo/status}
     *
     * @return 200 with the tracker's current status snapshot
     */
    @GetMapping("/status")
    public ResponseEntity<SLOStatus> getSLOStatus() {
        final SLOStatus current = sloTracker.calculateCurrentSLO();
        return ResponseEntity.ok(current);
    }

    /**
     * {@code GET /api/slo/budget}
     *
     * @return 200 with a map holding {@code remainingBudget}, {@code burnRate},
     *         {@code isExhausted} and {@code currentAvailability}
     */
    @GetMapping("/budget")
    public ResponseEntity<Map<String, Object>> getErrorBudgetStatus() {
        final SLOStatus snapshot = sloTracker.calculateCurrentSLO();
        final Map<String, Object> payload = new HashMap<>();
        // Burn rate and exhaustion are asked of the tracker separately, after the snapshot.
        payload.put("remainingBudget", snapshot.getRemainingErrorBudget());
        payload.put("burnRate", sloTracker.getErrorBudgetBurnRate());
        payload.put("isExhausted", sloTracker.isErrorBudgetExhausted());
        payload.put("currentAvailability", snapshot.getActualAvailability());
        return ResponseEntity.ok(payload);
    }

    /**
     * {@code POST /api/slo/actions/mitigate} — manual trigger for mitigation actions.
     *
     * <p>NOTE(review): this call compiles only if
     * {@code SLOManagementService.triggerMitigationActions()} is visible from this class.
     * Verify its access modifier.
     *
     * @return 202 with a confirmation message
     */
    @PostMapping("/actions/mitigate")
    public ResponseEntity<String> triggerMitigation() {
        sloManagementService.triggerMitigationActions();
        final String confirmation = "Mitigation actions triggered";
        return ResponseEntity.accepted().body(confirmation);
    }
}

Testing SLO Implementation

10. Unit Tests

/**
 * Unit tests for {@code SLOTracker} with a 99% target and a five-minute window.
 */
@ExtendWith(MockitoExtension.class)
class SLOTrackerTest {
    /** Tolerance for comparisons of values produced by double arithmetic. */
    private static final double EPSILON = 1e-9;

    private SLOConfig config;
    private SLOTracker tracker;

    @BeforeEach
    void setUp() {
        config = new SLOConfig("test-service", 0.99,
                Duration.ofMinutes(5), Duration.ofDays(30));
        tracker = new SLOTracker(config);
    }

    /** With no traffic the tracker reports full availability and the whole budget. */
    @Test
    void testInitialSLOStatus() {
        SLOStatus status = tracker.calculateCurrentSLO();
        assertEquals(1.0, status.getActualAvailability());
        // The budget is computed as 1 - 0.99, which is not exactly 0.01 in double
        // arithmetic, so an exact comparison fails; compare within a tolerance.
        assertEquals(0.01, status.getTotalErrorBudget(), EPSILON);
        assertTrue(status.isWithinSLO());
    }

    /** A 2% error rate against a 1% budget leaves nothing of the budget. */
    @Test
    void testErrorBudgetCalculation() {
        // Record 1000 requests with 20 errors
        for (int i = 0; i < 980; i++) {
            tracker.recordSuccess();
        }
        for (int i = 0; i < 20; i++) {
            tracker.recordError("Timeout");
        }
        SLOStatus status = tracker.calculateCurrentSLO();
        assertEquals(0.98, status.getActualAvailability(), 0.001);
        assertEquals(0.02, status.getErrorRate(), 0.001);
        // The tracker floors the remaining budget at 0 (max(0, 0.01 - 0.02)); it never
        // reports a negative value.
        assertEquals(0.0, status.getRemainingErrorBudget(), EPSILON);
        assertTrue(tracker.isErrorBudgetExhausted());
    }

    /** Only errors: the budget is exhausted and the burn rate exceeds 1. */
    @Test
    void testErrorBudgetExhaustion() {
        // Exhaust error budget with high error rate
        for (int i = 0; i < 100; i++) {
            tracker.recordError("ServerError");
        }
        assertTrue(tracker.isErrorBudgetExhausted());
        assertTrue(tracker.getErrorBudgetBurnRate() > 1.0);
    }
}

Best Practices for Error Budgets in Java

1. Set Realistic SLOs

  • Base SLOs on user experience and business requirements
  • Start conservative and adjust based on data
  • Differentiate between critical and non-critical services

2. Monitor Key Metrics

  • Latency distributions (p50, p95, p99)
  • Error rates by type
  • Traffic volume
  • Dependency health

3. Implement Progressive Actions

  • Warning alerts at 50% budget consumption
  • Critical alerts at 80% consumption
  • Automated mitigation at 100% exhaustion

4. Use Error Budgets for Decision Making

  • Release features when the budget is healthy
  • Postpone risky changes when budget is low
  • Use budget as a reliability currency

This implementation provides a comprehensive framework for managing error budgets and SLOs in Java applications, helping maintain service reliability while enabling safe velocity in development.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper