Solution: Level 8 / Project 15 - Level 8 Mini Capstone

STOP -- Have you attempted this project yourself first?

Learning happens in the struggle, not in reading answers. Spend at least 20 minutes trying before reading this solution. If you are stuck, try the Walkthrough first -- it guides your thinking without giving away the answer.

Back to project README

Complete solution

"""Level 8 Mini Capstone -- full observability platform integrating KPIs,
profiling, SLA monitoring, and fault injection from the entire level."""

from __future__ import annotations

import argparse
import json
import math
import random
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

class ServiceHealth(Enum):
    """Health states a service can be classified into, declared best to worst.

    The string values are what the ``to_dict`` serializers emit in reports.
    """

    HEALTHY = "healthy"    # no threshold in ServiceMetrics.health() is breached
    DEGRADED = "degraded"  # error rate above 5% or p99 latency above 500 ms
    CRITICAL = "critical"  # error rate above 10% or p99 latency above 1000 ms
    DOWN = "down"          # availability below 95%; checked before the other signals

# WHY raw latency samples instead of pre-computed averages? -- Keeping
# individual measurements lets you compute any statistic after the fact:
# mean, percentiles, histograms. Pre-computing averages loses the
# distribution shape, making tail latency invisible.
@dataclass
class ServiceMetrics:
    """Raw request and health-check measurements collected for one service.

    Latency samples are kept individually (not pre-aggregated) so every
    percentile is derived on demand from the full distribution.
    """

    name: str
    latency_ms: list[float] = field(default_factory=list)
    error_count: int = 0
    success_count: int = 0
    uptime_checks: int = 0
    uptime_passes: int = 0

    @property
    def request_count(self) -> int:
        """Total number of recorded requests (successes plus errors)."""
        return self.success_count + self.error_count

    @property
    def error_rate(self) -> float:
        """Fraction of requests that failed, in [0, 1]; 0.0 when no requests."""
        total = self.request_count
        if not total:
            return 0.0
        return self.error_count / total

    @property
    def availability(self) -> float:
        """Percentage of health checks that passed; 100.0 when none were run."""
        if not self.uptime_checks:
            return 100.0
        return self.uptime_passes / self.uptime_checks * 100

    @property
    def p50_latency(self) -> float:
        """Median latency in milliseconds."""
        return _percentile(self.latency_ms, 50)

    @property
    def p95_latency(self) -> float:
        """95th-percentile latency in milliseconds."""
        return _percentile(self.latency_ms, 95)

    @property
    def p99_latency(self) -> float:
        """99th-percentile latency in milliseconds."""
        return _percentile(self.latency_ms, 99)

    # WHY three signals? -- Availability, error rate and tail latency each
    # expose a different failure mode; looking at only one leaves blind spots.
    def health(self) -> ServiceHealth:
        """Classify the service, checking the most severe condition first."""
        if self.availability < 95:
            return ServiceHealth.DOWN

        # Read each signal once; both thresholds below compare the same values.
        failure_ratio = self.error_rate
        tail_ms = self.p99_latency

        if failure_ratio > 0.10 or tail_ms > 1000:
            return ServiceHealth.CRITICAL
        if failure_ratio > 0.05 or tail_ms > 500:
            return ServiceHealth.DEGRADED
        return ServiceHealth.HEALTHY

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable summary with rounded figures."""
        snapshot: dict[str, Any] = {
            "name": self.name,
            "requests": self.request_count,
        }
        snapshot["error_rate_pct"] = round(self.error_rate * 100, 2)
        snapshot["availability_pct"] = round(self.availability, 2)
        snapshot["p50_ms"] = round(self.p50_latency, 1)
        snapshot["p95_ms"] = round(self.p95_latency, 1)
        snapshot["p99_ms"] = round(self.p99_latency, 1)
        snapshot["health"] = self.health().value
        return snapshot

@dataclass
class Alert:
    """One alert raised against a service at a given severity.

    ``timestamp`` defaults to the wall-clock time at construction.
    """

    service: str
    severity: str
    message: str
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> dict[str, Any]:
        """Return the service, severity and message; the timestamp is not included."""
        exported = ("service", "severity", "message")
        return {attr: getattr(self, attr) for attr in exported}

@dataclass
class PlatformReport:
    """Point-in-time view of every service, the recorded alerts, and the
    platform-wide health verdict."""

    services: list[ServiceMetrics]
    alerts: list[Alert]
    overall_health: ServiceHealth

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable report.

        ``alert_count`` reflects every alert, while the ``alerts`` list is
        capped at the first 20 entries to keep the output readable.
        """
        # Classify each service once: health() sorts the latency samples to
        # find p99, so re-evaluating it per summary bucket repeats that work.
        states = [svc.health() for svc in self.services]
        return {
            "overall_health": self.overall_health.value,
            "service_count": len(self.services),
            "alert_count": len(self.alerts),
            "services": [svc.to_dict() for svc in self.services],
            "alerts": [alert.to_dict() for alert in self.alerts[:20]],
            "summary": {
                "healthy": states.count(ServiceHealth.HEALTHY),
                "degraded": states.count(ServiceHealth.DEGRADED),
                "critical": states.count(ServiceHealth.CRITICAL),
                "down": states.count(ServiceHealth.DOWN),
            },
        }

def _percentile(values: list[float], pct: float) -> float:
    if not values:
        return 0.0
    s = sorted(values)
    idx = max(0, math.ceil(pct / 100 * len(s)) - 1)
    return s[idx]

class ObservabilityPlatform:
    """Single coordination point for per-service metrics, health checks,
    alert evaluation and reporting.

    WHY one object? -- The capstone pulls together the level's earlier
    pieces: metrics collection (project 01), profiling (06), alerting (13)
    and health classification (09), in the way Datadog or Grafana present
    one view over many subsystems in production.
    """

    def __init__(self, service_names: list[str]) -> None:
        """Create an empty metrics record for every name in ``service_names``."""
        self._metrics = {}
        for service_name in service_names:
            self._metrics[service_name] = ServiceMetrics(name=service_name)
        self._alerts: list[Alert] = []

    def record_request(self, service: str, latency_ms: float, success: bool) -> None:
        """Store one request's latency and outcome; unknown services are ignored."""
        tracked = self._metrics.get(service)
        if not tracked:
            return
        tracked.latency_ms.append(latency_ms)
        if success:
            tracked.success_count += 1
            return
        tracked.error_count += 1

    def record_health_check(self, service: str, passed: bool) -> None:
        """Store one health-check result; unknown services are ignored."""
        tracked = self._metrics.get(service)
        if not tracked:
            return
        tracked.uptime_checks += 1
        tracked.uptime_passes += 1 if passed else 0

    def evaluate_alerts(self) -> list[Alert]:
        """Raise one alert per currently unhealthy service.

        Every alert is also appended to the platform's alert history.

        Returns:
            The alerts created by this call, in service order.
        """
        raised: list[Alert] = []
        for svc in self._metrics.values():
            state = svc.health()
            # Pick severity and message for the state; healthy services are skipped.
            if state == ServiceHealth.CRITICAL:
                severity = "critical"
                text = f"{svc.name}: error_rate={svc.error_rate:.1%}, p99={svc.p99_latency:.0f}ms"
            elif state == ServiceHealth.DEGRADED:
                severity = "warning"
                text = f"{svc.name}: degraded performance"
            elif state == ServiceHealth.DOWN:
                severity = "page"
                text = f"{svc.name}: service DOWN, availability={svc.availability:.1f}%"
            else:
                continue
            alert = Alert(service=svc.name, severity=severity, message=text)
            raised.append(alert)
            self._alerts.append(alert)
        return raised

    # WHY worst-status-wins? -- One DOWN service means the platform as a
    # whole needs attention, however many other services are healthy.
    def report(self) -> PlatformReport:
        """Build a report whose overall health is the worst individual health."""
        services = list(self._metrics.values())
        observed = [svc.health() for svc in services]
        worst_first = (ServiceHealth.DOWN, ServiceHealth.CRITICAL, ServiceHealth.DEGRADED)
        overall = next(
            (level for level in worst_first if level in observed),
            ServiceHealth.HEALTHY,
        )
        return PlatformReport(services=services, alerts=self._alerts, overall_health=overall)

def run_simulation(num_requests: int = 200, seed: int = 42) -> dict[str, Any]:
    """Drive the platform with synthetic traffic and return its report.

    For each of ``num_requests`` rounds, every service receives one request
    (latency drawn from a Gaussian around its base latency with a 30%
    standard deviation, floored at 1 ms) and one health check. A seeded RNG
    makes the result reproducible for a given ``seed``.

    Returns:
        The platform report as a JSON-serializable dict.
    """
    profiles = {
        "api-gateway": {"latency_base": 50, "error_prob": 0.02, "down_prob": 0.0},
        "user-service": {"latency_base": 30, "error_prob": 0.01, "down_prob": 0.0},
        "payment-service": {"latency_base": 200, "error_prob": 0.08, "down_prob": 0.01},
        "search-service": {"latency_base": 100, "error_prob": 0.03, "down_prob": 0.0},
        "notification-svc": {"latency_base": 80, "error_prob": 0.15, "down_prob": 0.02},
    }
    rng = random.Random(seed)
    platform = ObservabilityPlatform(list(profiles))

    for _ in range(num_requests):
        for svc_name, profile in profiles.items():
            # Draw order (latency, request outcome, health check) is part of
            # the seeded output and must not change.
            base = profile["latency_base"]
            sampled = rng.gauss(base, base * 0.3)
            latency = max(1, sampled)
            succeeded = rng.random() > profile["error_prob"]
            platform.record_request(svc_name, latency, succeeded)
            is_up = rng.random() > profile["down_prob"]
            platform.record_health_check(svc_name, is_up)

    platform.evaluate_alerts()
    report = platform.report()
    return report.to_dict()

def main(argv: list[str] | None = None) -> None:
    """Parse CLI options, run the simulation and print the report as JSON.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Level 8 Capstone: Observability Platform")
    for flag, fallback in (("--requests", 200), ("--seed", 42)):
        parser.add_argument(flag, type=int, default=fallback)
    options = parser.parse_args(argv)

    result = run_simulation(num_requests=options.requests, seed=options.seed)
    print(json.dumps(result, indent=2))

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()

Design decisions

Decision	Why	Alternative considered
Raw latency samples stored per service	Enables computing any statistic (mean, percentiles, histograms) after collection	Pre-aggregated averages -- loses distribution shape, hides tail latency
Multi-signal health (error rate + latency + availability)	A service can be unhealthy for different reasons; checking all three prevents blind spots	Single-metric health -- misses problems visible only in other dimensions
Worst-status-wins for overall health	One DOWN service means the platform needs attention regardless of others	Majority voting -- hides critical issues when most services are healthy
Service profiles with configurable error/latency	Realistic simulation where each service has different characteristics	Uniform profiles -- oversimplifies; real systems have heterogeneous services
Unified platform class	Single coordination point for metrics, health checks, and alerts	Separate systems -- harder to get a holistic view of platform health

Alternative approaches

Approach B: Event-driven architecture with pub/sub

from collections import defaultdict
from typing import Callable


class EventBus:
    """Minimal publish/subscribe hub.

    Instead of direct method calls, producers emit events that any number
    of subscribers process. This allows new alert channels to be added
    without modifying the platform.
    """

    def __init__(self) -> None:
        # event type -> handlers, in subscription order
        self._subscribers: dict[str, list[Callable[[dict], object]]] = defaultdict(list)

    def subscribe(self, event_type: str, handler: Callable[[dict], object]) -> None:
        """Register ``handler`` to be called for every ``event_type`` event."""
        self._subscribers[event_type].append(handler)

    def emit(self, event_type: str, data: dict) -> None:
        """Call each handler subscribed to ``event_type`` with ``data``.

        Emitting an event type nobody subscribed to is a no-op.
        """
        # .get() rather than indexing: indexing a defaultdict would insert an
        # empty list for every unknown event type that is ever emitted.
        handlers = self._subscribers.get(event_type, ())
        # Iterate a snapshot so a handler may subscribe during dispatch
        # without mutating the list being iterated.
        for handler in tuple(handlers):
            handler(data)

Trade-off: Event-driven architecture decouples metric collection from alerting, so new alert channels (Slack, PagerDuty, email) can be added without modifying the core platform. The cost is increased complexity and harder debugging, because each event flows through multiple handlers. Use direct method calls for simple systems and an event-driven design for platforms with multiple consumers.

Common pitfalls

Scenario	What happens	Prevention
Service with zero requests	`error_rate` divides by zero	Guard with `if self.request_count else 0.0`
Latency list grows unbounded	Memory grows linearly with request count over long-running simulations	Add a sliding window or periodic aggregation to bound memory usage
Alert fatigue from repeated alerts	`evaluate_alerts()` fires a new alert for every unhealthy service (degraded, critical, or down) on every call	Add hysteresis: only alert on state transitions, not on every evaluation