Skip to content

Solution: Level 8 / Project 08 - Fault Injection Harness

STOP -- Have you attempted this project yourself first?

Learning happens in the struggle, not in reading answers. Spend at least 20 minutes trying before reading this solution. If you are stuck, try the Walkthrough first -- it guides your thinking without giving away the answer.

Back to project README


Complete solution

"""Fault Injection Harness -- inject failures for resilience testing."""

from __future__ import annotations

import argparse
import builtins
import functools
import json
import random
import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Callable, Generator

class FaultType(Enum):
    """Kinds of fault a rule can request.

    The string values are what ``HarnessStats.to_dict`` emits in the
    ``"type"`` field of each recorded event.
    """

    # Raise an exception instead of running the wrapped call.
    EXCEPTION = "exception"
    # Sleep for the rule's delay, then let the call proceed.
    DELAY = "delay"
    # Data corruption. NOTE(review): FaultInjector._apply_fault has no branch
    # for this member; corruption is offered separately via corrupt_data().
    CORRUPTION = "corruption"
    # Sleep for the rule's delay, then raise TimeoutError.
    TIMEOUT = "timeout"

# WHY probability-based injection? -- Real failures are stochastic. A 30%
# rate tests intermittent recovery, not just total outages. This follows the
# chaos-engineering idea popularized by Netflix's Chaos Monkey: inject
# failures at random so recovery paths are exercised under realistic conditions.
@dataclass
class FaultConfig:
    """Declarative description of a single fault-injection rule.

    Attributes are validated once, at construction time, so that a bad
    configuration fails where it is written rather than where it fires.

    Raises:
        ValueError: If ``probability`` is not within ``0.0``-``1.0``
            inclusive (NaN is rejected as well).
    """

    # Identifier used to remove the rule and to label recorded events.
    name: str
    # Which kind of fault this rule produces.
    fault_type: FaultType
    # Chance, per intercepted call, that the rule fires (0.0-1.0).
    probability: float
    # Function name the rule applies to; "*" matches every function.
    target_function: str = "*"
    # Sleep duration in seconds for delay-style faults.
    delay_seconds: float = 1.0
    # Name of the exception class associated with the rule.
    exception_class: str = "RuntimeError"
    # Message text embedded in the injected exception.
    exception_message: str = "Injected fault"
    # Disabled rules are kept but never matched.
    enabled: bool = True

    def __post_init__(self) -> None:
        """Reject probabilities outside the closed interval [0.0, 1.0]."""
        chance = self.probability
        # Chained comparison on purpose: it is False for NaN, so NaN raises.
        if 0.0 <= chance <= 1.0:
            return
        raise ValueError(f"Probability must be 0.0-1.0, got {chance}")

@dataclass
class FaultEvent:
    """Record of one fault that was triggered by the harness."""

    # Name of the rule that fired.
    fault_name: str
    # Kind of fault that was applied.
    fault_type: FaultType
    # Name of the function the fault was injected into.
    target: str
    # Defaults to time.monotonic() at construction: useful for ordering and
    # measuring intervals between events, not as a wall-clock time.
    timestamp: float = field(default_factory=time.monotonic)
    # Optional free-form description; empty unless the creator supplies one.
    details: str = ""

@dataclass
class HarnessStats:
    """Running counters and the event log for one injector."""

    # Number of calls that passed through the injector while it was active.
    calls_intercepted: int = 0
    # Number of calls on which some rule fired.
    faults_triggered: int = 0
    # Number of intercepted calls on which no rule fired.
    faults_skipped: int = 0
    # One entry per triggered fault, in the order they occurred.
    events: list[FaultEvent] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly summary of the counters and events.

        ``trigger_rate`` is the percentage of intercepted calls that
        triggered a fault, rounded to one decimal place; it is ``0.0`` when
        nothing has been intercepted, which avoids dividing by zero.
        """
        if self.calls_intercepted > 0:
            ratio = self.faults_triggered / self.calls_intercepted
            rate = round(ratio * 100, 1)
        else:
            rate = 0.0

        event_rows = []
        for event in self.events:
            event_rows.append({
                "fault": event.fault_name,
                "type": event.fault_type.value,
                "target": event.target,
            })

        summary: dict[str, Any] = {}
        summary["calls_intercepted"] = self.calls_intercepted
        summary["faults_triggered"] = self.faults_triggered
        summary["faults_skipped"] = self.faults_skipped
        summary["trigger_rate"] = rate
        summary["events"] = event_rows
        return summary

class FaultInjector:
    """Centralized fault engine: add/remove rules, inject via decorator
    or context manager, track all triggered faults in one place.

    Rules are consulted in the order they were added, and at most one
    fault fires per intercepted call. All randomness comes from a private
    ``random.Random`` so a fixed ``seed`` reproduces the same faults.
    """

    def __init__(self, seed: int | None = None) -> None:
        """Create an active injector with no rules.

        Args:
            seed: Seed for the private RNG. ``None`` leaves seeding to
                ``random.Random``.
        """
        self._rules: list[FaultConfig] = []
        self._stats = HarnessStats()
        # WHY seeded RNG? -- Same seed = same faults = deterministic tests.
        self._rng = random.Random(seed)
        self._active = True

    @property
    def stats(self) -> HarnessStats:
        """The live statistics object (not a copy)."""
        return self._stats

    def add_rule(self, config: FaultConfig) -> None:
        """Append ``config`` to the end of the rule list."""
        self._rules.append(config)

    def remove_rule(self, name: str) -> bool:
        """Remove every rule called ``name``.

        Returns:
            ``True`` if at least one rule was removed, ``False`` otherwise.
        """
        before = len(self._rules)
        self._rules = [r for r in self._rules if r.name != name]
        return len(self._rules) < before

    def enable(self) -> None:
        """Resume intercepting calls."""
        self._active = True

    def disable(self) -> None:
        """Stop intercepting calls; wrapped functions run untouched."""
        self._active = False

    def _matching_rules(self, func_name: str) -> list[FaultConfig]:
        """Return enabled rules targeting ``func_name`` or the ``"*"`` wildcard."""
        return [r for r in self._rules
                if r.enabled and (r.target_function == "*" or r.target_function == func_name)]

    def _should_trigger(self, probability: float) -> bool:
        """Draw once from the private RNG and compare against ``probability``."""
        return self._rng.random() < probability

    @staticmethod
    def _resolve_exception(class_name: str) -> type[Exception]:
        """Map a builtin exception name to its class.

        Anything that is not the name of a builtin ``Exception`` subclass
        (unknown names, non-strings, ``BaseException``-only types such as
        ``KeyboardInterrupt``) falls back to ``RuntimeError``.
        """
        if isinstance(class_name, str):
            candidate = getattr(builtins, class_name, None)
            if isinstance(candidate, type) and issubclass(candidate, Exception):
                return candidate
        return RuntimeError

    def _apply_fault(self, rule: FaultConfig, func_name: str) -> None:
        """Record the fault, then carry it out.

        The event is logged and counted *before* the fault is applied so
        that faults which raise are still visible in the statistics.

        Raises:
            Exception: For ``EXCEPTION`` rules, an instance of the builtin
                exception named by ``rule.exception_class`` (``RuntimeError``
                when the name cannot be used).
            TimeoutError: For ``TIMEOUT`` rules, after sleeping.
        """
        self._stats.events.append(FaultEvent(
            fault_name=rule.name, fault_type=rule.fault_type, target=func_name))
        self._stats.faults_triggered += 1
        if rule.fault_type == FaultType.EXCEPTION:
            message = f"[FAULT:{rule.name}] {rule.exception_message}"
            exc_type = self._resolve_exception(rule.exception_class)
            try:
                error = exc_type(message)
            except TypeError:
                # Some builtins (e.g. UnicodeDecodeError) need more than a
                # message; degrade to the default instead of masking the fault.
                error = RuntimeError(message)
            raise error
        elif rule.fault_type == FaultType.DELAY:
            time.sleep(rule.delay_seconds)
        elif rule.fault_type == FaultType.TIMEOUT:
            time.sleep(rule.delay_seconds)
            raise TimeoutError(f"[FAULT:{rule.name}] Operation timed out")
        # NOTE: FaultType.CORRUPTION has no action here; it is only recorded.

    def check_and_inject(self, func_name: str) -> None:
        """Intercept one call to ``func_name`` and maybe apply a fault.

        Does nothing while the injector is disabled. Otherwise the first
        matching rule whose probability check succeeds is applied and no
        further rules are consulted; if none fires the call counts as
        skipped.
        """
        if not self._active:
            return
        self._stats.calls_intercepted += 1
        for rule in self._matching_rules(func_name):
            if self._should_trigger(rule.probability):
                self._apply_fault(rule, func_name)
                return
        self._stats.faults_skipped += 1

    def inject(self, func: Callable) -> Callable:
        """Decorator: run ``check_and_inject`` before every call to ``func``.

        Rules are matched against ``func.__name__``. The wrapper keeps the
        wrapped function's metadata and exposes it as ``__wrapped__``.
        """
        # Resolve the name once; callables without __name__ (e.g. partials)
        # are matched by their repr rather than failing.
        target_name = getattr(func, "__name__", repr(func))

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            self.check_and_inject(target_name)
            return func(*args, **kwargs)

        return wrapper

    @contextmanager
    def scope(self, rules: list[FaultConfig]) -> Generator[None, None, None]:
        """Temporarily install ``rules`` for the duration of a ``with`` block.

        On exit exactly the rule objects installed here are removed;
        rules added elsewhere are left alone even if they share a name.
        """
        # Materialize once so one-shot iterables can be walked twice.
        scoped = list(rules)
        for rule in scoped:
            self.add_rule(rule)
        # WHY try/finally? -- Rules must be cleaned up even if the block
        # raises, preventing rule leakage into subsequent tests.
        try:
            yield
        finally:
            # Compare by identity: FaultConfig defines __eq__ and is
            # therefore unhashable, and name matching would also delete
            # unrelated permanent rules that happen to share a name.
            scoped_ids = {id(rule) for rule in scoped}
            self._rules = [r for r in self._rules if id(r) not in scoped_ids]

# WHY check bool before int? -- In Python, bool is a subclass of int.
# isinstance(True, int) is True. Without the bool check first,
# True becomes -1 instead of False.
def corrupt_data(data: dict[str, Any], corruption_rate: float = 0.3,
                 rng: random.Random | None = None) -> dict[str, Any]:
    """Return a copy of ``data`` with some values deliberately damaged.

    Each key gets one random draw; when the draw falls below
    ``corruption_rate`` the value is damaged according to its type:
    booleans are flipped, strings reversed, numbers negated and ``None``
    replaced with ``"CORRUPTED"``. Values of any other type are left as
    they are. The input mapping itself is never modified.

    Args:
        data: Mapping to copy and corrupt.
        corruption_rate: Per-key chance of corruption.
        rng: Random source; a fresh unseeded ``random.Random`` when omitted.
    """
    source = rng if rng is not None else random.Random()

    def _damage(original: Any) -> Any:
        # bool must be tested before int: bool is an int subclass.
        if isinstance(original, bool):
            return not original
        if isinstance(original, str):
            return original[::-1]
        if isinstance(original, (int, float)):
            return -original
        if original is None:
            return "CORRUPTED"
        return original

    result: dict[str, Any] = {}
    for name, original in data.items():
        selected = source.random() < corruption_rate
        result[name] = _damage(original) if selected else original
    return result

def run_demo(seed: int = 42) -> dict[str, Any]:
    """Exercise the harness end to end and return a JSON-friendly report.

    Two rules are installed (a 30% exception on ``call_api`` and a 20%
    short delay on ``query_db``), each function is called 20 times, and a
    small record is run through ``corrupt_data``.

    Args:
        seed: Seed for both the injector and the corruption RNG, so the
            whole report is reproducible. The default matches the value
            that used to be hard-coded.

    Returns:
        A dict with the harness statistics, the first ten call outcomes
        and a before/after corruption example.
    """
    injector = FaultInjector(seed=seed)
    injector.add_rule(FaultConfig(name="api-error", fault_type=FaultType.EXCEPTION,
                                  probability=0.3, target_function="call_api",
                                  exception_message="Service unavailable"))
    injector.add_rule(FaultConfig(name="db-delay", fault_type=FaultType.DELAY,
                                  probability=0.2, target_function="query_db",
                                  delay_seconds=0.01))

    @injector.inject
    def call_api(endpoint: str) -> dict:
        return {"status": 200, "endpoint": endpoint}

    @injector.inject
    def query_db(query: str) -> list:
        return [{"id": 1, "data": query}]

    results: list[dict[str, Any]] = []
    for i in range(20):
        try:
            call_api(f"/endpoint/{i}")
            results.append({"call": i, "type": "api", "success": True})
        except RuntimeError:
            results.append({"call": i, "type": "api", "success": False})
        try:
            query_db(f"SELECT * FROM t WHERE id={i}")
            results.append({"call": i, "type": "db", "success": True})
        except (RuntimeError, TimeoutError):
            results.append({"call": i, "type": "db", "success": False})

    original = {"name": "Alice", "score": 95, "active": True}
    corrupted = corrupt_data(original, corruption_rate=0.5, rng=random.Random(seed))
    return {"stats": injector.stats.to_dict(), "results_sample": results[:10],
            "corruption_demo": {"original": original, "corrupted": corrupted}}

def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: run the demo and print its report as JSON.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Fault injection harness")
    parser.add_argument("--seed", type=int, default=42,
                        help="seed for the demo's random number generators")
    args = parser.parse_args(argv)
    # Forward the parsed seed; previously it was parsed and then discarded.
    print(json.dumps(run_demo(seed=args.seed), indent=2))

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Design decisions

| Decision | Why | Alternative considered |
| --- | --- | --- |
| Seeded RNG for reproducibility | Same seed = same faults = deterministic debugging | System random -- realistic but non-reproducible |
| Decorator + context manager APIs | Decorator for permanent injection; context manager for scoped test blocks | Decorator only -- loses temporary scoped injection |
| `bool` checked before `int` in corruption | Python's `bool` subclasses `int`; wrong order corrupts `True` to `-1` | Type dispatch dict -- cleaner but more code for a helper |
| `__post_init__` probability validation | Catches config bugs at creation time, not at injection time | Silent clamping -- hides the error instead of surfacing it |

Alternative approaches

Approach B: Middleware-based fault injection for web apps

class FaultMiddleware:
    """Callable wrapper that injects faults in front of a web application.

    Every request is offered to the injector, keyed by ``request.path``,
    before it is handed to the wrapped application.
    """

    def __init__(self, app, injector: FaultInjector):
        """Remember the wrapped application and the injector to consult."""
        self.app, self.injector = app, injector

    def __call__(self, request):
        """Maybe inject a fault for this route, then delegate to the app.

        Any exception raised by the injector propagates and the wrapped
        application is not called for that request.
        """
        route = request.path
        self.injector.check_and_inject(route)
        response = self.app(request)
        return response

Trade-off: Middleware injection is more realistic for web services (faults at the network boundary) but only works for HTTP-based systems. The decorator approach works for any callable, making it more versatile for unit-level chaos testing.

Common pitfalls

| Scenario | What happens | Prevention |
| --- | --- | --- |
| Probability > 1.0 in `FaultConfig` | `__post_init__` raises `ValueError` immediately -- correct behaviour | Validation catches this; without it, faults fire every time |
| Exception inside `scope()` block | Rules must still be removed to avoid polluting future tests | `try`/`finally` ensures cleanup even on exception |
| Not catching injected exceptions | Unhandled `RuntimeError` crashes the program | Always wrap fault-injected calls in `try`/`except` |