Solution: Elite Track / Algorithms and Complexity Lab¶
STOP -- Have you attempted this project yourself first?
Learning happens in the struggle, not in reading answers. Spend at least 20 minutes trying before reading this solution. Check the README for requirements and the Walkthrough for guided hints.
Complete solution¶
"""Algorithms and Complexity Lab.
This project is part of the elite extension track.
It intentionally emphasizes explicit, testable engineering decisions.
"""
# WHY deterministic execution? -- Algorithms labs must produce repeatable results
# so learners can compare Big-O predictions against actual run behavior. Any
# randomness would make performance analysis impossible to reproduce.
from __future__ import annotations
import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
def parse_args() -> argparse.Namespace:
"""Parse CLI inputs for deterministic project execution."""
parser = argparse.ArgumentParser(description="Algorithms and Complexity Lab")
# WHY required --input/--output? -- Explicit file paths make every run
# reproducible. No hidden defaults means the learner controls exactly
# where data comes from and where results go.
parser.add_argument("--input", required=True, help="Path to input text data")
parser.add_argument("--output", required=True, help="Path to output JSON summary")
# WHY optional run-id? -- Traceability across repeated benchmark sessions.
# When comparing algorithm variants, the run-id distinguishes results.
parser.add_argument("--run-id", default="manual-run", help="Optional run identifier")
return parser.parse_args()
def load_lines(input_path: Path) -> list[str]:
"""Load normalized input lines and reject empty datasets safely."""
if not input_path.exists():
raise FileNotFoundError(f"input file not found: {input_path}")
# WHY strip whitespace? -- Cross-platform consistency. Windows editors
# add \r\n, macOS uses \n. Stripping normalizes both to the same output
# so test assertions are stable across platforms.
lines = [line.strip() for line in input_path.read_text(encoding="utf-8").splitlines() if line.strip()]
if not lines:
raise ValueError("input file contains no usable lines")
return lines
def classify_line(line: str) -> dict[str, Any]:
"""Transform one CSV-like line into structured fields with validation."""
parts = [piece.strip() for piece in line.split(",")]
# WHY exactly 3 fields? -- Strict schema validation catches corrupt data
# at parse time rather than producing mysterious downstream errors.
if len(parts) != 3:
raise ValueError(f"invalid line format (expected 3 comma fields): {line}")
name, score_raw, severity = parts
score = int(score_raw)
return {
"name": name,
"score": score,
"severity": severity,
# WHY is_high_risk boolean? -- Creates a consistent risk lens for
# downstream summaries. Centralizing the definition of "high risk"
# here means the summary logic does not need to re-derive it.
"is_high_risk": severity in {"warn", "critical"} or score < 5,
}
def build_summary(records: list[dict[str, Any]], project_title: str, run_id: str) -> dict[str, Any]:
"""Build deterministic summary payload for testing and teach-back review."""
high_risk_count = sum(1 for record in records if record["is_high_risk"])
avg_score = round(sum(record["score"] for record in records) / len(records), 2)
return {
"project_title": project_title,
"run_id": run_id,
"generated_utc": datetime.now(timezone.utc).isoformat(),
"record_count": len(records),
"high_risk_count": high_risk_count,
"average_score": avg_score,
"records": records,
}
def write_summary(output_path: Path, payload: dict[str, Any]) -> None:
"""Write JSON output with parent directory creation for first-time runs."""
# WHY mkdir(parents=True)? -- First-time runners may not have the data/
# directory yet. Creating it automatically prevents a confusing
# FileNotFoundError on the very first run.
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
def main() -> int:
"""Execute end-to-end project run."""
args = parse_args()
input_path = Path(args.input)
output_path = Path(args.output)
lines = load_lines(input_path)
records = [classify_line(line) for line in lines]
payload = build_summary(records, "Algorithms and Complexity Lab", args.run_id)
write_summary(output_path, payload)
print(f"output_summary.json written to {output_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())
Design decisions¶
| Decision | Why | Alternative considered |
|---|---|---|
| Deterministic CLI pipeline (input -> transform -> output) | Reproducible runs enable comparing algorithm variants by diffing output files | Interactive REPL -- non-reproducible, cannot be automated in CI |
| Fail-fast validation (FileNotFoundError, ValueError) | Corrupt data caught immediately at the source rather than producing silent wrong results downstream | Lenient parsing with defaults -- hides data quality issues |
| Pure transformation functions | Each function takes input and returns output with no side effects; easy to test, easy to benchmark | Stateful class with mutable state -- harder to isolate for performance measurement |
| JSON output with embedded records | Full traceability: summary stats plus raw data for debugging; diffable across runs | Summary-only output -- loses the ability to inspect individual records |
Alternative approaches¶
Approach B: Timing-instrumented benchmark runner¶
import time
from typing import Callable
def benchmark(fn: Callable, *args, iterations: int = 100) -> dict[str, float]:
"""Measure function execution time across multiple iterations."""
times = []
for _ in range(iterations):
start = time.perf_counter_ns()
fn(*args)
elapsed = time.perf_counter_ns() - start
times.append(elapsed / 1_000_000) # convert to ms
return {
"min_ms": round(min(times), 3),
"max_ms": round(max(times), 3),
"avg_ms": round(sum(times) / len(times), 3),
"iterations": iterations,
}
Trade-off: Adding timing instrumentation lets learners compare theoretical Big-O complexity against measured wall-clock time. However, the current scaffold focuses on data pipeline correctness first. Timing can be layered on top once the pipeline is solid, following the principle of "make it correct, then make it fast."
Common pitfalls¶
| Scenario | What happens | Prevention |
|---|---|---|
| Input file has trailing blank lines | load_lines strips them via the if line.strip() filter, so they are silently ignored |
The filter is the prevention; no action needed |
| Score field contains non-integer text | int(score_raw) raises ValueError with a clear message |
Add a try/except around the int conversion with a descriptive error |
| Output directory does not exist | mkdir(parents=True, exist_ok=True) creates it automatically |
Already handled by the write_summary function |