Skip to content

Commit 9219ee1

Browse files
astefanutti and claude committed
Address remaining CodeRabbit review comments

- Guard __main__.py main() with if __name__ check
- Add bounds check for sys.argv in state.py commands
- Log parse failures in discover.py instead of swallowing exceptions
- Replace MD5 with SHA256 for skill hash (discover.py, __main__.py)
- Add language identifiers to fenced code blocks (eval-optimize, eval-run)
- Handle unknown runner with clear error in execute.py
- Remove stale args.settings reference in execute.py
- Handle empty case dirs in score_cases() to prevent ThreadPoolExecutor crash
- Remove extraneous f-string prefix in score.py
- Catch malformed YAML/JSON in workspace.py _read_input()
- Use parsed config path instead of hardcoded eval.yaml in eval-run SKILL.md
- Remove stderr suppression in eval-setup SKILL.md

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0ea6439 commit 9219ee1

9 files changed

Lines changed: 42 additions & 29 deletions

File tree

agent_eval/__main__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def _check_eval_md(path):
9595
for skills_dir in [Path(".claude/skills"), Path("skills")]:
9696
skill_path = skills_dir / skill_name / "SKILL.md"
9797
if skill_path.exists():
98-
current_hash = hashlib.md5(skill_path.read_bytes()).hexdigest()[:12]
98+
current_hash = hashlib.sha256(skill_path.read_bytes()).hexdigest()[:12]
9999
if current_hash == stored_hash:
100100
print("FRESH")
101101
return
@@ -119,7 +119,7 @@ def _write_eval_md(path, skill, data_yaml):
119119
for skills_dir in [Path(".claude/skills"), Path("skills")]:
120120
skill_path = skills_dir / skill / "SKILL.md"
121121
if skill_path.exists():
122-
skill_hash = hashlib.md5(skill_path.read_bytes()).hexdigest()[:12]
122+
skill_hash = hashlib.sha256(skill_path.read_bytes()).hexdigest()[:12]
123123
break
124124

125125
fm = {
@@ -135,4 +135,5 @@ def _write_eval_md(path, skill, data_yaml):
135135
print(f"Written: {path}")
136136

137137

138-
main()
138+
if __name__ == "__main__":
139+
main()

agent_eval/state.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ def main():
4747

4848
cmd = sys.argv[1]
4949

50+
if cmd in ("init", "set", "read", "write-ids", "read-ids") and len(sys.argv) < 3:
51+
print(f"Usage: state.py {cmd} <path> [args]", file=sys.stderr)
52+
sys.exit(1)
53+
5054
if cmd == "init":
5155
path = Path(sys.argv[2])
5256
path.parent.mkdir(parents=True, exist_ok=True)

skills/eval-analyze/scripts/discover.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ def discover_skills():
3232
if content.startswith("---"):
3333
fm = yaml.safe_load(content.split("---")[1])
3434
desc = fm.get("description", "")[:80]
35-
except Exception:
36-
pass
35+
except Exception as e:
36+
print(f" WARNING: failed to parse {path}: {e}", file=sys.stderr)
3737
skills.append({"name": name, "path": path, "description": desc})
3838

3939
if skills:
@@ -103,7 +103,7 @@ def check_eval_md(path="eval.md"):
103103
for skills_dir in [Path(".claude/skills"), Path("skills")]:
104104
skill_path = skills_dir / skill_name / "SKILL.md"
105105
if skill_path.exists():
106-
current_hash = hashlib.md5(skill_path.read_bytes()).hexdigest()[:12]
106+
current_hash = hashlib.sha256(skill_path.read_bytes()).hexdigest()[:12]
107107
if current_hash == stored_hash:
108108
print(f"FRESH: {skill_name} (hash={stored_hash})")
109109
return

skills/eval-optimize/SKILL.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Automated refinement loop for skills based on evaluation feedback.
99

1010
## Usage
1111

12-
```
12+
```text
1313
/eval-optimize [config_file] --model <model> [--max-iterations <N>]
1414
```
1515

@@ -38,7 +38,7 @@ This skill implements the refinement loop from the MLflow skill evaluation metho
3838
3. **Iteration loop** (up to max-iterations):
3939

4040
a. Run eval by invoking the eval-run skill:
41-
```
41+
```text
4242
Use the Skill tool to invoke /eval-run --config <config> --model <model> --run-id <id>-iter-<N> --score
4343
```
4444

@@ -56,7 +56,7 @@ This skill implements the refinement loop from the MLflow skill evaluation metho
5656
- **Important**: the fix must be specific and grounded in the trace evidence, not a generic instruction
5757

5858
e. After fixing, re-run eval to verify:
59-
```
59+
```text
6060
Use the Skill tool to invoke /eval-run --config <config> --model <model> --run-id <id>-iter-<N+1> --score --baseline <id>-iter-<N>
6161
```
6262

skills/eval-run/SKILL.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@ Parse `$ARGUMENTS` for:
2020
- `--no-judge` (skip LLM judges)
2121
- `--gold` (save outputs as gold references after run)
2222

23-
Check if eval.yaml exists:
23+
Check if the config file exists (use the parsed config path, not hardcoded `eval.yaml`):
2424

2525
```bash
26-
test -f eval.yaml && echo "CONFIG_EXISTS" || echo "NO_CONFIG"
26+
test -f <config> && echo "CONFIG_EXISTS" || echo "NO_CONFIG"
2727
```
2828

29-
**If eval.yaml is missing**: invoke eval-analyze to bootstrap:
29+
**If config is missing**: invoke eval-analyze to bootstrap:
3030

31-
```
31+
```text
3232
Use the Skill tool to invoke /eval-analyze [--skill <skill>]
3333
```
3434

@@ -138,7 +138,7 @@ Present a structured report to the user. Be decisive — state assessments, not
138138

139139
If `mlflow_experiment` is configured in eval.yaml:
140140

141-
```
141+
```text
142142
Use the Skill tool to invoke /eval-mlflow --action log-results --run-id <id> --config <config>
143143
```
144144

skills/eval-run/scripts/execute.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ def main():
5353

5454
# Build runner
5555
agent = args.agent or config.runner
56+
if agent not in RUNNERS:
57+
print(f"ERROR: unknown runner '{agent}'. Available: {list(RUNNERS.keys())}",
58+
file=sys.stderr)
59+
sys.exit(1)
5660
runner_cls = RUNNERS[agent]
5761

5862
runner = runner_cls(
@@ -68,13 +72,11 @@ def main():
6872
print(f"Workspace: {args.workspace}", file=sys.stderr)
6973

7074
# Run via the abstraction
71-
settings_path = Path(args.settings) if args.settings else None
7275
result = runner.run_skill(
7376
skill_name=args.skill,
7477
args=args.skill_args,
7578
workspace=Path(args.workspace),
7679
model=args.model,
77-
settings_path=settings_path,
7880
max_budget_usd=args.max_budget,
7981
timeout_s=args.timeout,
8082
)

skills/eval-run/scripts/score.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ def load_judges(config, project_root=None):
122122

123123
def score_cases(judges, case_dirs, config):
124124
"""Score all cases with all judges in parallel."""
125+
if not case_dirs:
126+
return {"per_case": {}, "aggregated": {n: {"values": [], "mean": None, "pass_rate": None} for n, _ in judges}}
125127
per_case = {}
126128
aggregated = {name: {"values": []} for name, _ in judges}
127129
parallelism = min(len(case_dirs), os.cpu_count() or 4)
@@ -499,7 +501,7 @@ def cmd_judges(args):
499501
print(f" [{r.judge_name}] {r.metric}: "
500502
f"{r.baseline_value} -> {r.current_value}")
501503
else:
502-
print(f"\n REGRESSIONS: 0")
504+
print("\n REGRESSIONS: 0")
503505

504506

505507
def cmd_pairwise(args):

skills/eval-run/scripts/workspace.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -125,17 +125,21 @@ def _read_input(case_dir):
125125
Tries .yaml/.yml first, then .json.
126126
"""
127127
for name in sorted(case_dir.iterdir()):
128-
if name.is_file() and name.suffix in (".yaml", ".yml"):
129-
with open(name) as f:
130-
data = yaml.safe_load(f)
131-
if isinstance(data, dict):
132-
return data
133-
elif name.is_file() and name.suffix == ".json":
134-
import json
135-
with open(name) as f:
136-
data = json.load(f)
137-
if isinstance(data, dict):
138-
return data
128+
try:
129+
if name.is_file() and name.suffix in (".yaml", ".yml"):
130+
with open(name) as f:
131+
data = yaml.safe_load(f)
132+
if isinstance(data, dict):
133+
return data
134+
elif name.is_file() and name.suffix == ".json":
135+
import json
136+
with open(name) as f:
137+
data = json.load(f)
138+
if isinstance(data, dict):
139+
return data
140+
except Exception as e:
141+
print(f"WARNING: failed to parse {name}: {e}", file=sys.stderr)
142+
continue
139143
return None
140144

141145

skills/eval-setup/SKILL.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ if config.mlflow_experiment:
104104
print(f'Experiment created: {config.mlflow_experiment}')
105105
else:
106106
print('No mlflow_experiment in eval.yaml, skipping')
107-
" 2>/dev/null || echo "eval.yaml not found, skipping experiment creation"
107+
" || echo "eval.yaml not found or invalid, skipping experiment creation"
108108
```
109109

110110
## Step 7: Final Verification

0 commit comments

Comments (0)