Skip to content

Commit 9219ee1

Browse files
astefanutti and claude committed
Address remaining CodeRabbit review comments

- Guard __main__.py main() with if __name__ check
- Add bounds check for sys.argv in state.py commands
- Log parse failures in discover.py instead of swallowing exceptions
- Replace MD5 with SHA256 for skill hash (discover.py, __main__.py)
- Add language identifiers to fenced code blocks (eval-optimize, eval-run)
- Handle unknown runner with clear error in execute.py
- Remove stale args.settings reference in execute.py
- Handle empty case dirs in score_cases() to prevent ThreadPoolExecutor crash
- Remove extraneous f-string prefix in score.py
- Catch malformed YAML/JSON in workspace.py _read_input()
- Use parsed config path instead of hardcoded eval.yaml in eval-run SKILL.md
- Remove stderr suppression in eval-setup SKILL.md

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0ea6439 commit 9219ee1

9 files changed

Lines changed: 42 additions & 29 deletions

File tree

agent_eval/__main__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def _check_eval_md(path):
9595
for skills_dir in [Path(".claude/skills"), Path("skills")]:
9696
skill_path = skills_dir / skill_name / "SKILL.md"
9797
if skill_path.exists():
98-
current_hash = hashlib.md5(skill_path.read_bytes()).hexdigest()[:12]
98+
current_hash = hashlib.sha256(skill_path.read_bytes()).hexdigest()[:12]
9999
if current_hash == stored_hash:
100100
print("FRESH")
101101
return
@@ -119,7 +119,7 @@ def _write_eval_md(path, skill, data_yaml):
119119
for skills_dir in [Path(".claude/skills"), Path("skills")]:
120120
skill_path = skills_dir / skill / "SKILL.md"
121121
if skill_path.exists():
122-
skill_hash = hashlib.md5(skill_path.read_bytes()).hexdigest()[:12]
122+
skill_hash = hashlib.sha256(skill_path.read_bytes()).hexdigest()[:12]
123123
break
124124

125125
fm = {
@@ -135,4 +135,5 @@ def _write_eval_md(path, skill, data_yaml):
135135
print(f"Written: {path}")
136136

137137

138-
main()
138+
if __name__ == "__main__":
139+
main()

agent_eval/state.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ def main():
4747

4848
cmd = sys.argv[1]
4949

50+
if cmd in ("init", "set", "read", "write-ids", "read-ids") and len(sys.argv) < 3:
51+
print(f"Usage: state.py {cmd} <path> [args]", file=sys.stderr)
52+
sys.exit(1)
53+
5054
if cmd == "init":
5155
path = Path(sys.argv[2])
5256
path.parent.mkdir(parents=True, exist_ok=True)

skills/eval-analyze/scripts/discover.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ def discover_skills():
3232
if content.startswith("---"):
3333
fm = yaml.safe_load(content.split("---")[1])
3434
desc = fm.get("description", "")[:80]
35-
except Exception:
36-
pass
35+
except Exception as e:
36+
print(f" WARNING: failed to parse {path}: {e}", file=sys.stderr)
3737
skills.append({"name": name, "path": path, "description": desc})
3838

3939
if skills:
@@ -103,7 +103,7 @@ def check_eval_md(path="eval.md"):
103103
for skills_dir in [Path(".claude/skills"), Path("skills")]:
104104
skill_path = skills_dir / skill_name / "SKILL.md"
105105
if skill_path.exists():
106-
current_hash = hashlib.md5(skill_path.read_bytes()).hexdigest()[:12]
106+
current_hash = hashlib.sha256(skill_path.read_bytes()).hexdigest()[:12]
107107
if current_hash == stored_hash:
108108
print(f"FRESH: {skill_name} (hash={stored_hash})")
109109
return

skills/eval-optimize/SKILL.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Automated refinement loop for skills based on evaluation feedback.
99

1010
## Usage
1111

12-
```
12+
```text
1313
/eval-optimize [config_file] --model <model> [--max-iterations <N>]
1414
```
1515

@@ -38,7 +38,7 @@ This skill implements the refinement loop from the MLflow skill evaluation metho
3838
3. **Iteration loop** (up to max-iterations):
3939

4040
a. Run eval by invoking the eval-run skill:
41-
```
41+
```text
4242
Use the Skill tool to invoke /eval-run --config <config> --model <model> --run-id <id>-iter-<N> --score
4343
```
4444

@@ -56,7 +56,7 @@ This skill implements the refinement loop from the MLflow skill evaluation metho
5656
- **Important**: the fix must be specific and grounded in the trace evidence, not a generic instruction
5757

5858
e. After fixing, re-run eval to verify:
59-
```
59+
```text
6060
Use the Skill tool to invoke /eval-run --config <config> --model <model> --run-id <id>-iter-<N+1> --score --baseline <id>-iter-<N>
6161
```
6262

skills/eval-run/SKILL.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@ Parse `$ARGUMENTS` for:
2020
- `--no-judge` (skip LLM judges)
2121
- `--gold` (save outputs as gold references after run)
2222

23-
Check if eval.yaml exists:
23+
Check if the config file exists (use the parsed config path, not hardcoded `eval.yaml`):
2424

2525
```bash
26-
test -f eval.yaml && echo "CONFIG_EXISTS" || echo "NO_CONFIG"
26+
test -f <config> && echo "CONFIG_EXISTS" || echo "NO_CONFIG"
2727
```
2828

29-
**If eval.yaml is missing**: invoke eval-analyze to bootstrap:
29+
**If config is missing**: invoke eval-analyze to bootstrap:
3030

31-
```
31+
```text
3232
Use the Skill tool to invoke /eval-analyze [--skill <skill>]
3333
```
3434

@@ -138,7 +138,7 @@ Present a structured report to the user. Be decisive — state assessments, not
138138

139139
If `mlflow_experiment` is configured in eval.yaml:
140140

141-
```
141+
```text
142142
Use the Skill tool to invoke /eval-mlflow --action log-results --run-id <id> --config <config>
143143
```
144144

skills/eval-run/scripts/execute.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ def main():
5353

5454
# Build runner
5555
agent = args.agent or config.runner
56+
if agent not in RUNNERS:
57+
print(f"ERROR: unknown runner '{agent}'. Available: {list(RUNNERS.keys())}",
58+
file=sys.stderr)
59+
sys.exit(1)
5660
runner_cls = RUNNERS[agent]
5761

5862
runner = runner_cls(
@@ -68,13 +72,11 @@ def main():
6872
print(f"Workspace: {args.workspace}", file=sys.stderr)
6973

7074
# Run via the abstraction
71-
settings_path = Path(args.settings) if args.settings else None
7275
result = runner.run_skill(
7376
skill_name=args.skill,
7477
args=args.skill_args,
7578
workspace=Path(args.workspace),
7679
model=args.model,
77-
settings_path=settings_path,
7880
max_budget_usd=args.max_budget,
7981
timeout_s=args.timeout,
8082
)

skills/eval-run/scripts/score.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ def load_judges(config, project_root=None):
122122

123123
def score_cases(judges, case_dirs, config):
124124
"""Score all cases with all judges in parallel."""
125+
if not case_dirs:
126+
return {"per_case": {}, "aggregated": {n: {"values": [], "mean": None, "pass_rate": None} for n, _ in judges}}
125127
per_case = {}
126128
aggregated = {name: {"values": []} for name, _ in judges}
127129
parallelism = min(len(case_dirs), os.cpu_count() or 4)
@@ -499,7 +501,7 @@ def cmd_judges(args):
499501
print(f" [{r.judge_name}] {r.metric}: "
500502
f"{r.baseline_value} -> {r.current_value}")
501503
else:
502-
print(f"\n REGRESSIONS: 0")
504+
print("\n REGRESSIONS: 0")
503505

504506

505507
def cmd_pairwise(args):

skills/eval-run/scripts/workspace.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -125,17 +125,21 @@ def _read_input(case_dir):
125125
Tries .yaml/.yml first, then .json.
126126
"""
127127
for name in sorted(case_dir.iterdir()):
128-
if name.is_file() and name.suffix in (".yaml", ".yml"):
129-
with open(name) as f:
130-
data = yaml.safe_load(f)
131-
if isinstance(data, dict):
132-
return data
133-
elif name.is_file() and name.suffix == ".json":
134-
import json
135-
with open(name) as f:
136-
data = json.load(f)
137-
if isinstance(data, dict):
138-
return data
128+
try:
129+
if name.is_file() and name.suffix in (".yaml", ".yml"):
130+
with open(name) as f:
131+
data = yaml.safe_load(f)
132+
if isinstance(data, dict):
133+
return data
134+
elif name.is_file() and name.suffix == ".json":
135+
import json
136+
with open(name) as f:
137+
data = json.load(f)
138+
if isinstance(data, dict):
139+
return data
140+
except Exception as e:
141+
print(f"WARNING: failed to parse {name}: {e}", file=sys.stderr)
142+
continue
139143
return None
140144

141145

skills/eval-setup/SKILL.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ if config.mlflow_experiment:
104104
print(f'Experiment created: {config.mlflow_experiment}')
105105
else:
106106
print('No mlflow_experiment in eval.yaml, skipping')
107-
" 2>/dev/null || echo "eval.yaml not found, skipping experiment creation"
107+
" || echo "eval.yaml not found or invalid, skipping experiment creation"
108108
```
109109

110110
## Step 7: Final Verification

0 commit comments

Comments (0)