#!/usr/bin/env python3
"""Generate an HTML report from run_loop.py output.
Takes the JSON output from run_loop.py and generates a visual HTML report
showing each description attempt with a ✓/✗ result for each query.
Distinguishes between train and test queries.
"""
import argparse
import html
import json
import sys
from pathlib import Path
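# Input shape, inferred from the fields read below (run_loop.py is the source
# of truth; only the keys this script accesses are sketched, values are illustrative):
#   {"holdout": 3,
#    "history": [
#      {"iteration": 1, "description": "...",
#       "train_passed": 4, "train_total": 5,   # older outputs: "passed"/"total"
#       "test_passed": 2, "test_total": 3,
#       "train_results": [                     # older outputs: "results"
#         {"query": "...", "should_trigger": true, "pass": true,
#          "runs": 3, "triggers": 3}],
#       "test_results": [...]}]}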
def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
"""Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag."""
history = data.get("history", [])
holdout = data.get("holdout", 0)
title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""
    # Collect the query columns (with should_trigger info) from the first iteration's train and test results
train_queries: list[dict] = []
test_queries: list[dict] = []
if history:
for r in history[0].get("train_results", history[0].get("results", [])):
train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
if history[0].get("test_results"):
for r in history[0].get("test_results", []):
test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
    # Reload the page periodically while the optimization loop is still running
    # (the 5-second interval is an arbitrary choice)
    refresh_tag = '<meta http-equiv="refresh" content="5">\n' if auto_refresh else ""
html_parts = ["""
Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
Query columns: Should trigger Should NOT trigger Train Test
""")
# Table header
html_parts.append("""
Iter
Train
Test
Description
""")
# Add column headers for train queries
for qinfo in train_queries:
polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'<th class="{polarity}">{html.escape(qinfo["query"])}</th>\n')
# Add column headers for test queries (different color)
for qinfo in test_queries:
polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'<th class="{polarity} test-col">{html.escape(qinfo["query"])}</th>\n')
html_parts.append("""
""")
    # Find the best iteration for highlighting; guard against an empty history
    best_iter = None
    if history:
        if test_queries:
            best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration")
        else:
            best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration")
# Add rows for each iteration
for h in history:
iteration = h.get("iteration", "?")
train_passed = h.get("train_passed", h.get("passed", 0))
train_total = h.get("train_total", h.get("total", 0))
test_passed = h.get("test_passed")
test_total = h.get("test_total")
description = h.get("description", "")
train_results = h.get("train_results", h.get("results", []))
test_results = h.get("test_results", [])
# Create lookups for results by query
train_by_query = {r["query"]: r for r in train_results}
test_by_query = {r["query"]: r for r in test_results} if test_results else {}
# Compute aggregate correct/total runs across all retries
def aggregate_runs(results: list[dict]) -> tuple[int, int]:
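            """Count correct runs across all retries: a run is correct when the
            observed behaviour matches should_trigger, e.g. a query with
            should_trigger=False, runs=3, triggers=1 contributes 2 correct runs."""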
correct = 0
total = 0
for r in results:
runs = r.get("runs", 0)
triggers = r.get("triggers", 0)
total += runs
if r.get("should_trigger", True):
correct += triggers
else:
correct += runs - triggers
return correct, total
train_correct, train_runs = aggregate_runs(train_results)
test_correct, test_runs = aggregate_runs(test_results)
# Determine score classes
        def score_class(correct: int, total: int) -> str:
            """Map a pass ratio to a CSS class (zero runs counts as bad)."""
            if total <= 0:
                return "score-bad"
            ratio = correct / total
            if ratio >= 0.8:
                return "score-good"
            if ratio >= 0.5:
                return "score-ok"
            return "score-bad"
train_class = score_class(train_correct, train_runs)
test_class = score_class(test_correct, test_runs)
row_class = "best-row" if iteration == best_iter else ""
html_parts.append(f"""
{iteration}
{train_correct}/{train_runs}
{test_correct}/{test_runs}
{html.escape(description)}
""")
# Add result for each train query
for qinfo in train_queries:
r = train_by_query.get(qinfo["query"], {})
did_pass = r.get("pass", False)
triggers = r.get("triggers", 0)
runs = r.get("runs", 0)
icon = "✓" if did_pass else "✗"
css_class = "pass" if did_pass else "fail"
            html_parts.append(f'<td class="{css_class}">{icon} {triggers}/{runs}</td>\n')
# Add result for each test query (with different background)
for qinfo in test_queries:
r = test_by_query.get(qinfo["query"], {})
did_pass = r.get("pass", False)
triggers = r.get("triggers", 0)
runs = r.get("runs", 0)
icon = "✓" if did_pass else "✗"
css_class = "pass" if did_pass else "fail"
            html_parts.append(f'<td class="{css_class} test-col">{icon} {triggers}/{runs}</td>\n')
html_parts.append("
\n")
html_parts.append("""
""")
html_parts.append("""
""")
return "".join(html_parts)
def main():
parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
args = parser.parse_args()
if args.input == "-":
data = json.load(sys.stdin)
else:
data = json.loads(Path(args.input).read_text())
html_output = generate_html(data, skill_name=args.skill_name)
if args.output:
Path(args.output).write_text(html_output)
print(f"Report written to {args.output}", file=sys.stderr)
else:
print(html_output)
if __name__ == "__main__":
main()