Automated Rubric Improvement¶
Use LLM-driven feedback loops to iteratively refine rubrics until they meet quality standards.
The Scenario¶
You're building an evaluation system for a new domain, but crafting high-quality rubrics from scratch is difficult. Your initial rubrics suffer from common problems: vague language, double-barreled criteria, and generic boilerplate that doesn't capture task-specific quality dimensions.
Rather than manually iterating through rubric revisions—evaluating, identifying issues, rewriting, and re-evaluating—you want to automate this refinement process. The system should:
- Evaluate rubric quality using meta-rubrics
- Extract actionable feedback from the evaluation
- Use an LLM to revise the rubric based on that feedback
- Repeat until no issues remain or a quality threshold is met
This approach transforms rubric design from a tedious manual process into an automated optimization loop.
What You'll Learn¶
- How to build an automated rubric improvement pipeline
- How to extract actionable feedback from meta-rubric evaluations
- How to prompt an LLM to revise rubrics based on feedback
- How to track improvement across iterations
- Best practices for convergence and quality thresholds
The Solution¶
The Improvement Loop¶
The automated improvement process follows a simple feedback loop, shown in the diagram below and sketched in code right after it:
flowchart TD
A[Initial Rubric] --> B[Evaluate with Meta-Rubric]
B --> C{Issues Found?}
C -->|No issues| D[Final Rubric]
C -->|Yes| E[LLM Revises Rubric]
E --> B
style A fill:#e8f4f8,stroke:#5dade2
style D fill:#d5f5e3,stroke:#58d68d
style E fill:#fdebd0,stroke:#f5b041
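In code, the loop body boils down to a handful of lines. This is only a rough sketch, run inside an async function and using the helpers defined in Steps 2 and 3; the full version in Step 4 also saves artifacts at each iteration:
current_rubric = initial_rubric
for _ in range(MAX_ITERATIONS):
    # Evaluate the current rubric against the meta-rubric
    report = await evaluate_rubric_in_context(current_rubric, task_prompt, eval_llm)
    issues = extract_issues(report)  # Step 2: turn verdicts into actionable feedback
    if not issues:
        break  # no issues remain -> this is the final rubric
    # Step 3: ask the LLM to revise the rubric based on the feedback
    current_rubric = await revise_rubric(current_rubric, task_prompt, issues, revision_llm)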
Step 1: Define the Flawed Initial Rubric¶
Start with a rubric that exhibits common anti-patterns. In practice, this might be a first draft or a rubric borrowed from a similar domain:
from autorubric import Rubric
def create_initial_rubric() -> Rubric:
"""A rubric with intentional quality issues."""
return Rubric.from_dict([
# Double-barreled: assesses multiple distinct things
{
"weight": 10,
"requirement": "The response is well-written, accurate, and demonstrates creativity",
},
# Vague wording: "good" is undefined
{
"weight": 8,
"requirement": "The response shows good understanding of the topic",
},
# Hedging language: "may" makes assessment uncertain
{
"weight": 6,
"requirement": "The response may include some relevant examples if appropriate",
},
# Generic boilerplate: could apply to any task
{
"weight": 5,
"requirement": "The response is of high quality",
},
# Overlaps with first criterion
{
"weight": 7,
"requirement": "Writing quality is acceptable and the text flows well",
},
])
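In a real project the first draft often already lives in a JSON file, for example one borrowed from a neighboring domain. Assuming it uses the same weight/requirement fields, you could load it instead of hardcoding the criteria (a small sketch; load_draft_rubric is not part of autorubric):
import json
from pathlib import Path

from autorubric import Rubric

def load_draft_rubric(path: Path) -> Rubric:
    """Load a first-draft rubric from a JSON list of criteria."""
    with open(path, encoding="utf-8") as f:
        criteria = json.load(f)  # [{"weight": ..., "requirement": ...}, ...]
    return Rubric.from_dict(criteria)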
Step 2: Extract Actionable Issues from Evaluation¶
After running meta-rubric evaluation, extract the issues that need fixing:
from autorubric.types import CriterionVerdict
def extract_issues(evaluation_report) -> list[dict]:
"""Extract actionable issues from a meta-rubric evaluation."""
issues = []
for criterion_report in evaluation_report.report:
weight = criterion_report.criterion.weight
verdict = criterion_report.final_verdict
# Issue detected when:
# - Positive criterion is UNMET (quality check failed)
# - Negative criterion is MET (anti-pattern detected)
is_issue = (weight > 0 and verdict == CriterionVerdict.UNMET) or (
weight < 0 and verdict == CriterionVerdict.MET
)
if is_issue:
issues.append({
"criterion": criterion_report.criterion.name or "unnamed",
"is_antipattern": weight < 0,
"feedback": criterion_report.final_reason,
})
return issues
The final_reason field contains detailed, actionable feedback from the LLM judge explaining exactly what's wrong and why.
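Concretely, each extracted issue is a small dictionary that the revision prompt can consume directly. The values below are illustrative, not actual judge output:
example_issue = {
    # Name of the meta-rubric criterion that flagged the problem (hypothetical)
    "criterion": "no-compound-criteria",
    # True because a negative-weight (anti-pattern) criterion was judged MET
    "is_antipattern": True,
    "feedback": (
        "Criterion 1 bundles writing quality, accuracy, and creativity into a "
        "single requirement; split it into separate, independently assessable criteria."
    ),
}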
Step 3: Prompt the LLM to Revise the Rubric¶
Use the extracted issues to construct a revision prompt:
import json
from autorubric import LLMClient, LLMConfig
def format_issues(issues: list[dict]) -> str:
"""Format issues into a prompt section."""
lines = []
for i, issue in enumerate(issues, 1):
issue_type = "ANTI-PATTERN" if issue["is_antipattern"] else "QUALITY ISSUE"
lines.append(f"{i}. [{issue_type}] {issue['criterion']}")
lines.append(f" Feedback: {issue['feedback']}\n")
return "\n".join(lines)
async def revise_rubric(
current_rubric: Rubric,
task_prompt: str,
issues: list[dict],
llm_config: LLMConfig,
) -> Rubric:
"""Use an LLM to revise the rubric based on evaluation feedback."""
current_criteria = json.dumps(
[{"weight": c.weight, "requirement": c.requirement} for c in current_rubric.rubric],
indent=2,
)
system_prompt = (
"You are a rubric design expert specializing in creating clear, "
"task-specific evaluation criteria."
)
user_prompt = f"""Revise this rubric based on quality feedback.
## Task Being Evaluated
{task_prompt}
## Current Rubric
{current_criteria}
## Issues to Fix
{format_issues(issues)}
## Guidelines
1. Fix all identified issues while preserving the original intent
2. Each criterion should assess ONE specific, observable quality
3. Use clear, unambiguous language (no hedging: may, might, could)
4. Avoid generic criteria—tailor to the specific task
5. Ensure criteria are distinct and don't overlap
6. Keep weights between 5 and 15 based on importance
7. Aim for 4-6 focused criteria
Return ONLY a JSON array of criteria with "weight" and "requirement" fields."""
client = LLMClient(llm_config)
response = await client.generate(system_prompt, user_prompt)
# Parse JSON from response
start = response.find("[")
end = response.rfind("]") + 1
criteria_data = json.loads(response[start:end])
return Rubric.from_dict(criteria_data)
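The find/rfind slicing assumes the model returns exactly one well-formed JSON array. If you want to guard against malformed or fenced output, a slightly more defensive parse might look like this sketch (the error handling here is an assumption of this guide, not an autorubric feature):
import json

def parse_criteria_json(response: str) -> list[dict]:
    """Best-effort extraction of a JSON array of criteria from an LLM response."""
    start = response.find("[")
    end = response.rfind("]") + 1
    if start == -1 or end == 0:
        raise ValueError("No JSON array found in LLM response")
    try:
        criteria = json.loads(response[start:end])
    except json.JSONDecodeError as exc:
        raise ValueError(f"Could not parse criteria JSON: {exc}") from exc
    if not isinstance(criteria, list):
        raise ValueError("Expected a JSON array of criteria")
    return criteria
Raising a clear error lets the calling loop retry the revision call or stop with a useful message instead of failing inside Rubric.from_dict.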
Step 4: Run the Improvement Loop¶
Combine the components into an iterative refinement loop:
import asyncio
from pathlib import Path
from autorubric.meta import evaluate_rubric_in_context
MAX_ITERATIONS = 10
async def improve_rubric(
initial_rubric: Rubric,
task_prompt: str,
eval_llm: LLMConfig,
revision_llm: LLMConfig,
output_dir: Path,
) -> tuple[Rubric, list[dict]]:
"""Iteratively improve a rubric until no issues remain."""
current_rubric = initial_rubric
iteration_history = []
# Save initial rubric
save_rubric(current_rubric, output_dir / "rubric-iter-00.json")
for iteration in range(MAX_ITERATIONS):
# Evaluate current rubric
result = await evaluate_rubric_in_context(
current_rubric,
task_prompt,
eval_llm,
display="html",
output_html_path=str(output_dir / f"eval-iter-{iteration:02d}.html"),
)
issues = extract_issues(result)
iteration_history.append({
"iteration": iteration,
"score": result.score,
"issues": len(issues),
})
print(f"Iteration {iteration}: score={result.score:.1%}, issues={len(issues)}")
# Check if we're done
if not issues:
print("No issues remaining—optimization complete!")
break
# Revise the rubric
current_rubric = await revise_rubric(
current_rubric, task_prompt, issues, revision_llm
)
# Save revised rubric
save_rubric(current_rubric, output_dir / f"rubric-iter-{iteration + 1:02d}.json")
return current_rubric, iteration_history
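The save_rubric helper used above simply writes the criteria back out as JSON (the appendix contains the same function):
def save_rubric(rubric: Rubric, path: Path) -> None:
    """Save rubric criteria to a JSON file."""
    criteria = [
        {"weight": c.weight, "requirement": c.requirement} for c in rubric.rubric
    ]
    with open(path, "w", encoding="utf-8") as f:
        json.dump(criteria, f, indent=2)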
Step 5: Configure and Run¶
Use different models for evaluation vs revision—a faster model for evaluation (many parallel criterion assessments) and a stronger model for revision (complex reasoning):
async def main():
# Fast model for evaluation (runs many parallel assessments)
eval_llm = LLMConfig(
model="gemini/gemini-2.5-flash",
temperature=0.0,
thinking="medium",
max_parallel_requests=10,
)
# Stronger model for rubric revision
revision_llm = LLMConfig(
model="gemini/gemini-2.5-pro",
temperature=0.3,
thinking="medium",
)
task_prompt = (
"Write a comprehensive analysis of the environmental impact of electric "
"vehicles compared to traditional gasoline vehicles. Include discussion "
"of manufacturing, operation, and end-of-life considerations."
)
output_dir = Path("rubric_improvement_experiment")
output_dir.mkdir(exist_ok=True)
initial_rubric = create_initial_rubric()
final_rubric, history = await improve_rubric(
initial_rubric, task_prompt, eval_llm, revision_llm, output_dir
)
# Print summary
print("\nImprovement Summary:")
print(f" Initial: {history[0]['score']:.1%} ({history[0]['issues']} issues)")
print(f" Final: {history[-1]['score']:.1%} ({history[-1]['issues']} issues)")
asyncio.run(main())
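If you also want to keep the per-iteration scores and issue counts for later analysis or plotting, you can dump the history as well. This is an optional addition, not something improve_rubric does on its own:
# At the end of main(), after improve_rubric returns
with open(output_dir / "history.json", "w", encoding="utf-8") as f:
    json.dump(history, f, indent=2)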
Example Results¶
Running this pipeline on the flawed rubric above produces dramatic improvement:

The chart shows how the quality score increases from 0% to 100% while detected issues drop from 21 to 0 over 5 iterations.
| Iteration | Score | Issues | Key Changes |
|---|---|---|---|
| 0 (Initial) | 0% | 21 | Generic, vague, double-barreled criteria |
| 1 | 89% | 2 | Task-specific criteria addressing manufacturing, operation, end-of-life |
| 2 | 92% | 1 | Split compound criteria, clarified wording |
| 3 | 95% | 1 | Improved comparative framing |
| 4 (Final) | 100% | 0 | Balanced weights, fully optimized |
Qualitative Transformation¶
The table below shows how specific criteria evolved through the refinement process:
| Aspect | Initial (Iteration 0) | Final (Iteration 4) |
|---|---|---|
| Manufacturing | ❌ Not addressed | ✅ "Compares the environmental impacts of manufacturing electric vehicle batteries with the manufacturing processes of gasoline vehicles" |
| Operation | ❌ Not addressed | ✅ "Contrasts the indirect emissions from electricity generation for EVs against the direct tailpipe emissions of gasoline vehicles" |
| End-of-Life | ❌ Not addressed | ✅ "Compares the environmental implications of EV battery disposal or recycling against the end-of-life processing of gasoline vehicles" |
| Writing Quality | ❌ "well-written, accurate, and demonstrates creativity" (double-barreled, vague) | ✅ "Organizes the content using specific headers for manufacturing, operation, and end-of-life phases" (specific, observable) |
| Overall Quality | ❌ "The response is of high quality" (circular, generic) | ✅ "Provides a concluding assessment that weighs the total environmental footprint of EVs against gasoline vehicles" (task-specific) |
The transformation illustrates three key improvements:
- Generic → Task-specific: Criteria now directly address the EV analysis requirements
- Vague → Observable: "high quality" becomes measurable structural requirements
- Compound → Unidimensional: Multi-aspect criteria split into focused assessments
Key Takeaways¶
- Automate the feedback loop: Manual iteration is slow and inconsistent; LLMs can systematically address identified issues
- Use different models for different tasks: Fast models for evaluation (parallelizable), strong models for revision (complex reasoning)
- Track artifacts: Save rubrics and evaluation reports at each iteration for debugging and audit trails
- Set convergence criteria: Stop when no issues remain or after a maximum number of iterations
- The meta-rubric is your quality oracle: It provides structured, actionable feedback that LLMs can act on
Best Practices¶
Model Selection¶
| Task | Recommended Model | Why |
|---|---|---|
| Evaluation | Fast model (Flash/Mini) | Many parallel criterion assessments |
| Revision | Strong model (Pro/4o) | Complex reasoning about rubric design |
Convergence¶
The loop should terminate when any of the following holds (a combined stop check is sketched after the list):
- No issues detected: All meta-rubric criteria pass
- Score threshold met: e.g., score ≥ 0.95
- Max iterations reached: Prevent infinite loops (typically 5-10)
- Diminishing returns: Score improvement < 1% between iterations
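A combined stop check might look like this sketch; the threshold values are illustrative, and MAX_ITERATIONS is the constant from Step 4:
SCORE_THRESHOLD = 0.95
MIN_IMPROVEMENT = 0.01  # stop once per-iteration score gains fall below 1%

def should_stop(history: list[dict], issues: list[dict]) -> bool:
    """Return True when any convergence criterion above is satisfied."""
    if not issues:
        return True  # no issues detected: all meta-rubric criteria pass
    current = history[-1]["score"]
    if current >= SCORE_THRESHOLD:
        return True  # score threshold met
    if len(history) >= 2 and current - history[-2]["score"] < MIN_IMPROVEMENT:
        return True  # diminishing returns
    return len(history) >= MAX_ITERATIONS  # max iterations reached
In the loop from Step 4, this would replace the bare if not issues: check right after extract_issues.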
Debugging Stuck Loops¶
If the loop doesn't converge:
- Check if the same issues keep appearing (revision prompt may need adjustment)
- Look for conflicting meta-rubric criteria
- Lower the revision LLM temperature for more deterministic outputs
- Add the previous iteration's feedback to the prompt to show what was already tried (see the sketch below)
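One way to implement that last suggestion is to carry the prior issues into the next revision call, for example by appending an extra section to the user prompt. This is a sketch of the idea, not an autorubric feature; it reuses format_issues from Step 3:
def format_previous_attempt(previous_issues: list[dict]) -> str:
    """Describe what the previous revision was already asked to fix."""
    if not previous_issues:
        return ""
    return (
        "\n## Issues Reported Last Iteration (already addressed once; do not reintroduce them)\n"
        + format_issues(previous_issues)
    )
In the loop, keep the previous iteration's issues in a variable and append this section to the revision user prompt before calling the LLM.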
Going Further¶
- Evaluating Rubric Quality - Understanding meta-rubrics in depth
- Extended Thinking - Using thinking models for complex evaluations
- Configuration Management - Sharing optimized rubrics across teams
Appendix: Complete Code¶
See examples/rubric_improvement_demo.py for a complete, runnable implementation.
#!/usr/bin/env python3
"""Automated Rubric Improvement Demo
Iteratively refines a rubric using meta-rubric evaluation feedback.
"""
import asyncio
import json
from datetime import datetime
from pathlib import Path
from autorubric import LLMClient, LLMConfig, Rubric
from autorubric.meta import evaluate_rubric_in_context
from autorubric.types import CriterionVerdict
MAX_ITERATIONS = 10
def create_flawed_rubric() -> Rubric:
"""Create a rubric with intentional quality issues."""
return Rubric.from_dict([
{
"weight": 10,
"requirement": (
"The response is well-written, accurate, and demonstrates creativity"
),
},
{
"weight": 8,
"requirement": "The response shows good understanding of the topic",
},
{
"weight": 6,
"requirement": "The response may include relevant examples if appropriate",
},
{
"weight": 5,
"requirement": "The response is of high quality",
},
{
"weight": 7,
"requirement": "Writing quality is acceptable and the text flows well",
},
])
def extract_issues(report) -> list[dict]:
"""Extract actionable issues from an evaluation report."""
issues = []
for criterion_report in report.report:
weight = criterion_report.criterion.weight
verdict = criterion_report.final_verdict
is_issue = (weight > 0 and verdict == CriterionVerdict.UNMET) or (
weight < 0 and verdict == CriterionVerdict.MET
)
if is_issue:
issues.append({
"criterion": criterion_report.criterion.name or "unnamed",
"is_antipattern": weight < 0,
"feedback": criterion_report.final_reason,
})
return issues
def format_issues(issues: list[dict]) -> str:
"""Format issues for the revision prompt."""
lines = []
for i, issue in enumerate(issues, 1):
issue_type = "ANTI-PATTERN" if issue["is_antipattern"] else "QUALITY ISSUE"
lines.append(f"{i}. [{issue_type}] {issue['criterion']}")
lines.append(f" Feedback: {issue['feedback']}\n")
return "\n".join(lines)
async def revise_rubric(
current_rubric: Rubric,
task_prompt: str,
issues: list[dict],
llm_config: LLMConfig,
) -> Rubric:
"""Use an LLM to revise the rubric based on feedback."""
current_criteria = json.dumps(
[{"weight": c.weight, "requirement": c.requirement} for c in current_rubric.rubric],
indent=2,
)
system_prompt = (
"You are a rubric design expert specializing in creating clear, "
"task-specific evaluation criteria."
)
user_prompt = f"""Revise this rubric based on quality feedback.
## Task Being Evaluated
{task_prompt}
## Current Rubric
{current_criteria}
## Issues to Fix
{format_issues(issues)}
## Guidelines
1. Fix all identified issues while preserving the original intent
2. Each criterion should assess ONE specific, observable quality
3. Use clear, unambiguous language (no hedging: may, might, could)
4. Avoid generic criteria—tailor to the specific task
5. Ensure criteria are distinct and don't overlap
6. Keep weights between 5 and 15 based on importance
7. Aim for 4-6 focused criteria
Return ONLY a JSON array of criteria with "weight" and "requirement" fields."""
client = LLMClient(llm_config)
response = await client.generate(system_prompt, user_prompt)
start = response.find("[")
end = response.rfind("]") + 1
criteria_data = json.loads(response[start:end])
return Rubric.from_dict(criteria_data)
def save_rubric(rubric: Rubric, path: Path) -> None:
"""Save rubric to JSON."""
criteria = [
{"weight": c.weight, "requirement": c.requirement} for c in rubric.rubric
]
with open(path, "w", encoding="utf-8") as f:
json.dump(criteria, f, indent=2)
async def main():
# Create experiment directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
exp_dir = Path(f"rubric_improvement_exp_{timestamp}")
exp_dir.mkdir(exist_ok=True)
eval_llm = LLMConfig(
model="gemini/gemini-2.5-flash",
temperature=0.0,
thinking="medium",
max_parallel_requests=10,
)
revision_llm = LLMConfig(
model="gemini/gemini-2.5-pro",
temperature=0.3,
thinking="medium",
)
task_prompt = (
"Write a comprehensive analysis of the environmental impact of electric "
"vehicles compared to traditional gasoline vehicles. Include discussion "
"of manufacturing, operation, and end-of-life considerations."
)
current_rubric = create_flawed_rubric()
save_rubric(current_rubric, exp_dir / "rubric-iter-00.json")
history = []
for iteration in range(MAX_ITERATIONS):
result = await evaluate_rubric_in_context(
current_rubric,
task_prompt,
eval_llm,
display="html",
output_html_path=str(exp_dir / f"eval-iter-{iteration:02d}.html"),
)
issues = extract_issues(result)
history.append({"iteration": iteration, "score": result.score, "issues": len(issues)})
print(f"Iteration {iteration}: score={result.score:.1%}, issues={len(issues)}")
if not issues:
print("Optimization complete!")
break
current_rubric = await revise_rubric(current_rubric, task_prompt, issues, revision_llm)
save_rubric(current_rubric, exp_dir / f"rubric-iter-{iteration + 1:02d}.json")
# Summary
print(f"\nImprovement: {history[0]['score']:.1%} → {history[-1]['score']:.1%}")
print(f"Issues: {history[0]['issues']} → {history[-1]['issues']}")
print(f"Artifacts: {exp_dir}")
if __name__ == "__main__":
asyncio.run(main())