Model Comparison Guide

Compare multiple LLM outputs using epistemic uncertainty to identify the most reliable answer and rank model performance objectively.

Why Compare Model Outputs?

When using multiple LLMs, how do you choose the best answer? AletheionGuard provides objective uncertainty metrics to rank and select the most reliable response.

Traditional Approach

  • ❌ Manual evaluation (slow, subjective)
  • ❌ Majority voting (no confidence scores)
  • ❌ Cost-based selection (cheapest ≠ best)
  • ❌ Model preference bias
  • ❌ No uncertainty quantification

With AletheionGuard

  • ✓ Objective uncertainty metrics
  • ✓ Automatic ranking by confidence
  • ✓ Identify consensus vs disagreement
  • ✓ Cost-aware selection
  • ✓ Transparent decision-making

Basic Model Comparison

Compare outputs from multiple models and rank them by epistemic uncertainty.

from openai import OpenAI
from anthropic import Anthropic
from aletheion_guard import EpistemicAuditor


def compare_models(question: str):
    """Compare multiple model outputs."""
    openai_client = OpenAI()
    anthropic_client = Anthropic()
    auditor = EpistemicAuditor()

    # Get responses from different models
    responses = {}

    # GPT-4
    gpt4_response = openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": question}]
    )
    responses["gpt-4"] = gpt4_response.choices[0].message.content

    # GPT-3.5-turbo
    gpt35_response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": question}]
    )
    responses["gpt-3.5-turbo"] = gpt35_response.choices[0].message.content

    # Claude
    claude_response = anthropic_client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1024,
        messages=[{"role": "user", "content": question}]
    )
    responses["claude-3-opus"] = claude_response.content[0].text

    # Audit each response
    results = []
    for model, answer in responses.items():
        audit = auditor.evaluate(answer, context=question)
        results.append({
            "model": model,
            "answer": answer,
            "verdict": audit.verdict,
            "q1": audit.q1,
            "q2": audit.q2,
            "height": audit.height
        })

    # Sort by height (confidence) descending
    results.sort(key=lambda x: x["height"], reverse=True)
    return results


# Usage
results = compare_models("Explain quantum entanglement")
print(f"Best answer from: {results[0]['model']}")
print(f"Confidence: {results[0]['height']:.2f}")

Example Output:

1. claude-3-opus: height=0.89 (ACCEPT)
2. gpt-4: height=0.82 (ACCEPT)
3. gpt-3.5-turbo: height=0.71 (MAYBE)

Consensus Detection

Identify when models agree (consensus) or disagree, using the spread of confidence (height) and epistemic uncertainty (Q2) scores across models.

import numpy as np


def analyze_consensus(results: list) -> dict:
    """Analyze model consensus using uncertainty metrics."""
    heights = [r["height"] for r in results]
    q2_scores = [r["q2"] for r in results]

    # Calculate statistics
    height_mean = np.mean(heights)
    height_std = np.std(heights)
    q2_mean = np.mean(q2_scores)
    q2_std = np.std(q2_scores)

    # Determine consensus level
    if height_std < 0.1 and q2_std < 0.1:
        consensus = "strong"    # Models agree
    elif height_std < 0.2 and q2_std < 0.2:
        consensus = "moderate"  # Some agreement
    else:
        consensus = "weak"      # Models disagree

    return {
        "consensus": consensus,
        "avg_confidence": height_mean,
        "confidence_variance": height_std,
        "avg_epistemic_uncertainty": q2_mean,
        "uncertainty_variance": q2_std,
        "recommendation": {
            "strong": "High agreement - trust the top answer",
            "moderate": "Some disagreement - review top 2 answers",
            "weak": "Low agreement - consult human expert"
        }[consensus]
    }

Cost-Aware Model Selection

Balance cost and quality by selecting the cheapest model that meets your confidence threshold.

class CostAwareModelSelector:
    """Select optimal model based on cost and confidence."""

    def __init__(self):
        self.auditor = EpistemicAuditor()
        # Cost per 1K tokens (example prices)
        self.model_costs = {
            "gpt-3.5-turbo": 0.002,
            "gpt-4": 0.03,
            "claude-3-haiku": 0.0025,
            "claude-3-opus": 0.015
        }

    def select_model(self, question: str, min_height=0.7):
        """Try models from cheapest to most expensive."""
        # Sort models by cost
        models_by_cost = sorted(
            self.model_costs.items(),
            key=lambda x: x[1]
        )

        for model, cost in models_by_cost:
            # Generate answer
            answer = self._call_model(model, question)

            # Audit
            audit = self.auditor.evaluate(answer, context=question)

            # Check if it meets the threshold
            if audit.height >= min_height:
                return {
                    "model": model,
                    "answer": answer,
                    "height": audit.height,
                    "cost": cost,
                    "verdict": audit.verdict
                }

        # No model met the threshold
        return {
            "model": None,
            "answer": "Unable to provide confident answer",
            "error": "no_model_met_threshold"
        }

Cost Optimization: Start with cheap models (gpt-3.5-turbo) and escalate to expensive ones (gpt-4) only when the cheaper answer falls below your confidence threshold. This can reduce costs by 50-80% while maintaining quality.
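
Note that _call_model in the class above is a placeholder the guide leaves undefined; a provider dispatcher along the lines of the call_model sketch after the Ensemble Strategy example works. With that in place, usage might look like this sketch (the question and threshold are illustrative):

selector = CostAwareModelSelector()

result = selector.select_model(
    "Explain the difference between TCP and UDP",
    min_height=0.75
)

if result["model"] is not None:
    print(f"Answered by {result['model']} (${result['cost']}/1K tokens)")
    print(f"Confidence: {result['height']:.2f}")
else:
    print("No model met the threshold - escalate to a human reviewer.")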

Ensemble Strategy

Combine multiple model outputs weighted by their confidence scores.

def ensemble_models(question: str, models: list):
    """Ensemble approach: use all models, weighted by confidence."""
    auditor = EpistemicAuditor()

    # Get all responses with audits
    results = []
    for model in models:
        answer = call_model(model, question)
        audit = auditor.evaluate(answer, context=question)
        results.append({
            "model": model,
            "answer": answer,
            "height": audit.height,
            "verdict": audit.verdict
        })

    # Filter only ACCEPT verdicts
    accepted = [r for r in results if r["verdict"] == "ACCEPT"]
    if not accepted:
        return {"answer": "No confident answers available"}

    # Sort by confidence so the first entry is the strongest answer
    accepted.sort(key=lambda r: r["height"], reverse=True)

    # Calculate normalized confidence weights
    total_height = sum(r["height"] for r in accepted)
    weights = [r["height"] / total_height for r in accepted]

    # Return the top answer if it clearly dominates, otherwise synthesize
    if weights[0] > 0.6:  # Clear winner
        return accepted[0]
    else:
        # Synthesize from the top answers
        return {
            "answer": synthesize_answers(accepted, weights),
            "sources": [r["model"] for r in accepted]
        }
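
The ensemble example above and the A/B testing and routing examples below all call a call_model helper that this guide does not define (synthesize_answers is likewise a placeholder, typically a follow-up LLM call that merges the accepted answers). One possible sketch of the dispatcher, assuming the OpenAI and Anthropic clients from the first example and treating any model name starting with "claude" as an Anthropic model:

from openai import OpenAI
from anthropic import Anthropic

openai_client = OpenAI()
anthropic_client = Anthropic()


def call_model(model: str, question: str) -> str:
    """Dispatch a single-turn question to the matching provider client."""
    messages = [{"role": "user", "content": question}]

    if model.startswith("claude"):
        # NOTE: short aliases like "claude-3-opus" may need mapping to full
        # provider model IDs (e.g. "claude-3-opus-20240229").
        response = anthropic_client.messages.create(
            model=model,
            max_tokens=1024,
            messages=messages
        )
        return response.content[0].text

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages
    )
    return response.choices[0].message.content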

A/B Testing Models

Use epistemic uncertainty metrics to evaluate and compare model performance over time.

class ModelABTest:
    """A/B test models using audit metrics."""

    def __init__(self, model_a: str, model_b: str):
        self.model_a = model_a
        self.model_b = model_b
        self.auditor = EpistemicAuditor()
        self.metrics = {
            model_a: {"heights": [], "q2s": [], "verdicts": []},
            model_b: {"heights": [], "q2s": [], "verdicts": []}
        }

    def run_test(self, questions: list):
        """Run A/B test on a list of questions."""
        for question in questions:
            # Get responses from both models
            answer_a = call_model(self.model_a, question)
            answer_b = call_model(self.model_b, question)

            # Audit both
            audit_a = self.auditor.evaluate(answer_a, context=question)
            audit_b = self.auditor.evaluate(answer_b, context=question)

            # Track metrics
            self.metrics[self.model_a]["heights"].append(audit_a.height)
            self.metrics[self.model_a]["q2s"].append(audit_a.q2)
            self.metrics[self.model_a]["verdicts"].append(audit_a.verdict)

            self.metrics[self.model_b]["heights"].append(audit_b.height)
            self.metrics[self.model_b]["q2s"].append(audit_b.q2)
            self.metrics[self.model_b]["verdicts"].append(audit_b.verdict)

    def analyze_results(self):
        """Analyze and compare model performance."""
        results = {}
        for model in [self.model_a, self.model_b]:
            metrics = self.metrics[model]
            results[model] = {
                "avg_height": np.mean(metrics["heights"]),
                "avg_q2": np.mean(metrics["q2s"]),
                "accept_rate": metrics["verdicts"].count("ACCEPT") / len(metrics["verdicts"]),
                "refuse_rate": metrics["verdicts"].count("REFUSED") / len(metrics["verdicts"])
            }

        # Determine winner by average confidence
        winner = (
            self.model_a
            if results[self.model_a]["avg_height"] > results[self.model_b]["avg_height"]
            else self.model_b
        )
        return {"results": results, "winner": winner}

Example Results:

Model A (gpt-4):
  • Avg Height: 0.85
  • Avg Q2: 0.12
  • Accept Rate: 78%

Model B (claude-3-opus):
  • Avg Height: 0.88
  • Avg Q2: 0.09
  • Accept Rate: 84%

Winner: claude-3-opus

Intelligent Routing

Route questions to different models based on their strengths and question characteristics.

class IntelligentRouter:
    """Route questions to optimal models."""

    def __init__(self):
        self.auditor = EpistemicAuditor()
        # Model specializations (from A/B testing)
        self.specializations = {
            "gpt-4": ["coding", "analysis"],
            "claude-3-opus": ["creative", "writing"],
            "gpt-3.5-turbo": ["factual", "simple"]
        }

    def classify_question(self, question: str) -> str:
        """Classify question type."""
        keywords = {
            "coding": ["code", "function", "debug", "error"],
            "creative": ["write", "story", "poem", "creative"],
            "factual": ["what", "when", "who", "define"]
        }

        # Simple keyword matching (use an ML classifier in production)
        for category, words in keywords.items():
            if any(w in question.lower() for w in words):
                return category
        return "general"

    def route(self, question: str):
        # Classify the question
        question_type = self.classify_question(question)

        # Find a matching model
        for model, specialties in self.specializations.items():
            if question_type in specialties:
                answer = call_model(model, question)
                audit = self.auditor.evaluate(answer, context=question)
                return {
                    "model": model,
                    "answer": answer,
                    "audit": audit,
                    "reason": f"Specialized in {question_type}"
                }

        # Fallback to a general model
        return self.route_to_default(question)
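
The base class expects a route_to_default method that is not defined above. A short usage sketch, assuming the call_model helper from earlier and adding a hypothetical fallback that sends general questions to a cheap model:

class RouterWithDefault(IntelligentRouter):
    """Adds the fallback route_to_default that the base class expects."""

    def route_to_default(self, question: str):
        # Hypothetical fallback: route general questions to a cheap model.
        answer = call_model("gpt-3.5-turbo", question)
        audit = self.auditor.evaluate(answer, context=question)
        return {
            "model": "gpt-3.5-turbo",
            "answer": answer,
            "audit": audit,
            "reason": "General fallback"
        }


router = RouterWithDefault()
result = router.route("Debug this function: it raises a KeyError")
print(f"Routed to {result['model']} because: {result['reason']}")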

Best Practices

✓ Do

  • Compare at least 2-3 models for critical questions
  • Use cost-aware selection for high-volume workloads
  • Track consensus levels to identify controversial topics
  • A/B test models regularly with real user questions
  • Cache comparison results to save costs (see the sketch after these lists)
  • Use ensemble when models have similar confidence

✗ Don't

  • Don't always use the most expensive model
  • Don't ignore weak consensus signals
  • Don't compare models without context
  • Don't use simple majority voting
  • Don't forget to audit the comparison itself
  • Don't optimize for cost alone
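
For the caching tip in the Do list, a simple in-memory memoization keyed by a hash of the question is often enough. A minimal sketch using the compare_models function from earlier (the cache dictionary and helper name are illustrative):

import hashlib

_comparison_cache: dict = {}


def cached_compare(question: str):
    """Memoize compare_models results by a hash of the question text."""
    key = hashlib.sha256(question.encode("utf-8")).hexdigest()
    if key not in _comparison_cache:
        _comparison_cache[key] = compare_models(question)
    return _comparison_cache[key]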

Comparison Metrics

Key metrics to evaluate and compare model performance.

Metric            Description                          Good Value
Avg Height        Average confidence across queries    > 0.75
Avg Q2            Average epistemic uncertainty        < 0.25
Accept Rate       % of ACCEPT verdicts                 > 70%
Refuse Rate       % of REFUSED verdicts                < 10%
Consensus Score   Agreement between models             > 0.8
Cost per Query    Average cost to answer               Minimize
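
If you keep the raw audit objects for a batch of queries, the first four metrics can be derived directly; consensus and cost are covered by the earlier snippets. A minimal sketch, assuming each audit exposes height, q2, and verdict as in the examples above:

def comparison_metrics(audits: list) -> dict:
    """Aggregate a batch of audit results into the metrics above."""
    n = len(audits)
    if n == 0:
        return {}
    return {
        "avg_height": sum(a.height for a in audits) / n,
        "avg_q2": sum(a.q2 for a in audits) / n,
        "accept_rate": sum(a.verdict == "ACCEPT" for a in audits) / n,
        "refuse_rate": sum(a.verdict == "REFUSED" for a in audits) / n,
    }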

Next Steps