RAG Integration Guide

Build reliable Retrieval-Augmented Generation systems with epistemic uncertainty detection to prevent hallucinations and improve answer quality.

Why RAG Systems Need Uncertainty Detection

RAG systems combine retrieval and generation but face unique challenges that epistemic uncertainty detection can solve:

❌ Without AletheionGuard

  • Confident-sounding hallucinations
  • No way to detect insufficient context
  • Can't identify when retrieval failed
  • No signal for when to retrieve more
  • Blind to model knowledge gaps

✓ With AletheionGuard

  • Detect low-confidence responses
  • Trigger adaptive retrieval (more docs)
  • Identify knowledge gaps (high Q2)
  • Route to human experts when unsure
  • Build trust with transparency

Key Insight: High Q2 (epistemic uncertainty) indicates the model doesn't have enough knowledge to answer confidently - a perfect signal to retrieve more context or escalate.
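
As a minimal sketch of that decision rule (the 0.35 cutoff and the escalate_to_human / retrieve_more_and_retry helpers are hypothetical placeholders, not part of the AletheionGuard API):

def handle_with_uncertainty(auditor, answer, question):
    """Minimal decision rule; threshold and helpers are placeholders."""
    audit = auditor.evaluate(answer, context=question)
    if audit.verdict == "REFUSED":
        return escalate_to_human(question)        # hypothetical helper
    if audit.q2 > 0.35:                           # placeholder threshold; tune per domain
        return retrieve_more_and_retry(question)  # hypothetical helper
    return answer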

Basic RAG Integration

Add epistemic uncertainty auditing to your existing RAG pipeline.

from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.chains import RetrievalQA
from aletheion_guard import EpistemicAuditor


class AuditedRAG:
    """RAG system with epistemic uncertainty auditing."""

    def __init__(self, documents):
        # Initialize auditor
        self.auditor = EpistemicAuditor()

        # Setup vector store
        embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma.from_documents(documents, embeddings)

        # Setup QA chain
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 5})
        )

    def query(self, question: str):
        # Generate answer
        answer = self.qa_chain.invoke(question)["result"]

        # Audit the answer
        audit = self.auditor.evaluate(answer, context=question)

        # Add confidence warning if needed
        if audit.verdict == "REFUSED":
            answer = "I don't have enough information to answer this confidently."
        elif audit.verdict == "MAYBE":
            answer += "\n\n⚠️ Note: This answer has moderate confidence. Please verify."

        return {
            "answer": answer,
            "verdict": audit.verdict,
            "confidence": audit.height,
            "q2": audit.q2  # Epistemic uncertainty
        }

Usage:

rag = AuditedRAG(documents)
result = rag.query("What are the key findings?")
print(f"Answer: {result['answer']}")
print(f"Confidence: {result['confidence']:.2f}")

Adaptive Retrieval

Automatically retrieve more documents when epistemic uncertainty (Q2) is high.

class AdaptiveRAG:
    """RAG with adaptive retrieval based on Q2."""

    def __init__(self, documents, q2_threshold=0.35):
        self.auditor = EpistemicAuditor()
        self.q2_threshold = q2_threshold
        embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma.from_documents(documents, embeddings)
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)

    def query(self, question: str, max_retries=3):
        k = 3  # Start with 3 documents
        attempts = []

        for attempt in range(max_retries):
            # Retrieve documents
            docs = self.vectorstore.similarity_search(question, k=k)
            context = "\n\n".join([d.page_content for d in docs])

            # Generate answer with context
            prompt = f"""Context:
{context}
Question: {question}
Answer based only on the context above:"""
            answer = self.llm.invoke(prompt).content

            # Audit the answer
            audit = self.auditor.evaluate(answer, context=question)
            attempts.append({
                "attempt": attempt + 1,
                "docs_retrieved": k,
                "q2": audit.q2,
                "verdict": audit.verdict
            })

            # Check if Q2 is acceptable
            if audit.q2 < self.q2_threshold:
                # Good confidence - return answer
                return {
                    "answer": answer,
                    "verdict": audit.verdict,
                    "q2": audit.q2,
                    "height": audit.height,
                    "attempts": attempts,
                    "docs_used": k
                }

            # High Q2 - retrieve more documents
            print(f"Attempt {attempt+1}: Q2={audit.q2:.3f} (high). Retrieving more docs...")
            k += 3  # Increase by 3 each time

        # Max retries reached - insufficient information
        return {
            "answer": "I cannot provide a confident answer with the available information.",
            "verdict": "REFUSED",
            "q2": audit.q2,
            "height": audit.height,
            "attempts": attempts,
            "reason": "max_retries_reached"
        }

Adaptive Strategy: Start with fewer documents (faster), and only retrieve more when Q2 indicates the model needs additional context. This optimizes both cost and latency.
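
Usage mirrors the basic example above; the returned attempts list shows how many retrieval rounds were needed (the sample question is illustrative):

rag = AdaptiveRAG(documents, q2_threshold=0.35)
result = rag.query("What were the key revenue drivers last quarter?")

print(f"Answer: {result['answer']}")
print(f"Docs used: {result.get('docs_used', 'n/a')}")
for a in result["attempts"]:
    print(f"  Attempt {a['attempt']}: k={a['docs_retrieved']}, Q2={a['q2']:.3f}, verdict={a['verdict']}")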

Confidence-Based Routing

Route questions to different processing paths based on epistemic uncertainty.

from datetime import datetime


class RoutedRAG:
    """RAG with confidence-based routing."""

    def __init__(self, documents):
        self.auditor = EpistemicAuditor()
        self.rag = AdaptiveRAG(documents)
        self.expert_queue = []  # For human review

    def query(self, question: str, user_id: str):
        # Get RAG answer with audit
        result = self.rag.query(question)

        # Route based on verdict
        if result["verdict"] == "ACCEPT":
            # High confidence - return immediately
            return {
                "answer": result["answer"],
                "source": "rag_auto",
                "confidence": result["height"]
            }
        elif result["verdict"] == "MAYBE":
            # Medium confidence - return with warning
            return {
                "answer": result["answer"],
                "source": "rag_moderate",
                "confidence": result["height"],
                "warning": "This answer has moderate confidence. Consider verifying."
            }
        else:  # REFUSED
            # Low confidence - escalate to human
            self.expert_queue.append({
                "question": question,
                "user_id": user_id,
                "q2": result["q2"],
                "timestamp": datetime.now()
            })
            return {
                "answer": "This question requires expert review. A specialist will respond within 24 hours.",
                "source": "escalated_to_human",
                "confidence": result["height"],
                "ticket_id": len(self.expert_queue)
            }

  • ACCEPT: High confidence → return immediately
  • MAYBE: Medium confidence → return with warning
  • REFUSED: Low confidence → escalate to human
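
A brief usage sketch (the question and user ID are illustrative); each response carries a source field so a frontend can render warnings or escalation notices:

router = RoutedRAG(documents)
response = router.query("Is this medication safe during pregnancy?", user_id="user-123")

if response["source"] == "escalated_to_human":
    print(f"Escalated for expert review (ticket #{response['ticket_id']}).")
else:
    print(f"[{response['source']}] {response['answer']}")
    if "warning" in response:
        print(response["warning"])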

Hybrid Search Strategy

Combine semantic and keyword search, using Q2 to decide search strategy.

from langchain.retrievers import EnsembleRetriever
from langchain.retrievers import BM25Retriever


class HybridRAG:
    def __init__(self, documents):
        self.auditor = EpistemicAuditor()

        # Semantic retriever (vector)
        embeddings = OpenAIEmbeddings()
        vectorstore = Chroma.from_documents(documents, embeddings)
        semantic_retriever = vectorstore.as_retriever()

        # Keyword retriever (BM25)
        keyword_retriever = BM25Retriever.from_documents(documents)

        # Ensemble retriever (combines both)
        self.ensemble_retriever = EnsembleRetriever(
            retrievers=[semantic_retriever, keyword_retriever],
            weights=[0.5, 0.5]
        )
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)

    def query(self, question: str):
        # First pass with balanced semantic/keyword weights
        docs = self.ensemble_retriever.get_relevant_documents(question)[:5]
        context = "\n\n".join([d.page_content for d in docs])
        prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
        answer = self.llm.invoke(prompt).content

        # Audit
        audit = self.auditor.evaluate(answer, context=question)

        if audit.q2 > 0.4:
            # High Q2 - retry with more keyword emphasis and more documents
            self.ensemble_retriever.weights = [0.3, 0.7]  # More keyword
            docs = self.ensemble_retriever.get_relevant_documents(question)[:8]
            context = "\n\n".join([d.page_content for d in docs])
            prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
            answer = self.llm.invoke(prompt).content

            # Re-audit the regenerated answer and restore the default weights
            audit = self.auditor.evaluate(answer, context=question)
            self.ensemble_retriever.weights = [0.5, 0.5]

        return {"answer": answer, "audit": audit}
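
Usage is a single call; inspecting the returned audit object shows whether the keyword-weighted retry was triggered (the query is illustrative):

hybrid = HybridRAG(documents)
result = hybrid.query("What does error code E-1042 refer to?")

print(result["answer"])
print(f"Q2: {result['audit'].q2:.3f}, verdict: {result['audit'].verdict}")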

Citation Validation

Audit individual citations to ensure they're backed by retrieved context.

class CitationRAG:
    """RAG with citation-level auditing."""

    def extract_citations(self, answer: str) -> list:
        """Extract statements that need citations."""
        # Simple sentence split (use a more sophisticated parser in production)
        return [s.strip() for s in answer.split(".") if s.strip()]

    def validate_answer(self, answer: str, context: str):
        """Validate each statement in the answer against the retrieved context."""
        auditor = EpistemicAuditor()
        statements = self.extract_citations(answer)

        validated = []
        for stmt in statements:
            # Audit each statement with context
            audit = auditor.evaluate(stmt, context=context)
            validated.append({
                "statement": stmt,
                "verdict": audit.verdict,
                "q2": audit.q2,
                "supported": audit.verdict == "ACCEPT"
            })

        # Calculate overall support score (guard against empty answers)
        support_score = (
            sum(v["supported"] for v in validated) / len(validated)
            if validated else 0.0
        )
        return {
            "validated_statements": validated,
            "support_score": support_score,
            "fully_supported": support_score == 1.0
        }
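
A usage sketch; the answer string is illustrative and retrieved_context stands in for the concatenated text of your retrieved documents:

retrieved_context = "<concatenated retrieved documents>"  # placeholder

citation_rag = CitationRAG()
report = citation_rag.validate_answer(
    answer="The study enrolled 240 patients. Results were statistically significant.",
    context=retrieved_context
)

print(f"Support score: {report['support_score']:.0%}")
for item in report["validated_statements"]:
    marker = "✓" if item["supported"] else "✗"
    print(f"{marker} {item['statement']} (Q2={item['q2']:.3f})")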

Production-Ready Example

Complete production implementation with logging, metrics, and error handling.

import logging
from datetime import datetime
from typing import Dict, Optional


class ProductionRAG:
    """Production-grade RAG with comprehensive monitoring."""

    def __init__(self, documents, config: Dict):
        self.auditor = EpistemicAuditor()
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Setup RAG components (vector store, retriever, LLM)...
        self.setup_retrieval(documents)

        # Metrics (verdict keys match audit.verdict.lower())
        self.metrics = {
            "total_queries": 0,
            "accept": 0,
            "maybe": 0,
            "refused": 0,
            "avg_q2": 0.0,
            "escalated": 0
        }

    def query(
        self,
        question: str,
        user_id: Optional[str] = None,
        session_id: Optional[str] = None
    ) -> Dict:
        start_time = datetime.now()

        try:
            # Log request
            self.logger.info(f"Query: {question[:100]}...", extra={
                "user_id": user_id,
                "session_id": session_id
            })

            # Generate answer (adaptive_query wraps the AdaptiveRAG loop)
            result = self.adaptive_query(question)

            # Update metrics
            self.metrics["total_queries"] += 1
            self.metrics[result["verdict"].lower()] += 1

            # Running average of Q2
            n = self.metrics["total_queries"]
            self.metrics["avg_q2"] += (result["q2"] - self.metrics["avg_q2"]) / n

            # Calculate latency
            latency = (datetime.now() - start_time).total_seconds()

            # Log result
            self.logger.info(f"Verdict: {result['verdict']}", extra={
                "q2": result["q2"],
                "height": result["height"],
                "latency": latency
            })

            result["metadata"] = {
                "latency_ms": int(latency * 1000),
                "timestamp": start_time.isoformat()
            }
            return result

        except Exception as e:
            self.logger.error(f"Query failed: {e}")
            return {
                "answer": "An error occurred. Please try again.",
                "error": str(e)
            }
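
A usage sketch, assuming setup_retrieval and adaptive_query are implemented with the AdaptiveRAG logic shown earlier; the config keys are placeholders, not a documented schema:

logging.basicConfig(level=logging.INFO)

rag = ProductionRAG(documents, config={"q2_threshold": 0.35, "max_retries": 3})
result = rag.query(
    "Summarize the termination clause of the contract.",
    user_id="user-42",
    session_id="sess-7"
)

print(result.get("answer"))
print(result.get("metadata", {}))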

Best Practices

✓ Do

  • Set Q2 thresholds based on your use case (healthcare: 0.20, general: 0.35); see the threshold sketch after these lists
  • Log all verdicts and Q2 scores for monitoring trends
  • Use adaptive retrieval to optimize cost and latency
  • Provide fallback mechanisms for REFUSED verdicts
  • Monitor the ratio of ACCEPT/MAYBE/REFUSED over time
  • A/B test different retrieval strategies using audit metrics

✗ Don't

  • Don't return REFUSED answers without explanation
  • Don't ignore MAYBE verdicts - they indicate risk
  • Don't retrieve maximum documents by default (wasteful)
  • Don't audit only the final answer - audit citations too
  • Don't use the same thresholds for all domains
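
One way to keep domain thresholds explicit is a small lookup table. The healthcare and general values follow the guidance above; the legal entry is an assumption to calibrate on your own evaluation set:

# Per-domain Q2 thresholds (only healthcare/general come from the guidance above)
Q2_THRESHOLDS = {
    "healthcare": 0.20,  # strictest: refuse or escalate early
    "legal": 0.25,       # assumed value - calibrate on your own data
    "general": 0.35,     # default
}

def q2_threshold_for(domain: str) -> float:
    # Unknown domains fall back to the general threshold
    return Q2_THRESHOLDS.get(domain, Q2_THRESHOLDS["general"])

rag = AdaptiveRAG(documents, q2_threshold=q2_threshold_for("healthcare"))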

Monitoring Metrics

Key metrics to track for your RAG system with epistemic uncertainty.

Verdict Distribution

• ACCEPT: target 70-85%
• MAYBE: target 10-20%
• REFUSED: target <10%

Quality Metrics

• Average Q2 score (lower is better)
• Adaptive retrieval success rate
• Human escalation rate
• User satisfaction by verdict
• Latency by retrieval attempts
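
A minimal sketch for computing the verdict distribution and average Q2 from logged audit records; the audit_log structure (a list of dicts with verdict and q2 keys) is an assumption about your own logging pipeline:

from collections import Counter

def verdict_report(audit_log):
    """audit_log: e.g. [{"verdict": "ACCEPT", "q2": 0.21}, ...]"""
    total = len(audit_log)
    counts = Counter(record["verdict"] for record in audit_log)
    return {
        "total": total,
        "accept_pct": counts["ACCEPT"] / total if total else 0.0,
        "maybe_pct": counts["MAYBE"] / total if total else 0.0,
        "refused_pct": counts["REFUSED"] / total if total else 0.0,
        "avg_q2": sum(r["q2"] for r in audit_log) / total if total else 0.0,
    }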

Next Steps