# llm_with_tracing.py
# Function: A production-ready LLM name wrapper with full observability.
# Each name is traced in Langfuse: enter, output, tokens, value, latency.
#
# Conditions:
# pip set up langfuse anthropic python-dotenv
#
# Setup:
# 1. Create a free account at https://cloud.langfuse.com
# 2. Get your keys from Settings > API Keys
# 3. Create a .env file with the variables under
#
# Run:
# python llm_with_tracing.py
import os
import time
from dotenv import load_dotenv
import anthropic
from langfuse import Langfuse
# Load surroundings variables from .env file
load_dotenv()
# Required surroundings variables in your .env:
# LANGFUSE_PUBLIC_KEY=pk-lf-…
# LANGFUSE_SECRET_KEY=sk-lf-…
# LANGFUSE_HOST=https://cloud.langfuse.com (or your self-hosted URL)
# ANTHROPIC_API_KEY=sk-ant-…
# Initialize shoppers
langfuse_client = Langfuse() # Reads keys routinely from surroundings
anthropic_client = anthropic.Anthropic() # Reads ANTHROPIC_API_KEY from surroundings
# ── Configuration ─────────────────────────────────────────────────────────────
# Retailer your immediate right here, not inline within the API name.
# This makes it versionable and testable independently.
SYSTEM_PROMPT = “”“You’re a useful buyer assist assistant.
Reply questions clearly and concisely.
Should you have no idea one thing, say so instantly — don’t guess.”“”
MODEL = “claude-sonnet-4-20250514”
# Anthropic’s pricing as of mid-2026 (replace when pricing adjustments)
# Used to calculate value per name for value monitoring
COST_PER_INPUT_TOKEN = 3.00 / 1_000_000 # $3.00 per million enter tokens
COST_PER_OUTPUT_TOKEN = 15.00 / 1_000_000 # $15.00 per million output tokens
def call_llm_with_tracing(
user_message: str,
session_id: str = “default-session”,
user_id: str = “nameless”
) -> str:
“”“
Make a traced LLM name. Each name creates a Langfuse hint with:
– Full enter and output
– Token utilization (enter, output, complete)
– Calculated value in USD
– Latency in milliseconds
– Mannequin used and session context
Parameters:
user_message : The message from the person
session_id : Teams associated calls into one dialog in Langfuse
user_id : Associates the decision with a selected person for analytics
Returns:
The LLM response as a string
““”
# Create a top-level hint for this person interplay
# The hint seems within the Langfuse dashboard as one unit of labor
hint = langfuse_client.hint(
title=“customer-support-call”,
session_id=session_id,
user_id=user_id,
enter={“user_message”: user_message, “system_prompt”: SYSTEM_PROMPT}
)
# Create a era span contained in the hint
# This captures model-specific particulars: mannequin title, tokens, value
era = hint.era(
title=“claude-completion”,
mannequin=MODEL,
enter={
“system”: SYSTEM_PROMPT,
“messages”: [{“role”: “user”, “content”: user_message}]
}
)
start_time = time.time()
strive:
# Make the API name
response = anthropic_client.messages.create(
mannequin=MODEL,
max_tokens=1024,
system=SYSTEM_PROMPT,
messages=[{“role”: “user”, “content”: user_message}]
)
latency_ms = int((time.time() – start_time) * 1000)
# Extract the response textual content
response_text = response.content material[0].textual content
# Extract token utilization from the response
input_tokens = response.utilization.input_tokens
output_tokens = response.utilization.output_tokens
total_tokens = input_tokens + output_tokens
# Calculate value for this name
cost_usd = (
input_tokens * COST_PER_INPUT_TOKEN +
output_tokens * COST_PER_OUTPUT_TOKEN
)
# Replace the era span with outcomes
# This information populates the Langfuse value and token dashboards
era.finish(
output=response_text,
utilization={
“enter”: input_tokens,
“output”: output_tokens,
“complete”: total_tokens,
“unit”: “TOKENS”
},
metadata={
“latency_ms”: latency_ms,
“cost_usd”: spherical(cost_usd, 6),
“mannequin”: MODEL
}
)
# Replace the hint with the ultimate output
hint.replace(
output={“response”: response_text},
metadata={“total_cost_usd”: spherical(cost_usd, 6)}
)
# Print a abstract to stdout for native visibility
print(f“n{‘─’ * 60}”)
print(f“Person: {user_message}”)
print(f“Claude: {response_text}”)
print(f“Tokens: {input_tokens} in / {output_tokens} out / {total_tokens} complete”)
print(f“Value: ${cost_usd:.6f}”)
print(f“Latency: {latency_ms}ms”)
print(f“Hint: {langfuse_client.base_url}/hint/{hint.id}”)
print(f“{‘─’ * 60}n”)
return response_text
besides Exception as e:
# Document the error within the hint so it exhibits up in Langfuse
era.finish(
output=None,
metadata={“error”: str(e), “latency_ms”: int((time.time() – start_time) * 1000)}
)
hint.replace(output={“error”: str(e)})
# All the time flush earlier than elevating — ensures the error hint is shipped
langfuse_client.flush()
increase
lastly:
# Flush sends all buffered occasions to Langfuse
# In a long-running service, Langfuse flushes routinely.
# In a script, you have to flush manually earlier than the method exits.
langfuse_client.flush()
# ── Run an indication ────────────────────────────────────────────────────────
if __name__ == “__main__”:
# Simulate two turns of a buyer assist dialog
test_messages = [
“What is your return policy for electronics?”,
“Can I return an item I bought 45 days ago?”
]
session = “demo-session-001”
for i, message in enumerate(test_messages):
print(f“nCall {i + 1}/{len(test_messages)}”)
strive:
call_llm_with_tracing(
user_message=message,
session_id=session,
user_id=“test-user-42”
)
besides Exception as e:
print(f“Error on name {i + 1}: {e}”)
