vex_router/observability/
mod.rs

1//! Observability - Metrics, tracing, and cost tracking
2
3use chrono::Utc;
4use parking_lot::RwLock;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use std::sync::Arc;
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct RequestMetrics {
11    pub request_id: String,
12    pub timestamp: i64,
13    pub model_used: String,
14    pub routing_strategy: String,
15    pub complexity_score: f64,
16    pub tokens_input: u32,
17    pub tokens_output: u32,
18    pub cost_usd: f64,
19    pub latency_ms: u64,
20    pub first_token_ms: Option<u64>,
21    pub cache_hit: bool,
22    pub cache_similarity: Option<f32>,
23    pub compression_ratio: Option<f64>,
24    pub guardrails_passed: bool,
25    pub error: Option<String>,
26}
27
28impl RequestMetrics {
29    pub fn new(request_id: String, model_used: String, routing_strategy: String) -> Self {
30        Self {
31            request_id,
32            timestamp: Utc::now().timestamp(),
33            model_used,
34            routing_strategy,
35            complexity_score: 0.0,
36            tokens_input: 0,
37            tokens_output: 0,
38            cost_usd: 0.0,
39            latency_ms: 0,
40            first_token_ms: None,
41            cache_hit: false,
42            cache_similarity: None,
43            compression_ratio: None,
44            guardrails_passed: true,
45            error: None,
46        }
47    }
48}
49
50#[derive(Debug)]
51pub struct Observability {
52    metrics: Arc<RwLock<Vec<RequestMetrics>>>,
53    daily_stats: Arc<RwLock<DailyStats>>,
54    max_metrics_stored: usize,
55}
56
57impl Observability {
58    pub fn new(max_metrics_stored: usize) -> Self {
59        Self {
60            metrics: Arc::new(RwLock::new(Vec::new())),
61            daily_stats: Arc::new(RwLock::new(DailyStats::new())),
62            max_metrics_stored,
63        }
64    }
65
66    pub fn record(&self, metric: RequestMetrics) {
67        let mut metrics = self.metrics.write();
68
69        if metrics.len() >= self.max_metrics_stored {
70            metrics.remove(0);
71        }
72
73        metrics.push(metric.clone());
74
75        let mut daily = self.daily_stats.write();
76        daily.record(&metric);
77    }
78
79    pub fn get_metrics(&self, limit: usize) -> Vec<RequestMetrics> {
80        let metrics = self.metrics.read();
81        metrics.iter().rev().take(limit).cloned().collect()
82    }
83
84    pub fn get_summary(&self) -> ObservabilitySummary {
85        let metrics = self.metrics.read();
86
87        if metrics.is_empty() {
88            return ObservabilitySummary::default();
89        }
90
91        let total_requests = metrics.len();
92        let total_cost: f64 = metrics.iter().map(|m| m.cost_usd).sum();
93        let total_tokens_input: u64 = metrics.iter().map(|m| m.tokens_input as u64).sum();
94        let total_tokens_output: u64 = metrics.iter().map(|m| m.tokens_output as u64).sum();
95        let cache_hits = metrics.iter().filter(|m| m.cache_hit).count();
96        let errors = metrics.iter().filter(|m| m.error.is_some()).count();
97
98        let mut latencies: Vec<u64> = metrics.iter().map(|m| m.latency_ms).collect();
99        latencies.sort();
100
101        let avg_latency = latencies.iter().sum::<u64>() as f64 / latencies.len() as f64;
102        let p50_latency = latencies[latencies.len() / 2];
103        let p95_latency = latencies[(latencies.len() * 95) / 100];
104        let p99_latency = latencies[(latencies.len() * 99) / 100];
105
106        ObservabilitySummary {
107            total_requests,
108            total_cost_usd: total_cost,
109            total_tokens_input: total_tokens_input as u32,
110            total_tokens_output: total_tokens_output as u32,
111            avg_cost_per_request: total_cost / total_requests as f64,
112            avg_latency_ms: avg_latency,
113            p50_latency_ms: p50_latency,
114            p95_latency_ms: p95_latency,
115            p99_latency_ms: p99_latency,
116            cache_hit_rate: cache_hits as f64 / total_requests as f64,
117            error_rate: errors as f64 / total_requests as f64,
118        }
119    }
120
121    pub fn get_cost_by_model(&self) -> HashMap<String, f64> {
122        let metrics = self.metrics.read();
123        let mut costs: HashMap<String, f64> = HashMap::new();
124
125        for m in metrics.iter() {
126            *costs.entry(m.model_used.clone()).or_insert(0.0) += m.cost_usd;
127        }
128
129        costs
130    }
131
132    pub fn get_savings(&self) -> SavingsReport {
133        let metrics = self.metrics.read();
134
135        let baseline_cost: f64 = metrics
136            .iter()
137            .map(|m| {
138                m.tokens_input as f64 * 15.0 / 1_000_000.0
139                    + m.tokens_output as f64 * 15.0 / 1_000_000.0
140            })
141            .sum();
142
143        let actual_cost: f64 = metrics.iter().map(|m| m.cost_usd).sum();
144
145        let routing_savings = baseline_cost * 0.6;
146        let cache_savings = baseline_cost * metrics.iter().filter(|m| m.cache_hit).count() as f64
147            / metrics.len().max(1) as f64;
148        let compression_savings = baseline_cost
149            * metrics
150                .iter()
151                .filter_map(|m| m.compression_ratio)
152                .sum::<f64>()
153            / metrics.len().max(1) as f64;
154
155        SavingsReport {
156            baseline_cost,
157            actual_cost,
158            total_savings: baseline_cost - actual_cost,
159            savings_percentage: if baseline_cost > 0.0 {
160                (baseline_cost - actual_cost) / baseline_cost * 100.0
161            } else {
162                0.0
163            },
164            routing_savings,
165            cache_savings,
166            compression_savings,
167        }
168    }
169
170    pub fn clear(&self) {
171        let mut metrics = self.metrics.write();
172        metrics.clear();
173
174        let mut daily = self.daily_stats.write();
175        *daily = DailyStats::new();
176    }
177}
178
179impl Default for Observability {
180    fn default() -> Self {
181        Self::new(10000)
182    }
183}
184
185#[derive(Debug, Clone, Serialize, Deserialize, Default, utoipa::ToSchema)]
186pub struct ObservabilitySummary {
187    pub total_requests: usize,
188    pub total_cost_usd: f64,
189    pub total_tokens_input: u32,
190    pub total_tokens_output: u32,
191    pub avg_cost_per_request: f64,
192    pub avg_latency_ms: f64,
193    pub p50_latency_ms: u64,
194    pub p95_latency_ms: u64,
195    pub p99_latency_ms: u64,
196    pub cache_hit_rate: f64,
197    pub error_rate: f64,
198}
199
200#[derive(Debug, Clone, Serialize, Deserialize, Default, utoipa::ToSchema)]
201pub struct SavingsReport {
202    pub baseline_cost: f64,
203    pub actual_cost: f64,
204    pub total_savings: f64,
205    pub savings_percentage: f64,
206    pub routing_savings: f64,
207    pub cache_savings: f64,
208    pub compression_savings: f64,
209}
210
211#[derive(Debug)]
212struct DailyStats {
213    date: String,
214    total_requests: u64,
215    total_cost: f64,
216    total_tokens: u64,
217    errors: u64,
218}
219
220impl DailyStats {
221    fn new() -> Self {
222        Self {
223            date: Utc::now().format("%Y-%m-%d").to_string(),
224            total_requests: 0,
225            total_cost: 0.0,
226            total_tokens: 0,
227            errors: 0,
228        }
229    }
230
231    fn record(&mut self, metric: &RequestMetrics) {
232        let today = Utc::now().format("%Y-%m-%d").to_string();
233
234        if self.date != today {
235            *self = Self::new();
236            self.date = today;
237        }
238
239        self.total_requests += 1;
240        self.total_cost += metric.cost_usd;
241        self.total_tokens += (metric.tokens_input + metric.tokens_output) as u64;
242
243        if metric.error.is_some() {
244            self.errors += 1;
245        }
246    }
247}
248
/// Prices `tokens` at a per-million-token rate.
///
/// `is_output` selects which rate applies: `output_cost_per_million` when
/// `true`, `input_cost_per_million` otherwise. The other rate is ignored.
///
/// Returns `tokens * rate / 1_000_000`.
pub fn calculate_cost(
    tokens: u32,
    input_cost_per_million: f64,
    output_cost_per_million: f64,
    is_output: bool,
) -> f64 {
    const TOKENS_PER_MILLION: f64 = 1_000_000.0;

    let rate_per_million = if is_output {
        output_cost_per_million
    } else {
        input_cost_per_million
    };

    // Multiply before dividing, matching the established rounding behaviour.
    f64::from(tokens) * rate_per_million / TOKENS_PER_MILLION
}