vex_llm/
metrics.rs

1//! Metrics and tracing for VEX
2
3use serde::{Deserialize, Serialize};
4use std::sync::atomic::{AtomicU64, Ordering};
5use std::sync::Arc;
6use std::time::{Duration, Instant};
7
/// Thread-safe set of monotonically increasing counters for VEX activity.
///
/// Every field is an [`AtomicU64`], so a shared reference is enough to record
/// events from any thread. `Default` yields a collector with all counters at
/// zero.
#[derive(Default, Debug)]
pub struct Metrics {
    /// Number of LLM calls recorded (successful or not).
    pub llm_calls: AtomicU64,
    /// Number of recorded LLM calls that were flagged as errors.
    pub llm_errors: AtomicU64,
    /// Sum of the token counts reported with each LLM call.
    pub tokens_used: AtomicU64,
    /// Number of debates recorded.
    pub debates: AtomicU64,
    /// Number of agent creations recorded.
    pub agents_created: AtomicU64,
    /// Number of (adversarial) verifications recorded, passed or failed.
    pub verifications: AtomicU64,
    /// Number of recorded verifications that passed.
    pub verifications_passed: AtomicU64,
    /// Number of audit events recorded.
    pub audit_events: AtomicU64,
}
28
29impl Metrics {
30    /// Create new metrics collector
31    pub fn new() -> Self {
32        Self::default()
33    }
34
35    /// Record an LLM call
36    pub fn record_llm_call(&self, tokens: u64, error: bool) {
37        self.llm_calls.fetch_add(1, Ordering::Relaxed);
38        self.tokens_used.fetch_add(tokens, Ordering::Relaxed);
39        if error {
40            self.llm_errors.fetch_add(1, Ordering::Relaxed);
41        }
42    }
43
44    /// Record a debate
45    pub fn record_debate(&self) {
46        self.debates.fetch_add(1, Ordering::Relaxed);
47    }
48
49    /// Record agent creation
50    pub fn record_agent_created(&self) {
51        self.agents_created.fetch_add(1, Ordering::Relaxed);
52    }
53
54    /// Record verification
55    pub fn record_verification(&self, passed: bool) {
56        self.verifications.fetch_add(1, Ordering::Relaxed);
57        if passed {
58            self.verifications_passed.fetch_add(1, Ordering::Relaxed);
59        }
60    }
61
62    /// Record audit event
63    pub fn record_audit_event(&self) {
64        self.audit_events.fetch_add(1, Ordering::Relaxed);
65    }
66
67    /// Get snapshot of all metrics
68    pub fn snapshot(&self) -> MetricsSnapshot {
69        MetricsSnapshot {
70            llm_calls: self.llm_calls.load(Ordering::Relaxed),
71            llm_errors: self.llm_errors.load(Ordering::Relaxed),
72            tokens_used: self.tokens_used.load(Ordering::Relaxed),
73            debates: self.debates.load(Ordering::Relaxed),
74            agents_created: self.agents_created.load(Ordering::Relaxed),
75            verifications: self.verifications.load(Ordering::Relaxed),
76            verifications_passed: self.verifications_passed.load(Ordering::Relaxed),
77            audit_events: self.audit_events.load(Ordering::Relaxed),
78        }
79    }
80
81    /// Get verification success rate
82    pub fn verification_rate(&self) -> f64 {
83        let total = self.verifications.load(Ordering::Relaxed);
84        let passed = self.verifications_passed.load(Ordering::Relaxed);
85        if total == 0 {
86            0.0
87        } else {
88            passed as f64 / total as f64
89        }
90    }
91
92    /// Get LLM error rate
93    pub fn llm_error_rate(&self) -> f64 {
94        let total = self.llm_calls.load(Ordering::Relaxed);
95        let errors = self.llm_errors.load(Ordering::Relaxed);
96        if total == 0 {
97            0.0
98        } else {
99            errors as f64 / total as f64
100        }
101    }
102}
103
104/// Snapshot of metrics at a point in time
105#[derive(Debug, Clone, Serialize, Deserialize)]
106pub struct MetricsSnapshot {
107    pub llm_calls: u64,
108    pub llm_errors: u64,
109    pub tokens_used: u64,
110    pub debates: u64,
111    pub agents_created: u64,
112    pub verifications: u64,
113    pub verifications_passed: u64,
114    pub audit_events: u64,
115}
116
117impl MetricsSnapshot {
118    /// Export metrics in Prometheus text format
119    pub fn to_prometheus(&self) -> String {
120        let mut output = String::new();
121
122        // LLM metrics
123        output.push_str("# HELP vex_llm_calls_total Total number of LLM API calls\n");
124        output.push_str("# TYPE vex_llm_calls_total counter\n");
125        output.push_str(&format!("vex_llm_calls_total {}\n", self.llm_calls));
126
127        output.push_str("# HELP vex_llm_errors_total Total number of LLM API errors\n");
128        output.push_str("# TYPE vex_llm_errors_total counter\n");
129        output.push_str(&format!("vex_llm_errors_total {}\n", self.llm_errors));
130
131        output.push_str("# HELP vex_tokens_used_total Total tokens consumed by LLM calls\n");
132        output.push_str("# TYPE vex_tokens_used_total counter\n");
133        output.push_str(&format!("vex_tokens_used_total {}\n", self.tokens_used));
134
135        // Agent metrics
136        output.push_str("# HELP vex_agents_created_total Total number of agents created\n");
137        output.push_str("# TYPE vex_agents_created_total counter\n");
138        output.push_str(&format!(
139            "vex_agents_created_total {}\n",
140            self.agents_created
141        ));
142
143        output.push_str("# HELP vex_debates_total Total number of debates conducted\n");
144        output.push_str("# TYPE vex_debates_total counter\n");
145        output.push_str(&format!("vex_debates_total {}\n", self.debates));
146
147        // Verification metrics
148        output.push_str("# HELP vex_verifications_total Total adversarial verifications\n");
149        output.push_str("# TYPE vex_verifications_total counter\n");
150        output.push_str(&format!("vex_verifications_total {}\n", self.verifications));
151
152        output.push_str("# HELP vex_verifications_passed_total Successful verifications\n");
153        output.push_str("# TYPE vex_verifications_passed_total counter\n");
154        output.push_str(&format!(
155            "vex_verifications_passed_total {}\n",
156            self.verifications_passed
157        ));
158
159        // Audit metrics
160        output.push_str("# HELP vex_audit_events_total Total audit events logged\n");
161        output.push_str("# TYPE vex_audit_events_total counter\n");
162        output.push_str(&format!("vex_audit_events_total {}\n", self.audit_events));
163
164        // Derived gauges
165        let error_rate = if self.llm_calls > 0 {
166            self.llm_errors as f64 / self.llm_calls as f64
167        } else {
168            0.0
169        };
170        output.push_str("# HELP vex_llm_error_rate Current LLM error rate\n");
171        output.push_str("# TYPE vex_llm_error_rate gauge\n");
172        output.push_str(&format!("vex_llm_error_rate {:.4}\n", error_rate));
173
174        let verification_rate = if self.verifications > 0 {
175            self.verifications_passed as f64 / self.verifications as f64
176        } else {
177            0.0
178        };
179        output.push_str("# HELP vex_verification_success_rate Verification success rate\n");
180        output.push_str("# TYPE vex_verification_success_rate gauge\n");
181        output.push_str(&format!(
182            "vex_verification_success_rate {:.4}\n",
183            verification_rate
184        ));
185
186        output
187    }
188}
189
/// Named stopwatch that starts when it is created.
///
/// In builds with debug assertions enabled, dropping a timer that has been
/// alive for more than one second prints a `[SLOW]` line to stderr.
#[derive(Debug)]
pub struct Timer {
    /// Moment the timer was created.
    start: Instant,
    /// Label used in the slow-operation message.
    name: String,
}

impl Timer {
    /// Starts a new timer labelled `name`.
    pub fn new(name: &str) -> Self {
        Self {
            start: Instant::now(),
            name: name.to_string(),
        }
    }

    /// Returns the time elapsed since the timer was created.
    pub fn elapsed(&self) -> Duration {
        self.start.elapsed()
    }

    /// Returns the elapsed time in whole milliseconds.
    ///
    /// `Duration::as_millis` yields a `u128`; the value is saturated at
    /// `u64::MAX` rather than silently truncated by the cast.
    pub fn elapsed_ms(&self) -> u64 {
        let millis = self.elapsed().as_millis();
        millis.min(u128::from(u64::MAX)) as u64
    }
}

impl Drop for Timer {
    fn drop(&mut self) {
        // In production, this would emit to a tracing system.
        //
        // `cfg!` (instead of `#[cfg]`) keeps this code type-checked in every
        // profile, so `name` still counts as read in release builds; the
        // branch is a compile-time constant and does nothing there.
        if cfg!(debug_assertions) {
            let elapsed = self.elapsed();
            if elapsed > Duration::from_secs(1) {
                eprintln!("[SLOW] {} took {:?}", self.name, elapsed);
            }
        }
    }
}
225
/// Trace span for structured logging.
///
/// A span records its name, creation time and a list of string attributes.
/// In builds with debug assertions enabled, dropping a span prints a
/// `[TRACE]` line to stderr when it carries at least one attribute or lived
/// longer than 100 ms.
#[derive(Debug)]
pub struct Span {
    /// Label printed in the trace line.
    name: String,
    /// Moment the span was created.
    start: Instant,
    /// Key/value pairs in insertion order; repeated keys are kept as-is.
    attributes: Vec<(String, String)>,
}

impl Span {
    /// Opens a span labelled `name` with no attributes.
    pub fn new(name: &str) -> Self {
        Self {
            name: name.to_string(),
            start: Instant::now(),
            attributes: Vec::new(),
        }
    }

    /// Appends a `key`/`value` attribute to the span.
    pub fn set_attribute(&mut self, key: &str, value: &str) {
        self.attributes.push((key.to_string(), value.to_string()));
    }

    /// Builder-style variant of [`Span::set_attribute`] that returns the span.
    pub fn with_attribute(mut self, key: &str, value: &str) -> Self {
        self.set_attribute(key, value);
        self
    }
}

impl Drop for Span {
    fn drop(&mut self) {
        // In production, this would emit to OpenTelemetry.
        //
        // `cfg!` (instead of `#[cfg]`) keeps this code type-checked in every
        // profile, so the fields still count as read in release builds; the
        // branch is a compile-time constant and does nothing there.
        if cfg!(debug_assertions) {
            let elapsed = self.start.elapsed();
            if !self.attributes.is_empty() || elapsed > Duration::from_millis(100) {
                eprintln!(
                    "[TRACE] {} ({:?}) {:?}",
                    self.name, elapsed, self.attributes
                );
            }
        }
    }
}
268
269/// Global metrics instance
270static GLOBAL_METRICS: std::sync::OnceLock<Arc<Metrics>> = std::sync::OnceLock::new();
271
272/// Get or initialize global metrics
273pub fn global_metrics() -> Arc<Metrics> {
274    GLOBAL_METRICS
275        .get_or_init(|| Arc::new(Metrics::new()))
276        .clone()
277}
278
#[cfg(test)]
mod tests {
    use super::*;

    /// Counters and derived rates reflect exactly what was recorded.
    #[test]
    fn test_metrics() {
        let collector = Metrics::new();

        // Two LLM calls (one failed) and two verifications (one passed).
        collector.record_llm_call(100, false);
        collector.record_llm_call(50, true);
        collector.record_verification(true);
        collector.record_verification(false);

        let MetricsSnapshot {
            llm_calls,
            llm_errors,
            tokens_used,
            verifications,
            verifications_passed,
            ..
        } = collector.snapshot();
        assert_eq!(llm_calls, 2);
        assert_eq!(llm_errors, 1);
        assert_eq!(tokens_used, 150);
        assert_eq!(verifications, 2);
        assert_eq!(verifications_passed, 1);

        assert_eq!(collector.verification_rate(), 0.5);
        assert_eq!(collector.llm_error_rate(), 0.5);
    }

    /// A timer reports at least the time the thread slept for.
    #[test]
    fn test_timer() {
        let stopwatch = Timer::new("test_operation");
        std::thread::sleep(std::time::Duration::from_millis(10));
        let waited_ms = stopwatch.elapsed_ms();
        assert!(waited_ms >= 10);
    }
}