metrics: add latency histogram statistics

QuerthDP · QuerthDP · commit d989a59fac99 · 2024-12-11T11:50:44.000+01:00
I've added histogram metrics used in cpp-rust-driver.

The snapshot of histogram statistics is taken under concurrency precautions using lock-free histogram features.

I've adjusted the docs book, adding an example of taking the snapshot and accessing its values.
diff --git a/docs/source/metrics/metrics.md b/docs/source/metrics/metrics.md
@@ -1,5 +1,7 @@
 # Driver metrics
 
+This feature is available only under the crate feature `metrics`.
+
 During operation the driver collects various metrics.
 
 They can be accessed at any moment using `Session::get_metrics()`
@@ -11,6 +13,7 @@ They can be accessed at any moment using `Session::get_metrics()`
 * Total number of paged queries
 * Number of errors during paged queries
 * Number of retries
+* Latency histogram statistics (min, max, mean, standard deviation, percentiles)
 
 ### Example
 ```rust
@@ -29,6 +32,18 @@ println!(
     "99.9 latency percentile: {}",
     metrics.get_latency_percentile_ms(99.9).unwrap()
 );
+
+let snapshot = metrics.get_snapshot().unwrap();
+println!("Min: {}", snapshot.min);
+println!("Max: {}", snapshot.max);
+println!("Mean: {}", snapshot.mean);
+println!("Standard deviation: {}", snapshot.stddev);
+println!("Median: {}", snapshot.median);
+println!("75th percentile: {}", snapshot.percentile_75);
+println!("90th percentile: {}", snapshot.percentile_90);
+println!("95th percentile: {}", snapshot.percentile_95);
+println!("99th percentile: {}", snapshot.percentile_99);
+println!("99.9th percentile: {}", snapshot.percentile_99_9);
 # Ok(())
 # }
 ```
diff --git a/examples/basic.rs b/examples/basic.rs
@@ -100,6 +100,18 @@ async fn main() -> Result<()> {
         metrics.get_latency_percentile_ms(99.9).unwrap()
     );
 
+    let snapshot = metrics.get_snapshot().unwrap();
+    println!("Min: {}", snapshot.min);
+    println!("Max: {}", snapshot.max);
+    println!("Mean: {}", snapshot.mean);
+    println!("Standard deviation: {}", snapshot.stddev);
+    println!("Median: {}", snapshot.median);
+    println!("75th percentile: {}", snapshot.percentile_75);
+    println!("90th percentile: {}", snapshot.percentile_90);
+    println!("95th percentile: {}", snapshot.percentile_95);
+    println!("99th percentile: {}", snapshot.percentile_99);
+    println!("99.9th percentile: {}", snapshot.percentile_99_9);
+
     println!("Ok.");
 
     Ok(())
diff --git a/scylla/src/transport/histogram/lock_free_histogram.rs b/scylla/src/transport/histogram/lock_free_histogram.rs
@@ -31,6 +31,23 @@ pub struct Histogram {
     config: Config,
 }
 
+/// Snapshot is a structure that contains histogram statistics such as
+/// min, max, mean, standard deviation, median, and most common percentiles
+/// collected at a certain moment.
+///
+/// All values are expressed in the same unit as the values recorded in the
+/// histogram (presumably milliseconds of latency; confirm against the recording side).
+/// The histogram stores bucket counts only, so every statistic is a bucket bound
+/// rather than an exact recorded value.
+#[derive(Debug)]
+pub struct Snapshot {
+    /// Lower bound of the lowest non-empty bucket.
+    pub min: u64,
+    /// Upper bound of the highest non-empty bucket.
+    pub max: u64,
+    /// Mean computed from bucket lower bounds weighted by their counts,
+    /// truncated to an integer.
+    pub mean: u64,
+    /// Standard deviation computed from bucket lower bounds, truncated to an integer.
+    pub stddev: u64,
+    /// Median as reported by the snapshot computation.
+    pub median: u64,
+    /// Lower bound of the bucket in which the 75th percentile falls.
+    pub percentile_75: u64,
+    /// Lower bound of the bucket in which the 90th percentile falls.
+    pub percentile_90: u64,
+    /// Lower bound of the bucket in which the 95th percentile falls.
+    pub percentile_95: u64,
+    /// Lower bound of the bucket in which the 99th percentile falls.
+    pub percentile_99: u64,
+    /// Lower bound of the bucket in which the 99.9th percentile falls.
+    pub percentile_99_9: u64,
+}
+
 impl Histogram {
     pub fn new() -> Self {
         let grouping_power = 7;
@@ -109,6 +126,95 @@ impl Histogram {
         }
     }
 
+    /// Returns a closure that computes a [`Snapshot`] of the histogram statistics
+    /// from the bucket counters and the histogram configuration.
+    ///
+    /// The histogram only stores per-bucket counts, so every statistic is derived
+    /// from bucket bounds: `min`, `median` and the percentiles use the lower bound of
+    /// the relevant bucket, `max` uses the upper bound of the highest non-empty bucket,
+    /// and `mean`/`stddev` weight each bucket's lower bound by its count.
+    ///
+    /// # Errors
+    ///
+    /// The returned closure yields an error when no value has been recorded, because
+    /// none of the statistics is defined for an empty histogram.
+    pub fn snapshot() -> impl FnOnce(&[AtomicU64], &Config) -> Result<Snapshot, &'static str> {
+        |buckets, config| {
+            let total_count = Histogram::get_total_count(buckets);
+            if total_count == 0 {
+                // The divisions by `total_count` below would panic on an empty histogram.
+                return Err("No data points were logged.");
+            }

+            // Fractions of the total count for: median, p75, p90, p95, p99, p99.9.
+            const QUANTILES: [f64; 6] = [0.5, 0.75, 0.9, 0.95, 0.99, 0.999];
+            // Rank (1-based, in recorded values) at which each quantile is reached.
+            let thresholds = QUANTILES.map(|q| (q * total_count as f64).ceil() as u128);
+            let mut percentiles = [0u64; 6];

+            let mut min = u64::MAX;
+            let mut max = 0u64;
+            let mut weighted_sum = 0u128;
+            let mut pref_sum = 0u128;

+            for (i, bucket) in buckets.iter().enumerate() {
+                let count = bucket.load(ORDER_TYPE) as u128;
+                if count == 0 {
+                    continue;
+                }

+                let lower_bound = config.index_to_lower_bound(i);
+                let upper_bound = config.index_to_upper_bound(i);

+                min = min.min(lower_bound);
+                max = max.max(upper_bound);

+                weighted_sum += count * lower_bound as u128;

+                // A quantile falls into this bucket when the running count crosses its rank.
+                let next_pref_sum = pref_sum + count;
+                for (value, &threshold) in percentiles.iter_mut().zip(&thresholds) {
+                    if pref_sum < threshold && next_pref_sum >= threshold {
+                        *value = lower_bound;
+                    }
+                }

+                pref_sum = next_pref_sum;
+            }

+            let mean = (weighted_sum / total_count) as u64;
+            let mut variance_sum = 0u128;
+            for (i, bucket) in buckets.iter().enumerate() {
+                let count = bucket.load(ORDER_TYPE) as u128;
+                if count == 0 {
+                    continue;
+                }

+                let lower_bound = config.index_to_lower_bound(i);
+                // `abs_diff` avoids the unsigned underflow a plain subtraction would hit
+                // for every bucket that lies below the mean.
+                let deviation = (lower_bound as u128).abs_diff(mean as u128);
+                variance_sum += count * deviation.pow(2);
+            }
+            let variance = variance_sum / total_count;
+            let stddev = (variance as f64).sqrt() as u64;

+            let [median, percentile_75, percentile_90, percentile_95, percentile_99, percentile_99_9] =
+                percentiles;

+            Ok(Snapshot {
+                min,
+                max,
+                mean,
+                stddev,
+                median,
+                percentile_75,
+                percentile_90,
+                percentile_95,
+                percentile_99,
+                percentile_99_9,
+            })
+        }
+    }
+
     /// Returns the total number of recorded values: the sum of all bucket counters,
     /// each loaded with `ORDER_TYPE` and widened to `u128` before summing.
     pub fn get_total_count(buckets: &[AtomicU64]) -> u128 {
         buckets.iter().map(|v| v.load(ORDER_TYPE) as u128).sum()
     }
diff --git a/scylla/src/transport/histogram/mod.rs b/scylla/src/transport/histogram/mod.rs
@@ -3,3 +3,4 @@ mod lock_free_histogram;
 
 pub use config::Config;
 pub use lock_free_histogram::Histogram;
+pub use lock_free_histogram::Snapshot;
diff --git a/scylla/src/transport/metrics.rs b/scylla/src/transport/metrics.rs
@@ -1,4 +1,4 @@
-use crate::transport::histogram::Histogram;
+use crate::transport::histogram::{Histogram, Snapshot};
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::Arc;
 
@@ -97,6 +97,14 @@ impl Metrics {
         Ok(result)
     }
 
+    /// Returns a snapshot of the latency histogram statistics taken at the moment of
+    /// calling this function.
+    ///
+    /// Available statistics: `min`, `max`, `mean`, `stddev`, `median`, `percentile_75`,
+    /// `percentile_90`, `percentile_95`, `percentile_99`, `percentile_99_9`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates, converted into a `MetricsError`, any error reported while the
+    /// snapshot is computed on the histogram.
+    pub fn get_snapshot(&self) -> Result<Snapshot, MetricsError> {
+        let snapshot = self.histogram.log_operation(Histogram::snapshot())?;
+        Ok(snapshot)
+    }
+
     /// Returns counter for errors occurred in nonpaged queries
     pub fn get_errors_num(&self) -> u64 {
         self.errors_num.load(ORDER_TYPE)

Original file line number	Diff line number	Diff line change
`@@ -3,3 +3,4 @@ mod lock_free_histogram;`
`3`	`3`
`4`	`4`	`pub use config::Config;`
`5`	`5`	`pub use lock_free_histogram::Histogram;`
	`6`	`+pub use lock_free_histogram::Snapshot;`