001package io.prometheus.client; 002 003import io.prometheus.client.CKMSQuantiles.Quantile; 004 005import java.io.Closeable; 006import java.util.ArrayList; 007import java.util.Collections; 008import java.util.List; 009import java.util.Map; 010import java.util.SortedMap; 011import java.util.TreeMap; 012import java.util.concurrent.Callable; 013import java.util.concurrent.TimeUnit; 014 015/** 016 * {@link Summary} metrics and {@link Histogram} metrics can both be used to monitor distributions like latencies or request sizes. 017 * <p> 018 * An overview of when to use Summaries and when to use Histograms can be found on <a href="https://prometheus.io/docs/practices/histograms">https://prometheus.io/docs/practices/histograms</a>. 019 * <p> 020 * The following example shows how to measure latencies and request sizes: 021 * 022 * <pre> 023 * class YourClass { 024 * 025 * private static final Summary requestLatency = Summary.build() 026 * .name("requests_latency_seconds") 027 * .help("request latency in seconds") 028 * .register(); 029 * 030 * private static final Summary receivedBytes = Summary.build() 031 * .name("requests_size_bytes") 032 * .help("request size in bytes") 033 * .register(); 034 * 035 * public void processRequest(Request req) { 036 * Summary.Timer requestTimer = requestLatency.startTimer(); 037 * try { 038 * // Your code here. 039 * } finally { 040 * requestTimer.observeDuration(); 041 * receivedBytes.observe(req.size()); 042 * } 043 * } 044 * } 045 * </pre> 046 * 047 * The {@link Summary} class provides different utility methods for observing values, like {@link #observe(double)}, 048 * {@link #startTimer()} and {@link Timer#observeDuration()}, {@link #time(Callable)}, etc. 049 * <p> 050 * By default, {@link Summary} metrics provide the {@code count} and the {@code sum}. For example, if you measure 051 * latencies of a REST service, the {@code count} will tell you how often the REST service was called, 052 * and the {@code sum} will tell you the total aggregated response time. 053 * You can calculate the average response time using a Prometheus query dividing {@code sum / count}. 054 * <p> 055 * In addition to {@code count} and {@code sum}, you can configure a Summary to provide quantiles: 056 * 057 * <pre> 058 * Summary requestLatency = Summary.build() 059 * .name("requests_latency_seconds") 060 * .help("Request latency in seconds.") 061 * .quantile(0.5, 0.01) // 0.5 quantile (median) with 0.01 allowed error 062 * .quantile(0.95, 0.005) // 0.95 quantile with 0.005 allowed error 063 * // ... 064 * .register(); 065 * </pre> 066 * 067 * As an example, a 0.95 quantile of 120ms tells you that 95% of the calls were faster than 120ms, and 5% of the calls were slower than 120ms. 068 * <p> 069 * Tracking exact quantiles require a large amount of memory, because all observations need to be stored in a sorted list. Therefore, we allow an error to significantly reduce memory usage. 070 * <p> 071 * In the example, the allowed error of 0.005 means that you will not get the exact 0.95 quantile, but anything between the 0.945 quantile and the 0.955 quantile. 072 * <p> 073 * Experiments show that the {@link Summary} typically needs to keep less than 100 samples to provide that precision, even if you have hundreds of millions of observations. 074 * <p> 075 * There are a few special cases: 076 * 077 * <ul> 078 * <li>You can set an allowed error of 0, but then the {@link Summary} will keep all observations in memory.</li> 079 * <li>You can track the minimum value with {@code .quantile(0.0, 0.0)}. 080 * This special case will not use additional memory even though the allowed error is 0.</li> 081 * <li>You can track the maximum value with {@code .quantile(1.0, 0.0)}. 082 * This special case will not use additional memory even though the allowed error is 0.</li> 083 * </ul> 084 * 085 * Typically, you don't want to have a {@link Summary} representing the entire runtime of the application, 086 * but you want to look at a reasonable time interval. {@link Summary} metrics implement a configurable sliding 087 * time window: 088 * 089 * <pre> 090 * Summary requestLatency = Summary.build() 091 * .name("requests_latency_seconds") 092 * .help("Request latency in seconds.") 093 * .maxAgeSeconds(10 * 60) 094 * .ageBuckets(5) 095 * // ... 096 * .register(); 097 * </pre> 098 * 099 * The default is a time window of 10 minutes and 5 age buckets, i.e. the time window is 10 minutes wide, and 100 * we slide it forward every 2 minutes. 101 */ 102public class Summary extends SimpleCollector<Summary.Child> implements Counter.Describable { 103 104 final List<Quantile> quantiles; // Can be empty, but can never be null. 105 final long maxAgeSeconds; 106 final int ageBuckets; 107 108 Summary(Builder b) { 109 super(b); 110 quantiles = Collections.unmodifiableList(new ArrayList<Quantile>(b.quantiles)); 111 this.maxAgeSeconds = b.maxAgeSeconds; 112 this.ageBuckets = b.ageBuckets; 113 initializeNoLabelsChild(); 114 } 115 116 public static class Builder extends SimpleCollector.Builder<Builder, Summary> { 117 118 private final List<Quantile> quantiles = new ArrayList<Quantile>(); 119 private long maxAgeSeconds = TimeUnit.MINUTES.toSeconds(10); 120 private int ageBuckets = 5; 121 122 /** 123 * The class JavaDoc for {@link Summary} has more information on {@link #quantile(double, double)}. 124 * @see Summary 125 */ 126 public Builder quantile(double quantile, double error) { 127 if (quantile < 0.0 || quantile > 1.0) { 128 throw new IllegalArgumentException("Quantile " + quantile + " invalid: Expected number between 0.0 and 1.0."); 129 } 130 if (error < 0.0 || error > 1.0) { 131 throw new IllegalArgumentException("Error " + error + " invalid: Expected number between 0.0 and 1.0."); 132 } 133 quantiles.add(new Quantile(quantile, error)); 134 return this; 135 } 136 137 /** 138 * The class JavaDoc for {@link Summary} has more information on {@link #maxAgeSeconds(long)} 139 * @see Summary 140 */ 141 public Builder maxAgeSeconds(long maxAgeSeconds) { 142 if (maxAgeSeconds <= 0) { 143 throw new IllegalArgumentException("maxAgeSeconds cannot be " + maxAgeSeconds); 144 } 145 this.maxAgeSeconds = maxAgeSeconds; 146 return this; 147 } 148 149 /** 150 * The class JavaDoc for {@link Summary} has more information on {@link #ageBuckets(int)} 151 * @see Summary 152 */ 153 public Builder ageBuckets(int ageBuckets) { 154 if (ageBuckets <= 0) { 155 throw new IllegalArgumentException("ageBuckets cannot be " + ageBuckets); 156 } 157 this.ageBuckets = ageBuckets; 158 return this; 159 } 160 161 @Override 162 public Summary create() { 163 for (String label : labelNames) { 164 if (label.equals("quantile")) { 165 throw new IllegalStateException("Summary cannot have a label named 'quantile'."); 166 } 167 } 168 dontInitializeNoLabelsChild = true; 169 return new Summary(this); 170 } 171 } 172 173 /** 174 * Return a Builder to allow configuration of a new Summary. Ensures required fields are provided. 175 * 176 * @param name The name of the metric 177 * @param help The help string of the metric 178 */ 179 public static Builder build(String name, String help) { 180 return new Builder().name(name).help(help); 181 } 182 183 /** 184 * Return a Builder to allow configuration of a new Summary. 185 */ 186 public static Builder build() { 187 return new Builder(); 188 } 189 190 @Override 191 protected Child newChild() { 192 return new Child(quantiles, maxAgeSeconds, ageBuckets); 193 } 194 195 196 /** 197 * Represents an event being timed. 198 */ 199 public static class Timer implements Closeable { 200 private final Child child; 201 private final long start; 202 private Timer(Child child, long start) { 203 this.child = child; 204 this.start = start; 205 } 206 /** 207 * Observe the amount of time in seconds since {@link Child#startTimer} was called. 208 * @return Measured duration in seconds since {@link Child#startTimer} was called. 209 */ 210 public double observeDuration() { 211 double elapsed = SimpleTimer.elapsedSecondsFromNanos(start, SimpleTimer.defaultTimeProvider.nanoTime()); 212 child.observe(elapsed); 213 return elapsed; 214 } 215 216 /** 217 * Equivalent to calling {@link #observeDuration()}. 218 */ 219 @Override 220 public void close() { 221 observeDuration(); 222 } 223 } 224 225 /** 226 * The value of a single Summary. 227 * <p> 228 * <em>Warning:</em> References to a Child become invalid after using 229 * {@link SimpleCollector#remove} or {@link SimpleCollector#clear}. 230 */ 231 public static class Child { 232 233 /** 234 * Executes runnable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run. 235 * 236 * @param timeable Code that is being timed 237 * @return Measured duration in seconds for timeable to complete. 238 */ 239 public double time(Runnable timeable) { 240 Timer timer = startTimer(); 241 242 double elapsed; 243 try { 244 timeable.run(); 245 } finally { 246 elapsed = timer.observeDuration(); 247 } 248 return elapsed; 249 } 250 251 /** 252 * Executes callable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run. 253 * 254 * @param timeable Code that is being timed 255 * @return Result returned by callable. 256 */ 257 public <E> E time(Callable<E> timeable) { 258 Timer timer = startTimer(); 259 260 try { 261 return timeable.call(); 262 } catch (RuntimeException e) { 263 throw e; 264 } catch (Exception e) { 265 throw new RuntimeException(e); 266 } finally { 267 timer.observeDuration(); 268 } 269 } 270 271 public static class Value { 272 public final double count; 273 public final double sum; 274 public final SortedMap<Double, Double> quantiles; 275 public final long created; 276 277 private Value(double count, double sum, List<Quantile> quantiles, TimeWindowQuantiles quantileValues, long created) { 278 this.count = count; 279 this.sum = sum; 280 this.quantiles = Collections.unmodifiableSortedMap(snapshot(quantiles, quantileValues)); 281 this.created = created; 282 } 283 284 private SortedMap<Double, Double> snapshot(List<Quantile> quantiles, TimeWindowQuantiles quantileValues) { 285 SortedMap<Double, Double> result = new TreeMap<Double, Double>(); 286 for (Quantile q : quantiles) { 287 result.put(q.quantile, quantileValues.get(q.quantile)); 288 } 289 return result; 290 } 291 } 292 293 // Having these separate leaves us open to races, 294 // however Prometheus as whole has other races 295 // that mean adding atomicity here wouldn't be useful. 296 // This should be reevaluated in the future. 297 private final DoubleAdder count = new DoubleAdder(); 298 private final DoubleAdder sum = new DoubleAdder(); 299 private final List<Quantile> quantiles; 300 private final TimeWindowQuantiles quantileValues; 301 private final long created = System.currentTimeMillis(); 302 303 private Child(List<Quantile> quantiles, long maxAgeSeconds, int ageBuckets) { 304 this.quantiles = quantiles; 305 if (quantiles.size() > 0) { 306 quantileValues = new TimeWindowQuantiles(quantiles.toArray(new Quantile[]{}), maxAgeSeconds, ageBuckets); 307 } else { 308 quantileValues = null; 309 } 310 } 311 312 /** 313 * Observe the given amount. 314 * @param amt in most cases amt should be >= 0. Negative values are supported, but you should read 315 * <a href="https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations"> 316 * https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations</a> for 317 * implications and alternatives. 318 */ 319 public void observe(double amt) { 320 count.add(1); 321 sum.add(amt); 322 if (quantileValues != null) { 323 quantileValues.insert(amt); 324 } 325 } 326 /** 327 * Start a timer to track a duration. 328 * <p> 329 * Call {@link Timer#observeDuration} at the end of what you want to measure the duration of. 330 */ 331 public Timer startTimer() { 332 return new Timer(this, SimpleTimer.defaultTimeProvider.nanoTime()); 333 } 334 /** 335 * Get the value of the Summary. 336 * <p> 337 * <em>Warning:</em> The definition of {@link Value} is subject to change. 338 */ 339 public Value get() { 340 return new Value(count.sum(), sum.sum(), quantiles, quantileValues, created); 341 } 342 } 343 344 // Convenience methods. 345 /** 346 * Observe the given amount on the summary with no labels. 347 * @param amt in most cases amt should be >= 0. Negative values are supported, but you should read 348 * <a href="https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations"> 349 * https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations</a> for 350 * implications and alternatives. 351 */ 352 public void observe(double amt) { 353 noLabelsChild.observe(amt); 354 } 355 /** 356 * Start a timer to track a duration on the summary with no labels. 357 * <p> 358 * Call {@link Timer#observeDuration} at the end of what you want to measure the duration of. 359 */ 360 public Timer startTimer() { 361 return noLabelsChild.startTimer(); 362 } 363 364 /** 365 * Executes runnable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run. 366 * 367 * @param timeable Code that is being timed 368 * @return Measured duration in seconds for timeable to complete. 369 */ 370 public double time(Runnable timeable){ 371 return noLabelsChild.time(timeable); 372 } 373 374 /** 375 * Executes callable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run. 376 * 377 * @param timeable Code that is being timed 378 * @return Result returned by callable. 379 */ 380 public <E> E time(Callable<E> timeable){ 381 return noLabelsChild.time(timeable); 382 } 383 384 /** 385 * Get the value of the Summary. 386 * <p> 387 * <em>Warning:</em> The definition of {@link Child.Value} is subject to change. 388 */ 389 public Child.Value get() { 390 return noLabelsChild.get(); 391 } 392 393 @Override 394 public List<MetricFamilySamples> collect() { 395 List<MetricFamilySamples.Sample> samples = new ArrayList<MetricFamilySamples.Sample>(); 396 for(Map.Entry<List<String>, Child> c: children.entrySet()) { 397 Child.Value v = c.getValue().get(); 398 List<String> labelNamesWithQuantile = new ArrayList<String>(labelNames); 399 labelNamesWithQuantile.add("quantile"); 400 for(Map.Entry<Double, Double> q : v.quantiles.entrySet()) { 401 List<String> labelValuesWithQuantile = new ArrayList<String>(c.getKey()); 402 labelValuesWithQuantile.add(doubleToGoString(q.getKey())); 403 samples.add(new MetricFamilySamples.Sample(fullname, labelNamesWithQuantile, labelValuesWithQuantile, q.getValue())); 404 } 405 samples.add(new MetricFamilySamples.Sample(fullname + "_count", labelNames, c.getKey(), v.count)); 406 samples.add(new MetricFamilySamples.Sample(fullname + "_sum", labelNames, c.getKey(), v.sum)); 407 samples.add(new MetricFamilySamples.Sample(fullname + "_created", labelNames, c.getKey(), v.created / 1000.0)); 408 } 409 410 return familySamplesList(Type.SUMMARY, samples); 411 } 412 413 @Override 414 public List<MetricFamilySamples> describe() { 415 return Collections.<MetricFamilySamples>singletonList(new SummaryMetricFamily(fullname, help, labelNames)); 416 } 417 418}