fix(docs): Static content
[csit.git] / docs / report / introduction / methodology_telemetry.rst
1 .. _telemetry:
2
3 OpenMetrics
4 -----------
5
6 OpenMetrics specifies the de-facto standard for transmitting cloud-native
7 metrics at scale, with support for both text representation and Protocol
8 Buffers.
9
10 RFC
11 ~~~
12
13 - RFC2119
14 - RFC5234
15 - RFC8174
16 - draft-richih-opsawg-openmetrics-00
17
18 Reference
19 ~~~~~~~~~
20
21 `OpenMetrics <https://github.com/OpenObservability/OpenMetrics/blob/master/specification/OpenMetrics.md>`_
22
23 Metric Types
24 ~~~~~~~~~~~~
25
26 - Gauge
27 - Counter
28 - StateSet
29 - Info
30 - Histogram
31 - GaugeHistogram
32 - Summary
33 - Unknown
34
35 The telemetry module in CSIT currently supports only Gauge, Counter and Info.
36
37 Example metric file
38 ~~~~~~~~~~~~~~~~~~~
39
40 ::
41
42   # HELP calls_total Number of calls total
43   # TYPE calls_total counter
44   calls_total{name="api-rx-from-ring",state="active",thread_id="0",thread_lcore="1",thread_name="vpp_main"} 0.0
45   calls_total{name="fib-walk",state="any wait",thread_id="0",thread_lcore="1",thread_name="vpp_main"} 0.0
46   calls_total{name="ip6-mld-process",state="any wait",thread_id="0",thread_lcore="1",thread_name="vpp_main"} 0.0
47   calls_total{name="ip6-ra-process",state="any wait",thread_id="0",thread_lcore="1",thread_name="vpp_main"} 0.0
48   calls_total{name="unix-epoll-input",state="polling",thread_id="0",thread_lcore="1",thread_name="vpp_main"} 39584.0
49   calls_total{name="avf-0/18/6/0-output",state="active",thread_id="1",thread_lcore="2",thread_name="vpp_wk_0"} 91.0
50   calls_total{name="avf-0/18/6/0-tx",state="active",thread_id="1",thread_lcore="2",thread_name="vpp_wk_0"} 91.0
51   calls_total{name="avf-input",state="polling",thread_id="1",thread_lcore="2",thread_name="vpp_wk_0"} 91.0
52   calls_total{name="ethernet-input",state="active",thread_id="1",thread_lcore="2",thread_name="vpp_wk_0"} 91.0
53   calls_total{name="ip4-input-no-checksum",state="active",thread_id="1",thread_lcore="2",thread_name="vpp_wk_0"} 91.0
54   calls_total{name="ip4-lookup",state="active",thread_id="1",thread_lcore="2",thread_name="vpp_wk_0"} 91.0
55   calls_total{name="ip4-rewrite",state="active",thread_id="1",thread_lcore="2",thread_name="vpp_wk_0"} 91.0
56   calls_total{name="avf-0/18/2/0-output",state="active",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 91.0
57   calls_total{name="avf-0/18/2/0-tx",state="active",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 91.0
58   calls_total{name="avf-input",state="polling",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 91.0
59   calls_total{name="ethernet-input",state="active",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 91.0
60   calls_total{name="ip4-input-no-checksum",state="active",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 91.0
61   calls_total{name="ip4-lookup",state="active",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 91.0
62   calls_total{name="ip4-rewrite",state="active",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 91.0
63   calls_total{name="unix-epoll-input",state="polling",thread_id="2",thread_lcore="0",thread_name="vpp_wk_1"} 1.0
64
65
66 Anatomy of existing CSIT telemetry implementation
67 -------------------------------------------------
68
69 Existing implementation consists of several measurement building blocks:
70 the main measuring block running search algorithms (MLR, PLR, SOAK, MRR, ...),
71 the latency measuring block and several telemetry blocks with or without
72 traffic running in the background.
73
74 The main measuring block must not be interrupted by any read operation that can
75 impact data plane traffic processing during throughput search algorithm. Thus
76 operational reads are done before (pre-stat) and after (post-stat) that block.
77
78 Some operational reads must be done while traffic is running and usually
79 consist of two reads (pre-run-stat, post-run-stat) with a defined delay between
80 them.
81
82 MRR measurement
83 ~~~~~~~~~~~~~~~
84
85 ::
86
87   traffic_start(r=mrr)               traffic_stop       |<     measure     >|
88     |                                  |                |      (r=mrr)      |
89     |   pre_run_stat   post_run_stat   |    pre_stat    |                   |  post_stat
90     |        |               |         |       |        |                   |      |
91   --o--------o---------------o---------o-------o--------+-------------------+------o------------>
92                                                                                               t
93
94   Legend:
95     - pre_run_stat
96       - vpp-clear-runtime
97     - post_run_stat
98       - vpp-show-runtime
99       - bash-perf-stat            // if extended_debug == True
100     - pre_stat
101       - vpp-clear-stats
102       - vpp-enable-packettrace    // if extended_debug == True
103       - vpp-enable-elog
104     - post_stat
105       - vpp-show-stats
106       - vpp-show-packettrace      // if extended_debug == True
107       - vpp-show-elog
108
109 ::
110
111     |<                                measure                                 >|
112     |                                 (r=mrr)                                  |
113     |                                                                          |
114     |<    traffic_trial0    >|<    traffic_trial1    >|<    traffic_trialN    >|
115     |    (i=0,t=duration)    |    (i=1,t=duration)    |    (i=N,t=duration)    |
116     |                        |                        |                        |
117   --o------------------------o------------------------o------------------------o--->
118                                                                                  t
119
120
121 MLR measurement
122 ~~~~~~~~~~~~~~~
123
124 ::
125
126     |<     measure     >|   traffic_start(r=pdr)               traffic_stop   traffic_start(r=ndr)               traffic_stop  |< [    latency    ] >|
127     |      (r=mlr)      |    |                                  |              |                                  |            |     .9/.5/.1/.0     |
128     |                   |    |   pre_run_stat   post_run_stat   |              |   pre_run_stat   post_run_stat   |            |                     |
129     |                   |    |        |               |         |              |        |               |         |            |                     |
130   --+-------------------+----o--------o---------------o---------o--------------o--------o---------------o---------o------------[---------------------]--->
131                                                                                                                                                        t
132
133   Legend:
134     - pre_run_stat
135       - vpp-clear-runtime
136     - post_run_stat
137       - vpp-show-runtime
138       - bash-perf-stat          // if extended_debug == True
139     - pre_stat
140       - vpp-clear-stats
141       - vpp-enable-packettrace  // if extended_debug == True
142       - vpp-enable-elog
143     - post_stat
144       - vpp-show-stats
145       - vpp-show-packettrace    // if extended_debug == True
146       - vpp-show-elog
147
148
149 Improving existing solution
150 ---------------------------
151
152 Improving the existing CSIT telemetry implementation includes these areas:
153
154 - telemetry optimization
155   - reducing ssh overhead
156   - removing stats without added value
157 - telemetry scheduling
158   - improve accuracy
159   - improve configuration
160 - telemetry output
161   - standardize output
162
163 The existing stats implementation was abstracted to having pre-/post-run-stats
164 phases. Improvement will be done by merging the pre-/post- logic implementation
165 into a separate stat-runtime block, configurable and locally executed on SUT.
166
167 This will increase precision, remove complexity and move the implementation
168 into a separate module.
169
170 The OpenMetrics format for cloud-native metric capturing will be used to ensure
171 integration with the post-processing module.
172
173 MRR measurement
174 ~~~~~~~~~~~~~~~
175
176 ::
177
178     traffic_start(r=mrr)               traffic_stop                 |<     measure     >|
179       |                                  |                          |      (r=mrr)      |
180       |   |<      stat_runtime      >|   |          stat_pre_trial  |                   |  stat_post_trial
181       |   |                          |   |             |            |                   |     |
182   ----o---+--------------------------+---o-------------o------------+-------------------+-----o------------->
183                                                                                                           t
184
185   Legend:
186     - stat_runtime
187       - vpp-runtime
188     - stat_pre_trial
189       - vpp-clear-stats
190       - vpp-enable-packettrace  // if extended_debug == True
191     - stat_post_trial
192       - vpp-show-stats
193       - vpp-show-packettrace    // if extended_debug == True
194
195
196 ::
197
198     |<                                measure                                 >|
199     |                                 (r=mrr)                                  |
200     |                                                                          |
201     |<    traffic_trial0    >|<    traffic_trial1    >|<    traffic_trialN    >|
202     |    (i=0,t=duration)    |    (i=1,t=duration)    |    (i=N,t=duration)    |
203     |                        |                        |                        |
204   --o------------------------o------------------------o------------------------o--->
205                                                                                  t
206
207 ::
208
209     |<                              stat_runtime                              >|
210     |                                                                          |
211     |<       program0       >|<       program1       >|<       programN       >|
212     |       (@=params)       |       (@=params)       |       (@=params)       |
213     |                        |                        |                        |
214   --o------------------------o------------------------o------------------------o--->
215                                                                                  t
216
217
218 MLR measurement
219 ~~~~~~~~~~~~~~~
220
221 ::
222
223     |<     measure     >|   traffic_start(r=pdr)               traffic_stop   traffic_start(r=ndr)               traffic_stop  |< [    latency    ] >|
224     |      (r=mlr)      |     |                                  |              |                                  |           |     .9/.5/.1/.0     |
225     |                   |     |   |<      stat_runtime      >|   |              |   |<      stat_runtime      >|   |           |                     |
226     |                   |     |   |                          |   |              |   |                          |   |           |                     |
227   --+-------------------+-----o---+--------------------------+---o--------------o---+--------------------------+---o-----------[---------------------]--->
228                                                                                                                                                        t
229
230   Legend:
231     - stat_runtime
232       - vpp-runtime
233     - stat_pre_trial
234       - vpp-clear-stats
235       - vpp-enable-packettrace  // if extended_debug == True
236     - stat_post_trial
237       - vpp-show-stats
238       - vpp-show-packettrace    // if extended_debug == True