2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
18 #include <vnet/gre/packet.h>
19 #include <lb/lbhash.h>
/* X-macro listing the LB node error codes as _(symbol, description) pairs.
 * Users define _() before expanding this list (see the two _() definitions
 * that follow it in this file).
 * NOTE(review): LB_ERROR_NONE is used by lb_node_fn, so a NONE entry is
 * presumably part of this list -- confirm. */
21 #define foreach_lb_error \
23 _(PROTO_NOT_SUPPORTED, "protocol not supported") \
24 _(NO_SERVER, "no configured application server")
// Turns each _(sym, str) entry into the enumerator LB_ERROR_<sym> followed by a comma.
27 #define _(sym,str) LB_ERROR_##sym,
// Table of error description strings, handed to the node registrations as .error_strings.
33 static char *lb_error_strings[] = {
// This _() keeps only the description string of each entry.
34 #define _(sym,string) string,
/* Formatter registered as .fn for the LB adjacencies.
 * Appends "idx:<vip_index>" to s, where vip_index is read from the
 * lb_adj_data_t stored in the adjacency's opaque area, and returns the
 * result of format().
 * NOTE(review): the declaration of the `adj` parameter is not visible
 * here; the body dereferences adj->opaque -- confirm its type. */
50 u8 *lb_format_adjacency(u8 * s,
51 struct ip_lookup_main_t * lm,
54 lb_main_t *lbm = &lb_main;
55 lb_adj_data_t *ad = (lb_adj_data_t *) &adj->opaque;
// The VIP is looked up but not used in the output (hence the unused attribute).
56 __attribute__((unused)) lb_vip_t *vip = pool_elt_at_index (lbm->vips, ad->vip_index);
57 return format(s, "idx:%d", ad->vip_index);
/* Trace formatter used as .format_trace by the LB nodes.
 * Reads (vlib_main_t *, vlib_node_t *, lb_trace_t *) from args, in that
 * order, and appends one line for the traced VIP and one for the traced AS. */
61 format_lb_trace (u8 * s, va_list * args)
63 lb_main_t *lbm = &lb_main;
// vm and node are consumed from the va_list but otherwise unused.
64 CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
65 CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
66 lb_trace_t *t = va_arg (*args, lb_trace_t *);
// The trace stores indices only; the objects are fetched from lbm at format time.
67 s = format(s, "lb vip[%d]: %U\n", t->vip_index, format_lb_vip, &lbm->vips[t->vip_index]);
68 s = format(s, "lb as[%d]: %U\n", t->as_index, format_lb_as, &lbm->ass[t->as_index]);
/* Returns the sticky (flow) hash table of the given CPU.
 * If the table exists but its bucket count differs from
 * lbm->per_cpu_sticky_buckets, the AS references held by its entries are
 * dropped and the table is freed. If no table is available, a new one is
 * allocated with the configured bucket count and flow timeout.
 * The table's timeout is refreshed from lbm->flow_timeout on every call. */
72 lb_hash_t *lb_get_sticky_table(u32 cpu_index)
74 lb_main_t *lbm = &lb_main;
75 lb_hash_t *sticky_ht = lbm->per_cpu[cpu_index].sticky_ht;
76 //Check if size changed
77 if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) {
79 //Dereference everything in there
81 lb_hash_foreach_entry(sticky_ht, e) {
// Drop the reference held on the AS stored in this entry.
82 vlib_refcount_add(&lbm->as_refcount, cpu_index, e->value, -1);
// NOTE(review): every entry also decrements index 0; presumably this balances a
// reference on AS 0 taken when the table was created -- confirm.
83 vlib_refcount_add(&lbm->as_refcount, cpu_index, 0, -1);
// NOTE(review): the NULL check below only re-creates the table if sticky_ht is
// reset after this free; that reset is not visible here -- confirm.
86 lb_hash_free(sticky_ht);
91 if (PREDICT_FALSE(sticky_ht == NULL)) {
92 lbm->per_cpu[cpu_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout);
93 sticky_ht = lbm->per_cpu[cpu_index].sticky_ht;
94 clib_warning("Regenerated sticky table %p", sticky_ht);
// Pick up a flow timeout change without re-creating the table.
100 sticky_ht->timeout = lbm->flow_timeout;
/*
 * Shared packet-processing body of the four LB graph nodes.
 * For every buffer of the frame it:
 *   - finds the VIP through the adjacency's opaque lb_adj_data_t,
 *   - builds a flow key from the IP addresses and the two 16-bit ports read
 *     right after the IP header, and hashes it,
 *   - selects an application server (AS) from the per-CPU sticky table, or
 *     from the VIP's new-flow table when the flow is not known yet,
 *   - moves the buffer start back and writes an outer IPv4 or IPv6 header
 *     followed by a GRE header,
 *   - enqueues the buffer to next0 (LB_NEXT_LOOKUP unless no AS is configured).
 * Returns frame->n_vectors.
 */
104 static_always_inline uword
105 lb_node_fn (vlib_main_t * vm,
106 vlib_node_runtime_t * node, vlib_frame_t * frame,
107 u8 is_input_v4, //Compile-time parameter: non-zero when the input packet is IPv4, zero for IPv6
108 u8 is_encap_v4) //Compile-time parameter: non-zero when the GRE encapsulation is IPv4, zero for IPv6
// The adjacency lookup table is chosen by the family of the incoming packet.
110 ip_lookup_main_t *lm = (is_input_v4)?&ip4_main.lookup_main:&ip6_main.lookup_main;
111 lb_main_t *lbm = &lb_main;
112 vlib_node_runtime_t *error_node = node;
113 u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
114 u32 cpu_index = os_get_cpu_number();
// One timestamp per frame is used for all sticky-table lookups and insertions.
115 u32 lb_time = lb_hash_time_now(vm);
117 lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index);
118 from = vlib_frame_vector_args (frame);
119 n_left_from = frame->n_vectors;
120 next_index = node->cached_next_index;
122 while (n_left_from > 0)
124 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
125 while (n_left_from > 0 && n_left_to_next > 0)
129 ip_adjacency_t *adj0;
135 u32 value0, available_index0, hash0;
137 lb_error_t error0 = LB_ERROR_NONE;
138 lb_next_t next0 = LB_NEXT_LOOKUP;
// Prefetch the following buffer while this one is being processed.
140 if (PREDICT_TRUE(n_left_from > 1))
143 p2 = vlib_get_buffer(vm, from[1]);
144 vlib_prefetch_buffer_header(p2, STORE);
145 /* IPv4 + 8 = 28. possibly plus -40 */
// NOTE(review): the prefetch starts 40 bytes before the current data pointer,
// presumably to cover the area where the outer headers get written -- confirm.
146 CLIB_PREFETCH (vlib_buffer_get_current(p2) - 40, 128, STORE);
149 pi0 = to_next[0] = from[0];
155 p0 = vlib_get_buffer (vm, pi0);
// The adjacency that matched this packet carries the VIP index in its opaque data.
156 adj0 = ip_get_adjacency (lm, vnet_buffer (p0)->ip.adj_index[VLIB_TX]);
157 ad0 = (lb_adj_data_t *) &adj0->opaque;
158 vip0 = pool_elt_at_index (lbm->vips, ad0->vip_index);
// IPv4 input: key = source address, destination address, ports.
162 ip40 = vlib_buffer_get_current (p0);
163 len0 = clib_net_to_host_u16(ip40->length);
164 key0[0] = (u64) ip40->src_address.as_u32;
165 key0[1] = (u64) ip40->dst_address.as_u32;
// Ports are read at (ip40 + 1), i.e. directly after a fixed-size ip4_header_t.
// NOTE(review): this assumes no IPv4 options and an L4 header that starts with
// source/destination ports; no protocol check is visible here -- confirm.
168 key0[4] = ((u64)((udp_header_t *)(ip40 + 1))->src_port << 32) |
169 ((u64)((udp_header_t *)(ip40 + 1))->dst_port << 16);
171 hash0 = lb_hash_hash(key0);
// IPv6 input: key = both 128-bit addresses (as two u64 each), ports.
174 ip60 = vlib_buffer_get_current (p0);
// payload_length excludes the fixed IPv6 header, so add it to get the full packet length.
175 len0 = clib_net_to_host_u16(ip60->payload_length) + sizeof(ip6_header_t);
176 key0[0] = ip60->src_address.as_u64[0];
177 key0[1] = ip60->src_address.as_u64[1];
178 key0[2] = ip60->dst_address.as_u64[0];
179 key0[3] = ip60->dst_address.as_u64[1];
// NOTE(review): ports are read directly after the fixed IPv6 header, which
// assumes no extension headers -- confirm.
180 key0[4] = ((u64)((udp_header_t *)(ip60 + 1))->src_port << 32) |
181 ((u64)((udp_header_t *)(ip60 + 1))->dst_port << 16);
183 hash0 = lb_hash_hash(key0);
186 //NOTE: This is an ugly trick to not include the VIP index in the hash calculation
187 //but actually use it in the key determination.
// The port fields above were shifted left by at least 16 bits; the VIP index is OR-ed into the low bits.
188 key0[4] |= ((vip0 - lbm->vips));
// Look the flow up: value0 receives the AS index of an existing entry (or ~0),
// available_index0 a slot usable for a new entry (or ~0).
190 lb_hash_get(sticky_ht, key0, hash0, lb_time, &available_index0, &value0);
191 if (PREDICT_TRUE(value0 != ~0)) {
192 //Found an existing entry
193 as0 = &lbm->ass[value0];
194 } else if (PREDICT_TRUE(available_index0 != ~0)) {
195 //There is an available slot for a new flow
// The new-flow table is indexed with the flow hash masked by new_flow_table_mask.
196 as0 = &lbm->ass[vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index];
197 if (PREDICT_FALSE(as0 == lbm->ass)) { //Special first element
// AS index 0 means no server is configured: the packet goes to the drop next node.
198 error0 = LB_ERROR_NO_SERVER;
199 next0 = LB_NEXT_DROP;
201 vlib_increment_simple_counter(&lbm->vip_counters[LB_VIP_COUNTER_TRACKED_SESSION],
202 cpu_index, vip0 - lbm->vips, 1);
205 //TODO: There are race conditions with as0 and vip0 manipulation.
206 //Configuration may be changed, vectors resized, etc...
208 //Dereference previously used
// Release the AS referenced by the slot being reused, then reference the newly selected AS.
209 vlib_refcount_add(&lbm->as_refcount, cpu_index, lb_hash_available_value(sticky_ht, available_index0), -1);
210 vlib_refcount_add(&lbm->as_refcount, cpu_index, as0 - lbm->ass, 1);
213 //Note that when there is no AS configured, an entry is configured anyway.
214 //But no configured AS is not something that should happen
215 lb_hash_put(sticky_ht, key0, as0 - lbm->ass, available_index0, lb_time);
217 //Could not store new entry in the table
// The packet is still forwarded using the new-flow table, but the flow is not remembered.
218 as0 = &lbm->ass[vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index];
219 vlib_increment_simple_counter(&lbm->vip_counters[LB_VIP_COUNTER_UNTRACKED_PACKET],
220 cpu_index, vip0 - lbm->vips, 1);
// IPv4 encapsulation: move the buffer start back by the size of the outer IPv4 + GRE headers.
226 vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t));
227 ip40 = vlib_buffer_get_current(p0);
228 gre0 = (gre_header_t *)(ip40 + 1);
229 ip40->src_address = lbm->ip4_src_address;
230 ip40->dst_address = as0->address.ip4;
231 ip40->ip_version_and_header_length = 0x45;
// Outer total length = inner packet length + GRE header + outer IPv4 header.
233 ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t));
234 ip40->protocol = IP_PROTOCOL_GRE;
// The checksum is computed last, once the header fields above are set.
235 ip40->checksum = ip4_header_checksum (ip40);
// IPv6 encapsulation: move the buffer start back by the size of the outer IPv6 + GRE headers.
238 vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t));
239 ip60 = vlib_buffer_get_current(p0);
240 gre0 = (gre_header_t *)(ip60 + 1);
241 ip60->dst_address = as0->address.ip6;
242 ip60->src_address = lbm->ip6_src_address;
243 ip60->hop_limit = 128;
// Version 6 in the top four bits; the remaining bits of the word are zero.
244 ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28);
// Outer payload length = inner packet length + GRE header.
245 ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t));
246 ip60->protocol = IP_PROTOCOL_GRE;
// GRE header: zero flags/version; the protocol field depends on the inner (input)
// packet family: 0x0800 for IPv4 input, 0x86DD for IPv6 input.
249 gre0->flags_and_version = 0;
250 gre0->protocol = (is_input_v4)?
251 clib_host_to_net_u16(0x0800):
252 clib_host_to_net_u16(0x86DD);
// Record the selected AS and VIP indices when the buffer is traced.
254 if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
256 lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr));
257 tr->as_index = as0 - lbm->ass;
258 tr->vip_index = ad0->vip_index;
261 p0->error = error_node->errors[error0];
262 vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
263 n_left_to_next, pi0, next0);
265 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
268 return frame->n_vectors;
// Node function for IPv6 input with IPv6 GRE encapsulation (is_input_v4 = 0, is_encap_v4 = 0).
272 lb6_gre6_node_fn (vlib_main_t * vm,
273 vlib_node_runtime_t * node, vlib_frame_t * frame)
275 return lb_node_fn(vm, node, frame, 0, 0);
// Node function for IPv6 input with IPv4 GRE encapsulation (is_input_v4 = 0, is_encap_v4 = 1).
279 lb6_gre4_node_fn (vlib_main_t * vm,
280 vlib_node_runtime_t * node, vlib_frame_t * frame)
282 return lb_node_fn(vm, node, frame, 0, 1);
// Node function for IPv4 input with IPv6 GRE encapsulation (is_input_v4 = 1, is_encap_v4 = 0).
286 lb4_gre6_node_fn (vlib_main_t * vm,
287 vlib_node_runtime_t * node, vlib_frame_t * frame)
289 return lb_node_fn(vm, node, frame, 1, 0);
// Node function for IPv4 input with IPv4 GRE encapsulation (is_input_v4 = 1, is_encap_v4 = 1).
293 lb4_gre4_node_fn (vlib_main_t * vm,
294 vlib_node_runtime_t * node, vlib_frame_t * frame)
296 return lb_node_fn(vm, node, frame, 1, 1);
// Registers lb6_gre6_node: runs lb6_gre6_node_fn and shares the LB trace formatter and error strings.
299 VLIB_REGISTER_NODE (lb6_gre6_node) =
301 .function = lb6_gre6_node_fn,
303 .vector_size = sizeof (u32),
304 .format_trace = format_lb_trace,
306 .n_errors = LB_N_ERROR,
307 .error_strings = lb_error_strings,
309 .n_next_nodes = LB_N_NEXT,
// Encapsulated packets have an IPv6 outer header, so they continue to ip6-lookup.
312 [LB_NEXT_LOOKUP] = "ip6-lookup",
313 [LB_NEXT_DROP] = "error-drop"
// IPv6 adjacency registration for "lb6-gre6": displayed with lb_format_adjacency;
// .next_index points at the LB_VIP_TYPE_IP6_GRE6 slot of lb_main.ip_lookup_next_index.
317 VNET_IP6_REGISTER_ADJACENCY(lb6_gre6) = {
318 .node_name = "lb6-gre6",
319 .fn = lb_format_adjacency,
320 .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE6]
// Registers lb6_gre4_node: runs lb6_gre4_node_fn and shares the LB trace formatter and error strings.
323 VLIB_REGISTER_NODE (lb6_gre4_node) =
325 .function = lb6_gre4_node_fn,
327 .vector_size = sizeof (u32),
328 .format_trace = format_lb_trace,
330 .n_errors = LB_N_ERROR,
331 .error_strings = lb_error_strings,
333 .n_next_nodes = LB_N_NEXT,
// Encapsulated packets have an IPv4 outer header, so they continue to ip4-lookup.
336 [LB_NEXT_LOOKUP] = "ip4-lookup",
337 [LB_NEXT_DROP] = "error-drop"
// IPv6 adjacency registration for "lb6-gre4": displayed with lb_format_adjacency;
// .next_index points at the LB_VIP_TYPE_IP6_GRE4 slot of lb_main.ip_lookup_next_index.
341 VNET_IP6_REGISTER_ADJACENCY(lb6_gre4) = {
342 .node_name = "lb6-gre4",
343 .fn = lb_format_adjacency,
344 .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE4]
// Registers lb4_gre6_node: runs lb4_gre6_node_fn and shares the LB trace formatter and error strings.
347 VLIB_REGISTER_NODE (lb4_gre6_node) =
349 .function = lb4_gre6_node_fn,
351 .vector_size = sizeof (u32),
352 .format_trace = format_lb_trace,
354 .n_errors = LB_N_ERROR,
355 .error_strings = lb_error_strings,
357 .n_next_nodes = LB_N_NEXT,
// Encapsulated packets have an IPv6 outer header, so they continue to ip6-lookup.
360 [LB_NEXT_LOOKUP] = "ip6-lookup",
361 [LB_NEXT_DROP] = "error-drop"
// IPv4 adjacency registration for "lb4-gre6": displayed with lb_format_adjacency;
// .next_index points at the LB_VIP_TYPE_IP4_GRE6 slot of lb_main.ip_lookup_next_index.
365 VNET_IP4_REGISTER_ADJACENCY(lb4_gre6) = {
366 .node_name = "lb4-gre6",
367 .fn = lb_format_adjacency,
368 .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE6]
// Registers lb4_gre4_node: runs lb4_gre4_node_fn and shares the LB trace formatter and error strings.
371 VLIB_REGISTER_NODE (lb4_gre4_node) =
373 .function = lb4_gre4_node_fn,
375 .vector_size = sizeof (u32),
376 .format_trace = format_lb_trace,
378 .n_errors = LB_N_ERROR,
379 .error_strings = lb_error_strings,
381 .n_next_nodes = LB_N_NEXT,
// Encapsulated packets have an IPv4 outer header, so they continue to ip4-lookup.
384 [LB_NEXT_LOOKUP] = "ip4-lookup",
385 [LB_NEXT_DROP] = "error-drop"
// IPv4 adjacency registration for "lb4-gre4": displayed with lb_format_adjacency;
// .next_index points at the LB_VIP_TYPE_IP4_GRE4 slot of lb_main.ip_lookup_next_index.
389 VNET_IP4_REGISTER_ADJACENCY(lb4_gre4) = {
390 .node_name = "lb4-gre4",
391 .fn = lb_format_adjacency,
392 .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE4]