1// SPDX-License-Identifier: GPL-2.0
2/* IPVS: Maglev Hashing scheduling module
3 *
4 * Authors: Inju Song <inju.song@navercorp.com>
5 *
6 */
7
/* The mh algorithm is to assign a preference list of all the lookup
 * table positions to each destination and populate the table with
 * the most-preferred position of destinations. Then it is to select
 * destination with the hash key of source IP address through looking
 * up the lookup table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hashing]
 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 *
 */
19
20#define pr_fmt(fmt) "IPVS: " fmt
21
22#include <linux/ip.h>
23#include <linux/slab.h>
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/skbuff.h>
27
28#include <net/ip_vs.h>
29
30#include <linux/siphash.h>
31#include <linux/bitops.h>
32#include <linux/gcd.h>
33
/* Scheduler-specific service flags.
 * MH_FALLBACK: reselect another server when the hashed one is unavailable.
 * MH_PORT: mix the port into the hash key in addition to the address.
 */
#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1
#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2
36
37struct ip_vs_mh_lookup {
38 struct ip_vs_dest __rcu *dest; /* real server (cache) */
39};
40
/* Per-destination scratch state used while the lookup table is rebuilt */
struct ip_vs_mh_dest_setup {
	/* Starting position of this destination in the table */
	unsigned int	offset;
	/* Step added to the position each time it is already taken */
	unsigned int	skip;
	/* Next position this destination will try */
	unsigned int	perm;
	/* Slots claimed per round: (weight / gcd) >> rshift */
	int		turns;
};
47
/* Available prime numbers for MH table.
 * The table is only ever read (through IP_VS_MH_TAB_SIZE), so it is const.
 */
static const int primes[] = {
	251, 509, 1021, 2039, 4093,
	8191, 16381, 32749, 65521, 131071,
};
51
/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
# define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif

/* Index into primes[]: the configured value minus 8 */
#define IP_VS_MH_TAB_INDEX	(CONFIG_IP_VS_MH_TAB_INDEX - 8)
/* Number of bits kept when weights are scaled down for the table */
#define IP_VS_MH_TAB_BITS	(CONFIG_IP_VS_MH_TAB_INDEX / 2)
/* Number of slots in the lookup table (a prime) */
#define IP_VS_MH_TAB_SIZE	primes[IP_VS_MH_TAB_INDEX]
59
60struct ip_vs_mh_state {
61 struct rcu_head rcu_head;
62 struct ip_vs_mh_lookup *lookup;
63 struct ip_vs_mh_dest_setup *dest_setup;
64 hsiphash_key_t hash1, hash2;
65 int gcd;
66 int rshift;
67};
68
69static inline void generate_hash_secret(hsiphash_key_t *hash1,
70 hsiphash_key_t *hash2)
71{
72 hash1->key[0] = 2654435761UL;
73 hash1->key[1] = 2654435761UL;
74
75 hash2->key[0] = 2654446892UL;
76 hash2->key[1] = 2654446892UL;
77}
78
79/* Helper function to determine if server is unavailable */
80static inline bool is_unavailable(struct ip_vs_dest *dest)
81{
82 return atomic_read(v: &dest->weight) <= 0 ||
83 dest->flags & IP_VS_DEST_F_OVERLOAD;
84}
85
86/* Returns hash value for IPVS MH entry */
87static inline unsigned int
88ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
89 __be16 port, hsiphash_key_t *key, unsigned int offset)
90{
91 unsigned int v;
92 __be32 addr_fold = addr->ip;
93
94#ifdef CONFIG_IP_VS_IPV6
95 if (af == AF_INET6)
96 addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
97 addr->ip6[2] ^ addr->ip6[3];
98#endif
99 v = (offset + ntohs(port) + ntohl(addr_fold));
100 return hsiphash(data: &v, len: sizeof(v), key);
101}
102
103/* Reset all the hash buckets of the specified table. */
104static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
105{
106 int i;
107 struct ip_vs_mh_lookup *l;
108 struct ip_vs_dest *dest;
109
110 l = &s->lookup[0];
111 for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
112 dest = rcu_dereference_protected(l->dest, 1);
113 if (dest) {
114 ip_vs_dest_put(dest);
115 RCU_INIT_POINTER(l->dest, NULL);
116 }
117 l++;
118 }
119}
120
121static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
122 struct ip_vs_service *svc)
123{
124 struct list_head *p;
125 struct ip_vs_mh_dest_setup *ds;
126 struct ip_vs_dest *dest;
127 int lw;
128
129 /* If gcd is smaller then 1, number of dests or
130 * all last_weight of dests are zero. So, skip
131 * permutation for the dests.
132 */
133 if (s->gcd < 1)
134 return 0;
135
136 /* Set dest_setup for the dests permutation */
137 p = &svc->destinations;
138 ds = &s->dest_setup[0];
139 while ((p = p->next) != &svc->destinations) {
140 dest = list_entry(p, struct ip_vs_dest, n_list);
141
142 ds->offset = ip_vs_mh_hashkey(af: svc->af, addr: &dest->addr,
143 port: dest->port, key: &s->hash1, offset: 0) %
144 IP_VS_MH_TAB_SIZE;
145 ds->skip = ip_vs_mh_hashkey(af: svc->af, addr: &dest->addr,
146 port: dest->port, key: &s->hash2, offset: 0) %
147 (IP_VS_MH_TAB_SIZE - 1) + 1;
148 ds->perm = ds->offset;
149
150 lw = atomic_read(v: &dest->last_weight);
151 ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
152 ds++;
153 }
154
155 return 0;
156}
157
158static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
159 struct ip_vs_service *svc)
160{
161 int n, c, dt_count;
162 unsigned long *table;
163 struct list_head *p;
164 struct ip_vs_mh_dest_setup *ds;
165 struct ip_vs_dest *dest, *new_dest;
166
167 /* If gcd is smaller then 1, number of dests or
168 * all last_weight of dests are zero. So, skip
169 * the population for the dests and reset lookup table.
170 */
171 if (s->gcd < 1) {
172 ip_vs_mh_reset(s);
173 return 0;
174 }
175
176 table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
177 if (!table)
178 return -ENOMEM;
179
180 p = &svc->destinations;
181 n = 0;
182 dt_count = 0;
183 while (n < IP_VS_MH_TAB_SIZE) {
184 if (p == &svc->destinations)
185 p = p->next;
186
187 ds = &s->dest_setup[0];
188 while (p != &svc->destinations) {
189 /* Ignore added server with zero weight */
190 if (ds->turns < 1) {
191 p = p->next;
192 ds++;
193 continue;
194 }
195
196 c = ds->perm;
197 while (test_bit(c, table)) {
198 /* Add skip, mod IP_VS_MH_TAB_SIZE */
199 ds->perm += ds->skip;
200 if (ds->perm >= IP_VS_MH_TAB_SIZE)
201 ds->perm -= IP_VS_MH_TAB_SIZE;
202 c = ds->perm;
203 }
204
205 __set_bit(c, table);
206
207 dest = rcu_dereference_protected(s->lookup[c].dest, 1);
208 new_dest = list_entry(p, struct ip_vs_dest, n_list);
209 if (dest != new_dest) {
210 if (dest)
211 ip_vs_dest_put(dest);
212 ip_vs_dest_hold(dest: new_dest);
213 RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
214 }
215
216 if (++n == IP_VS_MH_TAB_SIZE)
217 goto out;
218
219 if (++dt_count >= ds->turns) {
220 dt_count = 0;
221 p = p->next;
222 ds++;
223 }
224 }
225 }
226
227out:
228 bitmap_free(bitmap: table);
229 return 0;
230}
231
232/* Get ip_vs_dest associated with supplied parameters. */
233static inline struct ip_vs_dest *
234ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
235 const union nf_inet_addr *addr, __be16 port)
236{
237 unsigned int hash = ip_vs_mh_hashkey(af: svc->af, addr, port, key: &s->hash1, offset: 0)
238 % IP_VS_MH_TAB_SIZE;
239 struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
240
241 return (!dest || is_unavailable(dest)) ? NULL : dest;
242}
243
244/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
245static inline struct ip_vs_dest *
246ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
247 const union nf_inet_addr *addr, __be16 port)
248{
249 unsigned int offset, roffset;
250 unsigned int hash, ihash;
251 struct ip_vs_dest *dest;
252
253 /* First try the dest it's supposed to go to */
254 ihash = ip_vs_mh_hashkey(af: svc->af, addr, port,
255 key: &s->hash1, offset: 0) % IP_VS_MH_TAB_SIZE;
256 dest = rcu_dereference(s->lookup[ihash].dest);
257 if (!dest)
258 return NULL;
259 if (!is_unavailable(dest))
260 return dest;
261
262 IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
263 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
264
265 /* If the original dest is unavailable, loop around the table
266 * starting from ihash to find a new dest
267 */
268 for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
269 roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
270 hash = ip_vs_mh_hashkey(af: svc->af, addr, port, key: &s->hash1,
271 offset: roffset) % IP_VS_MH_TAB_SIZE;
272 dest = rcu_dereference(s->lookup[hash].dest);
273 if (!dest)
274 break;
275 if (!is_unavailable(dest))
276 return dest;
277 IP_VS_DBG_BUF(6,
278 "MH: selected unavailable server %s:%u (offset %u), reselecting",
279 IP_VS_DBG_ADDR(dest->af, &dest->addr),
280 ntohs(dest->port), roffset);
281 }
282
283 return NULL;
284}
285
286/* Assign all the hash buckets of the specified table with the service. */
287static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
288 struct ip_vs_service *svc)
289{
290 int ret;
291
292 if (svc->num_dests > IP_VS_MH_TAB_SIZE)
293 return -EINVAL;
294
295 if (svc->num_dests >= 1) {
296 s->dest_setup = kcalloc(svc->num_dests,
297 sizeof(struct ip_vs_mh_dest_setup),
298 GFP_KERNEL);
299 if (!s->dest_setup)
300 return -ENOMEM;
301 }
302
303 ip_vs_mh_permutate(s, svc);
304
305 ret = ip_vs_mh_populate(s, svc);
306 if (ret < 0)
307 goto out;
308
309 IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
310 IP_VS_DBG_ADDR(svc->af, &svc->addr),
311 ntohs(svc->port));
312
313out:
314 if (svc->num_dests >= 1) {
315 kfree(objp: s->dest_setup);
316 s->dest_setup = NULL;
317 }
318 return ret;
319}
320
321static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
322{
323 struct ip_vs_dest *dest;
324 int weight;
325 int g = 0;
326
327 list_for_each_entry(dest, &svc->destinations, n_list) {
328 weight = atomic_read(v: &dest->last_weight);
329 if (weight > 0) {
330 if (g > 0)
331 g = gcd(a: weight, b: g);
332 else
333 g = weight;
334 }
335 }
336 return g;
337}
338
339/* To avoid assigning huge weight for the MH table,
340 * calculate shift value with gcd.
341 */
342static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
343{
344 struct ip_vs_dest *dest;
345 int new_weight, weight = 0;
346 int mw, shift;
347
348 /* If gcd is smaller then 1, number of dests or
349 * all last_weight of dests are zero. So, return
350 * shift value as zero.
351 */
352 if (gcd < 1)
353 return 0;
354
355 list_for_each_entry(dest, &svc->destinations, n_list) {
356 new_weight = atomic_read(v: &dest->last_weight);
357 if (new_weight > weight)
358 weight = new_weight;
359 }
360
361 /* Because gcd is greater than zero,
362 * the maximum weight and gcd are always greater than zero
363 */
364 mw = weight / gcd;
365
366 /* shift = occupied bits of weight/gcd - MH highest bits */
367 shift = fls(x: mw) - IP_VS_MH_TAB_BITS;
368 return (shift >= 0) ? shift : 0;
369}
370
371static void ip_vs_mh_state_free(struct rcu_head *head)
372{
373 struct ip_vs_mh_state *s;
374
375 s = container_of(head, struct ip_vs_mh_state, rcu_head);
376 kfree(objp: s->lookup);
377 kfree(objp: s);
378}
379
380static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
381{
382 int ret;
383 struct ip_vs_mh_state *s;
384
385 /* Allocate the MH table for this service */
386 s = kzalloc(sizeof(*s), GFP_KERNEL);
387 if (!s)
388 return -ENOMEM;
389
390 s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
391 GFP_KERNEL);
392 if (!s->lookup) {
393 kfree(objp: s);
394 return -ENOMEM;
395 }
396
397 generate_hash_secret(hash1: &s->hash1, hash2: &s->hash2);
398 s->gcd = ip_vs_mh_gcd_weight(svc);
399 s->rshift = ip_vs_mh_shift_weight(svc, gcd: s->gcd);
400
401 IP_VS_DBG(6,
402 "MH lookup table (memory=%zdbytes) allocated for current service\n",
403 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
404
405 /* Assign the lookup table with current dests */
406 ret = ip_vs_mh_reassign(s, svc);
407 if (ret < 0) {
408 ip_vs_mh_reset(s);
409 ip_vs_mh_state_free(head: &s->rcu_head);
410 return ret;
411 }
412
413 /* No more failures, attach state */
414 svc->sched_data = s;
415 return 0;
416}
417
418static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
419{
420 struct ip_vs_mh_state *s = svc->sched_data;
421
422 /* Got to clean up lookup entry here */
423 ip_vs_mh_reset(s);
424
425 call_rcu(head: &s->rcu_head, func: ip_vs_mh_state_free);
426 IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
427 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
428}
429
430static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
431 struct ip_vs_dest *dest)
432{
433 struct ip_vs_mh_state *s = svc->sched_data;
434
435 s->gcd = ip_vs_mh_gcd_weight(svc);
436 s->rshift = ip_vs_mh_shift_weight(svc, gcd: s->gcd);
437
438 /* Assign the lookup table with the updated service */
439 return ip_vs_mh_reassign(s, svc);
440}
441
442/* Helper function to get port number */
443static inline __be16
444ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
445{
446 __be16 _ports[2], *ports;
447
448 /* At this point we know that we have a valid packet of some kind.
449 * Because ICMP packets are only guaranteed to have the first 8
450 * bytes, let's just grab the ports. Fortunately they're in the
451 * same position for all three of the protocols we care about.
452 */
453 switch (iph->protocol) {
454 case IPPROTO_TCP:
455 case IPPROTO_UDP:
456 case IPPROTO_SCTP:
457 ports = skb_header_pointer(skb, offset: iph->len, len: sizeof(_ports),
458 buffer: &_ports);
459 if (unlikely(!ports))
460 return 0;
461
462 if (likely(!ip_vs_iph_inverse(iph)))
463 return ports[0];
464 else
465 return ports[1];
466 default:
467 return 0;
468 }
469}
470
471/* Maglev Hashing scheduling */
472static struct ip_vs_dest *
473ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
474 struct ip_vs_iphdr *iph)
475{
476 struct ip_vs_dest *dest;
477 struct ip_vs_mh_state *s;
478 __be16 port = 0;
479 const union nf_inet_addr *hash_addr;
480
481 hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
482
483 IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
484
485 if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
486 port = ip_vs_mh_get_port(skb, iph);
487
488 s = (struct ip_vs_mh_state *)svc->sched_data;
489
490 if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
491 dest = ip_vs_mh_get_fallback(svc, s, addr: hash_addr, port);
492 else
493 dest = ip_vs_mh_get(svc, s, addr: hash_addr, port);
494
495 if (!dest) {
496 ip_vs_scheduler_err(svc, msg: "no destination available");
497 return NULL;
498 }
499
500 IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
501 IP_VS_DBG_ADDR(svc->af, hash_addr),
502 ntohs(port),
503 IP_VS_DBG_ADDR(dest->af, &dest->addr),
504 ntohs(dest->port));
505
506 return dest;
507}
508
509/* IPVS MH Scheduler structure */
510static struct ip_vs_scheduler ip_vs_mh_scheduler = {
511 .name = "mh",
512 .refcnt = ATOMIC_INIT(0),
513 .module = THIS_MODULE,
514 .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
515 .init_service = ip_vs_mh_init_svc,
516 .done_service = ip_vs_mh_done_svc,
517 .add_dest = ip_vs_mh_dest_changed,
518 .del_dest = ip_vs_mh_dest_changed,
519 .upd_dest = ip_vs_mh_dest_changed,
520 .schedule = ip_vs_mh_schedule,
521};
522
523static int __init ip_vs_mh_init(void)
524{
525 return register_ip_vs_scheduler(scheduler: &ip_vs_mh_scheduler);
526}
527
528static void __exit ip_vs_mh_cleanup(void)
529{
530 unregister_ip_vs_scheduler(scheduler: &ip_vs_mh_scheduler);
531 rcu_barrier();
532}
533
534module_init(ip_vs_mh_init);
535module_exit(ip_vs_mh_cleanup);
536MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
537MODULE_LICENSE("GPL v2");
538MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
539

source code of linux/net/netfilter/ipvs/ip_vs_mh.c