1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * IPVS: Locality-Based Least-Connection with Replication scheduler |
4 | * |
5 | * Authors: Wensong Zhang <wensong@gnuchina.org> |
6 | * |
7 | * Changes: |
8 | * Julian Anastasov : Added the missing (dest->weight>0) |
9 | * condition in the ip_vs_dest_set_max. |
10 | */ |
11 | |
12 | /* |
13 | * The lblc/r algorithm is as follows (pseudo code): |
14 | * |
15 | * if serverSet[dest_ip] is null then |
16 | * n, serverSet[dest_ip] <- {weighted least-conn node}; |
17 | * else |
18 | * n <- {least-conn (alive) node in serverSet[dest_ip]}; |
19 | * if (n is null) OR |
20 | * (n.conns>n.weight AND |
21 | * there is a node m with m.conns<m.weight/2) then |
22 | * n <- {weighted least-conn node}; |
23 | * add n to serverSet[dest_ip]; |
24 | * if |serverSet[dest_ip]| > 1 AND |
25 | * now - serverSet[dest_ip].lastMod > T then |
26 | * m <- {most conn node in serverSet[dest_ip]}; |
27 | * remove m from serverSet[dest_ip]; |
28 | * if serverSet[dest_ip] changed then |
29 | * serverSet[dest_ip].lastMod <- now; |
30 | * |
31 | * return n; |
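 *
 * Here T is the set expiration period (the lblcr_expiration sysctl,
 * one day by default).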
32 | * |
33 | */ |
34 | |
35 | #define KMSG_COMPONENT "IPVS" |
36 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt |
37 | |
38 | #include <linux/ip.h> |
39 | #include <linux/module.h> |
40 | #include <linux/kernel.h> |
41 | #include <linux/skbuff.h> |
42 | #include <linux/jiffies.h> |
43 | #include <linux/list.h> |
44 | #include <linux/slab.h> |
45 | #include <linux/hash.h> |
46 | |
47 | /* for sysctl */ |
48 | #include <linux/fs.h> |
49 | #include <linux/sysctl.h> |
50 | #include <net/net_namespace.h> |
51 | |
52 | #include <net/ip_vs.h> |
53 | |
54 | |
55 | /* |
56 | * It is for garbage collection of stale IPVS lblcr entries, |
57 | * when the table is full. |
58 | */ |
59 | #define CHECK_EXPIRE_INTERVAL (60*HZ) |
60 | #define ENTRY_TIMEOUT (6*60*HZ) |
61 | |
62 | #define DEFAULT_EXPIRATION (24*60*60*HZ) |
63 | |
64 | /* |
65 | * It is for full expiration check. |
66 | * When there is no partial expiration check (garbage collection) |
67 | * in a half hour, do a full expiration check to collect stale |
68 | * entries that haven't been touched for a day. |
69 | */ |
70 | #define COUNT_FOR_FULL_EXPIRATION 30 |
71 | |
72 | /* |
73 | * for IPVS lblcr entry hash table |
74 | */ |
75 | #ifndef CONFIG_IP_VS_LBLCR_TAB_BITS |
76 | #define CONFIG_IP_VS_LBLCR_TAB_BITS 10 |
77 | #endif |
78 | #define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS |
79 | #define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) |
80 | #define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) |
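/* with the default of 10 bits: 1024 buckets, mask 0x3ff */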
81 | |
82 | |
83 | /* |
84 | * IPVS destination set structure and operations |
85 | */ |
86 | struct ip_vs_dest_set_elem { |
87 | struct list_head list; /* list link */ |
88 | struct ip_vs_dest *dest; /* destination server */ |
89 | struct rcu_head rcu_head; |
90 | }; |
91 | |
92 | struct ip_vs_dest_set { |
93 | atomic_t size; /* set size */ |
94 | unsigned long lastmod; /* last modified time */ |
95 | struct list_head list; /* destination list */ |
96 | }; |
97 | |
98 | |
99 | static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, |
100 | struct ip_vs_dest *dest, bool check) |
101 | { |
102 | struct ip_vs_dest_set_elem *e; |
103 | |
104 | if (check) { |
105 | list_for_each_entry(e, &set->list, list) { |
106 | if (e->dest == dest) |
107 | return; |
108 | } |
109 | } |
110 | |
	e = kmalloc(sizeof(*e), GFP_ATOMIC);
112 | if (e == NULL) |
113 | return; |
114 | |
115 | ip_vs_dest_hold(dest); |
116 | e->dest = dest; |
117 | |
	list_add_rcu(&e->list, &set->list);
	atomic_inc(&set->size);
120 | |
121 | set->lastmod = jiffies; |
122 | } |
123 | |
124 | static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) |
125 | { |
126 | struct ip_vs_dest_set_elem *e; |
127 | |
128 | e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); |
	ip_vs_dest_put_and_free(e->dest);
	kfree(e);
131 | } |
132 | |
133 | static void |
134 | ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) |
135 | { |
136 | struct ip_vs_dest_set_elem *e; |
137 | |
138 | list_for_each_entry(e, &set->list, list) { |
139 | if (e->dest == dest) { |
140 | /* HIT */ |
			atomic_dec(&set->size);
			set->lastmod = jiffies;
			list_del_rcu(&e->list);
			call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
145 | break; |
146 | } |
147 | } |
148 | } |
149 | |
150 | static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) |
151 | { |
152 | struct ip_vs_dest_set_elem *e, *ep; |
153 | |
154 | list_for_each_entry_safe(e, ep, &set->list, list) { |
		list_del_rcu(&e->list);
		call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
157 | } |
158 | } |
159 | |
160 | /* get weighted least-connection node in the destination set */ |
161 | static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) |
162 | { |
163 | struct ip_vs_dest_set_elem *e; |
164 | struct ip_vs_dest *dest, *least; |
165 | int loh, doh; |
166 | |
167 | /* select the first destination server, whose weight > 0 */ |
168 | list_for_each_entry_rcu(e, &set->list, list) { |
169 | least = e->dest; |
170 | if (least->flags & IP_VS_DEST_F_OVERLOAD) |
171 | continue; |
172 | |
		if ((atomic_read(&least->weight) > 0)
		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
			loh = ip_vs_dest_conn_overhead(least);
176 | goto nextstage; |
177 | } |
178 | } |
179 | return NULL; |
180 | |
181 | /* find the destination with the weighted least load */ |
182 | nextstage: |
183 | list_for_each_entry_continue_rcu(e, &set->list, list) { |
184 | dest = e->dest; |
185 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) |
186 | continue; |
187 | |
188 | doh = ip_vs_dest_conn_overhead(dest); |
		if (((__s64)loh * atomic_read(&dest->weight) >
		     (__s64)doh * atomic_read(&least->weight))
191 | && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
192 | least = dest; |
193 | loh = doh; |
194 | } |
195 | } |
196 | |
197 | IP_VS_DBG_BUF(6, "%s(): server %s:%d " |
198 | "activeconns %d refcnt %d weight %d overhead %d\n" , |
199 | __func__, |
200 | IP_VS_DBG_ADDR(least->af, &least->addr), |
201 | ntohs(least->port), |
202 | atomic_read(&least->activeconns), |
203 | refcount_read(&least->refcnt), |
204 | atomic_read(&least->weight), loh); |
205 | return least; |
206 | } |
207 | |
208 | |
209 | /* get weighted most-connection node in the destination set */ |
210 | static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) |
211 | { |
212 | struct ip_vs_dest_set_elem *e; |
213 | struct ip_vs_dest *dest, *most; |
214 | int moh, doh; |
215 | |
216 | if (set == NULL) |
217 | return NULL; |
218 | |
219 | /* select the first destination server, whose weight > 0 */ |
220 | list_for_each_entry(e, &set->list, list) { |
221 | most = e->dest; |
		if (atomic_read(&most->weight) > 0) {
			moh = ip_vs_dest_conn_overhead(most);
224 | goto nextstage; |
225 | } |
226 | } |
227 | return NULL; |
228 | |
229 | /* find the destination with the weighted most load */ |
230 | nextstage: |
231 | list_for_each_entry_continue(e, &set->list, list) { |
232 | dest = e->dest; |
233 | doh = ip_vs_dest_conn_overhead(dest); |
234 | /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ |
		if (((__s64)moh * atomic_read(&dest->weight) <
		     (__s64)doh * atomic_read(&most->weight))
		    && (atomic_read(&dest->weight) > 0)) {
238 | most = dest; |
239 | moh = doh; |
240 | } |
241 | } |
242 | |
243 | IP_VS_DBG_BUF(6, "%s(): server %s:%d " |
244 | "activeconns %d refcnt %d weight %d overhead %d\n" , |
245 | __func__, |
246 | IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), |
247 | atomic_read(&most->activeconns), |
248 | refcount_read(&most->refcnt), |
249 | atomic_read(&most->weight), moh); |
250 | return most; |
251 | } |
252 | |
253 | |
254 | /* |
255 | * IPVS lblcr entry represents an association between destination |
256 | * IP address and its destination server set |
257 | */ |
258 | struct ip_vs_lblcr_entry { |
259 | struct hlist_node list; |
260 | int af; /* address family */ |
261 | union nf_inet_addr addr; /* destination IP address */ |
262 | struct ip_vs_dest_set set; /* destination server set */ |
263 | unsigned long lastuse; /* last used time */ |
264 | struct rcu_head rcu_head; |
265 | }; |
266 | |
267 | |
268 | /* |
269 | * IPVS lblcr hash table |
270 | */ |
271 | struct ip_vs_lblcr_table { |
272 | struct rcu_head rcu_head; |
273 | struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ |
274 | atomic_t entries; /* number of entries */ |
275 | int max_size; /* maximum size of entries */ |
276 | struct timer_list periodic_timer; /* collect stale entries */ |
277 | struct ip_vs_service *svc; /* pointer back to service */ |
278 | int rover; /* rover for expire check */ |
279 | int counter; /* counter for no expire */ |
280 | bool dead; |
281 | }; |
282 | |
283 | |
284 | #ifdef CONFIG_SYSCTL |
285 | /* |
286 | * IPVS LBLCR sysctl table |
287 | */ |
288 | |
289 | static struct ctl_table vs_vars_table[] = { |
290 | { |
		.procname = "lblcr_expiration",
292 | .data = NULL, |
293 | .maxlen = sizeof(int), |
294 | .mode = 0644, |
295 | .proc_handler = proc_dointvec_jiffies, |
296 | }, |
297 | { } |
298 | }; |
299 | #endif |
300 | |
301 | static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) |
302 | { |
	hlist_del_rcu(&en->list);
	ip_vs_dest_set_eraseall(&en->set);
305 | kfree_rcu(en, rcu_head); |
306 | } |
307 | |
308 | |
309 | /* |
310 | * Returns hash value for IPVS LBLCR entry |
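 * (for IPv6, the four 32-bit words of the address are XOR-folded first)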
311 | */ |
312 | static inline unsigned int |
313 | ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) |
314 | { |
315 | __be32 addr_fold = addr->ip; |
316 | |
317 | #ifdef CONFIG_IP_VS_IPV6 |
318 | if (af == AF_INET6) |
319 | addr_fold = addr->ip6[0]^addr->ip6[1]^ |
320 | addr->ip6[2]^addr->ip6[3]; |
321 | #endif |
322 | return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); |
323 | } |
324 | |
325 | |
326 | /* |
327 | * Hash an entry in the ip_vs_lblcr_table. |
328 | * returns bool success. |
329 | */ |
330 | static void |
331 | ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) |
332 | { |
	unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
334 | |
	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
337 | } |
338 | |
339 | |
340 | /* Get ip_vs_lblcr_entry associated with supplied parameters. */ |
341 | static inline struct ip_vs_lblcr_entry * |
342 | ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, |
343 | const union nf_inet_addr *addr) |
344 | { |
345 | unsigned int hash = ip_vs_lblcr_hashkey(af, addr); |
346 | struct ip_vs_lblcr_entry *en; |
347 | |
348 | hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) |
		if (ip_vs_addr_equal(af, &en->addr, addr))
350 | return en; |
351 | |
352 | return NULL; |
353 | } |
354 | |
355 | |
356 | /* |
357 | * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination |
358 | * IP address to a server. Called under spin lock. |
359 | */ |
360 | static inline struct ip_vs_lblcr_entry * |
361 | ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, |
362 | u16 af, struct ip_vs_dest *dest) |
363 | { |
364 | struct ip_vs_lblcr_entry *en; |
365 | |
	en = ip_vs_lblcr_get(af, tbl, daddr);
367 | if (!en) { |
		en = kmalloc(sizeof(*en), GFP_ATOMIC);
369 | if (!en) |
370 | return NULL; |
371 | |
372 | en->af = af; |
		ip_vs_addr_copy(af, &en->addr, daddr);
374 | en->lastuse = jiffies; |
375 | |
376 | /* initialize its dest set */ |
		atomic_set(&(en->set.size), 0);
		INIT_LIST_HEAD(&en->set.list);
379 | |
		ip_vs_dest_set_insert(&en->set, dest, false);
381 | |
382 | ip_vs_lblcr_hash(tbl, en); |
383 | return en; |
384 | } |
385 | |
	ip_vs_dest_set_insert(&en->set, dest, true);
387 | |
388 | return en; |
389 | } |
390 | |
391 | |
392 | /* |
393 | * Flush all the entries of the specified table. |
394 | */ |
395 | static void ip_vs_lblcr_flush(struct ip_vs_service *svc) |
396 | { |
397 | struct ip_vs_lblcr_table *tbl = svc->sched_data; |
398 | int i; |
399 | struct ip_vs_lblcr_entry *en; |
400 | struct hlist_node *next; |
401 | |
	spin_lock_bh(&svc->sched_lock);
403 | tbl->dead = true; |
404 | for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { |
405 | hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { |
406 | ip_vs_lblcr_free(en); |
407 | } |
408 | } |
	spin_unlock_bh(&svc->sched_lock);
410 | } |
411 | |
412 | static int sysctl_lblcr_expiration(struct ip_vs_service *svc) |
413 | { |
414 | #ifdef CONFIG_SYSCTL |
415 | return svc->ipvs->sysctl_lblcr_expiration; |
416 | #else |
417 | return DEFAULT_EXPIRATION; |
418 | #endif |
419 | } |
420 | |
421 | static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) |
422 | { |
423 | struct ip_vs_lblcr_table *tbl = svc->sched_data; |
424 | unsigned long now = jiffies; |
425 | int i, j; |
426 | struct ip_vs_lblcr_entry *en; |
427 | struct hlist_node *next; |
428 | |
429 | for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { |
430 | j = (j + 1) & IP_VS_LBLCR_TAB_MASK; |
431 | |
		spin_lock(&svc->sched_lock);
433 | hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { |
434 | if (time_after(en->lastuse + |
435 | sysctl_lblcr_expiration(svc), now)) |
436 | continue; |
437 | |
438 | ip_vs_lblcr_free(en); |
			atomic_dec(&tbl->entries);
440 | } |
		spin_unlock(&svc->sched_lock);
442 | } |
443 | tbl->rover = j; |
444 | } |
445 | |
446 | |
447 | /* |
448 | * Periodical timer handler for IPVS lblcr table |
449 | * It is used to collect stale entries when the number of entries |
450 | * exceeds the maximum size of the table. |
451 | * |
452 | * Fixme: we probably need more complicated algorithm to collect |
453 | * entries that have not been used for a long time even |
454 | * if the number of entries doesn't exceed the maximum size |
455 | * of the table. |
456 | * The full expiration check is for this purpose now. |
457 | */ |
458 | static void ip_vs_lblcr_check_expire(struct timer_list *t) |
459 | { |
460 | struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); |
461 | struct ip_vs_service *svc = tbl->svc; |
462 | unsigned long now = jiffies; |
463 | int goal; |
464 | int i, j; |
465 | struct ip_vs_lblcr_entry *en; |
466 | struct hlist_node *next; |
467 | |
468 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { |
469 | /* do full expiration check */ |
470 | ip_vs_lblcr_full_check(svc); |
471 | tbl->counter = 1; |
472 | goto out; |
473 | } |
474 | |
	if (atomic_read(&tbl->entries) <= tbl->max_size) {
476 | tbl->counter++; |
477 | goto out; |
478 | } |
479 | |
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
481 | if (goal > tbl->max_size/2) |
482 | goal = tbl->max_size/2; |
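	/* i.e. never try to reclaim more than half of max_size per run */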
483 | |
484 | for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { |
485 | j = (j + 1) & IP_VS_LBLCR_TAB_MASK; |
486 | |
		spin_lock(&svc->sched_lock);
488 | hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { |
489 | if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) |
490 | continue; |
491 | |
492 | ip_vs_lblcr_free(en); |
			atomic_dec(&tbl->entries);
494 | goal--; |
495 | } |
		spin_unlock(&svc->sched_lock);
497 | if (goal <= 0) |
498 | break; |
499 | } |
500 | tbl->rover = j; |
501 | |
502 | out: |
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
504 | } |
505 | |
506 | static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) |
507 | { |
508 | int i; |
509 | struct ip_vs_lblcr_table *tbl; |
510 | |
511 | /* |
512 | * Allocate the ip_vs_lblcr_table for this service |
513 | */ |
	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
515 | if (tbl == NULL) |
516 | return -ENOMEM; |
517 | |
518 | svc->sched_data = tbl; |
519 | IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for " |
520 | "current service\n" , sizeof(*tbl)); |
521 | |
522 | /* |
523 | * Initialize the hash buckets |
524 | */ |
525 | for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { |
526 | INIT_HLIST_HEAD(&tbl->bucket[i]); |
527 | } |
528 | tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; |
529 | tbl->rover = 0; |
530 | tbl->counter = 1; |
531 | tbl->dead = false; |
532 | tbl->svc = svc; |
	atomic_set(&tbl->entries, 0);
534 | |
535 | /* |
536 | * Hook periodic timer for garbage collection |
537 | */ |
538 | timer_setup(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 0); |
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
540 | |
541 | return 0; |
542 | } |
543 | |
544 | |
545 | static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) |
546 | { |
547 | struct ip_vs_lblcr_table *tbl = svc->sched_data; |
548 | |
549 | /* remove periodic timer */ |
	timer_shutdown_sync(&tbl->periodic_timer);
551 | |
552 | /* got to clean up table entries here */ |
553 | ip_vs_lblcr_flush(svc); |
554 | |
555 | /* release the table itself */ |
556 | kfree_rcu(tbl, rcu_head); |
557 | IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n" , |
558 | sizeof(*tbl)); |
559 | } |
560 | |
561 | |
562 | static inline struct ip_vs_dest * |
563 | __ip_vs_lblcr_schedule(struct ip_vs_service *svc) |
564 | { |
565 | struct ip_vs_dest *dest, *least; |
566 | int loh, doh; |
567 | |
568 | /* |
569 | * We use the following formula to estimate the load: |
570 | * (dest overhead) / dest->weight |
571 | * |
572 | * Remember -- no floats in kernel mode!!! |
573 | * The comparison of h1*w2 > h2*w1 is equivalent to that of |
574 | * h1/w1 > h2/w2 |
575 | * if every weight is larger than zero. |
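	 *
	 * For example, overhead 50 with weight 2 versus overhead 10 with
	 * weight 1: 50*1 > 10*2, so the second server has the lower
	 * relative load (25 vs 10) and wins the comparison.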
576 | * |
577 | * The server with weight=0 is quiesced and will not receive any |
578 | * new connection. |
579 | */ |
580 | list_for_each_entry_rcu(dest, &svc->destinations, n_list) { |
581 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) |
582 | continue; |
583 | |
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = ip_vs_dest_conn_overhead(least);
587 | goto nextstage; |
588 | } |
589 | } |
590 | return NULL; |
591 | |
592 | /* |
593 | * Find the destination with the least load. |
594 | */ |
595 | nextstage: |
596 | list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { |
597 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) |
598 | continue; |
599 | |
600 | doh = ip_vs_dest_conn_overhead(dest); |
		if ((__s64)loh * atomic_read(&dest->weight) >
		    (__s64)doh * atomic_read(&least->weight)) {
603 | least = dest; |
604 | loh = doh; |
605 | } |
606 | } |
607 | |
608 | IP_VS_DBG_BUF(6, "LBLCR: server %s:%d " |
609 | "activeconns %d refcnt %d weight %d overhead %d\n" , |
610 | IP_VS_DBG_ADDR(least->af, &least->addr), |
611 | ntohs(least->port), |
612 | atomic_read(&least->activeconns), |
613 | refcount_read(&least->refcnt), |
614 | atomic_read(&least->weight), loh); |
615 | |
616 | return least; |
617 | } |
618 | |
619 | |
620 | /* |
621 | * If this destination server is overloaded and there is a less loaded |
622 | * server, then return true. |
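 * For example, a dest with weight 3 and 4 active connections is
 * overloaded if some other server with weight 8 currently has fewer
 * than 4 active connections.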
623 | */ |
624 | static inline int |
625 | is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) |
626 | { |
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
628 | struct ip_vs_dest *d; |
629 | |
630 | list_for_each_entry_rcu(d, &svc->destinations, n_list) { |
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
633 | return 1; |
634 | } |
635 | } |
636 | } |
637 | return 0; |
638 | } |
639 | |
640 | |
641 | /* |
642 | * Locality-Based (weighted) Least-Connection scheduling |
643 | */ |
644 | static struct ip_vs_dest * |
645 | ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, |
646 | struct ip_vs_iphdr *iph) |
647 | { |
648 | struct ip_vs_lblcr_table *tbl = svc->sched_data; |
649 | struct ip_vs_dest *dest; |
650 | struct ip_vs_lblcr_entry *en; |
651 | |
652 | IP_VS_DBG(6, "%s(): Scheduling...\n" , __func__); |
653 | |
654 | /* First look in our cache */ |
	en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
656 | if (en) { |
657 | en->lastuse = jiffies; |
658 | |
659 | /* Get the least loaded destination */ |
		dest = ip_vs_dest_set_min(&en->set);
661 | |
662 | /* More than one destination + enough time passed by, cleanup */ |
		if (atomic_read(&en->set.size) > 1 &&
664 | time_after(jiffies, en->set.lastmod + |
665 | sysctl_lblcr_expiration(svc))) { |
			spin_lock_bh(&svc->sched_lock);
			if (atomic_read(&en->set.size) > 1) {
668 | struct ip_vs_dest *m; |
669 | |
				m = ip_vs_dest_set_max(&en->set);
				if (m)
					ip_vs_dest_set_erase(&en->set, m);
673 | } |
			spin_unlock_bh(&svc->sched_lock);
675 | } |
676 | |
677 | /* If the destination is not overloaded, use it */ |
678 | if (dest && !is_overloaded(dest, svc)) |
679 | goto out; |
680 | |
681 | /* The cache entry is invalid, time to schedule */ |
682 | dest = __ip_vs_lblcr_schedule(svc); |
683 | if (!dest) { |
			ip_vs_scheduler_err(svc, "no destination available");
685 | return NULL; |
686 | } |
687 | |
688 | /* Update our cache entry */ |
		spin_lock_bh(&svc->sched_lock);
		if (!tbl->dead)
			ip_vs_dest_set_insert(&en->set, dest, true);
		spin_unlock_bh(&svc->sched_lock);
693 | goto out; |
694 | } |
695 | |
696 | /* No cache entry, time to schedule */ |
697 | dest = __ip_vs_lblcr_schedule(svc); |
698 | if (!dest) { |
		IP_VS_DBG(1, "no destination available\n");
700 | return NULL; |
701 | } |
702 | |
703 | /* If we fail to create a cache entry, we'll just use the valid dest */ |
	spin_lock_bh(&svc->sched_lock);
	if (!tbl->dead)
		ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest);
	spin_unlock_bh(&svc->sched_lock);
708 | |
709 | out: |
710 | IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n" , |
711 | IP_VS_DBG_ADDR(svc->af, &iph->daddr), |
712 | IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); |
713 | |
714 | return dest; |
715 | } |
716 | |
717 | |
718 | /* |
719 | * IPVS LBLCR Scheduler structure |
720 | */ |
721 | static struct ip_vs_scheduler ip_vs_lblcr_scheduler = |
722 | { |
	.name = "lblcr",
724 | .refcnt = ATOMIC_INIT(0), |
725 | .module = THIS_MODULE, |
726 | .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), |
727 | .init_service = ip_vs_lblcr_init_svc, |
728 | .done_service = ip_vs_lblcr_done_svc, |
729 | .schedule = ip_vs_lblcr_schedule, |
730 | }; |
731 | |
732 | /* |
733 | * per netns init. |
734 | */ |
735 | #ifdef CONFIG_SYSCTL |
736 | static int __net_init __ip_vs_lblcr_init(struct net *net) |
737 | { |
738 | struct netns_ipvs *ipvs = net_ipvs(net); |
739 | size_t vars_table_size = ARRAY_SIZE(vs_vars_table); |
740 | |
741 | if (!ipvs) |
742 | return -ENOENT; |
743 | |
	if (!net_eq(net, &init_net)) {
		ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
						sizeof(vs_vars_table),
						GFP_KERNEL);
748 | if (ipvs->lblcr_ctl_table == NULL) |
749 | return -ENOMEM; |
750 | |
751 | /* Don't export sysctls to unprivileged users */ |
752 | if (net->user_ns != &init_user_ns) { |
753 | ipvs->lblcr_ctl_table[0].procname = NULL; |
754 | vars_table_size = 0; |
755 | } |
756 | } else |
757 | ipvs->lblcr_ctl_table = vs_vars_table; |
758 | ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; |
759 | ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; |
760 | |
	ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
							ipvs->lblcr_ctl_table,
							vars_table_size);
764 | if (!ipvs->lblcr_ctl_header) { |
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblcr_ctl_table);
767 | return -ENOMEM; |
768 | } |
769 | |
770 | return 0; |
771 | } |
772 | |
773 | static void __net_exit __ip_vs_lblcr_exit(struct net *net) |
774 | { |
775 | struct netns_ipvs *ipvs = net_ipvs(net); |
776 | |
	unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
778 | |
	if (!net_eq(net, &init_net))
		kfree(ipvs->lblcr_ctl_table);
781 | } |
782 | |
783 | #else |
784 | |
785 | static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } |
786 | static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } |
787 | |
788 | #endif |
789 | |
790 | static struct pernet_operations ip_vs_lblcr_ops = { |
791 | .init = __ip_vs_lblcr_init, |
792 | .exit = __ip_vs_lblcr_exit, |
793 | }; |
794 | |
795 | static int __init ip_vs_lblcr_init(void) |
796 | { |
797 | int ret; |
798 | |
799 | ret = register_pernet_subsys(&ip_vs_lblcr_ops); |
800 | if (ret) |
801 | return ret; |
802 | |
	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
804 | if (ret) |
805 | unregister_pernet_subsys(&ip_vs_lblcr_ops); |
806 | return ret; |
807 | } |
808 | |
809 | static void __exit ip_vs_lblcr_cleanup(void) |
810 | { |
	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
812 | unregister_pernet_subsys(&ip_vs_lblcr_ops); |
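	/* wait for in-flight call_rcu() callbacks before module unload */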
813 | rcu_barrier(); |
814 | } |
815 | |
816 | |
817 | module_init(ip_vs_lblcr_init); |
818 | module_exit(ip_vs_lblcr_cleanup); |
819 | MODULE_LICENSE("GPL" ); |
820 | MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler" ); |
821 | |