1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * IPVS: Weighted Round-Robin Scheduling module |
4 | * |
5 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> |
6 | * |
7 | * Changes: |
8 | * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest |
9 | * Wensong Zhang : changed some comestics things for debugging |
10 | * Wensong Zhang : changed for the d-linked destination list |
11 | * Wensong Zhang : added the ip_vs_wrr_update_svc |
12 | * Julian Anastasov : fixed the bug of returning destination |
13 | * with weight 0 when all weights are zero |
14 | */ |
15 | |
16 | #define KMSG_COMPONENT "IPVS" |
17 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt |
18 | |
19 | #include <linux/module.h> |
20 | #include <linux/kernel.h> |
21 | #include <linux/slab.h> |
22 | #include <linux/net.h> |
23 | #include <linux/gcd.h> |
24 | |
25 | #include <net/ip_vs.h> |
26 | |
27 | /* The WRR algorithm depends on some caclulations: |
28 | * - mw: maximum weight |
29 | * - di: weight step, greatest common divisor from all weights |
30 | * - cw: current required weight |
31 | * As result, all weights are in the [di..mw] range with a step=di. |
32 | * |
33 | * First, we start with cw = mw and select dests with weight >= cw. |
34 | * Then cw is reduced with di and all dests are checked again. |
35 | * Last pass should be with cw = di. We have mw/di passes in total: |
36 | * |
37 | * pass 1: cw = max weight |
38 | * pass 2: cw = max weight - di |
39 | * pass 3: cw = max weight - 2 * di |
40 | * ... |
41 | * last pass: cw = di |
42 | * |
43 | * Weights are supposed to be >= di but we run in parallel with |
44 | * weight changes, it is possible some dest weight to be reduced |
45 | * below di, bad if it is the only available dest. |
46 | * |
47 | * So, we modify how mw is calculated, now it is reduced with (di - 1), |
48 | * so that last cw is 1 to catch such dests with weight below di: |
49 | * pass 1: cw = max weight - (di - 1) |
50 | * pass 2: cw = max weight - di - (di - 1) |
51 | * pass 3: cw = max weight - 2 * di - (di - 1) |
52 | * ... |
53 | * last pass: cw = 1 |
54 | * |
55 | */ |
56 | |
57 | /* |
58 | * current destination pointer for weighted round-robin scheduling |
59 | */ |
60 | struct ip_vs_wrr_mark { |
61 | struct ip_vs_dest *cl; /* current dest or head */ |
62 | int cw; /* current weight */ |
63 | int mw; /* maximum weight */ |
64 | int di; /* decreasing interval */ |
65 | struct rcu_head rcu_head; |
66 | }; |
67 | |
68 | |
69 | static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) |
70 | { |
71 | struct ip_vs_dest *dest; |
72 | int weight; |
73 | int g = 0; |
74 | |
75 | list_for_each_entry(dest, &svc->destinations, n_list) { |
76 | weight = atomic_read(v: &dest->weight); |
77 | if (weight > 0) { |
78 | if (g > 0) |
79 | g = gcd(a: weight, b: g); |
80 | else |
81 | g = weight; |
82 | } |
83 | } |
84 | return g ? g : 1; |
85 | } |
86 | |
87 | |
88 | /* |
89 | * Get the maximum weight of the service destinations. |
90 | */ |
91 | static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) |
92 | { |
93 | struct ip_vs_dest *dest; |
94 | int new_weight, weight = 0; |
95 | |
96 | list_for_each_entry(dest, &svc->destinations, n_list) { |
97 | new_weight = atomic_read(v: &dest->weight); |
98 | if (new_weight > weight) |
99 | weight = new_weight; |
100 | } |
101 | |
102 | return weight; |
103 | } |
104 | |
105 | |
106 | static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) |
107 | { |
108 | struct ip_vs_wrr_mark *mark; |
109 | |
110 | /* |
111 | * Allocate the mark variable for WRR scheduling |
112 | */ |
113 | mark = kmalloc(size: sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); |
114 | if (mark == NULL) |
115 | return -ENOMEM; |
116 | |
117 | mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); |
118 | mark->di = ip_vs_wrr_gcd_weight(svc); |
119 | mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); |
120 | mark->cw = mark->mw; |
121 | svc->sched_data = mark; |
122 | |
123 | return 0; |
124 | } |
125 | |
126 | |
127 | static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) |
128 | { |
129 | struct ip_vs_wrr_mark *mark = svc->sched_data; |
130 | |
131 | /* |
132 | * Release the mark variable |
133 | */ |
134 | kfree_rcu(mark, rcu_head); |
135 | } |
136 | |
137 | |
138 | static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, |
139 | struct ip_vs_dest *dest) |
140 | { |
141 | struct ip_vs_wrr_mark *mark = svc->sched_data; |
142 | |
143 | spin_lock_bh(lock: &svc->sched_lock); |
144 | mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); |
145 | mark->di = ip_vs_wrr_gcd_weight(svc); |
146 | mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); |
147 | if (mark->cw > mark->mw || !mark->cw) |
148 | mark->cw = mark->mw; |
149 | else if (mark->di > 1) |
150 | mark->cw = (mark->cw / mark->di) * mark->di + 1; |
151 | spin_unlock_bh(lock: &svc->sched_lock); |
152 | return 0; |
153 | } |
154 | |
155 | |
156 | /* |
157 | * Weighted Round-Robin Scheduling |
158 | */ |
159 | static struct ip_vs_dest * |
160 | ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, |
161 | struct ip_vs_iphdr *iph) |
162 | { |
163 | struct ip_vs_dest *dest, *last, *stop = NULL; |
164 | struct ip_vs_wrr_mark *mark = svc->sched_data; |
165 | bool last_pass = false, restarted = false; |
166 | |
167 | IP_VS_DBG(6, "%s(): Scheduling...\n" , __func__); |
168 | |
169 | spin_lock_bh(lock: &svc->sched_lock); |
170 | dest = mark->cl; |
171 | /* No available dests? */ |
172 | if (mark->mw == 0) |
173 | goto err_noavail; |
174 | last = dest; |
175 | /* Stop only after all dests were checked for weight >= 1 (last pass) */ |
176 | while (1) { |
177 | list_for_each_entry_continue_rcu(dest, |
178 | &svc->destinations, |
179 | n_list) { |
180 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && |
181 | atomic_read(v: &dest->weight) >= mark->cw) |
182 | goto found; |
183 | if (dest == stop) |
184 | goto err_over; |
185 | } |
186 | mark->cw -= mark->di; |
187 | if (mark->cw <= 0) { |
188 | mark->cw = mark->mw; |
189 | /* Stop if we tried last pass from first dest: |
190 | * 1. last_pass: we started checks when cw > di but |
191 | * then all dests were checked for w >= 1 |
192 | * 2. last was head: the first and only traversal |
193 | * was for weight >= 1, for all dests. |
194 | */ |
195 | if (last_pass || |
196 | &last->n_list == &svc->destinations) |
197 | goto err_over; |
198 | restarted = true; |
199 | } |
200 | last_pass = mark->cw <= mark->di; |
201 | if (last_pass && restarted && |
202 | &last->n_list != &svc->destinations) { |
203 | /* First traversal was for w >= 1 but only |
204 | * for dests after 'last', now do the same |
205 | * for all dests up to 'last'. |
206 | */ |
207 | stop = last; |
208 | } |
209 | } |
210 | |
211 | found: |
212 | IP_VS_DBG_BUF(6, "WRR: server %s:%u " |
213 | "activeconns %d refcnt %d weight %d\n" , |
214 | IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), |
215 | atomic_read(&dest->activeconns), |
216 | refcount_read(&dest->refcnt), |
217 | atomic_read(&dest->weight)); |
218 | mark->cl = dest; |
219 | |
220 | out: |
221 | spin_unlock_bh(lock: &svc->sched_lock); |
222 | return dest; |
223 | |
224 | err_noavail: |
225 | mark->cl = dest; |
226 | dest = NULL; |
227 | ip_vs_scheduler_err(svc, msg: "no destination available" ); |
228 | goto out; |
229 | |
230 | err_over: |
231 | mark->cl = dest; |
232 | dest = NULL; |
233 | ip_vs_scheduler_err(svc, msg: "no destination available: " |
234 | "all destinations are overloaded" ); |
235 | goto out; |
236 | } |
237 | |
238 | |
239 | static struct ip_vs_scheduler ip_vs_wrr_scheduler = { |
240 | .name = "wrr" , |
241 | .refcnt = ATOMIC_INIT(0), |
242 | .module = THIS_MODULE, |
243 | .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), |
244 | .init_service = ip_vs_wrr_init_svc, |
245 | .done_service = ip_vs_wrr_done_svc, |
246 | .add_dest = ip_vs_wrr_dest_changed, |
247 | .del_dest = ip_vs_wrr_dest_changed, |
248 | .upd_dest = ip_vs_wrr_dest_changed, |
249 | .schedule = ip_vs_wrr_schedule, |
250 | }; |
251 | |
252 | static int __init ip_vs_wrr_init(void) |
253 | { |
254 | return register_ip_vs_scheduler(scheduler: &ip_vs_wrr_scheduler) ; |
255 | } |
256 | |
257 | static void __exit ip_vs_wrr_cleanup(void) |
258 | { |
259 | unregister_ip_vs_scheduler(scheduler: &ip_vs_wrr_scheduler); |
260 | synchronize_rcu(); |
261 | } |
262 | |
263 | module_init(ip_vs_wrr_init); |
264 | module_exit(ip_vs_wrr_cleanup); |
265 | MODULE_LICENSE("GPL" ); |
266 | MODULE_DESCRIPTION("ipvs weighted round-robin scheduler" ); |
267 | |