/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2021 Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"
#include "spdk/likely.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/env.h"

#include "spdk/thread.h"
#include "spdk_internal/event.h"
#include "spdk/scheduler.h"
#include "spdk_internal/usdt.h"

static uint32_t g_main_lcore;

struct core_stats {
	uint64_t busy;
	uint64_t idle;
	uint32_t thread_count;
};

static struct core_stats *g_cores;

uint8_t g_scheduler_load_limit = 20;
uint8_t g_scheduler_core_limit = 80;
uint8_t g_scheduler_core_busy = 95;
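/* Tunables (adjustable at runtime via set_opts() below): a thread whose load
 * is below g_scheduler_load_limit percent is treated as idle and consolidated
 * onto the main core, a core is considered full once it reaches
 * g_scheduler_core_limit percent busy, and a core at or above
 * g_scheduler_core_busy percent gets the special busy/idle rebalancing
 * treatment in _move_thread(). */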

static uint8_t
_busy_pct(uint64_t busy, uint64_t idle)
{
	if ((busy + idle) == 0) {
		return 0;
	}

	return busy * 100 / (busy + idle);
}
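/* For illustration (hypothetical numbers, not from the source): with
 * busy = 300 and idle = 700 ticks, _busy_pct() returns 300 * 100 / 1000 = 30,
 * i.e. 30% utilization. The integer division truncates, so percentages are
 * rounded down. */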

static uint8_t
_get_thread_load(struct spdk_scheduler_thread_info *thread_info)
{
	uint64_t busy, idle;

	busy = thread_info->current_stats.busy_tsc;
	idle = thread_info->current_stats.idle_tsc;

	/* Return the percentage of time the thread was busy. */
	return _busy_pct(busy, idle);
}
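/* For illustration (hypothetical numbers): a thread that spent 100 busy and
 * 900 idle ticks during the last scheduling period has a load of 10, which is
 * below the default g_scheduler_load_limit of 20, so _balance_idle() below
 * would move it to the main core. */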

typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);

static void
_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
{
	struct spdk_scheduler_core_info *core;
	uint32_t i, j;

	SPDK_ENV_FOREACH_CORE(i) {
		core = &cores_info[i];
		for (j = 0; j < core->threads_count; j++) {
			fn(&core->thread_infos[j]);
		}
	}
}

static void
_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	struct core_stats *src = &g_cores[thread_info->lcore];
	uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
	uint8_t busy_pct = _busy_pct(src->busy, src->idle);
	uint64_t tsc;

	SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core);

	if (src == dst) {
		/* Don't modify stats if the thread is already on that core. */
		return;
	}

	dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
	dst->idle -= spdk_min(dst->idle, busy_tsc);
	dst->thread_count++;

	/* Adjust the source core's busy/idle stats as if the thread was not present on it.
	 * The core load will then reflect the sum of all remaining threads on it. */
	src->busy -= spdk_min(src->busy, busy_tsc);
	src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);

	if (busy_pct >= g_scheduler_core_busy &&
	    _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) {
		/* This core was so busy that we cannot assume all of busy_tsc
		 * consumed by the moved thread will now be idle_tsc - it's
		 * very possible the remaining threads will use these cycles
		 * as busy_tsc.
		 *
		 * So make sure we don't drop the updated estimate below
		 * g_scheduler_core_limit, so that other cores can't
		 * move threads to this core during this scheduling
		 * period.
		 */
		tsc = src->busy + src->idle;
		src->busy = tsc * g_scheduler_core_limit / 100;
		src->idle = tsc - src->busy;
	}
	assert(src->thread_count > 0);
	src->thread_count--;

	thread_info->lcore = dst_core;
}
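/* Worked example of the bookkeeping above (hypothetical numbers, default
 * limits): moving a thread with busy_tsc = 600 off a source core with
 * busy = 980 and idle = 20 (98% busy, above g_scheduler_core_busy = 95)
 * first yields src->busy = 380, src->idle = 620, i.e. 38%. Since that is
 * below g_scheduler_core_limit = 80, the estimate is clamped back up:
 * tsc = 1000, src->busy = 800, src->idle = 200, keeping the core at the 80%
 * limit so no other thread gets moved onto it during this period. */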

static bool
_is_core_at_limit(uint32_t core_id)
{
	struct core_stats *core = &g_cores[core_id];
	uint64_t busy, idle;

	/* A core with no threads or a single thread cannot be over the limit. */
	if (core->thread_count <= 1) {
		return false;
	}

	busy = core->busy;
	idle = core->idle;

	/* No work was done, exit before a possible division by 0. */
	if (busy == 0) {
		return false;
	}

	/* Work done was less than the limit. */
	if (_busy_pct(busy, idle) < g_scheduler_core_limit) {
		return false;
	}

	return true;
}
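/* For illustration (hypothetical numbers): a core running two threads with
 * busy = 850 and idle = 150 is at 85%, which is not below the default
 * g_scheduler_core_limit of 80, so it is reported as being at its limit. */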

static bool
_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	uint64_t new_busy_tsc, new_idle_tsc;

	/* A thread can always fit on the core it's currently on. */
	if (thread_info->lcore == dst_core) {
		return true;
	}

	/* Reactors in interrupt mode do not update stats, so a thread can
	 * always fit onto a reactor in interrupt mode. */
	if (dst->busy + dst->idle == 0) {
		return true;
	}

	/* The core has no threads. */
	if (dst->thread_count == 0) {
		return true;
	}

	/* The core doesn't have enough idle_tsc to take this thread. */
	if (dst->idle < thread_info->current_stats.busy_tsc) {
		return false;
	}

	new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
	new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;

	/* The core cannot fit this thread if doing so would put it over
	 * g_scheduler_core_limit. */
	return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit;
}
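/* For illustration (hypothetical numbers): a destination core that already
 * has a thread and reports busy = 500, idle = 500 cannot fit a thread with
 * busy_tsc = 400, since the projected load would be _busy_pct(900, 100) = 90,
 * which is not below the default g_scheduler_core_limit of 80. */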

static uint32_t
_find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t i;
	uint32_t current_lcore = thread_info->lcore;
	uint32_t least_busy_lcore = thread_info->lcore;
	struct spdk_thread *thread;
	struct spdk_cpuset *cpumask;
	bool core_at_limit = _is_core_at_limit(current_lcore);

	thread = spdk_thread_get_by_id(thread_info->thread_id);
	if (thread == NULL) {
		return current_lcore;
	}
	cpumask = spdk_thread_get_cpumask(thread);

	/* Find a core that can fit the thread. */
	SPDK_ENV_FOREACH_CORE(i) {
		/* Ignore cores outside the cpumask. */
		if (!spdk_cpuset_get_cpu(cpumask, i)) {
			continue;
		}

		/* Search for the least busy core. */
		if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
			least_busy_lcore = i;
		}

		/* Skip cores that cannot fit the thread, as well as the current core. */
		if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
			continue;
		}
		if (i == g_main_lcore) {
			/* Consider g_main_lcore first, to consolidate threads on the main lcore if possible. */
			return i;
		} else if (i < current_lcore && current_lcore != g_main_lcore) {
			/* A lower core id was found; move there to consolidate threads on the lowest core ids. */
			return i;
		} else if (core_at_limit) {
			/* When the current core is over the limit, any other core id is better than the current one. */
			return i;
		}
	}

	/* If the current core is over the limit, place the thread on the least
	 * busy core to balance threads. */
	if (core_at_limit) {
		return least_busy_lcore;
	}

	/* If no better core is found, remain on the same one. */
	return current_lcore;
}
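/* Summary of the placement priority implemented above: among cores in the
 * thread's cpumask that can fit it, prefer g_main_lcore, then any lower core
 * id (to consolidate threads on the lowest core ids), then, only if the
 * current core is at its limit, any other fitting core; if nothing fits and
 * the current core is at its limit, fall back to the least busy core,
 * otherwise stay put. */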

static int
init(void)
{
	g_main_lcore = spdk_env_get_current_core();

	if (spdk_governor_set("dpdk_governor") != 0) {
		SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
	}

	g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
	if (g_cores == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
		return -ENOMEM;
	}

	return 0;
}
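/* Note: g_cores is indexed directly by lcore id in the functions above, which
 * is why init() sizes the array to spdk_env_get_last_core() + 1 entries
 * rather than to the number of active cores. */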

static void
deinit(void)
{
	free(g_cores);
	g_cores = NULL;
	spdk_governor_set(NULL);
}

static void
_balance_idle(struct spdk_scheduler_thread_info *thread_info)
{
	if (_get_thread_load(thread_info) >= g_scheduler_load_limit) {
		return;
	}
	/* This thread is idle; move it to the main core. */
	_move_thread(thread_info, g_main_lcore);
}

static void
_balance_active(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t target_lcore;

	if (_get_thread_load(thread_info) < g_scheduler_load_limit) {
		return;
	}

	/* This thread is active. */
	target_lcore = _find_optimal_core(thread_info);
	_move_thread(thread_info, target_lcore);
}

static void
balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
{
	struct spdk_reactor *reactor;
	struct spdk_governor *governor;
	struct spdk_scheduler_core_info *core;
	struct core_stats *main_core;
	uint32_t i;
	int rc;
	bool busy_threads_present = false;

	SPDK_DTRACE_PROBE1(dynsched_balance, cores_count);

	SPDK_ENV_FOREACH_CORE(i) {
		g_cores[i].thread_count = cores_info[i].threads_count;
		g_cores[i].busy = cores_info[i].current_busy_tsc;
		g_cores[i].idle = cores_info[i].current_idle_tsc;
		SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]);
	}
	main_core = &g_cores[g_main_lcore];

	/* Distribute threads in two passes, to make sure updated core stats are considered on each pass.
	 * 1) Move all idle threads to the main core. */
	_foreach_thread(cores_info, _balance_idle);
	/* 2) Distribute active threads across all cores. */
	_foreach_thread(cores_info, _balance_active);

	/* Switch unused cores to interrupt mode and switch cores back to polled mode
	 * if they will be used after rebalancing. */
	SPDK_ENV_FOREACH_CORE(i) {
		reactor = spdk_reactor_get(i);
		assert(reactor != NULL);

		core = &cores_info[i];
		/* We can switch to interrupt mode only if the reactor does not currently have any threads on it. */
		if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
			core->interrupt_mode = true;
		} else if (g_cores[i].thread_count != 0) {
			core->interrupt_mode = false;
			if (i != g_main_lcore) {
				/* If a thread is present on a core other than g_main_lcore,
				 * it has to be busy. */
				busy_threads_present = true;
			}
		}
	}

	governor = spdk_governor_get();
	if (governor == NULL) {
		return;
	}

	/* Change the main core frequency if needed. */
	if (busy_threads_present) {
		rc = governor->set_core_freq_max(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("setting maximum frequency for core %u failed\n", g_main_lcore);
		}
	} else if (main_core->busy > main_core->idle) {
		rc = governor->core_freq_up(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
		}
	} else {
		rc = governor->core_freq_down(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
		}
	}
}
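/* Governor policy applied at the end of balance(): if any thread is left on a
 * core other than g_main_lcore, the main core is pinned at its maximum
 * frequency; otherwise the main core's frequency is stepped up when it is
 * more busy than idle and stepped down when it is not. Nothing is done when
 * no governor is loaded. */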

struct json_scheduler_opts {
	uint8_t load_limit;
	uint8_t core_limit;
	uint8_t core_busy;
};

static const struct spdk_json_object_decoder sched_decoders[] = {
	{"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true},
	{"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true},
	{"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true},
};

static int
set_opts(const struct spdk_json_val *opts)
{
	struct json_scheduler_opts scheduler_opts;

	scheduler_opts.load_limit = g_scheduler_load_limit;
	scheduler_opts.core_limit = g_scheduler_core_limit;
	scheduler_opts.core_busy = g_scheduler_core_busy;

	if (opts != NULL) {
		if (spdk_json_decode_object_relaxed(opts, sched_decoders,
						    SPDK_COUNTOF(sched_decoders), &scheduler_opts)) {
			SPDK_ERRLOG("Decoding scheduler opts JSON failed\n");
			return -1;
		}
	}

	SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit);
	g_scheduler_load_limit = scheduler_opts.load_limit;
	SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit);
	g_scheduler_core_limit = scheduler_opts.core_limit;
	SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy);
	g_scheduler_core_busy = scheduler_opts.core_busy;

	return 0;
}
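/* For illustration: set_opts() expects a JSON object matching the decoders
 * above, where each field is optional and omitted fields keep their current
 * values, e.g.
 *
 *   { "load_limit": 20, "core_limit": 80, "core_busy": 95 }
 *
 * How that object reaches set_opts() (typically via SPDK's
 * framework_set_scheduler RPC) is an assumption about the caller and is not
 * defined in this file. */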

static void
get_opts(struct spdk_json_write_ctx *ctx)
{
	spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit);
	spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit);
	spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy);
}

static struct spdk_scheduler scheduler_dynamic = {
	.name = "dynamic",
	.init = init,
	.deinit = deinit,
	.balance = balance,
	.set_opts = set_opts,
	.get_opts = get_opts,
};

SPDK_SCHEDULER_REGISTER(scheduler_dynamic);