LCOV - code coverage report
Current view: top level - module/scheduler/dynamic - scheduler_dynamic.c (source / functions) Hit Total Coverage
Test: ut_cov_unit.info Lines: 132 181 72.9 %
Date: 2024-12-15 17:29:35 Functions: 14 17 82.4 %

          Line data    Source code
       1             : /*   SPDX-License-Identifier: BSD-3-Clause
       2             :  *   Copyright (C) 2021 Intel Corporation.
       3             :  *   All rights reserved.
       4             :  */
       5             : 
       6             : #include "spdk/stdinc.h"
       7             : #include "spdk/likely.h"
       8             : #include "spdk/event.h"
       9             : #include "spdk/log.h"
      10             : #include "spdk/env.h"
      11             : 
      12             : #include "spdk/thread.h"
      13             : #include "spdk_internal/event.h"
      14             : #include "spdk/scheduler.h"
      15             : #include "spdk_internal/usdt.h"
      16             : 
      17             : static uint32_t g_main_lcore;
      18             : 
      19             : struct core_stats {
      20             :         uint64_t busy;
      21             :         uint64_t idle;
      22             :         uint32_t thread_count;
      23             :         bool isolated;
      24             : };
      25             : 
      26             : static struct core_stats *g_cores;
      27             : 
      28             : uint8_t g_scheduler_load_limit = 20;
      29             : uint8_t g_scheduler_core_limit = 80;
      30             : uint8_t g_scheduler_core_busy = 95;
      31             : 
      32             : static uint8_t
      33          52 : _busy_pct(uint64_t busy, uint64_t idle)
      34             : {
      35          52 :         if ((busy + idle) == 0) {
      36           0 :                 return 0;
      37             :         }
      38             : 
      39          52 :         return busy * 100 / (busy + idle);
      40             : }
      41             : 
      42             : static uint8_t
      43          30 : _get_thread_load(struct spdk_scheduler_thread_info *thread_info)
      44             : {
      45             :         uint64_t busy, idle;
      46             : 
      47          30 :         busy = thread_info->current_stats.busy_tsc;
      48          30 :         idle = thread_info->current_stats.idle_tsc;
      49             : 
      50             :         /* return percentage of time thread was busy */
      51          30 :         return _busy_pct(busy, idle);
      52             : }
      53             : 
      54             : typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);
      55             : 
      56             : static void
      57          12 : _foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
      58             : {
      59             :         struct spdk_scheduler_core_info *core;
      60             :         uint32_t i, j;
      61             : 
      62          42 :         SPDK_ENV_FOREACH_CORE(i) {
      63          30 :                 core = &cores_info[i];
      64             :                 /* Skip cores that are isolated */
      65          30 :                 if (core->isolated) {
      66           0 :                         continue;
      67             :                 }
      68          60 :                 for (j = 0; j < core->threads_count; j++) {
      69          30 :                         fn(&core->thread_infos[j]);
      70             :                 }
      71             :         }
      72          12 : }
      73             : 
      74             : static void
      75           1 : prepare_to_sleep(uint32_t core)
      76             : {
      77           1 :         struct spdk_governor *governor = spdk_governor_get();
      78             :         int rc;
      79             : 
      80           1 :         if (governor == NULL) {
      81           0 :                 return;
      82             :         }
      83             : 
      84           1 :         rc = governor->set_core_freq_min(core);
      85           1 :         if (rc < 0) {
      86           0 :                 SPDK_ERRLOG("could not set_core_freq_min(%d)\n", core);
      87             :         }
      88             : }
      89             : 
      90             : static void
      91           3 : prepare_to_wake(uint32_t core)
      92             : {
      93           3 :         struct spdk_governor *governor = spdk_governor_get();
      94             :         int rc;
      95             : 
      96           3 :         if (governor == NULL) {
      97           2 :                 return;
      98             :         }
      99             : 
     100           1 :         rc = governor->set_core_freq_max(core);
     101           1 :         if (rc < 0) {
     102           0 :                 SPDK_ERRLOG("could not set_core_freq_max(%d)\n", core);
     103             :         }
     104             : }
     105             : 
     106             : static void
     107          15 : _move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
     108             : {
     109          15 :         struct core_stats *dst = &g_cores[dst_core];
     110          15 :         struct core_stats *src = &g_cores[thread_info->lcore];
     111          15 :         uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
     112          15 :         uint8_t busy_pct = _busy_pct(src->busy, src->idle);
     113             :         uint64_t tsc;
     114             : 
     115             :         SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core);
     116             : 
     117          15 :         if (src == dst) {
     118             :                 /* Don't modify stats if thread is already on that core. */
     119           7 :                 return;
     120             :         }
     121             : 
     122           8 :         dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
     123           8 :         dst->idle -= spdk_min(dst->idle, busy_tsc);
     124           8 :         dst->thread_count++;
     125             : 
     126             :         /* Adjust busy/idle from core as if thread was not present on it.
     127             :          * Core load will reflect the sum of all remaining threads on it. */
     128           8 :         src->busy -= spdk_min(src->busy, busy_tsc);
     129           8 :         src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);
     130             : 
     131           8 :         if (busy_pct >= g_scheduler_core_busy &&
     132           2 :             _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) {
     133             :                 /* This core was so busy that we cannot assume all of busy_tsc
     134             :                  * consumed by the moved thread will now be idle_tsc - it's
     135             :                  * very possible the remaining threads will use these cycles
     136             :                  * as busy_tsc.
     137             :                  *
     138             :                  * So make sure we don't drop the updated estimate below
     139             :                  * g_scheduler_core_limit, so that other cores can't
     140             :                  * move threads to this core during this scheduling
     141             :                  * period.
     142             :                  */
     143           2 :                 tsc = src->busy + src->idle;
     144           2 :                 src->busy = tsc * g_scheduler_core_limit / 100;
     145           2 :                 src->idle = tsc - src->busy;
     146             :         }
     147           8 :         assert(src->thread_count > 0);
     148           8 :         src->thread_count--;
     149             : 
     150           8 :         thread_info->lcore = dst_core;
     151             : }
     152             : 
     153             : static bool
     154           5 : _is_core_at_limit(uint32_t core_id)
     155             : {
     156           5 :         struct core_stats *core = &g_cores[core_id];
     157             :         uint64_t busy, idle;
     158             : 
     159             :         /* Core with no or single thread cannot be over the limit. */
     160           5 :         if (core->thread_count <= 1) {
     161           0 :                 return false;
     162             :         }
     163             : 
     164           5 :         busy = core->busy;
     165           5 :         idle = core->idle;
     166             : 
     167             :         /* No work was done, exit before possible division by 0. */
     168           5 :         if (busy == 0) {
     169           0 :                 return false;
     170             :         }
     171             : 
     172             :         /* Work done was less than the limit */
     173           5 :         if (_busy_pct(busy, idle) < g_scheduler_core_limit) {
     174           1 :                 return false;
     175             :         }
     176             : 
     177           4 :         return true;
     178             : }
     179             : 
     180             : static bool
     181           5 : _can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
     182             : {
     183           5 :         struct core_stats *dst = &g_cores[dst_core];
     184             :         uint64_t new_busy_tsc, new_idle_tsc;
     185             : 
     186             :         /* Thread can always fit on the core it's currently on. */
     187           5 :         if (thread_info->lcore == dst_core) {
     188           2 :                 return true;
     189             :         }
     190             : 
     191             :         /* Reactors in interrupt mode do not update stats,
     192             :          * a thread can always fit into reactor in interrupt mode. */
     193           3 :         if (dst->busy + dst->idle == 0) {
     194           3 :                 return true;
     195             :         }
     196             : 
     197             :         /* Core has no threads. */
     198           0 :         if (dst->thread_count == 0) {
     199           0 :                 return true;
     200             :         }
     201             : 
     202             :         /* Core doesn't have enough idle_tsc to take this thread. */
     203           0 :         if (dst->idle < thread_info->current_stats.busy_tsc) {
     204           0 :                 return false;
     205             :         }
     206             : 
     207           0 :         new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
     208           0 :         new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;
     209             : 
     210             :         /* Core cannot fit this thread if it would put it over the
     211             :          * g_scheduler_core_limit. */
     212           0 :         return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit;
     213             : }
     214             : 
     215             : static uint32_t
     216           5 : _find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
     217             : {
     218             :         uint32_t i;
     219           5 :         uint32_t current_lcore = thread_info->lcore;
     220           5 :         uint32_t least_busy_lcore = thread_info->lcore;
     221             :         struct spdk_thread *thread;
     222             :         struct spdk_cpuset *cpumask;
     223           5 :         bool core_at_limit = _is_core_at_limit(current_lcore);
     224             : 
     225           5 :         thread = spdk_thread_get_by_id(thread_info->thread_id);
     226           5 :         if (thread == NULL) {
     227           0 :                 return current_lcore;
     228             :         }
     229           5 :         cpumask = spdk_thread_get_cpumask(thread);
     230             : 
     231             :         /* Find a core that can fit the thread. */
     232          14 :         SPDK_ENV_FOREACH_CORE(i) {
     233             :                 /* Ignore cores outside cpumask. */
     234          12 :                 if (!spdk_cpuset_get_cpu(cpumask, i)) {
     235           7 :                         continue;
     236             :                 }
     237             : 
     238             :                 /* Skip cores that are isolated */
     239           5 :                 if (g_cores[i].isolated) {
     240           0 :                         continue;
     241             :                 }
     242             : 
     243             :                 /* Search for least busy core. */
     244           5 :                 if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
     245           3 :                         least_busy_lcore = i;
     246             :                 }
     247             : 
     248             :                 /* Skip cores that cannot fit the thread and current one. */
     249           5 :                 if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
     250           2 :                         continue;
     251             :                 }
     252           3 :                 if (i == g_main_lcore) {
     253             :                         /* First consider g_main_lcore, consolidate threads on main lcore if possible. */
     254           0 :                         return i;
     255           3 :                 } else if (i < current_lcore && current_lcore != g_main_lcore) {
     256             :                         /* Lower core id was found, move to consolidate threads on lowest core ids. */
     257           0 :                         return i;
     258           3 :                 } else if (core_at_limit) {
     259             :                         /* When core is over the limit, any core id is better than current one. */
     260           3 :                         return i;
     261             :                 }
     262             :         }
     263             : 
     264             :         /* For cores over the limit, place the thread on least busy core
     265             :          * to balance threads. */
     266           2 :         if (core_at_limit) {
     267           1 :                 return least_busy_lcore;
     268             :         }
     269             : 
     270             :         /* If no better core is found, remain on the same one. */
     271           1 :         return current_lcore;
     272             : }
     273             : 
     274             : static int
     275           1 : init(void)
     276             : {
     277           1 :         g_main_lcore = spdk_scheduler_get_scheduling_lcore();
     278             : 
     279           1 :         if (spdk_governor_set("dpdk_governor") != 0) {
     280           1 :                 SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
     281             :         }
     282             : 
     283           1 :         g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
     284           1 :         if (g_cores == NULL) {
     285           0 :                 SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
     286           0 :                 return -ENOMEM;
     287             :         }
     288             : 
     289           1 :         return 0;
     290             : }
     291             : 
     292             : static void
     293           0 : deinit(void)
     294             : {
     295           0 :         free(g_cores);
     296           0 :         g_cores = NULL;
     297           0 :         spdk_governor_set(NULL);
     298           0 : }
     299             : 
     300             : static void
     301          15 : _balance_idle(struct spdk_scheduler_thread_info *thread_info)
     302             : {
     303          15 :         if (_get_thread_load(thread_info) >= g_scheduler_load_limit) {
     304           5 :                 return;
     305             :         }
     306             :         /* This thread is idle, move it to the main core. */
     307          10 :         _move_thread(thread_info, g_main_lcore);
     308             : }
     309             : 
     310             : static void
     311          15 : _balance_active(struct spdk_scheduler_thread_info *thread_info)
     312             : {
     313             :         uint32_t target_lcore;
     314             : 
     315          15 :         if (_get_thread_load(thread_info) < g_scheduler_load_limit) {
     316          10 :                 return;
     317             :         }
     318             : 
     319             :         /* This thread is active. */
     320           5 :         target_lcore = _find_optimal_core(thread_info);
     321           5 :         _move_thread(thread_info, target_lcore);
     322             : }
     323             : 
     324             : static void
     325           6 : balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
     326             : {
     327             :         struct spdk_reactor *reactor;
     328             :         struct spdk_governor *governor;
     329             :         struct spdk_scheduler_core_info *core;
     330             :         struct core_stats *main_core;
     331             :         uint32_t i;
     332             :         int rc;
     333           6 :         bool busy_threads_present = false;
     334             : 
     335             :         SPDK_DTRACE_PROBE1(dynsched_balance, cores_count);
     336             : 
     337          21 :         SPDK_ENV_FOREACH_CORE(i) {
     338          15 :                 g_cores[i].thread_count = cores_info[i].threads_count;
     339          15 :                 g_cores[i].busy = cores_info[i].current_busy_tsc;
     340          15 :                 g_cores[i].idle = cores_info[i].current_idle_tsc;
     341          15 :                 g_cores[i].isolated = cores_info[i].isolated;
     342             :                 SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]);
     343             :         }
     344           6 :         main_core = &g_cores[g_main_lcore];
     345             : 
     346             :         /* Distribute threads in two passes, to make sure updated core stats are considered on each pass.
     347             :          * 1) Move all idle threads to main core. */
     348           6 :         _foreach_thread(cores_info, _balance_idle);
     349             :         /* 2) Distribute active threads across all cores. */
     350           6 :         _foreach_thread(cores_info, _balance_active);
     351             : 
     352             :         /* Switch unused cores to interrupt mode and switch cores to polled mode
     353             :          * if they will be used after rebalancing */
     354          21 :         SPDK_ENV_FOREACH_CORE(i) {
     355          15 :                 reactor = spdk_reactor_get(i);
     356          15 :                 assert(reactor != NULL);
     357             : 
     358          15 :                 core = &cores_info[i];
     359             :                 /* We can switch mode only if reactor already does not have any threads */
     360          15 :                 if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
     361           1 :                         core->interrupt_mode = true;
     362           1 :                         prepare_to_sleep(i);
     363          14 :                 } else if (g_cores[i].thread_count != 0) {
     364           9 :                         core->interrupt_mode = false;
     365           9 :                         if (i != g_main_lcore) {
     366             :                                 /* If a thread is present on non g_main_lcore,
     367             :                                  * it has to be busy. */
     368           3 :                                 busy_threads_present = true;
     369           3 :                                 prepare_to_wake(i);
     370             :                         }
     371             :                 }
     372             :         }
     373             : 
     374           6 :         governor = spdk_governor_get();
     375           6 :         if (governor == NULL) {
     376           3 :                 return;
     377             :         }
     378             : 
     379             :         /* Change main core frequency if needed */
     380           3 :         if (busy_threads_present) {
     381           1 :                 rc = governor->set_core_freq_max(g_main_lcore);
     382           1 :                 if (rc < 0) {
     383           0 :                         SPDK_ERRLOG("setting default frequency for core %u failed\n", g_main_lcore);
     384             :                 }
     385           2 :         } else if (main_core->busy > main_core->idle) {
     386           1 :                 rc = governor->core_freq_up(g_main_lcore);
     387           1 :                 if (rc < 0) {
     388           0 :                         SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
     389             :                 }
     390             :         } else {
     391           1 :                 rc = governor->core_freq_down(g_main_lcore);
     392           1 :                 if (rc < 0) {
     393           0 :                         SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
     394             :                 }
     395             :         }
     396             : }
     397             : 
     398             : struct json_scheduler_opts {
     399             :         uint8_t load_limit;
     400             :         uint8_t core_limit;
     401             :         uint8_t core_busy;
     402             : };
     403             : 
     404             : static const struct spdk_json_object_decoder sched_decoders[] = {
     405             :         {"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true},
     406             :         {"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true},
     407             :         {"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true},
     408             : };
     409             : 
     410             : static int
     411           0 : set_opts(const struct spdk_json_val *opts)
     412             : {
     413           0 :         struct json_scheduler_opts scheduler_opts;
     414             : 
     415           0 :         scheduler_opts.load_limit = g_scheduler_load_limit;
     416           0 :         scheduler_opts.core_limit = g_scheduler_core_limit;
     417           0 :         scheduler_opts.core_busy = g_scheduler_core_busy;
     418             : 
     419           0 :         if (opts != NULL) {
     420           0 :                 if (spdk_json_decode_object_relaxed(opts, sched_decoders,
     421             :                                                     SPDK_COUNTOF(sched_decoders), &scheduler_opts)) {
     422           0 :                         SPDK_ERRLOG("Decoding scheduler opts JSON failed\n");
     423           0 :                         return -1;
     424             :                 }
     425             :         }
     426             : 
     427           0 :         SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit);
     428           0 :         g_scheduler_load_limit = scheduler_opts.load_limit;
     429           0 :         SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit);
     430           0 :         g_scheduler_core_limit = scheduler_opts.core_limit;
     431           0 :         SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy);
     432           0 :         g_scheduler_core_busy = scheduler_opts.core_busy;
     433             : 
     434           0 :         return 0;
     435             : }
     436             : 
     437             : static void
     438           0 : get_opts(struct spdk_json_write_ctx *ctx)
     439             : {
     440           0 :         spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit);
     441           0 :         spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit);
     442           0 :         spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy);
     443           0 : }
     444             : 
     445             : static struct spdk_scheduler scheduler_dynamic = {
     446             :         .name = "dynamic",
     447             :         .init = init,
     448             :         .deinit = deinit,
     449             :         .balance = balance,
     450             :         .set_opts = set_opts,
     451             :         .get_opts = get_opts,
     452             : };
     453             : 
     454           1 : SPDK_SCHEDULER_REGISTER(scheduler_dynamic);

Generated by: LCOV version 1.15