LCOV - code coverage report
Current view: top level - module/scheduler/dynamic - scheduler_dynamic.c (source / functions) Hit Total Coverage
Test: ut_cov_unit.info Lines: 129 176 73.3 %
Date: 2024-07-12 08:45:11 Functions: 14 17 82.4 %

          Line data    Source code
       1             : /*   SPDX-License-Identifier: BSD-3-Clause
       2             :  *   Copyright (C) 2021 Intel Corporation.
       3             :  *   All rights reserved.
       4             :  */
       5             : 
       6             : #include "spdk/stdinc.h"
       7             : #include "spdk/likely.h"
       8             : #include "spdk/event.h"
       9             : #include "spdk/log.h"
      10             : #include "spdk/env.h"
      11             : 
      12             : #include "spdk/thread.h"
      13             : #include "spdk_internal/event.h"
      14             : #include "spdk/scheduler.h"
      15             : #include "spdk_internal/usdt.h"
      16             : 
/* Core id the scheduler consolidates idle threads onto (set in init()). */
static uint32_t g_main_lcore;

/* Per-core load estimate maintained during a single balance() pass. */
struct core_stats {
	uint64_t busy;		/* busy TSC accumulated on this core */
	uint64_t idle;		/* idle TSC accumulated on this core */
	uint32_t thread_count;	/* number of SPDK threads placed on this core */
};

/* Array indexed by core id; allocated in init(), freed in deinit(). */
static struct core_stats *g_cores;

/* Tunables (percentages), adjustable at runtime via set_opts(). */
uint8_t g_scheduler_load_limit = 20;	/* below this %, a thread counts as idle */
uint8_t g_scheduler_core_limit = 80;	/* core busy % above which it sheds threads */
uint8_t g_scheduler_core_busy = 95;	/* core busy % treated as saturated in _move_thread() */
      30             : 
      31             : static uint8_t
      32          52 : _busy_pct(uint64_t busy, uint64_t idle)
      33             : {
      34          52 :         if ((busy + idle) == 0) {
      35           0 :                 return 0;
      36             :         }
      37             : 
      38          52 :         return busy * 100 / (busy + idle);
      39             : }
      40             : 
      41             : static uint8_t
      42          30 : _get_thread_load(struct spdk_scheduler_thread_info *thread_info)
      43             : {
      44             :         uint64_t busy, idle;
      45             : 
      46          30 :         busy = thread_info->current_stats.busy_tsc;
      47          30 :         idle = thread_info->current_stats.idle_tsc;
      48             : 
      49             :         /* return percentage of time thread was busy */
      50          30 :         return _busy_pct(busy, idle);
      51             : }
      52             : 
      53             : typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);
      54             : 
      55             : static void
      56          12 : _foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
      57             : {
      58             :         struct spdk_scheduler_core_info *core;
      59             :         uint32_t i, j;
      60             : 
      61          42 :         SPDK_ENV_FOREACH_CORE(i) {
      62          30 :                 core = &cores_info[i];
      63          60 :                 for (j = 0; j < core->threads_count; j++) {
      64          30 :                         fn(&core->thread_infos[j]);
      65             :                 }
      66             :         }
      67          12 : }
      68             : 
      69             : static void
      70           1 : prepare_to_sleep(uint32_t core)
      71             : {
      72           1 :         struct spdk_governor *governor = spdk_governor_get();
      73             :         int rc;
      74             : 
      75           1 :         if (governor == NULL) {
      76           0 :                 return;
      77             :         }
      78             : 
      79           1 :         rc = governor->set_core_freq_min(core);
      80           1 :         if (rc < 0) {
      81           0 :                 SPDK_ERRLOG("could not set_core_freq_min(%d)\n", core);
      82             :         }
      83             : }
      84             : 
      85             : static void
      86           3 : prepare_to_wake(uint32_t core)
      87             : {
      88           3 :         struct spdk_governor *governor = spdk_governor_get();
      89             :         int rc;
      90             : 
      91           3 :         if (governor == NULL) {
      92           2 :                 return;
      93             :         }
      94             : 
      95           1 :         rc = governor->set_core_freq_max(core);
      96           1 :         if (rc < 0) {
      97           0 :                 SPDK_ERRLOG("could not set_core_freq_max(%d)\n", core);
      98             :         }
      99             : }
     100             : 
     101             : static void
     102          15 : _move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
     103             : {
     104          15 :         struct core_stats *dst = &g_cores[dst_core];
     105          15 :         struct core_stats *src = &g_cores[thread_info->lcore];
     106          15 :         uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
     107          15 :         uint8_t busy_pct = _busy_pct(src->busy, src->idle);
     108             :         uint64_t tsc;
     109             : 
     110             :         SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core);
     111             : 
     112          15 :         if (src == dst) {
     113             :                 /* Don't modify stats if thread is already on that core. */
     114           7 :                 return;
     115             :         }
     116             : 
     117           8 :         dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
     118           8 :         dst->idle -= spdk_min(dst->idle, busy_tsc);
     119           8 :         dst->thread_count++;
     120             : 
     121             :         /* Adjust busy/idle from core as if thread was not present on it.
     122             :          * Core load will reflect the sum of all remaining threads on it. */
     123           8 :         src->busy -= spdk_min(src->busy, busy_tsc);
     124           8 :         src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);
     125             : 
     126           8 :         if (busy_pct >= g_scheduler_core_busy &&
     127           2 :             _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) {
     128             :                 /* This core was so busy that we cannot assume all of busy_tsc
     129             :                  * consumed by the moved thread will now be idle_tsc - it's
     130             :                  * very possible the remaining threads will use these cycles
     131             :                  * as busy_tsc.
     132             :                  *
     133             :                  * So make sure we don't drop the updated estimate below
     134             :                  * g_scheduler_core_limit, so that other cores can't
     135             :                  * move threads to this core during this scheduling
     136             :                  * period.
     137             :                  */
     138           2 :                 tsc = src->busy + src->idle;
     139           2 :                 src->busy = tsc * g_scheduler_core_limit / 100;
     140           2 :                 src->idle = tsc - src->busy;
     141             :         }
     142           8 :         assert(src->thread_count > 0);
     143           8 :         src->thread_count--;
     144             : 
     145           8 :         thread_info->lcore = dst_core;
     146             : }
     147             : 
     148             : static bool
     149           5 : _is_core_at_limit(uint32_t core_id)
     150             : {
     151           5 :         struct core_stats *core = &g_cores[core_id];
     152             :         uint64_t busy, idle;
     153             : 
     154             :         /* Core with no or single thread cannot be over the limit. */
     155           5 :         if (core->thread_count <= 1) {
     156           0 :                 return false;
     157             :         }
     158             : 
     159           5 :         busy = core->busy;
     160           5 :         idle = core->idle;
     161             : 
     162             :         /* No work was done, exit before possible division by 0. */
     163           5 :         if (busy == 0) {
     164           0 :                 return false;
     165             :         }
     166             : 
     167             :         /* Work done was less than the limit */
     168           5 :         if (_busy_pct(busy, idle) < g_scheduler_core_limit) {
     169           1 :                 return false;
     170             :         }
     171             : 
     172           4 :         return true;
     173             : }
     174             : 
     175             : static bool
     176           5 : _can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
     177             : {
     178           5 :         struct core_stats *dst = &g_cores[dst_core];
     179             :         uint64_t new_busy_tsc, new_idle_tsc;
     180             : 
     181             :         /* Thread can always fit on the core it's currently on. */
     182           5 :         if (thread_info->lcore == dst_core) {
     183           2 :                 return true;
     184             :         }
     185             : 
     186             :         /* Reactors in interrupt mode do not update stats,
     187             :          * a thread can always fit into reactor in interrupt mode. */
     188           3 :         if (dst->busy + dst->idle == 0) {
     189           3 :                 return true;
     190             :         }
     191             : 
     192             :         /* Core has no threads. */
     193           0 :         if (dst->thread_count == 0) {
     194           0 :                 return true;
     195             :         }
     196             : 
     197             :         /* Core doesn't have enough idle_tsc to take this thread. */
     198           0 :         if (dst->idle < thread_info->current_stats.busy_tsc) {
     199           0 :                 return false;
     200             :         }
     201             : 
     202           0 :         new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
     203           0 :         new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;
     204             : 
     205             :         /* Core cannot fit this thread if it would put it over the
     206             :          * g_scheduler_core_limit. */
     207           0 :         return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit;
     208             : }
     209             : 
     210             : static uint32_t
     211           5 : _find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
     212             : {
     213             :         uint32_t i;
     214           5 :         uint32_t current_lcore = thread_info->lcore;
     215           5 :         uint32_t least_busy_lcore = thread_info->lcore;
     216             :         struct spdk_thread *thread;
     217             :         struct spdk_cpuset *cpumask;
     218           5 :         bool core_at_limit = _is_core_at_limit(current_lcore);
     219             : 
     220           5 :         thread = spdk_thread_get_by_id(thread_info->thread_id);
     221           5 :         if (thread == NULL) {
     222           0 :                 return current_lcore;
     223             :         }
     224           5 :         cpumask = spdk_thread_get_cpumask(thread);
     225             : 
     226             :         /* Find a core that can fit the thread. */
     227          14 :         SPDK_ENV_FOREACH_CORE(i) {
     228             :                 /* Ignore cores outside cpumask. */
     229          12 :                 if (!spdk_cpuset_get_cpu(cpumask, i)) {
     230           7 :                         continue;
     231             :                 }
     232             : 
     233             :                 /* Search for least busy core. */
     234           5 :                 if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
     235           3 :                         least_busy_lcore = i;
     236             :                 }
     237             : 
     238             :                 /* Skip cores that cannot fit the thread and current one. */
     239           5 :                 if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
     240           2 :                         continue;
     241             :                 }
     242           3 :                 if (i == g_main_lcore) {
     243             :                         /* First consider g_main_lcore, consolidate threads on main lcore if possible. */
     244           0 :                         return i;
     245           3 :                 } else if (i < current_lcore && current_lcore != g_main_lcore) {
     246             :                         /* Lower core id was found, move to consolidate threads on lowest core ids. */
     247           0 :                         return i;
     248           3 :                 } else if (core_at_limit) {
     249             :                         /* When core is over the limit, any core id is better than current one. */
     250           3 :                         return i;
     251             :                 }
     252             :         }
     253             : 
     254             :         /* For cores over the limit, place the thread on least busy core
     255             :          * to balance threads. */
     256           2 :         if (core_at_limit) {
     257           1 :                 return least_busy_lcore;
     258             :         }
     259             : 
     260             :         /* If no better core is found, remain on the same one. */
     261           1 :         return current_lcore;
     262             : }
     263             : 
     264             : static int
     265           1 : init(void)
     266             : {
     267           1 :         g_main_lcore = spdk_env_get_current_core();
     268             : 
     269           1 :         if (spdk_governor_set("dpdk_governor") != 0) {
     270           1 :                 SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
     271             :         }
     272             : 
     273           1 :         g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
     274           1 :         if (g_cores == NULL) {
     275           0 :                 SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
     276           0 :                 return -ENOMEM;
     277             :         }
     278             : 
     279           1 :         return 0;
     280             : }
     281             : 
     282             : static void
     283           0 : deinit(void)
     284             : {
     285           0 :         free(g_cores);
     286           0 :         g_cores = NULL;
     287           0 :         spdk_governor_set(NULL);
     288           0 : }
     289             : 
     290             : static void
     291          15 : _balance_idle(struct spdk_scheduler_thread_info *thread_info)
     292             : {
     293          15 :         if (_get_thread_load(thread_info) >= g_scheduler_load_limit) {
     294           5 :                 return;
     295             :         }
     296             :         /* This thread is idle, move it to the main core. */
     297          10 :         _move_thread(thread_info, g_main_lcore);
     298             : }
     299             : 
     300             : static void
     301          15 : _balance_active(struct spdk_scheduler_thread_info *thread_info)
     302             : {
     303             :         uint32_t target_lcore;
     304             : 
     305          15 :         if (_get_thread_load(thread_info) < g_scheduler_load_limit) {
     306          10 :                 return;
     307             :         }
     308             : 
     309             :         /* This thread is active. */
     310           5 :         target_lcore = _find_optimal_core(thread_info);
     311           5 :         _move_thread(thread_info, target_lcore);
     312             : }
     313             : 
     314             : static void
     315           6 : balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
     316             : {
     317             :         struct spdk_reactor *reactor;
     318             :         struct spdk_governor *governor;
     319             :         struct spdk_scheduler_core_info *core;
     320             :         struct core_stats *main_core;
     321             :         uint32_t i;
     322             :         int rc;
     323           6 :         bool busy_threads_present = false;
     324             : 
     325             :         SPDK_DTRACE_PROBE1(dynsched_balance, cores_count);
     326             : 
     327          21 :         SPDK_ENV_FOREACH_CORE(i) {
     328          15 :                 g_cores[i].thread_count = cores_info[i].threads_count;
     329          15 :                 g_cores[i].busy = cores_info[i].current_busy_tsc;
     330          15 :                 g_cores[i].idle = cores_info[i].current_idle_tsc;
     331             :                 SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]);
     332             :         }
     333           6 :         main_core = &g_cores[g_main_lcore];
     334             : 
     335             :         /* Distribute threads in two passes, to make sure updated core stats are considered on each pass.
     336             :          * 1) Move all idle threads to main core. */
     337           6 :         _foreach_thread(cores_info, _balance_idle);
     338             :         /* 2) Distribute active threads across all cores. */
     339           6 :         _foreach_thread(cores_info, _balance_active);
     340             : 
     341             :         /* Switch unused cores to interrupt mode and switch cores to polled mode
     342             :          * if they will be used after rebalancing */
     343          21 :         SPDK_ENV_FOREACH_CORE(i) {
     344          15 :                 reactor = spdk_reactor_get(i);
     345          15 :                 assert(reactor != NULL);
     346             : 
     347          15 :                 core = &cores_info[i];
     348             :                 /* We can switch mode only if reactor already does not have any threads */
     349          15 :                 if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
     350           1 :                         core->interrupt_mode = true;
     351           1 :                         prepare_to_sleep(i);
     352          14 :                 } else if (g_cores[i].thread_count != 0) {
     353           9 :                         core->interrupt_mode = false;
     354           9 :                         if (i != g_main_lcore) {
     355             :                                 /* If a thread is present on non g_main_lcore,
     356             :                                  * it has to be busy. */
     357           3 :                                 busy_threads_present = true;
     358           3 :                                 prepare_to_wake(i);
     359             :                         }
     360             :                 }
     361             :         }
     362             : 
     363           6 :         governor = spdk_governor_get();
     364           6 :         if (governor == NULL) {
     365           3 :                 return;
     366             :         }
     367             : 
     368             :         /* Change main core frequency if needed */
     369           3 :         if (busy_threads_present) {
     370           1 :                 rc = governor->set_core_freq_max(g_main_lcore);
     371           1 :                 if (rc < 0) {
     372           0 :                         SPDK_ERRLOG("setting default frequency for core %u failed\n", g_main_lcore);
     373             :                 }
     374           2 :         } else if (main_core->busy > main_core->idle) {
     375           1 :                 rc = governor->core_freq_up(g_main_lcore);
     376           1 :                 if (rc < 0) {
     377           0 :                         SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
     378             :                 }
     379             :         } else {
     380           1 :                 rc = governor->core_freq_down(g_main_lcore);
     381           1 :                 if (rc < 0) {
     382           0 :                         SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
     383             :                 }
     384             :         }
     385             : }
     386             : 
/* Staging area for RPC-supplied scheduler options (see set_opts()). */
struct json_scheduler_opts {
	uint8_t load_limit;	/* new value for g_scheduler_load_limit */
	uint8_t core_limit;	/* new value for g_scheduler_core_limit */
	uint8_t core_busy;	/* new value for g_scheduler_core_busy */
};
     392             : 
/* JSON decoders for set_opts(); all fields are optional (last arg true). */
static const struct spdk_json_object_decoder sched_decoders[] = {
	{"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true},
	{"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true},
	{"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true},
};
     398             : 
     399             : static int
     400           0 : set_opts(const struct spdk_json_val *opts)
     401             : {
     402           0 :         struct json_scheduler_opts scheduler_opts;
     403             : 
     404           0 :         scheduler_opts.load_limit = g_scheduler_load_limit;
     405           0 :         scheduler_opts.core_limit = g_scheduler_core_limit;
     406           0 :         scheduler_opts.core_busy = g_scheduler_core_busy;
     407             : 
     408           0 :         if (opts != NULL) {
     409           0 :                 if (spdk_json_decode_object_relaxed(opts, sched_decoders,
     410             :                                                     SPDK_COUNTOF(sched_decoders), &scheduler_opts)) {
     411           0 :                         SPDK_ERRLOG("Decoding scheduler opts JSON failed\n");
     412           0 :                         return -1;
     413             :                 }
     414             :         }
     415             : 
     416           0 :         SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit);
     417           0 :         g_scheduler_load_limit = scheduler_opts.load_limit;
     418           0 :         SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit);
     419           0 :         g_scheduler_core_limit = scheduler_opts.core_limit;
     420           0 :         SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy);
     421           0 :         g_scheduler_core_busy = scheduler_opts.core_busy;
     422             : 
     423           0 :         return 0;
     424             : }
     425             : 
     426             : static void
     427           0 : get_opts(struct spdk_json_write_ctx *ctx)
     428             : {
     429           0 :         spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit);
     430           0 :         spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit);
     431           0 :         spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy);
     432           0 : }
     433             : 
/* Ops table registering this file as the "dynamic" scheduler. */
static struct spdk_scheduler scheduler_dynamic = {
	.name = "dynamic",
	.init = init,
	.deinit = deinit,
	.balance = balance,
	.set_opts = set_opts,
	.get_opts = get_opts,
};

SPDK_SCHEDULER_REGISTER(scheduler_dynamic);

Generated by: LCOV version 1.15