Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2016 Intel Corporation.
3 : * All rights reserved.
4 : */
5 :
6 : #include "spdk/stdinc.h"
7 : #include "spdk/likely.h"
8 :
9 : #include "event_internal.h"
10 :
11 : #include "spdk_internal/event.h"
12 : #include "spdk_internal/usdt.h"
13 :
14 : #include "spdk/log.h"
15 : #include "spdk/thread.h"
16 : #include "spdk/env.h"
17 : #include "spdk/util.h"
18 : #include "spdk/scheduler.h"
19 : #include "spdk/string.h"
20 : #include "spdk/fd_group.h"
21 : #include "spdk/trace.h"
22 : #include "spdk_internal/trace_defs.h"
23 :
24 : #ifdef __linux__
25 : #include <sys/prctl.h>
26 : #include <sys/eventfd.h>
27 : #endif
28 :
29 : #ifdef __FreeBSD__
30 : #include <pthread_np.h>
31 : #endif
32 :
33 : #define SPDK_EVENT_BATCH_SIZE 8
34 :
35 : static struct spdk_reactor *g_reactors;
36 : static uint32_t g_reactor_count;
37 : static struct spdk_cpuset g_reactor_core_mask;
38 : static enum spdk_reactor_state g_reactor_state = SPDK_REACTOR_STATE_UNINITIALIZED;
39 :
40 : static bool g_framework_context_switch_monitor_enabled = true;
41 :
42 : static struct spdk_mempool *g_spdk_event_mempool = NULL;
43 :
44 : TAILQ_HEAD(, spdk_scheduler) g_scheduler_list
45 : = TAILQ_HEAD_INITIALIZER(g_scheduler_list);
46 :
47 : static struct spdk_scheduler *g_scheduler = NULL;
48 : static struct spdk_reactor *g_scheduling_reactor;
49 : bool g_scheduling_in_progress = false;
50 : static uint64_t g_scheduler_period_in_tsc = 0;
51 : static uint64_t g_scheduler_period_in_us;
52 : static uint32_t g_scheduler_core_number;
53 : static struct spdk_scheduler_core_info *g_core_infos = NULL;
54 : static struct spdk_cpuset g_scheduler_isolated_core_mask;
55 :
56 : TAILQ_HEAD(, spdk_governor) g_governor_list
57 : = TAILQ_HEAD_INITIALIZER(g_governor_list);
58 :
59 : static struct spdk_governor *g_governor = NULL;
60 :
61 : static int reactor_interrupt_init(struct spdk_reactor *reactor);
62 : static void reactor_interrupt_fini(struct spdk_reactor *reactor);
63 :
64 : static pthread_mutex_t g_stopping_reactors_mtx = PTHREAD_MUTEX_INITIALIZER;
65 : static bool g_stopping_reactors = false;
66 :
67 : static struct spdk_scheduler *
68 4 : _scheduler_find(const char *name)
69 : {
70 : struct spdk_scheduler *tmp;
71 :
72 7 : TAILQ_FOREACH(tmp, &g_scheduler_list, link) {
73 5 : if (strcmp(name, tmp->name) == 0) {
74 2 : return tmp;
75 : }
76 3 : }
77 :
78 2 : return NULL;
79 4 : }
80 :
81 : int
82 2 : spdk_scheduler_set(const char *name)
83 : {
84 : struct spdk_scheduler *scheduler;
85 2 : int rc = 0;
86 :
87 : /* NULL scheduler was specifically requested */
88 2 : if (name == NULL) {
89 0 : if (g_scheduler) {
90 0 : g_scheduler->deinit();
91 0 : }
92 0 : g_scheduler = NULL;
93 0 : return 0;
94 : }
95 :
96 2 : scheduler = _scheduler_find(name);
97 2 : if (scheduler == NULL) {
98 0 : SPDK_ERRLOG("Requested scheduler is missing\n");
99 0 : return -EINVAL;
100 : }
101 :
102 2 : if (g_scheduler == scheduler) {
103 1 : return 0;
104 : }
105 :
106 1 : if (g_scheduler) {
107 0 : g_scheduler->deinit();
108 0 : }
109 :
110 1 : rc = scheduler->init();
111 1 : if (rc == 0) {
112 1 : g_scheduler = scheduler;
113 1 : } else {
114 : /* Could not switch to the new scheduler, so keep the old
115 : * one. We need to check if it wasn't NULL, and ->init() it again.
116 : */
117 0 : if (g_scheduler) {
118 0 : SPDK_ERRLOG("Could not ->init() '%s' scheduler, reverting to '%s'\n",
119 : name, g_scheduler->name);
120 0 : g_scheduler->init();
121 0 : } else {
122 0 : SPDK_ERRLOG("Could not ->init() '%s' scheduler.\n", name);
123 : }
124 : }
125 :
126 1 : return rc;
127 2 : }
128 :
129 : struct spdk_scheduler *
130 3 : spdk_scheduler_get(void)
131 : {
132 3 : return g_scheduler;
133 : }
134 :
135 : uint64_t
136 0 : spdk_scheduler_get_period(void)
137 : {
138 0 : return g_scheduler_period_in_us;
139 : }
140 :
141 : void
142 0 : spdk_scheduler_set_period(uint64_t period)
143 : {
144 0 : g_scheduler_period_in_us = period;
145 0 : g_scheduler_period_in_tsc = period * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
146 0 : }
147 :
148 : void
149 2 : spdk_scheduler_register(struct spdk_scheduler *scheduler)
150 : {
151 2 : if (_scheduler_find(scheduler->name)) {
152 0 : SPDK_ERRLOG("scheduler named '%s' already registered.\n", scheduler->name);
153 0 : assert(false);
154 : return;
155 : }
156 :
157 2 : TAILQ_INSERT_TAIL(&g_scheduler_list, scheduler, link);
158 2 : }
159 :
160 : uint32_t
161 10 : spdk_scheduler_get_scheduling_lcore(void)
162 : {
163 10 : return g_scheduling_reactor->lcore;
164 : }
165 :
166 : bool
167 0 : spdk_scheduler_set_scheduling_lcore(uint32_t core)
168 : {
169 0 : struct spdk_reactor *reactor = spdk_reactor_get(core);
170 0 : if (reactor == NULL) {
171 0 : SPDK_ERRLOG("Failed to set scheduling reactor. Reactor(lcore:%d) does not exist", core);
172 0 : return false;
173 : }
174 :
175 0 : g_scheduling_reactor = reactor;
176 0 : return true;
177 0 : }
178 :
179 : bool
180 3 : scheduler_set_isolated_core_mask(struct spdk_cpuset isolated_core_mask)
181 : {
182 : struct spdk_cpuset tmp_mask;
183 :
184 3 : spdk_cpuset_copy(&tmp_mask, spdk_app_get_core_mask());
185 3 : spdk_cpuset_or(&tmp_mask, &isolated_core_mask);
186 3 : if (spdk_cpuset_equal(&tmp_mask, spdk_app_get_core_mask()) == false) {
187 2 : SPDK_ERRLOG("Isolated core mask is not included in app core mask.\n");
188 2 : return false;
189 : }
190 1 : spdk_cpuset_copy(&g_scheduler_isolated_core_mask, &isolated_core_mask);
191 1 : return true;
192 3 : }
193 :
194 : const char *
195 0 : scheduler_get_isolated_core_mask(void)
196 : {
197 0 : return spdk_cpuset_fmt(&g_scheduler_isolated_core_mask);
198 : }
199 :
200 : static bool
201 9 : scheduler_is_isolated_core(uint32_t core)
202 : {
203 9 : return spdk_cpuset_get_cpu(&g_scheduler_isolated_core_mask, core);
204 : }
205 :
206 : static void
207 29 : reactor_construct(struct spdk_reactor *reactor, uint32_t lcore)
208 : {
209 29 : reactor->lcore = lcore;
210 29 : reactor->flags.is_valid = true;
211 :
212 29 : TAILQ_INIT(&reactor->threads);
213 29 : reactor->thread_count = 0;
214 29 : spdk_cpuset_zero(&reactor->notify_cpuset);
215 :
216 29 : reactor->events = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_NUMA_ID_ANY);
217 29 : if (reactor->events == NULL) {
218 0 : SPDK_ERRLOG("Failed to allocate events ring\n");
219 0 : assert(false);
220 : }
221 :
222 : /* Always initialize interrupt facilities for reactor */
223 29 : if (reactor_interrupt_init(reactor) != 0) {
224 : /* Reactor interrupt facilities are necessary if setting app to interrupt mode. */
225 29 : if (spdk_interrupt_mode_is_enabled()) {
226 0 : SPDK_ERRLOG("Failed to prepare intr facilities\n");
227 0 : assert(false);
228 : }
229 29 : return;
230 : }
231 :
232 : /* If application runs with full interrupt ability,
233 : * all reactors are going to run in interrupt mode.
234 : */
235 0 : if (spdk_interrupt_mode_is_enabled()) {
236 : uint32_t i;
237 :
238 0 : SPDK_ENV_FOREACH_CORE(i) {
239 0 : spdk_cpuset_set_cpu(&reactor->notify_cpuset, i, true);
240 0 : }
241 0 : reactor->in_interrupt = true;
242 0 : }
243 29 : }
244 :
245 : struct spdk_reactor *
246 279 : spdk_reactor_get(uint32_t lcore)
247 : {
248 : struct spdk_reactor *reactor;
249 :
250 279 : if (g_reactors == NULL) {
251 0 : SPDK_WARNLOG("Called spdk_reactor_get() while the g_reactors array was NULL!\n");
252 0 : return NULL;
253 : }
254 :
255 279 : if (lcore >= g_reactor_count) {
256 0 : return NULL;
257 : }
258 :
259 279 : reactor = &g_reactors[lcore];
260 :
261 279 : if (reactor->flags.is_valid == false) {
262 0 : return NULL;
263 : }
264 :
265 279 : return reactor;
266 279 : }
267 :
268 : static int reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op);
269 : static bool reactor_thread_op_supported(enum spdk_thread_op op);
270 :
271 : /* Power of 2 minus 1 is optimal for memory consumption */
272 : #define EVENT_MSG_MEMPOOL_SHIFT 14 /* 2^14 = 16384 */
273 : #define EVENT_MSG_MEMPOOL_SIZE ((1 << EVENT_MSG_MEMPOOL_SHIFT) - 1)
274 :
275 : int
276 10 : spdk_reactors_init(size_t msg_mempool_size)
277 : {
278 : struct spdk_reactor *reactor;
279 : int rc;
280 : uint32_t i, current_core;
281 : char mempool_name[32];
282 :
283 10 : snprintf(mempool_name, sizeof(mempool_name), "evtpool_%d", getpid());
284 10 : g_spdk_event_mempool = spdk_mempool_create(mempool_name,
285 : EVENT_MSG_MEMPOOL_SIZE,
286 : sizeof(struct spdk_event),
287 : SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
288 : SPDK_ENV_NUMA_ID_ANY);
289 :
290 10 : if (g_spdk_event_mempool == NULL) {
291 0 : SPDK_ERRLOG("spdk_event_mempool creation failed\n");
292 0 : return -1;
293 : }
294 :
295 : /* struct spdk_reactor must be aligned on 64 byte boundary */
296 10 : g_reactor_count = spdk_env_get_last_core() + 1;
297 10 : rc = posix_memalign((void **)&g_reactors, 64,
298 10 : g_reactor_count * sizeof(struct spdk_reactor));
299 10 : if (rc != 0) {
300 0 : SPDK_ERRLOG("Could not allocate array size=%u for g_reactors\n",
301 : g_reactor_count);
302 0 : spdk_mempool_free(g_spdk_event_mempool);
303 0 : return -1;
304 : }
305 :
306 10 : g_core_infos = calloc(g_reactor_count, sizeof(*g_core_infos));
307 10 : if (g_core_infos == NULL) {
308 0 : SPDK_ERRLOG("Could not allocate memory for g_core_infos\n");
309 0 : spdk_mempool_free(g_spdk_event_mempool);
310 0 : free(g_reactors);
311 0 : return -ENOMEM;
312 : }
313 :
314 10 : memset(g_reactors, 0, (g_reactor_count) * sizeof(struct spdk_reactor));
315 :
316 10 : rc = spdk_thread_lib_init_ext(reactor_thread_op, reactor_thread_op_supported,
317 10 : sizeof(struct spdk_lw_thread), msg_mempool_size);
318 10 : if (rc != 0) {
319 0 : SPDK_ERRLOG("Initialize spdk thread lib failed\n");
320 0 : spdk_mempool_free(g_spdk_event_mempool);
321 0 : free(g_reactors);
322 0 : free(g_core_infos);
323 0 : return rc;
324 : }
325 :
326 38 : SPDK_ENV_FOREACH_CORE(i) {
327 28 : reactor_construct(&g_reactors[i], i);
328 28 : }
329 :
330 10 : current_core = spdk_env_get_current_core();
331 10 : reactor = spdk_reactor_get(current_core);
332 10 : assert(reactor != NULL);
333 10 : g_scheduling_reactor = reactor;
334 :
335 10 : g_reactor_state = SPDK_REACTOR_STATE_INITIALIZED;
336 :
337 10 : return 0;
338 10 : }
339 :
340 : void
341 10 : spdk_reactors_fini(void)
342 : {
343 : uint32_t i;
344 : struct spdk_reactor *reactor;
345 :
346 10 : if (g_reactor_state == SPDK_REACTOR_STATE_UNINITIALIZED) {
347 0 : return;
348 : }
349 :
350 10 : spdk_thread_lib_fini();
351 :
352 38 : SPDK_ENV_FOREACH_CORE(i) {
353 28 : reactor = spdk_reactor_get(i);
354 28 : assert(reactor != NULL);
355 28 : assert(reactor->thread_count == 0);
356 28 : if (reactor->events != NULL) {
357 28 : spdk_ring_free(reactor->events);
358 28 : }
359 :
360 28 : reactor_interrupt_fini(reactor);
361 :
362 28 : if (g_core_infos != NULL) {
363 28 : free(g_core_infos[i].thread_infos);
364 28 : }
365 28 : }
366 :
367 10 : spdk_mempool_free(g_spdk_event_mempool);
368 :
369 10 : free(g_reactors);
370 10 : g_reactors = NULL;
371 10 : free(g_core_infos);
372 10 : g_core_infos = NULL;
373 10 : }
374 :
375 : static void _reactor_set_interrupt_mode(void *arg1, void *arg2);
376 :
377 : static void
378 0 : _reactor_set_notify_cpuset(void *arg1, void *arg2)
379 : {
380 0 : struct spdk_reactor *target = arg1;
381 0 : struct spdk_reactor *reactor = spdk_reactor_get(spdk_env_get_current_core());
382 :
383 0 : assert(reactor != NULL);
384 0 : spdk_cpuset_set_cpu(&reactor->notify_cpuset, target->lcore, target->new_in_interrupt);
385 0 : }
386 :
387 : static void
388 10 : _event_call(uint32_t lcore, spdk_event_fn fn, void *arg1, void *arg2)
389 : {
390 : struct spdk_event *ev;
391 :
392 10 : ev = spdk_event_allocate(lcore, fn, arg1, arg2);
393 10 : assert(ev);
394 10 : spdk_event_call(ev);
395 10 : }
396 :
397 : static void
398 0 : _reactor_set_notify_cpuset_cpl(void *arg1, void *arg2)
399 : {
400 0 : struct spdk_reactor *target = arg1;
401 :
402 0 : if (target->new_in_interrupt == false) {
403 0 : target->set_interrupt_mode_in_progress = false;
404 0 : _event_call(spdk_scheduler_get_scheduling_lcore(), target->set_interrupt_mode_cb_fn,
405 0 : target->set_interrupt_mode_cb_arg, NULL);
406 0 : } else {
407 0 : _event_call(target->lcore, _reactor_set_interrupt_mode, target, NULL);
408 : }
409 0 : }
410 :
411 : static void
412 0 : _reactor_set_thread_interrupt_mode(void *ctx)
413 : {
414 0 : struct spdk_reactor *reactor = ctx;
415 :
416 0 : spdk_thread_set_interrupt_mode(reactor->in_interrupt);
417 0 : }
418 :
419 : static void
420 0 : _reactor_set_interrupt_mode(void *arg1, void *arg2)
421 : {
422 0 : struct spdk_reactor *target = arg1;
423 : struct spdk_thread *thread;
424 : struct spdk_fd_group *grp;
425 : struct spdk_lw_thread *lw_thread, *tmp;
426 :
427 0 : assert(target == spdk_reactor_get(spdk_env_get_current_core()));
428 0 : assert(target != NULL);
429 0 : assert(target->in_interrupt != target->new_in_interrupt);
430 0 : SPDK_DEBUGLOG(reactor, "Do reactor set on core %u from %s to state %s\n",
431 : target->lcore, target->in_interrupt ? "intr" : "poll", target->new_in_interrupt ? "intr" : "poll");
432 :
433 0 : target->in_interrupt = target->new_in_interrupt;
434 :
435 0 : if (spdk_interrupt_mode_is_enabled()) {
436 : /* Align spdk_thread with reactor to interrupt mode or poll mode */
437 0 : TAILQ_FOREACH_SAFE(lw_thread, &target->threads, link, tmp) {
438 0 : thread = spdk_thread_get_from_ctx(lw_thread);
439 0 : if (target->in_interrupt) {
440 0 : grp = spdk_thread_get_interrupt_fd_group(thread);
441 0 : spdk_fd_group_nest(target->fgrp, grp);
442 0 : } else {
443 0 : grp = spdk_thread_get_interrupt_fd_group(thread);
444 0 : spdk_fd_group_unnest(target->fgrp, grp);
445 : }
446 :
447 0 : spdk_thread_send_msg(thread, _reactor_set_thread_interrupt_mode, target);
448 0 : }
449 0 : }
450 :
451 0 : if (target->new_in_interrupt == false) {
452 : /* Reactor is no longer in interrupt mode. Refresh the tsc_last to accurately
453 : * track reactor stats. */
454 0 : target->tsc_last = spdk_get_ticks();
455 0 : spdk_for_each_reactor(_reactor_set_notify_cpuset, target, NULL, _reactor_set_notify_cpuset_cpl);
456 0 : } else {
457 0 : uint64_t notify = 1;
458 0 : int rc = 0;
459 :
460 : /* Always trigger spdk_event and resched event in case of race condition */
461 0 : rc = write(target->events_fd, ¬ify, sizeof(notify));
462 0 : if (rc < 0) {
463 0 : SPDK_ERRLOG("failed to notify event queue: %s.\n", spdk_strerror(errno));
464 0 : }
465 0 : rc = write(target->resched_fd, ¬ify, sizeof(notify));
466 0 : if (rc < 0) {
467 0 : SPDK_ERRLOG("failed to notify reschedule: %s.\n", spdk_strerror(errno));
468 0 : }
469 :
470 0 : target->set_interrupt_mode_in_progress = false;
471 0 : _event_call(spdk_scheduler_get_scheduling_lcore(), target->set_interrupt_mode_cb_fn,
472 0 : target->set_interrupt_mode_cb_arg, NULL);
473 : }
474 0 : }
475 :
476 : int
477 0 : spdk_reactor_set_interrupt_mode(uint32_t lcore, bool new_in_interrupt,
478 : spdk_reactor_set_interrupt_mode_cb cb_fn, void *cb_arg)
479 : {
480 : struct spdk_reactor *target;
481 :
482 0 : target = spdk_reactor_get(lcore);
483 0 : if (target == NULL) {
484 0 : return -EINVAL;
485 : }
486 :
487 : /* Eventfd has to be supported in order to use interrupt functionality. */
488 0 : if (target->fgrp == NULL) {
489 0 : return -ENOTSUP;
490 : }
491 :
492 0 : if (spdk_env_get_current_core() != g_scheduling_reactor->lcore) {
493 0 : SPDK_ERRLOG("It is only permitted within scheduling reactor.\n");
494 0 : return -EPERM;
495 : }
496 :
497 0 : if (target->in_interrupt == new_in_interrupt) {
498 0 : cb_fn(cb_arg, NULL);
499 0 : return 0;
500 : }
501 :
502 0 : if (target->set_interrupt_mode_in_progress) {
503 0 : SPDK_NOTICELOG("Reactor(%u) is already in progress to set interrupt mode\n", lcore);
504 0 : return -EBUSY;
505 : }
506 0 : target->set_interrupt_mode_in_progress = true;
507 :
508 0 : target->new_in_interrupt = new_in_interrupt;
509 0 : target->set_interrupt_mode_cb_fn = cb_fn;
510 0 : target->set_interrupt_mode_cb_arg = cb_arg;
511 :
512 0 : SPDK_DEBUGLOG(reactor, "Starting reactor event from %d to %d\n",
513 : spdk_env_get_current_core(), lcore);
514 :
515 0 : if (new_in_interrupt == false) {
516 : /* For potential race cases, when setting the reactor to poll mode,
517 : * first change the mode of the reactor and then clear the corresponding
518 : * bit of the notify_cpuset of each reactor.
519 : */
520 0 : _event_call(lcore, _reactor_set_interrupt_mode, target, NULL);
521 0 : } else {
522 : /* For race cases, when setting the reactor to interrupt mode, first set the
523 : * corresponding bit of the notify_cpuset of each reactor and then change the mode.
524 : */
525 0 : spdk_for_each_reactor(_reactor_set_notify_cpuset, target, NULL, _reactor_set_notify_cpuset_cpl);
526 : }
527 :
528 0 : return 0;
529 0 : }
530 :
531 : struct spdk_event *
532 34 : spdk_event_allocate(uint32_t lcore, spdk_event_fn fn, void *arg1, void *arg2)
533 : {
534 34 : struct spdk_event *event = NULL;
535 34 : struct spdk_reactor *reactor = spdk_reactor_get(lcore);
536 :
537 34 : if (!reactor) {
538 0 : assert(false);
539 : return NULL;
540 : }
541 :
542 34 : event = spdk_mempool_get(g_spdk_event_mempool);
543 34 : if (event == NULL) {
544 0 : assert(false);
545 : return NULL;
546 : }
547 :
548 34 : event->lcore = lcore;
549 34 : event->fn = fn;
550 34 : event->arg1 = arg1;
551 34 : event->arg2 = arg2;
552 :
553 34 : return event;
554 : }
555 :
556 : void
557 34 : spdk_event_call(struct spdk_event *event)
558 : {
559 : int rc;
560 : struct spdk_reactor *reactor;
561 34 : struct spdk_reactor *local_reactor = NULL;
562 34 : uint32_t current_core = spdk_env_get_current_core();
563 :
564 34 : reactor = spdk_reactor_get(event->lcore);
565 :
566 34 : assert(reactor != NULL);
567 34 : assert(reactor->events != NULL);
568 :
569 34 : rc = spdk_ring_enqueue(reactor->events, (void **)&event, 1, NULL);
570 34 : if (rc != 1) {
571 0 : assert(false);
572 : }
573 :
574 34 : if (current_core != SPDK_ENV_LCORE_ID_ANY) {
575 34 : local_reactor = spdk_reactor_get(current_core);
576 34 : }
577 :
578 : /* If spdk_event_call isn't called on a reactor, always send a notification.
579 : * If it is called on a reactor, send a notification if the destination reactor
580 : * is indicated in interrupt mode state.
581 : */
582 34 : if (spdk_unlikely(local_reactor == NULL) ||
583 34 : spdk_unlikely(spdk_cpuset_get_cpu(&local_reactor->notify_cpuset, event->lcore))) {
584 0 : uint64_t notify = 1;
585 :
586 0 : rc = write(reactor->events_fd, ¬ify, sizeof(notify));
587 0 : if (rc < 0) {
588 0 : SPDK_ERRLOG("failed to notify event queue: %s.\n", spdk_strerror(errno));
589 0 : }
590 0 : }
591 34 : }
592 :
593 : static inline int
594 88 : event_queue_run_batch(void *arg)
595 : {
596 88 : struct spdk_reactor *reactor = arg;
597 : size_t count, i;
598 : void *events[SPDK_EVENT_BATCH_SIZE];
599 :
600 : #ifdef DEBUG
601 : /*
602 : * spdk_ring_dequeue() fills events and returns how many entries it wrote,
603 : * so we will never actually read uninitialized data from events, but just to be sure
604 : * (and to silence a static analyzer false positive), initialize the array to NULL pointers.
605 : */
606 88 : memset(events, 0, sizeof(events));
607 : #endif
608 :
609 : /* Operate event notification if this reactor currently runs in interrupt state */
610 88 : if (spdk_unlikely(reactor->in_interrupt)) {
611 0 : uint64_t notify = 1;
612 : int rc;
613 :
614 : /* There may be race between event_acknowledge and another producer's event_notify,
615 : * so event_acknowledge should be applied ahead. And then check for self's event_notify.
616 : * This can avoid event notification missing.
617 : */
618 0 : rc = read(reactor->events_fd, ¬ify, sizeof(notify));
619 0 : if (rc < 0) {
620 0 : SPDK_ERRLOG("failed to acknowledge event queue: %s.\n", spdk_strerror(errno));
621 0 : return -errno;
622 : }
623 :
624 0 : count = spdk_ring_dequeue(reactor->events, events, SPDK_EVENT_BATCH_SIZE);
625 :
626 0 : if (spdk_ring_count(reactor->events) != 0) {
627 : /* Trigger new notification if there are still events in event-queue waiting for processing. */
628 0 : rc = write(reactor->events_fd, ¬ify, sizeof(notify));
629 0 : if (rc < 0) {
630 0 : SPDK_ERRLOG("failed to notify event queue: %s.\n", spdk_strerror(errno));
631 0 : return -errno;
632 : }
633 0 : }
634 0 : } else {
635 88 : count = spdk_ring_dequeue(reactor->events, events, SPDK_EVENT_BATCH_SIZE);
636 : }
637 :
638 88 : if (count == 0) {
639 56 : return 0;
640 : }
641 :
642 66 : for (i = 0; i < count; i++) {
643 34 : struct spdk_event *event = events[i];
644 :
645 34 : assert(event != NULL);
646 34 : assert(spdk_get_thread() == NULL);
647 : SPDK_DTRACE_PROBE3(event_exec, event->fn,
648 : event->arg1, event->arg2);
649 34 : event->fn(event->arg1, event->arg2);
650 34 : }
651 :
652 32 : spdk_mempool_put_bulk(g_spdk_event_mempool, events, count);
653 :
654 32 : return (int)count;
655 88 : }
656 :
657 : /* 1s */
658 : #define CONTEXT_SWITCH_MONITOR_PERIOD 1000000
659 :
660 : static int
661 6 : get_rusage(struct spdk_reactor *reactor)
662 : {
663 : struct rusage rusage;
664 :
665 6 : if (getrusage(RUSAGE_THREAD, &rusage) != 0) {
666 0 : return -1;
667 : }
668 :
669 6 : if (rusage.ru_nvcsw != reactor->rusage.ru_nvcsw || rusage.ru_nivcsw != reactor->rusage.ru_nivcsw) {
670 6 : SPDK_INFOLOG(reactor,
671 : "Reactor %d: %ld voluntary context switches and %ld involuntary context switches in the last second.\n",
672 : reactor->lcore, rusage.ru_nvcsw - reactor->rusage.ru_nvcsw,
673 : rusage.ru_nivcsw - reactor->rusage.ru_nivcsw);
674 6 : }
675 6 : reactor->rusage = rusage;
676 :
677 6 : return -1;
678 6 : }
679 :
680 : void
681 0 : spdk_framework_enable_context_switch_monitor(bool enable)
682 : {
683 : /* This global is being read by multiple threads, so this isn't
684 : * strictly thread safe. However, we're toggling between true and
685 : * false here, and if a thread sees the value update later than it
686 : * should, it's no big deal. */
687 0 : g_framework_context_switch_monitor_enabled = enable;
688 0 : }
689 :
690 : bool
691 0 : spdk_framework_context_switch_monitor_enabled(void)
692 : {
693 0 : return g_framework_context_switch_monitor_enabled;
694 : }
695 :
696 : static void
697 7 : _set_thread_name(const char *thread_name)
698 : {
699 : #if defined(__linux__)
700 : prctl(PR_SET_NAME, thread_name, 0, 0, 0);
701 : #elif defined(__FreeBSD__)
702 7 : pthread_set_name_np(pthread_self(), thread_name);
703 : #else
704 : pthread_setname_np(pthread_self(), thread_name);
705 : #endif
706 7 : }
707 :
708 : static void
709 9 : _init_thread_stats(struct spdk_reactor *reactor, struct spdk_lw_thread *lw_thread)
710 : {
711 9 : struct spdk_thread *thread = spdk_thread_get_from_ctx(lw_thread);
712 : struct spdk_thread_stats prev_total_stats;
713 :
714 : /* Read total_stats before updating it to calculate stats during the last scheduling period. */
715 9 : prev_total_stats = lw_thread->total_stats;
716 :
717 9 : spdk_set_thread(thread);
718 9 : spdk_thread_get_stats(&lw_thread->total_stats);
719 9 : spdk_set_thread(NULL);
720 :
721 9 : lw_thread->current_stats.busy_tsc = lw_thread->total_stats.busy_tsc - prev_total_stats.busy_tsc;
722 9 : lw_thread->current_stats.idle_tsc = lw_thread->total_stats.idle_tsc - prev_total_stats.idle_tsc;
723 9 : }
724 :
725 : static void
726 6 : _threads_reschedule_thread(struct spdk_scheduler_thread_info *thread_info)
727 : {
728 : struct spdk_lw_thread *lw_thread;
729 : struct spdk_thread *thread;
730 :
731 6 : thread = spdk_thread_get_by_id(thread_info->thread_id);
732 6 : if (thread == NULL) {
733 : /* Thread no longer exists. */
734 0 : return;
735 : }
736 6 : lw_thread = spdk_thread_get_ctx(thread);
737 6 : assert(lw_thread != NULL);
738 :
739 6 : lw_thread->lcore = thread_info->lcore;
740 6 : lw_thread->resched = true;
741 6 : }
742 :
743 : static void
744 3 : _threads_reschedule(struct spdk_scheduler_core_info *cores_info)
745 : {
746 : struct spdk_scheduler_core_info *core;
747 : struct spdk_scheduler_thread_info *thread_info;
748 : uint32_t i, j;
749 :
750 12 : SPDK_ENV_FOREACH_CORE(i) {
751 9 : core = &cores_info[i];
752 18 : for (j = 0; j < core->threads_count; j++) {
753 9 : thread_info = &core->thread_infos[j];
754 9 : if (thread_info->lcore != i) {
755 6 : if (core->isolated || cores_info[thread_info->lcore].isolated) {
756 0 : SPDK_ERRLOG("A thread cannot be moved from an isolated core or \
757 : moved to an isolated core. Skip rescheduling thread\n");
758 0 : continue;
759 : }
760 6 : _threads_reschedule_thread(thread_info);
761 6 : }
762 9 : }
763 9 : core->threads_count = 0;
764 9 : free(core->thread_infos);
765 9 : core->thread_infos = NULL;
766 9 : }
767 3 : }
768 :
769 : static void
770 3 : _reactors_scheduler_fini(void)
771 : {
772 : /* Reschedule based on the balancing output */
773 3 : _threads_reschedule(g_core_infos);
774 :
775 3 : g_scheduling_in_progress = false;
776 3 : }
777 :
778 : static void
779 3 : _reactors_scheduler_update_core_mode(void *ctx1, void *ctx2)
780 : {
781 : struct spdk_reactor *reactor;
782 : uint32_t i;
783 3 : int rc = 0;
784 :
785 12 : for (i = g_scheduler_core_number; i < SPDK_ENV_LCORE_ID_ANY; i = spdk_env_get_next_core(i)) {
786 9 : reactor = spdk_reactor_get(i);
787 9 : assert(reactor != NULL);
788 9 : if (reactor->in_interrupt != g_core_infos[i].interrupt_mode) {
789 : /* Switch next found reactor to new state */
790 0 : rc = spdk_reactor_set_interrupt_mode(i, g_core_infos[i].interrupt_mode,
791 : _reactors_scheduler_update_core_mode, NULL);
792 0 : if (rc == 0) {
793 : /* Set core to start with after callback completes */
794 0 : g_scheduler_core_number = spdk_env_get_next_core(i);
795 0 : return;
796 : }
797 0 : }
798 9 : }
799 3 : _reactors_scheduler_fini();
800 3 : }
801 :
802 : static void
803 0 : _reactors_scheduler_cancel(void *arg1, void *arg2)
804 : {
805 : struct spdk_scheduler_core_info *core;
806 : uint32_t i;
807 :
808 0 : SPDK_ENV_FOREACH_CORE(i) {
809 0 : core = &g_core_infos[i];
810 0 : core->threads_count = 0;
811 0 : free(core->thread_infos);
812 0 : core->thread_infos = NULL;
813 0 : }
814 :
815 0 : g_scheduling_in_progress = false;
816 0 : }
817 :
818 : static void
819 3 : _reactors_scheduler_balance(void *arg1, void *arg2)
820 : {
821 3 : struct spdk_scheduler *scheduler = spdk_scheduler_get();
822 :
823 3 : if (g_reactor_state != SPDK_REACTOR_STATE_RUNNING || scheduler == NULL) {
824 0 : _reactors_scheduler_cancel(NULL, NULL);
825 0 : return;
826 : }
827 :
828 3 : scheduler->balance(g_core_infos, g_reactor_count);
829 :
830 3 : g_scheduler_core_number = spdk_env_get_first_core();
831 3 : _reactors_scheduler_update_core_mode(NULL, NULL);
832 3 : }
833 :
834 : /* Phase 1 of thread scheduling is to gather metrics on the existing threads */
835 : static void
836 9 : _reactors_scheduler_gather_metrics(void *arg1, void *arg2)
837 : {
838 : struct spdk_scheduler_core_info *core_info;
839 : struct spdk_lw_thread *lw_thread;
840 : struct spdk_thread *thread;
841 : struct spdk_reactor *reactor;
842 : uint32_t next_core;
843 9 : uint32_t i = 0;
844 :
845 9 : reactor = spdk_reactor_get(spdk_env_get_current_core());
846 9 : assert(reactor != NULL);
847 9 : core_info = &g_core_infos[reactor->lcore];
848 9 : core_info->lcore = reactor->lcore;
849 9 : core_info->current_idle_tsc = reactor->idle_tsc - core_info->total_idle_tsc;
850 9 : core_info->total_idle_tsc = reactor->idle_tsc;
851 9 : core_info->current_busy_tsc = reactor->busy_tsc - core_info->total_busy_tsc;
852 9 : core_info->total_busy_tsc = reactor->busy_tsc;
853 9 : core_info->interrupt_mode = reactor->in_interrupt;
854 9 : core_info->threads_count = 0;
855 9 : core_info->isolated = scheduler_is_isolated_core(reactor->lcore);
856 :
857 9 : SPDK_DEBUGLOG(reactor, "Gathering metrics on %u\n", reactor->lcore);
858 :
859 9 : spdk_trace_record(TRACE_SCHEDULER_CORE_STATS, reactor->trace_id, 0, 0,
860 : core_info->current_busy_tsc,
861 : core_info->current_idle_tsc);
862 :
863 9 : if (reactor->thread_count > 0) {
864 7 : core_info->thread_infos = calloc(reactor->thread_count, sizeof(*core_info->thread_infos));
865 7 : if (core_info->thread_infos == NULL) {
866 0 : SPDK_ERRLOG("Failed to allocate memory when gathering metrics on %u\n", reactor->lcore);
867 :
868 : /* Cancel this round of schedule work */
869 0 : _event_call(spdk_scheduler_get_scheduling_lcore(), _reactors_scheduler_cancel, NULL, NULL);
870 0 : return;
871 : }
872 :
873 16 : TAILQ_FOREACH(lw_thread, &reactor->threads, link) {
874 9 : _init_thread_stats(reactor, lw_thread);
875 :
876 9 : core_info->thread_infos[i].lcore = lw_thread->lcore;
877 9 : thread = spdk_thread_get_from_ctx(lw_thread);
878 9 : assert(thread != NULL);
879 9 : core_info->thread_infos[i].thread_id = spdk_thread_get_id(thread);
880 9 : core_info->thread_infos[i].total_stats = lw_thread->total_stats;
881 9 : core_info->thread_infos[i].current_stats = lw_thread->current_stats;
882 9 : core_info->threads_count++;
883 9 : assert(core_info->threads_count <= reactor->thread_count);
884 :
885 9 : spdk_trace_record(TRACE_SCHEDULER_THREAD_STATS, spdk_thread_get_trace_id(thread), 0, 0,
886 : lw_thread->current_stats.busy_tsc,
887 : lw_thread->current_stats.idle_tsc);
888 :
889 9 : i++;
890 9 : }
891 7 : }
892 :
893 9 : next_core = spdk_env_get_next_core(reactor->lcore);
894 9 : if (next_core == UINT32_MAX) {
895 3 : next_core = spdk_env_get_first_core();
896 3 : }
897 :
898 : /* If we've looped back around to the scheduler thread, move to the next phase */
899 9 : if (next_core == spdk_scheduler_get_scheduling_lcore()) {
900 : /* Phase 2 of scheduling is rebalancing - deciding which threads to move where */
901 3 : _event_call(next_core, _reactors_scheduler_balance, NULL, NULL);
902 3 : return;
903 : }
904 :
905 6 : _event_call(next_core, _reactors_scheduler_gather_metrics, NULL, NULL);
906 9 : }
907 :
908 : static int _reactor_schedule_thread(struct spdk_thread *thread);
909 : static uint64_t g_rusage_period;
910 :
911 : static void
912 16 : _reactor_remove_lw_thread(struct spdk_reactor *reactor, struct spdk_lw_thread *lw_thread)
913 : {
914 16 : struct spdk_thread *thread = spdk_thread_get_from_ctx(lw_thread);
915 : struct spdk_fd_group *grp;
916 :
917 16 : TAILQ_REMOVE(&reactor->threads, lw_thread, link);
918 16 : assert(reactor->thread_count > 0);
919 16 : reactor->thread_count--;
920 :
921 : /* Operate thread intr if running with full interrupt ability */
922 16 : if (spdk_interrupt_mode_is_enabled()) {
923 0 : if (reactor->in_interrupt) {
924 0 : grp = spdk_thread_get_interrupt_fd_group(thread);
925 0 : spdk_fd_group_unnest(reactor->fgrp, grp);
926 0 : }
927 0 : }
928 16 : }
929 :
930 : static bool
931 43 : reactor_post_process_lw_thread(struct spdk_reactor *reactor, struct spdk_lw_thread *lw_thread)
932 : {
933 43 : struct spdk_thread *thread = spdk_thread_get_from_ctx(lw_thread);
934 :
935 43 : if (spdk_unlikely(spdk_thread_is_exited(thread) &&
936 : spdk_thread_is_idle(thread))) {
937 10 : _reactor_remove_lw_thread(reactor, lw_thread);
938 10 : spdk_thread_destroy(thread);
939 10 : return true;
940 : }
941 :
942 33 : if (spdk_unlikely(lw_thread->resched && !spdk_thread_is_bound(thread))) {
943 6 : lw_thread->resched = false;
944 6 : _reactor_remove_lw_thread(reactor, lw_thread);
945 6 : _reactor_schedule_thread(thread);
946 6 : return true;
947 : }
948 :
949 27 : return false;
950 43 : }
951 :
952 : static void
953 0 : reactor_interrupt_run(struct spdk_reactor *reactor)
954 : {
955 0 : int block_timeout = -1; /* _EPOLL_WAIT_FOREVER */
956 :
957 0 : spdk_fd_group_wait(reactor->fgrp, block_timeout);
958 0 : }
959 :
960 : static void
961 33 : _reactor_run(struct spdk_reactor *reactor)
962 : {
963 : struct spdk_thread *thread;
964 : struct spdk_lw_thread *lw_thread, *tmp;
965 : uint64_t now;
966 : int rc;
967 :
968 33 : event_queue_run_batch(reactor);
969 :
970 : /* If no threads are present on the reactor,
971 : * tsc_last gets outdated. Update it to track
972 : * thread execution time correctly. */
973 33 : if (spdk_unlikely(TAILQ_EMPTY(&reactor->threads))) {
974 2 : now = spdk_get_ticks();
975 2 : reactor->idle_tsc += now - reactor->tsc_last;
976 2 : reactor->tsc_last = now;
977 2 : return;
978 : }
979 :
980 74 : TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) {
981 43 : thread = spdk_thread_get_from_ctx(lw_thread);
982 43 : rc = spdk_thread_poll(thread, 0, reactor->tsc_last);
983 :
984 43 : now = spdk_thread_get_last_tsc(thread);
985 43 : if (rc == 0) {
986 37 : reactor->idle_tsc += now - reactor->tsc_last;
987 43 : } else if (rc > 0) {
988 6 : reactor->busy_tsc += now - reactor->tsc_last;
989 6 : }
990 43 : reactor->tsc_last = now;
991 :
992 43 : reactor_post_process_lw_thread(reactor, lw_thread);
993 43 : }
994 33 : }
995 :
996 : static int
997 7 : reactor_run(void *arg)
998 : {
999 7 : struct spdk_reactor *reactor = arg;
1000 : struct spdk_thread *thread;
1001 : struct spdk_lw_thread *lw_thread, *tmp;
1002 : char thread_name[32];
1003 7 : uint64_t last_sched = 0;
1004 :
1005 7 : SPDK_NOTICELOG("Reactor started on core %u\n", reactor->lcore);
1006 :
1007 : /* Rename the POSIX thread because the reactor is tied to the POSIX
1008 : * thread in the SPDK event library.
1009 : */
1010 7 : snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore);
1011 7 : _set_thread_name(thread_name);
1012 :
1013 7 : reactor->trace_id = spdk_trace_register_owner(OWNER_TYPE_REACTOR, thread_name);
1014 :
1015 7 : reactor->tsc_last = spdk_get_ticks();
1016 :
1017 7 : while (1) {
1018 : /* Execute interrupt process fn if this reactor currently runs in interrupt state */
1019 7 : if (spdk_unlikely(reactor->in_interrupt)) {
1020 0 : reactor_interrupt_run(reactor);
1021 0 : } else {
1022 7 : _reactor_run(reactor);
1023 : }
1024 :
1025 7 : if (g_framework_context_switch_monitor_enabled) {
1026 7 : if ((reactor->last_rusage + g_rusage_period) < reactor->tsc_last) {
1027 6 : get_rusage(reactor);
1028 6 : reactor->last_rusage = reactor->tsc_last;
1029 6 : }
1030 7 : }
1031 :
1032 7 : if (spdk_unlikely(g_scheduler_period_in_tsc > 0 &&
1033 : (reactor->tsc_last - last_sched) > g_scheduler_period_in_tsc &&
1034 : reactor == g_scheduling_reactor &&
1035 : !g_scheduling_in_progress)) {
1036 0 : last_sched = reactor->tsc_last;
1037 0 : g_scheduling_in_progress = true;
1038 0 : spdk_trace_record(TRACE_SCHEDULER_PERIOD_START, 0, 0, 0);
1039 0 : _reactors_scheduler_gather_metrics(NULL, NULL);
1040 0 : }
1041 :
1042 7 : if (g_reactor_state != SPDK_REACTOR_STATE_RUNNING) {
1043 7 : break;
1044 : }
1045 : }
1046 :
1047 7 : TAILQ_FOREACH(lw_thread, &reactor->threads, link) {
1048 0 : thread = spdk_thread_get_from_ctx(lw_thread);
1049 : /* All threads should have already had spdk_thread_exit() called on them, except
1050 : * for the app thread.
1051 : */
1052 0 : if (spdk_thread_is_running(thread)) {
1053 0 : if (!spdk_thread_is_app_thread(thread)) {
1054 0 : SPDK_ERRLOG("spdk_thread_exit() was not called on thread '%s'\n",
1055 : spdk_thread_get_name(thread));
1056 0 : SPDK_ERRLOG("This will result in a non-zero exit code in a future release.\n");
1057 0 : }
1058 0 : spdk_set_thread(thread);
1059 0 : spdk_thread_exit(thread);
1060 0 : }
1061 0 : }
1062 :
1063 7 : while (!TAILQ_EMPTY(&reactor->threads)) {
1064 0 : TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) {
1065 0 : thread = spdk_thread_get_from_ctx(lw_thread);
1066 0 : spdk_set_thread(thread);
1067 0 : if (spdk_thread_is_exited(thread)) {
1068 0 : _reactor_remove_lw_thread(reactor, lw_thread);
1069 0 : spdk_thread_destroy(thread);
1070 0 : } else {
1071 0 : if (spdk_unlikely(reactor->in_interrupt)) {
1072 0 : reactor_interrupt_run(reactor);
1073 0 : } else {
1074 0 : spdk_thread_poll(thread, 0, 0);
1075 : }
1076 : }
1077 0 : }
1078 : }
1079 :
1080 7 : return 0;
1081 : }
1082 :
/*
 * Parse the textual core mask 'mask' into 'cpumask' and then intersect the
 * result with the mask returned by spdk_app_get_core_mask().
 *
 * Returns 0 on success, or the negative value reported by spdk_cpuset_parse().
 */
int
spdk_app_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
{
	int rc = spdk_cpuset_parse(cpumask, mask);

	if (rc < 0) {
		return rc;
	}

	/* Drop every core that is not part of the application's core mask. */
	spdk_cpuset_and(cpumask, spdk_app_get_core_mask());

	return 0;
}
1099 :
1100 : const struct spdk_cpuset *
1101 6 : spdk_app_get_core_mask(void)
1102 : {
1103 6 : return &g_reactor_core_mask;
1104 : }
1105 :
1106 : void
1107 0 : spdk_reactors_start(void)
1108 : {
1109 : struct spdk_reactor *reactor;
1110 : uint32_t i, current_core;
1111 : int rc;
1112 :
1113 0 : g_rusage_period = (CONTEXT_SWITCH_MONITOR_PERIOD * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC;
1114 0 : g_reactor_state = SPDK_REACTOR_STATE_RUNNING;
1115 : /* Reinitialize to false, in case the app framework is restarting in the same process. */
1116 0 : g_stopping_reactors = false;
1117 :
1118 0 : current_core = spdk_env_get_current_core();
1119 0 : SPDK_ENV_FOREACH_CORE(i) {
1120 0 : if (i != current_core) {
1121 0 : reactor = spdk_reactor_get(i);
1122 0 : if (reactor == NULL) {
1123 0 : continue;
1124 : }
1125 :
1126 0 : rc = spdk_env_thread_launch_pinned(reactor->lcore, reactor_run, reactor);
1127 0 : if (rc < 0) {
1128 0 : SPDK_ERRLOG("Unable to start reactor thread on core %u\n", reactor->lcore);
1129 0 : assert(false);
1130 : return;
1131 : }
1132 0 : }
1133 0 : spdk_cpuset_set_cpu(&g_reactor_core_mask, i, true);
1134 0 : }
1135 :
1136 : /* Start the main reactor */
1137 0 : reactor = spdk_reactor_get(current_core);
1138 0 : assert(reactor != NULL);
1139 0 : reactor_run(reactor);
1140 :
1141 0 : spdk_env_thread_wait_all();
1142 :
1143 0 : g_reactor_state = SPDK_REACTOR_STATE_SHUTDOWN;
1144 0 : }
1145 :
1146 : static void
1147 0 : _reactors_stop(void *arg1, void *arg2)
1148 : {
1149 : uint32_t i;
1150 : int rc;
1151 : struct spdk_reactor *reactor;
1152 : struct spdk_reactor *local_reactor;
1153 0 : uint64_t notify = 1;
1154 :
1155 0 : g_reactor_state = SPDK_REACTOR_STATE_EXITING;
1156 0 : local_reactor = spdk_reactor_get(spdk_env_get_current_core());
1157 :
1158 0 : SPDK_ENV_FOREACH_CORE(i) {
1159 : /* If spdk_event_call isn't called on a reactor, always send a notification.
1160 : * If it is called on a reactor, send a notification if the destination reactor
1161 : * is indicated in interrupt mode state.
1162 : */
1163 0 : if (local_reactor == NULL || spdk_cpuset_get_cpu(&local_reactor->notify_cpuset, i)) {
1164 0 : reactor = spdk_reactor_get(i);
1165 0 : assert(reactor != NULL);
1166 0 : rc = write(reactor->events_fd, ¬ify, sizeof(notify));
1167 0 : if (rc < 0) {
1168 0 : SPDK_ERRLOG("failed to notify event queue for reactor(%u): %s.\n", i, spdk_strerror(errno));
1169 0 : continue;
1170 : }
1171 0 : }
1172 0 : }
1173 0 : }
1174 :
/*
 * Event function that intentionally does nothing. spdk_reactors_stop() uses it
 * as the per-reactor callback of spdk_for_each_reactor().
 */
static void
nop(void *arg1, void *arg2)
{
	(void)arg1;
	(void)arg2;
}
1179 :
1180 : void
1181 0 : spdk_reactors_stop(void *arg1)
1182 : {
1183 0 : spdk_for_each_reactor(nop, NULL, NULL, _reactors_stop);
1184 0 : }
1185 :
/* Next core to try when _reactor_schedule_thread() picks a core round-robin.
 * The UINT32_MAX start value compares >= g_reactor_count, so the first
 * selection resets it to the first core.
 */
static uint32_t g_next_core = UINT32_MAX;

/* Held by _reactor_schedule_thread() while it selects a core and updates
 * g_next_core.
 */
static pthread_mutex_t g_scheduler_mtx = PTHREAD_MUTEX_INITIALIZER;
1188 :
1189 : static void
1190 18 : _schedule_thread(void *arg1, void *arg2)
1191 : {
1192 18 : struct spdk_lw_thread *lw_thread = arg1;
1193 : struct spdk_thread *thread;
1194 : struct spdk_reactor *reactor;
1195 : uint32_t current_core;
1196 : struct spdk_fd_group *grp;
1197 :
1198 18 : current_core = spdk_env_get_current_core();
1199 18 : reactor = spdk_reactor_get(current_core);
1200 18 : assert(reactor != NULL);
1201 :
1202 : /* Update total_stats to reflect state of thread
1203 : * at the end of the move. */
1204 18 : thread = spdk_thread_get_from_ctx(lw_thread);
1205 18 : spdk_set_thread(thread);
1206 18 : spdk_thread_get_stats(&lw_thread->total_stats);
1207 18 : spdk_set_thread(NULL);
1208 :
1209 18 : if (lw_thread->initial_lcore == SPDK_ENV_LCORE_ID_ANY) {
1210 12 : lw_thread->initial_lcore = current_core;
1211 12 : }
1212 18 : lw_thread->lcore = current_core;
1213 :
1214 18 : TAILQ_INSERT_TAIL(&reactor->threads, lw_thread, link);
1215 18 : reactor->thread_count++;
1216 :
1217 : /* Operate thread intr if running with full interrupt ability */
1218 18 : if (spdk_interrupt_mode_is_enabled()) {
1219 : int rc;
1220 :
1221 0 : if (reactor->in_interrupt) {
1222 0 : grp = spdk_thread_get_interrupt_fd_group(thread);
1223 0 : rc = spdk_fd_group_nest(reactor->fgrp, grp);
1224 0 : if (rc < 0) {
1225 0 : SPDK_ERRLOG("Failed to schedule spdk_thread: %s.\n", spdk_strerror(-rc));
1226 0 : }
1227 0 : }
1228 :
1229 : /* Align spdk_thread with reactor to interrupt mode or poll mode */
1230 0 : spdk_thread_send_msg(thread, _reactor_set_thread_interrupt_mode, reactor);
1231 0 : }
1232 18 : }
1233 :
1234 : static int
1235 18 : _reactor_schedule_thread(struct spdk_thread *thread)
1236 : {
1237 : uint32_t core, initial_core;
1238 : struct spdk_lw_thread *lw_thread;
1239 18 : struct spdk_event *evt = NULL;
1240 : struct spdk_cpuset *cpumask;
1241 : uint32_t i;
1242 18 : struct spdk_reactor *local_reactor = NULL;
1243 18 : uint32_t current_lcore = spdk_env_get_current_core();
1244 : struct spdk_cpuset polling_cpumask;
1245 : struct spdk_cpuset valid_cpumask;
1246 :
1247 18 : cpumask = spdk_thread_get_cpumask(thread);
1248 :
1249 18 : lw_thread = spdk_thread_get_ctx(thread);
1250 18 : assert(lw_thread != NULL);
1251 18 : core = lw_thread->lcore;
1252 18 : initial_core = lw_thread->initial_lcore;
1253 18 : memset(lw_thread, 0, sizeof(*lw_thread));
1254 18 : lw_thread->initial_lcore = initial_core;
1255 :
1256 18 : if (current_lcore != SPDK_ENV_LCORE_ID_ANY) {
1257 18 : local_reactor = spdk_reactor_get(current_lcore);
1258 18 : assert(local_reactor);
1259 18 : }
1260 :
1261 : /* When interrupt ability of spdk_thread is not enabled and the current
1262 : * reactor runs on DPDK thread, skip reactors which are in interrupt mode.
1263 : */
1264 18 : if (!spdk_interrupt_mode_is_enabled() && local_reactor != NULL) {
1265 : /* Get the cpumask of all reactors in polling */
1266 18 : spdk_cpuset_zero(&polling_cpumask);
1267 66 : SPDK_ENV_FOREACH_CORE(i) {
1268 48 : spdk_cpuset_set_cpu(&polling_cpumask, i, true);
1269 48 : }
1270 18 : spdk_cpuset_xor(&polling_cpumask, &local_reactor->notify_cpuset);
1271 :
1272 18 : if (core == SPDK_ENV_LCORE_ID_ANY) {
1273 : /* Get the cpumask of all valid reactors which are suggested and also in polling */
1274 13 : spdk_cpuset_copy(&valid_cpumask, &polling_cpumask);
1275 13 : spdk_cpuset_and(&valid_cpumask, spdk_thread_get_cpumask(thread));
1276 :
1277 : /* If there are any valid reactors, spdk_thread should be scheduled
1278 : * into one of the valid reactors.
1279 : * If there is no valid reactors, spdk_thread should be scheduled
1280 : * into one of the polling reactors.
1281 : */
1282 13 : if (spdk_cpuset_count(&valid_cpumask) != 0) {
1283 13 : cpumask = &valid_cpumask;
1284 13 : } else {
1285 0 : cpumask = &polling_cpumask;
1286 : }
1287 18 : } else if (!spdk_cpuset_get_cpu(&polling_cpumask, core)) {
1288 : /* If specified reactor is not in polling, spdk_thread should be scheduled
1289 : * into one of the polling reactors.
1290 : */
1291 0 : core = SPDK_ENV_LCORE_ID_ANY;
1292 0 : cpumask = &polling_cpumask;
1293 0 : }
1294 18 : }
1295 :
1296 18 : pthread_mutex_lock(&g_scheduler_mtx);
1297 18 : if (core == SPDK_ENV_LCORE_ID_ANY) {
1298 18 : for (i = 0; i < spdk_env_get_core_count(); i++) {
1299 18 : if (g_next_core >= g_reactor_count) {
1300 5 : g_next_core = spdk_env_get_first_core();
1301 5 : }
1302 18 : core = g_next_core;
1303 18 : g_next_core = spdk_env_get_next_core(g_next_core);
1304 :
1305 18 : if (spdk_cpuset_get_cpu(cpumask, core)) {
1306 13 : break;
1307 : }
1308 5 : }
1309 13 : }
1310 :
1311 18 : evt = spdk_event_allocate(core, _schedule_thread, lw_thread, NULL);
1312 :
1313 18 : if (current_lcore != core) {
1314 7 : spdk_trace_record(TRACE_SCHEDULER_MOVE_THREAD, spdk_thread_get_trace_id(thread), 0, 0,
1315 : current_lcore, core);
1316 7 : }
1317 :
1318 18 : pthread_mutex_unlock(&g_scheduler_mtx);
1319 :
1320 18 : assert(evt != NULL);
1321 18 : if (evt == NULL) {
1322 0 : SPDK_ERRLOG("Unable to schedule thread on requested core mask.\n");
1323 0 : return -1;
1324 : }
1325 :
1326 18 : lw_thread->tsc_start = spdk_get_ticks();
1327 :
1328 18 : spdk_event_call(evt);
1329 :
1330 18 : return 0;
1331 18 : }
1332 :
1333 : static void
1334 2 : _reactor_request_thread_reschedule(struct spdk_thread *thread)
1335 : {
1336 : struct spdk_lw_thread *lw_thread;
1337 : struct spdk_reactor *reactor;
1338 : uint32_t current_core;
1339 :
1340 2 : assert(thread == spdk_get_thread());
1341 :
1342 2 : lw_thread = spdk_thread_get_ctx(thread);
1343 :
1344 2 : assert(lw_thread != NULL);
1345 2 : lw_thread->resched = true;
1346 2 : lw_thread->lcore = SPDK_ENV_LCORE_ID_ANY;
1347 :
1348 2 : current_core = spdk_env_get_current_core();
1349 2 : reactor = spdk_reactor_get(current_core);
1350 2 : assert(reactor != NULL);
1351 :
1352 : /* Send a notification if the destination reactor is indicated in intr mode state */
1353 2 : if (spdk_unlikely(spdk_cpuset_get_cpu(&reactor->notify_cpuset, reactor->lcore))) {
1354 0 : uint64_t notify = 1;
1355 :
1356 0 : if (write(reactor->resched_fd, ¬ify, sizeof(notify)) < 0) {
1357 0 : SPDK_ERRLOG("failed to notify reschedule: %s.\n", spdk_strerror(errno));
1358 0 : }
1359 0 : }
1360 2 : }
1361 :
1362 : static int
1363 14 : reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op)
1364 : {
1365 : struct spdk_lw_thread *lw_thread;
1366 :
1367 14 : switch (op) {
1368 : case SPDK_THREAD_OP_NEW:
1369 12 : lw_thread = spdk_thread_get_ctx(thread);
1370 12 : lw_thread->lcore = SPDK_ENV_LCORE_ID_ANY;
1371 12 : lw_thread->initial_lcore = SPDK_ENV_LCORE_ID_ANY;
1372 12 : return _reactor_schedule_thread(thread);
1373 : case SPDK_THREAD_OP_RESCHED:
1374 2 : _reactor_request_thread_reschedule(thread);
1375 2 : return 0;
1376 : default:
1377 0 : return -ENOTSUP;
1378 : }
1379 14 : }
1380 :
1381 : static bool
1382 14 : reactor_thread_op_supported(enum spdk_thread_op op)
1383 : {
1384 14 : switch (op) {
1385 : case SPDK_THREAD_OP_NEW:
1386 : case SPDK_THREAD_OP_RESCHED:
1387 14 : return true;
1388 : default:
1389 0 : return false;
1390 : }
1391 14 : }
1392 :
1393 : struct call_reactor {
1394 : uint32_t cur_core;
1395 : spdk_event_fn fn;
1396 : void *arg1;
1397 : void *arg2;
1398 :
1399 : uint32_t orig_core;
1400 : spdk_event_fn cpl;
1401 : };
1402 :
1403 : static void
1404 5 : on_reactor(void *arg1, void *arg2)
1405 : {
1406 5 : struct call_reactor *cr = arg1;
1407 : struct spdk_event *evt;
1408 :
1409 5 : cr->fn(cr->arg1, cr->arg2);
1410 :
1411 5 : cr->cur_core = spdk_env_get_next_core(cr->cur_core);
1412 :
1413 5 : if (cr->cur_core >= g_reactor_count) {
1414 1 : SPDK_DEBUGLOG(reactor, "Completed reactor iteration\n");
1415 :
1416 1 : evt = spdk_event_allocate(cr->orig_core, cr->cpl, cr->arg1, cr->arg2);
1417 1 : free(cr);
1418 1 : } else {
1419 4 : SPDK_DEBUGLOG(reactor, "Continuing reactor iteration to %d\n",
1420 : cr->cur_core);
1421 :
1422 4 : evt = spdk_event_allocate(cr->cur_core, on_reactor, arg1, NULL);
1423 : }
1424 5 : assert(evt != NULL);
1425 5 : spdk_event_call(evt);
1426 5 : }
1427 :
1428 : void
1429 1 : spdk_for_each_reactor(spdk_event_fn fn, void *arg1, void *arg2, spdk_event_fn cpl)
1430 : {
1431 : struct call_reactor *cr;
1432 :
1433 : /* When the application framework is shutting down, we will send one
1434 : * final for_each_reactor operation with completion callback _reactors_stop,
1435 : * to flush any existing for_each_reactor operations to avoid any memory
1436 : * leaks. We use a mutex here to protect a boolean flag that will ensure
1437 : * we don't start any more operations once we've started shutting down.
1438 : */
1439 1 : pthread_mutex_lock(&g_stopping_reactors_mtx);
1440 1 : if (g_stopping_reactors) {
1441 0 : pthread_mutex_unlock(&g_stopping_reactors_mtx);
1442 0 : return;
1443 1 : } else if (cpl == _reactors_stop) {
1444 0 : g_stopping_reactors = true;
1445 0 : }
1446 1 : pthread_mutex_unlock(&g_stopping_reactors_mtx);
1447 :
1448 1 : cr = calloc(1, sizeof(*cr));
1449 1 : if (!cr) {
1450 0 : SPDK_ERRLOG("Unable to perform reactor iteration\n");
1451 0 : cpl(arg1, arg2);
1452 0 : return;
1453 : }
1454 :
1455 1 : cr->fn = fn;
1456 1 : cr->arg1 = arg1;
1457 1 : cr->arg2 = arg2;
1458 1 : cr->cpl = cpl;
1459 1 : cr->orig_core = spdk_env_get_current_core();
1460 1 : cr->cur_core = spdk_env_get_first_core();
1461 :
1462 1 : SPDK_DEBUGLOG(reactor, "Starting reactor iteration from %d\n", cr->orig_core);
1463 :
1464 1 : _event_call(cr->cur_core, on_reactor, cr, NULL);
1465 1 : }
1466 :
1467 : #ifdef __linux__
1468 : static int
1469 : reactor_schedule_thread_event(void *arg)
1470 : {
1471 : struct spdk_reactor *reactor = arg;
1472 : struct spdk_lw_thread *lw_thread, *tmp;
1473 : uint32_t count = 0;
1474 : uint64_t notify = 1;
1475 :
1476 : assert(reactor->in_interrupt);
1477 :
1478 : if (read(reactor->resched_fd, ¬ify, sizeof(notify)) < 0) {
1479 : SPDK_ERRLOG("failed to acknowledge reschedule: %s.\n", spdk_strerror(errno));
1480 : return -errno;
1481 : }
1482 :
1483 : TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) {
1484 : count += reactor_post_process_lw_thread(reactor, lw_thread) ? 1 : 0;
1485 : }
1486 :
1487 : return count;
1488 : }
1489 :
1490 : static int
1491 : reactor_interrupt_init(struct spdk_reactor *reactor)
1492 : {
1493 : int rc;
1494 :
1495 : rc = spdk_fd_group_create(&reactor->fgrp);
1496 : if (rc != 0) {
1497 : return rc;
1498 : }
1499 :
1500 : reactor->resched_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1501 : if (reactor->resched_fd < 0) {
1502 : rc = -EBADF;
1503 : goto err;
1504 : }
1505 :
1506 : rc = SPDK_FD_GROUP_ADD(reactor->fgrp, reactor->resched_fd, reactor_schedule_thread_event,
1507 : reactor);
1508 : if (rc) {
1509 : close(reactor->resched_fd);
1510 : goto err;
1511 : }
1512 :
1513 : reactor->events_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1514 : if (reactor->events_fd < 0) {
1515 : spdk_fd_group_remove(reactor->fgrp, reactor->resched_fd);
1516 : close(reactor->resched_fd);
1517 :
1518 : rc = -EBADF;
1519 : goto err;
1520 : }
1521 :
1522 : rc = SPDK_FD_GROUP_ADD(reactor->fgrp, reactor->events_fd,
1523 : event_queue_run_batch, reactor);
1524 : if (rc) {
1525 : spdk_fd_group_remove(reactor->fgrp, reactor->resched_fd);
1526 : close(reactor->resched_fd);
1527 : close(reactor->events_fd);
1528 : goto err;
1529 : }
1530 :
1531 : return 0;
1532 :
1533 : err:
1534 : spdk_fd_group_destroy(reactor->fgrp);
1535 : reactor->fgrp = NULL;
1536 : return rc;
1537 : }
1538 : #else
/*
 * Variant compiled when __linux__ is not defined: no interrupt resources are
 * created and -ENOTSUP is always returned.
 */
static int
reactor_interrupt_init(struct spdk_reactor *reactor)
{
	(void)reactor;

	return -ENOTSUP;
}
1544 : #endif
1545 :
1546 : static void
1547 29 : reactor_interrupt_fini(struct spdk_reactor *reactor)
1548 : {
1549 29 : struct spdk_fd_group *fgrp = reactor->fgrp;
1550 :
1551 29 : if (!fgrp) {
1552 29 : return;
1553 : }
1554 :
1555 0 : spdk_fd_group_remove(fgrp, reactor->events_fd);
1556 0 : spdk_fd_group_remove(fgrp, reactor->resched_fd);
1557 :
1558 0 : close(reactor->events_fd);
1559 0 : close(reactor->resched_fd);
1560 :
1561 0 : spdk_fd_group_destroy(fgrp);
1562 0 : reactor->fgrp = NULL;
1563 29 : }
1564 :
1565 : static struct spdk_governor *
1566 1 : _governor_find(const char *name)
1567 : {
1568 : struct spdk_governor *governor, *tmp;
1569 :
1570 1 : TAILQ_FOREACH_SAFE(governor, &g_governor_list, link, tmp) {
1571 0 : if (strcmp(name, governor->name) == 0) {
1572 0 : return governor;
1573 : }
1574 0 : }
1575 :
1576 1 : return NULL;
1577 1 : }
1578 :
1579 : int
1580 1 : spdk_governor_set(const char *name)
1581 : {
1582 : struct spdk_governor *governor;
1583 1 : int rc = 0;
1584 :
1585 : /* NULL governor was specifically requested */
1586 1 : if (name == NULL) {
1587 0 : if (g_governor) {
1588 0 : g_governor->deinit();
1589 0 : }
1590 0 : g_governor = NULL;
1591 0 : return 0;
1592 : }
1593 :
1594 1 : governor = _governor_find(name);
1595 1 : if (governor == NULL) {
1596 1 : return -EINVAL;
1597 : }
1598 :
1599 0 : if (g_governor == governor) {
1600 0 : return 0;
1601 : }
1602 :
1603 0 : rc = governor->init();
1604 0 : if (rc == 0) {
1605 0 : if (g_governor) {
1606 0 : g_governor->deinit();
1607 0 : }
1608 0 : g_governor = governor;
1609 0 : }
1610 :
1611 0 : return rc;
1612 1 : }
1613 :
1614 : struct spdk_governor *
1615 5 : spdk_governor_get(void)
1616 : {
1617 5 : return g_governor;
1618 : }
1619 :
1620 : void
1621 0 : spdk_governor_register(struct spdk_governor *governor)
1622 : {
1623 0 : if (_governor_find(governor->name)) {
1624 0 : SPDK_ERRLOG("governor named '%s' already registered.\n", governor->name);
1625 0 : assert(false);
1626 : return;
1627 : }
1628 :
1629 0 : TAILQ_INSERT_TAIL(&g_governor_list, governor, link);
1630 0 : }
1631 :
1632 1 : SPDK_LOG_REGISTER_COMPONENT(reactor)
1633 :
1634 : static void
1635 0 : scheduler_trace(void)
1636 : {
1637 0 : struct spdk_trace_tpoint_opts opts[] = {
1638 : {
1639 : "SCHEDULER_PERIOD_START", TRACE_SCHEDULER_PERIOD_START,
1640 : OWNER_TYPE_NONE, OBJECT_NONE, 0,
1641 : {
1642 :
1643 : }
1644 : },
1645 : {
1646 : "SCHEDULER_CORE_STATS", TRACE_SCHEDULER_CORE_STATS,
1647 : OWNER_TYPE_REACTOR, OBJECT_NONE, 0,
1648 : {
1649 : { "busy", SPDK_TRACE_ARG_TYPE_INT, 8},
1650 : { "idle", SPDK_TRACE_ARG_TYPE_INT, 8}
1651 : }
1652 : },
1653 : {
1654 : "SCHEDULER_THREAD_STATS", TRACE_SCHEDULER_THREAD_STATS,
1655 : OWNER_TYPE_THREAD, OBJECT_NONE, 0,
1656 : {
1657 : { "busy", SPDK_TRACE_ARG_TYPE_INT, 8},
1658 : { "idle", SPDK_TRACE_ARG_TYPE_INT, 8}
1659 : }
1660 : },
1661 : {
1662 : "SCHEDULER_MOVE_THREAD", TRACE_SCHEDULER_MOVE_THREAD,
1663 : OWNER_TYPE_THREAD, OBJECT_NONE, 0,
1664 : {
1665 : { "src", SPDK_TRACE_ARG_TYPE_INT, 8 },
1666 : { "dst", SPDK_TRACE_ARG_TYPE_INT, 8 }
1667 : }
1668 : }
1669 : };
1670 :
1671 0 : spdk_trace_register_owner_type(OWNER_TYPE_REACTOR, 'r');
1672 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
1673 :
1674 0 : }
1675 :
1676 1 : SPDK_TRACE_REGISTER_FN(scheduler_trace, "scheduler", TRACE_GROUP_SCHEDULER)
|