Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2016 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : */
6 :
7 : #include "spdk/stdinc.h"
8 :
9 : #include "spdk/bdev.h"
10 :
11 : #include "spdk/accel.h"
12 : #include "spdk/config.h"
13 : #include "spdk/env.h"
14 : #include "spdk/thread.h"
15 : #include "spdk/likely.h"
16 : #include "spdk/queue.h"
17 : #include "spdk/nvme_spec.h"
18 : #include "spdk/scsi_spec.h"
19 : #include "spdk/notify.h"
20 : #include "spdk/util.h"
21 : #include "spdk/trace.h"
22 : #include "spdk/dma.h"
23 :
24 : #include "spdk/bdev_module.h"
25 : #include "spdk/log.h"
26 : #include "spdk/string.h"
27 :
28 : #include "bdev_internal.h"
29 : #include "spdk_internal/trace_defs.h"
30 : #include "spdk_internal/assert.h"
31 :
32 : #ifdef SPDK_CONFIG_VTUNE
33 : #include "ittnotify.h"
34 : #include "ittnotify_types.h"
35 : int __itt_init_ittlib(const char *, __itt_group_id);
36 : #endif
37 :
/* Default values used to initialize the global bdev options (g_bdev_opts). */
38 : #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
39 : #define SPDK_BDEV_IO_CACHE_SIZE 256
40 : #define SPDK_BDEV_AUTO_EXAMINE true
41 : #define BUF_SMALL_CACHE_SIZE 128
42 : #define BUF_LARGE_CACHE_SIZE 16
/* Subtracted from io_outstanding when computing the nomem retry threshold. */
43 : #define NOMEM_THRESHOLD_COUNT 8
44 :
/* QoS tuning constants. NOTE(review): their users are outside this excerpt. */
45 : #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
46 : #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
47 : #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
48 : #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
49 : #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
50 : #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024))
51 : #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
52 : #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000
53 :
54 : /* The maximum number of child requests that a single UNMAP or WRITE ZEROES
55 : * command is split into at one time.
56 : */
57 : #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
58 : #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
59 :
60 : /* The maximum number of child requests that a single COPY command
61 : * is split into at one time.
62 : */
63 : #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
64 :
/*
 * Wrappers around log_already_claimed() that supply the call site's line
 * number and function name. The DEBUG variant expands to an empty statement
 * unless DEBUG is defined.
 */
65 : #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
66 : log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
67 : #ifdef DEBUG
68 : #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
69 : log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
70 : #else
71 : #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
72 : #endif
73 :
/* Defined later in this file; declared here so the macros above can use it. */
74 : static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
75 : const char *detail, struct spdk_bdev *bdev);
76 :
/*
 * String names of the QoS rate limit types.
 * NOTE(review): presumably indexed by the rate limit type enum and therefore
 * expected to hold SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES entries — confirm.
 */
77 : static const char *qos_rpc_type[] = {"rw_ios_per_sec",
78 : "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
79 : };
80 :
/* Tail-queue head type for a list of spdk_bdev. */
81 : TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
82 :
/* Red-black tree head type for a tree of spdk_bdev_name nodes. */
83 : RB_HEAD(bdev_name_tree, spdk_bdev_name);
84 :
85 : static int
86 572 : bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
87 : {
88 572 : return strcmp(name1->name, name2->name);
89 : }
90 :
/* Generate the static red-black tree functions for bdev_name_tree, ordered by bdev_name_cmp(). */
91 1786 : RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
92 :
/* Global state of the bdev layer; the single instance is g_bdev_mgr. */
93 : struct spdk_bdev_mgr {
/* Mempool from which bdev_io objects are taken. */
94 : struct spdk_mempool *bdev_io_pool;
95 :
96 : void *zero_buffer;
97 :
/* All registered bdev modules. */
98 : TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
99 :
/* All bdevs, as a list and as a tree keyed by name. */
100 : struct spdk_bdev_list bdevs;
101 : struct bdev_name_tree bdev_names;
102 :
103 : bool init_complete;
104 : bool module_init_complete;
105 :
/* Held by spdk_bdev_get_by_name() around the name tree lookup. */
106 : struct spdk_spinlock spinlock;
107 :
108 : TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;
109 :
110 : #ifdef SPDK_CONFIG_VTUNE
111 : __itt_domain *domain;
112 : #endif
113 : };
114 :
/*
 * The bdev manager singleton. Lists and the name tree start out empty; the
 * spinlock is not initialized here but in the _bdev_init() constructor.
 */
115 : static struct spdk_bdev_mgr g_bdev_mgr = {
116 : .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
117 : .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
118 : .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
119 : .init_complete = false,
120 : .module_init_complete = false,
121 : .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
122 : };
123 :
124 : static void
125 : __attribute__((constructor))
126 3 : _bdev_init(void)
127 : {
128 3 : spdk_spin_init(&g_bdev_mgr.spinlock);
129 3 : }
130 :
/* Completion callback for LBA range lock/unlock requests (see bdev_lock_lba_range()). */
131 : typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);
132 :
/* Completion callback taking a context pointer and a result code. */
133 : typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
134 :
/*
 * A range on a bdev described by offset and length, plus ownership data.
 * NOTE(review): offset/length are presumably expressed in blocks — confirm
 * against the lock/unlock callers.
 */
135 : struct lba_range {
136 : struct spdk_bdev *bdev;
137 : uint64_t offset;
138 : uint64_t length;
139 : bool quiesce;
140 : void *locked_ctx;
141 : struct spdk_thread *owner_thread;
142 : struct spdk_bdev_channel *owner_ch;
/* Two independent linkages, so one range can sit on two lists at once. */
143 : TAILQ_ENTRY(lba_range) tailq;
144 : TAILQ_ENTRY(lba_range) tailq_module;
145 : };
146 :
/*
 * Current global bdev options, seeded with the compile-time defaults.
 * Read by spdk_bdev_get_opts() and updated by spdk_bdev_set_opts().
 */
147 : static struct spdk_bdev_opts g_bdev_opts = {
148 : .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
149 : .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
150 : .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
151 : .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
152 : .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
153 : };
154 :
/* Callback and argument saved for bdev layer initialization. */
155 : static spdk_bdev_init_cb g_init_cb_fn = NULL;
156 : static void *g_init_cb_arg = NULL;
157 :
/* Callback, argument and thread saved for bdev layer shutdown. */
158 : static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
159 : static void *g_fini_cb_arg = NULL;
160 : static struct spdk_thread *g_fini_thread = NULL;
161 :
/*
 * State of a single QoS rate limit: the configured per-second limit, the
 * per-timeslice budget, and the callbacks that consume and restore that budget.
 */
162 : struct spdk_bdev_qos_limit {
163 : /** IOs or bytes allowed per second (i.e., 1s). */
164 : uint64_t limit;
165 :
166 : /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
167 : * For remaining bytes, allowed to run negative if an I/O is submitted when
168 : * some bytes are remaining, but the I/O is bigger than that amount. The
169 : * excess will be deducted from the next timeslice.
170 : */
171 : int64_t remaining_this_timeslice;
172 :
173 : /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
174 : uint32_t min_per_timeslice;
175 :
176 : /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
177 : uint32_t max_per_timeslice;
178 :
179 : /** Function to check whether to queue the IO.
180 : * If the IO is allowed to pass, the quota will be reduced correspondingly.
181 : */
182 : bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
183 :
184 : /** Function to rewind the quota once the IO was allowed to be sent by this
185 : * limit but queued due to one of the further limits.
186 : */
187 : void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
188 : };
189 :
/* QoS state for a bdev: one limit per rate limit type plus timeslice bookkeeping. */
190 : struct spdk_bdev_qos {
191 : /** Array of rate limits, one entry per rate limit type. */
192 : struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
193 :
194 : /** The channel that all I/O are funneled through. */
195 : struct spdk_bdev_channel *ch;
196 :
197 : /** The thread on which the poller is running. */
198 : struct spdk_thread *thread;
199 :
200 : /** Size of a timeslice in tsc ticks. */
201 : uint64_t timeslice_size;
202 :
203 : /** Timestamp of start of last timeslice. */
204 : uint64_t last_timeslice;
205 :
206 : /** Poller that processes queued I/O commands each time slice. */
207 : struct spdk_poller *poller;
208 : };
209 :
/* Per-thread management channel context (see __io_ch_to_bdev_mgmt_ch()). */
210 : struct spdk_bdev_mgmt_channel {
211 : /*
212 : * Each thread keeps a cache of bdev_io - this allows
213 : * bdev threads which are *not* DPDK threads to still
214 : * benefit from a per-thread bdev_io cache. Without
215 : * this, non-DPDK threads fetching from the mempool
216 : * incur a cmpxchg on get and put.
217 : */
218 : bdev_io_stailq_t per_thread_cache;
219 : uint32_t per_thread_cache_count;
220 : uint32_t bdev_io_cache_size;
221 :
222 : struct spdk_iobuf_channel iobuf;
223 :
/* Shared resources whose mgmt_ch points back at this channel. */
224 : TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
225 : TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
226 : };
227 :
228 : /*
229 : * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
230 : * will queue here their IO that awaits retry. It makes it possible to retry sending
231 : * IO to one bdev after IO from other bdev completes.
232 : */
233 : struct spdk_bdev_shared_resource {
234 : /* The bdev management channel */
235 : struct spdk_bdev_mgmt_channel *mgmt_ch;
236 :
237 : /*
238 : * Count of I/O submitted to bdev module and waiting for completion.
239 : * Incremented before submit_request() is called on an spdk_bdev_io.
240 : */
241 : uint64_t io_outstanding;
242 :
243 : /*
244 : * Queue of IO awaiting retry because of a previous NOMEM status returned
245 : * on this channel.
246 : */
247 : bdev_io_tailq_t nomem_io;
248 :
249 : /*
250 : * Threshold which io_outstanding must drop to before retrying nomem_io.
251 : * Recomputed each time an I/O is queued at the head of nomem_io.
252 : */
253 : uint64_t nomem_threshold;
254 :
255 : /*
256 : * Indicate whether aborting nomem I/Os is in progress.
257 : * If true, we should not touch the nomem_io list on I/O completions.
258 : */
259 : bool nomem_abort_in_progress;
260 :
261 : /* I/O channel allocated by a bdev module */
262 : struct spdk_io_channel *shared_ch;
263 :
264 : struct spdk_poller *nomem_poller;
265 :
266 : /* Refcount of bdev channels using this resource */
267 : uint32_t ref;
268 :
/* Linkage on spdk_bdev_mgmt_channel.shared_resources. */
269 : TAILQ_ENTRY(spdk_bdev_shared_resource) link;
270 : };
270 :
/* Bit flags. NOTE(review): presumably stored in spdk_bdev_channel.flags — confirm. */
271 : #define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
272 : #define BDEV_CH_QOS_ENABLED (1 << 1)
273 :
/* Per-channel context of a bdev I/O channel (see __io_ch_to_bdev_ch()). */
274 : struct spdk_bdev_channel {
275 : struct spdk_bdev *bdev;
276 :
277 : /* The channel for the underlying device */
278 : struct spdk_io_channel *channel;
279 :
280 : /* Accel channel */
281 : struct spdk_io_channel *accel_channel;
282 :
283 : /* Per io_device per thread data */
284 : struct spdk_bdev_shared_resource *shared_resource;
285 :
286 : struct spdk_bdev_io_stat *stat;
287 :
288 : /*
289 : * Count of I/O submitted to the underlying dev module through this channel
290 : * and waiting for completion.
291 : */
292 : uint64_t io_outstanding;
293 :
294 : /*
295 : * List of all submitted I/Os including I/O that are generated via splitting.
296 : */
297 : bdev_io_tailq_t io_submitted;
298 :
299 : /*
300 : * List of spdk_bdev_io that are currently queued because they write to a locked
301 : * LBA range.
302 : */
303 : bdev_io_tailq_t io_locked;
304 :
305 : /* List of I/Os with accel sequence being currently executed */
306 : bdev_io_tailq_t io_accel_exec;
307 :
308 : /* List of I/Os doing memory domain pull/push */
309 : bdev_io_tailq_t io_memory_domain;
310 :
311 : uint32_t flags;
312 :
313 : /* Counts number of bdev_io in the io_submitted TAILQ */
314 : uint16_t queue_depth;
315 :
316 : uint16_t trace_id;
317 :
318 : struct spdk_histogram_data *histogram;
319 :
/* Fields present only in VTune-enabled builds. */
320 : #ifdef SPDK_CONFIG_VTUNE
321 : uint64_t start_tsc;
322 : uint64_t interval_tsc;
323 : __itt_string_handle *handle;
324 : struct spdk_bdev_io_stat *prev_stat;
325 : #endif
326 :
327 : lba_range_tailq_t locked_ranges;
328 :
329 : /** List of I/Os queued by QoS. */
330 : bdev_io_tailq_t qos_queued_io;
331 : };
332 :
/* A media event wrapped with a list linkage so it can be queued on a descriptor. */
333 : struct media_event_entry {
334 : struct spdk_bdev_media_event event;
335 : TAILQ_ENTRY(media_event_entry) tailq;
336 : };
337 :
/* NOTE(review): presumably the number of entries in a descriptor's media_events_buffer — confirm. */
338 : #define MEDIA_EVENT_POOL_SIZE 64
339 :
/* An open descriptor on a bdev. */
340 : struct spdk_bdev_desc {
341 : struct spdk_bdev *bdev;
342 : bool write;
343 : bool memory_domains_supported;
/* One flag per I/O type. */
344 : bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
/* Open options; opts.hide_metadata affects the block size reported for this descriptor. */
345 : struct spdk_bdev_open_opts opts;
346 : struct spdk_thread *thread;
/* Event callback and its context. */
347 : struct {
348 : spdk_bdev_event_cb_t event_fn;
349 : void *ctx;
350 : } callback;
351 : bool closed;
352 : struct spdk_spinlock spinlock;
353 : uint32_t refs;
/* Media event queues: entries pending delivery and entries free for reuse. */
354 : TAILQ_HEAD(, media_event_entry) pending_media_events;
355 : TAILQ_HEAD(, media_event_entry) free_media_events;
356 : struct media_event_entry *media_events_buffer;
357 : TAILQ_ENTRY(spdk_bdev_desc) link;
358 :
/* I/O timeout settings: timeout, callback with argument, and the poller. */
359 : uint64_t timeout_in_sec;
360 : spdk_bdev_io_timeout_cb cb_fn;
361 : void *cb_arg;
362 : struct spdk_poller *io_timeout_poller;
363 : struct spdk_bdev_module_claim *claim;
364 : };
365 :
/* Context for a device statistics request: output stat, reset mode, and completion callback. */
366 : struct spdk_bdev_iostat_ctx {
367 : struct spdk_bdev_io_stat *stat;
368 : enum spdk_bdev_reset_stat_mode reset_mode;
369 : spdk_bdev_get_device_stat_cb cb;
370 : void *cb_arg;
371 : };
372 :
/* Context for a QoS limit change: completion callback, its argument, and the target bdev. */
373 : struct set_qos_limit_ctx {
374 : void (*cb_fn)(void *cb_arg, int status);
375 : void *cb_arg;
376 : struct spdk_bdev *bdev;
377 : };
378 :
/*
 * State for iterating over a bdev's channels: per-channel function, completion
 * function, the underlying io_channel iterator, and the caller's context.
 */
379 : struct spdk_bdev_channel_iter {
380 : spdk_bdev_for_each_channel_msg fn;
381 : spdk_bdev_for_each_channel_done cpl;
382 : struct spdk_io_channel_iter *i;
383 : void *ctx;
384 : };
385 :
/*
 * Per-status error counters. The array has -SPDK_MIN_BDEV_IO_STATUS entries.
 * NOTE(review): presumably one slot per negative (error) status value — confirm.
 */
386 : struct spdk_bdev_io_error_stat {
387 : uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
388 : };
389 :
/*
 * Recorded in bdev_io->internal.retry_state when an I/O is queued on nomem_io,
 * identifying the step at which it was queued. INVALID must never be queued
 * (asserted by the nomem queueing helpers).
 */
390 : enum bdev_io_retry_state {
391 : BDEV_IO_RETRY_STATE_INVALID,
392 : BDEV_IO_RETRY_STATE_PULL,
393 : BDEV_IO_RETRY_STATE_PULL_MD,
394 : BDEV_IO_RETRY_STATE_SUBMIT,
395 : BDEV_IO_RETRY_STATE_PUSH,
396 : BDEV_IO_RETRY_STATE_PUSH_MD,
397 : BDEV_IO_RETRY_STATE_GET_ACCEL_BUF,
398 : };
399 :
/*
 * The io_device handle for a bdev is the bdev's address plus one byte;
 * __bdev_from_io_dev() reverses the offset.
 * NOTE(review): presumably so the handle differs from the bdev pointer itself — confirm.
 */
400 : #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1)
401 : #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1))
/* Fetch the typed per-channel context from an spdk_io_channel. */
402 : #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
403 : #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
404 :
/* Forward declarations of static helpers that are used before their definitions. */

/* I/O completion and bounce-buffer / accel-buffer helpers. */
405 : static inline void bdev_io_complete(void *ctx);
406 : static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
407 : static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
408 : static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
409 : static void _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io);
410 :
/* Zero-buffer write helpers. */
411 : static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
412 : static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
413 :
/* QoS enable: per-channel step and completion. */
414 : static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
415 : struct spdk_io_channel *ch, void *_ctx);
416 : static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
417 :
/* Vectored read/write with separate metadata buffer, memory domain and accel sequence. */
418 : static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
419 : struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
420 : uint64_t num_blocks,
421 : struct spdk_memory_domain *domain, void *domain_ctx,
422 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
423 : spdk_bdev_io_completion_cb cb, void *cb_arg);
424 : static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
425 : struct iovec *iov, int iovcnt, void *md_buf,
426 : uint64_t offset_blocks, uint64_t num_blocks,
427 : struct spdk_memory_domain *domain, void *domain_ctx,
428 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
429 : uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
430 : spdk_bdev_io_completion_cb cb, void *cb_arg);
431 :
/* LBA range locking. */
432 : static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
433 : uint64_t offset, uint64_t length,
434 : lock_range_cb cb_fn, void *cb_arg);
435 :
436 : static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
437 : uint64_t offset, uint64_t length,
438 : lock_range_cb cb_fn, void *cb_arg);
439 :
/* Abort helpers for I/O that is still queued. */
440 : static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
441 : static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
442 :
/* Claim helpers. */
443 : static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
444 : static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
445 : static void claim_reset(struct spdk_bdev *bdev);
446 :
447 : static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);
448 :
449 : static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);
450 :
/* Read `field` from an optional opts pointer; yields `defval` when opts is NULL. */
451 : #define bdev_get_ext_io_opt(opts, field, defval) \
452 : ((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))
453 :
454 : static inline void
455 688 : bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
456 : {
457 688 : TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
458 688 : bdev_io->internal.ch->queue_depth++;
459 688 : }
460 :
461 : static inline void
462 688 : bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
463 : {
464 688 : TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
465 688 : bdev_io->internal.ch->queue_depth--;
466 688 : }
467 :
468 : void
469 16 : spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
470 : {
471 16 : if (!opts) {
472 0 : SPDK_ERRLOG("opts should not be NULL\n");
473 0 : return;
474 : }
475 :
476 16 : if (!opts_size) {
477 0 : SPDK_ERRLOG("opts_size should not be zero value\n");
478 0 : return;
479 : }
480 :
481 16 : opts->opts_size = opts_size;
482 :
483 : #define SET_FIELD(field) \
484 : if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
485 : opts->field = g_bdev_opts.field; \
486 : } \
487 :
488 16 : SET_FIELD(bdev_io_pool_size);
489 16 : SET_FIELD(bdev_io_cache_size);
490 16 : SET_FIELD(bdev_auto_examine);
491 16 : SET_FIELD(iobuf_small_cache_size);
492 16 : SET_FIELD(iobuf_large_cache_size);
493 :
494 : /* Do not remove this statement, you should always update this statement when you adding a new field,
495 : * and do not forget to add the SET_FIELD statement for your added field. */
496 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
497 :
498 : #undef SET_FIELD
499 : }
500 :
501 : int
502 17 : spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
503 : {
504 : uint32_t min_pool_size;
505 :
506 17 : if (!opts) {
507 0 : SPDK_ERRLOG("opts cannot be NULL\n");
508 0 : return -1;
509 : }
510 :
511 17 : if (!opts->opts_size) {
512 1 : SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
513 1 : return -1;
514 : }
515 :
516 : /*
517 : * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
518 : * initialization. A second mgmt_ch will be created on the same thread when the application starts
519 : * but before the deferred put_io_channel event is executed for the first mgmt_ch.
520 : */
521 16 : min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
522 16 : if (opts->bdev_io_pool_size < min_pool_size) {
523 0 : SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
524 : " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
525 : spdk_thread_get_count());
526 0 : SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
527 0 : return -1;
528 : }
529 :
530 : #define SET_FIELD(field) \
531 : if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
532 : g_bdev_opts.field = opts->field; \
533 : } \
534 :
535 16 : SET_FIELD(bdev_io_pool_size);
536 16 : SET_FIELD(bdev_io_cache_size);
537 16 : SET_FIELD(bdev_auto_examine);
538 16 : SET_FIELD(iobuf_small_cache_size);
539 16 : SET_FIELD(iobuf_large_cache_size);
540 :
541 16 : g_bdev_opts.opts_size = opts->opts_size;
542 :
543 : #undef SET_FIELD
544 :
545 16 : return 0;
546 : }
547 :
548 : static struct spdk_bdev *
549 156 : bdev_get_by_name(const char *bdev_name)
550 : {
551 156 : struct spdk_bdev_name find;
552 : struct spdk_bdev_name *res;
553 :
554 156 : find.name = (char *)bdev_name;
555 156 : res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
556 156 : if (res != NULL) {
557 149 : return res->bdev;
558 : }
559 :
560 7 : return NULL;
561 : }
562 :
563 : struct spdk_bdev *
564 19 : spdk_bdev_get_by_name(const char *bdev_name)
565 : {
566 : struct spdk_bdev *bdev;
567 :
568 19 : spdk_spin_lock(&g_bdev_mgr.spinlock);
569 19 : bdev = bdev_get_by_name(bdev_name);
570 19 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
571 :
572 19 : return bdev;
573 : }
574 :
/* Pairs an I/O status value with its string name. */
575 : struct bdev_io_status_string {
576 : enum spdk_bdev_io_status status;
577 : const char *str;
578 : };
579 :
/* Lookup table searched linearly by bdev_io_status_get_string(). */
580 : static const struct bdev_io_status_string bdev_io_status_strings[] = {
581 : { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
582 : { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
583 : { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
584 : { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
585 : { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
586 : { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
587 : { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
588 : { SPDK_BDEV_IO_STATUS_FAILED, "failed" },
589 : { SPDK_BDEV_IO_STATUS_PENDING, "pending" },
590 : { SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
591 : };
592 :
593 : static const char *
594 0 : bdev_io_status_get_string(enum spdk_bdev_io_status status)
595 : {
596 : uint32_t i;
597 :
598 0 : for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
599 0 : if (bdev_io_status_strings[i].status == status) {
600 0 : return bdev_io_status_strings[i].str;
601 : }
602 : }
603 :
604 0 : return "reserved";
605 : }
606 :
/*
 * Heap-allocated context for spdk_bdev_wait_for_examine(): the poller that
 * checks for completion and the user callback to invoke. Freed by the poller
 * callback after the user callback has run.
 */
607 : struct spdk_bdev_wait_for_examine_ctx {
608 : struct spdk_poller *poller;
609 : spdk_bdev_wait_for_examine_cb cb_fn;
610 : void *cb_arg;
611 : };
612 :
/* Defined later in this file. */
613 : static bool bdev_module_all_actions_completed(void);
614 :
615 : static int
616 205 : bdev_wait_for_examine_cb(void *arg)
617 : {
618 205 : struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
619 :
620 205 : if (!bdev_module_all_actions_completed()) {
621 0 : return SPDK_POLLER_IDLE;
622 : }
623 :
624 205 : spdk_poller_unregister(&ctx->poller);
625 205 : ctx->cb_fn(ctx->cb_arg);
626 205 : free(ctx);
627 :
628 205 : return SPDK_POLLER_BUSY;
629 : }
630 :
631 : int
632 205 : spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
633 : {
634 : struct spdk_bdev_wait_for_examine_ctx *ctx;
635 :
636 205 : ctx = calloc(1, sizeof(*ctx));
637 205 : if (ctx == NULL) {
638 0 : return -ENOMEM;
639 : }
640 205 : ctx->cb_fn = cb_fn;
641 205 : ctx->cb_arg = cb_arg;
642 205 : ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
643 :
644 205 : return 0;
645 : }
646 :
/* One allowlist entry: a heap-allocated name plus list linkage. */
647 : struct spdk_bdev_examine_item {
648 : char *name;
649 : TAILQ_ENTRY(spdk_bdev_examine_item) link;
650 : };
651 :
652 : TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
653 :
/*
 * Names registered through spdk_bdev_examine(); consulted when auto examine
 * is disabled. NOTE(review): this global is not declared static.
 */
654 : struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
655 : g_bdev_examine_allowlist);
656 :
657 : static inline bool
658 24 : bdev_examine_allowlist_check(const char *name)
659 : {
660 : struct spdk_bdev_examine_item *item;
661 24 : TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
662 3 : if (strcmp(name, item->name) == 0) {
663 3 : return true;
664 : }
665 : }
666 21 : return false;
667 : }
668 :
669 : static inline void
670 260 : bdev_examine_allowlist_remove(const char *name)
671 : {
672 : struct spdk_bdev_examine_item *item;
673 260 : TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
674 3 : if (strcmp(name, item->name) == 0) {
675 3 : TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
676 3 : free(item->name);
677 3 : free(item);
678 3 : break;
679 : }
680 : }
681 260 : }
682 :
683 : static inline void
684 69 : bdev_examine_allowlist_free(void)
685 : {
686 : struct spdk_bdev_examine_item *item;
687 69 : while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
688 0 : item = TAILQ_FIRST(&g_bdev_examine_allowlist);
689 0 : TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
690 0 : free(item->name);
691 0 : free(item);
692 : }
693 69 : }
694 :
695 : static inline bool
696 12 : bdev_in_examine_allowlist(struct spdk_bdev *bdev)
697 : {
698 : struct spdk_bdev_alias *tmp;
699 12 : if (bdev_examine_allowlist_check(bdev->name)) {
700 3 : return true;
701 : }
702 18 : TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
703 9 : if (bdev_examine_allowlist_check(tmp->alias.name)) {
704 0 : return true;
705 : }
706 : }
707 9 : return false;
708 : }
709 :
710 : static inline bool
711 134 : bdev_ok_to_examine(struct spdk_bdev *bdev)
712 : {
713 : /* Some bdevs may not support the READ command.
714 : * Do not try to examine them.
715 : */
716 134 : if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
717 0 : return false;
718 : }
719 :
720 134 : if (g_bdev_opts.bdev_auto_examine) {
721 122 : return true;
722 : } else {
723 12 : return bdev_in_examine_allowlist(bdev);
724 : }
725 : }
726 :
/*
 * Offer a bdev to the registered bdev modules for examination.
 *
 * Returns immediately if bdev_ok_to_examine() rejects the bdev. Otherwise:
 *  1. every module that has an examine_config callback is called;
 *  2. examine_disk is called on a set of modules chosen by the bdev's claim
 *     type: all modules when unclaimed, the v1 claim holder for EXCL_WRITE,
 *     or each module holding a v2 claim otherwise.
 *
 * Before each callback the module's action_in_progress counter is incremented
 * under the module spinlock. bdev->internal.spinlock is released around every
 * examine_disk call.
 */
727 : static void
728 134 : bdev_examine(struct spdk_bdev *bdev)
729 : {
730 : struct spdk_bdev_module *module;
731 : struct spdk_bdev_module_claim *claim, *tmpclaim;
732 : uint32_t action;
733 :
734 134 : if (!bdev_ok_to_examine(bdev)) {
735 9 : return;
736 : }
737 :
738 508 : TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
739 383 : if (module->examine_config) {
740 258 : spdk_spin_lock(&module->internal.spinlock);
/* Remember the pre-increment count to detect a missing examine_done below. */
741 258 : action = module->internal.action_in_progress;
742 258 : module->internal.action_in_progress++;
743 258 : spdk_spin_unlock(&module->internal.spinlock);
744 258 : module->examine_config(bdev);
/* The counter is expected to be back at its earlier value once examine_config returns. */
745 258 : if (action != module->internal.action_in_progress) {
746 0 : SPDK_ERRLOG("examine_config for module %s did not call "
747 : "spdk_bdev_module_examine_done()\n", module->name);
748 : }
749 : }
750 : }
751 :
752 125 : spdk_spin_lock(&bdev->internal.spinlock);
753 :
754 125 : switch (bdev->internal.claim_type) {
755 117 : case SPDK_BDEV_CLAIM_NONE:
756 : /* Examine by all bdev modules */
757 468 : TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
758 351 : if (module->examine_disk) {
759 225 : spdk_spin_lock(&module->internal.spinlock);
760 225 : module->internal.action_in_progress++;
761 225 : spdk_spin_unlock(&module->internal.spinlock);
/* Drop the bdev lock for the duration of the callback. */
762 225 : spdk_spin_unlock(&bdev->internal.spinlock);
763 225 : module->examine_disk(bdev);
764 225 : spdk_spin_lock(&bdev->internal.spinlock);
765 : }
766 : }
767 117 : break;
768 1 : case SPDK_BDEV_CLAIM_EXCL_WRITE:
769 : /* Examine by the one bdev module with a v1 claim */
770 1 : module = bdev->internal.claim.v1.module;
771 1 : if (module->examine_disk) {
772 1 : spdk_spin_lock(&module->internal.spinlock);
773 1 : module->internal.action_in_progress++;
774 1 : spdk_spin_unlock(&module->internal.spinlock);
775 1 : spdk_spin_unlock(&bdev->internal.spinlock);
776 1 : module->examine_disk(bdev);
/* The bdev lock was already released above, so skip the unlock at the end. */
777 1 : return;
778 : }
779 0 : break;
780 7 : default:
781 : /* Examine by all bdev modules with a v2 claim */
782 7 : assert(claim_type_is_v2(bdev->internal.claim_type));
783 : /*
784 : * Removal of tailq nodes while iterating can cause the iteration to jump out of the
785 : * list, perhaps accessing freed memory. Without protection, this could happen
786 : * while the lock is dropped during the examine callback.
787 : */
788 7 : bdev->internal.examine_in_progress++;
789 :
790 16 : TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
791 9 : module = claim->module;
792 :
793 9 : if (module == NULL) {
794 : /* This is a vestigial claim, held by examine_count */
795 0 : continue;
796 : }
797 :
798 9 : if (module->examine_disk == NULL) {
799 0 : continue;
800 : }
801 :
802 9 : spdk_spin_lock(&module->internal.spinlock);
803 9 : module->internal.action_in_progress++;
804 9 : spdk_spin_unlock(&module->internal.spinlock);
805 :
806 : /* Call examine_disk without holding internal.spinlock. */
807 9 : spdk_spin_unlock(&bdev->internal.spinlock);
808 9 : module->examine_disk(bdev);
809 9 : spdk_spin_lock(&bdev->internal.spinlock);
810 : }
811 :
812 7 : assert(bdev->internal.examine_in_progress > 0);
813 7 : bdev->internal.examine_in_progress--;
814 7 : if (bdev->internal.examine_in_progress == 0) {
815 : /* Remove any claims that were released during examine_disk */
/* Claims with a NULL desc are the ones removed and freed here. */
816 16 : TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
817 9 : if (claim->desc != NULL) {
818 9 : continue;
819 : }
820 :
821 0 : TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
822 0 : free(claim);
823 : }
/* No claims left: reset the bdev's claim state. */
824 7 : if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
825 0 : claim_reset(bdev);
826 : }
827 : }
828 : }
829 :
830 124 : spdk_spin_unlock(&bdev->internal.spinlock);
831 : }
832 :
833 : int
834 4 : spdk_bdev_examine(const char *name)
835 : {
836 : struct spdk_bdev *bdev;
837 : struct spdk_bdev_examine_item *item;
838 4 : struct spdk_thread *thread = spdk_get_thread();
839 :
840 4 : if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
841 1 : SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
842 : thread ? spdk_thread_get_name(thread) : "null");
843 1 : return -EINVAL;
844 : }
845 :
846 3 : if (g_bdev_opts.bdev_auto_examine) {
847 0 : SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
848 0 : return -EINVAL;
849 : }
850 :
851 3 : if (bdev_examine_allowlist_check(name)) {
852 0 : SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
853 0 : return -EEXIST;
854 : }
855 :
856 3 : item = calloc(1, sizeof(*item));
857 3 : if (!item) {
858 0 : return -ENOMEM;
859 : }
860 3 : item->name = strdup(name);
861 3 : if (!item->name) {
862 0 : free(item);
863 0 : return -ENOMEM;
864 : }
865 3 : TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
866 :
867 3 : bdev = spdk_bdev_get_by_name(name);
868 3 : if (bdev) {
869 3 : bdev_examine(bdev);
870 : }
871 3 : return 0;
872 : }
873 :
874 : static inline void
875 0 : bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
876 : {
877 : struct spdk_bdev_examine_item *item;
878 0 : TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
879 0 : spdk_json_write_object_begin(w);
880 0 : spdk_json_write_named_string(w, "method", "bdev_examine");
881 0 : spdk_json_write_named_object_begin(w, "params");
882 0 : spdk_json_write_named_string(w, "name", item->name);
883 0 : spdk_json_write_object_end(w);
884 0 : spdk_json_write_object_end(w);
885 : }
886 0 : }
887 :
888 : struct spdk_bdev *
889 1 : spdk_bdev_first(void)
890 : {
891 : struct spdk_bdev *bdev;
892 :
893 1 : bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
894 1 : if (bdev) {
895 1 : SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
896 : }
897 :
898 1 : return bdev;
899 : }
900 :
901 : struct spdk_bdev *
902 8 : spdk_bdev_next(struct spdk_bdev *prev)
903 : {
904 : struct spdk_bdev *bdev;
905 :
906 8 : bdev = TAILQ_NEXT(prev, internal.link);
907 8 : if (bdev) {
908 7 : SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
909 : }
910 :
911 8 : return bdev;
912 : }
913 :
914 : static struct spdk_bdev *
915 6 : _bdev_next_leaf(struct spdk_bdev *bdev)
916 : {
917 9 : while (bdev != NULL) {
918 8 : if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
919 5 : return bdev;
920 : } else {
921 3 : bdev = TAILQ_NEXT(bdev, internal.link);
922 : }
923 : }
924 :
925 1 : return bdev;
926 : }
927 :
928 : struct spdk_bdev *
929 1 : spdk_bdev_first_leaf(void)
930 : {
931 : struct spdk_bdev *bdev;
932 :
933 1 : bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
934 :
935 1 : if (bdev) {
936 1 : SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
937 : }
938 :
939 1 : return bdev;
940 : }
941 :
942 : struct spdk_bdev *
943 5 : spdk_bdev_next_leaf(struct spdk_bdev *prev)
944 : {
945 : struct spdk_bdev *bdev;
946 :
947 5 : bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
948 :
949 5 : if (bdev) {
950 4 : SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
951 : }
952 :
953 5 : return bdev;
954 : }
955 :
956 : static inline bool
957 820 : bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
958 : {
959 820 : return bdev_io->internal.f.has_memory_domain;
960 : }
961 :
962 : static inline bool
963 1568 : bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
964 : {
965 1568 : return bdev_io->internal.f.has_accel_sequence;
966 : }
967 :
968 : static inline uint32_t
969 389 : bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
970 : {
971 389 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
972 :
973 389 : if (spdk_unlikely(desc->opts.hide_metadata)) {
974 0 : return bdev->blocklen - bdev->md_len;
975 : } else {
976 389 : return bdev->blocklen;
977 : }
978 : }
979 :
980 : static inline uint32_t
981 110 : bdev_io_get_block_size(struct spdk_bdev_io *bdev_io)
982 : {
983 110 : struct spdk_bdev *bdev = bdev_io->bdev;
984 :
985 110 : if (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT) {
986 0 : if (bdev->md_len == spdk_dif_pi_format_get_size(bdev->dif_pi_format)) {
987 0 : return bdev->blocklen - bdev->md_len;
988 : } else {
989 0 : return bdev->blocklen;
990 : }
991 : }
992 :
993 110 : return bdev_desc_get_block_size(bdev_io->internal.desc);
994 : }
995 :
996 : static inline void
997 23 : bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
998 : struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
999 : {
1000 : /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
1001 : * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
1002 : * channels we will instead wait for half to complete.
1003 : */
1004 23 : shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
1005 : (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
1006 :
1007 23 : assert(state != BDEV_IO_RETRY_STATE_INVALID);
1008 23 : bdev_io->internal.retry_state = state;
1009 23 : TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
1010 23 : }
1011 :
1012 : static inline void
1013 58 : bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
1014 : struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1015 : {
1016 : /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
1017 : * the queue isn't empty, so we don't need to update the nomem_threshold here */
1018 58 : assert(!TAILQ_EMPTY(&shared_resource->nomem_io));
1019 :
1020 58 : assert(state != BDEV_IO_RETRY_STATE_INVALID);
1021 58 : bdev_io->internal.retry_state = state;
1022 58 : TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
1023 58 : }
1024 :
1025 : void
1026 16 : spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
1027 : {
1028 : struct iovec *iovs;
1029 :
1030 16 : if (bdev_io->u.bdev.iovs == NULL) {
1031 3 : bdev_io->u.bdev.iovs = &bdev_io->iov;
1032 3 : bdev_io->u.bdev.iovcnt = 1;
1033 : }
1034 :
1035 16 : iovs = bdev_io->u.bdev.iovs;
1036 :
1037 16 : assert(iovs != NULL);
1038 16 : assert(bdev_io->u.bdev.iovcnt >= 1);
1039 :
1040 16 : iovs[0].iov_base = buf;
1041 16 : iovs[0].iov_len = len;
1042 16 : }
1043 :
1044 : void
1045 3 : spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1046 : {
1047 3 : assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
1048 3 : bdev_io->u.bdev.md_buf = md_buf;
1049 3 : }
1050 :
/*
 * True when an iovec array is present and its first element already points at
 * a buffer; false for a NULL array or a NULL first base address.
 */
static bool
_is_buf_allocated(const struct iovec *iovs)
{
	return iovs != NULL && iovs[0].iov_base != NULL;
}
1060 :
/*
 * Check that the base address of each of the iovcnt iovecs has none of the
 * (alignment - 1) bits set. An alignment of 1 is accepted without inspecting
 * the iovecs.
 */
static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	/* Computed in 32-bit arithmetic first, then widened. */
	const uintptr_t mask = alignment - 1;
	int idx = 0;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	while (idx < iovcnt) {
		if (((uintptr_t)iovs[idx].iov_base & mask) != 0) {
			return false;
		}
		idx++;
	}

	return true;
}
1080 :
1081 : static inline bool
1082 895 : bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1083 : {
1084 1048 : return (bdev_io->bdev->md_len != 0) &&
1085 153 : (desc->opts.hide_metadata ||
1086 153 : (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT));
1087 : }
1088 :
1089 : static inline bool
1090 852 : bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1091 : {
1092 852 : if (!bdev_io_use_accel_sequence(bdev_io)) {
1093 852 : return false;
1094 : }
1095 :
1096 : /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
1097 : * bdev module didn't support accel sequences */
1098 0 : return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
1099 : }
1100 :
1101 : static inline void
1102 624 : bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
1103 : struct spdk_bdev_shared_resource *shared_resource)
1104 : {
1105 624 : bdev_ch->io_outstanding++;
1106 624 : shared_resource->io_outstanding++;
1107 624 : }
1108 :
1109 : static inline void
1110 624 : bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1111 : struct spdk_bdev_shared_resource *shared_resource)
1112 : {
1113 624 : assert(bdev_ch->io_outstanding > 0);
1114 624 : assert(shared_resource->io_outstanding > 0);
1115 624 : bdev_ch->io_outstanding--;
1116 624 : shared_resource->io_outstanding--;
1117 624 : }
1118 :
1119 : static void
1120 0 : bdev_io_submit_sequence_cb(void *ctx, int status)
1121 : {
1122 0 : struct spdk_bdev_io *bdev_io = ctx;
1123 :
1124 0 : assert(bdev_io_use_accel_sequence(bdev_io));
1125 :
1126 0 : bdev_io->u.bdev.accel_sequence = NULL;
1127 0 : bdev_io->internal.f.has_accel_sequence = false;
1128 :
1129 0 : if (spdk_unlikely(status != 0)) {
1130 0 : SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
1131 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1132 0 : bdev_io_complete_unsubmitted(bdev_io);
1133 0 : return;
1134 : }
1135 :
1136 0 : bdev_io_submit(bdev_io);
1137 : }
1138 :
1139 : static void
1140 0 : bdev_io_exec_sequence_cb(void *ctx, int status)
1141 : {
1142 0 : struct spdk_bdev_io *bdev_io = ctx;
1143 0 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1144 :
1145 0 : TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1146 0 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1147 :
1148 0 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1149 0 : bdev_ch_retry_io(ch);
1150 : }
1151 :
1152 0 : bdev_io->internal.data_transfer_cpl(bdev_io, status);
1153 0 : }
1154 :
1155 : static void
1156 0 : bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
1157 : {
1158 0 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1159 :
1160 0 : assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1161 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1162 0 : assert(bdev_io_use_accel_sequence(bdev_io));
1163 :
1164 : /* Since the operations are appended during submission, they're in the opposite order than
1165 : * how we want to execute them for reads (i.e. we need to execute the most recently added
1166 : * operation first), so reverse the sequence before executing it.
1167 : */
1168 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1169 0 : spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
1170 : }
1171 :
1172 0 : TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1173 0 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1174 0 : bdev_io->internal.data_transfer_cpl = cb_fn;
1175 :
1176 0 : spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
1177 : bdev_io_exec_sequence_cb, bdev_io);
1178 0 : }
1179 :
1180 : static void
1181 42 : bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
1182 : {
1183 42 : struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
1184 : void *buf;
1185 :
1186 42 : if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1187 0 : buf = bdev_io->internal.buf.ptr;
1188 0 : bdev_io->internal.buf.ptr = NULL;
1189 0 : bdev_io->internal.f.has_buf = false;
1190 0 : bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
1191 0 : bdev_io->internal.get_aux_buf_cb = NULL;
1192 : } else {
1193 42 : assert(bdev_io->internal.get_buf_cb != NULL);
1194 42 : bdev_io->internal.get_buf_cb(ch, bdev_io, status);
1195 42 : bdev_io->internal.get_buf_cb = NULL;
1196 : }
1197 42 : }
1198 :
1199 : static void
1200 4 : _bdev_io_pull_buffer_cpl(void *ctx, int rc)
1201 : {
1202 4 : struct spdk_bdev_io *bdev_io = ctx;
1203 :
1204 4 : if (rc) {
1205 0 : SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
1206 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1207 : }
1208 4 : bdev_io_get_buf_complete(bdev_io, !rc);
1209 4 : }
1210 :
1211 : static void
1212 2 : bdev_io_pull_md_buf_done(void *ctx, int status)
1213 : {
1214 2 : struct spdk_bdev_io *bdev_io = ctx;
1215 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1216 :
1217 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1218 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1219 :
1220 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1221 0 : bdev_ch_retry_io(ch);
1222 : }
1223 :
1224 2 : assert(bdev_io->internal.data_transfer_cpl);
1225 2 : bdev_io->internal.data_transfer_cpl(bdev_io, status);
1226 2 : }
1227 :
1228 : static void
1229 4 : bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
1230 : {
1231 4 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1232 4 : int rc = 0;
1233 :
1234 4 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1235 2 : assert(bdev_io->internal.f.has_bounce_buf);
1236 2 : if (bdev_io_use_memory_domain(bdev_io)) {
1237 2 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1238 2 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1239 2 : rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1240 : bdev_io->internal.memory_domain_ctx,
1241 : &bdev_io->internal.bounce_buf.orig_md_iov, 1,
1242 : &bdev_io->internal.bounce_buf.md_iov, 1,
1243 : bdev_io_pull_md_buf_done, bdev_io);
1244 2 : if (rc == 0) {
1245 : /* Continue to submit IO in completion callback */
1246 2 : return;
1247 : }
1248 0 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1249 0 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1250 0 : if (rc != -ENOMEM) {
1251 0 : SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
1252 : spdk_memory_domain_get_dma_device_id(
1253 : bdev_io->internal.memory_domain), rc);
1254 : }
1255 : } else {
1256 0 : memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
1257 0 : bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
1258 : bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1259 : }
1260 : }
1261 :
1262 2 : if (spdk_unlikely(rc == -ENOMEM)) {
1263 0 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
1264 : } else {
1265 2 : assert(bdev_io->internal.data_transfer_cpl);
1266 2 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1267 : }
1268 : }
1269 :
1270 : static void
1271 4 : _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1272 : {
1273 4 : assert(bdev_io->internal.f.has_bounce_buf);
1274 :
1275 : /* save original md_buf */
1276 4 : bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
1277 4 : bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
1278 4 : bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
1279 4 : bdev_io->internal.bounce_buf.md_iov.iov_len = len;
1280 : /* set bounce md_buf */
1281 4 : bdev_io->u.bdev.md_buf = md_buf;
1282 :
1283 4 : bdev_io_pull_md_buf(bdev_io);
1284 4 : }
1285 :
1286 : static void
1287 42 : _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
1288 : {
1289 42 : struct spdk_bdev *bdev = bdev_io->bdev;
1290 : uint64_t md_len;
1291 : void *buf;
1292 :
1293 42 : if (spdk_bdev_is_md_separate(bdev)) {
1294 7 : assert(!bdev_io_use_accel_sequence(bdev_io));
1295 :
1296 7 : buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
1297 7 : md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
1298 :
1299 7 : assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
1300 :
1301 7 : if (bdev_io->u.bdev.md_buf != NULL) {
1302 4 : _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
1303 4 : return;
1304 : } else {
1305 3 : spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
1306 : }
1307 : }
1308 :
1309 38 : bdev_io_get_buf_complete(bdev_io, true);
1310 : }
1311 :
1312 : static inline void
1313 26 : bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
1314 : {
1315 26 : if (rc) {
1316 0 : SPDK_ERRLOG("Failed to get data buffer\n");
1317 0 : assert(bdev_io->internal.data_transfer_cpl);
1318 0 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1319 0 : return;
1320 : }
1321 :
1322 26 : _bdev_io_set_md_buf(bdev_io);
1323 : }
1324 :
1325 : static void
1326 2 : bdev_io_pull_data_done_and_track(void *ctx, int status)
1327 : {
1328 2 : struct spdk_bdev_io *bdev_io = ctx;
1329 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1330 :
1331 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1332 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1333 :
1334 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1335 0 : bdev_ch_retry_io(ch);
1336 : }
1337 :
1338 2 : bdev_io_pull_data_done(bdev_io, status);
1339 2 : }
1340 :
1341 : static void
1342 27 : bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
1343 : {
1344 27 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1345 27 : struct spdk_bdev_desc *desc = bdev_io->internal.desc;
1346 27 : int rc = 0;
1347 :
1348 27 : assert(bdev_io->internal.f.has_bounce_buf);
1349 :
1350 27 : if (bdev_io_needs_metadata(desc, bdev_io)) {
1351 0 : assert(bdev_io->bdev->md_interleave);
1352 :
1353 0 : bdev_io->u.bdev.dif_check_flags &= ~SPDK_DIF_FLAGS_NVME_PRACT;
1354 :
1355 0 : if (!bdev_io_use_accel_sequence(bdev_io)) {
1356 0 : bdev_io->internal.accel_sequence = NULL;
1357 : }
1358 :
1359 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1360 0 : rc = spdk_accel_append_dif_generate_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1361 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1362 : bdev_io->u.bdev.memory_domain,
1363 : bdev_io->u.bdev.memory_domain_ctx,
1364 : bdev_io->internal.bounce_buf.orig_iovs,
1365 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1366 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1367 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1368 0 : bdev_io->u.bdev.num_blocks,
1369 0 : &bdev_io->u.bdev.dif_ctx,
1370 : NULL, NULL);
1371 : } else {
1372 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1373 0 : rc = spdk_accel_append_dif_verify_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1374 : bdev_io->internal.bounce_buf.orig_iovs,
1375 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1376 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1377 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1378 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1379 : bdev_io->u.bdev.memory_domain,
1380 : bdev_io->u.bdev.memory_domain_ctx,
1381 0 : bdev_io->u.bdev.num_blocks,
1382 0 : &bdev_io->u.bdev.dif_ctx,
1383 : &bdev_io->u.bdev.dif_err,
1384 : NULL, NULL);
1385 : }
1386 :
1387 0 : if (spdk_likely(rc == 0)) {
1388 0 : bdev_io->internal.f.has_accel_sequence = true;
1389 0 : bdev_io->u.bdev.accel_sequence = bdev_io->internal.accel_sequence;
1390 0 : } else if (rc != -ENOMEM) {
1391 0 : SPDK_ERRLOG("Failed to append generate/verify_copy to accel sequence: %p\n",
1392 : bdev_io->internal.accel_sequence);
1393 : }
1394 54 : } else if (bdev_io_needs_sequence_exec(desc, bdev_io) ||
1395 27 : (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
1396 : /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
1397 : * sequence, append a copy operation making accel change the src/dst buffers of the previous
1398 : * operation */
1399 0 : assert(bdev_io_use_accel_sequence(bdev_io));
1400 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1401 0 : rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1402 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1403 : NULL, NULL,
1404 : bdev_io->internal.bounce_buf.orig_iovs,
1405 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1406 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1407 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1408 : NULL, NULL);
1409 : } else {
1410 : /* We need to reverse the src/dst for reads */
1411 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1412 0 : rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1413 : bdev_io->internal.bounce_buf.orig_iovs,
1414 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1415 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1416 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1417 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1418 : NULL, NULL, NULL, NULL);
1419 : }
1420 :
1421 0 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
1422 0 : SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
1423 : bdev_io->internal.accel_sequence);
1424 : }
1425 27 : } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1426 : /* if this is write path, copy data from original buffer to bounce buffer */
1427 17 : if (bdev_io_use_memory_domain(bdev_io)) {
1428 3 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1429 3 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1430 6 : rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1431 : bdev_io->internal.memory_domain_ctx,
1432 : bdev_io->internal.bounce_buf.orig_iovs,
1433 3 : (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1434 : bdev_io->u.bdev.iovs, 1,
1435 : bdev_io_pull_data_done_and_track,
1436 : bdev_io);
1437 3 : if (rc == 0) {
1438 : /* Continue to submit IO in completion callback */
1439 2 : return;
1440 : }
1441 1 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1442 1 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1443 1 : if (rc != -ENOMEM) {
1444 0 : SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
1445 : spdk_memory_domain_get_dma_device_id(
1446 : bdev_io->internal.memory_domain));
1447 : }
1448 : } else {
1449 14 : assert(bdev_io->u.bdev.iovcnt == 1);
1450 28 : spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
1451 14 : bdev_io->u.bdev.iovs[0].iov_len,
1452 : bdev_io->internal.bounce_buf.orig_iovs,
1453 : bdev_io->internal.bounce_buf.orig_iovcnt);
1454 : }
1455 : }
1456 :
1457 25 : if (spdk_unlikely(rc == -ENOMEM)) {
1458 1 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1459 : } else {
1460 24 : bdev_io_pull_data_done(bdev_io, rc);
1461 : }
1462 : }
1463 :
1464 : static void
1465 26 : _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
1466 : bdev_copy_bounce_buffer_cpl cpl_cb)
1467 : {
1468 26 : struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;
1469 :
1470 26 : assert(bdev_io->internal.f.has_bounce_buf == false);
1471 :
1472 26 : bdev_io->internal.data_transfer_cpl = cpl_cb;
1473 26 : bdev_io->internal.f.has_bounce_buf = true;
1474 : /* save original iovec */
1475 26 : bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
1476 26 : bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
1477 : /* zero the other data members */
1478 26 : bdev_io->internal.bounce_buf.iov.iov_base = NULL;
1479 26 : bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
1480 26 : bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
1481 : /* set bounce iov */
1482 26 : bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
1483 26 : bdev_io->u.bdev.iovcnt = 1;
1484 : /* set bounce buffer for this operation */
1485 26 : bdev_io->u.bdev.iovs[0].iov_base = buf;
1486 26 : bdev_io->u.bdev.iovs[0].iov_len = len;
1487 : /* Now we use 1 iov, the split condition could have been changed */
1488 26 : bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
1489 :
1490 26 : if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1491 0 : bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1492 : } else {
1493 26 : bdev_io_pull_data(bdev_io);
1494 : }
1495 26 : }
1496 :
1497 : static void
1498 42 : _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
1499 : {
1500 42 : struct spdk_bdev *bdev = bdev_io->bdev;
1501 : bool buf_allocated;
1502 : uint64_t alignment;
1503 : void *aligned_buf;
1504 :
1505 42 : bdev_io->internal.buf.ptr = buf;
1506 42 : bdev_io->internal.f.has_buf = true;
1507 :
1508 42 : if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1509 0 : bdev_io_get_buf_complete(bdev_io, true);
1510 0 : return;
1511 : }
1512 :
1513 42 : alignment = spdk_bdev_get_buf_align(bdev);
1514 42 : buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
1515 42 : aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
1516 :
1517 42 : if (buf_allocated) {
1518 26 : _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
1519 : /* Continue in completion callback */
1520 26 : return;
1521 : } else {
1522 16 : spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
1523 : }
1524 :
1525 16 : _bdev_io_set_md_buf(bdev_io);
1526 : }
1527 :
1528 : static inline uint64_t
1529 42 : bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
1530 : {
1531 42 : struct spdk_bdev *bdev = bdev_io->bdev;
1532 : uint64_t md_len, alignment;
1533 :
1534 42 : md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1535 :
1536 : /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
1537 42 : alignment = spdk_bdev_get_buf_align(bdev) - 1;
1538 :
1539 42 : return len + alignment + md_len;
1540 : }
1541 :
1542 : static void
1543 42 : bdev_io_put_accel_buf(struct spdk_bdev_io *bdev_io)
1544 : {
1545 42 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1546 :
1547 42 : spdk_accel_put_buf(ch->accel_channel,
1548 : bdev_io->internal.buf.ptr,
1549 : bdev_io->u.bdev.memory_domain,
1550 : bdev_io->u.bdev.memory_domain_ctx);
1551 42 : }
1552 :
1553 : static void
1554 0 : _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1555 : {
1556 : struct spdk_bdev_mgmt_channel *ch;
1557 :
1558 0 : ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1559 0 : spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1560 0 : }
1561 :
1562 : static void
1563 42 : bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1564 : {
1565 42 : assert(bdev_io->internal.f.has_buf);
1566 :
1567 42 : if (bdev_io->u.bdev.memory_domain == spdk_accel_get_memory_domain()) {
1568 42 : bdev_io_put_accel_buf(bdev_io);
1569 : } else {
1570 0 : assert(bdev_io->u.bdev.memory_domain == NULL);
1571 0 : _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr,
1572 : bdev_io->internal.buf.len);
1573 : }
1574 42 : bdev_io->internal.buf.ptr = NULL;
1575 42 : bdev_io->internal.f.has_buf = false;
1576 42 : }
1577 :
1578 3 : SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
1579 : "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);
1580 :
1581 : void
1582 0 : spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1583 : {
1584 0 : uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1585 :
1586 0 : SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);
1587 :
1588 0 : assert(buf != NULL);
1589 0 : _bdev_io_put_buf(bdev_io, buf, len);
1590 0 : }
1591 :
1592 : static inline void
1593 566 : bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
1594 : struct spdk_bdev_io *bdev_io)
1595 : {
1596 : /* After a request is submitted to a bdev module, the ownership of an accel sequence
1597 : * associated with that bdev_io is transferred to the bdev module. So, clear the internal
1598 : * sequence pointer to make sure we won't touch it anymore. */
1599 566 : if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
1600 566 : bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
1601 0 : assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1602 0 : bdev_io->internal.f.has_accel_sequence = false;
1603 : }
1604 :
1605 : /* The generic bdev layer should not pass an I/O with a dif_check_flags set that
1606 : * the underlying bdev does not support. Add an assert to check this.
1607 : */
1608 566 : assert((bdev_io->type != SPDK_BDEV_IO_TYPE_WRITE &&
1609 : bdev_io->type != SPDK_BDEV_IO_TYPE_READ) ||
1610 : ((bdev_io->u.bdev.dif_check_flags & bdev->dif_check_flags) ==
1611 : bdev_io->u.bdev.dif_check_flags));
1612 :
1613 566 : bdev->fn_table->submit_request(ioch, bdev_io);
1614 566 : }
1615 :
1616 : static inline void
1617 25 : bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
1618 : {
1619 25 : struct spdk_bdev *bdev = bdev_io->bdev;
1620 :
1621 25 : bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
1622 25 : bdev_io->internal.error.nvme.cdw0 = 0;
1623 25 : bdev_io->num_retries++;
1624 25 : bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1625 25 : }
1626 :
1627 : static void
1628 93 : bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
1629 : {
1630 : struct spdk_bdev_io *bdev_io;
1631 :
1632 93 : if (shared_resource->nomem_abort_in_progress) {
1633 : /**
1634 : * We are aborting nomem I/Os, do not touch nomem_io list now.
1635 : */
1636 51 : return;
1637 : }
1638 :
1639 42 : if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1640 : /*
1641 : * Allow some more I/O to complete before retrying the nomem_io queue.
1642 : * Some drivers (such as nvme) cannot immediately take a new I/O in
1643 : * the context of a completion, because the resources for the I/O are
1644 : * not released until control returns to the bdev poller. Also, we
1645 : * may require several small I/O to complete before a larger I/O
1646 : * (that requires splitting) can be submitted.
1647 : */
1648 22 : return;
1649 : }
1650 :
1651 31 : while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1652 27 : bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1653 27 : TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1654 :
1655 27 : switch (bdev_io->internal.retry_state) {
1656 25 : case BDEV_IO_RETRY_STATE_SUBMIT:
1657 25 : bdev_ch_resubmit_io(shared_resource, bdev_io);
1658 25 : break;
1659 1 : case BDEV_IO_RETRY_STATE_PULL:
1660 1 : bdev_io_pull_data(bdev_io);
1661 1 : break;
1662 0 : case BDEV_IO_RETRY_STATE_PULL_MD:
1663 0 : bdev_io_pull_md_buf(bdev_io);
1664 0 : break;
1665 1 : case BDEV_IO_RETRY_STATE_PUSH:
1666 1 : bdev_io_push_bounce_data(bdev_io);
1667 1 : break;
1668 0 : case BDEV_IO_RETRY_STATE_PUSH_MD:
1669 0 : bdev_io_push_bounce_md_buf(bdev_io);
1670 0 : break;
1671 0 : case BDEV_IO_RETRY_STATE_GET_ACCEL_BUF:
1672 0 : _bdev_io_get_accel_buf(bdev_io);
1673 0 : break;
1674 0 : default:
1675 0 : assert(0 && "invalid retry state");
1676 : break;
1677 : }
1678 :
1679 27 : if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
1680 : /* This IO completed again with NOMEM status, so break the loop and
1681 : * don't try anymore. Note that a bdev_io that fails with NOMEM
1682 : * always gets requeued at the front of the list, to maintain
1683 : * ordering.
1684 : */
1685 16 : break;
1686 : }
1687 : }
1688 : }
1689 :
1690 : static void
1691 78 : bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1692 : {
1693 78 : bdev_shared_ch_retry_io(bdev_ch->shared_resource);
1694 78 : }
1695 :
1696 : static int
1697 0 : bdev_no_mem_poller(void *ctx)
1698 : {
1699 0 : struct spdk_bdev_shared_resource *shared_resource = ctx;
1700 :
1701 0 : spdk_poller_unregister(&shared_resource->nomem_poller);
1702 :
1703 0 : if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1704 0 : bdev_shared_ch_retry_io(shared_resource);
1705 : }
1706 : /* the retry cb may re-register the poller so double check */
1707 0 : if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
1708 0 : shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
1709 : /* No IOs were submitted, try again */
1710 0 : shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1711 : SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1712 : }
1713 :
1714 0 : return SPDK_POLLER_BUSY;
1715 : }
1716 :
1717 : static inline bool
1718 588 : _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1719 : {
1720 588 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1721 588 : struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1722 :
1723 588 : if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1724 21 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1725 21 : bdev_queue_nomem_io_head(shared_resource, bdev_io, state);
1726 :
1727 21 : if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
1728 : /* Special case when we have nomem IOs and no outstanding IOs which completions
1729 : * could trigger retry of queued IOs
1730 : * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no
1731 : * new IOs submitted, e.g. qd==1 */
1732 1 : shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1733 : SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1734 : }
1735 : /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
1736 : * ownership of that sequence is transferred back to the bdev layer, so we need to
1737 : * restore internal.accel_sequence to make sure that the sequence is handled
1738 : * correctly in case the I/O is later aborted. */
1739 21 : if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
1740 21 : bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
1741 0 : assert(!bdev_io_use_accel_sequence(bdev_io));
1742 0 : bdev_io->internal.f.has_accel_sequence = true;
1743 0 : bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
1744 : }
1745 :
1746 21 : return true;
1747 : }
1748 :
1749 567 : if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1750 78 : bdev_ch_retry_io(bdev_ch);
1751 : }
1752 :
1753 567 : return false;
1754 : }
1755 :
1756 : static void
1757 26 : _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1758 : {
1759 26 : struct spdk_bdev_io *bdev_io = ctx;
1760 26 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1761 :
1762 26 : if (rc) {
1763 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1764 : }
1765 : /* We want to free the bounce buffer here since we know we're done with it (as opposed
1766 : * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
1767 : */
1768 26 : bdev_io_put_buf(bdev_io);
1769 :
1770 26 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1771 0 : bdev_ch_retry_io(ch);
1772 : }
1773 :
1774 : /* Continue with IO completion flow */
1775 26 : bdev_io_complete(bdev_io);
1776 26 : }
1777 :
1778 : static void
1779 2 : bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
1780 : {
1781 2 : struct spdk_bdev_io *bdev_io = ctx;
1782 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1783 :
1784 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1785 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1786 2 : bdev_io->internal.f.has_bounce_buf = false;
1787 :
1788 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1789 0 : bdev_ch_retry_io(ch);
1790 : }
1791 :
1792 2 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1793 2 : }
1794 :
1795 : static inline void
1796 26 : bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
1797 : {
1798 26 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1799 26 : int rc = 0;
1800 :
1801 26 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1802 26 : assert(bdev_io->internal.f.has_bounce_buf);
1803 :
1804 : /* do the same for metadata buffer */
1805 26 : if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) {
1806 4 : assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1807 :
1808 4 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1809 2 : if (bdev_io_use_memory_domain(bdev_io)) {
1810 2 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1811 2 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1812 : /* If memory domain is used then we need to call async push function */
1813 4 : rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1814 : bdev_io->internal.memory_domain_ctx,
1815 : &bdev_io->internal.bounce_buf.orig_md_iov,
1816 2 : (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1817 : &bdev_io->internal.bounce_buf.md_iov, 1,
1818 : bdev_io_push_bounce_md_buf_done,
1819 : bdev_io);
1820 2 : if (rc == 0) {
1821 : /* Continue IO completion in async callback */
1822 2 : return;
1823 : }
1824 0 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1825 0 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1826 0 : if (rc != -ENOMEM) {
1827 0 : SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1828 : spdk_memory_domain_get_dma_device_id(
1829 : bdev_io->internal.memory_domain));
1830 : }
1831 : } else {
1832 0 : memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1833 : bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1834 : }
1835 : }
1836 : }
1837 :
1838 24 : if (spdk_unlikely(rc == -ENOMEM)) {
1839 0 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
1840 : } else {
1841 24 : assert(bdev_io->internal.data_transfer_cpl);
1842 24 : bdev_io->internal.f.has_bounce_buf = false;
1843 24 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1844 : }
1845 : }
1846 :
1847 : static inline void
1848 26 : bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
1849 : {
1850 26 : assert(bdev_io->internal.data_transfer_cpl);
1851 26 : if (rc) {
1852 0 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1853 0 : return;
1854 : }
1855 :
1856 : /* set original buffer for this io */
1857 26 : bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt;
1858 26 : bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs;
1859 :
1860 : /* We don't set bdev_io->internal.f.has_bounce_buf to false here because
1861 : * we still need to clear the md buf */
1862 :
1863 26 : bdev_io_push_bounce_md_buf(bdev_io);
1864 : }
1865 :
1866 : static void
1867 2 : bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
1868 : {
1869 2 : struct spdk_bdev_io *bdev_io = ctx;
1870 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1871 :
1872 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1873 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1874 :
1875 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1876 0 : bdev_ch_retry_io(ch);
1877 : }
1878 :
1879 2 : bdev_io_push_bounce_data_done(bdev_io, status);
1880 2 : }
1881 :
1882 : static inline void
1883 27 : bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
1884 : {
1885 27 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1886 27 : int rc = 0;
1887 :
1888 27 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1889 27 : assert(!bdev_io_use_accel_sequence(bdev_io));
1890 27 : assert(bdev_io->internal.f.has_bounce_buf);
1891 :
1892 : /* if this is read path, copy data from bounce buffer to original buffer */
1893 27 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1894 11 : if (bdev_io_use_memory_domain(bdev_io)) {
1895 3 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1896 3 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1897 : /* If memory domain is used then we need to call async push function */
1898 6 : rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1899 : bdev_io->internal.memory_domain_ctx,
1900 : bdev_io->internal.bounce_buf.orig_iovs,
1901 3 : (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1902 : &bdev_io->internal.bounce_buf.iov, 1,
1903 : bdev_io_push_bounce_data_done_and_track,
1904 : bdev_io);
1905 3 : if (rc == 0) {
1906 : /* Continue IO completion in async callback */
1907 2 : return;
1908 : }
1909 :
1910 1 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1911 1 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1912 1 : if (rc != -ENOMEM) {
1913 0 : SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1914 : spdk_memory_domain_get_dma_device_id(
1915 : bdev_io->internal.memory_domain));
1916 : }
1917 : } else {
1918 8 : spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs,
1919 : bdev_io->internal.bounce_buf.orig_iovcnt,
1920 : bdev_io->internal.bounce_buf.iov.iov_base,
1921 : bdev_io->internal.bounce_buf.iov.iov_len);
1922 : }
1923 : }
1924 :
1925 25 : if (spdk_unlikely(rc == -ENOMEM)) {
1926 1 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
1927 : } else {
1928 24 : bdev_io_push_bounce_data_done(bdev_io, rc);
1929 : }
1930 : }
1931 :
1932 : static inline void
1933 26 : _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1934 : {
1935 26 : bdev_io->internal.data_transfer_cpl = cpl_cb;
1936 26 : bdev_io_push_bounce_data(bdev_io);
1937 26 : }
1938 :
1939 : static void
1940 0 : bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1941 : {
1942 : struct spdk_bdev_io *bdev_io;
1943 :
1944 0 : bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1945 0 : _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
1946 0 : }
1947 :
1948 : static void
1949 42 : bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1950 : {
1951 : struct spdk_bdev_mgmt_channel *mgmt_ch;
1952 : uint64_t max_len;
1953 : void *buf;
1954 :
1955 42 : assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1956 42 : mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1957 42 : max_len = bdev_io_get_max_buf_len(bdev_io, len);
1958 :
1959 42 : if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) {
1960 0 : SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1961 0 : bdev_io_get_buf_complete(bdev_io, false);
1962 0 : return;
1963 : }
1964 :
1965 42 : bdev_io->internal.buf.len = len;
1966 42 : buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1967 : bdev_io_get_iobuf_cb);
1968 42 : if (buf != NULL) {
1969 42 : _bdev_io_set_buf(bdev_io, buf, len);
1970 : }
1971 : }
1972 :
1973 : void
1974 56 : spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1975 : {
1976 56 : struct spdk_bdev *bdev = bdev_io->bdev;
1977 : uint64_t alignment;
1978 :
1979 56 : assert(cb != NULL);
1980 56 : bdev_io->internal.get_buf_cb = cb;
1981 :
1982 56 : alignment = spdk_bdev_get_buf_align(bdev);
1983 :
1984 96 : if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1985 40 : _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1986 : /* Buffer already present and aligned */
1987 18 : cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1988 18 : return;
1989 : }
1990 :
1991 38 : bdev_io_get_buf(bdev_io, len);
1992 : }
1993 :
1994 : static void
1995 4 : _bdev_io_get_bounce_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1996 : uint64_t len)
1997 : {
1998 4 : assert(cb != NULL);
1999 4 : bdev_io->internal.get_buf_cb = cb;
2000 :
2001 4 : bdev_io_get_buf(bdev_io, len);
2002 4 : }
2003 :
2004 : static void
2005 0 : _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io)
2006 : {
2007 0 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2008 0 : void *buf;
2009 : int rc;
2010 :
2011 0 : rc = spdk_accel_get_buf(ch->accel_channel,
2012 : bdev_io->internal.buf.len,
2013 : &buf,
2014 : &bdev_io->u.bdev.memory_domain,
2015 : &bdev_io->u.bdev.memory_domain_ctx);
2016 0 : if (rc != 0) {
2017 0 : bdev_queue_nomem_io_tail(ch->shared_resource, bdev_io,
2018 : BDEV_IO_RETRY_STATE_GET_ACCEL_BUF);
2019 0 : return;
2020 : }
2021 :
2022 0 : _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
2023 : }
2024 :
2025 : static inline void
2026 0 : bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
2027 : uint64_t len)
2028 : {
2029 0 : bdev_io->internal.buf.len = len;
2030 0 : bdev_io->internal.get_buf_cb = cb;
2031 :
2032 0 : _bdev_io_get_accel_buf(bdev_io);
2033 0 : }
2034 :
2035 3 : SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf,
2036 : "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0);
2037 :
2038 : void
2039 0 : spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
2040 : {
2041 0 : uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2042 :
2043 0 : SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf);
2044 :
2045 0 : assert(cb != NULL);
2046 0 : assert(bdev_io->internal.get_aux_buf_cb == NULL);
2047 0 : bdev_io->internal.get_aux_buf_cb = cb;
2048 0 : bdev_io_get_buf(bdev_io, len);
2049 0 : }
2050 :
2051 : static int
2052 69 : bdev_module_get_max_ctx_size(void)
2053 : {
2054 : struct spdk_bdev_module *bdev_module;
2055 69 : int max_bdev_module_size = 0;
2056 :
2057 268 : TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2058 199 : if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
2059 68 : max_bdev_module_size = bdev_module->get_ctx_size();
2060 : }
2061 : }
2062 :
2063 69 : return max_bdev_module_size;
2064 : }
2065 :
2066 : static void
2067 0 : bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2068 : {
2069 0 : if (!bdev->internal.histogram_enabled) {
2070 0 : return;
2071 : }
2072 :
2073 0 : spdk_json_write_object_begin(w);
2074 0 : spdk_json_write_named_string(w, "method", "bdev_enable_histogram");
2075 :
2076 0 : spdk_json_write_named_object_begin(w, "params");
2077 0 : spdk_json_write_named_string(w, "name", bdev->name);
2078 :
2079 0 : spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled);
2080 :
2081 0 : if (bdev->internal.histogram_io_type) {
2082 0 : spdk_json_write_named_string(w, "opc",
2083 0 : spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type));
2084 : }
2085 :
2086 0 : spdk_json_write_object_end(w);
2087 :
2088 0 : spdk_json_write_object_end(w);
2089 : }
2090 :
2091 : static void
2092 0 : bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2093 : {
2094 : int i;
2095 0 : struct spdk_bdev_qos *qos = bdev->internal.qos;
2096 0 : uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
2097 :
2098 0 : if (!qos) {
2099 0 : return;
2100 : }
2101 :
2102 0 : spdk_bdev_get_qos_rate_limits(bdev, limits);
2103 :
2104 0 : spdk_json_write_object_begin(w);
2105 0 : spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
2106 :
2107 0 : spdk_json_write_named_object_begin(w, "params");
2108 0 : spdk_json_write_named_string(w, "name", bdev->name);
2109 0 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2110 0 : if (limits[i] > 0) {
2111 0 : spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
2112 : }
2113 : }
2114 0 : spdk_json_write_object_end(w);
2115 :
2116 0 : spdk_json_write_object_end(w);
2117 : }
2118 :
2119 : void
2120 0 : spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
2121 : {
2122 : struct spdk_bdev_module *bdev_module;
2123 : struct spdk_bdev *bdev;
2124 :
2125 0 : assert(w != NULL);
2126 :
2127 0 : spdk_json_write_array_begin(w);
2128 :
2129 0 : spdk_json_write_object_begin(w);
2130 0 : spdk_json_write_named_string(w, "method", "bdev_set_options");
2131 0 : spdk_json_write_named_object_begin(w, "params");
2132 0 : spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
2133 0 : spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
2134 0 : spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
2135 0 : spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size);
2136 0 : spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size);
2137 0 : spdk_json_write_object_end(w);
2138 0 : spdk_json_write_object_end(w);
2139 :
2140 0 : bdev_examine_allowlist_config_json(w);
2141 :
2142 0 : TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2143 0 : if (bdev_module->config_json) {
2144 0 : bdev_module->config_json(w);
2145 : }
2146 : }
2147 :
2148 0 : spdk_spin_lock(&g_bdev_mgr.spinlock);
2149 :
2150 0 : TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
2151 0 : if (bdev->fn_table->write_config_json) {
2152 0 : bdev->fn_table->write_config_json(bdev, w);
2153 : }
2154 :
2155 0 : bdev_qos_config_json(bdev, w);
2156 0 : bdev_enable_histogram_config_json(bdev, w);
2157 : }
2158 :
2159 0 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
2160 :
2161 : /* This has to be last RPC in array to make sure all bdevs finished examine */
2162 0 : spdk_json_write_object_begin(w);
2163 0 : spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
2164 0 : spdk_json_write_object_end(w);
2165 :
2166 0 : spdk_json_write_array_end(w);
2167 0 : }
2168 :
2169 : static void
2170 73 : bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
2171 : {
2172 73 : struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2173 : struct spdk_bdev_io *bdev_io;
2174 :
2175 73 : spdk_iobuf_channel_fini(&ch->iobuf);
2176 :
2177 10483 : while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
2178 10410 : bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2179 10410 : STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2180 10410 : ch->per_thread_cache_count--;
2181 10410 : spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2182 : }
2183 :
2184 73 : assert(ch->per_thread_cache_count == 0);
2185 73 : }
2186 :
2187 : static int
2188 73 : bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
2189 : {
2190 73 : struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2191 : struct spdk_bdev_io *bdev_io;
2192 : uint32_t i;
2193 : int rc;
2194 :
2195 73 : rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev",
2196 : g_bdev_opts.iobuf_small_cache_size,
2197 : g_bdev_opts.iobuf_large_cache_size);
2198 73 : if (rc != 0) {
2199 0 : SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
2200 0 : return -1;
2201 : }
2202 :
2203 73 : STAILQ_INIT(&ch->per_thread_cache);
2204 73 : ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
2205 :
2206 : /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
2207 73 : ch->per_thread_cache_count = 0;
2208 10483 : for (i = 0; i < ch->bdev_io_cache_size; i++) {
2209 10410 : bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2210 10410 : if (bdev_io == NULL) {
2211 0 : SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
2212 0 : assert(false);
2213 : bdev_mgmt_channel_destroy(io_device, ctx_buf);
2214 : return -1;
2215 : }
2216 10410 : ch->per_thread_cache_count++;
2217 10410 : STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2218 : }
2219 :
2220 73 : TAILQ_INIT(&ch->shared_resources);
2221 73 : TAILQ_INIT(&ch->io_wait_queue);
2222 :
2223 73 : return 0;
2224 : }
2225 :
2226 : static void
2227 69 : bdev_init_complete(int rc)
2228 : {
2229 69 : spdk_bdev_init_cb cb_fn = g_init_cb_fn;
2230 69 : void *cb_arg = g_init_cb_arg;
2231 : struct spdk_bdev_module *m;
2232 :
2233 69 : g_bdev_mgr.init_complete = true;
2234 69 : g_init_cb_fn = NULL;
2235 69 : g_init_cb_arg = NULL;
2236 :
2237 : /*
2238 : * For modules that need to know when subsystem init is complete,
2239 : * inform them now.
2240 : */
2241 69 : if (rc == 0) {
2242 268 : TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2243 199 : if (m->init_complete) {
2244 25 : m->init_complete();
2245 : }
2246 : }
2247 : }
2248 :
2249 69 : cb_fn(cb_arg, rc);
2250 69 : }
2251 :
2252 : static bool
2253 274 : bdev_module_all_actions_completed(void)
2254 : {
2255 : struct spdk_bdev_module *m;
2256 :
2257 1084 : TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2258 810 : if (m->internal.action_in_progress > 0) {
2259 0 : return false;
2260 : }
2261 : }
2262 274 : return true;
2263 : }
2264 :
2265 : static void
2266 631 : bdev_module_action_complete(void)
2267 : {
2268 : /*
2269 : * Don't finish bdev subsystem initialization if
2270 : * module pre-initialization is still in progress, or
2271 : * the subsystem been already initialized.
2272 : */
2273 631 : if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
2274 562 : return;
2275 : }
2276 :
2277 : /*
2278 : * Check all bdev modules for inits/examinations in progress. If any
2279 : * exist, return immediately since we cannot finish bdev subsystem
2280 : * initialization until all are completed.
2281 : */
2282 69 : if (!bdev_module_all_actions_completed()) {
2283 0 : return;
2284 : }
2285 :
2286 : /*
2287 : * Modules already finished initialization - now that all
2288 : * the bdev modules have finished their asynchronous I/O
2289 : * processing, the entire bdev layer can be marked as complete.
2290 : */
2291 69 : bdev_init_complete(0);
2292 : }
2293 :
2294 : static void
2295 562 : bdev_module_action_done(struct spdk_bdev_module *module)
2296 : {
2297 562 : spdk_spin_lock(&module->internal.spinlock);
2298 562 : assert(module->internal.action_in_progress > 0);
2299 562 : module->internal.action_in_progress--;
2300 562 : spdk_spin_unlock(&module->internal.spinlock);
2301 562 : bdev_module_action_complete();
2302 562 : }
2303 :
2304 : void
2305 69 : spdk_bdev_module_init_done(struct spdk_bdev_module *module)
2306 : {
2307 69 : assert(module->async_init);
2308 69 : bdev_module_action_done(module);
2309 69 : }
2310 :
/*
 * Signal that module finished an examine action; this releases one of the
 * module's in-progress actions.
 */
void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}
2316 :
2317 : /** The last initialized bdev module */
2318 : static struct spdk_bdev_module *g_resume_bdev_module = NULL;
2319 :
2320 : static void
2321 0 : bdev_init_failed(void *cb_arg)
2322 : {
2323 0 : struct spdk_bdev_module *module = cb_arg;
2324 :
2325 0 : spdk_spin_lock(&module->internal.spinlock);
2326 0 : assert(module->internal.action_in_progress > 0);
2327 0 : module->internal.action_in_progress--;
2328 0 : spdk_spin_unlock(&module->internal.spinlock);
2329 0 : bdev_init_complete(-1);
2330 0 : }
2331 :
2332 : static int
2333 69 : bdev_modules_init(void)
2334 : {
2335 : struct spdk_bdev_module *module;
2336 69 : int rc = 0;
2337 :
2338 268 : TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2339 199 : g_resume_bdev_module = module;
2340 199 : if (module->async_init) {
2341 69 : spdk_spin_lock(&module->internal.spinlock);
2342 69 : module->internal.action_in_progress = 1;
2343 69 : spdk_spin_unlock(&module->internal.spinlock);
2344 : }
2345 199 : rc = module->module_init();
2346 199 : if (rc != 0) {
2347 : /* Bump action_in_progress to prevent other modules from completion of modules_init
2348 : * Send message to defer application shutdown until resources are cleaned up */
2349 0 : spdk_spin_lock(&module->internal.spinlock);
2350 0 : module->internal.action_in_progress = 1;
2351 0 : spdk_spin_unlock(&module->internal.spinlock);
2352 0 : spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
2353 0 : return rc;
2354 : }
2355 : }
2356 :
2357 69 : g_resume_bdev_module = NULL;
2358 69 : return 0;
2359 : }
2360 :
2361 : void
2362 69 : spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
2363 : {
2364 69 : int rc = 0;
2365 69 : char mempool_name[32];
2366 :
2367 69 : assert(cb_fn != NULL);
2368 :
2369 69 : g_init_cb_fn = cb_fn;
2370 69 : g_init_cb_arg = cb_arg;
2371 :
2372 69 : spdk_notify_type_register("bdev_register");
2373 69 : spdk_notify_type_register("bdev_unregister");
2374 :
2375 69 : snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
2376 :
2377 69 : rc = spdk_iobuf_register_module("bdev");
2378 69 : if (rc != 0) {
2379 0 : SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
2380 0 : bdev_init_complete(-1);
2381 0 : return;
2382 : }
2383 :
2384 138 : g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
2385 69 : g_bdev_opts.bdev_io_pool_size,
2386 : sizeof(struct spdk_bdev_io) +
2387 69 : bdev_module_get_max_ctx_size(),
2388 : 0,
2389 : SPDK_ENV_NUMA_ID_ANY);
2390 :
2391 69 : if (g_bdev_mgr.bdev_io_pool == NULL) {
2392 0 : SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
2393 0 : bdev_init_complete(-1);
2394 0 : return;
2395 : }
2396 :
2397 69 : g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
2398 : NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
2399 69 : if (!g_bdev_mgr.zero_buffer) {
2400 0 : SPDK_ERRLOG("create bdev zero buffer failed\n");
2401 0 : bdev_init_complete(-1);
2402 0 : return;
2403 : }
2404 :
2405 : #ifdef SPDK_CONFIG_VTUNE
2406 : g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
2407 : #endif
2408 :
2409 69 : spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
2410 : bdev_mgmt_channel_destroy,
2411 : sizeof(struct spdk_bdev_mgmt_channel),
2412 : "bdev_mgr");
2413 :
2414 69 : rc = bdev_modules_init();
2415 69 : g_bdev_mgr.module_init_complete = true;
2416 69 : if (rc != 0) {
2417 0 : SPDK_ERRLOG("bdev modules init failed\n");
2418 0 : return;
2419 : }
2420 :
2421 69 : bdev_module_action_complete();
2422 : }
2423 :
2424 : static void
2425 69 : bdev_mgr_unregister_cb(void *io_device)
2426 : {
2427 69 : spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
2428 :
2429 69 : if (g_bdev_mgr.bdev_io_pool) {
2430 69 : if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
2431 0 : SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
2432 : spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
2433 : g_bdev_opts.bdev_io_pool_size);
2434 : }
2435 :
2436 69 : spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
2437 : }
2438 :
2439 69 : spdk_free(g_bdev_mgr.zero_buffer);
2440 :
2441 69 : bdev_examine_allowlist_free();
2442 :
2443 69 : cb_fn(g_fini_cb_arg);
2444 69 : g_fini_cb_fn = NULL;
2445 69 : g_fini_cb_arg = NULL;
2446 69 : g_bdev_mgr.init_complete = false;
2447 69 : g_bdev_mgr.module_init_complete = false;
2448 69 : }
2449 :
2450 : static void
2451 69 : bdev_module_fini_iter(void *arg)
2452 : {
2453 : struct spdk_bdev_module *bdev_module;
2454 :
2455 : /* FIXME: Handling initialization failures is broken now,
2456 : * so we won't even try cleaning up after successfully
2457 : * initialized modules. if module_init_complete is false,
2458 : * just call spdk_bdev_mgr_unregister_cb
2459 : */
2460 69 : if (!g_bdev_mgr.module_init_complete) {
2461 0 : bdev_mgr_unregister_cb(NULL);
2462 0 : return;
2463 : }
2464 :
2465 : /* Start iterating from the last touched module */
2466 69 : if (!g_resume_bdev_module) {
2467 69 : bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2468 : } else {
2469 0 : bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
2470 : internal.tailq);
2471 : }
2472 :
2473 268 : while (bdev_module) {
2474 199 : if (bdev_module->async_fini) {
2475 : /* Save our place so we can resume later. We must
2476 : * save the variable here, before calling module_fini()
2477 : * below, because in some cases the module may immediately
2478 : * call spdk_bdev_module_fini_done() and re-enter
2479 : * this function to continue iterating. */
2480 0 : g_resume_bdev_module = bdev_module;
2481 : }
2482 :
2483 199 : if (bdev_module->module_fini) {
2484 199 : bdev_module->module_fini();
2485 : }
2486 :
2487 199 : if (bdev_module->async_fini) {
2488 0 : return;
2489 : }
2490 :
2491 199 : bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
2492 : internal.tailq);
2493 : }
2494 :
2495 69 : g_resume_bdev_module = NULL;
2496 69 : spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
2497 : }
2498 :
2499 : void
2500 0 : spdk_bdev_module_fini_done(void)
2501 : {
2502 0 : if (spdk_get_thread() != g_fini_thread) {
2503 0 : spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
2504 : } else {
2505 0 : bdev_module_fini_iter(NULL);
2506 : }
2507 0 : }
2508 :
2509 : static void
2510 69 : bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
2511 : {
2512 69 : struct spdk_bdev *bdev = cb_arg;
2513 :
2514 69 : if (bdeverrno && bdev) {
2515 0 : SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
2516 : bdev->name);
2517 :
2518 : /*
2519 : * Since the call to spdk_bdev_unregister() failed, we have no way to free this
2520 : * bdev; try to continue by manually removing this bdev from the list and continue
2521 : * with the next bdev in the list.
2522 : */
2523 0 : TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2524 : }
2525 :
2526 69 : if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2527 69 : SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2528 : /*
2529 : * Bdev module finish need to be deferred as we might be in the middle of some context
2530 : * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2531 : * after returning.
2532 : */
2533 69 : spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2534 69 : return;
2535 : }
2536 :
2537 : /*
2538 : * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
2539 : * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2540 : * to detect clean shutdown as opposed to run-time hot removal of the underlying
2541 : * base bdevs.
2542 : *
2543 : * Also, walk the list in the reverse order.
2544 : */
2545 0 : for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2546 0 : bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2547 0 : spdk_spin_lock(&bdev->internal.spinlock);
2548 0 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2549 0 : LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2550 0 : spdk_spin_unlock(&bdev->internal.spinlock);
2551 0 : continue;
2552 : }
2553 0 : spdk_spin_unlock(&bdev->internal.spinlock);
2554 :
2555 0 : SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2556 0 : spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2557 0 : return;
2558 : }
2559 :
2560 : /*
2561 : * If any bdev fails to unclaim underlying bdev properly, we may face the
2562 : * case of bdev list consisting of claimed bdevs only (if claims are managed
2563 : * correctly, this would mean there's a loop in the claims graph which is
2564 : * clearly impossible). Warn and unregister last bdev on the list then.
2565 : */
2566 0 : for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2567 0 : bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2568 0 : SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2569 0 : spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2570 0 : return;
2571 : }
2572 : }
2573 :
2574 : static void
2575 69 : bdev_module_fini_start_iter(void *arg)
2576 : {
2577 : struct spdk_bdev_module *bdev_module;
2578 :
2579 69 : if (!g_resume_bdev_module) {
2580 69 : bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2581 : } else {
2582 0 : bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2583 : }
2584 :
2585 268 : while (bdev_module) {
2586 199 : if (bdev_module->async_fini_start) {
2587 : /* Save our place so we can resume later. We must
2588 : * save the variable here, before calling fini_start()
2589 : * below, because in some cases the module may immediately
2590 : * call spdk_bdev_module_fini_start_done() and re-enter
2591 : * this function to continue iterating. */
2592 0 : g_resume_bdev_module = bdev_module;
2593 : }
2594 :
2595 199 : if (bdev_module->fini_start) {
2596 25 : bdev_module->fini_start();
2597 : }
2598 :
2599 199 : if (bdev_module->async_fini_start) {
2600 0 : return;
2601 : }
2602 :
2603 199 : bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
2604 : }
2605 :
2606 69 : g_resume_bdev_module = NULL;
2607 :
2608 69 : bdev_finish_unregister_bdevs_iter(NULL, 0);
2609 : }
2610 :
2611 : void
2612 0 : spdk_bdev_module_fini_start_done(void)
2613 : {
2614 0 : if (spdk_get_thread() != g_fini_thread) {
2615 0 : spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
2616 : } else {
2617 0 : bdev_module_fini_start_iter(NULL);
2618 : }
2619 0 : }
2620 :
/*
 * Callback for spdk_bdev_wait_for_examine() during shutdown: starts the
 * fini_start pass over the bdev modules. cb_arg is unused.
 */
static void
bdev_finish_wait_for_examine_done(void *cb_arg)
{
	bdev_module_fini_start_iter(NULL);
}
2626 :
2627 : static void bdev_open_async_fini(void);
2628 :
2629 : void
2630 69 : spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
2631 : {
2632 : int rc;
2633 :
2634 69 : assert(cb_fn != NULL);
2635 :
2636 69 : g_fini_thread = spdk_get_thread();
2637 :
2638 69 : g_fini_cb_fn = cb_fn;
2639 69 : g_fini_cb_arg = cb_arg;
2640 :
2641 69 : bdev_open_async_fini();
2642 :
2643 69 : rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
2644 69 : if (rc != 0) {
2645 0 : SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
2646 0 : bdev_finish_wait_for_examine_done(NULL);
2647 : }
2648 69 : }
2649 :
2650 : struct spdk_bdev_io *
2651 716 : bdev_channel_get_io(struct spdk_bdev_channel *channel)
2652 : {
2653 716 : struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
2654 : struct spdk_bdev_io *bdev_io;
2655 :
2656 716 : if (ch->per_thread_cache_count > 0) {
2657 656 : bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2658 656 : STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2659 656 : ch->per_thread_cache_count--;
2660 60 : } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
2661 : /*
2662 : * Don't try to look for bdev_ios in the global pool if there are
2663 : * waiters on bdev_ios - we don't want this caller to jump the line.
2664 : */
2665 0 : bdev_io = NULL;
2666 : } else {
2667 60 : bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2668 : }
2669 :
2670 716 : return bdev_io;
2671 : }
2672 :
2673 : void
2674 710 : spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2675 : {
2676 : struct spdk_bdev_mgmt_channel *ch;
2677 :
2678 710 : assert(bdev_io != NULL);
2679 710 : assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
2680 :
2681 710 : ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
2682 :
2683 710 : if (bdev_io->internal.f.has_buf) {
2684 16 : bdev_io_put_buf(bdev_io);
2685 : }
2686 :
2687 710 : if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
2688 656 : ch->per_thread_cache_count++;
2689 656 : STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2690 660 : while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
2691 : struct spdk_bdev_io_wait_entry *entry;
2692 :
2693 4 : entry = TAILQ_FIRST(&ch->io_wait_queue);
2694 4 : TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
2695 4 : entry->cb_fn(entry->cb_arg);
2696 : }
2697 : } else {
2698 : /* We should never have a full cache with entries on the io wait queue. */
2699 54 : assert(TAILQ_EMPTY(&ch->io_wait_queue));
2700 54 : spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2701 : }
2702 710 : }
2703 :
2704 : static bool
2705 72 : bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2706 : {
2707 72 : assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2708 :
2709 72 : switch (limit) {
2710 18 : case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2711 18 : return true;
2712 54 : case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2713 : case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2714 : case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2715 54 : return false;
2716 0 : case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2717 : default:
2718 0 : return false;
2719 : }
2720 : }
2721 :
2722 : static bool
2723 25 : bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2724 : {
2725 25 : switch (bdev_io->type) {
2726 23 : case SPDK_BDEV_IO_TYPE_NVME_IO:
2727 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2728 : case SPDK_BDEV_IO_TYPE_READ:
2729 : case SPDK_BDEV_IO_TYPE_WRITE:
2730 23 : return true;
2731 0 : case SPDK_BDEV_IO_TYPE_ZCOPY:
2732 0 : if (bdev_io->u.bdev.zcopy.start) {
2733 0 : return true;
2734 : } else {
2735 0 : return false;
2736 : }
2737 2 : default:
2738 2 : return false;
2739 : }
2740 : }
2741 :
2742 : static bool
2743 33 : bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2744 : {
2745 33 : switch (bdev_io->type) {
2746 0 : case SPDK_BDEV_IO_TYPE_NVME_IO:
2747 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2748 : /* Bit 1 (0x2) set for read operation */
2749 0 : if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2750 0 : return true;
2751 : } else {
2752 0 : return false;
2753 : }
2754 30 : case SPDK_BDEV_IO_TYPE_READ:
2755 30 : return true;
2756 0 : case SPDK_BDEV_IO_TYPE_ZCOPY:
2757 : /* Populate to read from disk */
2758 0 : if (bdev_io->u.bdev.zcopy.populate) {
2759 0 : return true;
2760 : } else {
2761 0 : return false;
2762 : }
2763 3 : default:
2764 3 : return false;
2765 : }
2766 : }
2767 :
2768 : static uint64_t
2769 43 : bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2770 : {
2771 43 : uint32_t blocklen = bdev_io_get_block_size(bdev_io);
2772 :
2773 43 : switch (bdev_io->type) {
2774 0 : case SPDK_BDEV_IO_TYPE_NVME_IO:
2775 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2776 0 : return bdev_io->u.nvme_passthru.nbytes;
2777 43 : case SPDK_BDEV_IO_TYPE_READ:
2778 : case SPDK_BDEV_IO_TYPE_WRITE:
2779 43 : return bdev_io->u.bdev.num_blocks * blocklen;
2780 0 : case SPDK_BDEV_IO_TYPE_ZCOPY:
2781 : /* Track the data in the start phase only */
2782 0 : if (bdev_io->u.bdev.zcopy.start) {
2783 0 : return bdev_io->u.bdev.num_blocks * blocklen;
2784 : } else {
2785 0 : return 0;
2786 : }
2787 0 : default:
2788 0 : return 0;
2789 : }
2790 : }
2791 :
2792 : static inline bool
2793 64 : bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2794 : {
2795 : int64_t remaining_this_timeslice;
2796 :
2797 64 : if (!limit->max_per_timeslice) {
2798 : /* The QoS is disabled */
2799 0 : return false;
2800 : }
2801 :
2802 64 : remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2803 : __ATOMIC_RELAXED);
2804 64 : if (remaining_this_timeslice + (int64_t)delta > 0) {
2805 : /* There was still a quota for this delta -> the IO shouldn't be queued
2806 : *
2807 : * We allow a slight quota overrun here so an IO bigger than the per-timeslice
2808 : * quota can be allowed once a while. Such overrun then taken into account in
2809 : * the QoS poller, where the next timeslice quota is calculated.
2810 : */
2811 59 : return false;
2812 : }
2813 :
2814 : /* There was no quota for this delta -> the IO should be queued
2815 : * The remaining_this_timeslice must be rewinded so it reflects the real
2816 : * amount of IOs or bytes allowed.
2817 : */
2818 5 : __atomic_add_fetch(
2819 5 : &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2820 5 : return true;
2821 : }
2822 :
2823 : static inline void
2824 5 : bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2825 : {
2826 5 : __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2827 5 : }
2828 :
/* queue_io handler for the read/write IOPS limit: every I/O costs one unit. */
static bool
bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	const uint64_t cost = 1;

	return bdev_qos_rw_queue_io(limit, io, cost);
}
2834 :
/* rewind_quota handler for the read/write IOPS limit: returns one unit. */
static void
bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	const uint64_t cost = 1;

	bdev_qos_rw_rewind_io(limit, io, cost);
}
2840 :
/* queue_io handler for the read/write BPS limit: the I/O costs its size in bytes. */
static bool
bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	uint64_t nbytes = bdev_get_io_size_in_byte(io);

	return bdev_qos_rw_queue_io(limit, io, nbytes);
}
2846 :
/* rewind_quota handler for the read/write BPS limit: returns the I/O's byte size. */
static void
bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	uint64_t nbytes = bdev_get_io_size_in_byte(io);

	bdev_qos_rw_rewind_io(limit, io, nbytes);
}
2852 :
/*
 * queue_io handler for the read BPS limit.  Non-read I/O is never queued by
 * this limit; reads are charged by their byte size.
 */
static bool
bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	return bdev_is_read_io(io) && bdev_qos_rw_bps_queue(limit, io);
}
2862 :
/* rewind_quota handler for the read BPS limit: only reads give bytes back. */
static void
bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (!bdev_is_read_io(io)) {
		return;
	}

	bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
}
2870 :
/*
 * queue_io handler for the write BPS limit.  Read I/O is never queued by this
 * limit; everything else is charged by its byte size.
 */
static bool
bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	return !bdev_is_read_io(io) && bdev_qos_rw_bps_queue(limit, io);
}
2880 :
/* rewind_quota handler for the write BPS limit: only non-read I/O gives bytes back. */
static void
bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io)) {
		return;
	}

	bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
}
2888 :
2889 : static void
2890 10 : bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2891 : {
2892 : int i;
2893 :
2894 50 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2895 40 : if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2896 15 : qos->rate_limits[i].queue_io = NULL;
2897 15 : continue;
2898 : }
2899 :
2900 25 : switch (i) {
2901 9 : case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2902 9 : qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue;
2903 9 : qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota;
2904 9 : break;
2905 7 : case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2906 7 : qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue;
2907 7 : qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota;
2908 7 : break;
2909 5 : case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2910 5 : qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue;
2911 5 : qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota;
2912 5 : break;
2913 4 : case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2914 4 : qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue;
2915 4 : qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota;
2916 4 : break;
2917 0 : default:
2918 0 : break;
2919 : }
2920 : }
2921 10 : }
2922 :
2923 : static void
2924 6 : _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2925 : struct spdk_bdev_io *bdev_io,
2926 : enum spdk_bdev_io_status status)
2927 : {
2928 6 : bdev_io->internal.f.in_submit_request = true;
2929 6 : bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource);
2930 6 : spdk_bdev_io_complete(bdev_io, status);
2931 6 : bdev_io->internal.f.in_submit_request = false;
2932 6 : }
2933 :
2934 : static inline void
2935 590 : bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2936 : {
2937 590 : struct spdk_bdev *bdev = bdev_io->bdev;
2938 590 : struct spdk_io_channel *ch = bdev_ch->channel;
2939 590 : struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2940 :
2941 590 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2942 16 : struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2943 16 : struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2944 :
2945 32 : if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2946 16 : bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2947 0 : _bdev_io_complete_in_submit(bdev_ch, bdev_io,
2948 : SPDK_BDEV_IO_STATUS_SUCCESS);
2949 0 : return;
2950 : }
2951 : }
2952 :
2953 590 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2954 : bdev_io->bdev->split_on_write_unit &&
2955 : bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2956 4 : SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n",
2957 : bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2958 4 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2959 4 : return;
2960 : }
2961 :
2962 586 : if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2963 528 : bdev_io_increment_outstanding(bdev_ch, shared_resource);
2964 528 : bdev_io->internal.f.in_submit_request = true;
2965 528 : bdev_submit_request(bdev, ch, bdev_io);
2966 528 : bdev_io->internal.f.in_submit_request = false;
2967 : } else {
2968 58 : bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
2969 58 : if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) {
2970 : /* Special case when we have nomem IOs and no outstanding IOs which completions
2971 : * could trigger retry of queued IOs */
2972 15 : bdev_shared_ch_retry_io(shared_resource);
2973 : }
2974 : }
2975 : }
2976 :
2977 : static bool
2978 25 : bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2979 : {
2980 : int i;
2981 :
2982 25 : if (bdev_qos_io_to_limit(bdev_io) == true) {
2983 100 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2984 82 : if (!qos->rate_limits[i].queue_io) {
2985 5 : continue;
2986 : }
2987 :
2988 77 : if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2989 : bdev_io) == true) {
2990 10 : for (i -= 1; i >= 0 ; i--) {
2991 5 : if (!qos->rate_limits[i].queue_io) {
2992 0 : continue;
2993 : }
2994 :
2995 5 : qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io);
2996 : }
2997 5 : return true;
2998 : }
2999 : }
3000 : }
3001 :
3002 20 : return false;
3003 : }
3004 :
3005 : static int
3006 27 : bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
3007 : {
3008 27 : struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL;
3009 27 : int submitted_ios = 0;
3010 :
3011 52 : TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) {
3012 25 : if (!bdev_qos_queue_io(qos, bdev_io)) {
3013 20 : TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link);
3014 20 : bdev_io_do_submit(ch, bdev_io);
3015 :
3016 20 : submitted_ios++;
3017 : }
3018 : }
3019 :
3020 27 : return submitted_ios;
3021 : }
3022 :
3023 : static void
3024 2 : bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
3025 : {
3026 : int rc;
3027 :
3028 2 : bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
3029 2 : bdev_io->internal.waitq_entry.cb_fn = cb_fn;
3030 2 : bdev_io->internal.waitq_entry.cb_arg = bdev_io;
3031 2 : rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
3032 : &bdev_io->internal.waitq_entry);
3033 2 : if (rc != 0) {
3034 0 : SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
3035 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3036 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3037 : }
3038 2 : }
3039 :
3040 : static bool
3041 637 : bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
3042 : {
3043 : uint32_t io_boundary;
3044 637 : struct spdk_bdev *bdev = bdev_io->bdev;
3045 637 : uint32_t max_segment_size = bdev->max_segment_size;
3046 637 : uint32_t max_size = bdev->max_rw_size;
3047 637 : int max_segs = bdev->max_num_segments;
3048 :
3049 637 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3050 24 : io_boundary = bdev->write_unit_size;
3051 613 : } else if (bdev->split_on_optimal_io_boundary) {
3052 168 : io_boundary = bdev->optimal_io_boundary;
3053 : } else {
3054 445 : io_boundary = 0;
3055 : }
3056 :
3057 637 : if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) {
3058 259 : return false;
3059 : }
3060 :
3061 378 : if (io_boundary) {
3062 : uint64_t start_stripe, end_stripe;
3063 :
3064 192 : start_stripe = bdev_io->u.bdev.offset_blocks;
3065 192 : end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
3066 : /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
3067 192 : if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
3068 192 : start_stripe >>= spdk_u32log2(io_boundary);
3069 192 : end_stripe >>= spdk_u32log2(io_boundary);
3070 : } else {
3071 0 : start_stripe /= io_boundary;
3072 0 : end_stripe /= io_boundary;
3073 : }
3074 :
3075 192 : if (start_stripe != end_stripe) {
3076 75 : return true;
3077 : }
3078 : }
3079 :
3080 303 : if (max_segs) {
3081 150 : if (bdev_io->u.bdev.iovcnt > max_segs) {
3082 15 : return true;
3083 : }
3084 : }
3085 :
3086 288 : if (max_segment_size) {
3087 470 : for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
3088 346 : if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) {
3089 12 : return true;
3090 : }
3091 : }
3092 : }
3093 :
3094 276 : if (max_size) {
3095 52 : if (bdev_io->u.bdev.num_blocks > max_size) {
3096 7 : return true;
3097 : }
3098 : }
3099 :
3100 269 : return false;
3101 : }
3102 :
3103 : static bool
3104 24 : bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
3105 : {
3106 : uint32_t num_unmap_segments;
3107 :
3108 24 : if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
3109 3 : return false;
3110 : }
3111 21 : num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
3112 21 : if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
3113 4 : return true;
3114 : }
3115 :
3116 17 : return false;
3117 : }
3118 :
3119 : static bool
3120 37 : bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
3121 : {
3122 37 : if (!bdev_io->bdev->max_write_zeroes) {
3123 4 : return false;
3124 : }
3125 :
3126 33 : if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
3127 10 : return true;
3128 : }
3129 :
3130 23 : return false;
3131 : }
3132 :
3133 : static bool
3134 30 : bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
3135 : {
3136 30 : if (bdev_io->bdev->max_copy != 0 &&
3137 25 : bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
3138 6 : return true;
3139 : }
3140 :
3141 24 : return false;
3142 : }
3143 :
3144 : static bool
3145 811 : bdev_io_should_split(struct spdk_bdev_io *bdev_io)
3146 : {
3147 811 : switch (bdev_io->type) {
3148 637 : case SPDK_BDEV_IO_TYPE_READ:
3149 : case SPDK_BDEV_IO_TYPE_WRITE:
3150 637 : return bdev_rw_should_split(bdev_io);
3151 24 : case SPDK_BDEV_IO_TYPE_UNMAP:
3152 24 : return bdev_unmap_should_split(bdev_io);
3153 37 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3154 37 : return bdev_write_zeroes_should_split(bdev_io);
3155 30 : case SPDK_BDEV_IO_TYPE_COPY:
3156 30 : return bdev_copy_should_split(bdev_io);
3157 83 : default:
3158 83 : return false;
3159 : }
3160 : }
3161 :
/*
 * Return the distance from 'offset' to the next multiple of 'boundary'.
 * An offset that already sits on a boundary yields a full 'boundary', never 0.
 * 'boundary' must be non-zero.
 */
static uint32_t
_to_next_boundary(uint64_t offset, uint32_t boundary)
{
	uint64_t past_boundary = offset % boundary;

	return boundary - past_boundary;
}
3167 :
3168 : static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
3169 :
3170 : static void _bdev_rw_split(void *_bdev_io);
3171 :
3172 : static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
3173 :
/*
 * void*-taking adapter around bdev_unmap_split() so it can be used as an
 * io_wait callback.  '_bdev_io' is the parent struct spdk_bdev_io.
 */
static void
_bdev_unmap_split(void *_bdev_io)
{
	/* No 'return <expr>;' here: ISO C forbids it in a function returning void. */
	bdev_unmap_split(_bdev_io);
}
3179 :
3180 : static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
3181 :
/*
 * void*-taking adapter around bdev_write_zeroes_split() so it can be used as
 * an io_wait callback.  '_bdev_io' is the parent struct spdk_bdev_io.
 */
static void
_bdev_write_zeroes_split(void *_bdev_io)
{
	/* No 'return <expr>;' here: ISO C forbids it in a function returning void. */
	bdev_write_zeroes_split(_bdev_io);
}
3187 :
3188 : static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
3189 :
/*
 * void*-taking adapter around bdev_copy_split() so it can be used as an
 * io_wait callback.  '_bdev_io' is the parent struct spdk_bdev_io.
 */
static void
_bdev_copy_split(void *_bdev_io)
{
	/* No 'return <expr>;' here: ISO C forbids it in a function returning void. */
	bdev_copy_split(_bdev_io);
}
3195 :
3196 : static int
3197 305 : bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
3198 : uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
3199 : {
3200 : int rc;
3201 : uint64_t current_offset, current_remaining, current_src_offset;
3202 : spdk_bdev_io_wait_cb io_wait_fn;
3203 :
3204 305 : current_offset = *offset;
3205 305 : current_remaining = *remaining;
3206 :
3207 305 : assert(bdev_io->internal.f.split);
3208 :
3209 305 : bdev_io->internal.split.outstanding++;
3210 :
3211 305 : io_wait_fn = _bdev_rw_split;
3212 305 : switch (bdev_io->type) {
3213 196 : case SPDK_BDEV_IO_TYPE_READ:
3214 196 : assert(bdev_io->u.bdev.accel_sequence == NULL);
3215 784 : rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
3216 196 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3217 : iov, iovcnt, md_buf, current_offset,
3218 : num_blocks,
3219 196 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3220 196 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3221 : NULL,
3222 : bdev_io->u.bdev.dif_check_flags,
3223 : bdev_io_split_done, bdev_io);
3224 196 : break;
3225 50 : case SPDK_BDEV_IO_TYPE_WRITE:
3226 50 : assert(bdev_io->u.bdev.accel_sequence == NULL);
3227 200 : rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
3228 50 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3229 : iov, iovcnt, md_buf, current_offset,
3230 : num_blocks,
3231 50 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3232 50 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3233 : NULL,
3234 : bdev_io->u.bdev.dif_check_flags,
3235 : bdev_io->u.bdev.nvme_cdw12.raw,
3236 : bdev_io->u.bdev.nvme_cdw13.raw,
3237 : bdev_io_split_done, bdev_io);
3238 50 : break;
3239 17 : case SPDK_BDEV_IO_TYPE_UNMAP:
3240 17 : io_wait_fn = _bdev_unmap_split;
3241 17 : rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
3242 17 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3243 : current_offset, num_blocks,
3244 : bdev_io_split_done, bdev_io);
3245 17 : break;
3246 23 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3247 23 : io_wait_fn = _bdev_write_zeroes_split;
3248 23 : rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
3249 23 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3250 : current_offset, num_blocks,
3251 : bdev_io_split_done, bdev_io);
3252 23 : break;
3253 19 : case SPDK_BDEV_IO_TYPE_COPY:
3254 19 : io_wait_fn = _bdev_copy_split;
3255 19 : current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
3256 19 : (current_offset - bdev_io->u.bdev.offset_blocks);
3257 19 : rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
3258 19 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3259 : current_offset, current_src_offset, num_blocks,
3260 : bdev_io_split_done, bdev_io);
3261 19 : break;
3262 0 : default:
3263 0 : assert(false);
3264 : rc = -EINVAL;
3265 : break;
3266 : }
3267 :
3268 305 : if (rc == 0) {
3269 301 : current_offset += num_blocks;
3270 301 : current_remaining -= num_blocks;
3271 301 : bdev_io->internal.split.current_offset_blocks = current_offset;
3272 301 : bdev_io->internal.split.remaining_num_blocks = current_remaining;
3273 301 : *offset = current_offset;
3274 301 : *remaining = current_remaining;
3275 : } else {
3276 4 : bdev_io->internal.split.outstanding--;
3277 4 : if (rc == -ENOMEM) {
3278 4 : if (bdev_io->internal.split.outstanding == 0) {
3279 : /* No I/O is outstanding. Hence we should wait here. */
3280 1 : bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
3281 : }
3282 : } else {
3283 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3284 0 : if (bdev_io->internal.split.outstanding == 0) {
3285 0 : bdev_ch_remove_from_io_submitted(bdev_io);
3286 0 : spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3287 : 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3288 : bdev_io->internal.ch->queue_depth);
3289 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3290 : }
3291 : }
3292 : }
3293 :
3294 305 : return rc;
3295 : }
3296 :
3297 : static void
3298 67 : _bdev_rw_split(void *_bdev_io)
3299 : {
3300 : struct iovec *parent_iov, *iov;
3301 67 : struct spdk_bdev_io *bdev_io = _bdev_io;
3302 67 : struct spdk_bdev *bdev = bdev_io->bdev;
3303 67 : uint64_t parent_offset, current_offset, remaining;
3304 : uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
3305 : uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
3306 : uint32_t iovcnt, iov_len, child_iovsize;
3307 : uint32_t blocklen;
3308 : uint32_t io_boundary;
3309 67 : uint32_t max_segment_size = bdev->max_segment_size;
3310 67 : uint32_t max_child_iovcnt = bdev->max_num_segments;
3311 67 : uint32_t max_size = bdev->max_rw_size;
3312 67 : void *md_buf = NULL;
3313 : int rc;
3314 :
3315 67 : blocklen = bdev_io_get_block_size(bdev_io);
3316 :
3317 67 : max_size = max_size ? max_size : UINT32_MAX;
3318 67 : max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
3319 67 : max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
3320 : SPDK_BDEV_IO_NUM_CHILD_IOV;
3321 :
3322 67 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3323 5 : io_boundary = bdev->write_unit_size;
3324 62 : } else if (bdev->split_on_optimal_io_boundary) {
3325 40 : io_boundary = bdev->optimal_io_boundary;
3326 : } else {
3327 22 : io_boundary = UINT32_MAX;
3328 : }
3329 :
3330 67 : assert(bdev_io->internal.f.split);
3331 :
3332 67 : remaining = bdev_io->internal.split.remaining_num_blocks;
3333 67 : current_offset = bdev_io->internal.split.current_offset_blocks;
3334 67 : parent_offset = bdev_io->u.bdev.offset_blocks;
3335 67 : parent_iov_offset = (current_offset - parent_offset) * blocklen;
3336 67 : parent_iovcnt = bdev_io->u.bdev.iovcnt;
3337 :
3338 420 : for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
3339 420 : parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3340 420 : if (parent_iov_offset < parent_iov->iov_len) {
3341 67 : break;
3342 : }
3343 353 : parent_iov_offset -= parent_iov->iov_len;
3344 : }
3345 :
3346 67 : child_iovcnt = 0;
3347 309 : while (remaining > 0 && parent_iovpos < parent_iovcnt &&
3348 : child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
3349 249 : to_next_boundary = _to_next_boundary(current_offset, io_boundary);
3350 249 : to_next_boundary = spdk_min(remaining, to_next_boundary);
3351 249 : to_next_boundary = spdk_min(max_size, to_next_boundary);
3352 249 : to_next_boundary_bytes = to_next_boundary * blocklen;
3353 :
3354 249 : iov = &bdev_io->child_iov[child_iovcnt];
3355 249 : iovcnt = 0;
3356 :
3357 249 : if (bdev_io->u.bdev.md_buf) {
3358 48 : md_buf = (char *)bdev_io->u.bdev.md_buf +
3359 24 : (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
3360 : }
3361 :
3362 249 : child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
3363 974 : while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
3364 : iovcnt < child_iovsize) {
3365 725 : parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3366 725 : iov_len = parent_iov->iov_len - parent_iov_offset;
3367 :
3368 725 : iov_len = spdk_min(iov_len, max_segment_size);
3369 725 : iov_len = spdk_min(iov_len, to_next_boundary_bytes);
3370 725 : to_next_boundary_bytes -= iov_len;
3371 :
3372 725 : bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3373 725 : bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3374 :
3375 725 : if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3376 183 : parent_iov_offset += iov_len;
3377 : } else {
3378 542 : parent_iovpos++;
3379 542 : parent_iov_offset = 0;
3380 : }
3381 725 : child_iovcnt++;
3382 725 : iovcnt++;
3383 : }
3384 :
3385 249 : if (to_next_boundary_bytes > 0) {
3386 : /* We had to stop this child I/O early because we ran out of
3387 : * child_iov space or were limited by max_num_segments.
3388 : * Ensure the iovs to be aligned with block size and
3389 : * then adjust to_next_boundary before starting the
3390 : * child I/O.
3391 : */
3392 111 : assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3393 : iovcnt == child_iovsize);
3394 111 : to_last_block_bytes = to_next_boundary_bytes % blocklen;
3395 111 : if (to_last_block_bytes != 0) {
3396 24 : uint32_t child_iovpos = child_iovcnt - 1;
3397 : /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV
3398 : * so the loop will naturally end
3399 : */
3400 :
3401 24 : to_last_block_bytes = blocklen - to_last_block_bytes;
3402 24 : to_next_boundary_bytes += to_last_block_bytes;
3403 53 : while (to_last_block_bytes > 0 && iovcnt > 0) {
3404 32 : iov_len = spdk_min(to_last_block_bytes,
3405 : bdev_io->child_iov[child_iovpos].iov_len);
3406 32 : bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3407 32 : if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3408 15 : child_iovpos--;
3409 15 : if (--iovcnt == 0) {
3410 : /* If the child IO is less than a block size just return.
3411 : * If the first child IO of any split round is less than
3412 : * a block size, an error exit.
3413 : */
3414 3 : if (bdev_io->internal.split.outstanding == 0) {
3415 1 : SPDK_ERRLOG("The first child io was less than a block size\n");
3416 1 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3417 1 : bdev_ch_remove_from_io_submitted(bdev_io);
3418 1 : spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3419 : 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3420 : bdev_io->internal.ch->queue_depth);
3421 1 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3422 : }
3423 :
3424 3 : return;
3425 : }
3426 : }
3427 :
3428 29 : to_last_block_bytes -= iov_len;
3429 :
3430 29 : if (parent_iov_offset == 0) {
3431 14 : parent_iovpos--;
3432 14 : parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3433 : }
3434 29 : parent_iov_offset -= iov_len;
3435 : }
3436 :
3437 21 : assert(to_last_block_bytes == 0);
3438 : }
3439 108 : to_next_boundary -= to_next_boundary_bytes / blocklen;
3440 : }
3441 :
3442 246 : rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3443 : ¤t_offset, &remaining);
3444 246 : if (spdk_unlikely(rc)) {
3445 4 : return;
3446 : }
3447 : }
3448 : }
3449 :
3450 : static void
3451 3 : bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3452 : {
3453 3 : uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3454 3 : uint32_t num_children_reqs = 0;
3455 : int rc;
3456 :
3457 3 : assert(bdev_io->internal.f.split);
3458 :
3459 3 : offset = bdev_io->internal.split.current_offset_blocks;
3460 3 : remaining = bdev_io->internal.split.remaining_num_blocks;
3461 3 : max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3462 :
3463 20 : while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3464 17 : unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3465 :
3466 17 : rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3467 : &offset, &remaining);
3468 17 : if (spdk_likely(rc == 0)) {
3469 17 : num_children_reqs++;
3470 : } else {
3471 0 : return;
3472 : }
3473 : }
3474 : }
3475 :
3476 : static void
3477 6 : bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3478 : {
3479 6 : uint64_t offset, write_zeroes_blocks, remaining;
3480 6 : uint32_t num_children_reqs = 0;
3481 : int rc;
3482 :
3483 6 : assert(bdev_io->internal.f.split);
3484 :
3485 6 : offset = bdev_io->internal.split.current_offset_blocks;
3486 6 : remaining = bdev_io->internal.split.remaining_num_blocks;
3487 :
3488 29 : while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3489 23 : write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3490 :
3491 23 : rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3492 : &offset, &remaining);
3493 23 : if (spdk_likely(rc == 0)) {
3494 23 : num_children_reqs++;
3495 : } else {
3496 0 : return;
3497 : }
3498 : }
3499 : }
3500 :
3501 : static void
3502 4 : bdev_copy_split(struct spdk_bdev_io *bdev_io)
3503 : {
3504 4 : uint64_t offset, copy_blocks, remaining;
3505 4 : uint32_t num_children_reqs = 0;
3506 : int rc;
3507 :
3508 4 : assert(bdev_io->internal.f.split);
3509 :
3510 4 : offset = bdev_io->internal.split.current_offset_blocks;
3511 4 : remaining = bdev_io->internal.split.remaining_num_blocks;
3512 :
3513 4 : assert(bdev_io->bdev->max_copy != 0);
3514 23 : while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
3515 19 : copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
3516 :
3517 19 : rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
3518 : &offset, &remaining);
3519 19 : if (spdk_likely(rc == 0)) {
3520 19 : num_children_reqs++;
3521 : } else {
3522 0 : return;
3523 : }
3524 : }
3525 : }
3526 :
3527 : static void
3528 58 : parent_bdev_io_complete(void *ctx, int rc)
3529 : {
3530 58 : struct spdk_bdev_io *parent_io = ctx;
3531 :
3532 58 : if (rc) {
3533 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3534 : }
3535 :
3536 58 : parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
3537 : parent_io->internal.caller_ctx);
3538 58 : }
3539 :
3540 : static void
3541 0 : bdev_io_complete_parent_sequence_cb(void *ctx, int status)
3542 : {
3543 0 : struct spdk_bdev_io *bdev_io = ctx;
3544 :
3545 : /* u.bdev.accel_sequence should have already been cleared at this point */
3546 0 : assert(bdev_io->u.bdev.accel_sequence == NULL);
3547 0 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
3548 0 : bdev_io->internal.f.has_accel_sequence = false;
3549 :
3550 0 : if (spdk_unlikely(status != 0)) {
3551 0 : SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
3552 : }
3553 :
3554 0 : parent_bdev_io_complete(bdev_io, status);
3555 0 : }
3556 :
3557 : static void
3558 301 : bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3559 : {
3560 301 : struct spdk_bdev_io *parent_io = cb_arg;
3561 :
3562 301 : spdk_bdev_free_io(bdev_io);
3563 :
3564 301 : assert(parent_io->internal.f.split);
3565 :
3566 301 : if (!success) {
3567 21 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3568 : /* If any child I/O failed, stop further splitting process. */
3569 21 : parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks;
3570 21 : parent_io->internal.split.remaining_num_blocks = 0;
3571 : }
3572 301 : parent_io->internal.split.outstanding--;
3573 301 : if (parent_io->internal.split.outstanding != 0) {
3574 223 : return;
3575 : }
3576 :
3577 : /*
3578 : * Parent I/O finishes when all blocks are consumed.
3579 : */
3580 78 : if (parent_io->internal.split.remaining_num_blocks == 0) {
3581 58 : assert(parent_io->internal.cb != bdev_io_split_done);
3582 58 : bdev_ch_remove_from_io_submitted(parent_io);
3583 58 : spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id,
3584 : 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx,
3585 : parent_io->internal.ch->queue_depth);
3586 :
3587 58 : if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
3588 48 : if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) {
3589 0 : bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb);
3590 0 : return;
3591 48 : } else if (parent_io->internal.f.has_bounce_buf &&
3592 0 : !bdev_io_use_accel_sequence(bdev_io)) {
3593 : /* bdev IO will be completed in the callback */
3594 0 : _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
3595 0 : return;
3596 : }
3597 : }
3598 :
3599 58 : parent_bdev_io_complete(parent_io, 0);
3600 58 : return;
3601 : }
3602 :
3603 : /*
3604 : * Continue with the splitting process. This function will complete the parent I/O if the
3605 : * splitting is done.
3606 : */
3607 20 : switch (parent_io->type) {
3608 17 : case SPDK_BDEV_IO_TYPE_READ:
3609 : case SPDK_BDEV_IO_TYPE_WRITE:
3610 17 : _bdev_rw_split(parent_io);
3611 17 : break;
3612 1 : case SPDK_BDEV_IO_TYPE_UNMAP:
3613 1 : bdev_unmap_split(parent_io);
3614 1 : break;
3615 1 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3616 1 : bdev_write_zeroes_split(parent_io);
3617 1 : break;
3618 1 : case SPDK_BDEV_IO_TYPE_COPY:
3619 1 : bdev_copy_split(parent_io);
3620 1 : break;
3621 0 : default:
3622 0 : assert(false);
3623 : break;
3624 : }
3625 : }
3626 :
3627 : static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3628 : bool success);
3629 :
3630 : static void
3631 59 : bdev_io_split(struct spdk_bdev_io *bdev_io)
3632 : {
3633 59 : assert(bdev_io_should_split(bdev_io));
3634 59 : assert(bdev_io->internal.f.split);
3635 :
3636 59 : bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks;
3637 59 : bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks;
3638 59 : bdev_io->internal.split.outstanding = 0;
3639 59 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
3640 :
3641 59 : switch (bdev_io->type) {
3642 49 : case SPDK_BDEV_IO_TYPE_READ:
3643 : case SPDK_BDEV_IO_TYPE_WRITE:
3644 49 : if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
3645 49 : _bdev_rw_split(bdev_io);
3646 : } else {
3647 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3648 0 : spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
3649 0 : bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io));
3650 : }
3651 49 : break;
3652 2 : case SPDK_BDEV_IO_TYPE_UNMAP:
3653 2 : bdev_unmap_split(bdev_io);
3654 2 : break;
3655 5 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3656 5 : bdev_write_zeroes_split(bdev_io);
3657 5 : break;
3658 3 : case SPDK_BDEV_IO_TYPE_COPY:
3659 3 : bdev_copy_split(bdev_io);
3660 3 : break;
3661 0 : default:
3662 0 : assert(false);
3663 : break;
3664 : }
3665 59 : }
3666 :
3667 : static void
3668 0 : bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
3669 : {
3670 0 : if (!success) {
3671 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3672 0 : return;
3673 : }
3674 :
3675 0 : _bdev_rw_split(bdev_io);
3676 : }
3677 :
3678 : static inline void
3679 595 : _bdev_io_submit(struct spdk_bdev_io *bdev_io)
3680 : {
3681 595 : struct spdk_bdev *bdev = bdev_io->bdev;
3682 595 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3683 :
3684 595 : if (spdk_likely(bdev_ch->flags == 0)) {
3685 570 : bdev_io_do_submit(bdev_ch, bdev_io);
3686 570 : return;
3687 : }
3688 :
3689 25 : if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
3690 2 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3691 23 : } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
3692 25 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
3693 2 : bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) {
3694 0 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3695 : } else {
3696 23 : TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link);
3697 23 : bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3698 : }
3699 : } else {
3700 0 : SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
3701 0 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3702 : }
3703 : }
3704 :
3705 : bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
3706 :
3707 : bool
3708 23 : bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
3709 : {
3710 23 : if (range1->length == 0 || range2->length == 0) {
3711 1 : return false;
3712 : }
3713 :
3714 22 : if (range1->offset + range1->length <= range2->offset) {
3715 1 : return false;
3716 : }
3717 :
3718 21 : if (range2->offset + range2->length <= range1->offset) {
3719 3 : return false;
3720 : }
3721 :
3722 18 : return true;
3723 : }
3724 :
3725 : static bool
3726 11 : bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
3727 : {
3728 11 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3729 11 : struct lba_range r;
3730 :
3731 11 : switch (bdev_io->type) {
3732 0 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3733 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3734 : /* Don't try to decode the NVMe command - just assume worst-case and that
3735 : * it overlaps a locked range.
3736 : */
3737 0 : return true;
3738 6 : case SPDK_BDEV_IO_TYPE_READ:
3739 6 : if (!range->quiesce) {
3740 4 : return false;
3741 : }
3742 : /* fallthrough */
3743 : case SPDK_BDEV_IO_TYPE_WRITE:
3744 : case SPDK_BDEV_IO_TYPE_UNMAP:
3745 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3746 : case SPDK_BDEV_IO_TYPE_ZCOPY:
3747 : case SPDK_BDEV_IO_TYPE_COPY:
3748 7 : r.offset = bdev_io->u.bdev.offset_blocks;
3749 7 : r.length = bdev_io->u.bdev.num_blocks;
3750 7 : if (!bdev_lba_range_overlapped(range, &r)) {
3751 : /* This I/O doesn't overlap the specified LBA range. */
3752 0 : return false;
3753 7 : } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
3754 : /* This I/O overlaps, but the I/O is on the same channel that locked this
3755 : * range, and the caller_ctx is the same as the locked_ctx. This means
3756 : * that this I/O is associated with the lock, and is allowed to execute.
3757 : */
3758 2 : return false;
3759 : } else {
3760 5 : return true;
3761 : }
3762 0 : default:
3763 0 : return false;
3764 : }
3765 : }
3766 :
3767 : void
3768 655 : bdev_io_submit(struct spdk_bdev_io *bdev_io)
3769 : {
3770 655 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3771 :
3772 655 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3773 :
3774 655 : if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3775 : struct lba_range *range;
3776 :
3777 13 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3778 8 : if (bdev_io_range_is_locked(bdev_io, range)) {
3779 3 : TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3780 3 : return;
3781 : }
3782 : }
3783 : }
3784 :
3785 652 : bdev_ch_add_to_io_submitted(bdev_io);
3786 :
3787 652 : bdev_io->internal.submit_tsc = spdk_get_ticks();
3788 652 : spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START,
3789 : ch->trace_id, bdev_io->u.bdev.num_blocks,
3790 : (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3791 : bdev_io->u.bdev.offset_blocks, ch->queue_depth);
3792 :
3793 652 : if (bdev_io->internal.f.split) {
3794 59 : bdev_io_split(bdev_io);
3795 59 : return;
3796 : }
3797 :
3798 593 : _bdev_io_submit(bdev_io);
3799 : }
3800 :
3801 : static inline int
3802 2 : bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io)
3803 : {
3804 2 : struct spdk_bdev *bdev = bdev_io->bdev;
3805 2 : struct spdk_dif_ctx_init_ext_opts dif_opts;
3806 :
3807 2 : memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error));
3808 :
3809 2 : dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
3810 2 : dif_opts.dif_pi_format = bdev->dif_pi_format;
3811 :
3812 4 : return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx,
3813 : bdev->blocklen,
3814 : bdev->md_len,
3815 2 : bdev->md_interleave,
3816 2 : bdev->dif_is_head_of_md,
3817 : bdev->dif_type,
3818 : bdev_io->u.bdev.dif_check_flags,
3819 2 : bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
3820 : 0xFFFF, 0, 0, 0, &dif_opts);
3821 : }
3822 :
3823 : static void
3824 4 : _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3825 : bool success)
3826 : {
3827 4 : if (!success) {
3828 0 : SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
3829 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3830 0 : bdev_io_complete_unsubmitted(bdev_io);
3831 0 : return;
3832 : }
3833 :
3834 4 : if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
3835 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3836 0 : bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3837 0 : return;
3838 : }
3839 : /* For reads we'll execute the sequence after the data is read, so, for now, only
3840 : * clear out accel_sequence pointer and submit the IO */
3841 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3842 0 : bdev_io->u.bdev.accel_sequence = NULL;
3843 : }
3844 :
3845 4 : bdev_io_submit(bdev_io);
3846 : }
3847 :
3848 : static inline void
3849 4 : _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3850 : {
3851 : /* bdev doesn't support memory domains, thereby buffers in this IO request can't
3852 : * be accessed directly. It is needed to allocate buffers before issuing IO operation.
3853 : * For write operation we need to pull buffers from memory domain before submitting IO.
3854 : * Once read operation completes, we need to use memory_domain push functionality to
3855 : * update data in original memory domain IO buffer.
3856 : *
3857 : * If this I/O request is not aware of metadata, buffers in thsi IO request can't be
3858 : * accessed directly too. It is needed to allocate buffers before issuing IO operation.
3859 : * For write operation we need to insert metadata before submitting IO. Once read
3860 : * operation completes, we need to strip metadata in original IO buffer.
3861 : *
3862 : * This IO request will go through a regular IO flow, so clear memory domains pointers */
3863 4 : assert(bdev_io_use_memory_domain(bdev_io) ||
3864 : bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3865 :
3866 4 : bdev_io->u.bdev.memory_domain = NULL;
3867 4 : bdev_io->u.bdev.memory_domain_ctx = NULL;
3868 4 : _bdev_io_get_bounce_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3869 4 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3870 4 : }
3871 :
3872 : static inline void
3873 0 : _bdev_io_ext_use_accel_buffer(struct spdk_bdev_io *bdev_io)
3874 : {
3875 0 : assert(bdev_io_use_memory_domain(bdev_io));
3876 0 : assert(bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3877 :
3878 0 : bdev_io->u.bdev.memory_domain = NULL;
3879 0 : bdev_io->u.bdev.memory_domain_ctx = NULL;
3880 0 : bdev_io_get_accel_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3881 0 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3882 0 : }
3883 :
3884 : /* We need to allocate bounce buffer
3885 : * - if bdev doesn't support memory domains,
3886 : * - if it does support them, but we need to execute an accel sequence and the data buffer is
3887 : * from accel memory domain (to avoid doing a push/pull from that domain), or
3888 : * - if IO is not aware of metadata.
3889 : */
3890 : static inline bool
3891 292 : bdev_io_needs_bounce_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3892 : {
3893 292 : if (bdev_io_use_memory_domain(bdev_io)) {
3894 4 : if (!desc->memory_domains_supported ||
3895 0 : (bdev_io_needs_sequence_exec(desc, bdev_io) &&
3896 0 : (bdev_io->internal.memory_domain == spdk_accel_get_memory_domain() ||
3897 0 : bdev_io_needs_metadata(desc, bdev_io)))) {
3898 4 : return true;
3899 : }
3900 :
3901 0 : return false;
3902 : }
3903 :
3904 288 : if (bdev_io_needs_metadata(desc, bdev_io)) {
3905 0 : return true;
3906 : }
3907 :
3908 288 : return false;
3909 : }
3910 :
/* A fake accel buffer is required when the bdev supports memory domains but
 * the I/O is not aware of metadata.  Callers are expected to have ruled out
 * the bounce-buffer case already, so such an I/O must use a memory domain.
 */
static inline bool
bdev_io_needs_accel_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_needs_metadata(desc, bdev_io)) {
		return false;
	}

	assert(bdev_io_use_memory_domain(bdev_io));
	return true;
}
3924 :
3925 : static inline void
3926 292 : _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3927 : {
3928 292 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3929 : int rc;
3930 :
3931 292 : if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3932 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3933 0 : bdev_io_complete_unsubmitted(bdev_io);
3934 0 : return;
3935 : }
3936 :
3937 292 : if (bdev_io_needs_metadata(desc, bdev_io)) {
3938 0 : rc = bdev_io_init_dif_ctx(bdev_io);
3939 0 : if (spdk_unlikely(rc != 0)) {
3940 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3941 0 : bdev_io_complete_unsubmitted(bdev_io);
3942 0 : return;
3943 : }
3944 : }
3945 :
3946 292 : if (bdev_io_needs_bounce_buffer(desc, bdev_io)) {
3947 4 : _bdev_io_ext_use_bounce_buffer(bdev_io);
3948 4 : return;
3949 : }
3950 :
3951 288 : if (bdev_io_needs_accel_buffer(desc, bdev_io)) {
3952 0 : _bdev_io_ext_use_accel_buffer(bdev_io);
3953 0 : return;
3954 : }
3955 :
3956 288 : if (bdev_io_needs_sequence_exec(desc, bdev_io)) {
3957 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3958 0 : bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3959 0 : return;
3960 : }
3961 : /* For reads we'll execute the sequence after the data is read, so, for now, only
3962 : * clear out accel_sequence pointer and submit the IO */
3963 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3964 0 : bdev_io->u.bdev.accel_sequence = NULL;
3965 : }
3966 :
3967 288 : bdev_io_submit(bdev_io);
3968 : }
3969 :
3970 : static void
3971 13 : bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
3972 : {
3973 13 : struct spdk_bdev *bdev = bdev_io->bdev;
3974 13 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3975 13 : struct spdk_io_channel *ch = bdev_ch->channel;
3976 :
3977 13 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3978 :
3979 13 : bdev_io->internal.f.in_submit_request = true;
3980 13 : bdev_submit_request(bdev, ch, bdev_io);
3981 13 : bdev_io->internal.f.in_submit_request = false;
3982 13 : }
3983 :
3984 : void
3985 710 : bdev_io_init(struct spdk_bdev_io *bdev_io,
3986 : struct spdk_bdev *bdev, void *cb_arg,
3987 : spdk_bdev_io_completion_cb cb)
3988 : {
3989 710 : bdev_io->bdev = bdev;
3990 710 : bdev_io->internal.f.raw = 0;
3991 710 : bdev_io->internal.caller_ctx = cb_arg;
3992 710 : bdev_io->internal.cb = cb;
3993 710 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3994 710 : bdev_io->internal.f.in_submit_request = false;
3995 710 : bdev_io->internal.error.nvme.cdw0 = 0;
3996 710 : bdev_io->num_retries = 0;
3997 710 : bdev_io->internal.get_buf_cb = NULL;
3998 710 : bdev_io->internal.get_aux_buf_cb = NULL;
3999 710 : bdev_io->internal.data_transfer_cpl = NULL;
4000 710 : bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
4001 710 : }
4002 :
4003 : static bool
4004 543 : bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
4005 : {
4006 543 : return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
4007 : }
4008 :
4009 : bool
4010 179 : spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
4011 : {
4012 : bool supported;
4013 :
4014 179 : supported = bdev_io_type_supported(bdev, io_type);
4015 :
4016 179 : if (!supported) {
4017 7 : switch (io_type) {
4018 0 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
4019 : /* The bdev layer will emulate write zeroes as long as write is supported. */
4020 0 : supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
4021 0 : break;
4022 7 : default:
4023 7 : break;
4024 : }
4025 : }
4026 :
4027 179 : return supported;
4028 : }
4029 :
4030 : static const char *g_io_type_strings[] = {
4031 : [SPDK_BDEV_IO_TYPE_READ] = "read",
4032 : [SPDK_BDEV_IO_TYPE_WRITE] = "write",
4033 : [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap",
4034 : [SPDK_BDEV_IO_TYPE_FLUSH] = "flush",
4035 : [SPDK_BDEV_IO_TYPE_RESET] = "reset",
4036 : [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin",
4037 : [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io",
4038 : [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md",
4039 : [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes",
4040 : [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy",
4041 : [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info",
4042 : [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management",
4043 : [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append",
4044 : [SPDK_BDEV_IO_TYPE_COMPARE] = "compare",
4045 : [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write",
4046 : [SPDK_BDEV_IO_TYPE_ABORT] = "abort",
4047 : [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole",
4048 : [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data",
4049 : [SPDK_BDEV_IO_TYPE_COPY] = "copy",
4050 : [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md",
4051 : };
4052 :
4053 : const char *
4054 0 : spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type)
4055 : {
4056 0 : if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) {
4057 0 : return NULL;
4058 : }
4059 :
4060 0 : return g_io_type_strings[io_type];
4061 : }
4062 :
4063 : int
4064 0 : spdk_bdev_get_io_type(const char *io_type_string)
4065 : {
4066 : int i;
4067 :
4068 0 : for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
4069 0 : if (!strcmp(io_type_string, g_io_type_strings[i])) {
4070 0 : return i;
4071 : }
4072 : }
4073 :
4074 0 : return -1;
4075 : }
4076 :
4077 : uint64_t
4078 0 : spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
4079 : {
4080 0 : return bdev_io->internal.submit_tsc;
4081 : }
4082 :
4083 : bool
4084 0 : spdk_bdev_io_hide_metadata(struct spdk_bdev_io *bdev_io)
4085 : {
4086 0 : return bdev_io->internal.desc->opts.hide_metadata;
4087 : }
4088 :
4089 : int
4090 0 : spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
4091 : {
4092 0 : if (bdev->fn_table->dump_info_json) {
4093 0 : return bdev->fn_table->dump_info_json(bdev->ctxt, w);
4094 : }
4095 :
4096 0 : return 0;
4097 : }
4098 :
4099 : static void
4100 10 : bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
4101 : {
4102 10 : uint32_t max_per_timeslice = 0;
4103 : int i;
4104 :
4105 50 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4106 40 : if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4107 15 : qos->rate_limits[i].max_per_timeslice = 0;
4108 15 : continue;
4109 : }
4110 :
4111 25 : max_per_timeslice = qos->rate_limits[i].limit *
4112 25 : SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
4113 :
4114 25 : qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
4115 : qos->rate_limits[i].min_per_timeslice);
4116 :
4117 25 : __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4118 25 : qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
4119 : }
4120 :
4121 10 : bdev_qos_set_ops(qos);
4122 10 : }
4123 :
4124 : static void
4125 4 : bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4126 : struct spdk_io_channel *io_ch, void *ctx)
4127 : {
4128 4 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4129 : int status;
4130 :
4131 4 : bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
4132 :
4133 : /* if all IOs were sent then continue the iteration, otherwise - stop it */
4134 : /* TODO: channels round robing */
4135 4 : status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
4136 :
4137 4 : spdk_bdev_for_each_channel_continue(i, status);
4138 4 : }
4139 :
4140 :
/* Completion callback of the QoS submission iteration; nothing to do. */
static void
bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
{
	/* Intentionally empty. */
}
4146 :
4147 : static int
4148 3 : bdev_channel_poll_qos(void *arg)
4149 : {
4150 3 : struct spdk_bdev *bdev = arg;
4151 3 : struct spdk_bdev_qos *qos = bdev->internal.qos;
4152 3 : uint64_t now = spdk_get_ticks();
4153 : int i;
4154 : int64_t remaining_last_timeslice;
4155 :
4156 3 : if (spdk_unlikely(qos->thread == NULL)) {
4157 : /* Old QoS was unbound to remove and new QoS is not enabled yet. */
4158 1 : return SPDK_POLLER_IDLE;
4159 : }
4160 :
4161 2 : if (now < (qos->last_timeslice + qos->timeslice_size)) {
4162 : /* We received our callback earlier than expected - return
4163 : * immediately and wait to do accounting until at least one
4164 : * timeslice has actually expired. This should never happen
4165 : * with a well-behaved timer implementation.
4166 : */
4167 0 : return SPDK_POLLER_IDLE;
4168 : }
4169 :
4170 : /* Reset for next round of rate limiting */
4171 10 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4172 : /* We may have allowed the IOs or bytes to slightly overrun in the last
4173 : * timeslice. remaining_this_timeslice is signed, so if it's negative
4174 : * here, we'll account for the overrun so that the next timeslice will
4175 : * be appropriately reduced.
4176 : */
4177 8 : remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
4178 : 0, __ATOMIC_RELAXED);
4179 8 : if (remaining_last_timeslice < 0) {
4180 : /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
4181 : * potentially use 2 atomic ops each, so they can intertwine.
4182 : * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
4183 : */
4184 0 : __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4185 : remaining_last_timeslice, __ATOMIC_RELAXED);
4186 : }
4187 : }
4188 :
4189 4 : while (now >= (qos->last_timeslice + qos->timeslice_size)) {
4190 2 : qos->last_timeslice += qos->timeslice_size;
4191 10 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4192 8 : __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice,
4193 8 : qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED);
4194 : }
4195 : }
4196 :
4197 2 : spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos,
4198 : bdev_channel_submit_qos_io_done);
4199 :
4200 2 : return SPDK_POLLER_BUSY;
4201 : }
4202 :
4203 : static void
4204 76 : bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
4205 : {
4206 : struct spdk_bdev_shared_resource *shared_resource;
4207 : struct lba_range *range;
4208 :
4209 76 : bdev_free_io_stat(ch->stat);
4210 : #ifdef SPDK_CONFIG_VTUNE
4211 : bdev_free_io_stat(ch->prev_stat);
4212 : #endif
4213 :
4214 76 : while (!TAILQ_EMPTY(&ch->locked_ranges)) {
4215 0 : range = TAILQ_FIRST(&ch->locked_ranges);
4216 0 : TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
4217 0 : free(range);
4218 : }
4219 :
4220 76 : spdk_put_io_channel(ch->channel);
4221 76 : spdk_put_io_channel(ch->accel_channel);
4222 :
4223 76 : shared_resource = ch->shared_resource;
4224 :
4225 76 : assert(TAILQ_EMPTY(&ch->io_locked));
4226 76 : assert(TAILQ_EMPTY(&ch->io_submitted));
4227 76 : assert(TAILQ_EMPTY(&ch->io_accel_exec));
4228 76 : assert(TAILQ_EMPTY(&ch->io_memory_domain));
4229 76 : assert(ch->io_outstanding == 0);
4230 76 : assert(shared_resource->ref > 0);
4231 76 : shared_resource->ref--;
4232 76 : if (shared_resource->ref == 0) {
4233 75 : assert(shared_resource->io_outstanding == 0);
4234 75 : TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
4235 75 : spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
4236 75 : spdk_poller_unregister(&shared_resource->nomem_poller);
4237 75 : free(shared_resource);
4238 : }
4239 76 : }
4240 :
4241 : static void
4242 85 : bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
4243 : {
4244 85 : struct spdk_bdev_qos *qos = bdev->internal.qos;
4245 : int i;
4246 :
4247 85 : assert(spdk_spin_held(&bdev->internal.spinlock));
4248 :
4249 : /* Rate limiting on this bdev enabled */
4250 85 : if (qos) {
4251 17 : if (qos->ch == NULL) {
4252 : struct spdk_io_channel *io_ch;
4253 :
4254 9 : SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
4255 : bdev->name, spdk_get_thread());
4256 :
4257 : /* No qos channel has been selected, so set one up */
4258 :
4259 : /* Take another reference to ch */
4260 9 : io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
4261 9 : assert(io_ch != NULL);
4262 9 : qos->ch = ch;
4263 :
4264 9 : qos->thread = spdk_io_channel_get_thread(io_ch);
4265 :
4266 45 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4267 36 : if (bdev_qos_is_iops_rate_limit(i) == true) {
4268 9 : qos->rate_limits[i].min_per_timeslice =
4269 : SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
4270 : } else {
4271 27 : qos->rate_limits[i].min_per_timeslice =
4272 : SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
4273 : }
4274 :
4275 36 : if (qos->rate_limits[i].limit == 0) {
4276 2 : qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
4277 : }
4278 : }
4279 9 : bdev_qos_update_max_quota_per_timeslice(qos);
4280 9 : qos->timeslice_size =
4281 9 : SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
4282 9 : qos->last_timeslice = spdk_get_ticks();
4283 9 : qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
4284 : bdev,
4285 : SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
4286 : }
4287 :
4288 17 : ch->flags |= BDEV_CH_QOS_ENABLED;
4289 : }
4290 85 : }
4291 :
4292 : struct poll_timeout_ctx {
4293 : struct spdk_bdev_desc *desc;
4294 : uint64_t timeout_in_sec;
4295 : spdk_bdev_io_timeout_cb cb_fn;
4296 : void *cb_arg;
4297 : };
4298 :
4299 : static void
4300 280 : bdev_desc_free(struct spdk_bdev_desc *desc)
4301 : {
4302 280 : spdk_spin_destroy(&desc->spinlock);
4303 280 : free(desc->media_events_buffer);
4304 280 : free(desc);
4305 280 : }
4306 :
4307 : static void
4308 8 : bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
4309 : {
4310 8 : struct poll_timeout_ctx *ctx = _ctx;
4311 8 : struct spdk_bdev_desc *desc = ctx->desc;
4312 :
4313 8 : free(ctx);
4314 :
4315 8 : spdk_spin_lock(&desc->spinlock);
4316 8 : desc->refs--;
4317 8 : if (desc->closed == true && desc->refs == 0) {
4318 1 : spdk_spin_unlock(&desc->spinlock);
4319 1 : bdev_desc_free(desc);
4320 1 : return;
4321 : }
4322 7 : spdk_spin_unlock(&desc->spinlock);
4323 : }
4324 :
4325 : static void
4326 13 : bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4327 : struct spdk_io_channel *io_ch, void *_ctx)
4328 : {
4329 13 : struct poll_timeout_ctx *ctx = _ctx;
4330 13 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4331 13 : struct spdk_bdev_desc *desc = ctx->desc;
4332 : struct spdk_bdev_io *bdev_io;
4333 : uint64_t now;
4334 :
4335 13 : spdk_spin_lock(&desc->spinlock);
4336 13 : if (desc->closed == true) {
4337 1 : spdk_spin_unlock(&desc->spinlock);
4338 1 : spdk_bdev_for_each_channel_continue(i, -1);
4339 1 : return;
4340 : }
4341 12 : spdk_spin_unlock(&desc->spinlock);
4342 :
4343 12 : now = spdk_get_ticks();
4344 22 : TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
4345 : /* Exclude any I/O that are generated via splitting. */
4346 15 : if (bdev_io->internal.cb == bdev_io_split_done) {
4347 3 : continue;
4348 : }
4349 :
4350 : /* Once we find an I/O that has not timed out, we can immediately
4351 : * exit the loop.
4352 : */
4353 12 : if (now < (bdev_io->internal.submit_tsc +
4354 12 : ctx->timeout_in_sec * spdk_get_ticks_hz())) {
4355 5 : goto end;
4356 : }
4357 :
4358 7 : if (bdev_io->internal.desc == desc) {
4359 7 : ctx->cb_fn(ctx->cb_arg, bdev_io);
4360 : }
4361 : }
4362 :
4363 7 : end:
4364 12 : spdk_bdev_for_each_channel_continue(i, 0);
4365 : }
4366 :
4367 : static int
4368 8 : bdev_poll_timeout_io(void *arg)
4369 : {
4370 8 : struct spdk_bdev_desc *desc = arg;
4371 8 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4372 : struct poll_timeout_ctx *ctx;
4373 :
4374 8 : ctx = calloc(1, sizeof(struct poll_timeout_ctx));
4375 8 : if (!ctx) {
4376 0 : SPDK_ERRLOG("failed to allocate memory\n");
4377 0 : return SPDK_POLLER_BUSY;
4378 : }
4379 8 : ctx->desc = desc;
4380 8 : ctx->cb_arg = desc->cb_arg;
4381 8 : ctx->cb_fn = desc->cb_fn;
4382 8 : ctx->timeout_in_sec = desc->timeout_in_sec;
4383 :
4384 : /* Take a ref on the descriptor in case it gets closed while we are checking
4385 : * all of the channels.
4386 : */
4387 8 : spdk_spin_lock(&desc->spinlock);
4388 8 : desc->refs++;
4389 8 : spdk_spin_unlock(&desc->spinlock);
4390 :
4391 8 : spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
4392 : bdev_channel_poll_timeout_io_done);
4393 :
4394 8 : return SPDK_POLLER_BUSY;
4395 : }
4396 :
4397 : int
4398 5 : spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
4399 : spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
4400 : {
4401 5 : assert(desc->thread == spdk_get_thread());
4402 :
4403 5 : spdk_poller_unregister(&desc->io_timeout_poller);
4404 :
4405 5 : if (timeout_in_sec) {
4406 4 : assert(cb_fn != NULL);
4407 4 : desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
4408 : desc,
4409 : SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
4410 : 1000);
4411 4 : if (desc->io_timeout_poller == NULL) {
4412 0 : SPDK_ERRLOG("can not register the desc timeout IO poller\n");
4413 0 : return -1;
4414 : }
4415 : }
4416 :
4417 5 : desc->cb_fn = cb_fn;
4418 5 : desc->cb_arg = cb_arg;
4419 5 : desc->timeout_in_sec = timeout_in_sec;
4420 :
4421 5 : return 0;
4422 : }
4423 :
4424 : static int
4425 78 : bdev_channel_create(void *io_device, void *ctx_buf)
4426 : {
4427 78 : struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
4428 78 : struct spdk_bdev_channel *ch = ctx_buf;
4429 : struct spdk_io_channel *mgmt_io_ch;
4430 : struct spdk_bdev_mgmt_channel *mgmt_ch;
4431 : struct spdk_bdev_shared_resource *shared_resource;
4432 : struct lba_range *range;
4433 :
4434 78 : ch->bdev = bdev;
4435 78 : ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
4436 78 : if (!ch->channel) {
4437 2 : return -1;
4438 : }
4439 :
4440 76 : ch->accel_channel = spdk_accel_get_io_channel();
4441 76 : if (!ch->accel_channel) {
4442 0 : spdk_put_io_channel(ch->channel);
4443 0 : return -1;
4444 : }
4445 :
4446 76 : spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0,
4447 : spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4448 :
4449 76 : assert(ch->histogram == NULL);
4450 76 : if (bdev->internal.histogram_enabled) {
4451 0 : ch->histogram = spdk_histogram_data_alloc();
4452 0 : if (ch->histogram == NULL) {
4453 0 : SPDK_ERRLOG("Could not allocate histogram\n");
4454 : }
4455 : }
4456 :
4457 76 : mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
4458 76 : if (!mgmt_io_ch) {
4459 0 : spdk_put_io_channel(ch->channel);
4460 0 : spdk_put_io_channel(ch->accel_channel);
4461 0 : return -1;
4462 : }
4463 :
4464 76 : mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
4465 78 : TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
4466 3 : if (shared_resource->shared_ch == ch->channel) {
4467 1 : spdk_put_io_channel(mgmt_io_ch);
4468 1 : shared_resource->ref++;
4469 1 : break;
4470 : }
4471 : }
4472 :
4473 76 : if (shared_resource == NULL) {
4474 75 : shared_resource = calloc(1, sizeof(*shared_resource));
4475 75 : if (shared_resource == NULL) {
4476 0 : spdk_put_io_channel(ch->channel);
4477 0 : spdk_put_io_channel(ch->accel_channel);
4478 0 : spdk_put_io_channel(mgmt_io_ch);
4479 0 : return -1;
4480 : }
4481 :
4482 75 : shared_resource->mgmt_ch = mgmt_ch;
4483 75 : shared_resource->io_outstanding = 0;
4484 75 : TAILQ_INIT(&shared_resource->nomem_io);
4485 75 : shared_resource->nomem_threshold = 0;
4486 75 : shared_resource->shared_ch = ch->channel;
4487 75 : shared_resource->ref = 1;
4488 75 : TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
4489 : }
4490 :
4491 76 : ch->io_outstanding = 0;
4492 76 : TAILQ_INIT(&ch->locked_ranges);
4493 76 : TAILQ_INIT(&ch->qos_queued_io);
4494 76 : ch->flags = 0;
4495 76 : ch->trace_id = bdev->internal.trace_id;
4496 76 : ch->shared_resource = shared_resource;
4497 :
4498 76 : TAILQ_INIT(&ch->io_submitted);
4499 76 : TAILQ_INIT(&ch->io_locked);
4500 76 : TAILQ_INIT(&ch->io_accel_exec);
4501 76 : TAILQ_INIT(&ch->io_memory_domain);
4502 :
4503 76 : ch->stat = bdev_alloc_io_stat(false);
4504 76 : if (ch->stat == NULL) {
4505 0 : bdev_channel_destroy_resource(ch);
4506 0 : return -1;
4507 : }
4508 :
4509 76 : ch->stat->ticks_rate = spdk_get_ticks_hz();
4510 :
4511 : #ifdef SPDK_CONFIG_VTUNE
4512 : {
4513 : char *name;
4514 : __itt_init_ittlib(NULL, 0);
4515 : name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
4516 : if (!name) {
4517 : bdev_channel_destroy_resource(ch);
4518 : return -1;
4519 : }
4520 : ch->handle = __itt_string_handle_create(name);
4521 : free(name);
4522 : ch->start_tsc = spdk_get_ticks();
4523 : ch->interval_tsc = spdk_get_ticks_hz() / 100;
4524 : ch->prev_stat = bdev_alloc_io_stat(false);
4525 : if (ch->prev_stat == NULL) {
4526 : bdev_channel_destroy_resource(ch);
4527 : return -1;
4528 : }
4529 : }
4530 : #endif
4531 :
4532 76 : spdk_spin_lock(&bdev->internal.spinlock);
4533 76 : bdev_enable_qos(bdev, ch);
4534 :
4535 77 : TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
4536 : struct lba_range *new_range;
4537 :
4538 1 : new_range = calloc(1, sizeof(*new_range));
4539 1 : if (new_range == NULL) {
4540 0 : spdk_spin_unlock(&bdev->internal.spinlock);
4541 0 : bdev_channel_destroy_resource(ch);
4542 0 : return -1;
4543 : }
4544 1 : new_range->length = range->length;
4545 1 : new_range->offset = range->offset;
4546 1 : new_range->locked_ctx = range->locked_ctx;
4547 1 : TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
4548 : }
4549 :
4550 76 : spdk_spin_unlock(&bdev->internal.spinlock);
4551 :
4552 76 : return 0;
4553 : }
4554 :
4555 : static int
4556 0 : bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
4557 : void *cb_ctx)
4558 : {
4559 0 : struct spdk_bdev_channel *bdev_ch = cb_ctx;
4560 : struct spdk_bdev_io *bdev_io;
4561 : uint64_t buf_len;
4562 :
4563 0 : bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4564 0 : if (bdev_io->internal.ch == bdev_ch) {
4565 0 : buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4566 0 : spdk_iobuf_entry_abort(ch, entry, buf_len);
4567 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4568 : }
4569 :
4570 0 : return 0;
4571 : }
4572 :
4573 : /*
4574 : * Abort I/O that are waiting on a data buffer.
4575 : */
4576 : static void
4577 100 : bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
4578 : {
4579 100 : spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch);
4580 100 : }
4581 :
4582 : /*
4583 : * Abort I/O that are queued waiting for submission. These types of I/O are
4584 : * linked using the spdk_bdev_io link TAILQ_ENTRY.
4585 : */
4586 : static void
4587 102 : bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
4588 : {
4589 : struct spdk_bdev_io *bdev_io, *tmp;
4590 :
4591 157 : TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
4592 55 : if (bdev_io->internal.ch == ch) {
4593 55 : TAILQ_REMOVE(queue, bdev_io, internal.link);
4594 : /*
4595 : * spdk_bdev_io_complete() assumes that the completed I/O had
4596 : * been submitted to the bdev module. Since in this case it
4597 : * hadn't, bump io_outstanding to account for the decrement
4598 : * that spdk_bdev_io_complete() will do.
4599 : */
4600 55 : if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
4601 55 : bdev_io_increment_outstanding(ch, ch->shared_resource);
4602 : }
4603 55 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4604 : }
4605 : }
4606 102 : }
4607 :
4608 : static inline void
4609 100 : bdev_abort_all_nomem_io(struct spdk_bdev_channel *ch)
4610 : {
4611 100 : struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4612 :
4613 100 : shared_resource->nomem_abort_in_progress = true;
4614 100 : bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
4615 100 : shared_resource->nomem_abort_in_progress = false;
4616 100 : }
4617 :
4618 : static bool
4619 18 : bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
4620 : {
4621 : struct spdk_bdev_io *bdev_io;
4622 :
4623 18 : TAILQ_FOREACH(bdev_io, queue, internal.link) {
4624 0 : if (bdev_io == bio_to_abort) {
4625 0 : TAILQ_REMOVE(queue, bio_to_abort, internal.link);
4626 0 : spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4627 0 : return true;
4628 : }
4629 : }
4630 :
4631 18 : return false;
4632 : }
4633 :
4634 : static int
4635 0 : bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
4636 : {
4637 0 : struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
4638 : uint64_t buf_len;
4639 :
4640 0 : bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4641 0 : if (bdev_io == bio_to_abort) {
4642 0 : buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4643 0 : spdk_iobuf_entry_abort(ch, entry, buf_len);
4644 0 : spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4645 0 : return 1;
4646 : }
4647 :
4648 0 : return 0;
4649 : }
4650 :
4651 : static bool
4652 16 : bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
4653 : {
4654 : int rc;
4655 :
4656 16 : rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort);
4657 16 : return rc == 1;
4658 : }
4659 :
4660 : static void
4661 7 : bdev_qos_channel_destroy(void *cb_arg)
4662 : {
4663 7 : struct spdk_bdev_qos *qos = cb_arg;
4664 :
4665 7 : spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
4666 7 : spdk_poller_unregister(&qos->poller);
4667 :
4668 7 : SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
4669 :
4670 7 : free(qos);
4671 7 : }
4672 :
4673 : static int
4674 7 : bdev_qos_destroy(struct spdk_bdev *bdev)
4675 : {
4676 : int i;
4677 :
4678 : /*
4679 : * Cleanly shutting down the QoS poller is tricky, because
4680 : * during the asynchronous operation the user could open
4681 : * a new descriptor and create a new channel, spawning
4682 : * a new QoS poller.
4683 : *
4684 : * The strategy is to create a new QoS structure here and swap it
4685 : * in. The shutdown path then continues to refer to the old one
4686 : * until it completes and then releases it.
4687 : */
4688 : struct spdk_bdev_qos *new_qos, *old_qos;
4689 :
4690 7 : old_qos = bdev->internal.qos;
4691 :
4692 7 : new_qos = calloc(1, sizeof(*new_qos));
4693 7 : if (!new_qos) {
4694 0 : SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
4695 0 : return -ENOMEM;
4696 : }
4697 :
4698 : /* Copy the old QoS data into the newly allocated structure */
4699 7 : memcpy(new_qos, old_qos, sizeof(*new_qos));
4700 :
4701 : /* Zero out the key parts of the QoS structure */
4702 7 : new_qos->ch = NULL;
4703 7 : new_qos->thread = NULL;
4704 7 : new_qos->poller = NULL;
4705 : /*
4706 : * The limit member of spdk_bdev_qos_limit structure is not zeroed.
4707 : * It will be used later for the new QoS structure.
4708 : */
4709 35 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4710 28 : new_qos->rate_limits[i].remaining_this_timeslice = 0;
4711 28 : new_qos->rate_limits[i].min_per_timeslice = 0;
4712 28 : new_qos->rate_limits[i].max_per_timeslice = 0;
4713 : }
4714 :
4715 7 : bdev->internal.qos = new_qos;
4716 :
4717 7 : if (old_qos->thread == NULL) {
4718 0 : free(old_qos);
4719 : } else {
4720 7 : spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
4721 : }
4722 :
4723 : /* It is safe to continue with destroying the bdev even though the QoS channel hasn't
4724 : * been destroyed yet. The destruction path will end up waiting for the final
4725 : * channel to be put before it releases resources. */
4726 :
4727 7 : return 0;
4728 : }
4729 :
4730 : void
4731 80 : spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
4732 : {
4733 80 : total->bytes_read += add->bytes_read;
4734 80 : total->num_read_ops += add->num_read_ops;
4735 80 : total->bytes_written += add->bytes_written;
4736 80 : total->num_write_ops += add->num_write_ops;
4737 80 : total->bytes_unmapped += add->bytes_unmapped;
4738 80 : total->num_unmap_ops += add->num_unmap_ops;
4739 80 : total->bytes_copied += add->bytes_copied;
4740 80 : total->num_copy_ops += add->num_copy_ops;
4741 80 : total->read_latency_ticks += add->read_latency_ticks;
4742 80 : total->write_latency_ticks += add->write_latency_ticks;
4743 80 : total->unmap_latency_ticks += add->unmap_latency_ticks;
4744 80 : total->copy_latency_ticks += add->copy_latency_ticks;
4745 80 : if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
4746 7 : total->max_read_latency_ticks = add->max_read_latency_ticks;
4747 : }
4748 80 : if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
4749 39 : total->min_read_latency_ticks = add->min_read_latency_ticks;
4750 : }
4751 80 : if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
4752 4 : total->max_write_latency_ticks = add->max_write_latency_ticks;
4753 : }
4754 80 : if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
4755 24 : total->min_write_latency_ticks = add->min_write_latency_ticks;
4756 : }
4757 80 : if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
4758 0 : total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
4759 : }
4760 80 : if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
4761 3 : total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
4762 : }
4763 80 : if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
4764 0 : total->max_copy_latency_ticks = add->max_copy_latency_ticks;
4765 : }
4766 80 : if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
4767 4 : total->min_copy_latency_ticks = add->min_copy_latency_ticks;
4768 : }
4769 80 : }
4770 :
4771 : static void
4772 5 : bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
4773 : {
4774 5 : memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error));
4775 :
4776 5 : if (to_stat->io_error != NULL && from_stat->io_error != NULL) {
4777 0 : memcpy(to_stat->io_error, from_stat->io_error,
4778 : sizeof(struct spdk_bdev_io_error_stat));
4779 : }
4780 5 : }
4781 :
4782 : void
4783 218 : spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode)
4784 : {
4785 218 : if (mode == SPDK_BDEV_RESET_STAT_NONE) {
4786 5 : return;
4787 : }
4788 :
4789 213 : stat->max_read_latency_ticks = 0;
4790 213 : stat->min_read_latency_ticks = UINT64_MAX;
4791 213 : stat->max_write_latency_ticks = 0;
4792 213 : stat->min_write_latency_ticks = UINT64_MAX;
4793 213 : stat->max_unmap_latency_ticks = 0;
4794 213 : stat->min_unmap_latency_ticks = UINT64_MAX;
4795 213 : stat->max_copy_latency_ticks = 0;
4796 213 : stat->min_copy_latency_ticks = UINT64_MAX;
4797 :
4798 213 : if (mode != SPDK_BDEV_RESET_STAT_ALL) {
4799 2 : return;
4800 : }
4801 :
4802 211 : stat->bytes_read = 0;
4803 211 : stat->num_read_ops = 0;
4804 211 : stat->bytes_written = 0;
4805 211 : stat->num_write_ops = 0;
4806 211 : stat->bytes_unmapped = 0;
4807 211 : stat->num_unmap_ops = 0;
4808 211 : stat->bytes_copied = 0;
4809 211 : stat->num_copy_ops = 0;
4810 211 : stat->read_latency_ticks = 0;
4811 211 : stat->write_latency_ticks = 0;
4812 211 : stat->unmap_latency_ticks = 0;
4813 211 : stat->copy_latency_ticks = 0;
4814 :
4815 211 : if (stat->io_error != NULL) {
4816 134 : memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat));
4817 : }
4818 : }
4819 :
4820 : struct spdk_bdev_io_stat *
4821 209 : bdev_alloc_io_stat(bool io_error_stat)
4822 : {
4823 : struct spdk_bdev_io_stat *stat;
4824 :
4825 209 : stat = malloc(sizeof(struct spdk_bdev_io_stat));
4826 209 : if (stat == NULL) {
4827 0 : return NULL;
4828 : }
4829 :
4830 209 : if (io_error_stat) {
4831 133 : stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat));
4832 133 : if (stat->io_error == NULL) {
4833 0 : free(stat);
4834 0 : return NULL;
4835 : }
4836 : } else {
4837 76 : stat->io_error = NULL;
4838 : }
4839 :
4840 209 : spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL);
4841 :
4842 209 : return stat;
4843 : }
4844 :
4845 : void
4846 209 : bdev_free_io_stat(struct spdk_bdev_io_stat *stat)
4847 : {
4848 209 : if (stat != NULL) {
4849 209 : free(stat->io_error);
4850 209 : free(stat);
4851 : }
4852 209 : }
4853 :
4854 : void
4855 0 : spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
4856 : {
4857 : int i;
4858 :
4859 0 : spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
4860 0 : spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
4861 0 : spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
4862 0 : spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
4863 0 : spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
4864 0 : spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
4865 0 : spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
4866 0 : spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
4867 0 : spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
4868 0 : spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
4869 0 : spdk_json_write_named_uint64(w, "min_read_latency_ticks",
4870 0 : stat->min_read_latency_ticks != UINT64_MAX ?
4871 : stat->min_read_latency_ticks : 0);
4872 0 : spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
4873 0 : spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
4874 0 : spdk_json_write_named_uint64(w, "min_write_latency_ticks",
4875 0 : stat->min_write_latency_ticks != UINT64_MAX ?
4876 : stat->min_write_latency_ticks : 0);
4877 0 : spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
4878 0 : spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
4879 0 : spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
4880 0 : stat->min_unmap_latency_ticks != UINT64_MAX ?
4881 : stat->min_unmap_latency_ticks : 0);
4882 0 : spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
4883 0 : spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
4884 0 : spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
4885 0 : stat->min_copy_latency_ticks != UINT64_MAX ?
4886 : stat->min_copy_latency_ticks : 0);
4887 :
4888 0 : if (stat->io_error != NULL) {
4889 0 : spdk_json_write_named_object_begin(w, "io_error");
4890 0 : for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
4891 0 : if (stat->io_error->error_status[i] != 0) {
4892 0 : spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
4893 0 : stat->io_error->error_status[i]);
4894 : }
4895 : }
4896 0 : spdk_json_write_object_end(w);
4897 : }
4898 0 : }
4899 :
4900 : static void
4901 80 : bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
4902 : {
4903 80 : struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4904 80 : struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
4905 :
4906 80 : bdev_abort_all_nomem_io(ch);
4907 80 : bdev_abort_all_buf_io(mgmt_ch, ch);
4908 80 : }
4909 :
4910 : static void
4911 76 : bdev_channel_destroy(void *io_device, void *ctx_buf)
4912 : {
4913 76 : struct spdk_bdev_channel *ch = ctx_buf;
4914 :
4915 76 : SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
4916 : spdk_get_thread());
4917 :
4918 76 : spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0,
4919 : spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4920 :
4921 : /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
4922 76 : spdk_spin_lock(&ch->bdev->internal.spinlock);
4923 76 : spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
4924 76 : spdk_spin_unlock(&ch->bdev->internal.spinlock);
4925 :
4926 76 : bdev_channel_abort_queued_ios(ch);
4927 :
4928 76 : if (ch->histogram) {
4929 0 : spdk_histogram_data_free(ch->histogram);
4930 : }
4931 :
4932 76 : bdev_channel_destroy_resource(ch);
4933 76 : }
4934 :
4935 : /*
4936 : * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
4937 : * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4938 : */
4939 : static int
4940 269 : bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
4941 : {
4942 : struct spdk_bdev_name *tmp;
4943 :
4944 269 : bdev_name->name = strdup(name);
4945 269 : if (bdev_name->name == NULL) {
4946 0 : SPDK_ERRLOG("Unable to allocate bdev name\n");
4947 0 : return -ENOMEM;
4948 : }
4949 :
4950 269 : bdev_name->bdev = bdev;
4951 :
4952 269 : spdk_spin_lock(&g_bdev_mgr.spinlock);
4953 269 : tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4954 269 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
4955 :
4956 269 : if (tmp != NULL) {
4957 4 : SPDK_ERRLOG("Bdev name %s already exists\n", name);
4958 4 : free(bdev_name->name);
4959 4 : return -EEXIST;
4960 : }
4961 :
4962 265 : return 0;
4963 : }
4964 :
4965 : static void
4966 265 : bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
4967 : {
4968 265 : RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4969 265 : free(bdev_name->name);
4970 265 : }
4971 :
4972 : static void
4973 5 : bdev_name_del(struct spdk_bdev_name *bdev_name)
4974 : {
4975 5 : spdk_spin_lock(&g_bdev_mgr.spinlock);
4976 5 : bdev_name_del_unsafe(bdev_name);
4977 5 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
4978 5 : }
4979 :
4980 : int
4981 139 : spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
4982 : {
4983 : struct spdk_bdev_alias *tmp;
4984 : int ret;
4985 :
4986 139 : if (alias == NULL) {
4987 1 : SPDK_ERRLOG("Empty alias passed\n");
4988 1 : return -EINVAL;
4989 : }
4990 :
4991 138 : tmp = calloc(1, sizeof(*tmp));
4992 138 : if (tmp == NULL) {
4993 0 : SPDK_ERRLOG("Unable to allocate alias\n");
4994 0 : return -ENOMEM;
4995 : }
4996 :
4997 138 : ret = bdev_name_add(&tmp->alias, bdev, alias);
4998 138 : if (ret != 0) {
4999 4 : free(tmp);
5000 4 : return ret;
5001 : }
5002 :
5003 134 : TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
5004 :
5005 134 : return 0;
5006 : }
5007 :
5008 : static int
5009 135 : bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
5010 : void (*alias_del_fn)(struct spdk_bdev_name *n))
5011 : {
5012 : struct spdk_bdev_alias *tmp;
5013 :
5014 140 : TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
5015 136 : if (strcmp(alias, tmp->alias.name) == 0) {
5016 131 : TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
5017 131 : alias_del_fn(&tmp->alias);
5018 131 : free(tmp);
5019 131 : return 0;
5020 : }
5021 : }
5022 :
5023 4 : return -ENOENT;
5024 : }
5025 :
5026 : int
5027 4 : spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
5028 : {
5029 : int rc;
5030 :
5031 4 : rc = bdev_alias_del(bdev, alias, bdev_name_del);
5032 4 : if (rc == -ENOENT) {
5033 2 : SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
5034 : }
5035 :
5036 4 : return rc;
5037 : }
5038 :
5039 : void
5040 2 : spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
5041 : {
5042 : struct spdk_bdev_alias *p, *tmp;
5043 :
5044 5 : TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
5045 3 : TAILQ_REMOVE(&bdev->aliases, p, tailq);
5046 3 : bdev_name_del(&p->alias);
5047 3 : free(p);
5048 : }
5049 2 : }
5050 :
/*
 * Get an I/O channel for the bdev that `desc` refers to. The result is whatever
 * spdk_get_io_channel() returns for that bdev's io_device handle.
 */
struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	return spdk_get_io_channel(__bdev_to_io_dev(bdev));
}
5056 :
5057 : void *
5058 0 : spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
5059 : {
5060 0 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5061 0 : void *ctx = NULL;
5062 :
5063 0 : if (bdev->fn_table->get_module_ctx) {
5064 0 : ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
5065 : }
5066 :
5067 0 : return ctx;
5068 : }
5069 :
5070 : const char *
5071 0 : spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
5072 : {
5073 0 : return bdev->module->name;
5074 : }
5075 :
5076 : const char *
5077 265 : spdk_bdev_get_name(const struct spdk_bdev *bdev)
5078 : {
5079 265 : return bdev->name;
5080 : }
5081 :
5082 : const char *
5083 0 : spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
5084 : {
5085 0 : return bdev->product_name;
5086 : }
5087 :
5088 : const struct spdk_bdev_aliases_list *
5089 0 : spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
5090 : {
5091 0 : return &bdev->aliases;
5092 : }
5093 :
5094 : uint32_t
5095 5 : spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
5096 : {
5097 5 : return bdev->blocklen;
5098 : }
5099 :
5100 : uint32_t
5101 0 : spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
5102 : {
5103 0 : return bdev->write_unit_size;
5104 : }
5105 :
5106 : uint64_t
5107 0 : spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
5108 : {
5109 0 : return bdev->blockcnt;
5110 : }
5111 :
5112 : const char *
5113 0 : spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
5114 : {
5115 0 : return qos_rpc_type[type];
5116 : }
5117 :
5118 : void
5119 0 : spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
5120 : {
5121 : int i;
5122 :
5123 0 : memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
5124 :
5125 0 : spdk_spin_lock(&bdev->internal.spinlock);
5126 0 : if (bdev->internal.qos) {
5127 0 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
5128 0 : if (bdev->internal.qos->rate_limits[i].limit !=
5129 : SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
5130 0 : limits[i] = bdev->internal.qos->rate_limits[i].limit;
5131 0 : if (bdev_qos_is_iops_rate_limit(i) == false) {
5132 : /* Change from Byte to Megabyte which is user visible. */
5133 0 : limits[i] = limits[i] / 1024 / 1024;
5134 : }
5135 : }
5136 : }
5137 : }
5138 0 : spdk_spin_unlock(&bdev->internal.spinlock);
5139 0 : }
5140 :
5141 : size_t
5142 281 : spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
5143 : {
5144 281 : return 1 << bdev->required_alignment;
5145 : }
5146 :
5147 : uint32_t
5148 0 : spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
5149 : {
5150 0 : return bdev->optimal_io_boundary;
5151 : }
5152 :
5153 : bool
5154 0 : spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
5155 : {
5156 0 : return bdev->write_cache;
5157 : }
5158 :
5159 : const struct spdk_uuid *
5160 0 : spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
5161 : {
5162 0 : return &bdev->uuid;
5163 : }
5164 :
5165 : uint16_t
5166 0 : spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
5167 : {
5168 0 : return bdev->acwu;
5169 : }
5170 :
5171 : uint32_t
5172 29 : spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
5173 : {
5174 29 : return bdev->md_len;
5175 : }
5176 :
5177 : bool
5178 136 : spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
5179 : {
5180 136 : return (bdev->md_len != 0) && bdev->md_interleave;
5181 : }
5182 :
5183 : bool
5184 117 : spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
5185 : {
5186 117 : return (bdev->md_len != 0) && !bdev->md_interleave;
5187 : }
5188 :
5189 : bool
5190 0 : spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
5191 : {
5192 0 : return bdev->zoned;
5193 : }
5194 :
5195 : uint32_t
5196 127 : spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
5197 : {
5198 127 : if (spdk_bdev_is_md_interleaved(bdev)) {
5199 0 : return bdev->blocklen - bdev->md_len;
5200 : } else {
5201 127 : return bdev->blocklen;
5202 : }
5203 : }
5204 :
5205 : uint32_t
5206 0 : spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
5207 : {
5208 0 : return bdev->phys_blocklen;
5209 : }
5210 :
5211 : static uint32_t
5212 9 : _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
5213 : {
5214 9 : if (!spdk_bdev_is_md_interleaved(bdev)) {
5215 6 : return bdev->blocklen + bdev->md_len;
5216 : } else {
5217 3 : return bdev->blocklen;
5218 : }
5219 : }
5220 :
/* We have to use the typedef in the function declaration to appease astyle.
 * These aliases exist only so that the functions returning an enum can be
 * written with the return type on its own line.
 */
typedef enum spdk_dif_type spdk_dif_type_t;
typedef enum spdk_dif_pi_format spdk_dif_pi_format_t;
5224 :
5225 : spdk_dif_type_t
5226 0 : spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
5227 : {
5228 0 : if (bdev->md_len != 0) {
5229 0 : return bdev->dif_type;
5230 : } else {
5231 0 : return SPDK_DIF_DISABLE;
5232 : }
5233 : }
5234 :
5235 : spdk_dif_pi_format_t
5236 0 : spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev)
5237 : {
5238 0 : return bdev->dif_pi_format;
5239 : }
5240 :
5241 : bool
5242 0 : spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
5243 : {
5244 0 : if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
5245 0 : return bdev->dif_is_head_of_md;
5246 : } else {
5247 0 : return false;
5248 : }
5249 : }
5250 :
5251 : bool
5252 0 : spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
5253 : enum spdk_dif_check_type check_type)
5254 : {
5255 0 : if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
5256 0 : return false;
5257 : }
5258 :
5259 0 : switch (check_type) {
5260 0 : case SPDK_DIF_CHECK_TYPE_REFTAG:
5261 0 : return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
5262 0 : case SPDK_DIF_CHECK_TYPE_APPTAG:
5263 0 : return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
5264 0 : case SPDK_DIF_CHECK_TYPE_GUARD:
5265 0 : return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
5266 0 : default:
5267 0 : return false;
5268 : }
5269 : }
5270 :
5271 : static uint32_t
5272 3 : bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes)
5273 : {
5274 : uint64_t aligned_length, max_write_blocks;
5275 :
5276 3 : aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1);
5277 3 : max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev);
5278 3 : max_write_blocks -= max_write_blocks % bdev->write_unit_size;
5279 :
5280 3 : return max_write_blocks;
5281 : }
5282 :
5283 : uint32_t
5284 1 : spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
5285 : {
5286 1 : return bdev->max_copy;
5287 : }
5288 :
5289 : uint64_t
5290 0 : spdk_bdev_get_qd(const struct spdk_bdev *bdev)
5291 : {
5292 0 : return bdev->internal.measured_queue_depth;
5293 : }
5294 :
5295 : uint64_t
5296 0 : spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
5297 : {
5298 0 : return bdev->internal.period;
5299 : }
5300 :
5301 : uint64_t
5302 0 : spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
5303 : {
5304 0 : return bdev->internal.weighted_io_time;
5305 : }
5306 :
5307 : uint64_t
5308 0 : spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
5309 : {
5310 0 : return bdev->internal.io_time;
5311 : }
5312 :
5313 0 : union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev)
5314 : {
5315 0 : return bdev->ctratt;
5316 : }
5317 :
5318 : uint32_t
5319 0 : spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev)
5320 : {
5321 0 : return bdev->nsid;
5322 : }
5323 :
5324 : uint32_t
5325 0 : spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
5326 : {
5327 0 : struct spdk_bdev *bdev = desc->bdev;
5328 :
5329 0 : return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen;
5330 : }
5331 :
5332 : uint32_t
5333 0 : spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc)
5334 : {
5335 0 : struct spdk_bdev *bdev = desc->bdev;
5336 :
5337 0 : return desc->opts.hide_metadata ? 0 : bdev->md_len;
5338 : }
5339 :
5340 : bool
5341 0 : spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc)
5342 : {
5343 0 : struct spdk_bdev *bdev = desc->bdev;
5344 :
5345 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev);
5346 : }
5347 :
5348 : bool
5349 0 : spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc)
5350 : {
5351 0 : struct spdk_bdev *bdev = desc->bdev;
5352 :
5353 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_md_separate(bdev);
5354 : }
5355 :
5356 : spdk_dif_type_t
5357 0 : spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc)
5358 : {
5359 0 : struct spdk_bdev *bdev = desc->bdev;
5360 :
5361 0 : return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev);
5362 : }
5363 :
5364 : spdk_dif_pi_format_t
5365 0 : spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc)
5366 : {
5367 0 : struct spdk_bdev *bdev = desc->bdev;
5368 :
5369 0 : return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev);
5370 : }
5371 :
5372 : bool
5373 0 : spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc)
5374 : {
5375 0 : struct spdk_bdev *bdev = desc->bdev;
5376 :
5377 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev);
5378 : }
5379 :
5380 : bool
5381 0 : spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc,
5382 : enum spdk_dif_check_type check_type)
5383 : {
5384 0 : struct spdk_bdev *bdev = desc->bdev;
5385 :
5386 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type);
5387 : }
5388 :
/* Forward declaration: _calculate_measured_qd_cpl() below calls this before its definition. */
static void bdev_update_qd_sampling_period(void *ctx);
5390 :
5391 : static void
5392 1 : _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
5393 : {
5394 1 : bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
5395 :
5396 1 : if (bdev->internal.measured_queue_depth) {
5397 0 : bdev->internal.io_time += bdev->internal.period;
5398 0 : bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
5399 : }
5400 :
5401 1 : bdev->internal.qd_poll_in_progress = false;
5402 :
5403 1 : bdev_update_qd_sampling_period(bdev);
5404 1 : }
5405 :
5406 : static void
5407 1 : _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5408 : struct spdk_io_channel *io_ch, void *_ctx)
5409 : {
5410 1 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
5411 :
5412 1 : bdev->internal.temporary_queue_depth += ch->io_outstanding;
5413 1 : spdk_bdev_for_each_channel_continue(i, 0);
5414 1 : }
5415 :
5416 : static int
5417 1 : bdev_calculate_measured_queue_depth(void *ctx)
5418 : {
5419 1 : struct spdk_bdev *bdev = ctx;
5420 :
5421 1 : bdev->internal.qd_poll_in_progress = true;
5422 1 : bdev->internal.temporary_queue_depth = 0;
5423 1 : spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
5424 1 : return SPDK_POLLER_BUSY;
5425 : }
5426 :
5427 : static void
5428 5 : bdev_update_qd_sampling_period(void *ctx)
5429 : {
5430 5 : struct spdk_bdev *bdev = ctx;
5431 :
5432 5 : if (bdev->internal.period == bdev->internal.new_period) {
5433 0 : return;
5434 : }
5435 :
5436 5 : if (bdev->internal.qd_poll_in_progress) {
5437 1 : return;
5438 : }
5439 :
5440 4 : bdev->internal.period = bdev->internal.new_period;
5441 :
5442 4 : spdk_poller_unregister(&bdev->internal.qd_poller);
5443 4 : if (bdev->internal.period != 0) {
5444 2 : bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5445 : bdev, bdev->internal.period);
5446 : } else {
5447 2 : spdk_bdev_close(bdev->internal.qd_desc);
5448 2 : bdev->internal.qd_desc = NULL;
5449 : }
5450 : }
5451 :
5452 : static void
5453 0 : _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
5454 : {
5455 0 : SPDK_NOTICELOG("Unexpected event type: %d\n", type);
5456 0 : }
5457 :
5458 : void
5459 136 : spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
5460 : {
5461 : int rc;
5462 :
5463 136 : if (bdev->internal.new_period == period) {
5464 130 : return;
5465 : }
5466 :
5467 6 : bdev->internal.new_period = period;
5468 :
5469 6 : if (bdev->internal.qd_desc != NULL) {
5470 4 : assert(bdev->internal.period != 0);
5471 :
5472 4 : spdk_thread_send_msg(bdev->internal.qd_desc->thread,
5473 : bdev_update_qd_sampling_period, bdev);
5474 4 : return;
5475 : }
5476 :
5477 2 : assert(bdev->internal.period == 0);
5478 :
5479 2 : rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
5480 : NULL, &bdev->internal.qd_desc);
5481 2 : if (rc != 0) {
5482 0 : return;
5483 : }
5484 :
5485 2 : bdev->internal.period = period;
5486 2 : bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5487 : bdev, period);
5488 : }
5489 :
/* Context carried through the channel iteration started by spdk_bdev_get_current_qd(). */
struct bdev_get_current_qd_ctx {
	/* Running sum of io_outstanding over the channels visited so far. */
	uint64_t current_qd;
	/* User callback invoked with the final sum. */
	spdk_bdev_get_current_qd_cb cb_fn;
	/* Opaque argument handed back to cb_fn. */
	void *cb_arg;
};
5495 :
5496 : static void
5497 0 : bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
5498 : {
5499 0 : struct bdev_get_current_qd_ctx *ctx = _ctx;
5500 :
5501 0 : ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
5502 :
5503 0 : free(ctx);
5504 0 : }
5505 :
5506 : static void
5507 0 : bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5508 : struct spdk_io_channel *io_ch, void *_ctx)
5509 : {
5510 0 : struct bdev_get_current_qd_ctx *ctx = _ctx;
5511 0 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
5512 :
5513 0 : ctx->current_qd += bdev_ch->io_outstanding;
5514 :
5515 0 : spdk_bdev_for_each_channel_continue(i, 0);
5516 0 : }
5517 :
5518 : void
5519 0 : spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
5520 : void *cb_arg)
5521 : {
5522 : struct bdev_get_current_qd_ctx *ctx;
5523 :
5524 0 : assert(cb_fn != NULL);
5525 :
5526 0 : ctx = calloc(1, sizeof(*ctx));
5527 0 : if (ctx == NULL) {
5528 0 : cb_fn(bdev, 0, cb_arg, -ENOMEM);
5529 0 : return;
5530 : }
5531 :
5532 0 : ctx->cb_fn = cb_fn;
5533 0 : ctx->cb_arg = cb_arg;
5534 :
5535 0 : spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
5536 : }
5537 :
5538 : static void
5539 25 : _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type)
5540 : {
5541 25 : assert(desc->thread == spdk_get_thread());
5542 :
5543 25 : spdk_spin_lock(&desc->spinlock);
5544 25 : desc->refs--;
5545 25 : if (!desc->closed) {
5546 14 : spdk_spin_unlock(&desc->spinlock);
5547 14 : desc->callback.event_fn(type,
5548 : desc->bdev,
5549 : desc->callback.ctx);
5550 14 : return;
5551 11 : } else if (desc->refs == 0) {
5552 : /* This descriptor was closed after this event_notify message was sent.
5553 : * spdk_bdev_close() could not free the descriptor since this message was
5554 : * in flight, so we free it now using bdev_desc_free().
5555 : */
5556 10 : spdk_spin_unlock(&desc->spinlock);
5557 10 : bdev_desc_free(desc);
5558 10 : return;
5559 : }
5560 1 : spdk_spin_unlock(&desc->spinlock);
5561 : }
5562 :
5563 : static void
5564 25 : event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn)
5565 : {
5566 25 : spdk_spin_lock(&desc->spinlock);
5567 25 : desc->refs++;
5568 25 : spdk_thread_send_msg(desc->thread, event_notify_fn, desc);
5569 25 : spdk_spin_unlock(&desc->spinlock);
5570 25 : }
5571 :
5572 : static void
5573 6 : _resize_notify(void *ctx)
5574 : {
5575 6 : struct spdk_bdev_desc *desc = ctx;
5576 :
5577 6 : _event_notify(desc, SPDK_BDEV_EVENT_RESIZE);
5578 6 : }
5579 :
5580 : int
5581 11 : spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
5582 : {
5583 : struct spdk_bdev_desc *desc;
5584 : int ret;
5585 :
5586 11 : if (size == bdev->blockcnt) {
5587 0 : return 0;
5588 : }
5589 :
5590 11 : spdk_spin_lock(&bdev->internal.spinlock);
5591 :
5592 : /* bdev has open descriptors */
5593 11 : if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
5594 7 : bdev->blockcnt > size) {
5595 1 : ret = -EBUSY;
5596 : } else {
5597 10 : bdev->blockcnt = size;
5598 16 : TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
5599 6 : event_notify(desc, _resize_notify);
5600 : }
5601 10 : ret = 0;
5602 : }
5603 :
5604 11 : spdk_spin_unlock(&bdev->internal.spinlock);
5605 :
5606 11 : return ret;
5607 : }
5608 :
/*
 * Translate a byte offset and byte length into block units, using the block
 * size reported for the descriptor.
 *
 * The quotients are stored in *offset_blocks and *num_blocks. The return value
 * is zero when both byte values are exact multiples of the block size and
 * non-zero otherwise.
 */
static uint64_t
bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes,
		     uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks)
{
	uint32_t blk_sz = bdev_desc_get_block_size(desc);
	uint64_t offset_rem, len_rem;
	uint8_t shift;

	if (spdk_likely(spdk_u32_is_pow2(blk_sz))) {
		/* Power-of-two block size: shifts replace the costlier divisions. */
		shift = spdk_u32log2(blk_sz);
		*offset_blocks = offset_bytes >> shift;
		*num_blocks = num_bytes >> shift;
		offset_rem = offset_bytes - (*offset_blocks << shift);
		len_rem = num_bytes - (*num_blocks << shift);
	} else {
		*offset_blocks = offset_bytes / blk_sz;
		*num_blocks = num_bytes / blk_sz;
		offset_rem = offset_bytes % blk_sz;
		len_rem = num_bytes % blk_sz;
	}

	return offset_rem | len_rem;
}
5634 :
5635 : static bool
5636 705 : bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
5637 : {
5638 : /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
5639 : * has been an overflow and hence the offset has been wrapped around */
5640 705 : if (offset_blocks + num_blocks < offset_blocks) {
5641 1 : return false;
5642 : }
5643 :
5644 : /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
5645 704 : if (offset_blocks + num_blocks > bdev->blockcnt) {
5646 2 : return false;
5647 : }
5648 :
5649 702 : return true;
5650 : }
5651 :
5652 : static void
5653 2 : bdev_seek_complete_cb(void *ctx)
5654 : {
5655 2 : struct spdk_bdev_io *bdev_io = ctx;
5656 :
5657 2 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5658 2 : bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
5659 2 : }
5660 :
5661 : static int
5662 4 : bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5663 : uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
5664 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5665 : {
5666 4 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5667 : struct spdk_bdev_io *bdev_io;
5668 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5669 :
5670 4 : assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
5671 :
5672 : /* Check if offset_blocks is valid looking at the validity of one block */
5673 4 : if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
5674 0 : return -EINVAL;
5675 : }
5676 :
5677 4 : bdev_io = bdev_channel_get_io(channel);
5678 4 : if (!bdev_io) {
5679 0 : return -ENOMEM;
5680 : }
5681 :
5682 4 : bdev_io->internal.ch = channel;
5683 4 : bdev_io->internal.desc = desc;
5684 4 : bdev_io->type = io_type;
5685 4 : bdev_io->u.bdev.offset_blocks = offset_blocks;
5686 4 : bdev_io->u.bdev.memory_domain = NULL;
5687 4 : bdev_io->u.bdev.memory_domain_ctx = NULL;
5688 4 : bdev_io->u.bdev.accel_sequence = NULL;
5689 4 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
5690 :
5691 4 : if (!spdk_bdev_io_type_supported(bdev, io_type)) {
5692 : /* In case bdev doesn't support seek to next data/hole offset,
5693 : * it is assumed that only data and no holes are present */
5694 2 : if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
5695 1 : bdev_io->u.bdev.seek.offset = offset_blocks;
5696 : } else {
5697 1 : bdev_io->u.bdev.seek.offset = UINT64_MAX;
5698 : }
5699 :
5700 2 : spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
5701 2 : return 0;
5702 : }
5703 :
5704 2 : bdev_io_submit(bdev_io);
5705 2 : return 0;
5706 : }
5707 :
5708 : int
5709 2 : spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5710 : uint64_t offset_blocks,
5711 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5712 : {
5713 2 : return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
5714 : }
5715 :
5716 : int
5717 2 : spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5718 : uint64_t offset_blocks,
5719 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5720 : {
5721 2 : return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
5722 : }
5723 :
5724 : uint64_t
5725 4 : spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
5726 : {
5727 4 : return bdev_io->u.bdev.seek.offset;
5728 : }
5729 :
5730 : static int
5731 220 : bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
5732 : void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5733 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5734 : {
5735 220 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5736 : struct spdk_bdev_io *bdev_io;
5737 220 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5738 :
5739 220 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5740 0 : return -EINVAL;
5741 : }
5742 :
5743 220 : bdev_io = bdev_channel_get_io(channel);
5744 220 : if (!bdev_io) {
5745 1 : return -ENOMEM;
5746 : }
5747 :
5748 219 : bdev_io->internal.ch = channel;
5749 219 : bdev_io->internal.desc = desc;
5750 219 : bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5751 219 : bdev_io->u.bdev.iovs = &bdev_io->iov;
5752 219 : bdev_io->u.bdev.iovs[0].iov_base = buf;
5753 219 : bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
5754 219 : bdev_io->u.bdev.iovcnt = 1;
5755 219 : bdev_io->u.bdev.md_buf = md_buf;
5756 219 : bdev_io->u.bdev.num_blocks = num_blocks;
5757 219 : bdev_io->u.bdev.offset_blocks = offset_blocks;
5758 219 : bdev_io->u.bdev.memory_domain = NULL;
5759 219 : bdev_io->u.bdev.memory_domain_ctx = NULL;
5760 219 : bdev_io->u.bdev.accel_sequence = NULL;
5761 219 : bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
5762 219 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
5763 :
5764 219 : bdev_io_submit(bdev_io);
5765 219 : return 0;
5766 : }
5767 :
5768 : int
5769 3 : spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5770 : void *buf, uint64_t offset, uint64_t nbytes,
5771 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5772 : {
5773 3 : uint64_t offset_blocks, num_blocks;
5774 :
5775 3 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
5776 0 : return -EINVAL;
5777 : }
5778 :
5779 3 : return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5780 : }
5781 :
5782 : int
5783 216 : spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5784 : void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5785 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5786 : {
5787 216 : return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
5788 : }
5789 :
5790 : int
5791 4 : spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5792 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5793 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5794 : {
5795 4 : struct iovec iov = {
5796 : .iov_base = buf,
5797 : };
5798 :
5799 4 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5800 0 : return -EINVAL;
5801 : }
5802 :
5803 4 : if ((md_buf || desc->opts.hide_metadata) && !_is_buf_allocated(&iov)) {
5804 0 : return -EINVAL;
5805 : }
5806 :
5807 4 : return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5808 : cb, cb_arg);
5809 : }
5810 :
5811 : int
5812 5 : spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5813 : struct iovec *iov, int iovcnt,
5814 : uint64_t offset, uint64_t nbytes,
5815 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5816 : {
5817 5 : uint64_t offset_blocks, num_blocks;
5818 :
5819 5 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
5820 0 : return -EINVAL;
5821 : }
5822 :
5823 5 : return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5824 : }
5825 :
5826 : static int
5827 226 : bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5828 : struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
5829 : uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5830 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
5831 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5832 : {
5833 226 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5834 : struct spdk_bdev_io *bdev_io;
5835 226 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5836 :
5837 226 : if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
5838 0 : return -EINVAL;
5839 : }
5840 :
5841 226 : bdev_io = bdev_channel_get_io(channel);
5842 226 : if (spdk_unlikely(!bdev_io)) {
5843 2 : return -ENOMEM;
5844 : }
5845 :
5846 224 : bdev_io->internal.ch = channel;
5847 224 : bdev_io->internal.desc = desc;
5848 224 : bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5849 224 : bdev_io->u.bdev.iovs = iov;
5850 224 : bdev_io->u.bdev.iovcnt = iovcnt;
5851 224 : bdev_io->u.bdev.md_buf = md_buf;
5852 224 : bdev_io->u.bdev.num_blocks = num_blocks;
5853 224 : bdev_io->u.bdev.offset_blocks = offset_blocks;
5854 224 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
5855 :
5856 224 : if (seq != NULL) {
5857 0 : bdev_io->internal.f.has_accel_sequence = true;
5858 0 : bdev_io->internal.accel_sequence = seq;
5859 : }
5860 :
5861 224 : if (domain != NULL) {
5862 2 : bdev_io->internal.f.has_memory_domain = true;
5863 2 : bdev_io->internal.memory_domain = domain;
5864 2 : bdev_io->internal.memory_domain_ctx = domain_ctx;
5865 : }
5866 :
5867 224 : bdev_io->u.bdev.memory_domain = domain;
5868 224 : bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5869 224 : bdev_io->u.bdev.accel_sequence = seq;
5870 224 : bdev_io->u.bdev.dif_check_flags = dif_check_flags;
5871 :
5872 224 : _bdev_io_submit_ext(desc, bdev_io);
5873 :
5874 224 : return 0;
5875 : }
5876 :
5877 : int
5878 21 : spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5879 : struct iovec *iov, int iovcnt,
5880 : uint64_t offset_blocks, uint64_t num_blocks,
5881 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5882 : {
5883 21 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5884 :
5885 21 : return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5886 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5887 : }
5888 :
5889 : int
5890 4 : spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5891 : struct iovec *iov, int iovcnt, void *md_buf,
5892 : uint64_t offset_blocks, uint64_t num_blocks,
5893 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5894 : {
5895 4 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5896 :
5897 4 : if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
5898 0 : return -EINVAL;
5899 : }
5900 :
5901 4 : if (md_buf && !_is_buf_allocated(iov)) {
5902 0 : return -EINVAL;
5903 : }
5904 :
5905 4 : return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5906 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5907 : }
5908 :
5909 : static inline bool
5910 14 : _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5911 : {
5912 : /*
5913 : * We check if opts size is at least of size when we first introduced
5914 : * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members
5915 : * are not checked internal.
5916 : */
5917 14 : return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5918 10 : sizeof(opts->metadata) &&
5919 22 : opts->size <= sizeof(*opts) &&
5920 : /* When memory domain is used, the user must provide data buffers */
5921 8 : (!opts->memory_domain || (iov && iov[0].iov_base));
5922 : }
5923 :
5924 : int
5925 8 : spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5926 : struct iovec *iov, int iovcnt,
5927 : uint64_t offset_blocks, uint64_t num_blocks,
5928 : spdk_bdev_io_completion_cb cb, void *cb_arg,
5929 : struct spdk_bdev_ext_io_opts *opts)
5930 : {
5931 8 : struct spdk_memory_domain *domain = NULL;
5932 8 : struct spdk_accel_sequence *seq = NULL;
5933 8 : void *domain_ctx = NULL, *md = NULL;
5934 8 : uint32_t dif_check_flags = 0;
5935 : uint32_t nvme_cdw12_raw;
5936 8 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5937 :
5938 8 : if (opts) {
5939 7 : if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5940 3 : return -EINVAL;
5941 : }
5942 :
5943 4 : md = opts->metadata;
5944 4 : domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
5945 4 : domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
5946 4 : seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
5947 4 : nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0);
5948 4 : if (md) {
5949 4 : if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
5950 0 : return -EINVAL;
5951 : }
5952 :
5953 4 : if (spdk_unlikely(!_is_buf_allocated(iov))) {
5954 0 : return -EINVAL;
5955 : }
5956 :
5957 4 : if (spdk_unlikely(seq != NULL)) {
5958 0 : return -EINVAL;
5959 : }
5960 :
5961 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
5962 0 : SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n");
5963 0 : return -ENOTSUP;
5964 : }
5965 : }
5966 :
5967 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
5968 0 : dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT;
5969 : }
5970 : }
5971 :
5972 5 : dif_check_flags |= bdev->dif_check_flags &
5973 5 : ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
5974 :
5975 5 : return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5976 : num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg);
5977 : }
5978 :
5979 : static int
5980 36 : bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5981 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5982 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5983 : {
5984 36 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5985 : struct spdk_bdev_io *bdev_io;
5986 36 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5987 :
5988 36 : if (!desc->write) {
5989 0 : return -EBADF;
5990 : }
5991 :
5992 36 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5993 0 : return -EINVAL;
5994 : }
5995 :
5996 36 : bdev_io = bdev_channel_get_io(channel);
5997 36 : if (!bdev_io) {
5998 0 : return -ENOMEM;
5999 : }
6000 :
6001 36 : bdev_io->internal.ch = channel;
6002 36 : bdev_io->internal.desc = desc;
6003 36 : bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
6004 36 : bdev_io->u.bdev.iovs = &bdev_io->iov;
6005 36 : bdev_io->u.bdev.iovs[0].iov_base = buf;
6006 36 : bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
6007 36 : bdev_io->u.bdev.iovcnt = 1;
6008 36 : bdev_io->u.bdev.md_buf = md_buf;
6009 36 : bdev_io->u.bdev.num_blocks = num_blocks;
6010 36 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6011 36 : bdev_io->u.bdev.memory_domain = NULL;
6012 36 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6013 36 : bdev_io->u.bdev.accel_sequence = NULL;
6014 36 : bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
6015 36 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6016 :
6017 36 : bdev_io_submit(bdev_io);
6018 36 : return 0;
6019 : }
6020 :
6021 : int
6022 3 : spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6023 : void *buf, uint64_t offset, uint64_t nbytes,
6024 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6025 : {
6026 3 : uint64_t offset_blocks, num_blocks;
6027 :
6028 3 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
6029 0 : return -EINVAL;
6030 : }
6031 :
6032 3 : return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
6033 : }
6034 :
6035 : int
6036 27 : spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6037 : void *buf, uint64_t offset_blocks, uint64_t num_blocks,
6038 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6039 : {
6040 27 : return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
6041 : cb, cb_arg);
6042 : }
6043 :
6044 : int
6045 3 : spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6046 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6047 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6048 : {
6049 3 : struct iovec iov = {
6050 : .iov_base = buf,
6051 : };
6052 :
6053 3 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6054 0 : return -EINVAL;
6055 : }
6056 :
6057 3 : if (md_buf && !_is_buf_allocated(&iov)) {
6058 0 : return -EINVAL;
6059 : }
6060 :
6061 3 : return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
6062 : cb, cb_arg);
6063 : }
6064 :
6065 : static int
6066 70 : bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6067 : struct iovec *iov, int iovcnt, void *md_buf,
6068 : uint64_t offset_blocks, uint64_t num_blocks,
6069 : struct spdk_memory_domain *domain, void *domain_ctx,
6070 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
6071 : uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
6072 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6073 : {
6074 70 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6075 : struct spdk_bdev_io *bdev_io;
6076 70 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6077 :
6078 70 : if (spdk_unlikely(!desc->write)) {
6079 0 : return -EBADF;
6080 : }
6081 :
6082 70 : if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
6083 0 : return -EINVAL;
6084 : }
6085 :
6086 70 : bdev_io = bdev_channel_get_io(channel);
6087 70 : if (spdk_unlikely(!bdev_io)) {
6088 2 : return -ENOMEM;
6089 : }
6090 :
6091 68 : bdev_io->internal.ch = channel;
6092 68 : bdev_io->internal.desc = desc;
6093 68 : bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
6094 68 : bdev_io->u.bdev.iovs = iov;
6095 68 : bdev_io->u.bdev.iovcnt = iovcnt;
6096 68 : bdev_io->u.bdev.md_buf = md_buf;
6097 68 : bdev_io->u.bdev.num_blocks = num_blocks;
6098 68 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6099 68 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6100 68 : if (seq != NULL) {
6101 0 : bdev_io->internal.f.has_accel_sequence = true;
6102 0 : bdev_io->internal.accel_sequence = seq;
6103 : }
6104 :
6105 68 : if (domain != NULL) {
6106 2 : bdev_io->internal.f.has_memory_domain = true;
6107 2 : bdev_io->internal.memory_domain = domain;
6108 2 : bdev_io->internal.memory_domain_ctx = domain_ctx;
6109 : }
6110 :
6111 68 : bdev_io->u.bdev.memory_domain = domain;
6112 68 : bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
6113 68 : bdev_io->u.bdev.accel_sequence = seq;
6114 68 : bdev_io->u.bdev.dif_check_flags = dif_check_flags;
6115 68 : bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw;
6116 68 : bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw;
6117 :
6118 68 : _bdev_io_submit_ext(desc, bdev_io);
6119 :
6120 68 : return 0;
6121 : }
6122 :
6123 : int
6124 3 : spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6125 : struct iovec *iov, int iovcnt,
6126 : uint64_t offset, uint64_t len,
6127 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6128 : {
6129 3 : uint64_t offset_blocks, num_blocks;
6130 :
6131 3 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
6132 0 : return -EINVAL;
6133 : }
6134 :
6135 3 : return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
6136 : }
6137 :
6138 : int
6139 14 : spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6140 : struct iovec *iov, int iovcnt,
6141 : uint64_t offset_blocks, uint64_t num_blocks,
6142 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6143 : {
6144 14 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6145 :
6146 14 : return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
6147 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
6148 : cb, cb_arg);
6149 : }
6150 :
6151 : int
6152 1 : spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6153 : struct iovec *iov, int iovcnt, void *md_buf,
6154 : uint64_t offset_blocks, uint64_t num_blocks,
6155 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6156 : {
6157 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6158 :
6159 1 : if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
6160 0 : return -EINVAL;
6161 : }
6162 :
6163 1 : if (md_buf && !_is_buf_allocated(iov)) {
6164 0 : return -EINVAL;
6165 : }
6166 :
6167 1 : return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
6168 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
6169 : cb, cb_arg);
6170 : }
6171 :
6172 : int
6173 8 : spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6174 : struct iovec *iov, int iovcnt,
6175 : uint64_t offset_blocks, uint64_t num_blocks,
6176 : spdk_bdev_io_completion_cb cb, void *cb_arg,
6177 : struct spdk_bdev_ext_io_opts *opts)
6178 : {
6179 8 : struct spdk_memory_domain *domain = NULL;
6180 8 : struct spdk_accel_sequence *seq = NULL;
6181 8 : void *domain_ctx = NULL, *md = NULL;
6182 8 : uint32_t dif_check_flags = 0;
6183 8 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6184 8 : uint32_t nvme_cdw12_raw = 0;
6185 8 : uint32_t nvme_cdw13_raw = 0;
6186 :
6187 8 : if (opts) {
6188 7 : if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
6189 3 : return -EINVAL;
6190 : }
6191 4 : md = opts->metadata;
6192 4 : domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
6193 4 : domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
6194 4 : seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
6195 4 : nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0);
6196 4 : nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0);
6197 4 : if (md) {
6198 4 : if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
6199 0 : return -EINVAL;
6200 : }
6201 :
6202 4 : if (spdk_unlikely(!_is_buf_allocated(iov))) {
6203 0 : return -EINVAL;
6204 : }
6205 :
6206 4 : if (spdk_unlikely(seq != NULL)) {
6207 0 : return -EINVAL;
6208 : }
6209 :
6210 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
6211 0 : SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n");
6212 0 : return -ENOTSUP;
6213 : }
6214 : }
6215 :
6216 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
6217 0 : dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT;
6218 : }
6219 : }
6220 :
6221 5 : dif_check_flags |= bdev->dif_check_flags &
6222 5 : ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
6223 :
6224 5 : return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks,
6225 : domain, domain_ctx, seq, dif_check_flags,
6226 : nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg);
6227 : }
6228 :
6229 : static void
6230 11 : bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6231 : {
6232 11 : struct spdk_bdev_io *parent_io = cb_arg;
6233 11 : struct spdk_bdev *bdev = parent_io->bdev;
6234 11 : uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
6235 11 : int i, rc = 0;
6236 :
6237 11 : if (!success) {
6238 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6239 0 : parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
6240 0 : spdk_bdev_free_io(bdev_io);
6241 0 : return;
6242 : }
6243 :
6244 17 : for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
6245 22 : rc = memcmp(read_buf,
6246 11 : parent_io->u.bdev.iovs[i].iov_base,
6247 11 : parent_io->u.bdev.iovs[i].iov_len);
6248 11 : if (rc) {
6249 5 : break;
6250 : }
6251 6 : read_buf += parent_io->u.bdev.iovs[i].iov_len;
6252 : }
6253 :
6254 11 : if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
6255 2 : rc = memcmp(bdev_io->u.bdev.md_buf,
6256 2 : parent_io->u.bdev.md_buf,
6257 2 : spdk_bdev_get_md_size(bdev));
6258 : }
6259 :
6260 11 : spdk_bdev_free_io(bdev_io);
6261 :
6262 11 : if (rc == 0) {
6263 5 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6264 5 : parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
6265 : } else {
6266 6 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
6267 6 : parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
6268 : }
6269 : }
6270 :
6271 : static void
6272 11 : bdev_compare_do_read(void *_bdev_io)
6273 : {
6274 11 : struct spdk_bdev_io *bdev_io = _bdev_io;
6275 : int rc;
6276 :
6277 22 : rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
6278 11 : spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
6279 : bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6280 : bdev_compare_do_read_done, bdev_io);
6281 :
6282 11 : if (rc == -ENOMEM) {
6283 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
6284 11 : } else if (rc != 0) {
6285 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6286 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
6287 : }
6288 11 : }
6289 :
6290 : static int
6291 16 : bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6292 : struct iovec *iov, int iovcnt, void *md_buf,
6293 : uint64_t offset_blocks, uint64_t num_blocks,
6294 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6295 : {
6296 16 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6297 : struct spdk_bdev_io *bdev_io;
6298 16 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6299 :
6300 16 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6301 0 : return -EINVAL;
6302 : }
6303 :
6304 16 : bdev_io = bdev_channel_get_io(channel);
6305 16 : if (!bdev_io) {
6306 0 : return -ENOMEM;
6307 : }
6308 :
6309 16 : bdev_io->internal.ch = channel;
6310 16 : bdev_io->internal.desc = desc;
6311 16 : bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
6312 16 : bdev_io->u.bdev.iovs = iov;
6313 16 : bdev_io->u.bdev.iovcnt = iovcnt;
6314 16 : bdev_io->u.bdev.md_buf = md_buf;
6315 16 : bdev_io->u.bdev.num_blocks = num_blocks;
6316 16 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6317 16 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6318 16 : bdev_io->u.bdev.memory_domain = NULL;
6319 16 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6320 16 : bdev_io->u.bdev.accel_sequence = NULL;
6321 :
6322 16 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
6323 7 : bdev_io_submit(bdev_io);
6324 7 : return 0;
6325 : }
6326 :
6327 9 : bdev_compare_do_read(bdev_io);
6328 :
6329 9 : return 0;
6330 : }
6331 :
6332 : int
6333 10 : spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6334 : struct iovec *iov, int iovcnt,
6335 : uint64_t offset_blocks, uint64_t num_blocks,
6336 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6337 : {
6338 10 : return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
6339 : num_blocks, cb, cb_arg);
6340 : }
6341 :
6342 : int
6343 6 : spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6344 : struct iovec *iov, int iovcnt, void *md_buf,
6345 : uint64_t offset_blocks, uint64_t num_blocks,
6346 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6347 : {
6348 6 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6349 0 : return -EINVAL;
6350 : }
6351 :
6352 6 : if (md_buf && !_is_buf_allocated(iov)) {
6353 0 : return -EINVAL;
6354 : }
6355 :
6356 6 : return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
6357 : num_blocks, cb, cb_arg);
6358 : }
6359 :
6360 : static int
6361 4 : bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6362 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6363 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6364 : {
6365 4 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6366 : struct spdk_bdev_io *bdev_io;
6367 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6368 :
6369 4 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6370 0 : return -EINVAL;
6371 : }
6372 :
6373 4 : bdev_io = bdev_channel_get_io(channel);
6374 4 : if (!bdev_io) {
6375 0 : return -ENOMEM;
6376 : }
6377 :
6378 4 : bdev_io->internal.ch = channel;
6379 4 : bdev_io->internal.desc = desc;
6380 4 : bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
6381 4 : bdev_io->u.bdev.iovs = &bdev_io->iov;
6382 4 : bdev_io->u.bdev.iovs[0].iov_base = buf;
6383 4 : bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
6384 4 : bdev_io->u.bdev.iovcnt = 1;
6385 4 : bdev_io->u.bdev.md_buf = md_buf;
6386 4 : bdev_io->u.bdev.num_blocks = num_blocks;
6387 4 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6388 4 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6389 4 : bdev_io->u.bdev.memory_domain = NULL;
6390 4 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6391 4 : bdev_io->u.bdev.accel_sequence = NULL;
6392 :
6393 4 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
6394 2 : bdev_io_submit(bdev_io);
6395 2 : return 0;
6396 : }
6397 :
6398 2 : bdev_compare_do_read(bdev_io);
6399 :
6400 2 : return 0;
6401 : }
6402 :
6403 : int
6404 4 : spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6405 : void *buf, uint64_t offset_blocks, uint64_t num_blocks,
6406 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6407 : {
6408 4 : return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
6409 : cb, cb_arg);
6410 : }
6411 :
6412 : int
6413 0 : spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6414 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6415 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6416 : {
6417 0 : struct iovec iov = {
6418 : .iov_base = buf,
6419 : };
6420 :
6421 0 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6422 0 : return -EINVAL;
6423 : }
6424 :
6425 0 : if (md_buf && !_is_buf_allocated(&iov)) {
6426 0 : return -EINVAL;
6427 : }
6428 :
6429 0 : return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
6430 : cb, cb_arg);
6431 : }
6432 :
6433 : static void
6434 2 : bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status)
6435 : {
6436 2 : struct spdk_bdev_io *bdev_io = ctx;
6437 :
6438 2 : if (unlock_status) {
6439 0 : SPDK_ERRLOG("LBA range unlock failed\n");
6440 : }
6441 :
6442 2 : bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
6443 : false, bdev_io->internal.caller_ctx);
6444 2 : }
6445 :
6446 : static void
6447 2 : bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
6448 : {
6449 2 : bdev_io->internal.status = status;
6450 :
6451 2 : bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
6452 : bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6453 : bdev_comparev_and_writev_blocks_unlocked, bdev_io);
6454 2 : }
6455 :
6456 : static void
6457 1 : bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6458 : {
6459 1 : struct spdk_bdev_io *parent_io = cb_arg;
6460 :
6461 1 : if (!success) {
6462 0 : SPDK_ERRLOG("Compare and write operation failed\n");
6463 : }
6464 :
6465 1 : spdk_bdev_free_io(bdev_io);
6466 :
6467 1 : bdev_comparev_and_writev_blocks_unlock(parent_io,
6468 : success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
6469 1 : }
6470 :
6471 : static void
6472 1 : bdev_compare_and_write_do_write(void *_bdev_io)
6473 : {
6474 1 : struct spdk_bdev_io *bdev_io = _bdev_io;
6475 : int rc;
6476 :
6477 2 : rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
6478 1 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
6479 : bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
6480 : bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6481 : bdev_compare_and_write_do_write_done, bdev_io);
6482 :
6483 :
6484 1 : if (rc == -ENOMEM) {
6485 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
6486 1 : } else if (rc != 0) {
6487 0 : bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6488 : }
6489 1 : }
6490 :
6491 : static void
6492 2 : bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6493 : {
6494 2 : struct spdk_bdev_io *parent_io = cb_arg;
6495 :
6496 2 : spdk_bdev_free_io(bdev_io);
6497 :
6498 2 : if (!success) {
6499 1 : bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
6500 1 : return;
6501 : }
6502 :
6503 1 : bdev_compare_and_write_do_write(parent_io);
6504 : }
6505 :
6506 : static void
6507 2 : bdev_compare_and_write_do_compare(void *_bdev_io)
6508 : {
6509 2 : struct spdk_bdev_io *bdev_io = _bdev_io;
6510 : int rc;
6511 :
6512 4 : rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
6513 2 : spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
6514 : bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6515 : bdev_compare_and_write_do_compare_done, bdev_io);
6516 :
6517 2 : if (rc == -ENOMEM) {
6518 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
6519 2 : } else if (rc != 0) {
6520 0 : bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
6521 : }
6522 2 : }
6523 :
6524 : static void
6525 2 : bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status)
6526 : {
6527 2 : struct spdk_bdev_io *bdev_io = ctx;
6528 :
6529 2 : if (status) {
6530 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
6531 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
6532 0 : return;
6533 : }
6534 :
6535 2 : bdev_compare_and_write_do_compare(bdev_io);
6536 : }
6537 :
6538 : int
6539 2 : spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6540 : struct iovec *compare_iov, int compare_iovcnt,
6541 : struct iovec *write_iov, int write_iovcnt,
6542 : uint64_t offset_blocks, uint64_t num_blocks,
6543 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6544 : {
6545 2 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6546 : struct spdk_bdev_io *bdev_io;
6547 2 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6548 :
6549 2 : if (!desc->write) {
6550 0 : return -EBADF;
6551 : }
6552 :
6553 2 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6554 0 : return -EINVAL;
6555 : }
6556 :
6557 2 : if (num_blocks > bdev->acwu) {
6558 0 : return -EINVAL;
6559 : }
6560 :
6561 2 : bdev_io = bdev_channel_get_io(channel);
6562 2 : if (!bdev_io) {
6563 0 : return -ENOMEM;
6564 : }
6565 :
6566 2 : bdev_io->internal.ch = channel;
6567 2 : bdev_io->internal.desc = desc;
6568 2 : bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
6569 2 : bdev_io->u.bdev.iovs = compare_iov;
6570 2 : bdev_io->u.bdev.iovcnt = compare_iovcnt;
6571 2 : bdev_io->u.bdev.fused_iovs = write_iov;
6572 2 : bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
6573 2 : bdev_io->u.bdev.md_buf = NULL;
6574 2 : bdev_io->u.bdev.num_blocks = num_blocks;
6575 2 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6576 2 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6577 2 : bdev_io->u.bdev.memory_domain = NULL;
6578 2 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6579 2 : bdev_io->u.bdev.accel_sequence = NULL;
6580 :
6581 2 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
6582 0 : bdev_io_submit(bdev_io);
6583 0 : return 0;
6584 : }
6585 :
6586 2 : return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
6587 : bdev_comparev_and_writev_blocks_locked, bdev_io);
6588 : }
6589 :
6590 : int
6591 2 : spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6592 : struct iovec *iov, int iovcnt,
6593 : uint64_t offset_blocks, uint64_t num_blocks,
6594 : bool populate,
6595 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6596 : {
6597 2 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6598 : struct spdk_bdev_io *bdev_io;
6599 2 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6600 :
6601 2 : if (!desc->write) {
6602 0 : return -EBADF;
6603 : }
6604 :
6605 2 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6606 0 : return -EINVAL;
6607 : }
6608 :
6609 2 : if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
6610 0 : return -ENOTSUP;
6611 : }
6612 :
6613 2 : bdev_io = bdev_channel_get_io(channel);
6614 2 : if (!bdev_io) {
6615 0 : return -ENOMEM;
6616 : }
6617 :
6618 2 : bdev_io->internal.ch = channel;
6619 2 : bdev_io->internal.desc = desc;
6620 2 : bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
6621 2 : bdev_io->u.bdev.num_blocks = num_blocks;
6622 2 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6623 2 : bdev_io->u.bdev.iovs = iov;
6624 2 : bdev_io->u.bdev.iovcnt = iovcnt;
6625 2 : bdev_io->u.bdev.md_buf = NULL;
6626 2 : bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
6627 2 : bdev_io->u.bdev.zcopy.commit = 0;
6628 2 : bdev_io->u.bdev.zcopy.start = 1;
6629 2 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6630 2 : bdev_io->u.bdev.memory_domain = NULL;
6631 2 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6632 2 : bdev_io->u.bdev.accel_sequence = NULL;
6633 :
6634 2 : bdev_io_submit(bdev_io);
6635 :
6636 2 : return 0;
6637 : }
6638 :
6639 : int
6640 2 : spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
6641 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6642 : {
6643 2 : if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
6644 0 : return -EINVAL;
6645 : }
6646 :
6647 2 : bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
6648 2 : bdev_io->u.bdev.zcopy.start = 0;
6649 2 : bdev_io->internal.caller_ctx = cb_arg;
6650 2 : bdev_io->internal.cb = cb;
6651 2 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6652 :
6653 2 : bdev_io_submit(bdev_io);
6654 :
6655 2 : return 0;
6656 : }
6657 :
6658 : int
6659 0 : spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6660 : uint64_t offset, uint64_t len,
6661 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6662 : {
6663 0 : uint64_t offset_blocks, num_blocks;
6664 :
6665 0 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
6666 0 : return -EINVAL;
6667 : }
6668 :
6669 0 : return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6670 : }
6671 :
6672 : int
6673 33 : spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6674 : uint64_t offset_blocks, uint64_t num_blocks,
6675 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6676 : {
6677 33 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6678 : struct spdk_bdev_io *bdev_io;
6679 33 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6680 :
6681 33 : if (!desc->write) {
6682 0 : return -EBADF;
6683 : }
6684 :
6685 33 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6686 0 : return -EINVAL;
6687 : }
6688 :
6689 33 : if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6690 10 : !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6691 1 : return -ENOTSUP;
6692 : }
6693 :
6694 32 : bdev_io = bdev_channel_get_io(channel);
6695 :
6696 32 : if (!bdev_io) {
6697 0 : return -ENOMEM;
6698 : }
6699 :
6700 32 : bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6701 32 : bdev_io->internal.ch = channel;
6702 32 : bdev_io->internal.desc = desc;
6703 32 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6704 32 : bdev_io->u.bdev.num_blocks = num_blocks;
6705 32 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6706 32 : bdev_io->u.bdev.memory_domain = NULL;
6707 32 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6708 32 : bdev_io->u.bdev.accel_sequence = NULL;
6709 :
6710 : /* If the write_zeroes size is large and should be split, use the generic split
6711 : * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not.
6712 : *
6713 : * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6714 : * or emulate it using regular write request otherwise.
6715 : */
6716 32 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) ||
6717 : bdev_io->internal.f.split) {
6718 26 : bdev_io_submit(bdev_io);
6719 26 : return 0;
6720 : }
6721 :
6722 6 : assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
6723 :
6724 6 : return bdev_write_zero_buffer(bdev_io);
6725 : }
6726 :
6727 : int
6728 0 : spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6729 : uint64_t offset, uint64_t nbytes,
6730 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6731 : {
6732 0 : uint64_t offset_blocks, num_blocks;
6733 :
6734 0 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
6735 0 : return -EINVAL;
6736 : }
6737 :
6738 0 : return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6739 : }
6740 :
6741 : static void
6742 0 : bdev_io_complete_cb(void *ctx)
6743 : {
6744 0 : struct spdk_bdev_io *bdev_io = ctx;
6745 :
6746 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6747 0 : bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
6748 0 : }
6749 :
6750 : int
6751 22 : spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6752 : uint64_t offset_blocks, uint64_t num_blocks,
6753 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6754 : {
6755 22 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6756 : struct spdk_bdev_io *bdev_io;
6757 22 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6758 :
6759 22 : if (!desc->write) {
6760 0 : return -EBADF;
6761 : }
6762 :
6763 22 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6764 0 : return -EINVAL;
6765 : }
6766 :
6767 22 : bdev_io = bdev_channel_get_io(channel);
6768 22 : if (!bdev_io) {
6769 0 : return -ENOMEM;
6770 : }
6771 :
6772 22 : bdev_io->internal.ch = channel;
6773 22 : bdev_io->internal.desc = desc;
6774 22 : bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
6775 :
6776 22 : bdev_io->u.bdev.iovs = &bdev_io->iov;
6777 22 : bdev_io->u.bdev.iovs[0].iov_base = NULL;
6778 22 : bdev_io->u.bdev.iovs[0].iov_len = 0;
6779 22 : bdev_io->u.bdev.iovcnt = 1;
6780 :
6781 22 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6782 22 : bdev_io->u.bdev.num_blocks = num_blocks;
6783 22 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6784 22 : bdev_io->u.bdev.memory_domain = NULL;
6785 22 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6786 22 : bdev_io->u.bdev.accel_sequence = NULL;
6787 :
6788 22 : if (num_blocks == 0) {
6789 0 : spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
6790 0 : return 0;
6791 : }
6792 :
6793 22 : bdev_io_submit(bdev_io);
6794 22 : return 0;
6795 : }
6796 :
6797 : int
6798 0 : spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6799 : uint64_t offset, uint64_t length,
6800 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6801 : {
6802 0 : uint64_t offset_blocks, num_blocks;
6803 :
6804 0 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) {
6805 0 : return -EINVAL;
6806 : }
6807 :
6808 0 : return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6809 : }
6810 :
6811 : int
6812 2 : spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6813 : uint64_t offset_blocks, uint64_t num_blocks,
6814 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6815 : {
6816 2 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6817 : struct spdk_bdev_io *bdev_io;
6818 2 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6819 :
6820 2 : if (!desc->write) {
6821 0 : return -EBADF;
6822 : }
6823 :
6824 2 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) {
6825 0 : return -ENOTSUP;
6826 : }
6827 :
6828 2 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6829 0 : return -EINVAL;
6830 : }
6831 :
6832 2 : bdev_io = bdev_channel_get_io(channel);
6833 2 : if (!bdev_io) {
6834 0 : return -ENOMEM;
6835 : }
6836 :
6837 2 : bdev_io->internal.ch = channel;
6838 2 : bdev_io->internal.desc = desc;
6839 2 : bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6840 2 : bdev_io->u.bdev.iovs = NULL;
6841 2 : bdev_io->u.bdev.iovcnt = 0;
6842 2 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6843 2 : bdev_io->u.bdev.num_blocks = num_blocks;
6844 2 : bdev_io->u.bdev.memory_domain = NULL;
6845 2 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6846 2 : bdev_io->u.bdev.accel_sequence = NULL;
6847 2 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6848 :
6849 2 : bdev_io_submit(bdev_io);
6850 2 : return 0;
6851 : }
6852 :
6853 : static int bdev_reset_poll_for_outstanding_io(void *ctx);
6854 :
6855 : static void
6856 13 : bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6857 : {
6858 13 : struct spdk_bdev_io *bdev_io = _ctx;
6859 13 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
6860 :
6861 13 : if (status == -EBUSY) {
6862 9 : if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6863 8 : bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6864 : bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6865 : } else {
6866 1 : if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6867 : /* If outstanding IOs are still present and reset_io_drain_timeout
6868 : * seconds passed, start the reset. */
6869 1 : bdev_io_submit_reset(bdev_io);
6870 : } else {
6871 : /* We still have in progress memory domain pull/push or we're
6872 : * executing accel sequence. Since we cannot abort either of those
6873 : * operations, fail the reset request. */
6874 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6875 : }
6876 : }
6877 : } else {
6878 4 : SPDK_DEBUGLOG(bdev,
6879 : "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6880 : ch->bdev->name);
6881 : /* Mark the completion status as a SUCCESS and complete the reset. */
6882 4 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6883 : }
6884 13 : }
6885 :
6886 : static void
6887 13 : bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6888 : struct spdk_io_channel *io_ch, void *_ctx)
6889 : {
6890 13 : struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6891 13 : int status = 0;
6892 :
6893 13 : if (cur_ch->io_outstanding > 0 ||
6894 4 : !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6895 4 : !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6896 : /* If a channel has outstanding IO, set status to -EBUSY code. This will stop
6897 : * further iteration over the rest of the channels and pass non-zero status
6898 : * to the callback function. */
6899 9 : status = -EBUSY;
6900 : }
6901 13 : spdk_bdev_for_each_channel_continue(i, status);
6902 13 : }
6903 :
6904 : static int
6905 8 : bdev_reset_poll_for_outstanding_io(void *ctx)
6906 : {
6907 8 : struct spdk_bdev_io *bdev_io = ctx;
6908 :
6909 8 : spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
6910 8 : spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io,
6911 : bdev_reset_check_outstanding_io_done);
6912 :
6913 8 : return SPDK_POLLER_BUSY;
6914 : }
6915 :
6916 : static void
6917 17 : bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
6918 : {
6919 17 : struct spdk_bdev_io *bdev_io = _ctx;
6920 :
6921 17 : if (bdev->reset_io_drain_timeout == 0) {
6922 12 : bdev_io_submit_reset(bdev_io);
6923 12 : return;
6924 : }
6925 :
6926 10 : bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
6927 5 : (bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
6928 :
6929 : /* In case bdev->reset_io_drain_timeout is not equal to zero,
6930 : * submit the reset to the underlying module only if outstanding I/O
6931 : * remain after reset_io_drain_timeout seconds have passed. */
6932 5 : spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io,
6933 : bdev_reset_check_outstanding_io_done);
6934 : }
6935 :
6936 : static void
6937 20 : bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6938 : struct spdk_io_channel *ch, void *_ctx)
6939 : {
6940 : struct spdk_bdev_channel *channel;
6941 : struct spdk_bdev_mgmt_channel *mgmt_channel;
6942 : struct spdk_bdev_shared_resource *shared_resource;
6943 :
6944 20 : channel = __io_ch_to_bdev_ch(ch);
6945 20 : shared_resource = channel->shared_resource;
6946 20 : mgmt_channel = shared_resource->mgmt_ch;
6947 :
6948 20 : channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
6949 :
6950 : /**
6951 : * Abort nomem I/Os first so that aborting other queued I/Os won't resubmit
6952 : * nomem I/Os of this channel.
6953 : */
6954 20 : bdev_abort_all_nomem_io(channel);
6955 20 : bdev_abort_all_buf_io(mgmt_channel, channel);
6956 :
6957 20 : if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
6958 2 : bdev_abort_all_queued_io(&channel->qos_queued_io, channel);
6959 : }
6960 :
6961 20 : spdk_bdev_for_each_channel_continue(i, 0);
6962 20 : }
6963 :
6964 : static void
6965 19 : bdev_start_reset(struct spdk_bdev_io *bdev_io)
6966 : {
6967 19 : struct spdk_bdev *bdev = bdev_io->bdev;
6968 19 : bool freeze_channel = false;
6969 :
6970 19 : bdev_ch_add_to_io_submitted(bdev_io);
6971 :
6972 : /**
6973 : * Take a channel reference for the target bdev for the life of this
6974 : * reset. This guards against the channel getting destroyed before
6975 : * the reset is completed. We will release the reference when this
6976 : * reset is completed.
6977 : */
6978 19 : bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
6979 :
6980 19 : spdk_spin_lock(&bdev->internal.spinlock);
6981 19 : if (bdev->internal.reset_in_progress == NULL) {
6982 17 : bdev->internal.reset_in_progress = bdev_io;
6983 17 : freeze_channel = true;
6984 : } else {
6985 2 : TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link);
6986 : }
6987 19 : spdk_spin_unlock(&bdev->internal.spinlock);
6988 :
6989 19 : if (freeze_channel) {
6990 17 : spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io,
6991 : bdev_reset_freeze_channel_done);
6992 : }
6993 19 : }
6994 :
6995 : int
6996 19 : spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6997 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6998 : {
6999 19 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7000 : struct spdk_bdev_io *bdev_io;
7001 19 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7002 :
7003 19 : bdev_io = bdev_channel_get_io(channel);
7004 19 : if (!bdev_io) {
7005 0 : return -ENOMEM;
7006 : }
7007 :
7008 19 : bdev_io->internal.ch = channel;
7009 19 : bdev_io->internal.desc = desc;
7010 19 : bdev_io->internal.submit_tsc = spdk_get_ticks();
7011 19 : bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
7012 19 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7013 :
7014 19 : bdev_start_reset(bdev_io);
7015 19 : return 0;
7016 : }
7017 :
7018 : void
7019 0 : spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
7020 : struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode)
7021 : {
7022 0 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7023 :
7024 0 : bdev_get_io_stat(stat, channel->stat);
7025 0 : spdk_bdev_reset_io_stat(channel->stat, reset_mode);
7026 0 : }
7027 :
7028 : static void
7029 5 : bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
7030 : {
7031 5 : struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
7032 :
7033 5 : bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
7034 : bdev_iostat_ctx->cb_arg, 0);
7035 5 : free(bdev_iostat_ctx);
7036 5 : }
7037 :
7038 : static void
7039 4 : bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7040 : struct spdk_io_channel *ch, void *_ctx)
7041 : {
7042 4 : struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
7043 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7044 :
7045 4 : spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat);
7046 4 : spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode);
7047 4 : spdk_bdev_for_each_channel_continue(i, 0);
7048 4 : }
7049 :
7050 : void
7051 5 : spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
7052 : enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg)
7053 : {
7054 : struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
7055 :
7056 5 : assert(bdev != NULL);
7057 5 : assert(stat != NULL);
7058 5 : assert(cb != NULL);
7059 :
7060 5 : bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
7061 5 : if (bdev_iostat_ctx == NULL) {
7062 0 : SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
7063 0 : cb(bdev, stat, cb_arg, -ENOMEM);
7064 0 : return;
7065 : }
7066 :
7067 5 : bdev_iostat_ctx->stat = stat;
7068 5 : bdev_iostat_ctx->cb = cb;
7069 5 : bdev_iostat_ctx->cb_arg = cb_arg;
7070 5 : bdev_iostat_ctx->reset_mode = reset_mode;
7071 :
7072 : /* Start with the statistics from previously deleted channels. */
7073 5 : spdk_spin_lock(&bdev->internal.spinlock);
7074 5 : bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat);
7075 5 : spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode);
7076 5 : spdk_spin_unlock(&bdev->internal.spinlock);
7077 :
7078 : /* Then iterate and add the statistics from each existing channel. */
7079 5 : spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
7080 : bdev_get_device_stat_done);
7081 : }
7082 :
7083 : struct bdev_iostat_reset_ctx {
7084 : enum spdk_bdev_reset_stat_mode mode;
7085 : bdev_reset_device_stat_cb cb;
7086 : void *cb_arg;
7087 : };
7088 :
7089 : static void
7090 0 : bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
7091 : {
7092 0 : struct bdev_iostat_reset_ctx *ctx = _ctx;
7093 :
7094 0 : ctx->cb(bdev, ctx->cb_arg, 0);
7095 :
7096 0 : free(ctx);
7097 0 : }
7098 :
7099 : static void
7100 0 : bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7101 : struct spdk_io_channel *ch, void *_ctx)
7102 : {
7103 0 : struct bdev_iostat_reset_ctx *ctx = _ctx;
7104 0 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7105 :
7106 0 : spdk_bdev_reset_io_stat(channel->stat, ctx->mode);
7107 :
7108 0 : spdk_bdev_for_each_channel_continue(i, 0);
7109 0 : }
7110 :
7111 : void
7112 0 : bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode,
7113 : bdev_reset_device_stat_cb cb, void *cb_arg)
7114 : {
7115 : struct bdev_iostat_reset_ctx *ctx;
7116 :
7117 0 : assert(bdev != NULL);
7118 0 : assert(cb != NULL);
7119 :
7120 0 : ctx = calloc(1, sizeof(*ctx));
7121 0 : if (ctx == NULL) {
7122 0 : SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
7123 0 : cb(bdev, cb_arg, -ENOMEM);
7124 0 : return;
7125 : }
7126 :
7127 0 : ctx->mode = mode;
7128 0 : ctx->cb = cb;
7129 0 : ctx->cb_arg = cb_arg;
7130 :
7131 0 : spdk_spin_lock(&bdev->internal.spinlock);
7132 0 : spdk_bdev_reset_io_stat(bdev->internal.stat, mode);
7133 0 : spdk_spin_unlock(&bdev->internal.spinlock);
7134 :
7135 0 : spdk_bdev_for_each_channel(bdev,
7136 : bdev_reset_each_channel_stat,
7137 : ctx,
7138 : bdev_reset_device_stat_done);
7139 : }
7140 :
7141 : int
7142 1 : spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7143 : const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
7144 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7145 : {
7146 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7147 : struct spdk_bdev_io *bdev_io;
7148 1 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7149 :
7150 1 : if (!desc->write) {
7151 0 : return -EBADF;
7152 : }
7153 :
7154 1 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
7155 1 : return -ENOTSUP;
7156 : }
7157 :
7158 0 : bdev_io = bdev_channel_get_io(channel);
7159 0 : if (!bdev_io) {
7160 0 : return -ENOMEM;
7161 : }
7162 :
7163 0 : bdev_io->internal.ch = channel;
7164 0 : bdev_io->internal.desc = desc;
7165 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
7166 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7167 0 : bdev_io->u.nvme_passthru.buf = buf;
7168 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7169 0 : bdev_io->u.nvme_passthru.md_buf = NULL;
7170 0 : bdev_io->u.nvme_passthru.md_len = 0;
7171 :
7172 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7173 :
7174 0 : bdev_io_submit(bdev_io);
7175 0 : return 0;
7176 : }
7177 :
7178 : int
7179 1 : spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7180 : const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
7181 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7182 : {
7183 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7184 : struct spdk_bdev_io *bdev_io;
7185 1 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7186 :
7187 1 : if (!desc->write) {
7188 : /*
7189 : * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7190 : * to easily determine if the command is a read or write, but for now just
7191 : * do not allow io_passthru with a read-only descriptor.
7192 : */
7193 0 : return -EBADF;
7194 : }
7195 :
7196 1 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
7197 1 : return -ENOTSUP;
7198 : }
7199 :
7200 0 : bdev_io = bdev_channel_get_io(channel);
7201 0 : if (!bdev_io) {
7202 0 : return -ENOMEM;
7203 : }
7204 :
7205 0 : bdev_io->internal.ch = channel;
7206 0 : bdev_io->internal.desc = desc;
7207 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
7208 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7209 0 : bdev_io->u.nvme_passthru.buf = buf;
7210 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7211 0 : bdev_io->u.nvme_passthru.md_buf = NULL;
7212 0 : bdev_io->u.nvme_passthru.md_len = 0;
7213 :
7214 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7215 :
7216 0 : bdev_io_submit(bdev_io);
7217 0 : return 0;
7218 : }
7219 :
7220 : int
7221 1 : spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7222 : const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
7223 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7224 : {
7225 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7226 : struct spdk_bdev_io *bdev_io;
7227 1 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7228 :
7229 1 : if (!desc->write) {
7230 : /*
7231 : * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7232 : * to easily determine if the command is a read or write, but for now just
7233 : * do not allow io_passthru with a read-only descriptor.
7234 : */
7235 0 : return -EBADF;
7236 : }
7237 :
7238 1 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
7239 1 : return -ENOTSUP;
7240 : }
7241 :
7242 0 : bdev_io = bdev_channel_get_io(channel);
7243 0 : if (!bdev_io) {
7244 0 : return -ENOMEM;
7245 : }
7246 :
7247 0 : bdev_io->internal.ch = channel;
7248 0 : bdev_io->internal.desc = desc;
7249 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
7250 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7251 0 : bdev_io->u.nvme_passthru.buf = buf;
7252 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7253 0 : bdev_io->u.nvme_passthru.md_buf = md_buf;
7254 0 : bdev_io->u.nvme_passthru.md_len = md_len;
7255 :
7256 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7257 :
7258 0 : bdev_io_submit(bdev_io);
7259 0 : return 0;
7260 : }
7261 :
7262 : int
7263 0 : spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc,
7264 : struct spdk_io_channel *ch,
7265 : const struct spdk_nvme_cmd *cmd,
7266 : struct iovec *iov, int iovcnt, size_t nbytes,
7267 : void *md_buf, size_t md_len,
7268 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7269 : {
7270 0 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7271 : struct spdk_bdev_io *bdev_io;
7272 0 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7273 :
7274 0 : if (!desc->write) {
7275 : /*
7276 : * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7277 : * to easily determine if the command is a read or write, but for now just
7278 : * do not allow io_passthru with a read-only descriptor.
7279 : */
7280 0 : return -EBADF;
7281 : }
7282 :
7283 0 : if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
7284 0 : return -ENOTSUP;
7285 0 : } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
7286 0 : return -ENOTSUP;
7287 : }
7288 :
7289 0 : bdev_io = bdev_channel_get_io(channel);
7290 0 : if (!bdev_io) {
7291 0 : return -ENOMEM;
7292 : }
7293 :
7294 0 : bdev_io->internal.ch = channel;
7295 0 : bdev_io->internal.desc = desc;
7296 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD;
7297 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7298 0 : bdev_io->u.nvme_passthru.iovs = iov;
7299 0 : bdev_io->u.nvme_passthru.iovcnt = iovcnt;
7300 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7301 0 : bdev_io->u.nvme_passthru.md_buf = md_buf;
7302 0 : bdev_io->u.nvme_passthru.md_len = md_len;
7303 :
7304 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7305 :
7306 0 : bdev_io_submit(bdev_io);
7307 0 : return 0;
7308 : }
7309 :
7310 : static void bdev_abort_retry(void *ctx);
7311 : static void bdev_abort(struct spdk_bdev_io *parent_io);
7312 :
7313 : static void
7314 22 : bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
7315 : {
7316 22 : struct spdk_bdev_channel *channel = bdev_io->internal.ch;
7317 22 : struct spdk_bdev_io *parent_io = cb_arg;
7318 : struct spdk_bdev_io *bio_to_abort, *tmp_io;
7319 :
7320 22 : bio_to_abort = bdev_io->u.abort.bio_to_abort;
7321 :
7322 22 : spdk_bdev_free_io(bdev_io);
7323 :
7324 22 : if (!success) {
7325 : /* Check if the target I/O completed in the meantime. */
7326 2 : TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
7327 1 : if (tmp_io == bio_to_abort) {
7328 0 : break;
7329 : }
7330 : }
7331 :
7332 : /* If the target I/O still exists, set the parent to failed. */
7333 1 : if (tmp_io != NULL) {
7334 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7335 : }
7336 : }
7337 :
7338 22 : assert(parent_io->internal.f.split);
7339 :
7340 22 : parent_io->internal.split.outstanding--;
7341 22 : if (parent_io->internal.split.outstanding == 0) {
7342 16 : if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7343 0 : bdev_abort_retry(parent_io);
7344 : } else {
7345 16 : bdev_io_complete(parent_io);
7346 : }
7347 : }
7348 22 : }
7349 :
7350 : static int
7351 23 : bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
7352 : struct spdk_bdev_io *bio_to_abort,
7353 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7354 : {
7355 23 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7356 : struct spdk_bdev_io *bdev_io;
7357 :
7358 23 : if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
7359 23 : bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
7360 : /* TODO: Abort reset or abort request. */
7361 0 : return -ENOTSUP;
7362 : }
7363 :
7364 23 : bdev_io = bdev_channel_get_io(channel);
7365 23 : if (bdev_io == NULL) {
7366 1 : return -ENOMEM;
7367 : }
7368 :
7369 22 : bdev_io->internal.ch = channel;
7370 22 : bdev_io->internal.desc = desc;
7371 22 : bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7372 22 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7373 :
7374 22 : if (bio_to_abort->internal.f.split) {
7375 6 : assert(bdev_io_should_split(bio_to_abort));
7376 6 : bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
7377 :
7378 : /* Parent abort request is not submitted directly, but to manage its
7379 : * execution add it to the submitted list here.
7380 : */
7381 6 : bdev_io->internal.submit_tsc = spdk_get_ticks();
7382 6 : bdev_ch_add_to_io_submitted(bdev_io);
7383 :
7384 6 : bdev_abort(bdev_io);
7385 :
7386 6 : return 0;
7387 : }
7388 :
7389 16 : bdev_io->u.abort.bio_to_abort = bio_to_abort;
7390 :
7391 : /* Submit the abort request to the underlying bdev module. */
7392 16 : bdev_io_submit(bdev_io);
7393 :
7394 16 : return 0;
7395 : }
7396 :
7397 : static bool
7398 46 : bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
7399 : {
7400 : struct spdk_bdev_io *iter;
7401 :
7402 46 : TAILQ_FOREACH(iter, tailq, internal.link) {
7403 0 : if (iter == bdev_io) {
7404 0 : return true;
7405 : }
7406 : }
7407 :
7408 46 : return false;
7409 : }
7410 :
7411 : static uint32_t
7412 18 : _bdev_abort(struct spdk_bdev_io *parent_io)
7413 : {
7414 18 : struct spdk_bdev_desc *desc = parent_io->internal.desc;
7415 18 : struct spdk_bdev_channel *channel = parent_io->internal.ch;
7416 : void *bio_cb_arg;
7417 : struct spdk_bdev_io *bio_to_abort;
7418 : uint32_t matched_ios;
7419 : int rc;
7420 :
7421 18 : bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
7422 :
7423 : /* matched_ios is returned and will be kept by the caller.
7424 : *
7425 : * This function will be used for two cases, 1) the same cb_arg is used for
7426 : * multiple I/Os, 2) a single large I/O is split into smaller ones.
7427 : * Incrementing split_outstanding directly here may confuse readers especially
7428 : * for the 1st case.
7429 : *
7430 : * Completion of I/O abort is processed after stack unwinding. Hence this trick
7431 : * works as expected.
7432 : */
7433 18 : matched_ios = 0;
7434 18 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7435 :
7436 105 : TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
7437 88 : if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
7438 65 : continue;
7439 : }
7440 :
7441 23 : if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
7442 : /* Any I/O which was submitted after this abort command should be excluded. */
7443 0 : continue;
7444 : }
7445 :
7446 : /* We can't abort a request that's being pushed/pulled or executed by accel */
7447 46 : if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
7448 23 : bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
7449 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7450 0 : break;
7451 : }
7452 :
7453 23 : rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
7454 23 : if (rc != 0) {
7455 1 : if (rc == -ENOMEM) {
7456 1 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
7457 : } else {
7458 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7459 : }
7460 1 : break;
7461 : }
7462 22 : matched_ios++;
7463 : }
7464 :
7465 18 : return matched_ios;
7466 : }
7467 :
7468 : static void
7469 1 : bdev_abort_retry(void *ctx)
7470 : {
7471 1 : struct spdk_bdev_io *parent_io = ctx;
7472 : uint32_t matched_ios;
7473 :
7474 1 : matched_ios = _bdev_abort(parent_io);
7475 :
7476 1 : if (matched_ios == 0) {
7477 0 : if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7478 0 : bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7479 : } else {
7480 : /* For retry, the case that no target I/O was found is success
7481 : * because it means target I/Os completed in the meantime.
7482 : */
7483 0 : bdev_io_complete(parent_io);
7484 : }
7485 0 : return;
7486 : }
7487 :
7488 : /* Use split_outstanding to manage the progress of aborting I/Os. */
7489 1 : parent_io->internal.f.split = true;
7490 1 : parent_io->internal.split.outstanding = matched_ios;
7491 : }
7492 :
7493 : static void
7494 17 : bdev_abort(struct spdk_bdev_io *parent_io)
7495 : {
7496 : uint32_t matched_ios;
7497 :
7498 17 : matched_ios = _bdev_abort(parent_io);
7499 :
7500 17 : if (matched_ios == 0) {
7501 2 : if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7502 1 : bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7503 : } else {
7504 : /* The case the no target I/O was found is failure. */
7505 1 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7506 1 : bdev_io_complete(parent_io);
7507 : }
7508 2 : return;
7509 : }
7510 :
7511 : /* Use split_outstanding to manage the progress of aborting I/Os. */
7512 15 : parent_io->internal.f.split = true;
7513 15 : parent_io->internal.split.outstanding = matched_ios;
7514 : }
7515 :
7516 : int
7517 12 : spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7518 : void *bio_cb_arg,
7519 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7520 : {
7521 12 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7522 12 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7523 : struct spdk_bdev_io *bdev_io;
7524 :
7525 12 : if (bio_cb_arg == NULL) {
7526 0 : return -EINVAL;
7527 : }
7528 :
7529 12 : if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
7530 1 : return -ENOTSUP;
7531 : }
7532 :
7533 11 : bdev_io = bdev_channel_get_io(channel);
7534 11 : if (bdev_io == NULL) {
7535 0 : return -ENOMEM;
7536 : }
7537 :
7538 11 : bdev_io->internal.ch = channel;
7539 11 : bdev_io->internal.desc = desc;
7540 11 : bdev_io->internal.submit_tsc = spdk_get_ticks();
7541 11 : bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7542 11 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7543 :
7544 11 : bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
7545 :
7546 : /* Parent abort request is not submitted directly, but to manage its execution,
7547 : * add it to the submitted list here.
7548 : */
7549 11 : bdev_ch_add_to_io_submitted(bdev_io);
7550 :
7551 11 : bdev_abort(bdev_io);
7552 :
7553 11 : return 0;
7554 : }
7555 :
7556 : int
7557 4 : spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
7558 : struct spdk_bdev_io_wait_entry *entry)
7559 : {
7560 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7561 4 : struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
7562 :
7563 4 : if (bdev != entry->bdev) {
7564 0 : SPDK_ERRLOG("bdevs do not match\n");
7565 0 : return -EINVAL;
7566 : }
7567 :
7568 4 : if (mgmt_ch->per_thread_cache_count > 0) {
7569 0 : SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
7570 0 : return -EINVAL;
7571 : }
7572 :
7573 4 : TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
7574 4 : return 0;
7575 : }
7576 :
7577 : static inline void
7578 629 : bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
7579 : {
7580 629 : enum spdk_bdev_io_status io_status = bdev_io->internal.status;
7581 629 : struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
7582 629 : uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
7583 629 : uint32_t blocklen = bdev_io->bdev->blocklen;
7584 :
7585 629 : if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7586 520 : switch (bdev_io->type) {
7587 321 : case SPDK_BDEV_IO_TYPE_READ:
7588 321 : io_stat->bytes_read += num_blocks * blocklen;
7589 321 : io_stat->num_read_ops++;
7590 321 : io_stat->read_latency_ticks += tsc_diff;
7591 321 : if (io_stat->max_read_latency_ticks < tsc_diff) {
7592 7 : io_stat->max_read_latency_ticks = tsc_diff;
7593 : }
7594 321 : if (io_stat->min_read_latency_ticks > tsc_diff) {
7595 42 : io_stat->min_read_latency_ticks = tsc_diff;
7596 : }
7597 321 : break;
7598 75 : case SPDK_BDEV_IO_TYPE_WRITE:
7599 75 : io_stat->bytes_written += num_blocks * blocklen;
7600 75 : io_stat->num_write_ops++;
7601 75 : io_stat->write_latency_ticks += tsc_diff;
7602 75 : if (io_stat->max_write_latency_ticks < tsc_diff) {
7603 4 : io_stat->max_write_latency_ticks = tsc_diff;
7604 : }
7605 75 : if (io_stat->min_write_latency_ticks > tsc_diff) {
7606 25 : io_stat->min_write_latency_ticks = tsc_diff;
7607 : }
7608 75 : break;
7609 20 : case SPDK_BDEV_IO_TYPE_UNMAP:
7610 20 : io_stat->bytes_unmapped += num_blocks * blocklen;
7611 20 : io_stat->num_unmap_ops++;
7612 20 : io_stat->unmap_latency_ticks += tsc_diff;
7613 20 : if (io_stat->max_unmap_latency_ticks < tsc_diff) {
7614 0 : io_stat->max_unmap_latency_ticks = tsc_diff;
7615 : }
7616 20 : if (io_stat->min_unmap_latency_ticks > tsc_diff) {
7617 3 : io_stat->min_unmap_latency_ticks = tsc_diff;
7618 : }
7619 20 : break;
7620 4 : case SPDK_BDEV_IO_TYPE_ZCOPY:
7621 : /* Track the data in the start phase only */
7622 4 : if (bdev_io->u.bdev.zcopy.start) {
7623 2 : if (bdev_io->u.bdev.zcopy.populate) {
7624 1 : io_stat->bytes_read += num_blocks * blocklen;
7625 1 : io_stat->num_read_ops++;
7626 1 : io_stat->read_latency_ticks += tsc_diff;
7627 1 : if (io_stat->max_read_latency_ticks < tsc_diff) {
7628 0 : io_stat->max_read_latency_ticks = tsc_diff;
7629 : }
7630 1 : if (io_stat->min_read_latency_ticks > tsc_diff) {
7631 1 : io_stat->min_read_latency_ticks = tsc_diff;
7632 : }
7633 : } else {
7634 1 : io_stat->bytes_written += num_blocks * blocklen;
7635 1 : io_stat->num_write_ops++;
7636 1 : io_stat->write_latency_ticks += tsc_diff;
7637 1 : if (io_stat->max_write_latency_ticks < tsc_diff) {
7638 0 : io_stat->max_write_latency_ticks = tsc_diff;
7639 : }
7640 1 : if (io_stat->min_write_latency_ticks > tsc_diff) {
7641 1 : io_stat->min_write_latency_ticks = tsc_diff;
7642 : }
7643 : }
7644 : }
7645 4 : break;
7646 21 : case SPDK_BDEV_IO_TYPE_COPY:
7647 21 : io_stat->bytes_copied += num_blocks * blocklen;
7648 21 : io_stat->num_copy_ops++;
7649 21 : bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff;
7650 21 : if (io_stat->max_copy_latency_ticks < tsc_diff) {
7651 0 : io_stat->max_copy_latency_ticks = tsc_diff;
7652 : }
7653 21 : if (io_stat->min_copy_latency_ticks > tsc_diff) {
7654 4 : io_stat->min_copy_latency_ticks = tsc_diff;
7655 : }
7656 21 : break;
7657 79 : default:
7658 79 : break;
7659 : }
7660 109 : } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) {
7661 109 : io_stat = bdev_io->bdev->internal.stat;
7662 109 : assert(io_stat->io_error != NULL);
7663 :
7664 109 : spdk_spin_lock(&bdev_io->bdev->internal.spinlock);
7665 109 : io_stat->io_error->error_status[-io_status - 1]++;
7666 109 : spdk_spin_unlock(&bdev_io->bdev->internal.spinlock);
7667 : }
7668 :
7669 : #ifdef SPDK_CONFIG_VTUNE
7670 : uint64_t now_tsc = spdk_get_ticks();
7671 : if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
7672 : uint64_t data[5];
7673 : struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
7674 :
7675 : data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
7676 : data[1] = io_stat->bytes_read - prev_stat->bytes_read;
7677 : data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
7678 : data[3] = io_stat->bytes_written - prev_stat->bytes_written;
7679 : data[4] = bdev_io->bdev->fn_table->get_spin_time ?
7680 : bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
7681 :
7682 : __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
7683 : __itt_metadata_u64, 5, data);
7684 :
7685 : memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
7686 : bdev_io->internal.ch->start_tsc = now_tsc;
7687 : }
7688 : #endif
7689 629 : }
7690 :
7691 : static inline void
7692 629 : _bdev_io_complete(void *ctx)
7693 : {
7694 629 : struct spdk_bdev_io *bdev_io = ctx;
7695 :
7696 629 : if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) {
7697 0 : assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7698 0 : spdk_accel_sequence_abort(bdev_io->internal.accel_sequence);
7699 : }
7700 :
7701 629 : assert(bdev_io->internal.cb != NULL);
7702 629 : assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
7703 :
7704 629 : bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
7705 : bdev_io->internal.caller_ctx);
7706 629 : }
7707 :
7708 : static inline void
7709 637 : bdev_io_complete(void *ctx)
7710 : {
7711 637 : struct spdk_bdev_io *bdev_io = ctx;
7712 637 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7713 : uint64_t tsc, tsc_diff;
7714 :
7715 637 : if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) {
7716 : /*
7717 : * Defer completion to avoid potential infinite recursion if the
7718 : * user's completion callback issues a new I/O.
7719 : */
7720 8 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7721 : bdev_io_complete, bdev_io);
7722 8 : return;
7723 : }
7724 :
7725 629 : tsc = spdk_get_ticks();
7726 629 : tsc_diff = tsc - bdev_io->internal.submit_tsc;
7727 :
7728 629 : bdev_ch_remove_from_io_submitted(bdev_io);
7729 629 : spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io,
7730 : bdev_io->internal.caller_ctx, bdev_ch->queue_depth);
7731 :
7732 629 : if (bdev_ch->histogram) {
7733 4 : if (bdev_io->bdev->internal.histogram_io_type == 0 ||
7734 0 : bdev_io->bdev->internal.histogram_io_type == bdev_io->type) {
7735 : /*
7736 : * Tally all I/O types if the histogram_io_type is set to 0.
7737 : */
7738 4 : spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff);
7739 : }
7740 : }
7741 :
7742 629 : bdev_io_update_io_stat(bdev_io, tsc_diff);
7743 629 : _bdev_io_complete(bdev_io);
7744 : }
7745 :
7746 : /* The difference between this function and bdev_io_complete() is that this should be called to
7747 : * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the
7748 : * io_submitted list and don't have submit_tsc updated.
7749 : */
7750 : static inline void
7751 0 : bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)
7752 : {
7753 : /* Since the IO hasn't been submitted it's bound to be failed */
7754 0 : assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7755 :
7756 : /* At this point we don't know if the IO is completed from submission context or not, but,
7757 : * since this is an error path, we can always do an spdk_thread_send_msg(). */
7758 0 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7759 : _bdev_io_complete, bdev_io);
7760 0 : }
7761 :
7762 : static void bdev_destroy_cb(void *io_device);
7763 :
7764 : static inline void
7765 19 : _bdev_reset_complete(void *ctx)
7766 : {
7767 19 : struct spdk_bdev_io *bdev_io = ctx;
7768 :
7769 : /* Put the channel reference we got in submission. */
7770 19 : assert(bdev_io->u.reset.ch_ref != NULL);
7771 19 : spdk_put_io_channel(bdev_io->u.reset.ch_ref);
7772 19 : bdev_io->u.reset.ch_ref = NULL;
7773 :
7774 19 : bdev_io_complete(bdev_io);
7775 19 : }
7776 :
7777 : static void
7778 17 : bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
7779 : {
7780 17 : struct spdk_bdev_io *bdev_io = _ctx;
7781 17 : bdev_io_tailq_t queued_resets;
7782 : struct spdk_bdev_io *queued_reset;
7783 :
7784 17 : assert(bdev_io == bdev->internal.reset_in_progress);
7785 :
7786 17 : TAILQ_INIT(&queued_resets);
7787 :
7788 17 : spdk_spin_lock(&bdev->internal.spinlock);
7789 17 : TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets,
7790 : spdk_bdev_io, internal.link);
7791 17 : bdev->internal.reset_in_progress = NULL;
7792 17 : spdk_spin_unlock(&bdev->internal.spinlock);
7793 :
7794 19 : while (!TAILQ_EMPTY(&queued_resets)) {
7795 2 : queued_reset = TAILQ_FIRST(&queued_resets);
7796 2 : TAILQ_REMOVE(&queued_resets, queued_reset, internal.link);
7797 2 : queued_reset->internal.status = bdev_io->internal.status;
7798 2 : spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset),
7799 : _bdev_reset_complete, queued_reset);
7800 : }
7801 :
7802 17 : _bdev_reset_complete(bdev_io);
7803 :
7804 17 : if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
7805 1 : TAILQ_EMPTY(&bdev->internal.open_descs)) {
7806 1 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7807 : }
7808 17 : }
7809 :
7810 : static void
7811 21 : bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7812 : struct spdk_io_channel *_ch, void *_ctx)
7813 : {
7814 21 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7815 :
7816 21 : ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
7817 :
7818 21 : spdk_bdev_for_each_channel_continue(i, 0);
7819 21 : }
7820 :
7821 : static void
7822 0 : bdev_io_complete_sequence_cb(void *ctx, int status)
7823 : {
7824 0 : struct spdk_bdev_io *bdev_io = ctx;
7825 :
7826 : /* u.bdev.accel_sequence should have already been cleared at this point */
7827 0 : assert(bdev_io->u.bdev.accel_sequence == NULL);
7828 0 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
7829 0 : bdev_io->internal.f.has_accel_sequence = false;
7830 :
7831 0 : if (spdk_unlikely(status != 0)) {
7832 0 : SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
7833 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7834 : }
7835 :
7836 0 : bdev_io_complete(bdev_io);
7837 0 : }
7838 :
7839 : void
7840 631 : spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
7841 : {
7842 631 : struct spdk_bdev *bdev = bdev_io->bdev;
7843 631 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7844 631 : struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
7845 :
7846 631 : if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) {
7847 0 : SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n",
7848 : spdk_bdev_get_module_name(bdev),
7849 : bdev_io_status_get_string(bdev_io->internal.status));
7850 0 : assert(false);
7851 : }
7852 631 : bdev_io->internal.status = status;
7853 :
7854 631 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
7855 17 : assert(bdev_io == bdev->internal.reset_in_progress);
7856 17 : spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
7857 : bdev_reset_complete);
7858 17 : return;
7859 : } else {
7860 614 : bdev_io_decrement_outstanding(bdev_ch, shared_resource);
7861 614 : if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7862 485 : if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
7863 0 : bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
7864 0 : return;
7865 485 : } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf &&
7866 : !bdev_io_use_accel_sequence(bdev_io))) {
7867 26 : _bdev_io_push_bounce_data_buffer(bdev_io,
7868 : _bdev_io_complete_push_bounce_done);
7869 : /* bdev IO will be completed in the callback */
7870 26 : return;
7871 : }
7872 : }
7873 :
7874 588 : if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) {
7875 21 : return;
7876 : }
7877 : }
7878 :
7879 567 : bdev_io_complete(bdev_io);
7880 : }
7881 :
7882 : void
7883 0 : spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
7884 : enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
7885 : {
7886 : enum spdk_bdev_io_status status;
7887 :
7888 0 : if (sc == SPDK_SCSI_STATUS_GOOD) {
7889 0 : status = SPDK_BDEV_IO_STATUS_SUCCESS;
7890 : } else {
7891 0 : status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
7892 0 : bdev_io->internal.error.scsi.sc = sc;
7893 0 : bdev_io->internal.error.scsi.sk = sk;
7894 0 : bdev_io->internal.error.scsi.asc = asc;
7895 0 : bdev_io->internal.error.scsi.ascq = ascq;
7896 : }
7897 :
7898 0 : spdk_bdev_io_complete(bdev_io, status);
7899 0 : }
7900 :
7901 : void
7902 0 : spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
7903 : int *sc, int *sk, int *asc, int *ascq)
7904 : {
7905 0 : assert(sc != NULL);
7906 0 : assert(sk != NULL);
7907 0 : assert(asc != NULL);
7908 0 : assert(ascq != NULL);
7909 :
7910 0 : switch (bdev_io->internal.status) {
7911 0 : case SPDK_BDEV_IO_STATUS_SUCCESS:
7912 0 : *sc = SPDK_SCSI_STATUS_GOOD;
7913 0 : *sk = SPDK_SCSI_SENSE_NO_SENSE;
7914 0 : *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7915 0 : *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7916 0 : break;
7917 0 : case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7918 0 : spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
7919 0 : break;
7920 0 : case SPDK_BDEV_IO_STATUS_MISCOMPARE:
7921 0 : *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7922 0 : *sk = SPDK_SCSI_SENSE_MISCOMPARE;
7923 0 : *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
7924 0 : *ascq = bdev_io->internal.error.scsi.ascq;
7925 0 : break;
7926 0 : case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7927 0 : *sc = bdev_io->internal.error.scsi.sc;
7928 0 : *sk = bdev_io->internal.error.scsi.sk;
7929 0 : *asc = bdev_io->internal.error.scsi.asc;
7930 0 : *ascq = bdev_io->internal.error.scsi.ascq;
7931 0 : break;
7932 0 : default:
7933 0 : *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7934 0 : *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
7935 0 : *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7936 0 : *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7937 0 : break;
7938 : }
7939 0 : }
7940 :
7941 : void
7942 0 : spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
7943 : {
7944 : enum spdk_bdev_io_status status;
7945 :
7946 0 : if (aio_result == 0) {
7947 0 : status = SPDK_BDEV_IO_STATUS_SUCCESS;
7948 : } else {
7949 0 : status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
7950 : }
7951 :
7952 0 : bdev_io->internal.error.aio_result = aio_result;
7953 :
7954 0 : spdk_bdev_io_complete(bdev_io, status);
7955 0 : }
7956 :
7957 : void
7958 0 : spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
7959 : {
7960 0 : assert(aio_result != NULL);
7961 :
7962 0 : if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
7963 0 : *aio_result = bdev_io->internal.error.aio_result;
7964 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7965 0 : *aio_result = 0;
7966 : } else {
7967 0 : *aio_result = -EIO;
7968 : }
7969 0 : }
7970 :
7971 : void
7972 0 : spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
7973 : {
7974 : enum spdk_bdev_io_status status;
7975 :
7976 0 : if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) {
7977 0 : status = SPDK_BDEV_IO_STATUS_SUCCESS;
7978 0 : } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
7979 0 : status = SPDK_BDEV_IO_STATUS_ABORTED;
7980 : } else {
7981 0 : status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
7982 : }
7983 :
7984 0 : bdev_io->internal.error.nvme.cdw0 = cdw0;
7985 0 : bdev_io->internal.error.nvme.sct = sct;
7986 0 : bdev_io->internal.error.nvme.sc = sc;
7987 :
7988 0 : spdk_bdev_io_complete(bdev_io, status);
7989 0 : }
7990 :
7991 : void
7992 0 : spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
7993 : {
7994 0 : assert(sct != NULL);
7995 0 : assert(sc != NULL);
7996 0 : assert(cdw0 != NULL);
7997 :
7998 0 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
7999 0 : *sct = SPDK_NVME_SCT_GENERIC;
8000 0 : *sc = SPDK_NVME_SC_SUCCESS;
8001 0 : if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
8002 0 : *cdw0 = 0;
8003 : } else {
8004 0 : *cdw0 = 1U;
8005 : }
8006 0 : return;
8007 : }
8008 :
8009 0 : if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
8010 0 : *sct = SPDK_NVME_SCT_GENERIC;
8011 0 : *sc = SPDK_NVME_SC_SUCCESS;
8012 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
8013 0 : *sct = bdev_io->internal.error.nvme.sct;
8014 0 : *sc = bdev_io->internal.error.nvme.sc;
8015 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
8016 0 : *sct = SPDK_NVME_SCT_GENERIC;
8017 0 : *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8018 : } else {
8019 0 : *sct = SPDK_NVME_SCT_GENERIC;
8020 0 : *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8021 : }
8022 :
8023 0 : *cdw0 = bdev_io->internal.error.nvme.cdw0;
8024 : }
8025 :
8026 : void
8027 0 : spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
8028 : int *first_sct, int *first_sc, int *second_sct, int *second_sc)
8029 : {
8030 0 : assert(first_sct != NULL);
8031 0 : assert(first_sc != NULL);
8032 0 : assert(second_sct != NULL);
8033 0 : assert(second_sc != NULL);
8034 0 : assert(cdw0 != NULL);
8035 :
8036 0 : if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
8037 0 : if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
8038 0 : bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
8039 0 : *first_sct = bdev_io->internal.error.nvme.sct;
8040 0 : *first_sc = bdev_io->internal.error.nvme.sc;
8041 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8042 0 : *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8043 : } else {
8044 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8045 0 : *first_sc = SPDK_NVME_SC_SUCCESS;
8046 0 : *second_sct = bdev_io->internal.error.nvme.sct;
8047 0 : *second_sc = bdev_io->internal.error.nvme.sc;
8048 : }
8049 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
8050 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8051 0 : *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8052 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8053 0 : *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8054 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
8055 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8056 0 : *first_sc = SPDK_NVME_SC_SUCCESS;
8057 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8058 0 : *second_sc = SPDK_NVME_SC_SUCCESS;
8059 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
8060 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8061 0 : *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8062 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8063 0 : *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8064 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
8065 0 : *first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
8066 0 : *first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
8067 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8068 0 : *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8069 : } else {
8070 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8071 0 : *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8072 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8073 0 : *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8074 : }
8075 :
8076 0 : *cdw0 = bdev_io->internal.error.nvme.cdw0;
8077 0 : }
8078 :
8079 : void
8080 0 : spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io,
8081 : const struct spdk_bdev_io *base_io)
8082 : {
8083 0 : switch (base_io->internal.status) {
8084 0 : case SPDK_BDEV_IO_STATUS_NVME_ERROR:
8085 0 : spdk_bdev_io_complete_nvme_status(bdev_io,
8086 0 : base_io->internal.error.nvme.cdw0,
8087 0 : base_io->internal.error.nvme.sct,
8088 0 : base_io->internal.error.nvme.sc);
8089 0 : break;
8090 0 : case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
8091 0 : spdk_bdev_io_complete_scsi_status(bdev_io,
8092 0 : base_io->internal.error.scsi.sc,
8093 0 : base_io->internal.error.scsi.sk,
8094 0 : base_io->internal.error.scsi.asc,
8095 0 : base_io->internal.error.scsi.ascq);
8096 0 : break;
8097 0 : case SPDK_BDEV_IO_STATUS_AIO_ERROR:
8098 0 : spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result);
8099 0 : break;
8100 0 : default:
8101 0 : spdk_bdev_io_complete(bdev_io, base_io->internal.status);
8102 0 : break;
8103 : }
8104 0 : }
8105 :
8106 : struct spdk_thread *
8107 681 : spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
8108 : {
8109 681 : return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
8110 : }
8111 :
8112 : struct spdk_io_channel *
8113 85 : spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
8114 : {
8115 85 : return bdev_io->internal.ch->channel;
8116 : }
8117 :
8118 : static int
8119 133 : bdev_register(struct spdk_bdev *bdev)
8120 : {
8121 : char *bdev_name;
8122 133 : char uuid[SPDK_UUID_STRING_LEN];
8123 133 : struct spdk_iobuf_opts iobuf_opts;
8124 : int ret;
8125 :
8126 133 : assert(bdev->module != NULL);
8127 :
8128 133 : if (!bdev->name) {
8129 0 : SPDK_ERRLOG("Bdev name is NULL\n");
8130 0 : return -EINVAL;
8131 : }
8132 :
8133 133 : if (!strlen(bdev->name)) {
8134 0 : SPDK_ERRLOG("Bdev name must not be an empty string\n");
8135 0 : return -EINVAL;
8136 : }
8137 :
8138 : /* Users often register their own I/O devices using the bdev name. In
8139 : * order to avoid conflicts, prepend bdev_. */
8140 133 : bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
8141 133 : if (!bdev_name) {
8142 0 : SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
8143 0 : return -ENOMEM;
8144 : }
8145 :
8146 133 : bdev->internal.stat = bdev_alloc_io_stat(true);
8147 133 : if (!bdev->internal.stat) {
8148 0 : SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
8149 0 : free(bdev_name);
8150 0 : return -ENOMEM;
8151 : }
8152 :
8153 133 : bdev->internal.status = SPDK_BDEV_STATUS_READY;
8154 133 : bdev->internal.measured_queue_depth = UINT64_MAX;
8155 133 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8156 133 : memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
8157 133 : bdev->internal.qd_poller = NULL;
8158 133 : bdev->internal.qos = NULL;
8159 :
8160 133 : TAILQ_INIT(&bdev->internal.open_descs);
8161 133 : TAILQ_INIT(&bdev->internal.locked_ranges);
8162 133 : TAILQ_INIT(&bdev->internal.pending_locked_ranges);
8163 133 : TAILQ_INIT(&bdev->internal.queued_resets);
8164 133 : TAILQ_INIT(&bdev->aliases);
8165 :
8166 : /* UUID may be specified by the user or defined by bdev itself.
8167 : * Otherwise it will be generated here, so this field will never be empty. */
8168 133 : if (spdk_uuid_is_null(&bdev->uuid)) {
8169 44 : spdk_uuid_generate(&bdev->uuid);
8170 : }
8171 :
8172 : /* Add the UUID alias only if it's different than the name */
8173 133 : spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
8174 133 : if (strcmp(bdev->name, uuid) != 0) {
8175 132 : ret = spdk_bdev_alias_add(bdev, uuid);
8176 132 : if (ret != 0) {
8177 2 : SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
8178 2 : bdev_free_io_stat(bdev->internal.stat);
8179 2 : free(bdev_name);
8180 2 : return ret;
8181 : }
8182 : }
8183 :
8184 131 : spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts));
8185 131 : if (spdk_bdev_get_buf_align(bdev) > 1) {
8186 0 : bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX,
8187 : iobuf_opts.large_bufsize / bdev->blocklen);
8188 : }
8189 :
8190 : /* If the user didn't specify a write unit size, set it to one. */
8191 131 : if (bdev->write_unit_size == 0) {
8192 127 : bdev->write_unit_size = 1;
8193 : }
8194 :
8195 : /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */
8196 131 : if (bdev->acwu == 0) {
8197 127 : bdev->acwu = bdev->write_unit_size;
8198 : }
8199 :
8200 131 : if (bdev->phys_blocklen == 0) {
8201 127 : bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
8202 : }
8203 :
8204 131 : if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
8205 0 : bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
8206 : }
8207 :
8208 131 : if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
8209 0 : bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE);
8210 : }
8211 :
8212 131 : bdev->internal.reset_in_progress = NULL;
8213 131 : bdev->internal.qd_poll_in_progress = false;
8214 131 : bdev->internal.period = 0;
8215 131 : bdev->internal.new_period = 0;
8216 131 : bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name);
8217 :
8218 : /*
8219 : * Initialize spinlock before registering IO device because spinlock is used in
8220 : * bdev_channel_create
8221 : */
8222 131 : spdk_spin_init(&bdev->internal.spinlock);
8223 :
8224 131 : spdk_io_device_register(__bdev_to_io_dev(bdev),
8225 : bdev_channel_create, bdev_channel_destroy,
8226 : sizeof(struct spdk_bdev_channel),
8227 : bdev_name);
8228 :
8229 : /*
8230 : * Register bdev name only after the bdev object is ready.
8231 : * After bdev_name_add returns, it is possible for other threads to start using the bdev,
8232 : * create IO channels...
8233 : */
8234 131 : ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
8235 131 : if (ret != 0) {
8236 0 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL);
8237 0 : bdev_free_io_stat(bdev->internal.stat);
8238 0 : spdk_spin_destroy(&bdev->internal.spinlock);
8239 0 : free(bdev_name);
8240 0 : return ret;
8241 : }
8242 :
8243 131 : free(bdev_name);
8244 :
8245 131 : SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
8246 131 : TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
8247 :
8248 131 : return 0;
8249 : }
8250 :
8251 : static void
8252 132 : bdev_destroy_cb(void *io_device)
8253 : {
8254 : int rc;
8255 : struct spdk_bdev *bdev;
8256 : spdk_bdev_unregister_cb cb_fn;
8257 : void *cb_arg;
8258 :
8259 132 : bdev = __bdev_from_io_dev(io_device);
8260 :
8261 132 : if (bdev->internal.unregister_td != spdk_get_thread()) {
8262 1 : spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device);
8263 1 : return;
8264 : }
8265 :
8266 131 : cb_fn = bdev->internal.unregister_cb;
8267 131 : cb_arg = bdev->internal.unregister_ctx;
8268 :
8269 131 : spdk_spin_destroy(&bdev->internal.spinlock);
8270 131 : free(bdev->internal.qos);
8271 131 : bdev_free_io_stat(bdev->internal.stat);
8272 131 : spdk_trace_unregister_owner(bdev->internal.trace_id);
8273 :
8274 131 : rc = bdev->fn_table->destruct(bdev->ctxt);
8275 131 : if (rc < 0) {
8276 0 : SPDK_ERRLOG("destruct failed\n");
8277 : }
8278 131 : if (rc <= 0 && cb_fn != NULL) {
8279 10 : cb_fn(cb_arg, rc);
8280 : }
8281 : }
8282 :
8283 : void
8284 2 : spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
8285 : {
8286 2 : if (bdev->internal.unregister_cb != NULL) {
8287 0 : bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
8288 : }
8289 2 : }
8290 :
8291 : static void
8292 19 : _remove_notify(void *arg)
8293 : {
8294 19 : struct spdk_bdev_desc *desc = arg;
8295 :
8296 19 : _event_notify(desc, SPDK_BDEV_EVENT_REMOVE);
8297 19 : }
8298 :
8299 : /* returns: 0 - bdev removed and ready to be destructed.
8300 : * -EBUSY - bdev can't be destructed yet. */
8301 : static int
8302 146 : bdev_unregister_unsafe(struct spdk_bdev *bdev)
8303 : {
8304 : struct spdk_bdev_desc *desc, *tmp;
8305 : struct spdk_bdev_alias *alias;
8306 146 : int rc = 0;
8307 146 : char uuid[SPDK_UUID_STRING_LEN];
8308 :
8309 146 : assert(spdk_spin_held(&g_bdev_mgr.spinlock));
8310 146 : assert(spdk_spin_held(&bdev->internal.spinlock));
8311 :
8312 : /* Notify each descriptor about hotremoval */
8313 165 : TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
8314 19 : rc = -EBUSY;
8315 : /*
8316 : * Defer invocation of the event_cb to a separate message that will
8317 : * run later on its thread. This ensures this context unwinds and
8318 : * we don't recursively unregister this bdev again if the event_cb
8319 : * immediately closes its descriptor.
8320 : */
8321 19 : event_notify(desc, _remove_notify);
8322 : }
8323 :
8324 : /* If there are no descriptors, proceed removing the bdev */
8325 146 : if (rc == 0) {
8326 131 : bdev_examine_allowlist_remove(bdev->name);
8327 260 : TAILQ_FOREACH(alias, &bdev->aliases, tailq) {
8328 129 : bdev_examine_allowlist_remove(alias->alias.name);
8329 : }
8330 131 : TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
8331 131 : SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
8332 :
8333 : /* Delete the name and the UUID alias */
8334 131 : spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
8335 131 : bdev_name_del_unsafe(&bdev->internal.bdev_name);
8336 131 : bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
8337 :
8338 131 : spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
8339 :
8340 131 : if (bdev->internal.reset_in_progress != NULL) {
8341 : /* If reset is in progress, let the completion callback for reset
8342 : * unregister the bdev.
8343 : */
8344 1 : rc = -EBUSY;
8345 : }
8346 : }
8347 :
8348 146 : return rc;
8349 : }
8350 :
/* Per-channel step of unregister: abort the I/Os queued on this channel, then continue iterating. */
static void
bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			      struct spdk_io_channel *io_ch, void *_ctx)
{
	bdev_channel_abort_queued_ios(__io_ch_to_bdev_ch(io_ch));

	spdk_bdev_for_each_channel_continue(i, 0);
}
8360 :
8361 : static void
8362 131 : bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
8363 : {
8364 : int rc;
8365 :
8366 131 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8367 131 : spdk_spin_lock(&bdev->internal.spinlock);
8368 : /*
8369 : * Set the status to REMOVING after completing to abort channels. Otherwise,
8370 : * the last spdk_bdev_close() may call spdk_io_device_unregister() while
8371 : * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister()
8372 : * may fail.
8373 : */
8374 131 : bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
8375 131 : rc = bdev_unregister_unsafe(bdev);
8376 131 : spdk_spin_unlock(&bdev->internal.spinlock);
8377 131 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8378 :
8379 131 : if (rc == 0) {
8380 115 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
8381 : }
8382 131 : }
8383 :
8384 : void
8385 138 : spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8386 : {
8387 : struct spdk_thread *thread;
8388 :
8389 138 : SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
8390 :
8391 138 : thread = spdk_get_thread();
8392 138 : if (!thread) {
8393 : /* The user called this from a non-SPDK thread. */
8394 0 : if (cb_fn != NULL) {
8395 0 : cb_fn(cb_arg, -ENOTSUP);
8396 : }
8397 0 : return;
8398 : }
8399 :
8400 138 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8401 138 : if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8402 138 : bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8403 7 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8404 7 : if (cb_fn) {
8405 0 : cb_fn(cb_arg, -EBUSY);
8406 : }
8407 7 : return;
8408 : }
8409 :
8410 131 : spdk_spin_lock(&bdev->internal.spinlock);
8411 131 : bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
8412 131 : bdev->internal.unregister_cb = cb_fn;
8413 131 : bdev->internal.unregister_ctx = cb_arg;
8414 131 : bdev->internal.unregister_td = thread;
8415 131 : spdk_spin_unlock(&bdev->internal.spinlock);
8416 131 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8417 :
8418 131 : spdk_bdev_set_qd_sampling_period(bdev, 0);
8419 :
8420 131 : spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
8421 : bdev_unregister);
8422 : }
8423 :
8424 : int
8425 4 : spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
8426 : spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8427 : {
8428 4 : struct spdk_bdev_desc *desc;
8429 : struct spdk_bdev *bdev;
8430 : int rc;
8431 :
8432 4 : rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
8433 4 : if (rc != 0) {
8434 1 : SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
8435 1 : return rc;
8436 : }
8437 :
8438 3 : bdev = spdk_bdev_desc_get_bdev(desc);
8439 :
8440 3 : if (bdev->module != module) {
8441 1 : spdk_bdev_close(desc);
8442 1 : SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
8443 : bdev_name);
8444 1 : return -ENODEV;
8445 : }
8446 :
8447 2 : spdk_bdev_unregister(bdev, cb_fn, cb_arg);
8448 :
8449 2 : spdk_bdev_close(desc);
8450 :
8451 2 : return 0;
8452 : }
8453 :
8454 : static int
8455 271 : bdev_start_qos(struct spdk_bdev *bdev)
8456 : {
8457 : struct set_qos_limit_ctx *ctx;
8458 :
8459 : /* Enable QoS */
8460 271 : if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
8461 2 : ctx = calloc(1, sizeof(*ctx));
8462 2 : if (ctx == NULL) {
8463 0 : SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
8464 0 : return -ENOMEM;
8465 : }
8466 2 : ctx->bdev = bdev;
8467 2 : spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
8468 : }
8469 :
8470 271 : return 0;
8471 : }
8472 :
8473 : static void
8474 25 : log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail,
8475 : struct spdk_bdev *bdev)
8476 : {
8477 : enum spdk_bdev_claim_type type;
8478 : const char *typename, *modname;
8479 : extern struct spdk_log_flag SPDK_LOG_bdev;
8480 :
8481 25 : assert(spdk_spin_held(&bdev->internal.spinlock));
8482 :
8483 25 : if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) {
8484 0 : return;
8485 : }
8486 :
8487 25 : type = bdev->internal.claim_type;
8488 25 : typename = spdk_bdev_claim_get_name(type);
8489 :
8490 25 : if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) {
8491 6 : modname = bdev->internal.claim.v1.module->name;
8492 6 : spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8493 : bdev->name, detail, typename, modname);
8494 6 : return;
8495 : }
8496 :
8497 19 : if (claim_type_is_v2(type)) {
8498 : struct spdk_bdev_module_claim *claim;
8499 :
8500 38 : TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
8501 19 : modname = claim->module->name;
8502 19 : spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8503 : bdev->name, detail, typename, modname);
8504 : }
8505 19 : return;
8506 : }
8507 :
8508 0 : assert(false);
8509 : }
8510 :
8511 : static int
8512 280 : bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
8513 : {
8514 : struct spdk_thread *thread;
8515 280 : int rc = 0;
8516 :
8517 280 : thread = spdk_get_thread();
8518 280 : if (!thread) {
8519 0 : SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
8520 0 : return -ENOTSUP;
8521 : }
8522 :
8523 280 : SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
8524 : spdk_get_thread());
8525 :
8526 280 : desc->bdev = bdev;
8527 280 : desc->thread = thread;
8528 280 : desc->write = write;
8529 :
8530 280 : spdk_spin_lock(&bdev->internal.spinlock);
8531 280 : if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8532 280 : bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8533 3 : spdk_spin_unlock(&bdev->internal.spinlock);
8534 3 : return -ENODEV;
8535 : }
8536 :
8537 277 : if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8538 6 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8539 6 : spdk_spin_unlock(&bdev->internal.spinlock);
8540 6 : return -EPERM;
8541 : }
8542 :
8543 271 : rc = bdev_start_qos(bdev);
8544 271 : if (rc != 0) {
8545 0 : SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
8546 0 : spdk_spin_unlock(&bdev->internal.spinlock);
8547 0 : return rc;
8548 : }
8549 :
8550 271 : TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
8551 :
8552 271 : spdk_spin_unlock(&bdev->internal.spinlock);
8553 :
8554 271 : return 0;
8555 : }
8556 :
8557 : static void
8558 281 : bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size)
8559 : {
8560 281 : if (!opts) {
8561 0 : SPDK_ERRLOG("opts should not be NULL.\n");
8562 0 : return;
8563 : }
8564 :
8565 281 : if (!opts_size) {
8566 0 : SPDK_ERRLOG("opts_size should not be zero.\n");
8567 0 : return;
8568 : }
8569 :
8570 281 : memset(opts, 0, opts_size);
8571 281 : opts->size = opts_size;
8572 :
8573 : #define FIELD_OK(field) \
8574 : offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size
8575 :
8576 : #define SET_FIELD(field, value) \
8577 : if (FIELD_OK(field)) { \
8578 : opts->field = value; \
8579 : } \
8580 :
8581 281 : SET_FIELD(hide_metadata, false);
8582 :
8583 : #undef FIELD_OK
8584 : #undef SET_FIELD
8585 : }
8586 :
8587 : static void
8588 2 : bdev_open_opts_copy(struct spdk_bdev_open_opts *opts,
8589 : const struct spdk_bdev_open_opts *opts_src, size_t opts_size)
8590 : {
8591 2 : assert(opts);
8592 2 : assert(opts_src);
8593 :
8594 : #define SET_FIELD(field) \
8595 : if (offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \
8596 : opts->field = opts_src->field; \
8597 : } \
8598 :
8599 2 : SET_FIELD(hide_metadata);
8600 :
8601 2 : opts->size = opts_src->size;
8602 :
8603 : /* We should not remove this statement, but need to update the assert statement
8604 : * if we add a new field, and also add a corresponding SET_FIELD statement.
8605 : */
8606 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size");
8607 :
8608 : #undef SET_FIELD
8609 2 : }
8610 :
8611 : void
8612 1 : spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size)
8613 : {
8614 1 : struct spdk_bdev_open_opts opts_local;
8615 :
8616 1 : bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local));
8617 1 : bdev_open_opts_copy(opts, &opts_local, opts_size);
8618 1 : }
8619 :
8620 : static int
8621 280 : bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
8622 : struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc)
8623 : {
8624 : struct spdk_bdev_desc *desc;
8625 280 : struct spdk_bdev_open_opts opts;
8626 : unsigned int i;
8627 :
8628 280 : bdev_open_opts_get_defaults(&opts, sizeof(opts));
8629 280 : if (user_opts != NULL) {
8630 1 : bdev_open_opts_copy(&opts, user_opts, user_opts->size);
8631 : }
8632 :
8633 280 : desc = calloc(1, sizeof(*desc));
8634 280 : if (desc == NULL) {
8635 0 : SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
8636 0 : return -ENOMEM;
8637 : }
8638 :
8639 280 : desc->opts = opts;
8640 :
8641 280 : TAILQ_INIT(&desc->pending_media_events);
8642 280 : TAILQ_INIT(&desc->free_media_events);
8643 :
8644 280 : desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
8645 280 : desc->callback.event_fn = event_cb;
8646 280 : desc->callback.ctx = event_ctx;
8647 280 : spdk_spin_init(&desc->spinlock);
8648 :
8649 280 : if (desc->opts.hide_metadata) {
8650 1 : if (spdk_bdev_is_md_separate(bdev)) {
8651 0 : SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n");
8652 0 : bdev_desc_free(desc);
8653 0 : return -EINVAL;
8654 : }
8655 : }
8656 :
8657 280 : if (bdev->media_events) {
8658 0 : desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
8659 : sizeof(*desc->media_events_buffer));
8660 0 : if (desc->media_events_buffer == NULL) {
8661 0 : SPDK_ERRLOG("Failed to initialize media event pool\n");
8662 0 : bdev_desc_free(desc);
8663 0 : return -ENOMEM;
8664 : }
8665 :
8666 0 : for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) {
8667 0 : TAILQ_INSERT_TAIL(&desc->free_media_events,
8668 : &desc->media_events_buffer[i], tailq);
8669 : }
8670 : }
8671 :
8672 280 : if (bdev->fn_table->accel_sequence_supported != NULL) {
8673 0 : for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
8674 0 : desc->accel_sequence_supported[i] =
8675 0 : bdev->fn_table->accel_sequence_supported(bdev->ctxt,
8676 : (enum spdk_bdev_io_type)i);
8677 : }
8678 : }
8679 :
8680 280 : *_desc = desc;
8681 :
8682 280 : return 0;
8683 : }
8684 :
8685 : static int
8686 137 : bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8687 : void *event_ctx, struct spdk_bdev_open_opts *opts,
8688 : struct spdk_bdev_desc **_desc)
8689 : {
8690 137 : struct spdk_bdev_desc *desc;
8691 : struct spdk_bdev *bdev;
8692 : int rc;
8693 :
8694 137 : bdev = bdev_get_by_name(bdev_name);
8695 :
8696 137 : if (bdev == NULL) {
8697 1 : SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
8698 1 : return -ENODEV;
8699 : }
8700 :
8701 136 : rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc);
8702 136 : if (rc != 0) {
8703 0 : return rc;
8704 : }
8705 :
8706 136 : rc = bdev_open(bdev, write, desc);
8707 136 : if (rc != 0) {
8708 7 : bdev_desc_free(desc);
8709 7 : desc = NULL;
8710 : }
8711 :
8712 136 : *_desc = desc;
8713 :
8714 136 : return rc;
8715 : }
8716 :
8717 : int
8718 139 : spdk_bdev_open_ext_v2(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8719 : void *event_ctx, struct spdk_bdev_open_opts *opts,
8720 : struct spdk_bdev_desc **_desc)
8721 : {
8722 : int rc;
8723 :
8724 139 : if (event_cb == NULL) {
8725 2 : SPDK_ERRLOG("Missing event callback function\n");
8726 2 : return -EINVAL;
8727 : }
8728 :
8729 137 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8730 137 : rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc);
8731 137 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8732 :
8733 137 : return rc;
8734 : }
8735 :
8736 : int
8737 137 : spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8738 : void *event_ctx, struct spdk_bdev_desc **_desc)
8739 : {
8740 137 : return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc);
8741 : }
8742 :
/* State of one pending spdk_bdev_open_async() request. */
struct spdk_bdev_open_async_ctx {
	/* Heap copy (strdup) of the requested bdev name; freed in bdev_open_async_done(). */
	char					*bdev_name;
	/* Event callback and its context, forwarded to bdev_open_ext(). */
	spdk_bdev_event_cb_t			event_cb;
	void					*event_ctx;
	/* Whether the descriptor is opened for writing. */
	bool					write;
	/* Result of the last open attempt; -ESHUTDOWN marks a canceled request. */
	int					rc;
	/* Completion callback and its argument, invoked from bdev_open_async_done(). */
	spdk_bdev_open_async_cb_t		cb_fn;
	void					*cb_arg;
	/* Descriptor produced by bdev_open_ext(). */
	struct spdk_bdev_desc			*desc;
	/* Effective options (defaults overlaid with the caller's options). */
	struct spdk_bdev_open_async_opts	opts;
	/* Tick count taken when the request was created; base for the timeout. */
	uint64_t				start_ticks;
	/* Thread that issued the request; completion and cancel messages are sent to it. */
	struct spdk_thread			*orig_thread;
	/* Poller that retries the open via bdev_open_async(). */
	struct spdk_poller			*poller;
	/* Link in g_bdev_mgr.async_bdev_opens. */
	TAILQ_ENTRY(spdk_bdev_open_async_ctx)	tailq;
};
8758 :
8759 : static void
8760 0 : bdev_open_async_done(void *arg)
8761 : {
8762 0 : struct spdk_bdev_open_async_ctx *ctx = arg;
8763 :
8764 0 : ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg);
8765 :
8766 0 : free(ctx->bdev_name);
8767 0 : free(ctx);
8768 0 : }
8769 :
8770 : static void
8771 0 : bdev_open_async_cancel(void *arg)
8772 : {
8773 0 : struct spdk_bdev_open_async_ctx *ctx = arg;
8774 :
8775 0 : assert(ctx->rc == -ESHUTDOWN);
8776 :
8777 0 : spdk_poller_unregister(&ctx->poller);
8778 :
8779 0 : bdev_open_async_done(ctx);
8780 0 : }
8781 :
8782 : /* This is called when the bdev library finishes at shutdown. */
8783 : static void
8784 69 : bdev_open_async_fini(void)
8785 : {
8786 : struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx;
8787 :
8788 69 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8789 69 : TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) {
8790 0 : TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8791 : /*
8792 : * We have to move to ctx->orig_thread to unregister ctx->poller.
8793 : * However, there is a chance that ctx->poller is executed before
8794 : * message is executed, which could result in bdev_open_async_done()
8795 : * being called twice. To avoid such race condition, set ctx->rc to
8796 : * -ESHUTDOWN.
8797 : */
8798 0 : ctx->rc = -ESHUTDOWN;
8799 0 : spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx);
8800 : }
8801 69 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8802 69 : }
8803 :
8804 : static int bdev_open_async(void *arg);
8805 :
8806 : static void
8807 0 : _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx)
8808 : {
8809 : uint64_t timeout_ticks;
8810 :
8811 0 : if (ctx->rc == -ESHUTDOWN) {
8812 : /* This context is being canceled. Do nothing. */
8813 0 : return;
8814 : }
8815 :
8816 0 : ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx,
8817 : NULL, &ctx->desc);
8818 0 : if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) {
8819 0 : goto exit;
8820 : }
8821 :
8822 0 : timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull;
8823 0 : if (spdk_get_ticks() >= timeout_ticks) {
8824 0 : SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name);
8825 0 : ctx->rc = -ETIMEDOUT;
8826 0 : goto exit;
8827 : }
8828 :
8829 0 : return;
8830 :
8831 0 : exit:
8832 0 : spdk_poller_unregister(&ctx->poller);
8833 0 : TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8834 :
8835 : /* Completion callback is processed after stack unwinding. */
8836 0 : spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
8837 : }
8838 :
8839 : static int
8840 0 : bdev_open_async(void *arg)
8841 : {
8842 0 : struct spdk_bdev_open_async_ctx *ctx = arg;
8843 :
8844 0 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8845 :
8846 0 : _bdev_open_async(ctx);
8847 :
8848 0 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8849 :
8850 0 : return SPDK_POLLER_BUSY;
8851 : }
8852 :
8853 : static void
8854 0 : bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
8855 : struct spdk_bdev_open_async_opts *opts_src,
8856 : size_t size)
8857 : {
8858 0 : assert(opts);
8859 0 : assert(opts_src);
8860 :
8861 0 : opts->size = size;
8862 :
8863 : #define SET_FIELD(field) \
8864 : if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8865 : opts->field = opts_src->field; \
8866 : } \
8867 :
8868 0 : SET_FIELD(timeout_ms);
8869 :
8870 : /* Do not remove this statement, you should always update this statement when you adding a new field,
8871 : * and do not forget to add the SET_FIELD statement for your added field. */
8872 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size");
8873 :
8874 : #undef SET_FIELD
8875 0 : }
8876 :
8877 : static void
8878 0 : bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size)
8879 : {
8880 0 : assert(opts);
8881 :
8882 0 : opts->size = size;
8883 :
8884 : #define SET_FIELD(field, value) \
8885 : if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8886 : opts->field = value; \
8887 : } \
8888 :
8889 0 : SET_FIELD(timeout_ms, 0);
8890 :
8891 : #undef SET_FIELD
8892 0 : }
8893 :
8894 : int
8895 0 : spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8896 : void *event_ctx, struct spdk_bdev_open_async_opts *opts,
8897 : spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg)
8898 : {
8899 : struct spdk_bdev_open_async_ctx *ctx;
8900 :
8901 0 : if (event_cb == NULL) {
8902 0 : SPDK_ERRLOG("Missing event callback function\n");
8903 0 : return -EINVAL;
8904 : }
8905 :
8906 0 : if (open_cb == NULL) {
8907 0 : SPDK_ERRLOG("Missing open callback function\n");
8908 0 : return -EINVAL;
8909 : }
8910 :
8911 0 : if (opts != NULL && opts->size == 0) {
8912 0 : SPDK_ERRLOG("size in the options structure should not be zero\n");
8913 0 : return -EINVAL;
8914 : }
8915 :
8916 0 : ctx = calloc(1, sizeof(*ctx));
8917 0 : if (ctx == NULL) {
8918 0 : SPDK_ERRLOG("Failed to allocate open context\n");
8919 0 : return -ENOMEM;
8920 : }
8921 :
8922 0 : ctx->bdev_name = strdup(bdev_name);
8923 0 : if (ctx->bdev_name == NULL) {
8924 0 : SPDK_ERRLOG("Failed to duplicate bdev_name\n");
8925 0 : free(ctx);
8926 0 : return -ENOMEM;
8927 : }
8928 :
8929 0 : ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000);
8930 0 : if (ctx->poller == NULL) {
8931 0 : SPDK_ERRLOG("Failed to register bdev_open_async poller\n");
8932 0 : free(ctx->bdev_name);
8933 0 : free(ctx);
8934 0 : return -ENOMEM;
8935 : }
8936 :
8937 0 : ctx->cb_fn = open_cb;
8938 0 : ctx->cb_arg = open_cb_arg;
8939 0 : ctx->write = write;
8940 0 : ctx->event_cb = event_cb;
8941 0 : ctx->event_ctx = event_ctx;
8942 0 : ctx->orig_thread = spdk_get_thread();
8943 0 : ctx->start_ticks = spdk_get_ticks();
8944 :
8945 0 : bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts));
8946 0 : if (opts != NULL) {
8947 0 : bdev_open_async_opts_copy(&ctx->opts, opts, opts->size);
8948 : }
8949 :
8950 0 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8951 :
8952 0 : TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8953 0 : _bdev_open_async(ctx);
8954 :
8955 0 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8956 :
8957 0 : return 0;
8958 : }
8959 :
8960 : static void
8961 271 : bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
8962 : {
8963 : int rc;
8964 :
8965 271 : spdk_spin_lock(&bdev->internal.spinlock);
8966 271 : spdk_spin_lock(&desc->spinlock);
8967 :
8968 271 : TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
8969 :
8970 271 : desc->closed = true;
8971 :
8972 271 : if (desc->claim != NULL) {
8973 20 : bdev_desc_release_claims(desc);
8974 : }
8975 :
8976 271 : if (0 == desc->refs) {
8977 260 : spdk_spin_unlock(&desc->spinlock);
8978 260 : bdev_desc_free(desc);
8979 : } else {
8980 11 : spdk_spin_unlock(&desc->spinlock);
8981 : }
8982 :
8983 : /* If no more descriptors, kill QoS channel */
8984 271 : if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8985 7 : SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
8986 : bdev->name, spdk_get_thread());
8987 :
8988 7 : if (bdev_qos_destroy(bdev)) {
8989 : /* There isn't anything we can do to recover here. Just let the
8990 : * old QoS poller keep running. The QoS handling won't change
8991 : * cores when the user allocates a new channel, but it won't break. */
8992 0 : SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
8993 : }
8994 : }
8995 :
8996 271 : if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8997 15 : rc = bdev_unregister_unsafe(bdev);
8998 15 : spdk_spin_unlock(&bdev->internal.spinlock);
8999 :
9000 15 : if (rc == 0) {
9001 15 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
9002 : }
9003 : } else {
9004 256 : spdk_spin_unlock(&bdev->internal.spinlock);
9005 : }
9006 271 : }
9007 :
9008 : void
9009 129 : spdk_bdev_close(struct spdk_bdev_desc *desc)
9010 : {
9011 129 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9012 :
9013 129 : SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
9014 : spdk_get_thread());
9015 :
9016 129 : assert(desc->thread == spdk_get_thread());
9017 :
9018 129 : spdk_poller_unregister(&desc->io_timeout_poller);
9019 :
9020 129 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9021 :
9022 129 : bdev_close(bdev, desc);
9023 :
9024 129 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9025 129 : }
9026 :
9027 : int32_t
9028 3 : spdk_bdev_get_numa_id(struct spdk_bdev *bdev)
9029 : {
9030 3 : if (bdev->numa.id_valid) {
9031 2 : return bdev->numa.id;
9032 : } else {
9033 1 : return SPDK_ENV_NUMA_ID_ANY;
9034 : }
9035 : }
9036 :
9037 : static void
9038 131 : bdev_register_finished(void *arg)
9039 : {
9040 131 : struct spdk_bdev_desc *desc = arg;
9041 131 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9042 :
9043 131 : spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
9044 :
9045 131 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9046 :
9047 131 : bdev_close(bdev, desc);
9048 :
9049 131 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9050 131 : }
9051 :
9052 : int
9053 134 : spdk_bdev_register(struct spdk_bdev *bdev)
9054 : {
9055 134 : struct spdk_bdev_desc *desc;
9056 134 : struct spdk_thread *thread = spdk_get_thread();
9057 : int rc;
9058 :
9059 134 : if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) {
9060 1 : SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread,
9061 : thread ? spdk_thread_get_name(thread) : "null");
9062 1 : return -EINVAL;
9063 : }
9064 :
9065 133 : rc = bdev_register(bdev);
9066 133 : if (rc != 0) {
9067 2 : return rc;
9068 : }
9069 :
9070 : /* A descriptor is opened to prevent bdev deletion during examination */
9071 131 : rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9072 131 : if (rc != 0) {
9073 0 : spdk_bdev_unregister(bdev, NULL, NULL);
9074 0 : return rc;
9075 : }
9076 :
9077 131 : rc = bdev_open(bdev, false, desc);
9078 131 : if (rc != 0) {
9079 0 : bdev_desc_free(desc);
9080 0 : spdk_bdev_unregister(bdev, NULL, NULL);
9081 0 : return rc;
9082 : }
9083 :
9084 : /* Examine configuration before initializing I/O */
9085 131 : bdev_examine(bdev);
9086 :
9087 131 : rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
9088 131 : if (rc != 0) {
9089 0 : bdev_close(bdev, desc);
9090 0 : spdk_bdev_unregister(bdev, NULL, NULL);
9091 : }
9092 :
9093 131 : return rc;
9094 : }
9095 :
9096 : int
9097 26 : spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
9098 : struct spdk_bdev_module *module)
9099 : {
9100 26 : spdk_spin_lock(&bdev->internal.spinlock);
9101 :
9102 26 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
9103 6 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9104 6 : spdk_spin_unlock(&bdev->internal.spinlock);
9105 6 : return -EPERM;
9106 : }
9107 :
9108 20 : if (desc && !desc->write) {
9109 5 : desc->write = true;
9110 : }
9111 :
9112 20 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE;
9113 20 : bdev->internal.claim.v1.module = module;
9114 :
9115 20 : spdk_spin_unlock(&bdev->internal.spinlock);
9116 20 : return 0;
9117 : }
9118 :
9119 : void
9120 8 : spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
9121 : {
9122 8 : spdk_spin_lock(&bdev->internal.spinlock);
9123 :
9124 8 : assert(bdev->internal.claim.v1.module != NULL);
9125 8 : assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE);
9126 8 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
9127 8 : bdev->internal.claim.v1.module = NULL;
9128 :
9129 8 : spdk_spin_unlock(&bdev->internal.spinlock);
9130 8 : }
9131 :
9132 : /*
9133 : * Start claims v2
9134 : */
9135 :
9136 : const char *
9137 25 : spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type)
9138 : {
9139 25 : switch (type) {
9140 0 : case SPDK_BDEV_CLAIM_NONE:
9141 0 : return "not_claimed";
9142 6 : case SPDK_BDEV_CLAIM_EXCL_WRITE:
9143 6 : return "exclusive_write";
9144 8 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9145 8 : return "read_many_write_one";
9146 5 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9147 5 : return "read_many_write_none";
9148 6 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9149 6 : return "read_many_write_many";
9150 0 : default:
9151 0 : break;
9152 : }
9153 0 : return "invalid_claim";
9154 : }
9155 :
9156 : static bool
9157 115 : claim_type_is_v2(enum spdk_bdev_claim_type type)
9158 : {
9159 115 : switch (type) {
9160 115 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9161 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9162 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9163 115 : return true;
9164 0 : default:
9165 0 : break;
9166 : }
9167 0 : return false;
9168 : }
9169 :
9170 : /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */
9171 : static bool
9172 17 : claim_type_promotes_to_write(enum spdk_bdev_claim_type type)
9173 : {
9174 17 : switch (type) {
9175 6 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9176 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9177 6 : return true;
9178 11 : default:
9179 11 : break;
9180 : }
9181 11 : return false;
9182 : }
9183 :
9184 : void
9185 57 : spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size)
9186 : {
9187 57 : if (opts == NULL) {
9188 0 : SPDK_ERRLOG("opts should not be NULL\n");
9189 0 : assert(opts != NULL);
9190 0 : return;
9191 : }
9192 57 : if (size == 0) {
9193 0 : SPDK_ERRLOG("size should not be zero\n");
9194 0 : assert(size != 0);
9195 0 : return;
9196 : }
9197 :
9198 57 : memset(opts, 0, size);
9199 57 : opts->opts_size = size;
9200 :
9201 : #define FIELD_OK(field) \
9202 : offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size
9203 :
9204 : #define SET_FIELD(field, value) \
9205 : if (FIELD_OK(field)) { \
9206 : opts->field = value; \
9207 : } \
9208 :
9209 57 : SET_FIELD(shared_claim_key, 0);
9210 :
9211 : #undef FIELD_OK
9212 : #undef SET_FIELD
9213 : }
9214 :
9215 : static int
9216 22 : claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst)
9217 : {
9218 22 : if (src->opts_size == 0) {
9219 0 : SPDK_ERRLOG("size should not be zero\n");
9220 0 : return -1;
9221 : }
9222 :
9223 22 : memset(dst, 0, sizeof(*dst));
9224 22 : dst->opts_size = src->opts_size;
9225 :
9226 : #define FIELD_OK(field) \
9227 : offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size
9228 :
9229 : #define SET_FIELD(field) \
9230 : if (FIELD_OK(field)) { \
9231 : dst->field = src->field; \
9232 : } \
9233 :
9234 22 : if (FIELD_OK(name)) {
9235 22 : snprintf(dst->name, sizeof(dst->name), "%s", src->name);
9236 : }
9237 :
9238 22 : SET_FIELD(shared_claim_key);
9239 :
9240 : /* You should not remove this statement, but need to update the assert statement
9241 : * if you add a new field, and also add a corresponding SET_FIELD statement */
9242 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size");
9243 :
9244 : #undef FIELD_OK
9245 : #undef SET_FIELD
9246 22 : return 0;
9247 : }
9248 :
9249 : /* Returns 0 if a read-write-once claim can be taken. */
9250 : static int
9251 10 : claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9252 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9253 : {
9254 10 : struct spdk_bdev *bdev = desc->bdev;
9255 : struct spdk_bdev_desc *open_desc;
9256 :
9257 10 : assert(spdk_spin_held(&bdev->internal.spinlock));
9258 10 : assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);
9259 :
9260 10 : if (opts->shared_claim_key != 0) {
9261 1 : SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
9262 : bdev->name);
9263 1 : return -EINVAL;
9264 : }
9265 9 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
9266 1 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9267 1 : return -EPERM;
9268 : }
9269 8 : if (desc->claim != NULL) {
9270 0 : SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
9271 : bdev->name, desc->claim->module->name);
9272 0 : return -EPERM;
9273 : }
9274 16 : TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9275 10 : if (desc != open_desc && open_desc->write) {
9276 2 : SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
9277 : "another descriptor is open for writing\n",
9278 : bdev->name);
9279 2 : return -EPERM;
9280 : }
9281 : }
9282 :
9283 6 : return 0;
9284 : }
9285 :
9286 : /* Returns 0 if a read-only-many claim can be taken. */
9287 : static int
9288 15 : claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9289 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9290 : {
9291 15 : struct spdk_bdev *bdev = desc->bdev;
9292 : struct spdk_bdev_desc *open_desc;
9293 :
9294 15 : assert(spdk_spin_held(&bdev->internal.spinlock));
9295 15 : assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
9296 15 : assert(desc->claim == NULL);
9297 :
9298 15 : if (desc->write) {
9299 3 : SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
9300 : bdev->name);
9301 3 : return -EINVAL;
9302 : }
9303 12 : if (opts->shared_claim_key != 0) {
9304 1 : SPDK_ERRLOG("%s: key option not supported with read-only-may claims\n", bdev->name);
9305 1 : return -EINVAL;
9306 : }
9307 11 : if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9308 19 : TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9309 11 : if (open_desc->write) {
9310 0 : SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
9311 : "another descriptor is open for writing\n",
9312 : bdev->name);
9313 0 : return -EPERM;
9314 : }
9315 : }
9316 : }
9317 :
9318 11 : return 0;
9319 : }
9320 :
9321 : /* Returns 0 if a read-write-many claim can be taken. */
9322 : static int
9323 8 : claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9324 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9325 : {
9326 8 : struct spdk_bdev *bdev = desc->bdev;
9327 : struct spdk_bdev_desc *open_desc;
9328 :
9329 8 : assert(spdk_spin_held(&bdev->internal.spinlock));
9330 8 : assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
9331 8 : assert(desc->claim == NULL);
9332 :
9333 8 : if (opts->shared_claim_key == 0) {
9334 2 : SPDK_ERRLOG("%s: shared_claim_key option required with read-write-may claims\n",
9335 : bdev->name);
9336 2 : return -EINVAL;
9337 : }
9338 6 : switch (bdev->internal.claim_type) {
9339 4 : case SPDK_BDEV_CLAIM_NONE:
9340 7 : TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9341 5 : if (open_desc == desc) {
9342 3 : continue;
9343 : }
9344 2 : if (open_desc->write) {
9345 2 : SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
9346 : "another descriptor is open for writing without a "
9347 : "claim\n", bdev->name);
9348 2 : return -EPERM;
9349 : }
9350 : }
9351 2 : break;
9352 2 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9353 2 : if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
9354 1 : LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
9355 1 : return -EPERM;
9356 : }
9357 1 : break;
9358 0 : default:
9359 0 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9360 0 : return -EBUSY;
9361 : }
9362 :
9363 3 : return 0;
9364 : }
9365 :
9366 : /* Updates desc and its bdev with a v2 claim. */
9367 : static int
9368 20 : claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9369 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9370 : {
9371 20 : struct spdk_bdev *bdev = desc->bdev;
9372 : struct spdk_bdev_module_claim *claim;
9373 :
9374 20 : assert(spdk_spin_held(&bdev->internal.spinlock));
9375 20 : assert(claim_type_is_v2(type));
9376 20 : assert(desc->claim == NULL);
9377 :
9378 20 : claim = calloc(1, sizeof(*desc->claim));
9379 20 : if (claim == NULL) {
9380 0 : SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
9381 0 : return -ENOMEM;
9382 : }
9383 20 : claim->module = module;
9384 20 : claim->desc = desc;
9385 : SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
9386 20 : memcpy(claim->name, opts->name, sizeof(claim->name));
9387 20 : desc->claim = claim;
9388 :
9389 20 : if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9390 16 : bdev->internal.claim_type = type;
9391 16 : TAILQ_INIT(&bdev->internal.claim.v2.claims);
9392 16 : bdev->internal.claim.v2.key = opts->shared_claim_key;
9393 : }
9394 20 : assert(type == bdev->internal.claim_type);
9395 :
9396 20 : TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);
9397 :
9398 20 : if (!desc->write && claim_type_promotes_to_write(type)) {
9399 6 : desc->write = true;
9400 : }
9401 :
9402 20 : return 0;
9403 : }
9404 :
9405 : int
9406 44 : spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9407 : struct spdk_bdev_claim_opts *_opts,
9408 : struct spdk_bdev_module *module)
9409 : {
9410 : struct spdk_bdev *bdev;
9411 44 : struct spdk_bdev_claim_opts opts;
9412 44 : int rc = 0;
9413 :
9414 44 : if (desc == NULL) {
9415 0 : SPDK_ERRLOG("descriptor must not be NULL\n");
9416 0 : return -EINVAL;
9417 : }
9418 :
9419 44 : bdev = desc->bdev;
9420 :
9421 44 : if (_opts == NULL) {
9422 22 : spdk_bdev_claim_opts_init(&opts, sizeof(opts));
9423 22 : } else if (claim_opts_copy(_opts, &opts) != 0) {
9424 0 : return -EINVAL;
9425 : }
9426 :
9427 44 : spdk_spin_lock(&bdev->internal.spinlock);
9428 :
9429 44 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
9430 17 : bdev->internal.claim_type != type) {
9431 11 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9432 11 : spdk_spin_unlock(&bdev->internal.spinlock);
9433 11 : return -EPERM;
9434 : }
9435 :
9436 33 : if (claim_type_is_v2(type) && desc->claim != NULL) {
9437 0 : SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
9438 : bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
9439 0 : spdk_spin_unlock(&bdev->internal.spinlock);
9440 0 : return -EPERM;
9441 : }
9442 :
9443 33 : switch (type) {
9444 0 : case SPDK_BDEV_CLAIM_EXCL_WRITE:
9445 0 : spdk_spin_unlock(&bdev->internal.spinlock);
9446 0 : return spdk_bdev_module_claim_bdev(bdev, desc, module);
9447 10 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9448 10 : rc = claim_verify_rwo(desc, type, &opts, module);
9449 10 : break;
9450 15 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9451 15 : rc = claim_verify_rom(desc, type, &opts, module);
9452 15 : break;
9453 8 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9454 8 : rc = claim_verify_rwm(desc, type, &opts, module);
9455 8 : break;
9456 0 : default:
9457 0 : SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
9458 0 : rc = -ENOTSUP;
9459 : }
9460 :
9461 33 : if (rc == 0) {
9462 20 : rc = claim_bdev(desc, type, &opts, module);
9463 : }
9464 :
9465 33 : spdk_spin_unlock(&bdev->internal.spinlock);
9466 33 : return rc;
9467 : }
9468 :
9469 : static void
9470 16 : claim_reset(struct spdk_bdev *bdev)
9471 : {
9472 16 : assert(spdk_spin_held(&bdev->internal.spinlock));
9473 16 : assert(claim_type_is_v2(bdev->internal.claim_type));
9474 16 : assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));
9475 :
9476 16 : memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
9477 16 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
9478 16 : }
9479 :
9480 : static void
9481 20 : bdev_desc_release_claims(struct spdk_bdev_desc *desc)
9482 : {
9483 20 : struct spdk_bdev *bdev = desc->bdev;
9484 :
9485 20 : assert(spdk_spin_held(&bdev->internal.spinlock));
9486 20 : assert(claim_type_is_v2(bdev->internal.claim_type));
9487 :
9488 20 : if (bdev->internal.examine_in_progress == 0) {
9489 20 : TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
9490 20 : free(desc->claim);
9491 20 : if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
9492 16 : claim_reset(bdev);
9493 : }
9494 : } else {
9495 : /* This is a dead claim that will be cleaned up when bdev_examine() is done. */
9496 0 : desc->claim->module = NULL;
9497 0 : desc->claim->desc = NULL;
9498 : }
9499 20 : desc->claim = NULL;
9500 20 : }
9501 :
9502 : /*
9503 : * End claims v2
9504 : */
9505 :
9506 : struct spdk_bdev *
9507 1590 : spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
9508 : {
9509 1590 : assert(desc != NULL);
9510 1590 : return desc->bdev;
9511 : }
9512 :
9513 : int
9514 1 : spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
9515 : {
9516 : struct spdk_bdev *bdev, *tmp;
9517 1 : struct spdk_bdev_desc *desc;
9518 1 : int rc = 0;
9519 :
9520 1 : assert(fn != NULL);
9521 :
9522 1 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9523 1 : bdev = spdk_bdev_first();
9524 9 : while (bdev != NULL) {
9525 8 : rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9526 8 : if (rc != 0) {
9527 0 : break;
9528 : }
9529 8 : rc = bdev_open(bdev, false, desc);
9530 8 : if (rc != 0) {
9531 1 : bdev_desc_free(desc);
9532 1 : if (rc == -ENODEV) {
9533 : /* Ignore the error and move to the next bdev. */
9534 1 : rc = 0;
9535 1 : bdev = spdk_bdev_next(bdev);
9536 1 : continue;
9537 : }
9538 0 : break;
9539 : }
9540 7 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9541 :
9542 7 : rc = fn(ctx, bdev);
9543 :
9544 7 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9545 7 : tmp = spdk_bdev_next(bdev);
9546 7 : bdev_close(bdev, desc);
9547 7 : if (rc != 0) {
9548 0 : break;
9549 : }
9550 7 : bdev = tmp;
9551 : }
9552 1 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9553 :
9554 1 : return rc;
9555 : }
9556 :
9557 : int
9558 1 : spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
9559 : {
9560 : struct spdk_bdev *bdev, *tmp;
9561 1 : struct spdk_bdev_desc *desc;
9562 1 : int rc = 0;
9563 :
9564 1 : assert(fn != NULL);
9565 :
9566 1 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9567 1 : bdev = spdk_bdev_first_leaf();
9568 6 : while (bdev != NULL) {
9569 5 : rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9570 5 : if (rc != 0) {
9571 0 : break;
9572 : }
9573 5 : rc = bdev_open(bdev, false, desc);
9574 5 : if (rc != 0) {
9575 1 : bdev_desc_free(desc);
9576 1 : if (rc == -ENODEV) {
9577 : /* Ignore the error and move to the next bdev. */
9578 1 : rc = 0;
9579 1 : bdev = spdk_bdev_next_leaf(bdev);
9580 1 : continue;
9581 : }
9582 0 : break;
9583 : }
9584 4 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9585 :
9586 4 : rc = fn(ctx, bdev);
9587 :
9588 4 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9589 4 : tmp = spdk_bdev_next_leaf(bdev);
9590 4 : bdev_close(bdev, desc);
9591 4 : if (rc != 0) {
9592 0 : break;
9593 : }
9594 4 : bdev = tmp;
9595 : }
9596 1 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9597 :
9598 1 : return rc;
9599 : }
9600 :
9601 : void
9602 0 : spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
9603 : {
9604 : struct iovec *iovs;
9605 : int iovcnt;
9606 :
9607 0 : if (bdev_io == NULL) {
9608 0 : return;
9609 : }
9610 :
9611 0 : switch (bdev_io->type) {
9612 0 : case SPDK_BDEV_IO_TYPE_READ:
9613 : case SPDK_BDEV_IO_TYPE_WRITE:
9614 : case SPDK_BDEV_IO_TYPE_ZCOPY:
9615 0 : iovs = bdev_io->u.bdev.iovs;
9616 0 : iovcnt = bdev_io->u.bdev.iovcnt;
9617 0 : break;
9618 0 : default:
9619 0 : iovs = NULL;
9620 0 : iovcnt = 0;
9621 0 : break;
9622 : }
9623 :
9624 0 : if (iovp) {
9625 0 : *iovp = iovs;
9626 : }
9627 0 : if (iovcntp) {
9628 0 : *iovcntp = iovcnt;
9629 : }
9630 : }
9631 :
9632 : void *
9633 0 : spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
9634 : {
9635 0 : if (bdev_io == NULL) {
9636 0 : return NULL;
9637 : }
9638 :
9639 0 : if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
9640 0 : return NULL;
9641 : }
9642 :
9643 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
9644 0 : bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
9645 0 : return bdev_io->u.bdev.md_buf;
9646 : }
9647 :
9648 0 : return NULL;
9649 : }
9650 :
9651 : void *
9652 0 : spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
9653 : {
9654 0 : if (bdev_io == NULL) {
9655 0 : assert(false);
9656 : return NULL;
9657 : }
9658 :
9659 0 : return bdev_io->internal.caller_ctx;
9660 : }
9661 :
9662 : void
9663 7 : spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
9664 : {
9665 :
9666 7 : if (spdk_bdev_module_list_find(bdev_module->name)) {
9667 0 : SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
9668 0 : assert(false);
9669 : }
9670 :
9671 7 : spdk_spin_init(&bdev_module->internal.spinlock);
9672 7 : TAILQ_INIT(&bdev_module->internal.quiesced_ranges);
9673 :
9674 : /*
9675 : * Modules with examine callbacks must be initialized first, so they are
9676 : * ready to handle examine callbacks from later modules that will
9677 : * register physical bdevs.
9678 : */
9679 7 : if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
9680 4 : TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9681 : } else {
9682 3 : TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9683 : }
9684 7 : }
9685 :
9686 : struct spdk_bdev_module *
9687 7 : spdk_bdev_module_list_find(const char *name)
9688 : {
9689 : struct spdk_bdev_module *bdev_module;
9690 :
9691 14 : TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
9692 7 : if (strcmp(name, bdev_module->name) == 0) {
9693 0 : break;
9694 : }
9695 : }
9696 :
9697 7 : return bdev_module;
9698 : }
9699 :
9700 : static int
9701 6 : bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io)
9702 : {
9703 : uint64_t num_blocks;
9704 6 : void *md_buf = NULL;
9705 :
9706 6 : num_blocks = bdev_io->u.bdev.num_blocks;
9707 :
9708 6 : if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
9709 2 : md_buf = (char *)g_bdev_mgr.zero_buffer +
9710 2 : spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
9711 : }
9712 :
9713 12 : return bdev_write_blocks_with_md(bdev_io->internal.desc,
9714 6 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
9715 : g_bdev_mgr.zero_buffer, md_buf,
9716 : bdev_io->u.bdev.offset_blocks, num_blocks,
9717 : bdev_write_zero_buffer_done, bdev_io);
9718 : }
9719 :
9720 : static void
9721 6 : bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9722 : {
9723 6 : struct spdk_bdev_io *parent_io = cb_arg;
9724 :
9725 6 : spdk_bdev_free_io(bdev_io);
9726 :
9727 6 : parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
9728 6 : parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
9729 6 : }
9730 :
9731 : static void
9732 10 : bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
9733 : {
9734 10 : spdk_spin_lock(&ctx->bdev->internal.spinlock);
9735 10 : ctx->bdev->internal.qos_mod_in_progress = false;
9736 10 : spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9737 :
9738 10 : if (ctx->cb_fn) {
9739 8 : ctx->cb_fn(ctx->cb_arg, status);
9740 : }
9741 10 : free(ctx);
9742 10 : }
9743 :
9744 : static void
9745 2 : bdev_disable_qos_done(void *cb_arg)
9746 : {
9747 2 : struct set_qos_limit_ctx *ctx = cb_arg;
9748 2 : struct spdk_bdev *bdev = ctx->bdev;
9749 : struct spdk_bdev_qos *qos;
9750 :
9751 2 : spdk_spin_lock(&bdev->internal.spinlock);
9752 2 : qos = bdev->internal.qos;
9753 2 : bdev->internal.qos = NULL;
9754 2 : spdk_spin_unlock(&bdev->internal.spinlock);
9755 :
9756 2 : if (qos->thread != NULL) {
9757 2 : spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
9758 2 : spdk_poller_unregister(&qos->poller);
9759 : }
9760 :
9761 2 : free(qos);
9762 :
9763 2 : bdev_set_qos_limit_done(ctx, 0);
9764 2 : }
9765 :
9766 : static void
9767 2 : bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
9768 : {
9769 2 : struct set_qos_limit_ctx *ctx = _ctx;
9770 : struct spdk_thread *thread;
9771 :
9772 2 : spdk_spin_lock(&bdev->internal.spinlock);
9773 2 : thread = bdev->internal.qos->thread;
9774 2 : spdk_spin_unlock(&bdev->internal.spinlock);
9775 :
9776 2 : if (thread != NULL) {
9777 2 : spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
9778 : } else {
9779 0 : bdev_disable_qos_done(ctx);
9780 : }
9781 2 : }
9782 :
9783 : static void
9784 4 : bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9785 : struct spdk_io_channel *ch, void *_ctx)
9786 : {
9787 4 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9788 : struct spdk_bdev_io *bdev_io;
9789 :
9790 4 : bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
9791 :
9792 6 : while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) {
9793 : /* Re-submit the queued I/O. */
9794 2 : bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io);
9795 2 : TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link);
9796 2 : _bdev_io_submit(bdev_io);
9797 : }
9798 :
9799 4 : spdk_bdev_for_each_channel_continue(i, 0);
9800 4 : }
9801 :
9802 : static void
9803 1 : bdev_update_qos_rate_limit_msg(void *cb_arg)
9804 : {
9805 1 : struct set_qos_limit_ctx *ctx = cb_arg;
9806 1 : struct spdk_bdev *bdev = ctx->bdev;
9807 :
9808 1 : spdk_spin_lock(&bdev->internal.spinlock);
9809 1 : bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
9810 1 : spdk_spin_unlock(&bdev->internal.spinlock);
9811 :
9812 1 : bdev_set_qos_limit_done(ctx, 0);
9813 1 : }
9814 :
9815 : static void
9816 9 : bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9817 : struct spdk_io_channel *ch, void *_ctx)
9818 : {
9819 9 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9820 :
9821 9 : spdk_spin_lock(&bdev->internal.spinlock);
9822 9 : bdev_enable_qos(bdev, bdev_ch);
9823 9 : spdk_spin_unlock(&bdev->internal.spinlock);
9824 9 : spdk_bdev_for_each_channel_continue(i, 0);
9825 9 : }
9826 :
/*
 * Called after every channel has had QoS enabled; completes the request with
 * the status of the channel iteration.
 */
static void
bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	bdev_set_qos_limit_done((struct set_qos_limit_ctx *)_ctx, status);
}
9834 :
9835 : static void
9836 7 : bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
9837 : {
9838 : int i;
9839 :
9840 7 : assert(bdev->internal.qos != NULL);
9841 :
9842 35 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9843 28 : if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9844 28 : bdev->internal.qos->rate_limits[i].limit = limits[i];
9845 :
9846 28 : if (limits[i] == 0) {
9847 19 : bdev->internal.qos->rate_limits[i].limit =
9848 : SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
9849 : }
9850 : }
9851 : }
9852 7 : }
9853 :
9854 : void
9855 9 : spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
9856 : void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
9857 : {
9858 : struct set_qos_limit_ctx *ctx;
9859 : uint32_t limit_set_complement;
9860 : uint64_t min_limit_per_sec;
9861 : int i;
9862 9 : bool disable_rate_limit = true;
9863 :
9864 45 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9865 36 : if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9866 0 : continue;
9867 : }
9868 :
9869 36 : if (limits[i] > 0) {
9870 10 : disable_rate_limit = false;
9871 : }
9872 :
9873 36 : if (bdev_qos_is_iops_rate_limit(i) == true) {
9874 9 : min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
9875 : } else {
9876 27 : if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) {
9877 0 : SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, "
9878 : "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC);
9879 0 : limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC;
9880 : }
9881 : /* Change from megabyte to byte rate limit */
9882 27 : limits[i] = limits[i] * 1024 * 1024;
9883 27 : min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
9884 : }
9885 :
9886 36 : limit_set_complement = limits[i] % min_limit_per_sec;
9887 36 : if (limit_set_complement) {
9888 0 : SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
9889 : limits[i], min_limit_per_sec);
9890 0 : limits[i] += min_limit_per_sec - limit_set_complement;
9891 0 : SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
9892 : }
9893 : }
9894 :
9895 9 : ctx = calloc(1, sizeof(*ctx));
9896 9 : if (ctx == NULL) {
9897 0 : cb_fn(cb_arg, -ENOMEM);
9898 0 : return;
9899 : }
9900 :
9901 9 : ctx->cb_fn = cb_fn;
9902 9 : ctx->cb_arg = cb_arg;
9903 9 : ctx->bdev = bdev;
9904 :
9905 9 : spdk_spin_lock(&bdev->internal.spinlock);
9906 9 : if (bdev->internal.qos_mod_in_progress) {
9907 1 : spdk_spin_unlock(&bdev->internal.spinlock);
9908 1 : free(ctx);
9909 1 : cb_fn(cb_arg, -EAGAIN);
9910 1 : return;
9911 : }
9912 8 : bdev->internal.qos_mod_in_progress = true;
9913 :
9914 8 : if (disable_rate_limit == true && bdev->internal.qos) {
9915 10 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9916 8 : if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
9917 0 : (bdev->internal.qos->rate_limits[i].limit > 0 &&
9918 0 : bdev->internal.qos->rate_limits[i].limit !=
9919 : SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
9920 0 : disable_rate_limit = false;
9921 0 : break;
9922 : }
9923 : }
9924 : }
9925 :
9926 8 : if (disable_rate_limit == false) {
9927 5 : if (bdev->internal.qos == NULL) {
9928 4 : bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
9929 4 : if (!bdev->internal.qos) {
9930 0 : spdk_spin_unlock(&bdev->internal.spinlock);
9931 0 : SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
9932 0 : bdev_set_qos_limit_done(ctx, -ENOMEM);
9933 0 : return;
9934 : }
9935 : }
9936 :
9937 5 : if (bdev->internal.qos->thread == NULL) {
9938 : /* Enabling */
9939 4 : bdev_set_qos_rate_limits(bdev, limits);
9940 :
9941 4 : spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
9942 : bdev_enable_qos_done);
9943 : } else {
9944 : /* Updating */
9945 1 : bdev_set_qos_rate_limits(bdev, limits);
9946 :
9947 1 : spdk_thread_send_msg(bdev->internal.qos->thread,
9948 : bdev_update_qos_rate_limit_msg, ctx);
9949 : }
9950 : } else {
9951 3 : if (bdev->internal.qos != NULL) {
9952 2 : bdev_set_qos_rate_limits(bdev, limits);
9953 :
9954 : /* Disabling */
9955 2 : spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
9956 : bdev_disable_qos_msg_done);
9957 : } else {
9958 1 : spdk_spin_unlock(&bdev->internal.spinlock);
9959 1 : bdev_set_qos_limit_done(ctx, 0);
9960 1 : return;
9961 : }
9962 : }
9963 :
9964 7 : spdk_spin_unlock(&bdev->internal.spinlock);
9965 : }
9966 :
/* Context for one histogram enable/disable request; freed once cb_fn has been invoked. */
9967 : struct spdk_bdev_histogram_ctx {
/* Completion callback, invoked as cb_fn(cb_arg, status). */
9968 : spdk_bdev_histogram_status_cb cb_fn;
/* Opaque argument passed back to cb_fn. */
9969 : void *cb_arg;
/* The bdev whose histogram state is being changed. */
9970 : struct spdk_bdev *bdev;
/* Result reported to cb_fn; starts at 0 and is set to the error if enabling a channel fails. */
9971 : int status;
9972 : };
9973 :
9974 : static void
9975 2 : bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9976 : {
9977 2 : struct spdk_bdev_histogram_ctx *ctx = _ctx;
9978 :
9979 2 : spdk_spin_lock(&ctx->bdev->internal.spinlock);
9980 2 : ctx->bdev->internal.histogram_in_progress = false;
9981 2 : spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9982 2 : ctx->cb_fn(ctx->cb_arg, ctx->status);
9983 2 : free(ctx);
9984 2 : }
9985 :
9986 : static void
9987 3 : bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9988 : struct spdk_io_channel *_ch, void *_ctx)
9989 : {
9990 3 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9991 :
9992 3 : if (ch->histogram != NULL) {
9993 3 : spdk_histogram_data_free(ch->histogram);
9994 3 : ch->histogram = NULL;
9995 : }
9996 3 : spdk_bdev_for_each_channel_continue(i, 0);
9997 3 : }
9998 :
9999 : static void
10000 2 : bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10001 : {
10002 2 : struct spdk_bdev_histogram_ctx *ctx = _ctx;
10003 :
10004 2 : if (status != 0) {
10005 0 : ctx->status = status;
10006 0 : ctx->bdev->internal.histogram_enabled = false;
10007 0 : spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
10008 : bdev_histogram_disable_channel_cb);
10009 : } else {
10010 2 : spdk_spin_lock(&ctx->bdev->internal.spinlock);
10011 2 : ctx->bdev->internal.histogram_in_progress = false;
10012 2 : spdk_spin_unlock(&ctx->bdev->internal.spinlock);
10013 2 : ctx->cb_fn(ctx->cb_arg, ctx->status);
10014 2 : free(ctx);
10015 : }
10016 2 : }
10017 :
10018 : static void
10019 3 : bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10020 : struct spdk_io_channel *_ch, void *_ctx)
10021 : {
10022 3 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10023 3 : int status = 0;
10024 :
10025 3 : if (ch->histogram == NULL) {
10026 3 : ch->histogram = spdk_histogram_data_alloc();
10027 3 : if (ch->histogram == NULL) {
10028 0 : status = -ENOMEM;
10029 : }
10030 : }
10031 :
10032 3 : spdk_bdev_for_each_channel_continue(i, status);
10033 3 : }
10034 :
10035 : void
10036 4 : spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
10037 : void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts)
10038 : {
10039 : struct spdk_bdev_histogram_ctx *ctx;
10040 :
10041 4 : ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
10042 4 : if (ctx == NULL) {
10043 0 : cb_fn(cb_arg, -ENOMEM);
10044 0 : return;
10045 : }
10046 :
10047 4 : ctx->bdev = bdev;
10048 4 : ctx->status = 0;
10049 4 : ctx->cb_fn = cb_fn;
10050 4 : ctx->cb_arg = cb_arg;
10051 :
10052 4 : spdk_spin_lock(&bdev->internal.spinlock);
10053 4 : if (bdev->internal.histogram_in_progress) {
10054 0 : spdk_spin_unlock(&bdev->internal.spinlock);
10055 0 : free(ctx);
10056 0 : cb_fn(cb_arg, -EAGAIN);
10057 0 : return;
10058 : }
10059 :
10060 4 : bdev->internal.histogram_in_progress = true;
10061 4 : spdk_spin_unlock(&bdev->internal.spinlock);
10062 :
10063 4 : bdev->internal.histogram_enabled = enable;
10064 4 : bdev->internal.histogram_io_type = opts->io_type;
10065 :
10066 4 : if (enable) {
10067 : /* Allocate histogram for each channel */
10068 2 : spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
10069 : bdev_histogram_enable_channel_cb);
10070 : } else {
10071 2 : spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
10072 : bdev_histogram_disable_channel_cb);
10073 : }
10074 : }
10075 :
10076 : void
10077 4 : spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size)
10078 : {
10079 4 : if (opts == NULL) {
10080 0 : SPDK_ERRLOG("opts should not be NULL\n");
10081 0 : assert(opts != NULL);
10082 0 : return;
10083 : }
10084 4 : if (size == 0) {
10085 0 : SPDK_ERRLOG("size should not be zero\n");
10086 0 : assert(size != 0);
10087 0 : return;
10088 : }
10089 :
10090 4 : memset(opts, 0, size);
10091 4 : opts->size = size;
10092 :
10093 : #define FIELD_OK(field) \
10094 : offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size
10095 :
10096 : #define SET_FIELD(field, value) \
10097 : if (FIELD_OK(field)) { \
10098 : opts->field = value; \
10099 : } \
10100 :
10101 4 : SET_FIELD(io_type, 0);
10102 :
10103 : /* You should not remove this statement, but need to update the assert statement
10104 : * if you add a new field, and also add a corresponding SET_FIELD statement */
10105 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size");
10106 :
10107 : #undef FIELD_OK
10108 : #undef SET_FIELD
10109 : }
10110 :
10111 : void
10112 4 : spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
10113 : void *cb_arg, bool enable)
10114 : {
10115 4 : struct spdk_bdev_enable_histogram_opts opts;
10116 :
10117 4 : spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts));
10118 4 : spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts);
10119 4 : }
10120 :
/* Context for one spdk_bdev_histogram_get() request; freed once cb_fn has been invoked. */
10121 : struct spdk_bdev_histogram_data_ctx {
/* Completion callback, invoked as cb_fn(cb_arg, status, histogram). */
10122 : spdk_bdev_histogram_data_cb cb_fn;
/* Opaque argument passed back to cb_fn. */
10123 : void *cb_arg;
/* The bdev whose channel histograms are being collected. */
10124 : struct spdk_bdev *bdev;
10125 : /** merged histogram data from all channels */
10126 : struct spdk_histogram_data *histogram;
10127 : };
10128 :
10129 : static void
10130 5 : bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10131 : {
10132 5 : struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
10133 :
10134 5 : ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
10135 5 : free(ctx);
10136 5 : }
10137 :
10138 : static void
10139 7 : bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10140 : struct spdk_io_channel *_ch, void *_ctx)
10141 : {
10142 7 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10143 7 : struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
10144 7 : int status = 0;
10145 :
10146 7 : if (ch->histogram == NULL) {
10147 1 : status = -EFAULT;
10148 : } else {
10149 6 : spdk_histogram_data_merge(ctx->histogram, ch->histogram);
10150 : }
10151 :
10152 7 : spdk_bdev_for_each_channel_continue(i, status);
10153 7 : }
10154 :
10155 : void
10156 5 : spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
10157 : spdk_bdev_histogram_data_cb cb_fn,
10158 : void *cb_arg)
10159 : {
10160 : struct spdk_bdev_histogram_data_ctx *ctx;
10161 :
10162 5 : ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
10163 5 : if (ctx == NULL) {
10164 0 : cb_fn(cb_arg, -ENOMEM, NULL);
10165 0 : return;
10166 : }
10167 :
10168 5 : ctx->bdev = bdev;
10169 5 : ctx->cb_fn = cb_fn;
10170 5 : ctx->cb_arg = cb_arg;
10171 :
10172 5 : ctx->histogram = histogram;
10173 :
10174 5 : spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
10175 : bdev_histogram_get_channel_cb);
10176 : }
10177 :
10178 : void
10179 2 : spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
10180 : void *cb_arg)
10181 : {
10182 2 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
10183 2 : int status = 0;
10184 :
10185 2 : assert(cb_fn != NULL);
10186 :
10187 2 : if (bdev_ch->histogram == NULL) {
10188 1 : status = -EFAULT;
10189 : }
10190 2 : cb_fn(cb_arg, status, bdev_ch->histogram);
10191 2 : }
10192 :
10193 : size_t
10194 0 : spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
10195 : size_t max_events)
10196 : {
10197 : struct media_event_entry *entry;
10198 0 : size_t num_events = 0;
10199 :
10200 0 : for (; num_events < max_events; ++num_events) {
10201 0 : entry = TAILQ_FIRST(&desc->pending_media_events);
10202 0 : if (entry == NULL) {
10203 0 : break;
10204 : }
10205 :
10206 0 : events[num_events] = entry->event;
10207 0 : TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
10208 0 : TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
10209 : }
10210 :
10211 0 : return num_events;
10212 : }
10213 :
10214 : int
10215 0 : spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
10216 : size_t num_events)
10217 : {
10218 : struct spdk_bdev_desc *desc;
10219 : struct media_event_entry *entry;
10220 : size_t event_id;
10221 0 : int rc = 0;
10222 :
10223 0 : assert(bdev->media_events);
10224 :
10225 0 : spdk_spin_lock(&bdev->internal.spinlock);
10226 0 : TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
10227 0 : if (desc->write) {
10228 0 : break;
10229 : }
10230 : }
10231 :
10232 0 : if (desc == NULL || desc->media_events_buffer == NULL) {
10233 0 : rc = -ENODEV;
10234 0 : goto out;
10235 : }
10236 :
10237 0 : for (event_id = 0; event_id < num_events; ++event_id) {
10238 0 : entry = TAILQ_FIRST(&desc->free_media_events);
10239 0 : if (entry == NULL) {
10240 0 : break;
10241 : }
10242 :
10243 0 : TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
10244 0 : TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
10245 0 : entry->event = events[event_id];
10246 : }
10247 :
10248 0 : rc = event_id;
10249 0 : out:
10250 0 : spdk_spin_unlock(&bdev->internal.spinlock);
10251 0 : return rc;
10252 : }
10253 :
10254 : static void
10255 0 : _media_management_notify(void *arg)
10256 : {
10257 0 : struct spdk_bdev_desc *desc = arg;
10258 :
10259 0 : _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
10260 0 : }
10261 :
10262 : void
10263 0 : spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
10264 : {
10265 : struct spdk_bdev_desc *desc;
10266 :
10267 0 : spdk_spin_lock(&bdev->internal.spinlock);
10268 0 : TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
10269 0 : if (!TAILQ_EMPTY(&desc->pending_media_events)) {
10270 0 : event_notify(desc, _media_management_notify);
10271 : }
10272 : }
10273 0 : spdk_spin_unlock(&bdev->internal.spinlock);
10274 0 : }
10275 :
10276 : struct locked_lba_range_ctx {
10277 : struct lba_range range;
10278 : struct lba_range *current_range;
10279 : struct lba_range *owner_range;
10280 : struct spdk_poller *poller;
10281 : lock_range_cb cb_fn;
10282 : void *cb_arg;
10283 : };
10284 :
10285 : static void
10286 0 : bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10287 : {
10288 0 : struct locked_lba_range_ctx *ctx = _ctx;
10289 :
10290 0 : ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
10291 0 : free(ctx);
10292 0 : }
10293 :
10294 : static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
10295 : struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
10296 :
10297 : static void
10298 14 : bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10299 : {
10300 14 : struct locked_lba_range_ctx *ctx = _ctx;
10301 :
10302 14 : if (status == -ENOMEM) {
10303 : /* One of the channels could not allocate a range object.
10304 : * So we have to go back and clean up any ranges that were
10305 : * allocated successfully before we return error status to
10306 : * the caller. We can reuse the unlock function to do that
10307 : * clean up.
10308 : */
10309 0 : spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
10310 : bdev_lock_error_cleanup_cb);
10311 0 : return;
10312 : }
10313 :
10314 : /* All channels have locked this range and no I/O overlapping the range
10315 : * are outstanding! Set the owner_ch for the range object for the
10316 : * locking channel, so that this channel will know that it is allowed
10317 : * to write to this range.
10318 : */
10319 14 : if (ctx->owner_range != NULL) {
10320 10 : ctx->owner_range->owner_ch = ctx->range.owner_ch;
10321 : }
10322 :
10323 14 : ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
10324 :
10325 : /* Don't free the ctx here. Its range is in the bdev's global list of
10326 : * locked ranges still, and will be removed and freed when this range
10327 : * is later unlocked.
10328 : */
10329 : }
10330 :
10331 : static int
10332 17 : bdev_lock_lba_range_check_io(void *_i)
10333 : {
10334 17 : struct spdk_bdev_channel_iter *i = _i;
10335 17 : struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
10336 17 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10337 17 : struct locked_lba_range_ctx *ctx = i->ctx;
10338 17 : struct lba_range *range = ctx->current_range;
10339 : struct spdk_bdev_io *bdev_io;
10340 :
10341 17 : spdk_poller_unregister(&ctx->poller);
10342 :
10343 : /* The range is now in the locked_ranges, so no new IO can be submitted to this
10344 : * range. But we need to wait until any outstanding IO overlapping with this range
10345 : * are completed.
10346 : */
10347 18 : TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
10348 3 : if (bdev_io_range_is_locked(bdev_io, range)) {
10349 2 : ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
10350 2 : return SPDK_POLLER_BUSY;
10351 : }
10352 : }
10353 :
10354 15 : spdk_bdev_for_each_channel_continue(i, 0);
10355 15 : return SPDK_POLLER_BUSY;
10356 : }
10357 :
10358 : static void
10359 15 : bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10360 : struct spdk_io_channel *_ch, void *_ctx)
10361 : {
10362 15 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10363 15 : struct locked_lba_range_ctx *ctx = _ctx;
10364 : struct lba_range *range;
10365 :
10366 16 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10367 1 : if (range->length == ctx->range.length &&
10368 0 : range->offset == ctx->range.offset &&
10369 0 : range->locked_ctx == ctx->range.locked_ctx) {
10370 : /* This range already exists on this channel, so don't add
10371 : * it again. This can happen when a new channel is created
10372 : * while the for_each_channel operation is in progress.
10373 : * Do not check for outstanding I/O in that case, since the
10374 : * range was locked before any I/O could be submitted to the
10375 : * new channel.
10376 : */
10377 0 : spdk_bdev_for_each_channel_continue(i, 0);
10378 0 : return;
10379 : }
10380 : }
10381 :
10382 15 : range = calloc(1, sizeof(*range));
10383 15 : if (range == NULL) {
10384 0 : spdk_bdev_for_each_channel_continue(i, -ENOMEM);
10385 0 : return;
10386 : }
10387 :
10388 15 : range->length = ctx->range.length;
10389 15 : range->offset = ctx->range.offset;
10390 15 : range->locked_ctx = ctx->range.locked_ctx;
10391 15 : range->quiesce = ctx->range.quiesce;
10392 15 : ctx->current_range = range;
10393 15 : if (ctx->range.owner_ch == ch) {
10394 : /* This is the range object for the channel that will hold
10395 : * the lock. Store it in the ctx object so that we can easily
10396 : * set its owner_ch after the lock is finally acquired.
10397 : */
10398 10 : ctx->owner_range = range;
10399 : }
10400 15 : TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
10401 15 : bdev_lock_lba_range_check_io(i);
10402 : }
10403 :
10404 : static void
10405 14 : bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
10406 : {
10407 14 : assert(spdk_get_thread() == ctx->range.owner_thread);
10408 14 : assert(ctx->range.owner_ch == NULL ||
10409 : spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);
10410 :
10411 : /* We will add a copy of this range to each channel now. */
10412 14 : spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
10413 : bdev_lock_lba_range_cb);
10414 14 : }
10415 :
10416 : static bool
10417 17 : bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
10418 : {
10419 : struct lba_range *r;
10420 :
10421 18 : TAILQ_FOREACH(r, tailq, tailq) {
10422 4 : if (bdev_lba_range_overlapped(range, r)) {
10423 3 : return true;
10424 : }
10425 : }
10426 14 : return false;
10427 : }
10428 :
10429 : static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status);
10430 :
10431 : static int
10432 14 : _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
10433 : uint64_t offset, uint64_t length,
10434 : lock_range_cb cb_fn, void *cb_arg)
10435 : {
10436 : struct locked_lba_range_ctx *ctx;
10437 :
10438 14 : ctx = calloc(1, sizeof(*ctx));
10439 14 : if (ctx == NULL) {
10440 0 : return -ENOMEM;
10441 : }
10442 :
10443 14 : ctx->range.offset = offset;
10444 14 : ctx->range.length = length;
10445 14 : ctx->range.owner_thread = spdk_get_thread();
10446 14 : ctx->range.owner_ch = ch;
10447 14 : ctx->range.locked_ctx = cb_arg;
10448 14 : ctx->range.bdev = bdev;
10449 14 : ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked);
10450 14 : ctx->cb_fn = cb_fn;
10451 14 : ctx->cb_arg = cb_arg;
10452 :
10453 14 : spdk_spin_lock(&bdev->internal.spinlock);
10454 14 : if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
10455 : /* There is an active lock overlapping with this range.
10456 : * Put it on the pending list until this range no
10457 : * longer overlaps with another.
10458 : */
10459 2 : TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
10460 : } else {
10461 12 : TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
10462 12 : bdev_lock_lba_range_ctx(bdev, ctx);
10463 : }
10464 14 : spdk_spin_unlock(&bdev->internal.spinlock);
10465 14 : return 0;
10466 : }
10467 :
10468 : static int
10469 10 : bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10470 : uint64_t offset, uint64_t length,
10471 : lock_range_cb cb_fn, void *cb_arg)
10472 : {
10473 10 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10474 10 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10475 :
10476 10 : if (cb_arg == NULL) {
10477 0 : SPDK_ERRLOG("cb_arg must not be NULL\n");
10478 0 : return -EINVAL;
10479 : }
10480 :
10481 10 : return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
10482 : }
10483 :
10484 : static void
10485 2 : bdev_lock_lba_range_ctx_msg(void *_ctx)
10486 : {
10487 2 : struct locked_lba_range_ctx *ctx = _ctx;
10488 :
10489 2 : bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
10490 2 : }
10491 :
10492 : static void
10493 14 : bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10494 : {
10495 14 : struct locked_lba_range_ctx *ctx = _ctx;
10496 : struct locked_lba_range_ctx *pending_ctx;
10497 : struct lba_range *range, *tmp;
10498 :
10499 14 : spdk_spin_lock(&bdev->internal.spinlock);
10500 : /* Check if there are any pending locked ranges that overlap with this range
10501 : * that was just unlocked. If there are, check that it doesn't overlap with any
10502 : * other locked ranges before calling bdev_lock_lba_range_ctx which will start
10503 : * the lock process.
10504 : */
10505 17 : TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
10506 3 : if (bdev_lba_range_overlapped(range, &ctx->range) &&
10507 3 : !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
10508 2 : TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
10509 2 : pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10510 2 : TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
10511 2 : spdk_thread_send_msg(pending_ctx->range.owner_thread,
10512 : bdev_lock_lba_range_ctx_msg, pending_ctx);
10513 : }
10514 : }
10515 14 : spdk_spin_unlock(&bdev->internal.spinlock);
10516 :
10517 14 : ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
10518 14 : free(ctx);
10519 14 : }
10520 :
10521 : static void
10522 16 : bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10523 : struct spdk_io_channel *_ch, void *_ctx)
10524 : {
10525 16 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10526 16 : struct locked_lba_range_ctx *ctx = _ctx;
10527 16 : TAILQ_HEAD(, spdk_bdev_io) io_locked;
10528 : struct spdk_bdev_io *bdev_io;
10529 : struct lba_range *range;
10530 :
10531 16 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10532 16 : if (ctx->range.offset == range->offset &&
10533 16 : ctx->range.length == range->length &&
10534 16 : ctx->range.locked_ctx == range->locked_ctx) {
10535 16 : TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
10536 16 : free(range);
10537 16 : break;
10538 : }
10539 : }
10540 :
10541 : /* Note: we should almost always be able to assert that the range specified
10542 : * was found. But there are some very rare corner cases where a new channel
10543 : * gets created simultaneously with a range unlock, where this function
10544 : * would execute on that new channel and wouldn't have the range.
10545 : * We also use this to clean up range allocations when a later allocation
10546 : * fails in the locking path.
10547 : * So we can't actually assert() here.
10548 : */
10549 :
10550 : /* Swap the locked IO into a temporary list, and then try to submit them again.
10551 : * We could hyper-optimize this to only resubmit locked I/O that overlap
10552 : * with the range that was just unlocked, but this isn't a performance path so
10553 : * we go for simplicity here.
10554 : */
10555 16 : TAILQ_INIT(&io_locked);
10556 16 : TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
10557 19 : while (!TAILQ_EMPTY(&io_locked)) {
10558 3 : bdev_io = TAILQ_FIRST(&io_locked);
10559 3 : TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
10560 3 : bdev_io_submit(bdev_io);
10561 : }
10562 :
10563 16 : spdk_bdev_for_each_channel_continue(i, 0);
10564 16 : }
10565 :
10566 : static int
10567 14 : _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
10568 : lock_range_cb cb_fn, void *cb_arg)
10569 : {
10570 : struct locked_lba_range_ctx *ctx;
10571 : struct lba_range *range;
10572 :
10573 14 : spdk_spin_lock(&bdev->internal.spinlock);
10574 : /* To start the unlock the process, we find the range in the bdev's locked_ranges
10575 : * and remove it. This ensures new channels don't inherit the locked range.
10576 : * Then we will send a message to each channel to remove the range from its
10577 : * per-channel list.
10578 : */
10579 14 : TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
10580 14 : if (range->offset == offset && range->length == length &&
10581 14 : (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
10582 : break;
10583 : }
10584 : }
10585 14 : if (range == NULL) {
10586 0 : assert(false);
10587 : spdk_spin_unlock(&bdev->internal.spinlock);
10588 : return -EINVAL;
10589 : }
10590 14 : TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
10591 14 : ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10592 14 : spdk_spin_unlock(&bdev->internal.spinlock);
10593 :
10594 14 : ctx->cb_fn = cb_fn;
10595 14 : ctx->cb_arg = cb_arg;
10596 :
10597 14 : spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
10598 : bdev_unlock_lba_range_cb);
10599 14 : return 0;
10600 : }
10601 :
10602 : static int
10603 12 : bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10604 : uint64_t offset, uint64_t length,
10605 : lock_range_cb cb_fn, void *cb_arg)
10606 : {
10607 12 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10608 12 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10609 : struct lba_range *range;
10610 12 : bool range_found = false;
10611 :
10612 : /* Let's make sure the specified channel actually has a lock on
10613 : * the specified range. Note that the range must match exactly.
10614 : */
10615 14 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10616 12 : if (range->offset == offset && range->length == length &&
10617 11 : range->owner_ch == ch && range->locked_ctx == cb_arg) {
10618 10 : range_found = true;
10619 10 : break;
10620 : }
10621 : }
10622 :
10623 12 : if (!range_found) {
10624 2 : return -EINVAL;
10625 : }
10626 :
10627 10 : return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
10628 : }
10629 :
10630 : struct bdev_quiesce_ctx {
10631 : spdk_bdev_quiesce_cb cb_fn;
10632 : void *cb_arg;
10633 : };
10634 :
10635 : static void
10636 4 : bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
10637 : {
10638 4 : struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10639 :
10640 4 : if (quiesce_ctx->cb_fn != NULL) {
10641 4 : quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10642 : }
10643 :
10644 4 : free(quiesce_ctx);
10645 4 : }
10646 :
10647 : static void
10648 4 : bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
10649 : {
10650 4 : struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10651 4 : struct spdk_bdev_module *module = range->bdev->module;
10652 :
10653 4 : if (status != 0) {
10654 0 : if (quiesce_ctx->cb_fn != NULL) {
10655 0 : quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10656 : }
10657 0 : free(quiesce_ctx);
10658 0 : return;
10659 : }
10660 :
10661 4 : spdk_spin_lock(&module->internal.spinlock);
10662 4 : TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
10663 4 : spdk_spin_unlock(&module->internal.spinlock);
10664 :
10665 4 : if (quiesce_ctx->cb_fn != NULL) {
10666 : /* copy the context in case the range is unlocked by the callback */
10667 4 : struct bdev_quiesce_ctx tmp = *quiesce_ctx;
10668 :
10669 4 : quiesce_ctx->cb_fn = NULL;
10670 4 : quiesce_ctx->cb_arg = NULL;
10671 :
10672 4 : tmp.cb_fn(tmp.cb_arg, status);
10673 : }
10674 : /* quiesce_ctx will be freed on unquiesce */
10675 : }
10676 :
10677 : static int
10678 9 : _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10679 : uint64_t offset, uint64_t length,
10680 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
10681 : bool unquiesce)
10682 : {
10683 : struct bdev_quiesce_ctx *quiesce_ctx;
10684 : int rc;
10685 :
10686 9 : if (module != bdev->module) {
10687 0 : SPDK_ERRLOG("Bdev does not belong to specified module.\n");
10688 0 : return -EINVAL;
10689 : }
10690 :
10691 9 : if (!bdev_io_valid_blocks(bdev, offset, length)) {
10692 0 : return -EINVAL;
10693 : }
10694 :
10695 9 : if (unquiesce) {
10696 : struct lba_range *range;
10697 :
10698 : /* Make sure the specified range is actually quiesced in the specified module and
10699 : * then remove it from the list. Note that the range must match exactly.
10700 : */
10701 5 : spdk_spin_lock(&module->internal.spinlock);
10702 6 : TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
10703 5 : if (range->bdev == bdev && range->offset == offset && range->length == length) {
10704 4 : TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
10705 4 : break;
10706 : }
10707 : }
10708 5 : spdk_spin_unlock(&module->internal.spinlock);
10709 :
10710 5 : if (range == NULL) {
10711 1 : SPDK_ERRLOG("The range to unquiesce was not found.\n");
10712 1 : return -EINVAL;
10713 : }
10714 :
10715 4 : quiesce_ctx = range->locked_ctx;
10716 4 : quiesce_ctx->cb_fn = cb_fn;
10717 4 : quiesce_ctx->cb_arg = cb_arg;
10718 :
10719 4 : rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
10720 : } else {
10721 4 : quiesce_ctx = malloc(sizeof(*quiesce_ctx));
10722 4 : if (quiesce_ctx == NULL) {
10723 0 : return -ENOMEM;
10724 : }
10725 :
10726 4 : quiesce_ctx->cb_fn = cb_fn;
10727 4 : quiesce_ctx->cb_arg = cb_arg;
10728 :
10729 4 : rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
10730 4 : if (rc != 0) {
10731 0 : free(quiesce_ctx);
10732 : }
10733 : }
10734 :
10735 8 : return rc;
10736 : }
10737 :
10738 : int
10739 3 : spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10740 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10741 : {
10742 3 : return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
10743 : }
10744 :
10745 : int
10746 3 : spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10747 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10748 : {
10749 3 : return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
10750 : }
10751 :
10752 : int
10753 1 : spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10754 : uint64_t offset, uint64_t length,
10755 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10756 : {
10757 1 : return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
10758 : }
10759 :
10760 : int
10761 2 : spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10762 : uint64_t offset, uint64_t length,
10763 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10764 : {
10765 2 : return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
10766 : }
10767 :
10768 : int
10769 285 : spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
10770 : int array_size)
10771 : {
10772 285 : if (!bdev) {
10773 1 : return -EINVAL;
10774 : }
10775 :
10776 284 : if (bdev->fn_table->get_memory_domains) {
10777 3 : return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
10778 : }
10779 :
10780 281 : return 0;
10781 : }
10782 :
10783 : struct spdk_bdev_for_each_io_ctx {
10784 : void *ctx;
10785 : spdk_bdev_io_fn fn;
10786 : spdk_bdev_for_each_io_cb cb;
10787 : };
10788 :
10789 : static void
10790 0 : bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10791 : struct spdk_io_channel *io_ch, void *_ctx)
10792 : {
10793 0 : struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10794 0 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
10795 : struct spdk_bdev_io *bdev_io;
10796 0 : int rc = 0;
10797 :
10798 0 : TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
10799 0 : rc = ctx->fn(ctx->ctx, bdev_io);
10800 0 : if (rc != 0) {
10801 0 : break;
10802 : }
10803 : }
10804 :
10805 0 : spdk_bdev_for_each_channel_continue(i, rc);
10806 0 : }
10807 :
10808 : static void
10809 0 : bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
10810 : {
10811 0 : struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10812 :
10813 0 : ctx->cb(ctx->ctx, status);
10814 :
10815 0 : free(ctx);
10816 0 : }
10817 :
10818 : void
10819 0 : spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
10820 : spdk_bdev_for_each_io_cb cb)
10821 : {
10822 : struct spdk_bdev_for_each_io_ctx *ctx;
10823 :
10824 0 : assert(fn != NULL && cb != NULL);
10825 :
10826 0 : ctx = calloc(1, sizeof(*ctx));
10827 0 : if (ctx == NULL) {
10828 0 : SPDK_ERRLOG("Failed to allocate context.\n");
10829 0 : cb(_ctx, -ENOMEM);
10830 0 : return;
10831 : }
10832 :
10833 0 : ctx->ctx = _ctx;
10834 0 : ctx->fn = fn;
10835 0 : ctx->cb = cb;
10836 :
10837 0 : spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
10838 : bdev_for_each_io_done);
10839 : }
10840 :
10841 : void
10842 137 : spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
10843 : {
10844 137 : spdk_for_each_channel_continue(iter->i, status);
10845 137 : }
10846 :
/* Return the bdev that corresponds to the io_device of channel iterator `i`. */
static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *dev = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(dev);

	return bdev;
}
10854 :
10855 : static void
10856 137 : bdev_each_channel_msg(struct spdk_io_channel_iter *i)
10857 : {
10858 137 : struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10859 137 : struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10860 137 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
10861 :
10862 137 : iter->i = i;
10863 137 : iter->fn(iter, bdev, ch, iter->ctx);
10864 137 : }
10865 :
10866 : static void
10867 239 : bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
10868 : {
10869 239 : struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10870 239 : struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10871 :
10872 239 : iter->i = i;
10873 239 : iter->cpl(bdev, iter->ctx, status);
10874 :
10875 239 : free(iter);
10876 239 : }
10877 :
10878 : void
10879 239 : spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
10880 : void *ctx, spdk_bdev_for_each_channel_done cpl)
10881 : {
10882 : struct spdk_bdev_channel_iter *iter;
10883 :
10884 239 : assert(bdev != NULL && fn != NULL && ctx != NULL);
10885 :
10886 239 : iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
10887 239 : if (iter == NULL) {
10888 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
10889 0 : assert(false);
10890 : return;
10891 : }
10892 :
10893 239 : iter->fn = fn;
10894 239 : iter->cpl = cpl;
10895 239 : iter->ctx = ctx;
10896 :
10897 239 : spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
10898 : iter, bdev_each_channel_cpl);
10899 : }
10900 :
10901 : static void
10902 3 : bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10903 : {
10904 3 : struct spdk_bdev_io *parent_io = cb_arg;
10905 :
10906 3 : spdk_bdev_free_io(bdev_io);
10907 :
10908 : /* Check return status of write */
10909 3 : parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
10910 3 : parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
10911 3 : }
10912 :
10913 : static void
10914 3 : bdev_copy_do_write(void *_bdev_io)
10915 : {
10916 3 : struct spdk_bdev_io *bdev_io = _bdev_io;
10917 : int rc;
10918 :
10919 : /* Write blocks */
10920 6 : rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
10921 3 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
10922 3 : bdev_io->u.bdev.iovs[0].iov_base,
10923 : bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
10924 : bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);
10925 :
10926 3 : if (rc == -ENOMEM) {
10927 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
10928 3 : } else if (rc != 0) {
10929 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10930 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10931 : }
10932 3 : }
10933 :
10934 : static void
10935 3 : bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10936 : {
10937 3 : struct spdk_bdev_io *parent_io = cb_arg;
10938 :
10939 3 : spdk_bdev_free_io(bdev_io);
10940 :
10941 : /* Check return status of read */
10942 3 : if (!success) {
10943 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10944 0 : parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
10945 0 : return;
10946 : }
10947 :
10948 : /* Do write */
10949 3 : bdev_copy_do_write(parent_io);
10950 : }
10951 :
10952 : static void
10953 3 : bdev_copy_do_read(void *_bdev_io)
10954 : {
10955 3 : struct spdk_bdev_io *bdev_io = _bdev_io;
10956 : int rc;
10957 :
10958 : /* Read blocks */
10959 6 : rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
10960 3 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
10961 3 : bdev_io->u.bdev.iovs[0].iov_base,
10962 : bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
10963 : bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);
10964 :
10965 3 : if (rc == -ENOMEM) {
10966 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
10967 3 : } else if (rc != 0) {
10968 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10969 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10970 : }
10971 3 : }
10972 :
10973 : static void
10974 3 : bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
10975 : {
10976 3 : if (!success) {
10977 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10978 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10979 0 : return;
10980 : }
10981 :
10982 3 : bdev_copy_do_read(bdev_io);
10983 : }
10984 :
10985 : int
10986 27 : spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
10987 : uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
10988 : spdk_bdev_io_completion_cb cb, void *cb_arg)
10989 : {
10990 27 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10991 : struct spdk_bdev_io *bdev_io;
10992 27 : struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
10993 :
10994 27 : if (!desc->write) {
10995 0 : return -EBADF;
10996 : }
10997 :
10998 27 : if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
10999 27 : !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
11000 0 : SPDK_DEBUGLOG(bdev,
11001 : "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
11002 : dst_offset_blocks, src_offset_blocks, num_blocks);
11003 0 : return -EINVAL;
11004 : }
11005 :
11006 27 : bdev_io = bdev_channel_get_io(channel);
11007 27 : if (!bdev_io) {
11008 0 : return -ENOMEM;
11009 : }
11010 :
11011 27 : bdev_io->internal.ch = channel;
11012 27 : bdev_io->internal.desc = desc;
11013 27 : bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
11014 :
11015 27 : bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
11016 27 : bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
11017 27 : bdev_io->u.bdev.num_blocks = num_blocks;
11018 27 : bdev_io->u.bdev.memory_domain = NULL;
11019 27 : bdev_io->u.bdev.memory_domain_ctx = NULL;
11020 27 : bdev_io->u.bdev.iovs = NULL;
11021 27 : bdev_io->u.bdev.iovcnt = 0;
11022 27 : bdev_io->u.bdev.md_buf = NULL;
11023 27 : bdev_io->u.bdev.accel_sequence = NULL;
11024 27 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
11025 :
11026 27 : if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) {
11027 0 : spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
11028 0 : return 0;
11029 : }
11030 :
11031 :
11032 : /* If the copy size is large and should be split, use the generic split logic
11033 : * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
11034 : *
11035 : * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
11036 : * emulate it using regular read and write requests otherwise.
11037 : */
11038 27 : if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
11039 : bdev_io->internal.f.split) {
11040 24 : bdev_io_submit(bdev_io);
11041 24 : return 0;
11042 : }
11043 :
11044 3 : spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));
11045 :
11046 3 : return 0;
11047 : }
11048 :
/* Register the `bdev` log component, i.e. the flag named by the
 * SPDK_DEBUGLOG(bdev, ...) calls in this file.
 */
SPDK_LOG_REGISTER_COMPONENT(bdev)
11050 :
11051 : static void
11052 0 : bdev_trace(void)
11053 : {
11054 0 : struct spdk_trace_tpoint_opts opts[] = {
11055 : {
11056 : "BDEV_IO_START", TRACE_BDEV_IO_START,
11057 : OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1,
11058 : {
11059 : { "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
11060 : { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
11061 : { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
11062 : { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
11063 : }
11064 : },
11065 : {
11066 : "BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
11067 : OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0,
11068 : {
11069 : { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
11070 : { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
11071 : }
11072 : },
11073 : {
11074 : "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
11075 : OWNER_TYPE_BDEV, OBJECT_NONE, 0,
11076 : {
11077 : { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
11078 : }
11079 : },
11080 : {
11081 : "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
11082 : OWNER_TYPE_BDEV, OBJECT_NONE, 0,
11083 : {
11084 : { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
11085 : }
11086 : },
11087 : };
11088 :
11089 :
11090 0 : spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b');
11091 0 : spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
11092 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
11093 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
11094 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
11095 0 : spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0);
11096 0 : spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0);
11097 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0);
11098 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0);
11099 0 : }
11100 3 : SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
|