Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2016 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : */
6 :
7 : #include "spdk/stdinc.h"
8 :
9 : #include "spdk/bdev.h"
10 :
11 : #include "spdk/accel.h"
12 : #include "spdk/config.h"
13 : #include "spdk/env.h"
14 : #include "spdk/thread.h"
15 : #include "spdk/likely.h"
16 : #include "spdk/queue.h"
17 : #include "spdk/nvme_spec.h"
18 : #include "spdk/scsi_spec.h"
19 : #include "spdk/notify.h"
20 : #include "spdk/util.h"
21 : #include "spdk/trace.h"
22 : #include "spdk/dma.h"
23 :
24 : #include "spdk/bdev_module.h"
25 : #include "spdk/log.h"
26 : #include "spdk/string.h"
27 :
28 : #include "bdev_internal.h"
29 : #include "spdk_internal/trace_defs.h"
30 : #include "spdk_internal/assert.h"
31 :
32 : #ifdef SPDK_CONFIG_VTUNE
33 : #include "ittnotify.h"
34 : #include "ittnotify_types.h"
35 : int __itt_init_ittlib(const char *, __itt_group_id);
36 : #endif
37 :
38 : #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
39 : #define SPDK_BDEV_IO_CACHE_SIZE 256
40 : #define SPDK_BDEV_AUTO_EXAMINE true
41 : #define BUF_SMALL_CACHE_SIZE 128
42 : #define BUF_LARGE_CACHE_SIZE 16
43 : #define NOMEM_THRESHOLD_COUNT 8
44 :
45 : #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
46 : #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
47 : #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
48 : #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
49 : #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
50 : #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024))
51 : #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
52 : #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000
53 :
54 : /* The maximum number of child requests issued at a time when splitting
55 : * a UNMAP or WRITE ZEROES command.
56 : */
57 : #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
58 : #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
59 :
60 : /* The maximum number of child requests issued at a time when splitting
61 : * a COPY command.
62 : */
63 : #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
64 :
65 : #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
66 : log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
67 : #ifdef DEBUG
68 : #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
69 : log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
70 : #else
71 : #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
72 : #endif
73 :
74 : static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
75 : const char *detail, struct spdk_bdev *bdev);
76 :
77 : static const char *qos_rpc_type[] = {"rw_ios_per_sec",
78 : "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
79 : };
80 :
81 : TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
82 :
83 : RB_HEAD(bdev_name_tree, spdk_bdev_name);
84 :
85 : static int
86 576 : bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
87 : {
88 576 : return strcmp(name1->name, name2->name);
89 : }
90 :
91 2145 : RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
92 :
93 : struct spdk_bdev_mgr {
94 : struct spdk_mempool *bdev_io_pool;
95 :
96 : void *zero_buffer;
97 :
98 : TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
99 :
100 : struct spdk_bdev_list bdevs;
101 : struct bdev_name_tree bdev_names;
102 :
103 : bool init_complete;
104 : bool module_init_complete;
105 :
106 : struct spdk_spinlock spinlock;
107 :
108 : TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;
109 :
110 : #ifdef SPDK_CONFIG_VTUNE
111 : __itt_domain *domain;
112 : #endif
113 : };
114 :
115 : static struct spdk_bdev_mgr g_bdev_mgr = {
116 : .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
117 : .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
118 : .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
119 : .init_complete = false,
120 : .module_init_complete = false,
121 : .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
122 : };
123 :
124 : static void
125 : __attribute__((constructor))
126 3 : _bdev_init(void)
127 : {
128 3 : spdk_spin_init(&g_bdev_mgr.spinlock);
129 3 : }
130 :
131 : typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);
132 :
133 : typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
134 :
135 : struct lba_range {
136 : struct spdk_bdev *bdev;
137 : uint64_t offset;
138 : uint64_t length;
139 : bool quiesce;
140 : void *locked_ctx;
141 : struct spdk_thread *owner_thread;
142 : struct spdk_bdev_channel *owner_ch;
143 : TAILQ_ENTRY(lba_range) tailq;
144 : TAILQ_ENTRY(lba_range) tailq_module;
145 : };
146 :
147 : static struct spdk_bdev_opts g_bdev_opts = {
148 : .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
149 : .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
150 : .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
151 : .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
152 : .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
153 : };
154 :
155 : static spdk_bdev_init_cb g_init_cb_fn = NULL;
156 : static void *g_init_cb_arg = NULL;
157 :
158 : static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
159 : static void *g_fini_cb_arg = NULL;
160 : static struct spdk_thread *g_fini_thread = NULL;
161 :
162 : struct spdk_bdev_qos_limit {
163 : /** IOs or bytes allowed per second (i.e., 1s). */
164 : uint64_t limit;
165 :
166 : /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
167 : * For remaining bytes, allowed to run negative if an I/O is submitted when
168 : * some bytes are remaining, but the I/O is bigger than that amount. The
169 : * excess will be deducted from the next timeslice.
170 : */
171 : int64_t remaining_this_timeslice;
172 :
173 : /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
174 : uint32_t min_per_timeslice;
175 :
176 : /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
177 : uint32_t max_per_timeslice;
178 :
179 : /** Function to check whether to queue the IO.
180 : * If the IO is allowed to pass, the quota will be reduced correspondingly.
181 : */
182 : bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
183 :
184 : /** Function to rewind the quota once the IO was allowed to be sent by this
185 : * limit but queued due to one of the subsequent limits.
186 : */
187 : void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
188 : };
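/*
 * A concrete example of how queue_io and rewind_quota pair up (illustrative,
 * derived from the field descriptions above): if the IOPS limit admits an I/O
 * and deducts it from remaining_this_timeslice, but a later limit (e.g. the
 * bytes-per-second limit) decides to queue the I/O, rewind_quota is called on
 * the IOPS limit so the deduction is undone and can be spent on another I/O.
 */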
189 :
190 : struct spdk_bdev_qos {
191 : /** Rate limits, one entry per rate limit type. */
192 : struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
193 :
194 : /** The channel that all I/O are funneled through. */
195 : struct spdk_bdev_channel *ch;
196 :
197 : /** The thread on which the poller is running. */
198 : struct spdk_thread *thread;
199 :
200 : /** Size of a timeslice in tsc ticks. */
201 : uint64_t timeslice_size;
202 :
203 : /** Timestamp of start of last timeslice. */
204 : uint64_t last_timeslice;
205 :
206 : /** Poller that processes queued I/O commands each time slice. */
207 : struct spdk_poller *poller;
208 : };
209 :
210 : struct spdk_bdev_mgmt_channel {
211 : /*
212 : * Each thread keeps a cache of bdev_io - this allows
213 : * bdev threads which are *not* DPDK threads to still
214 : * benefit from a per-thread bdev_io cache. Without
215 : * this, non-DPDK threads fetching from the mempool
216 : * incur a cmpxchg on get and put.
217 : */
218 : bdev_io_stailq_t per_thread_cache;
219 : uint32_t per_thread_cache_count;
220 : uint32_t bdev_io_cache_size;
221 :
222 : struct spdk_iobuf_channel iobuf;
223 :
224 : TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
225 : TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
226 : };
227 :
228 : /*
229 : * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
230 : * queue their IO awaiting retry here. This makes it possible to retry sending
231 : * IO to one bdev after IO from another bdev completes.
232 : */
233 : struct spdk_bdev_shared_resource {
234 : /* The bdev management channel */
235 : struct spdk_bdev_mgmt_channel *mgmt_ch;
236 :
237 : /*
238 : * Count of I/O submitted to bdev module and waiting for completion.
239 : * Incremented before submit_request() is called on an spdk_bdev_io.
240 : */
241 : uint64_t io_outstanding;
242 :
243 : /*
244 : * Queue of IO awaiting retry because of a previous NOMEM status returned
245 : * on this channel.
246 : */
247 : bdev_io_tailq_t nomem_io;
248 :
249 : /*
250 : * Threshold which io_outstanding must drop to before retrying nomem_io.
251 : */
252 : uint64_t nomem_threshold;
253 :
254 : /*
255 : * Indicate whether aborting nomem I/Os is in progress.
256 : * If true, we should not touch the nomem_io list on I/O completions.
257 : */
258 : bool abort_in_progress;
259 :
260 : /* I/O channel allocated by a bdev module */
261 : struct spdk_io_channel *shared_ch;
262 :
263 : struct spdk_poller *nomem_poller;
264 :
265 : /* Refcount of bdev channels using this resource */
266 : uint32_t ref;
267 :
268 : TAILQ_ENTRY(spdk_bdev_shared_resource) link;
269 : };
270 :
271 : #define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
272 : #define BDEV_CH_QOS_ENABLED (1 << 1)
273 :
274 : struct spdk_bdev_channel {
275 : struct spdk_bdev *bdev;
276 :
277 : /* The channel for the underlying device */
278 : struct spdk_io_channel *channel;
279 :
280 : /* Accel channel */
281 : struct spdk_io_channel *accel_channel;
282 :
283 : /* Per io_device per thread data */
284 : struct spdk_bdev_shared_resource *shared_resource;
285 :
286 : struct spdk_bdev_io_stat *stat;
287 :
288 : /*
289 : * Count of I/O submitted to the underlying dev module through this channel
290 : * and waiting for completion.
291 : */
292 : uint64_t io_outstanding;
293 :
294 : /*
295 : * List of all submitted I/Os including I/O that are generated via splitting.
296 : */
297 : bdev_io_tailq_t io_submitted;
298 :
299 : /*
300 : * List of spdk_bdev_io that are currently queued because they write to a locked
301 : * LBA range.
302 : */
303 : bdev_io_tailq_t io_locked;
304 :
305 : /* List of I/Os with accel sequence being currently executed */
306 : bdev_io_tailq_t io_accel_exec;
307 :
308 : /* List of I/Os doing memory domain pull/push */
309 : bdev_io_tailq_t io_memory_domain;
310 :
311 : uint32_t flags;
312 :
313 : /* Counts number of bdev_io in the io_submitted TAILQ */
314 : uint16_t queue_depth;
315 :
316 : uint16_t trace_id;
317 :
318 : struct spdk_histogram_data *histogram;
319 :
320 : #ifdef SPDK_CONFIG_VTUNE
321 : uint64_t start_tsc;
322 : uint64_t interval_tsc;
323 : __itt_string_handle *handle;
324 : struct spdk_bdev_io_stat *prev_stat;
325 : #endif
326 :
327 : lba_range_tailq_t locked_ranges;
328 :
329 : /** List of I/Os queued by QoS. */
330 : bdev_io_tailq_t qos_queued_io;
331 : };
332 :
333 : struct media_event_entry {
334 : struct spdk_bdev_media_event event;
335 : TAILQ_ENTRY(media_event_entry) tailq;
336 : };
337 :
338 : #define MEDIA_EVENT_POOL_SIZE 64
339 :
340 : struct spdk_bdev_desc {
341 : struct spdk_bdev *bdev;
342 : bool write;
343 : bool memory_domains_supported;
344 : bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
345 : struct spdk_bdev_open_opts opts;
346 : struct spdk_thread *thread;
347 : struct {
348 : spdk_bdev_event_cb_t event_fn;
349 : void *ctx;
350 : } callback;
351 : bool closed;
352 : struct spdk_spinlock spinlock;
353 : uint32_t refs;
354 : TAILQ_HEAD(, media_event_entry) pending_media_events;
355 : TAILQ_HEAD(, media_event_entry) free_media_events;
356 : struct media_event_entry *media_events_buffer;
357 : TAILQ_ENTRY(spdk_bdev_desc) link;
358 :
359 : uint64_t timeout_in_sec;
360 : spdk_bdev_io_timeout_cb cb_fn;
361 : void *cb_arg;
362 : struct spdk_poller *io_timeout_poller;
363 : struct spdk_bdev_module_claim *claim;
364 : };
365 :
366 : struct spdk_bdev_iostat_ctx {
367 : struct spdk_bdev_io_stat *stat;
368 : enum spdk_bdev_reset_stat_mode reset_mode;
369 : spdk_bdev_get_device_stat_cb cb;
370 : void *cb_arg;
371 : };
372 :
373 : struct set_qos_limit_ctx {
374 : void (*cb_fn)(void *cb_arg, int status);
375 : void *cb_arg;
376 : struct spdk_bdev *bdev;
377 : };
378 :
379 : struct spdk_bdev_channel_iter {
380 : spdk_bdev_for_each_channel_msg fn;
381 : spdk_bdev_for_each_channel_done cpl;
382 : struct spdk_io_channel_iter *i;
383 : void *ctx;
384 : };
385 :
386 : struct spdk_bdev_io_error_stat {
387 : uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
388 : };
389 :
390 : enum bdev_io_retry_state {
391 : BDEV_IO_RETRY_STATE_INVALID,
392 : BDEV_IO_RETRY_STATE_PULL,
393 : BDEV_IO_RETRY_STATE_PULL_MD,
394 : BDEV_IO_RETRY_STATE_SUBMIT,
395 : BDEV_IO_RETRY_STATE_PUSH,
396 : BDEV_IO_RETRY_STATE_PUSH_MD,
397 : BDEV_IO_RETRY_STATE_GET_ACCEL_BUF,
398 : };
399 :
400 : #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1)
401 : #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1))
402 : #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
403 : #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
404 :
405 : static inline void bdev_io_complete(void *ctx);
406 : static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
407 : static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
408 : static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
409 : static void _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io);
410 :
411 : static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
412 : static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
413 :
414 : static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
415 : struct spdk_io_channel *ch, void *_ctx);
416 : static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
417 :
418 : static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
419 : struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
420 : uint64_t num_blocks,
421 : struct spdk_memory_domain *domain, void *domain_ctx,
422 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
423 : spdk_bdev_io_completion_cb cb, void *cb_arg);
424 : static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
425 : struct iovec *iov, int iovcnt, void *md_buf,
426 : uint64_t offset_blocks, uint64_t num_blocks,
427 : struct spdk_memory_domain *domain, void *domain_ctx,
428 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
429 : uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
430 : spdk_bdev_io_completion_cb cb, void *cb_arg);
431 :
432 : static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
433 : uint64_t offset, uint64_t length,
434 : lock_range_cb cb_fn, void *cb_arg);
435 :
436 : static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
437 : uint64_t offset, uint64_t length,
438 : lock_range_cb cb_fn, void *cb_arg);
439 :
440 : static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
441 : static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
442 :
443 : static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
444 : static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
445 : static void claim_reset(struct spdk_bdev *bdev);
446 :
447 : static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);
448 :
449 : static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);
450 :
451 : #define bdev_get_ext_io_opt(opts, field, defval) \
452 : ((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))
453 :
454 : static inline void
455 688 : bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
456 : {
457 688 : TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
458 688 : bdev_io->internal.ch->queue_depth++;
459 688 : }
460 :
461 : static inline void
462 688 : bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
463 : {
464 688 : TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
465 688 : bdev_io->internal.ch->queue_depth--;
466 688 : }
467 :
468 : void
469 16 : spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
470 : {
471 16 : if (!opts) {
472 0 : SPDK_ERRLOG("opts should not be NULL\n");
473 0 : return;
474 : }
475 :
476 16 : if (!opts_size) {
477 0 : SPDK_ERRLOG("opts_size should not be zero value\n");
478 0 : return;
479 : }
480 :
481 16 : opts->opts_size = opts_size;
482 :
483 : #define SET_FIELD(field) \
484 : if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
485 : opts->field = g_bdev_opts.field; \
486 : } \
487 :
488 16 : SET_FIELD(bdev_io_pool_size);
489 16 : SET_FIELD(bdev_io_cache_size);
490 16 : SET_FIELD(bdev_auto_examine);
491 16 : SET_FIELD(iobuf_small_cache_size);
492 16 : SET_FIELD(iobuf_large_cache_size);
493 :
494 : /* Do not remove this statement, you should always update this statement when you add a new field,
495 : * and do not forget to add the SET_FIELD statement for your added field. */
496 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
497 :
498 : #undef SET_FIELD
499 16 : }
500 :
501 : int
502 17 : spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
503 : {
504 : uint32_t min_pool_size;
505 :
506 17 : if (!opts) {
507 0 : SPDK_ERRLOG("opts cannot be NULL\n");
508 0 : return -1;
509 : }
510 :
511 17 : if (!opts->opts_size) {
512 1 : SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
513 1 : return -1;
514 : }
515 :
516 : /*
517 : * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
518 : * initialization. A second mgmt_ch will be created on the same thread when the application starts
519 : * but before the deferred put_io_channel event is executed for the first mgmt_ch.
520 : */
521 16 : min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
522 16 : if (opts->bdev_io_pool_size < min_pool_size) {
523 0 : SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
524 : " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
525 : spdk_thread_get_count());
526 0 : SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
527 0 : return -1;
528 : }
529 :
530 : #define SET_FIELD(field) \
531 : if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
532 : g_bdev_opts.field = opts->field; \
533 : } \
534 :
535 16 : SET_FIELD(bdev_io_pool_size);
536 16 : SET_FIELD(bdev_io_cache_size);
537 16 : SET_FIELD(bdev_auto_examine);
538 16 : SET_FIELD(iobuf_small_cache_size);
539 16 : SET_FIELD(iobuf_large_cache_size);
540 :
541 16 : g_bdev_opts.opts_size = opts->opts_size;
542 :
543 : #undef SET_FIELD
544 :
545 16 : return 0;
546 17 : }
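/*
 * Illustrative usage sketch (hypothetical helper, not part of this file): a
 * caller fetches the current defaults, overrides a few fields and applies the
 * result before subsystem initialization. Passing sizeof(opts) lets the
 * SET_FIELD copies above exchange only the fields both sides know about, so
 * the structure can grow without breaking older callers. Values are examples.
 */
static int
example_configure_bdev_opts(void)
{
	struct spdk_bdev_opts opts = {0};

	spdk_bdev_get_opts(&opts, sizeof(opts));
	opts.bdev_auto_examine = false;	/* rely on explicit spdk_bdev_examine() calls */
	opts.bdev_io_cache_size = 512;	/* per-thread bdev_io cache, example value */

	return spdk_bdev_set_opts(&opts);
}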
547 :
548 : static struct spdk_bdev *
549 156 : bdev_get_by_name(const char *bdev_name)
550 : {
551 : struct spdk_bdev_name find;
552 : struct spdk_bdev_name *res;
553 :
554 156 : find.name = (char *)bdev_name;
555 156 : res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
556 156 : if (res != NULL) {
557 149 : return res->bdev;
558 : }
559 :
560 7 : return NULL;
561 156 : }
562 :
563 : struct spdk_bdev *
564 19 : spdk_bdev_get_by_name(const char *bdev_name)
565 : {
566 : struct spdk_bdev *bdev;
567 :
568 19 : spdk_spin_lock(&g_bdev_mgr.spinlock);
569 19 : bdev = bdev_get_by_name(bdev_name);
570 19 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
571 :
572 19 : return bdev;
573 : }
574 :
575 : struct bdev_io_status_string {
576 : enum spdk_bdev_io_status status;
577 : const char *str;
578 : };
579 :
580 : static const struct bdev_io_status_string bdev_io_status_strings[] = {
581 : { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
582 : { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
583 : { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
584 : { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
585 : { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
586 : { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
587 : { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
588 : { SPDK_BDEV_IO_STATUS_FAILED, "failed" },
589 : { SPDK_BDEV_IO_STATUS_PENDING, "pending" },
590 : { SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
591 : };
592 :
593 : static const char *
594 0 : bdev_io_status_get_string(enum spdk_bdev_io_status status)
595 : {
596 : uint32_t i;
597 :
598 0 : for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
599 0 : if (bdev_io_status_strings[i].status == status) {
600 0 : return bdev_io_status_strings[i].str;
601 : }
602 0 : }
603 :
604 0 : return "reserved";
605 0 : }
606 :
607 : struct spdk_bdev_wait_for_examine_ctx {
608 : struct spdk_poller *poller;
609 : spdk_bdev_wait_for_examine_cb cb_fn;
610 : void *cb_arg;
611 : };
612 :
613 : static bool bdev_module_all_actions_completed(void);
614 :
615 : static int
616 205 : bdev_wait_for_examine_cb(void *arg)
617 : {
618 205 : struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
619 :
620 205 : if (!bdev_module_all_actions_completed()) {
621 0 : return SPDK_POLLER_IDLE;
622 : }
623 :
624 205 : spdk_poller_unregister(&ctx->poller);
625 205 : ctx->cb_fn(ctx->cb_arg);
626 205 : free(ctx);
627 :
628 205 : return SPDK_POLLER_BUSY;
629 205 : }
630 :
631 : int
632 205 : spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
633 : {
634 : struct spdk_bdev_wait_for_examine_ctx *ctx;
635 :
636 205 : ctx = calloc(1, sizeof(*ctx));
637 205 : if (ctx == NULL) {
638 0 : return -ENOMEM;
639 : }
640 205 : ctx->cb_fn = cb_fn;
641 205 : ctx->cb_arg = cb_arg;
642 205 : ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
643 :
644 205 : return 0;
645 205 : }
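/*
 * Illustrative usage sketch (hypothetical callback, not part of this file):
 * startup code typically registers a callback that runs once every module has
 * finished its outstanding examine_config()/examine_disk() work.
 */
static void
example_examine_done_cb(void *cb_arg)
{
	SPDK_NOTICELOG("all bdev examine actions have completed\n");
}

/* e.g. during initialization: spdk_bdev_wait_for_examine(example_examine_done_cb, NULL); */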
646 :
647 : struct spdk_bdev_examine_item {
648 : char *name;
649 : TAILQ_ENTRY(spdk_bdev_examine_item) link;
650 : };
651 :
652 : TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
653 :
654 : struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
655 : g_bdev_examine_allowlist);
656 :
657 : static inline bool
658 24 : bdev_examine_allowlist_check(const char *name)
659 : {
660 : struct spdk_bdev_examine_item *item;
661 24 : TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
662 3 : if (strcmp(name, item->name) == 0) {
663 3 : return true;
664 : }
665 0 : }
666 21 : return false;
667 24 : }
668 :
669 : static inline void
670 260 : bdev_examine_allowlist_remove(const char *name)
671 : {
672 : struct spdk_bdev_examine_item *item;
673 260 : TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
674 3 : if (strcmp(name, item->name) == 0) {
675 3 : TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
676 3 : free(item->name);
677 3 : free(item);
678 3 : break;
679 : }
680 0 : }
681 260 : }
682 :
683 : static inline void
684 69 : bdev_examine_allowlist_free(void)
685 : {
686 : struct spdk_bdev_examine_item *item;
687 69 : while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
688 0 : item = TAILQ_FIRST(&g_bdev_examine_allowlist);
689 0 : TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
690 0 : free(item->name);
691 0 : free(item);
692 : }
693 69 : }
694 :
695 : static inline bool
696 12 : bdev_in_examine_allowlist(struct spdk_bdev *bdev)
697 : {
698 : struct spdk_bdev_alias *tmp;
699 12 : if (bdev_examine_allowlist_check(bdev->name)) {
700 3 : return true;
701 : }
702 18 : TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
703 9 : if (bdev_examine_allowlist_check(tmp->alias.name)) {
704 0 : return true;
705 : }
706 9 : }
707 9 : return false;
708 12 : }
709 :
710 : static inline bool
711 134 : bdev_ok_to_examine(struct spdk_bdev *bdev)
712 : {
713 : /* Some bdevs may not support the READ command.
714 : * Do not try to examine them.
715 : */
716 134 : if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
717 0 : return false;
718 : }
719 :
720 134 : if (g_bdev_opts.bdev_auto_examine) {
721 122 : return true;
722 : } else {
723 12 : return bdev_in_examine_allowlist(bdev);
724 : }
725 134 : }
726 :
727 : static void
728 134 : bdev_examine(struct spdk_bdev *bdev)
729 : {
730 : struct spdk_bdev_module *module;
731 : struct spdk_bdev_module_claim *claim, *tmpclaim;
732 : uint32_t action;
733 :
734 134 : if (!bdev_ok_to_examine(bdev)) {
735 9 : return;
736 : }
737 :
738 508 : TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
739 383 : if (module->examine_config) {
740 258 : spdk_spin_lock(&module->internal.spinlock);
741 258 : action = module->internal.action_in_progress;
742 258 : module->internal.action_in_progress++;
743 258 : spdk_spin_unlock(&module->internal.spinlock);
744 258 : module->examine_config(bdev);
745 258 : if (action != module->internal.action_in_progress) {
746 0 : SPDK_ERRLOG("examine_config for module %s did not call "
747 : "spdk_bdev_module_examine_done()\n", module->name);
748 0 : }
749 258 : }
750 383 : }
751 :
752 125 : spdk_spin_lock(&bdev->internal.spinlock);
753 :
754 125 : switch (bdev->internal.claim_type) {
755 : case SPDK_BDEV_CLAIM_NONE:
756 : /* Examine by all bdev modules */
757 468 : TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
758 351 : if (module->examine_disk) {
759 225 : spdk_spin_lock(&module->internal.spinlock);
760 225 : module->internal.action_in_progress++;
761 225 : spdk_spin_unlock(&module->internal.spinlock);
762 225 : spdk_spin_unlock(&bdev->internal.spinlock);
763 225 : module->examine_disk(bdev);
764 225 : spdk_spin_lock(&bdev->internal.spinlock);
765 225 : }
766 351 : }
767 117 : break;
768 : case SPDK_BDEV_CLAIM_EXCL_WRITE:
769 : /* Examine by the one bdev module with a v1 claim */
770 1 : module = bdev->internal.claim.v1.module;
771 1 : if (module->examine_disk) {
772 1 : spdk_spin_lock(&module->internal.spinlock);
773 1 : module->internal.action_in_progress++;
774 1 : spdk_spin_unlock(&module->internal.spinlock);
775 1 : spdk_spin_unlock(&bdev->internal.spinlock);
776 1 : module->examine_disk(bdev);
777 1 : return;
778 : }
779 0 : break;
780 : default:
781 : /* Examine by all bdev modules with a v2 claim */
782 7 : assert(claim_type_is_v2(bdev->internal.claim_type));
783 : /*
784 : * Removal of tailq nodes while iterating can cause the iteration to jump out of the
785 : * list, perhaps accessing freed memory. Without protection, this could happen
786 : * while the lock is dropped during the examine callback.
787 : */
788 7 : bdev->internal.examine_in_progress++;
789 :
790 16 : TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
791 9 : module = claim->module;
792 :
793 9 : if (module == NULL) {
794 : /* This is a vestigial claim, held by examine_count */
795 0 : continue;
796 : }
797 :
798 9 : if (module->examine_disk == NULL) {
799 0 : continue;
800 : }
801 :
802 9 : spdk_spin_lock(&module->internal.spinlock);
803 9 : module->internal.action_in_progress++;
804 9 : spdk_spin_unlock(&module->internal.spinlock);
805 :
806 : /* Call examine_disk without holding internal.spinlock. */
807 9 : spdk_spin_unlock(&bdev->internal.spinlock);
808 9 : module->examine_disk(bdev);
809 9 : spdk_spin_lock(&bdev->internal.spinlock);
810 9 : }
811 :
812 7 : assert(bdev->internal.examine_in_progress > 0);
813 7 : bdev->internal.examine_in_progress--;
814 7 : if (bdev->internal.examine_in_progress == 0) {
815 : /* Remove any claims that were released during examine_disk */
816 16 : TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
817 9 : if (claim->desc != NULL) {
818 9 : continue;
819 : }
820 :
821 0 : TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
822 0 : free(claim);
823 0 : }
824 7 : if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
825 0 : claim_reset(bdev);
826 0 : }
827 7 : }
828 7 : }
829 :
830 124 : spdk_spin_unlock(&bdev->internal.spinlock);
831 134 : }
832 :
833 : int
834 4 : spdk_bdev_examine(const char *name)
835 : {
836 : struct spdk_bdev *bdev;
837 : struct spdk_bdev_examine_item *item;
838 4 : struct spdk_thread *thread = spdk_get_thread();
839 :
840 4 : if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
841 1 : SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
842 : thread ? spdk_thread_get_name(thread) : "null");
843 1 : return -EINVAL;
844 : }
845 :
846 3 : if (g_bdev_opts.bdev_auto_examine) {
847 0 : SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
848 0 : return -EINVAL;
849 : }
850 :
851 3 : if (bdev_examine_allowlist_check(name)) {
852 0 : SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
853 0 : return -EEXIST;
854 : }
855 :
856 3 : item = calloc(1, sizeof(*item));
857 3 : if (!item) {
858 0 : return -ENOMEM;
859 : }
860 3 : item->name = strdup(name);
861 3 : if (!item->name) {
862 0 : free(item);
863 0 : return -ENOMEM;
864 : }
865 3 : TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
866 :
867 3 : bdev = spdk_bdev_get_by_name(name);
868 3 : if (bdev) {
869 3 : bdev_examine(bdev);
870 3 : }
871 3 : return 0;
872 4 : }
873 :
874 : static inline void
875 0 : bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
876 : {
877 : struct spdk_bdev_examine_item *item;
878 0 : TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
879 0 : spdk_json_write_object_begin(w);
880 0 : spdk_json_write_named_string(w, "method", "bdev_examine");
881 0 : spdk_json_write_named_object_begin(w, "params");
882 0 : spdk_json_write_named_string(w, "name", item->name);
883 0 : spdk_json_write_object_end(w);
884 0 : spdk_json_write_object_end(w);
885 0 : }
886 0 : }
887 :
888 : struct spdk_bdev *
889 1 : spdk_bdev_first(void)
890 : {
891 : struct spdk_bdev *bdev;
892 :
893 1 : bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
894 1 : if (bdev) {
895 1 : SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
896 1 : }
897 :
898 1 : return bdev;
899 : }
900 :
901 : struct spdk_bdev *
902 8 : spdk_bdev_next(struct spdk_bdev *prev)
903 : {
904 : struct spdk_bdev *bdev;
905 :
906 8 : bdev = TAILQ_NEXT(prev, internal.link);
907 8 : if (bdev) {
908 7 : SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
909 7 : }
910 :
911 8 : return bdev;
912 : }
913 :
914 : static struct spdk_bdev *
915 6 : _bdev_next_leaf(struct spdk_bdev *bdev)
916 : {
917 9 : while (bdev != NULL) {
918 8 : if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
919 5 : return bdev;
920 : } else {
921 3 : bdev = TAILQ_NEXT(bdev, internal.link);
922 : }
923 : }
924 :
925 1 : return bdev;
926 6 : }
927 :
928 : struct spdk_bdev *
929 1 : spdk_bdev_first_leaf(void)
930 : {
931 : struct spdk_bdev *bdev;
932 :
933 1 : bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
934 :
935 1 : if (bdev) {
936 1 : SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
937 1 : }
938 :
939 1 : return bdev;
940 : }
941 :
942 : struct spdk_bdev *
943 5 : spdk_bdev_next_leaf(struct spdk_bdev *prev)
944 : {
945 : struct spdk_bdev *bdev;
946 :
947 5 : bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
948 :
949 5 : if (bdev) {
950 4 : SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
951 4 : }
952 :
953 5 : return bdev;
954 : }
955 :
956 : static inline bool
957 820 : bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
958 : {
959 820 : return bdev_io->internal.f.has_memory_domain;
960 : }
961 :
962 : static inline bool
963 1568 : bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
964 : {
965 1568 : return bdev_io->internal.f.has_accel_sequence;
966 : }
967 :
968 : static inline uint32_t
969 389 : bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
970 : {
971 389 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
972 :
973 389 : if (spdk_unlikely(desc->opts.hide_metadata)) {
974 0 : return bdev->blocklen - bdev->md_len;
975 : } else {
976 389 : return bdev->blocklen;
977 : }
978 389 : }
979 :
980 : static inline uint32_t
981 110 : bdev_io_get_block_size(struct spdk_bdev_io *bdev_io)
982 : {
983 110 : struct spdk_bdev *bdev = bdev_io->bdev;
984 :
985 110 : if (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT) {
986 0 : if (bdev->md_len == spdk_dif_pi_format_get_size(bdev->dif_pi_format)) {
987 0 : return bdev->blocklen - bdev->md_len;
988 : } else {
989 0 : return bdev->blocklen;
990 : }
991 : }
992 :
993 110 : return bdev_desc_get_block_size(bdev_io->internal.desc);
994 110 : }
995 :
996 : static inline void
997 23 : bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
998 : struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
999 : {
1000 : /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
1001 : * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
1002 : * channels we will instead wait for half to complete.
1003 : */
1004 23 : shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
1005 : (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
1006 :
1007 23 : assert(state != BDEV_IO_RETRY_STATE_INVALID);
1008 23 : bdev_io->internal.retry_state = state;
1009 23 : TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
1010 23 : }
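/*
 * Worked example of the threshold above (illustrative numbers): with
 * io_outstanding == 100 and NOMEM_THRESHOLD_COUNT == 8 the threshold becomes
 * max(50, 92) == 92, i.e. retries start once 8 I/O have completed. With a low
 * queue depth such as io_outstanding == 4 it becomes max(2, -4) == 2, i.e.
 * retries start once half of the outstanding I/O have completed.
 */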
1011 :
1012 : static inline void
1013 58 : bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
1014 : struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1015 : {
1016 : /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
1017 : * the queue isn't empty, so we don't need to update the nomem_threshold here */
1018 58 : assert(!TAILQ_EMPTY(&shared_resource->nomem_io));
1019 :
1020 58 : assert(state != BDEV_IO_RETRY_STATE_INVALID);
1021 58 : bdev_io->internal.retry_state = state;
1022 58 : TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
1023 58 : }
1024 :
1025 : void
1026 16 : spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
1027 : {
1028 : struct iovec *iovs;
1029 :
1030 16 : if (bdev_io->u.bdev.iovs == NULL) {
1031 3 : bdev_io->u.bdev.iovs = &bdev_io->iov;
1032 3 : bdev_io->u.bdev.iovcnt = 1;
1033 3 : }
1034 :
1035 16 : iovs = bdev_io->u.bdev.iovs;
1036 :
1037 16 : assert(iovs != NULL);
1038 16 : assert(bdev_io->u.bdev.iovcnt >= 1);
1039 :
1040 16 : iovs[0].iov_base = buf;
1041 16 : iovs[0].iov_len = len;
1042 16 : }
1043 :
1044 : void
1045 3 : spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1046 : {
1047 3 : assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
1048 3 : bdev_io->u.bdev.md_buf = md_buf;
1049 3 : }
1050 :
1051 : static bool
1052 167 : _is_buf_allocated(const struct iovec *iovs)
1053 : {
1054 167 : if (iovs == NULL) {
1055 6 : return false;
1056 : }
1057 :
1058 161 : return iovs[0].iov_base != NULL;
1059 167 : }
1060 :
1061 : static bool
1062 50 : _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
1063 : {
1064 : int i;
1065 : uintptr_t iov_base;
1066 :
1067 50 : if (spdk_likely(alignment == 1)) {
1068 21 : return true;
1069 : }
1070 :
1071 36 : for (i = 0; i < iovcnt; i++) {
1072 29 : iov_base = (uintptr_t)iovs[i].iov_base;
1073 29 : if ((iov_base & (alignment - 1)) != 0) {
1074 22 : return false;
1075 : }
1076 7 : }
1077 :
1078 7 : return true;
1079 50 : }
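/*
 * The mask check above relies on the alignment being a power of two. For
 * example, with alignment == 512, (alignment - 1) == 0x1ff, so an iov_base of
 * 0x1200 passes (0x1200 & 0x1ff == 0) while 0x1234 fails (0x1234 & 0x1ff == 0x34).
 */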
1080 :
1081 : static inline bool
1082 895 : bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1083 : {
1084 1048 : return (bdev_io->bdev->md_len != 0) &&
1085 153 : (desc->opts.hide_metadata ||
1086 153 : (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT));
1087 : }
1088 :
1089 : static inline bool
1090 852 : bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1091 : {
1092 852 : if (!bdev_io_use_accel_sequence(bdev_io)) {
1093 852 : return false;
1094 : }
1095 :
1096 : /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
1097 : * bdev module didn't support accel sequences */
1098 0 : return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
1099 852 : }
1100 :
1101 : static inline void
1102 624 : bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
1103 : struct spdk_bdev_shared_resource *shared_resource)
1104 : {
1105 624 : bdev_ch->io_outstanding++;
1106 624 : shared_resource->io_outstanding++;
1107 624 : }
1108 :
1109 : static inline void
1110 624 : bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1111 : struct spdk_bdev_shared_resource *shared_resource)
1112 : {
1113 624 : assert(bdev_ch->io_outstanding > 0);
1114 624 : assert(shared_resource->io_outstanding > 0);
1115 624 : bdev_ch->io_outstanding--;
1116 624 : shared_resource->io_outstanding--;
1117 624 : }
1118 :
1119 : static void
1120 0 : bdev_io_submit_sequence_cb(void *ctx, int status)
1121 : {
1122 0 : struct spdk_bdev_io *bdev_io = ctx;
1123 :
1124 0 : assert(bdev_io_use_accel_sequence(bdev_io));
1125 :
1126 0 : bdev_io->u.bdev.accel_sequence = NULL;
1127 0 : bdev_io->internal.f.has_accel_sequence = false;
1128 :
1129 0 : if (spdk_unlikely(status != 0)) {
1130 0 : SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
1131 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1132 0 : bdev_io_complete_unsubmitted(bdev_io);
1133 0 : return;
1134 : }
1135 :
1136 0 : bdev_io_submit(bdev_io);
1137 0 : }
1138 :
1139 : static void
1140 0 : bdev_io_exec_sequence_cb(void *ctx, int status)
1141 : {
1142 0 : struct spdk_bdev_io *bdev_io = ctx;
1143 0 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1144 :
1145 0 : TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1146 0 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1147 :
1148 0 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1149 0 : bdev_ch_retry_io(ch);
1150 0 : }
1151 :
1152 0 : bdev_io->internal.data_transfer_cpl(bdev_io, status);
1153 0 : }
1154 :
1155 : static void
1156 0 : bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
1157 : {
1158 0 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1159 :
1160 0 : assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1161 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1162 0 : assert(bdev_io_use_accel_sequence(bdev_io));
1163 :
1164 : /* Since the operations are appended during submission, they're in the opposite order than
1165 : * how we want to execute them for reads (i.e. we need to execute the most recently added
1166 : * operation first), so reverse the sequence before executing it.
1167 : */
1168 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1169 0 : spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
1170 0 : }
1171 :
1172 0 : TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1173 0 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1174 0 : bdev_io->internal.data_transfer_cpl = cb_fn;
1175 :
1176 0 : spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
1177 0 : bdev_io_exec_sequence_cb, bdev_io);
1178 0 : }
1179 :
1180 : static void
1181 42 : bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
1182 : {
1183 42 : struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
1184 : void *buf;
1185 :
1186 42 : if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1187 0 : buf = bdev_io->internal.buf.ptr;
1188 0 : bdev_io->internal.buf.ptr = NULL;
1189 0 : bdev_io->internal.f.has_buf = false;
1190 0 : bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
1191 0 : bdev_io->internal.get_aux_buf_cb = NULL;
1192 0 : } else {
1193 42 : assert(bdev_io->internal.get_buf_cb != NULL);
1194 42 : bdev_io->internal.get_buf_cb(ch, bdev_io, status);
1195 42 : bdev_io->internal.get_buf_cb = NULL;
1196 : }
1197 42 : }
1198 :
1199 : static void
1200 4 : _bdev_io_pull_buffer_cpl(void *ctx, int rc)
1201 : {
1202 4 : struct spdk_bdev_io *bdev_io = ctx;
1203 :
1204 4 : if (rc) {
1205 0 : SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
1206 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1207 0 : }
1208 4 : bdev_io_get_buf_complete(bdev_io, !rc);
1209 4 : }
1210 :
1211 : static void
1212 2 : bdev_io_pull_md_buf_done(void *ctx, int status)
1213 : {
1214 2 : struct spdk_bdev_io *bdev_io = ctx;
1215 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1216 :
1217 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1218 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1219 :
1220 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1221 0 : bdev_ch_retry_io(ch);
1222 0 : }
1223 :
1224 2 : assert(bdev_io->internal.data_transfer_cpl);
1225 2 : bdev_io->internal.data_transfer_cpl(bdev_io, status);
1226 2 : }
1227 :
1228 : static void
1229 4 : bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
1230 : {
1231 4 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1232 4 : int rc = 0;
1233 :
1234 4 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1235 2 : assert(bdev_io->internal.f.has_bounce_buf);
1236 2 : if (bdev_io_use_memory_domain(bdev_io)) {
1237 2 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1238 2 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1239 4 : rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1240 2 : bdev_io->internal.memory_domain_ctx,
1241 2 : &bdev_io->internal.bounce_buf.orig_md_iov, 1,
1242 2 : &bdev_io->internal.bounce_buf.md_iov, 1,
1243 2 : bdev_io_pull_md_buf_done, bdev_io);
1244 2 : if (rc == 0) {
1245 : /* Continue to submit IO in completion callback */
1246 2 : return;
1247 : }
1248 0 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1249 0 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1250 0 : if (rc != -ENOMEM) {
1251 0 : SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
1252 : spdk_memory_domain_get_dma_device_id(
1253 : bdev_io->internal.memory_domain), rc);
1254 0 : }
1255 0 : } else {
1256 0 : memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
1257 0 : bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
1258 0 : bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1259 : }
1260 0 : }
1261 :
1262 2 : if (spdk_unlikely(rc == -ENOMEM)) {
1263 0 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
1264 0 : } else {
1265 2 : assert(bdev_io->internal.data_transfer_cpl);
1266 2 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1267 : }
1268 4 : }
1269 :
1270 : static void
1271 4 : _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1272 : {
1273 4 : assert(bdev_io->internal.f.has_bounce_buf);
1274 :
1275 : /* save original md_buf */
1276 4 : bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
1277 4 : bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
1278 4 : bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
1279 4 : bdev_io->internal.bounce_buf.md_iov.iov_len = len;
1280 : /* set bounce md_buf */
1281 4 : bdev_io->u.bdev.md_buf = md_buf;
1282 :
1283 4 : bdev_io_pull_md_buf(bdev_io);
1284 4 : }
1285 :
1286 : static void
1287 42 : _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
1288 : {
1289 42 : struct spdk_bdev *bdev = bdev_io->bdev;
1290 : uint64_t md_len;
1291 : void *buf;
1292 :
1293 42 : if (spdk_bdev_is_md_separate(bdev)) {
1294 7 : assert(!bdev_io_use_accel_sequence(bdev_io));
1295 :
1296 7 : buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
1297 7 : md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
1298 :
1299 7 : assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
1300 :
1301 7 : if (bdev_io->u.bdev.md_buf != NULL) {
1302 4 : _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
1303 4 : return;
1304 : } else {
1305 3 : spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
1306 : }
1307 3 : }
1308 :
1309 38 : bdev_io_get_buf_complete(bdev_io, true);
1310 42 : }
1311 :
1312 : static inline void
1313 26 : bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
1314 : {
1315 26 : if (rc) {
1316 0 : SPDK_ERRLOG("Failed to get data buffer\n");
1317 0 : assert(bdev_io->internal.data_transfer_cpl);
1318 0 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1319 0 : return;
1320 : }
1321 :
1322 26 : _bdev_io_set_md_buf(bdev_io);
1323 26 : }
1324 :
1325 : static void
1326 2 : bdev_io_pull_data_done_and_track(void *ctx, int status)
1327 : {
1328 2 : struct spdk_bdev_io *bdev_io = ctx;
1329 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1330 :
1331 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1332 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1333 :
1334 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1335 0 : bdev_ch_retry_io(ch);
1336 0 : }
1337 :
1338 2 : bdev_io_pull_data_done(bdev_io, status);
1339 2 : }
1340 :
1341 : static void
1342 27 : bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
1343 : {
1344 27 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1345 27 : struct spdk_bdev_desc *desc = bdev_io->internal.desc;
1346 27 : int rc = 0;
1347 :
1348 27 : assert(bdev_io->internal.f.has_bounce_buf);
1349 :
1350 27 : if (bdev_io_needs_metadata(desc, bdev_io)) {
1351 0 : assert(bdev_io->bdev->md_interleave);
1352 :
1353 0 : bdev_io->u.bdev.dif_check_flags &= ~SPDK_DIF_FLAGS_NVME_PRACT;
1354 :
1355 0 : if (!bdev_io_use_accel_sequence(bdev_io)) {
1356 0 : bdev_io->internal.accel_sequence = NULL;
1357 0 : }
1358 :
1359 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1360 0 : rc = spdk_accel_append_dif_generate_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1361 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1362 0 : bdev_io->u.bdev.memory_domain,
1363 0 : bdev_io->u.bdev.memory_domain_ctx,
1364 0 : bdev_io->internal.bounce_buf.orig_iovs,
1365 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1366 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1367 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1368 0 : bdev_io->u.bdev.num_blocks,
1369 0 : &bdev_io->u.bdev.dif_ctx,
1370 : NULL, NULL);
1371 0 : } else {
1372 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1373 0 : rc = spdk_accel_append_dif_verify_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1374 0 : bdev_io->internal.bounce_buf.orig_iovs,
1375 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1376 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1377 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1378 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1379 0 : bdev_io->u.bdev.memory_domain,
1380 0 : bdev_io->u.bdev.memory_domain_ctx,
1381 0 : bdev_io->u.bdev.num_blocks,
1382 0 : &bdev_io->u.bdev.dif_ctx,
1383 0 : &bdev_io->u.bdev.dif_err,
1384 : NULL, NULL);
1385 : }
1386 :
1387 0 : if (spdk_likely(rc == 0)) {
1388 0 : bdev_io->internal.f.has_accel_sequence = true;
1389 0 : bdev_io->u.bdev.accel_sequence = bdev_io->internal.accel_sequence;
1390 0 : } else if (rc != -ENOMEM) {
1391 0 : SPDK_ERRLOG("Failed to append generate/verify_copy to accel sequence: %p\n",
1392 : bdev_io->internal.accel_sequence);
1393 0 : }
1394 27 : } else if (bdev_io_needs_sequence_exec(desc, bdev_io) ||
1395 27 : (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
1396 : /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
1397 : * sequence, append a copy operation making accel change the src/dst buffers of the previous
1398 : * operation */
1399 0 : assert(bdev_io_use_accel_sequence(bdev_io));
1400 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1401 0 : rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1402 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1403 : NULL, NULL,
1404 0 : bdev_io->internal.bounce_buf.orig_iovs,
1405 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1406 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1407 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1408 : NULL, NULL);
1409 0 : } else {
1410 : /* We need to reverse the src/dst for reads */
1411 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1412 0 : rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1413 0 : bdev_io->internal.bounce_buf.orig_iovs,
1414 0 : bdev_io->internal.bounce_buf.orig_iovcnt,
1415 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1416 0 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1417 0 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1418 : NULL, NULL, NULL, NULL);
1419 : }
1420 :
1421 0 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
1422 0 : SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
1423 : bdev_io->internal.accel_sequence);
1424 0 : }
1425 27 : } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1426 : /* if this is write path, copy data from original buffer to bounce buffer */
1427 17 : if (bdev_io_use_memory_domain(bdev_io)) {
1428 3 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1429 3 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1430 6 : rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1431 3 : bdev_io->internal.memory_domain_ctx,
1432 3 : bdev_io->internal.bounce_buf.orig_iovs,
1433 3 : (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1434 3 : bdev_io->u.bdev.iovs, 1,
1435 : bdev_io_pull_data_done_and_track,
1436 3 : bdev_io);
1437 3 : if (rc == 0) {
1438 : /* Continue to submit IO in completion callback */
1439 2 : return;
1440 : }
1441 1 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1442 1 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1443 1 : if (rc != -ENOMEM) {
1444 0 : SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
1445 : spdk_memory_domain_get_dma_device_id(
1446 : bdev_io->internal.memory_domain));
1447 0 : }
1448 1 : } else {
1449 14 : assert(bdev_io->u.bdev.iovcnt == 1);
1450 28 : spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
1451 14 : bdev_io->u.bdev.iovs[0].iov_len,
1452 14 : bdev_io->internal.bounce_buf.orig_iovs,
1453 14 : bdev_io->internal.bounce_buf.orig_iovcnt);
1454 : }
1455 15 : }
1456 :
1457 25 : if (spdk_unlikely(rc == -ENOMEM)) {
1458 1 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1459 1 : } else {
1460 24 : bdev_io_pull_data_done(bdev_io, rc);
1461 : }
1462 27 : }
1463 :
1464 : static void
1465 26 : _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
1466 : bdev_copy_bounce_buffer_cpl cpl_cb)
1467 : {
1468 26 : struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;
1469 :
1470 26 : assert(bdev_io->internal.f.has_bounce_buf == false);
1471 :
1472 26 : bdev_io->internal.data_transfer_cpl = cpl_cb;
1473 26 : bdev_io->internal.f.has_bounce_buf = true;
1474 : /* save original iovec */
1475 26 : bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
1476 26 : bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
1477 : /* zero the other data members */
1478 26 : bdev_io->internal.bounce_buf.iov.iov_base = NULL;
1479 26 : bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
1480 26 : bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
1481 : /* set bounce iov */
1482 26 : bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
1483 26 : bdev_io->u.bdev.iovcnt = 1;
1484 : /* set bounce buffer for this operation */
1485 26 : bdev_io->u.bdev.iovs[0].iov_base = buf;
1486 26 : bdev_io->u.bdev.iovs[0].iov_len = len;
1487 : /* Now we use 1 iov, the split condition could have been changed */
1488 26 : bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
1489 :
1490 26 : if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1491 0 : bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1492 0 : } else {
1493 26 : bdev_io_pull_data(bdev_io);
1494 : }
1495 26 : }
1496 :
1497 : static void
1498 42 : _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
1499 : {
1500 42 : struct spdk_bdev *bdev = bdev_io->bdev;
1501 : bool buf_allocated;
1502 : uint64_t alignment;
1503 : void *aligned_buf;
1504 :
1505 42 : bdev_io->internal.buf.ptr = buf;
1506 42 : bdev_io->internal.f.has_buf = true;
1507 :
1508 42 : if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1509 0 : bdev_io_get_buf_complete(bdev_io, true);
1510 0 : return;
1511 : }
1512 :
1513 42 : alignment = spdk_bdev_get_buf_align(bdev);
1514 42 : buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
1515 42 : aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
1516 :
1517 42 : if (buf_allocated) {
1518 26 : _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
1519 : /* Continue in completion callback */
1520 26 : return;
1521 : } else {
1522 16 : spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
1523 : }
1524 :
1525 16 : _bdev_io_set_md_buf(bdev_io);
1526 42 : }
1527 :
1528 : static inline uint64_t
1529 42 : bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
1530 : {
1531 42 : struct spdk_bdev *bdev = bdev_io->bdev;
1532 : uint64_t md_len, alignment;
1533 :
1534 42 : md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1535 :
1536 : /* 1-byte alignment needs 0 bytes of extra space, 64-byte alignment needs 63 bytes of extra space, etc. */
1537 42 : alignment = spdk_bdev_get_buf_align(bdev) - 1;
1538 :
1539 42 : return len + alignment + md_len;
1540 : }
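/*
 * Worked example (illustrative numbers): for a bdev requiring 64-byte buffer
 * alignment, separate metadata with md_len == 8, and an 8-block I/O, a request
 * for a 4096-byte data buffer reserves 4096 + 63 + 64 = 4223 bytes, enough to
 * align the buffer up and still hold the data plus the metadata.
 */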
1541 :
1542 : static void
1543 42 : bdev_io_put_accel_buf(struct spdk_bdev_io *bdev_io)
1544 : {
1545 42 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1546 :
1547 84 : spdk_accel_put_buf(ch->accel_channel,
1548 42 : bdev_io->internal.buf.ptr,
1549 42 : bdev_io->u.bdev.memory_domain,
1550 42 : bdev_io->u.bdev.memory_domain_ctx);
1551 42 : }
1552 :
1553 : static void
1554 0 : _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1555 : {
1556 : struct spdk_bdev_mgmt_channel *ch;
1557 :
1558 0 : ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1559 0 : spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1560 0 : }
1561 :
1562 : static void
1563 42 : bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1564 : {
1565 42 : assert(bdev_io->internal.f.has_buf);
1566 :
1567 42 : if (bdev_io->u.bdev.memory_domain == spdk_accel_get_memory_domain()) {
1568 42 : bdev_io_put_accel_buf(bdev_io);
1569 42 : } else {
1570 0 : assert(bdev_io->u.bdev.memory_domain == NULL);
1571 0 : _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr,
1572 0 : bdev_io->internal.buf.len);
1573 : }
1574 42 : bdev_io->internal.buf.ptr = NULL;
1575 42 : bdev_io->internal.f.has_buf = false;
1576 42 : }
1577 :
1578 3 : SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
1579 : "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);
1580 :
1581 : void
1582 0 : spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1583 : {
1584 0 : uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1585 :
1586 0 : SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);
1587 :
1588 0 : assert(buf != NULL);
1589 0 : _bdev_io_put_buf(bdev_io, buf, len);
1590 0 : }
1591 :
1592 : static inline void
1593 566 : bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
1594 : struct spdk_bdev_io *bdev_io)
1595 : {
1596 : /* After a request is submitted to a bdev module, the ownership of an accel sequence
1597 : * associated with that bdev_io is transferred to the bdev module. So, clear the internal
1598 : * sequence pointer to make sure we won't touch it anymore. */
1599 1050 : if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
1600 566 : bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
1601 0 : assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1602 0 : bdev_io->internal.f.has_accel_sequence = false;
1603 0 : }
1604 :
1605 : /* The generic bdev layer should not pass an I/O with any dif_check_flags bits set that
1606 : * the underlying bdev does not support. Add an assert to check this.
1607 : */
1608 566 : assert((bdev_io->type != SPDK_BDEV_IO_TYPE_WRITE &&
1609 : bdev_io->type != SPDK_BDEV_IO_TYPE_READ) ||
1610 : ((bdev_io->u.bdev.dif_check_flags & bdev->dif_check_flags) ==
1611 : bdev_io->u.bdev.dif_check_flags));
1612 :
1613 566 : bdev->fn_table->submit_request(ioch, bdev_io);
1614 566 : }
1615 :
1616 : static inline void
1617 25 : bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
1618 : {
1619 25 : struct spdk_bdev *bdev = bdev_io->bdev;
1620 :
1621 25 : bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
1622 25 : bdev_io->internal.error.nvme.cdw0 = 0;
1623 25 : bdev_io->num_retries++;
1624 25 : bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1625 25 : }
1626 :
1627 : static void
1628 93 : bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
1629 : {
1630 : struct spdk_bdev_io *bdev_io;
1631 :
1632 93 : if (shared_resource->abort_in_progress) {
1633 : /**
1634 : * We are aborting nomem I/Os, do not touch nomem_io list now.
1635 : */
1636 51 : return;
1637 : }
1638 :
1639 42 : if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1640 : /*
1641 : * Allow some more I/O to complete before retrying the nomem_io queue.
1642 : * Some drivers (such as nvme) cannot immediately take a new I/O in
1643 : * the context of a completion, because the resources for the I/O are
1644 : * not released until control returns to the bdev poller. Also, we
1645 : * may require several small I/O to complete before a larger I/O
1646 : * (that requires splitting) can be submitted.
1647 : */
1648 22 : return;
1649 : }
1650 :
1651 31 : while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1652 27 : bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1653 27 : TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1654 :
1655 27 : switch (bdev_io->internal.retry_state) {
1656 : case BDEV_IO_RETRY_STATE_SUBMIT:
1657 25 : bdev_ch_resubmit_io(shared_resource, bdev_io);
1658 25 : break;
1659 : case BDEV_IO_RETRY_STATE_PULL:
1660 1 : bdev_io_pull_data(bdev_io);
1661 1 : break;
1662 : case BDEV_IO_RETRY_STATE_PULL_MD:
1663 0 : bdev_io_pull_md_buf(bdev_io);
1664 0 : break;
1665 : case BDEV_IO_RETRY_STATE_PUSH:
1666 1 : bdev_io_push_bounce_data(bdev_io);
1667 1 : break;
1668 : case BDEV_IO_RETRY_STATE_PUSH_MD:
1669 0 : bdev_io_push_bounce_md_buf(bdev_io);
1670 0 : break;
1671 : case BDEV_IO_RETRY_STATE_GET_ACCEL_BUF:
1672 0 : _bdev_io_get_accel_buf(bdev_io);
1673 0 : break;
1674 : default:
1675 0 : assert(0 && "invalid retry state");
1676 : break;
1677 : }
1678 :
1679 27 : if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
1680 : /* This IO completed again with NOMEM status, so break the loop and
1681 : * don't try anymore. Note that a bdev_io that fails with NOMEM
1682 : * always gets requeued at the front of the list, to maintain
1683 : * ordering.
1684 : */
1685 16 : break;
1686 : }
1687 : }
1688 93 : }
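 : /* Illustration of the flow above (no additional logic): if a resubmitted IO fails
 :  * with NOMEM again, its completion path requeues it at the head of nomem_io, so the
 :  * TAILQ_FIRST() check in bdev_shared_ch_retry_io() sees the same bdev_io and the
 :  * loop stops instead of spinning on an IO that cannot make progress yet.
 :  */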
1689 :
1690 : static void
1691 78 : bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1692 : {
1693 78 : bdev_shared_ch_retry_io(bdev_ch->shared_resource);
1694 78 : }
1695 :
1696 : static int
1697 0 : bdev_no_mem_poller(void *ctx)
1698 : {
1699 0 : struct spdk_bdev_shared_resource *shared_resource = ctx;
1700 :
1701 0 : spdk_poller_unregister(&shared_resource->nomem_poller);
1702 :
1703 0 : if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1704 0 : bdev_shared_ch_retry_io(shared_resource);
1705 0 : }
1706 : /* the retry cb may re-register the poller so double check */
1707 0 : if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
1708 0 : shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
1709 : /* No IOs were submitted, try again */
1710 0 : shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1711 : SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1712 0 : }
1713 :
1714 0 : return SPDK_POLLER_BUSY;
1715 : }
1716 :
1717 : static inline bool
1718 588 : _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1719 : {
1720 588 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1721 588 : struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1722 :
1723 588 : if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1724 21 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1725 21 : bdev_queue_nomem_io_head(shared_resource, bdev_io, state);
1726 :
1727 21 : if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
1728 : /* Special case: there are nomem IOs but no outstanding IOs whose completions
1729 : * could trigger a retry of the queued IOs. Any submitted IO may trigger such a
1730 : * retry on completion; this poller handles the case where no new IOs are
1731 : * submitted, e.g. qd==1 */
1732 1 : shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1733 : SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1734 1 : }
1735 : /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
1736 : * ownership of that sequence is transferred back to the bdev layer, so we need to
1737 : * restore internal.accel_sequence to make sure that the sequence is handled
1738 : * correctly in case the I/O is later aborted. */
1739 21 : if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
1740 21 : bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
1741 0 : assert(!bdev_io_use_accel_sequence(bdev_io));
1742 0 : bdev_io->internal.f.has_accel_sequence = true;
1743 0 : bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
1744 0 : }
1745 :
1746 21 : return true;
1747 : }
1748 :
1749 567 : if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1750 78 : bdev_ch_retry_io(bdev_ch);
1751 78 : }
1752 :
1753 567 : return false;
1754 588 : }
1755 :
1756 : static void
1757 26 : _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1758 : {
1759 26 : struct spdk_bdev_io *bdev_io = ctx;
1760 26 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1761 :
1762 26 : if (rc) {
1763 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1764 0 : }
1765 : /* We want to free the bounce buffer here since we know we're done with it (as opposed
1766 : * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
1767 : */
1768 26 : bdev_io_put_buf(bdev_io);
1769 :
1770 26 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1771 0 : bdev_ch_retry_io(ch);
1772 0 : }
1773 :
1774 : /* Continue with IO completion flow */
1775 26 : bdev_io_complete(bdev_io);
1776 26 : }
1777 :
1778 : static void
1779 2 : bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
1780 : {
1781 2 : struct spdk_bdev_io *bdev_io = ctx;
1782 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1783 :
1784 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1785 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1786 2 : bdev_io->internal.f.has_bounce_buf = false;
1787 :
1788 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1789 0 : bdev_ch_retry_io(ch);
1790 0 : }
1791 :
1792 2 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1793 2 : }
1794 :
1795 : static inline void
1796 26 : bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
1797 : {
1798 26 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1799 26 : int rc = 0;
1800 :
1801 26 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1802 26 : assert(bdev_io->internal.f.has_bounce_buf);
1803 :
1804 : /* do the same for metadata buffer */
1805 26 : if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) {
1806 4 : assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1807 :
1808 4 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1809 2 : if (bdev_io_use_memory_domain(bdev_io)) {
1810 2 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1811 2 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1812 : /* If memory domain is used then we need to call async push function */
1813 4 : rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1814 2 : bdev_io->internal.memory_domain_ctx,
1815 2 : &bdev_io->internal.bounce_buf.orig_md_iov,
1816 2 : (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1817 2 : &bdev_io->internal.bounce_buf.md_iov, 1,
1818 : bdev_io_push_bounce_md_buf_done,
1819 2 : bdev_io);
1820 2 : if (rc == 0) {
1821 : /* Continue IO completion in async callback */
1822 2 : return;
1823 : }
1824 0 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1825 0 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1826 0 : if (rc != -ENOMEM) {
1827 0 : SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1828 : spdk_memory_domain_get_dma_device_id(
1829 : bdev_io->internal.memory_domain));
1830 0 : }
1831 0 : } else {
1832 0 : memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1833 0 : bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1834 : }
1835 0 : }
1836 2 : }
1837 :
1838 24 : if (spdk_unlikely(rc == -ENOMEM)) {
1839 0 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
1840 0 : } else {
1841 24 : assert(bdev_io->internal.data_transfer_cpl);
1842 24 : bdev_io->internal.f.has_bounce_buf = false;
1843 24 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1844 : }
1845 26 : }
1846 :
1847 : static inline void
1848 26 : bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
1849 : {
1850 26 : assert(bdev_io->internal.data_transfer_cpl);
1851 26 : if (rc) {
1852 0 : bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1853 0 : return;
1854 : }
1855 :
1856 : /* set original buffer for this io */
1857 26 : bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt;
1858 26 : bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs;
1859 :
1860 : /* We don't set bdev_io->internal.f.has_bounce_buf to false here because
1861 : * we still need to clear the md buf */
1862 :
1863 26 : bdev_io_push_bounce_md_buf(bdev_io);
1864 26 : }
1865 :
1866 : static void
1867 2 : bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
1868 : {
1869 2 : struct spdk_bdev_io *bdev_io = ctx;
1870 2 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1871 :
1872 2 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1873 2 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1874 :
1875 2 : if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1876 0 : bdev_ch_retry_io(ch);
1877 0 : }
1878 :
1879 2 : bdev_io_push_bounce_data_done(bdev_io, status);
1880 2 : }
1881 :
1882 : static inline void
1883 27 : bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
1884 : {
1885 27 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1886 27 : int rc = 0;
1887 :
1888 27 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1889 27 : assert(!bdev_io_use_accel_sequence(bdev_io));
1890 27 : assert(bdev_io->internal.f.has_bounce_buf);
1891 :
1892 : /* If this is the read path, copy data from the bounce buffer to the original buffer */
1893 27 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1894 11 : if (bdev_io_use_memory_domain(bdev_io)) {
1895 3 : TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1896 3 : bdev_io_increment_outstanding(ch, ch->shared_resource);
1897 : /* If memory domain is used then we need to call async push function */
1898 6 : rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1899 3 : bdev_io->internal.memory_domain_ctx,
1900 3 : bdev_io->internal.bounce_buf.orig_iovs,
1901 3 : (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1902 3 : &bdev_io->internal.bounce_buf.iov, 1,
1903 : bdev_io_push_bounce_data_done_and_track,
1904 3 : bdev_io);
1905 3 : if (rc == 0) {
1906 : /* Continue IO completion in async callback */
1907 2 : return;
1908 : }
1909 :
1910 1 : TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1911 1 : bdev_io_decrement_outstanding(ch, ch->shared_resource);
1912 1 : if (rc != -ENOMEM) {
1913 0 : SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1914 : spdk_memory_domain_get_dma_device_id(
1915 : bdev_io->internal.memory_domain));
1916 0 : }
1917 1 : } else {
1918 16 : spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs,
1919 8 : bdev_io->internal.bounce_buf.orig_iovcnt,
1920 8 : bdev_io->internal.bounce_buf.iov.iov_base,
1921 8 : bdev_io->internal.bounce_buf.iov.iov_len);
1922 : }
1923 9 : }
1924 :
1925 25 : if (spdk_unlikely(rc == -ENOMEM)) {
1926 1 : bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
1927 1 : } else {
1928 24 : bdev_io_push_bounce_data_done(bdev_io, rc);
1929 : }
1930 27 : }
1931 :
1932 : static inline void
1933 26 : _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1934 : {
1935 26 : bdev_io->internal.data_transfer_cpl = cpl_cb;
1936 26 : bdev_io_push_bounce_data(bdev_io);
1937 26 : }
1938 :
1939 : static void
1940 0 : bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1941 : {
1942 : struct spdk_bdev_io *bdev_io;
1943 :
1944 0 : bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1945 0 : _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
1946 0 : }
1947 :
1948 : static void
1949 42 : bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1950 : {
1951 : struct spdk_bdev_mgmt_channel *mgmt_ch;
1952 : uint64_t max_len;
1953 : void *buf;
1954 :
1955 42 : assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1956 42 : mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1957 42 : max_len = bdev_io_get_max_buf_len(bdev_io, len);
1958 :
1959 42 : if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) {
1960 0 : SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1961 0 : bdev_io_get_buf_complete(bdev_io, false);
1962 0 : return;
1963 : }
1964 :
1965 42 : bdev_io->internal.buf.len = len;
1966 42 : buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1967 : bdev_io_get_iobuf_cb);
1968 42 : if (buf != NULL) {
1969 42 : _bdev_io_set_buf(bdev_io, buf, len);
1970 42 : }
1971 42 : }
1972 :
1973 : void
1974 56 : spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1975 : {
1976 56 : struct spdk_bdev *bdev = bdev_io->bdev;
1977 : uint64_t alignment;
1978 :
1979 56 : assert(cb != NULL);
1980 56 : bdev_io->internal.get_buf_cb = cb;
1981 :
1982 56 : alignment = spdk_bdev_get_buf_align(bdev);
1983 :
1984 56 : if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1985 40 : _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1986 : /* Buffer already present and aligned */
1987 18 : cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1988 18 : return;
1989 : }
1990 :
1991 38 : bdev_io_get_buf(bdev_io, len);
1992 56 : }
1993 :
1994 : static void
1995 4 : _bdev_io_get_bounce_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1996 : uint64_t len)
1997 : {
1998 4 : assert(cb != NULL);
1999 4 : bdev_io->internal.get_buf_cb = cb;
2000 :
2001 4 : bdev_io_get_buf(bdev_io, len);
2002 4 : }
2003 :
2004 : static void
2005 0 : _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io)
2006 : {
2007 0 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2008 : void *buf;
2009 : int rc;
2010 :
2011 0 : rc = spdk_accel_get_buf(ch->accel_channel,
2012 0 : bdev_io->internal.buf.len,
2013 : &buf,
2014 0 : &bdev_io->u.bdev.memory_domain,
2015 0 : &bdev_io->u.bdev.memory_domain_ctx);
2016 0 : if (rc != 0) {
2017 0 : bdev_queue_nomem_io_tail(ch->shared_resource, bdev_io,
2018 : BDEV_IO_RETRY_STATE_GET_ACCEL_BUF);
2019 0 : return;
2020 : }
2021 :
2022 0 : _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
2023 0 : }
2024 :
2025 : static inline void
2026 0 : bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
2027 : uint64_t len)
2028 : {
2029 0 : bdev_io->internal.buf.len = len;
2030 0 : bdev_io->internal.get_buf_cb = cb;
2031 :
2032 0 : _bdev_io_get_accel_buf(bdev_io);
2033 0 : }
2034 :
2035 3 : SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf,
2036 : "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0);
2037 :
2038 : void
2039 0 : spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
2040 : {
2041 0 : uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2042 :
2043 0 : SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf);
2044 :
2045 0 : assert(cb != NULL);
2046 0 : assert(bdev_io->internal.get_aux_buf_cb == NULL);
2047 0 : bdev_io->internal.get_aux_buf_cb = cb;
2048 0 : bdev_io_get_buf(bdev_io, len);
2049 0 : }
2050 :
2051 : static int
2052 69 : bdev_module_get_max_ctx_size(void)
2053 : {
2054 : struct spdk_bdev_module *bdev_module;
2055 69 : int max_bdev_module_size = 0;
2056 :
2057 268 : TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2058 199 : if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
2059 68 : max_bdev_module_size = bdev_module->get_ctx_size();
2060 68 : }
2061 199 : }
2062 :
2063 69 : return max_bdev_module_size;
2064 : }
2065 :
2066 : static void
2067 0 : bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2068 : {
2069 0 : if (!bdev->internal.histogram_enabled) {
2070 0 : return;
2071 : }
2072 :
2073 0 : spdk_json_write_object_begin(w);
2074 0 : spdk_json_write_named_string(w, "method", "bdev_enable_histogram");
2075 :
2076 0 : spdk_json_write_named_object_begin(w, "params");
2077 0 : spdk_json_write_named_string(w, "name", bdev->name);
2078 :
2079 0 : spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled);
2080 :
2081 0 : if (bdev->internal.histogram_io_type) {
2082 0 : spdk_json_write_named_string(w, "opc",
2083 0 : spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type));
2084 0 : }
2085 :
2086 0 : spdk_json_write_object_end(w);
2087 :
2088 0 : spdk_json_write_object_end(w);
2089 0 : }
2090 :
2091 : static void
2092 0 : bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2093 : {
2094 : int i;
2095 0 : struct spdk_bdev_qos *qos = bdev->internal.qos;
2096 : uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
2097 :
2098 0 : if (!qos) {
2099 0 : return;
2100 : }
2101 :
2102 0 : spdk_bdev_get_qos_rate_limits(bdev, limits);
2103 :
2104 0 : spdk_json_write_object_begin(w);
2105 0 : spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
2106 :
2107 0 : spdk_json_write_named_object_begin(w, "params");
2108 0 : spdk_json_write_named_string(w, "name", bdev->name);
2109 0 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2110 0 : if (limits[i] > 0) {
2111 0 : spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
2112 0 : }
2113 0 : }
2114 0 : spdk_json_write_object_end(w);
2115 :
2116 0 : spdk_json_write_object_end(w);
2117 0 : }
2118 :
2119 : void
2120 0 : spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
2121 : {
2122 : struct spdk_bdev_module *bdev_module;
2123 : struct spdk_bdev *bdev;
2124 :
2125 0 : assert(w != NULL);
2126 :
2127 0 : spdk_json_write_array_begin(w);
2128 :
2129 0 : spdk_json_write_object_begin(w);
2130 0 : spdk_json_write_named_string(w, "method", "bdev_set_options");
2131 0 : spdk_json_write_named_object_begin(w, "params");
2132 0 : spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
2133 0 : spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
2134 0 : spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
2135 0 : spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size);
2136 0 : spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size);
2137 0 : spdk_json_write_object_end(w);
2138 0 : spdk_json_write_object_end(w);
2139 :
2140 0 : bdev_examine_allowlist_config_json(w);
2141 :
2142 0 : TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2143 0 : if (bdev_module->config_json) {
2144 0 : bdev_module->config_json(w);
2145 0 : }
2146 0 : }
2147 :
2148 0 : spdk_spin_lock(&g_bdev_mgr.spinlock);
2149 :
2150 0 : TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
2151 0 : if (bdev->fn_table->write_config_json) {
2152 0 : bdev->fn_table->write_config_json(bdev, w);
2153 0 : }
2154 :
2155 0 : bdev_qos_config_json(bdev, w);
2156 0 : bdev_enable_histogram_config_json(bdev, w);
2157 0 : }
2158 :
2159 0 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
2160 :
2161 : /* This has to be the last RPC in the array to make sure all bdevs have finished being examined */
2162 0 : spdk_json_write_object_begin(w);
2163 0 : spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
2164 0 : spdk_json_write_object_end(w);
2165 :
2166 0 : spdk_json_write_array_end(w);
2167 0 : }
2168 :
2169 : static void
2170 73 : bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
2171 : {
2172 73 : struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2173 : struct spdk_bdev_io *bdev_io;
2174 :
2175 73 : spdk_iobuf_channel_fini(&ch->iobuf);
2176 :
2177 10483 : while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
2178 10410 : bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2179 10410 : STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2180 10410 : ch->per_thread_cache_count--;
2181 10410 : spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2182 : }
2183 :
2184 73 : assert(ch->per_thread_cache_count == 0);
2185 73 : }
2186 :
2187 : static int
2188 73 : bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
2189 : {
2190 73 : struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2191 : struct spdk_bdev_io *bdev_io;
2192 : uint32_t i;
2193 : int rc;
2194 :
2195 146 : rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev",
2196 73 : g_bdev_opts.iobuf_small_cache_size,
2197 73 : g_bdev_opts.iobuf_large_cache_size);
2198 73 : if (rc != 0) {
2199 0 : SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
2200 0 : return -1;
2201 : }
2202 :
2203 73 : STAILQ_INIT(&ch->per_thread_cache);
2204 73 : ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
2205 :
2206 : /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
2207 73 : ch->per_thread_cache_count = 0;
2208 10483 : for (i = 0; i < ch->bdev_io_cache_size; i++) {
2209 10410 : bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2210 10410 : if (bdev_io == NULL) {
2211 0 : SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
2212 0 : assert(false);
2213 : bdev_mgmt_channel_destroy(io_device, ctx_buf);
2214 : return -1;
2215 : }
2216 10410 : ch->per_thread_cache_count++;
2217 10410 : STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2218 10410 : }
2219 :
2220 73 : TAILQ_INIT(&ch->shared_resources);
2221 73 : TAILQ_INIT(&ch->io_wait_queue);
2222 :
2223 73 : return 0;
2224 73 : }
2225 :
2226 : static void
2227 69 : bdev_init_complete(int rc)
2228 : {
2229 69 : spdk_bdev_init_cb cb_fn = g_init_cb_fn;
2230 69 : void *cb_arg = g_init_cb_arg;
2231 : struct spdk_bdev_module *m;
2232 :
2233 69 : g_bdev_mgr.init_complete = true;
2234 69 : g_init_cb_fn = NULL;
2235 69 : g_init_cb_arg = NULL;
2236 :
2237 : /*
2238 : * For modules that need to know when subsystem init is complete,
2239 : * inform them now.
2240 : */
2241 69 : if (rc == 0) {
2242 268 : TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2243 199 : if (m->init_complete) {
2244 25 : m->init_complete();
2245 25 : }
2246 199 : }
2247 69 : }
2248 :
2249 69 : cb_fn(cb_arg, rc);
2250 69 : }
2251 :
2252 : static bool
2253 274 : bdev_module_all_actions_completed(void)
2254 : {
2255 : struct spdk_bdev_module *m;
2256 :
2257 1084 : TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2258 810 : if (m->internal.action_in_progress > 0) {
2259 0 : return false;
2260 : }
2261 810 : }
2262 274 : return true;
2263 274 : }
2264 :
2265 : static void
2266 631 : bdev_module_action_complete(void)
2267 : {
2268 : /*
2269 : * Don't finish bdev subsystem initialization if
2270 : * module pre-initialization is still in progress, or
2271 : * the subsystem has already been initialized.
2272 : */
2273 631 : if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
2274 562 : return;
2275 : }
2276 :
2277 : /*
2278 : * Check all bdev modules for inits/examinations in progress. If any
2279 : * exist, return immediately since we cannot finish bdev subsystem
2280 : * initialization until all are completed.
2281 : */
2282 69 : if (!bdev_module_all_actions_completed()) {
2283 0 : return;
2284 : }
2285 :
2286 : /*
2287 : * Modules already finished initialization - now that all
2288 : * the bdev modules have finished their asynchronous I/O
2289 : * processing, the entire bdev layer can be marked as complete.
2290 : */
2291 69 : bdev_init_complete(0);
2292 631 : }
2293 :
2294 : static void
2295 562 : bdev_module_action_done(struct spdk_bdev_module *module)
2296 : {
2297 562 : spdk_spin_lock(&module->internal.spinlock);
2298 562 : assert(module->internal.action_in_progress > 0);
2299 562 : module->internal.action_in_progress--;
2300 562 : spdk_spin_unlock(&module->internal.spinlock);
2301 562 : bdev_module_action_complete();
2302 562 : }
2303 :
2304 : void
2305 69 : spdk_bdev_module_init_done(struct spdk_bdev_module *module)
2306 : {
2307 69 : assert(module->async_init);
2308 69 : bdev_module_action_done(module);
2309 69 : }
2310 :
2311 : void
2312 493 : spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
2313 : {
2314 493 : bdev_module_action_done(module);
2315 493 : }
2316 :
2317 : /** The last initialized bdev module */
2318 : static struct spdk_bdev_module *g_resume_bdev_module = NULL;
2319 :
2320 : static void
2321 0 : bdev_init_failed(void *cb_arg)
2322 : {
2323 0 : struct spdk_bdev_module *module = cb_arg;
2324 :
2325 0 : spdk_spin_lock(&module->internal.spinlock);
2326 0 : assert(module->internal.action_in_progress > 0);
2327 0 : module->internal.action_in_progress--;
2328 0 : spdk_spin_unlock(&module->internal.spinlock);
2329 0 : bdev_init_complete(-1);
2330 0 : }
2331 :
2332 : static int
2333 69 : bdev_modules_init(void)
2334 : {
2335 : struct spdk_bdev_module *module;
2336 69 : int rc = 0;
2337 :
2338 268 : TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2339 199 : g_resume_bdev_module = module;
2340 199 : if (module->async_init) {
2341 69 : spdk_spin_lock(&module->internal.spinlock);
2342 69 : module->internal.action_in_progress = 1;
2343 69 : spdk_spin_unlock(&module->internal.spinlock);
2344 69 : }
2345 199 : rc = module->module_init();
2346 199 : if (rc != 0) {
2347 : /* Bump action_in_progress to prevent other modules from completing modules_init.
2348 : * Send a message to defer application shutdown until resources are cleaned up */
2349 0 : spdk_spin_lock(&module->internal.spinlock);
2350 0 : module->internal.action_in_progress = 1;
2351 0 : spdk_spin_unlock(&module->internal.spinlock);
2352 0 : spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
2353 0 : return rc;
2354 : }
2355 199 : }
2356 :
2357 69 : g_resume_bdev_module = NULL;
2358 69 : return 0;
2359 69 : }
2360 :
2361 : void
2362 69 : spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
2363 : {
2364 69 : int rc = 0;
2365 : char mempool_name[32];
2366 :
2367 69 : assert(cb_fn != NULL);
2368 :
2369 69 : g_init_cb_fn = cb_fn;
2370 69 : g_init_cb_arg = cb_arg;
2371 :
2372 69 : spdk_notify_type_register("bdev_register");
2373 69 : spdk_notify_type_register("bdev_unregister");
2374 :
2375 69 : snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
2376 :
2377 69 : rc = spdk_iobuf_register_module("bdev");
2378 69 : if (rc != 0) {
2379 0 : SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
2380 0 : bdev_init_complete(-1);
2381 0 : return;
2382 : }
2383 :
2384 138 : g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
2385 69 : g_bdev_opts.bdev_io_pool_size,
2386 69 : sizeof(struct spdk_bdev_io) +
2387 69 : bdev_module_get_max_ctx_size(),
2388 : 0,
2389 : SPDK_ENV_NUMA_ID_ANY);
2390 :
2391 69 : if (g_bdev_mgr.bdev_io_pool == NULL) {
2392 0 : SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
2393 0 : bdev_init_complete(-1);
2394 0 : return;
2395 : }
2396 :
2397 69 : g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
2398 : NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
2399 69 : if (!g_bdev_mgr.zero_buffer) {
2400 0 : SPDK_ERRLOG("create bdev zero buffer failed\n");
2401 0 : bdev_init_complete(-1);
2402 0 : return;
2403 : }
2404 :
2405 : #ifdef SPDK_CONFIG_VTUNE
2406 : g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
2407 : #endif
2408 :
2409 69 : spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
2410 : bdev_mgmt_channel_destroy,
2411 : sizeof(struct spdk_bdev_mgmt_channel),
2412 : "bdev_mgr");
2413 :
2414 69 : rc = bdev_modules_init();
2415 69 : g_bdev_mgr.module_init_complete = true;
2416 69 : if (rc != 0) {
2417 0 : SPDK_ERRLOG("bdev modules init failed\n");
2418 0 : return;
2419 : }
2420 :
2421 69 : bdev_module_action_complete();
2422 69 : }
2423 :
2424 : static void
2425 69 : bdev_mgr_unregister_cb(void *io_device)
2426 : {
2427 69 : spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
2428 :
2429 69 : if (g_bdev_mgr.bdev_io_pool) {
2430 69 : if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
2431 0 : SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
2432 : spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
2433 : g_bdev_opts.bdev_io_pool_size);
2434 0 : }
2435 :
2436 69 : spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
2437 69 : }
2438 :
2439 69 : spdk_free(g_bdev_mgr.zero_buffer);
2440 :
2441 69 : bdev_examine_allowlist_free();
2442 :
2443 69 : cb_fn(g_fini_cb_arg);
2444 69 : g_fini_cb_fn = NULL;
2445 69 : g_fini_cb_arg = NULL;
2446 69 : g_bdev_mgr.init_complete = false;
2447 69 : g_bdev_mgr.module_init_complete = false;
2448 69 : }
2449 :
2450 : static void
2451 69 : bdev_module_fini_iter(void *arg)
2452 : {
2453 : struct spdk_bdev_module *bdev_module;
2454 :
2455 : /* FIXME: Handling initialization failures is broken now,
2456 : * so we won't even try cleaning up after successfully
2457 : * initialized modules. If module_init_complete is false,
2458 : * just call bdev_mgr_unregister_cb
2459 : */
2460 69 : if (!g_bdev_mgr.module_init_complete) {
2461 0 : bdev_mgr_unregister_cb(NULL);
2462 0 : return;
2463 : }
2464 :
2465 : /* Start iterating from the last touched module */
2466 69 : if (!g_resume_bdev_module) {
2467 69 : bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2468 69 : } else {
2469 0 : bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
2470 : internal.tailq);
2471 : }
2472 :
2473 268 : while (bdev_module) {
2474 199 : if (bdev_module->async_fini) {
2475 : /* Save our place so we can resume later. We must
2476 : * save the variable here, before calling module_fini()
2477 : * below, because in some cases the module may immediately
2478 : * call spdk_bdev_module_fini_done() and re-enter
2479 : * this function to continue iterating. */
2480 0 : g_resume_bdev_module = bdev_module;
2481 0 : }
2482 :
2483 199 : if (bdev_module->module_fini) {
2484 199 : bdev_module->module_fini();
2485 199 : }
2486 :
2487 199 : if (bdev_module->async_fini) {
2488 0 : return;
2489 : }
2490 :
2491 199 : bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
2492 : internal.tailq);
2493 : }
2494 :
2495 69 : g_resume_bdev_module = NULL;
2496 69 : spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
2497 69 : }
2498 :
2499 : void
2500 0 : spdk_bdev_module_fini_done(void)
2501 : {
2502 0 : if (spdk_get_thread() != g_fini_thread) {
2503 0 : spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
2504 0 : } else {
2505 0 : bdev_module_fini_iter(NULL);
2506 : }
2507 0 : }
2508 :
2509 : static void
2510 69 : bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
2511 : {
2512 69 : struct spdk_bdev *bdev = cb_arg;
2513 :
2514 69 : if (bdeverrno && bdev) {
2515 0 : SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
2516 : bdev->name);
2517 :
2518 : /*
2519 : * Since the call to spdk_bdev_unregister() failed, we have no way to free this
2520 : * bdev; try to continue by manually removing this bdev from the list and continue
2521 : * with the next bdev in the list.
2522 : */
2523 0 : TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2524 0 : }
2525 :
2526 69 : if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2527 69 : SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2528 : /*
2529 : * Bdev module finish needs to be deferred as we might be in the middle of some context
2530 : * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2531 : * after returning.
2532 : */
2533 69 : spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2534 69 : return;
2535 : }
2536 :
2537 : /*
2538 : * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
2539 : * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2540 : * to detect clean shutdown as opposed to run-time hot removal of the underlying
2541 : * base bdevs.
2542 : *
2543 : * Also, walk the list in the reverse order.
2544 : */
2545 0 : for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2546 0 : bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2547 0 : spdk_spin_lock(&bdev->internal.spinlock);
2548 0 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2549 0 : LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2550 0 : spdk_spin_unlock(&bdev->internal.spinlock);
2551 0 : continue;
2552 : }
2553 0 : spdk_spin_unlock(&bdev->internal.spinlock);
2554 :
2555 0 : SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2556 0 : spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2557 0 : return;
2558 : }
2559 :
2560 : /*
2561 : * If any bdev fails to unclaim its underlying bdev properly, we may face the
2562 : * case of a bdev list consisting of claimed bdevs only (if claims are managed
2563 : * correctly, this would mean there's a loop in the claims graph, which is
2564 : * clearly impossible). Warn and unregister the last bdev on the list then.
2565 : */
2566 0 : for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2567 0 : bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2568 0 : SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2569 0 : spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2570 0 : return;
2571 : }
2572 69 : }
2573 :
2574 : static void
2575 69 : bdev_module_fini_start_iter(void *arg)
2576 : {
2577 : struct spdk_bdev_module *bdev_module;
2578 :
2579 69 : if (!g_resume_bdev_module) {
2580 69 : bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2581 69 : } else {
2582 0 : bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2583 : }
2584 :
2585 268 : while (bdev_module) {
2586 199 : if (bdev_module->async_fini_start) {
2587 : /* Save our place so we can resume later. We must
2588 : * save the variable here, before calling fini_start()
2589 : * below, because in some cases the module may immediately
2590 : * call spdk_bdev_module_fini_start_done() and re-enter
2591 : * this function to continue iterating. */
2592 0 : g_resume_bdev_module = bdev_module;
2593 0 : }
2594 :
2595 199 : if (bdev_module->fini_start) {
2596 25 : bdev_module->fini_start();
2597 25 : }
2598 :
2599 199 : if (bdev_module->async_fini_start) {
2600 0 : return;
2601 : }
2602 :
2603 199 : bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
2604 : }
2605 :
2606 69 : g_resume_bdev_module = NULL;
2607 :
2608 69 : bdev_finish_unregister_bdevs_iter(NULL, 0);
2609 69 : }
2610 :
2611 : void
2612 0 : spdk_bdev_module_fini_start_done(void)
2613 : {
2614 0 : if (spdk_get_thread() != g_fini_thread) {
2615 0 : spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
2616 0 : } else {
2617 0 : bdev_module_fini_start_iter(NULL);
2618 : }
2619 0 : }
2620 :
2621 : static void
2622 69 : bdev_finish_wait_for_examine_done(void *cb_arg)
2623 : {
2624 69 : bdev_module_fini_start_iter(NULL);
2625 69 : }
2626 :
2627 : static void bdev_open_async_fini(void);
2628 :
2629 : void
2630 69 : spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
2631 : {
2632 : int rc;
2633 :
2634 69 : assert(cb_fn != NULL);
2635 :
2636 69 : g_fini_thread = spdk_get_thread();
2637 :
2638 69 : g_fini_cb_fn = cb_fn;
2639 69 : g_fini_cb_arg = cb_arg;
2640 :
2641 69 : bdev_open_async_fini();
2642 :
2643 69 : rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
2644 69 : if (rc != 0) {
2645 0 : SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
2646 0 : bdev_finish_wait_for_examine_done(NULL);
2647 0 : }
2648 69 : }
2649 :
2650 : struct spdk_bdev_io *
2651 716 : bdev_channel_get_io(struct spdk_bdev_channel *channel)
2652 : {
2653 716 : struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
2654 : struct spdk_bdev_io *bdev_io;
2655 :
2656 716 : if (ch->per_thread_cache_count > 0) {
2657 656 : bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2658 656 : STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2659 656 : ch->per_thread_cache_count--;
2660 716 : } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
2661 : /*
2662 : * Don't try to look for bdev_ios in the global pool if there are
2663 : * waiters on bdev_ios - we don't want this caller to jump the line.
2664 : */
2665 0 : bdev_io = NULL;
2666 0 : } else {
2667 60 : bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2668 : }
2669 :
2670 716 : return bdev_io;
2671 : }
2672 :
2673 : void
2674 710 : spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2675 : {
2676 : struct spdk_bdev_mgmt_channel *ch;
2677 :
2678 710 : assert(bdev_io != NULL);
2679 710 : assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
2680 :
2681 710 : ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
2682 :
2683 710 : if (bdev_io->internal.f.has_buf) {
2684 16 : bdev_io_put_buf(bdev_io);
2685 16 : }
2686 :
2687 710 : if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
2688 656 : ch->per_thread_cache_count++;
2689 656 : STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2690 660 : while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
2691 : struct spdk_bdev_io_wait_entry *entry;
2692 :
2693 4 : entry = TAILQ_FIRST(&ch->io_wait_queue);
2694 4 : TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
2695 4 : entry->cb_fn(entry->cb_arg);
2696 : }
2697 656 : } else {
2698 : /* We should never have a full cache with entries on the io wait queue. */
2699 54 : assert(TAILQ_EMPTY(&ch->io_wait_queue));
2700 54 : spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2701 : }
2702 710 : }
2703 :
2704 : static bool
2705 72 : bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2706 : {
2707 72 : assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2708 :
2709 72 : switch (limit) {
2710 : case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2711 18 : return true;
2712 : case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2713 : case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2714 : case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2715 54 : return false;
2716 0 : case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2717 : default:
2718 0 : return false;
2719 : }
2720 72 : }
2721 :
2722 : static bool
2723 25 : bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2724 : {
2725 25 : switch (bdev_io->type) {
2726 : case SPDK_BDEV_IO_TYPE_NVME_IO:
2727 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2728 : case SPDK_BDEV_IO_TYPE_READ:
2729 : case SPDK_BDEV_IO_TYPE_WRITE:
2730 23 : return true;
2731 : case SPDK_BDEV_IO_TYPE_ZCOPY:
2732 0 : if (bdev_io->u.bdev.zcopy.start) {
2733 0 : return true;
2734 : } else {
2735 0 : return false;
2736 : }
2737 : default:
2738 2 : return false;
2739 : }
2740 25 : }
2741 :
2742 : static bool
2743 33 : bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2744 : {
2745 33 : switch (bdev_io->type) {
2746 : case SPDK_BDEV_IO_TYPE_NVME_IO:
2747 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2748 : /* Bit 1 (0x2) set for read operation */
2749 0 : if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2750 0 : return true;
2751 : } else {
2752 0 : return false;
2753 : }
2754 : case SPDK_BDEV_IO_TYPE_READ:
2755 30 : return true;
2756 : case SPDK_BDEV_IO_TYPE_ZCOPY:
2757 : /* Populate to read from disk */
2758 0 : if (bdev_io->u.bdev.zcopy.populate) {
2759 0 : return true;
2760 : } else {
2761 0 : return false;
2762 : }
2763 : default:
2764 3 : return false;
2765 : }
2766 33 : }
2767 :
2768 : static uint64_t
2769 43 : bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2770 : {
2771 43 : uint32_t blocklen = bdev_io_get_block_size(bdev_io);
2772 :
2773 43 : switch (bdev_io->type) {
2774 : case SPDK_BDEV_IO_TYPE_NVME_IO:
2775 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2776 0 : return bdev_io->u.nvme_passthru.nbytes;
2777 : case SPDK_BDEV_IO_TYPE_READ:
2778 : case SPDK_BDEV_IO_TYPE_WRITE:
2779 43 : return bdev_io->u.bdev.num_blocks * blocklen;
2780 : case SPDK_BDEV_IO_TYPE_ZCOPY:
2781 : /* Track the data in the start phase only */
2782 0 : if (bdev_io->u.bdev.zcopy.start) {
2783 0 : return bdev_io->u.bdev.num_blocks * blocklen;
2784 : } else {
2785 0 : return 0;
2786 : }
2787 : default:
2788 0 : return 0;
2789 : }
2790 43 : }
2791 :
2792 : static inline bool
2793 64 : bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2794 : {
2795 : int64_t remaining_this_timeslice;
2796 :
2797 64 : if (!limit->max_per_timeslice) {
2798 : /* The QoS is disabled */
2799 0 : return false;
2800 : }
2801 :
2802 64 : remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2803 : __ATOMIC_RELAXED);
2804 64 : if (remaining_this_timeslice + (int64_t)delta > 0) {
2805 : /* There was still a quota for this delta -> the IO shouldn't be queued
2806 : *
2807 : * We allow a slight quota overrun here so an IO bigger than the per-timeslice
2808 : * quota can be allowed once in a while. Such an overrun is then taken into account in
2809 : * the QoS poller, where the next timeslice quota is calculated.
2810 : */
2811 59 : return false;
2812 : }
2813 :
2814 : /* There was no quota for this delta -> the IO should be queued
2815 : * The remaining_this_timeslice must be rewound so it reflects the real
2816 : * amount of IOs or bytes allowed.
2817 : */
2818 5 : __atomic_add_fetch(
2819 5 : &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2820 5 : return true;
2821 64 : }
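 : /* Worked example (hypothetical numbers): with remaining_this_timeslice == 4096
 :  * bytes, an 8192-byte IO drives the counter to -4096; since -4096 + 8192 > 0 the
 :  * IO is still allowed as a slight overrun. A following 512-byte IO drives it to
 :  * -4608; -4608 + 512 <= 0, so the 512 bytes are added back and that IO is queued.
 :  */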
2822 :
2823 : static inline void
2824 5 : bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2825 : {
2826 5 : __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2827 5 : }
2828 :
2829 : static bool
2830 23 : bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2831 : {
2832 23 : return bdev_qos_rw_queue_io(limit, io, 1);
2833 : }
2834 :
2835 : static void
2836 3 : bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2837 : {
2838 3 : bdev_qos_rw_rewind_io(limit, io, 1);
2839 3 : }
2840 :
2841 : static bool
2842 41 : bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2843 : {
2844 41 : return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io));
2845 : }
2846 :
2847 : static void
2848 2 : bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2849 : {
2850 2 : bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2851 2 : }
2852 :
2853 : static bool
2854 19 : bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2855 : {
2856 19 : if (bdev_is_read_io(io) == false) {
2857 1 : return false;
2858 : }
2859 :
2860 18 : return bdev_qos_rw_bps_queue(limit, io);
2861 19 : }
2862 :
2863 : static void
2864 0 : bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2865 : {
2866 0 : if (bdev_is_read_io(io) != false) {
2867 0 : bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2868 0 : }
2869 0 : }
2870 :
2871 : static bool
2872 14 : bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2873 : {
2874 14 : if (bdev_is_read_io(io) == true) {
2875 12 : return false;
2876 : }
2877 :
2878 2 : return bdev_qos_rw_bps_queue(limit, io);
2879 14 : }
2880 :
2881 : static void
2882 0 : bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2883 : {
2884 0 : if (bdev_is_read_io(io) != true) {
2885 0 : bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2886 0 : }
2887 0 : }
2888 :
2889 : static void
2890 10 : bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2891 : {
2892 : int i;
2893 :
2894 50 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2895 40 : if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2896 15 : qos->rate_limits[i].queue_io = NULL;
2897 15 : continue;
2898 : }
2899 :
2900 25 : switch (i) {
2901 : case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2902 9 : qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue;
2903 9 : qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota;
2904 9 : break;
2905 : case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2906 7 : qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue;
2907 7 : qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota;
2908 7 : break;
2909 : case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2910 5 : qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue;
2911 5 : qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota;
2912 5 : break;
2913 : case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2914 4 : qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue;
2915 4 : qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota;
2916 4 : break;
2917 : default:
2918 0 : break;
2919 : }
2920 25 : }
2921 10 : }
2922 :
2923 : static void
2924 6 : _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2925 : struct spdk_bdev_io *bdev_io,
2926 : enum spdk_bdev_io_status status)
2927 : {
2928 6 : bdev_io->internal.f.in_submit_request = true;
2929 6 : bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource);
2930 6 : spdk_bdev_io_complete(bdev_io, status);
2931 6 : bdev_io->internal.f.in_submit_request = false;
2932 6 : }
2933 :
2934 : static inline void
2935 590 : bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2936 : {
2937 590 : struct spdk_bdev *bdev = bdev_io->bdev;
2938 590 : struct spdk_io_channel *ch = bdev_ch->channel;
2939 590 : struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2940 :
2941 590 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2942 16 : struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2943 16 : struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2944 :
2945 16 : if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2946 16 : bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2947 0 : _bdev_io_complete_in_submit(bdev_ch, bdev_io,
2948 : SPDK_BDEV_IO_STATUS_SUCCESS);
2949 0 : return;
2950 : }
2951 16 : }
2952 :
2953 590 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2954 : bdev_io->bdev->split_on_write_unit &&
2955 : bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2956 4 : SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n",
2957 : bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2958 4 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2959 4 : return;
2960 : }
2961 :
2962 586 : if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2963 528 : bdev_io_increment_outstanding(bdev_ch, shared_resource);
2964 528 : bdev_io->internal.f.in_submit_request = true;
2965 528 : bdev_submit_request(bdev, ch, bdev_io);
2966 528 : bdev_io->internal.f.in_submit_request = false;
2967 528 : } else {
2968 58 : bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
2969 58 : if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) {
2970 : /* Special case: there are nomem IOs but no outstanding IOs whose completions
2971 : * could trigger a retry of the queued IOs */
2972 15 : bdev_shared_ch_retry_io(shared_resource);
2973 15 : }
2974 : }
2975 590 : }
2976 :
2977 : static bool
2978 25 : bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2979 : {
2980 : int i;
2981 :
2982 25 : if (bdev_qos_io_to_limit(bdev_io) == true) {
2983 100 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2984 82 : if (!qos->rate_limits[i].queue_io) {
2985 5 : continue;
2986 : }
2987 :
2988 231 : if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2989 154 : bdev_io) == true) {
2990 10 : for (i -= 1; i >= 0 ; i--) {
2991 5 : if (!qos->rate_limits[i].queue_io) {
2992 0 : continue;
2993 : }
2994 :
2995 5 : qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io);
2996 5 : }
2997 5 : return true;
2998 : }
2999 72 : }
3000 18 : }
3001 :
3002 20 : return false;
3003 25 : }
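 : /* Illustration (assumed configuration): if both the RW IOPS and RW BPS limits are
 :  * set, and the IOPS limit accepts the IO (consuming one IO of quota) but the BPS
 :  * limit then queues it, the backwards loop above rewinds the IOPS quota so the
 :  * queued IO is not charged twice when it is retried.
 :  */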
3004 :
3005 : static int
3006 27 : bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
3007 : {
3008 27 : struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL;
3009 27 : int submitted_ios = 0;
3010 :
3011 52 : TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) {
3012 25 : if (!bdev_qos_queue_io(qos, bdev_io)) {
3013 20 : TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link);
3014 20 : bdev_io_do_submit(ch, bdev_io);
3015 :
3016 20 : submitted_ios++;
3017 20 : }
3018 25 : }
3019 :
3020 27 : return submitted_ios;
3021 : }
3022 :
3023 : static void
3024 2 : bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
3025 : {
3026 : int rc;
3027 :
3028 2 : bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
3029 2 : bdev_io->internal.waitq_entry.cb_fn = cb_fn;
3030 2 : bdev_io->internal.waitq_entry.cb_arg = bdev_io;
3031 4 : rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
3032 2 : &bdev_io->internal.waitq_entry);
3033 2 : if (rc != 0) {
3034 0 : SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
3035 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3036 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3037 0 : }
3038 2 : }
3039 :
3040 : static bool
3041 637 : bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
3042 : {
3043 : uint32_t io_boundary;
3044 637 : struct spdk_bdev *bdev = bdev_io->bdev;
3045 637 : uint32_t max_segment_size = bdev->max_segment_size;
3046 637 : uint32_t max_size = bdev->max_rw_size;
3047 637 : int max_segs = bdev->max_num_segments;
3048 :
3049 637 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3050 24 : io_boundary = bdev->write_unit_size;
3051 637 : } else if (bdev->split_on_optimal_io_boundary) {
3052 168 : io_boundary = bdev->optimal_io_boundary;
3053 168 : } else {
3054 445 : io_boundary = 0;
3055 : }
3056 :
3057 637 : if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) {
3058 259 : return false;
3059 : }
3060 :
3061 378 : if (io_boundary) {
3062 : uint64_t start_stripe, end_stripe;
3063 :
3064 192 : start_stripe = bdev_io->u.bdev.offset_blocks;
3065 192 : end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
3066 : /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
3067 192 : if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
3068 192 : start_stripe >>= spdk_u32log2(io_boundary);
3069 192 : end_stripe >>= spdk_u32log2(io_boundary);
3070 192 : } else {
3071 0 : start_stripe /= io_boundary;
3072 0 : end_stripe /= io_boundary;
3073 : }
3074 :
3075 192 : if (start_stripe != end_stripe) {
3076 75 : return true;
3077 : }
3078 117 : }
3079 :
3080 303 : if (max_segs) {
3081 150 : if (bdev_io->u.bdev.iovcnt > max_segs) {
3082 15 : return true;
3083 : }
3084 135 : }
3085 :
3086 288 : if (max_segment_size) {
3087 470 : for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
3088 346 : if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) {
3089 12 : return true;
3090 : }
3091 334 : }
3092 124 : }
3093 :
3094 276 : if (max_size) {
3095 52 : if (bdev_io->u.bdev.num_blocks > max_size) {
3096 7 : return true;
3097 : }
3098 45 : }
3099 :
3100 269 : return false;
3101 637 : }
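 : /* Worked example (hypothetical geometry): with optimal_io_boundary == 8 blocks
 :  * (a power of two), offset_blocks == 6 and num_blocks == 4 give
 :  * start_stripe = 6 >> 3 = 0 and end_stripe = 9 >> 3 = 1, so the IO crosses a
 :  * boundary and must be split.
 :  */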
3102 :
3103 : static bool
3104 24 : bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
3105 : {
3106 : uint32_t num_unmap_segments;
3107 :
3108 24 : if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
3109 3 : return false;
3110 : }
3111 21 : num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
3112 21 : if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
3113 4 : return true;
3114 : }
3115 :
3116 17 : return false;
3117 24 : }
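 : /* Worked example (hypothetical limits): with max_unmap == 1024 blocks and
 :  * max_unmap_segments == 8, an unmap of 10000 blocks needs
 :  * ceil(10000 / 1024) = 10 segments, which exceeds 8, so it is split.
 :  */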
3118 :
3119 : static bool
3120 37 : bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
3121 : {
3122 37 : if (!bdev_io->bdev->max_write_zeroes) {
3123 4 : return false;
3124 : }
3125 :
3126 33 : if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
3127 10 : return true;
3128 : }
3129 :
3130 23 : return false;
3131 37 : }
3132 :
3133 : static bool
3134 30 : bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
3135 : {
3136 30 : if (bdev_io->bdev->max_copy != 0 &&
3137 25 : bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
3138 6 : return true;
3139 : }
3140 :
3141 24 : return false;
3142 30 : }
3143 :
3144 : static bool
3145 811 : bdev_io_should_split(struct spdk_bdev_io *bdev_io)
3146 : {
3147 811 : switch (bdev_io->type) {
3148 : case SPDK_BDEV_IO_TYPE_READ:
3149 : case SPDK_BDEV_IO_TYPE_WRITE:
3150 637 : return bdev_rw_should_split(bdev_io);
3151 : case SPDK_BDEV_IO_TYPE_UNMAP:
3152 24 : return bdev_unmap_should_split(bdev_io);
3153 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3154 37 : return bdev_write_zeroes_should_split(bdev_io);
3155 : case SPDK_BDEV_IO_TYPE_COPY:
3156 30 : return bdev_copy_should_split(bdev_io);
3157 : default:
3158 83 : return false;
3159 : }
3160 811 : }
3161 :
3162 : static uint32_t
3163 249 : _to_next_boundary(uint64_t offset, uint32_t boundary)
3164 : {
3165 249 : return (boundary - (offset % boundary));
3166 : }
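 : /* For example (illustrative values only), _to_next_boundary(6, 8) == 8 - (6 % 8) == 2,
 :  * i.e. two blocks remain until the next 8-block boundary.
 :  */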
3167 :
3168 : static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
3169 :
3170 : static void _bdev_rw_split(void *_bdev_io);
3171 :
3172 : static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
3173 :
3174 : static void
3175 0 : _bdev_unmap_split(void *_bdev_io)
3176 : {
3177 0 : return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
3178 : }
3179 :
3180 : static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
3181 :
3182 : static void
3183 0 : _bdev_write_zeroes_split(void *_bdev_io)
3184 : {
3185 0 : return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
3186 : }
3187 :
3188 : static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
3189 :
3190 : static void
3191 0 : _bdev_copy_split(void *_bdev_io)
3192 : {
3193 0 : return bdev_copy_split((struct spdk_bdev_io *)_bdev_io);
3194 : }
3195 :
3196 : static int
3197 305 : bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
3198 : uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
3199 : {
3200 : int rc;
3201 : uint64_t current_offset, current_remaining, current_src_offset;
3202 : spdk_bdev_io_wait_cb io_wait_fn;
3203 :
3204 305 : current_offset = *offset;
3205 305 : current_remaining = *remaining;
3206 :
3207 305 : assert(bdev_io->internal.f.split);
3208 :
3209 305 : bdev_io->internal.split.outstanding++;
3210 :
3211 305 : io_wait_fn = _bdev_rw_split;
3212 305 : switch (bdev_io->type) {
3213 : case SPDK_BDEV_IO_TYPE_READ:
3214 196 : assert(bdev_io->u.bdev.accel_sequence == NULL);
3215 392 : rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
3216 196 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3217 196 : iov, iovcnt, md_buf, current_offset,
3218 196 : num_blocks,
3219 196 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3220 196 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3221 : NULL,
3222 196 : bdev_io->u.bdev.dif_check_flags,
3223 196 : bdev_io_split_done, bdev_io);
3224 196 : break;
3225 : case SPDK_BDEV_IO_TYPE_WRITE:
3226 50 : assert(bdev_io->u.bdev.accel_sequence == NULL);
3227 100 : rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
3228 50 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3229 50 : iov, iovcnt, md_buf, current_offset,
3230 50 : num_blocks,
3231 50 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3232 50 : bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3233 : NULL,
3234 50 : bdev_io->u.bdev.dif_check_flags,
3235 50 : bdev_io->u.bdev.nvme_cdw12.raw,
3236 50 : bdev_io->u.bdev.nvme_cdw13.raw,
3237 50 : bdev_io_split_done, bdev_io);
3238 50 : break;
3239 : case SPDK_BDEV_IO_TYPE_UNMAP:
3240 17 : io_wait_fn = _bdev_unmap_split;
3241 34 : rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
3242 17 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3243 17 : current_offset, num_blocks,
3244 17 : bdev_io_split_done, bdev_io);
3245 17 : break;
3246 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3247 23 : io_wait_fn = _bdev_write_zeroes_split;
3248 46 : rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
3249 23 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3250 23 : current_offset, num_blocks,
3251 23 : bdev_io_split_done, bdev_io);
3252 23 : break;
3253 : case SPDK_BDEV_IO_TYPE_COPY:
3254 19 : io_wait_fn = _bdev_copy_split;
3255 38 : current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
3256 19 : (current_offset - bdev_io->u.bdev.offset_blocks);
3257 38 : rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
3258 19 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
3259 19 : current_offset, current_src_offset, num_blocks,
3260 19 : bdev_io_split_done, bdev_io);
3261 19 : break;
3262 : default:
3263 0 : assert(false);
3264 : rc = -EINVAL;
3265 : break;
3266 : }
3267 :
3268 305 : if (rc == 0) {
3269 301 : current_offset += num_blocks;
3270 301 : current_remaining -= num_blocks;
3271 301 : bdev_io->internal.split.current_offset_blocks = current_offset;
3272 301 : bdev_io->internal.split.remaining_num_blocks = current_remaining;
3273 301 : *offset = current_offset;
3274 301 : *remaining = current_remaining;
3275 301 : } else {
3276 4 : bdev_io->internal.split.outstanding--;
3277 4 : if (rc == -ENOMEM) {
3278 4 : if (bdev_io->internal.split.outstanding == 0) {
3279 : /* No I/O is outstanding. Hence we should wait here. */
3280 1 : bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
3281 1 : }
3282 4 : } else {
3283 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3284 0 : if (bdev_io->internal.split.outstanding == 0) {
3285 0 : bdev_ch_remove_from_io_submitted(bdev_io);
3286 0 : spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3287 : 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3288 : bdev_io->internal.ch->queue_depth);
3289 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3290 0 : }
3291 : }
3292 : }
3293 :
3294 305 : return rc;
3295 : }
3296 :
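 : /* Split a READ/WRITE parent I/O into child I/Os. Walk the parent iovecs and build
 :  * child iovecs up to the next split boundary (write unit, optimal I/O boundary, or
 :  * none), honoring max_rw_size, max_segment_size, max_num_segments and the
 :  * SPDK_BDEV_IO_NUM_CHILD_IOV cap. The tail is trimmed so every child stays
 :  * block-aligned before it is submitted via bdev_io_split_submit().
 :  */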
3297 : static void
3298 67 : _bdev_rw_split(void *_bdev_io)
3299 : {
3300 : struct iovec *parent_iov, *iov;
3301 67 : struct spdk_bdev_io *bdev_io = _bdev_io;
3302 67 : struct spdk_bdev *bdev = bdev_io->bdev;
3303 : uint64_t parent_offset, current_offset, remaining;
3304 : uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
3305 : uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
3306 : uint32_t iovcnt, iov_len, child_iovsize;
3307 : uint32_t blocklen;
3308 : uint32_t io_boundary;
3309 67 : uint32_t max_segment_size = bdev->max_segment_size;
3310 67 : uint32_t max_child_iovcnt = bdev->max_num_segments;
3311 67 : uint32_t max_size = bdev->max_rw_size;
3312 67 : void *md_buf = NULL;
3313 : int rc;
3314 :
3315 67 : blocklen = bdev_io_get_block_size(bdev_io);
3316 :
3317 67 : max_size = max_size ? max_size : UINT32_MAX;
3318 67 : max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
3319 67 : max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
3320 : SPDK_BDEV_IO_NUM_CHILD_IOV;
3321 :
3322 67 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3323 5 : io_boundary = bdev->write_unit_size;
3324 67 : } else if (bdev->split_on_optimal_io_boundary) {
3325 40 : io_boundary = bdev->optimal_io_boundary;
3326 40 : } else {
3327 22 : io_boundary = UINT32_MAX;
3328 : }
3329 :
3330 67 : assert(bdev_io->internal.f.split);
3331 :
3332 67 : remaining = bdev_io->internal.split.remaining_num_blocks;
3333 67 : current_offset = bdev_io->internal.split.current_offset_blocks;
3334 67 : parent_offset = bdev_io->u.bdev.offset_blocks;
3335 67 : parent_iov_offset = (current_offset - parent_offset) * blocklen;
3336 67 : parent_iovcnt = bdev_io->u.bdev.iovcnt;
3337 :
3338 420 : for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
3339 420 : parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3340 420 : if (parent_iov_offset < parent_iov->iov_len) {
3341 67 : break;
3342 : }
3343 353 : parent_iov_offset -= parent_iov->iov_len;
3344 353 : }
3345 :
3346 67 : child_iovcnt = 0;
3347 573 : while (remaining > 0 && parent_iovpos < parent_iovcnt &&
3348 264 : child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
3349 249 : to_next_boundary = _to_next_boundary(current_offset, io_boundary);
3350 249 : to_next_boundary = spdk_min(remaining, to_next_boundary);
3351 249 : to_next_boundary = spdk_min(max_size, to_next_boundary);
3352 249 : to_next_boundary_bytes = to_next_boundary * blocklen;
3353 :
3354 249 : iov = &bdev_io->child_iov[child_iovcnt];
3355 249 : iovcnt = 0;
3356 :
3357 249 : if (bdev_io->u.bdev.md_buf) {
3358 48 : md_buf = (char *)bdev_io->u.bdev.md_buf +
3359 24 : (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
3360 24 : }
3361 :
3362 249 : child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
3363 1810 : while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
3364 836 : iovcnt < child_iovsize) {
3365 725 : parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3366 725 : iov_len = parent_iov->iov_len - parent_iov_offset;
3367 :
3368 725 : iov_len = spdk_min(iov_len, max_segment_size);
3369 725 : iov_len = spdk_min(iov_len, to_next_boundary_bytes);
3370 725 : to_next_boundary_bytes -= iov_len;
3371 :
3372 725 : bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3373 725 : bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3374 :
3375 725 : if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3376 183 : parent_iov_offset += iov_len;
3377 183 : } else {
3378 542 : parent_iovpos++;
3379 542 : parent_iov_offset = 0;
3380 : }
3381 725 : child_iovcnt++;
3382 725 : iovcnt++;
3383 : }
3384 :
3385 249 : if (to_next_boundary_bytes > 0) {
3386 : /* We had to stop this child I/O early because we ran out of
3387 : * child_iov space or were limited by max_num_segments.
3388 : * Ensure the iovs are aligned to the block size and
3389 : * then adjust to_next_boundary before starting the
3390 : * child I/O.
3391 : */
3392 111 : assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3393 : iovcnt == child_iovsize);
3394 111 : to_last_block_bytes = to_next_boundary_bytes % blocklen;
3395 111 : if (to_last_block_bytes != 0) {
3396 24 : uint32_t child_iovpos = child_iovcnt - 1;
3397 : /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
3398 : * so the loop will end naturally
3399 : */
3400 :
3401 24 : to_last_block_bytes = blocklen - to_last_block_bytes;
3402 24 : to_next_boundary_bytes += to_last_block_bytes;
3403 53 : while (to_last_block_bytes > 0 && iovcnt > 0) {
3404 32 : iov_len = spdk_min(to_last_block_bytes,
3405 : bdev_io->child_iov[child_iovpos].iov_len);
3406 32 : bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3407 32 : if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3408 15 : child_iovpos--;
3409 15 : if (--iovcnt == 0) {
3410 : /* If the child I/O is smaller than a block, just return.
3411 : * If the first child I/O of any split round is smaller than
3412 : * a block, fail the parent I/O and exit.
3413 : */
3414 3 : if (bdev_io->internal.split.outstanding == 0) {
3415 1 : SPDK_ERRLOG("The first child io was less than a block size\n");
3416 1 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3417 1 : bdev_ch_remove_from_io_submitted(bdev_io);
3418 1 : spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3419 : 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3420 : bdev_io->internal.ch->queue_depth);
3421 1 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3422 1 : }
3423 :
3424 3 : return;
3425 : }
3426 12 : }
3427 :
3428 29 : to_last_block_bytes -= iov_len;
3429 :
3430 29 : if (parent_iov_offset == 0) {
3431 14 : parent_iovpos--;
3432 14 : parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3433 14 : }
3434 29 : parent_iov_offset -= iov_len;
3435 : }
3436 :
3437 21 : assert(to_last_block_bytes == 0);
3438 21 : }
3439 108 : to_next_boundary -= to_next_boundary_bytes / blocklen;
3440 108 : }
3441 :
3442 246 : rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3443 : &current_offset, &remaining);
3444 246 : if (spdk_unlikely(rc)) {
3445 4 : return;
3446 : }
3447 : }
3448 67 : }
3449 :
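 : /* Submit up to SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS child unmap requests
 :  * per round, each covering at most max_unmap * max_unmap_segments blocks. The next
 :  * round is driven by bdev_io_split_done() as children complete.
 :  */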
3450 : static void
3451 3 : bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3452 : {
3453 : uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3454 3 : uint32_t num_children_reqs = 0;
3455 : int rc;
3456 :
3457 3 : assert(bdev_io->internal.f.split);
3458 :
3459 3 : offset = bdev_io->internal.split.current_offset_blocks;
3460 3 : remaining = bdev_io->internal.split.remaining_num_blocks;
3461 3 : max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3462 :
3463 20 : while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3464 17 : unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3465 :
3466 17 : rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3467 : &offset, &remaining);
3468 17 : if (spdk_likely(rc == 0)) {
3469 17 : num_children_reqs++;
3470 17 : } else {
3471 0 : return;
3472 : }
3473 : }
3474 3 : }
3475 :
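 : /* Same pattern as unmap splitting: up to SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS
 :  * child write-zeroes requests per round, each of at most max_write_zeroes blocks.
 :  */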
3476 : static void
3477 6 : bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3478 : {
3479 : uint64_t offset, write_zeroes_blocks, remaining;
3480 6 : uint32_t num_children_reqs = 0;
3481 : int rc;
3482 :
3483 6 : assert(bdev_io->internal.f.split);
3484 :
3485 6 : offset = bdev_io->internal.split.current_offset_blocks;
3486 6 : remaining = bdev_io->internal.split.remaining_num_blocks;
3487 :
3488 29 : while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3489 23 : write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3490 :
3491 23 : rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3492 : &offset, &remaining);
3493 23 : if (spdk_likely(rc == 0)) {
3494 23 : num_children_reqs++;
3495 23 : } else {
3496 0 : return;
3497 : }
3498 : }
3499 6 : }
3500 :
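 : /* Split a COPY into up to SPDK_BDEV_MAX_CHILDREN_COPY_REQS child requests per round,
 :  * each covering at most max_copy blocks.
 :  */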
3501 : static void
3502 4 : bdev_copy_split(struct spdk_bdev_io *bdev_io)
3503 : {
3504 : uint64_t offset, copy_blocks, remaining;
3505 4 : uint32_t num_children_reqs = 0;
3506 : int rc;
3507 :
3508 4 : assert(bdev_io->internal.f.split);
3509 :
3510 4 : offset = bdev_io->internal.split.current_offset_blocks;
3511 4 : remaining = bdev_io->internal.split.remaining_num_blocks;
3512 :
3513 4 : assert(bdev_io->bdev->max_copy != 0);
3514 23 : while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
3515 19 : copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
3516 :
3517 19 : rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
3518 : &offset, &remaining);
3519 19 : if (spdk_likely(rc == 0)) {
3520 19 : num_children_reqs++;
3521 19 : } else {
3522 0 : return;
3523 : }
3524 : }
3525 4 : }
3526 :
3527 : static void
3528 58 : parent_bdev_io_complete(void *ctx, int rc)
3529 : {
3530 58 : struct spdk_bdev_io *parent_io = ctx;
3531 :
3532 58 : if (rc) {
3533 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3534 0 : }
3535 :
3536 116 : parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
3537 58 : parent_io->internal.caller_ctx);
3538 58 : }
3539 :
3540 : static void
3541 0 : bdev_io_complete_parent_sequence_cb(void *ctx, int status)
3542 : {
3543 0 : struct spdk_bdev_io *bdev_io = ctx;
3544 :
3545 : /* u.bdev.accel_sequence should have already been cleared at this point */
3546 0 : assert(bdev_io->u.bdev.accel_sequence == NULL);
3547 0 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
3548 0 : bdev_io->internal.f.has_accel_sequence = false;
3549 :
3550 0 : if (spdk_unlikely(status != 0)) {
3551 0 : SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
3552 0 : }
3553 :
3554 0 : parent_bdev_io_complete(bdev_io, status);
3555 0 : }
3556 :
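 : /* Completion callback for every child of a split I/O. Frees the child; a failed child
 :  * marks the parent failed and stops further splitting. When the last outstanding child
 :  * completes, either finish the parent (running a pending accel sequence or bounce-buffer
 :  * push first) or kick off the next round of splitting for the parent's I/O type.
 :  */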
3557 : static void
3558 301 : bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3559 : {
3560 301 : struct spdk_bdev_io *parent_io = cb_arg;
3561 :
3562 301 : spdk_bdev_free_io(bdev_io);
3563 :
3564 301 : assert(parent_io->internal.f.split);
3565 :
3566 301 : if (!success) {
3567 21 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3568 : /* If any child I/O failed, stop further splitting process. */
3569 21 : parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks;
3570 21 : parent_io->internal.split.remaining_num_blocks = 0;
3571 21 : }
3572 301 : parent_io->internal.split.outstanding--;
3573 301 : if (parent_io->internal.split.outstanding != 0) {
3574 223 : return;
3575 : }
3576 :
3577 : /*
3578 : * Parent I/O finishes when all blocks are consumed.
3579 : */
3580 78 : if (parent_io->internal.split.remaining_num_blocks == 0) {
3581 58 : assert(parent_io->internal.cb != bdev_io_split_done);
3582 58 : bdev_ch_remove_from_io_submitted(parent_io);
3583 58 : spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id,
3584 : 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx,
3585 : parent_io->internal.ch->queue_depth);
3586 :
3587 58 : if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
3588 48 : if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) {
3589 0 : bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb);
3590 0 : return;
3591 48 : } else if (parent_io->internal.f.has_bounce_buf &&
3592 0 : !bdev_io_use_accel_sequence(bdev_io)) {
3593 : /* bdev IO will be completed in the callback */
3594 0 : _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
3595 0 : return;
3596 : }
3597 48 : }
3598 :
3599 58 : parent_bdev_io_complete(parent_io, 0);
3600 58 : return;
3601 : }
3602 :
3603 : /*
3604 : * Continue with the splitting process. This function will complete the parent I/O if the
3605 : * splitting is done.
3606 : */
3607 20 : switch (parent_io->type) {
3608 : case SPDK_BDEV_IO_TYPE_READ:
3609 : case SPDK_BDEV_IO_TYPE_WRITE:
3610 17 : _bdev_rw_split(parent_io);
3611 17 : break;
3612 : case SPDK_BDEV_IO_TYPE_UNMAP:
3613 1 : bdev_unmap_split(parent_io);
3614 1 : break;
3615 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3616 1 : bdev_write_zeroes_split(parent_io);
3617 1 : break;
3618 : case SPDK_BDEV_IO_TYPE_COPY:
3619 1 : bdev_copy_split(parent_io);
3620 1 : break;
3621 : default:
3622 0 : assert(false);
3623 : break;
3624 : }
3625 301 : }
3626 :
3627 : static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3628 : bool success);
3629 :
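 : /* Entry point for submitting a parent I/O that must be split: reset the split
 :  * bookkeeping and dispatch by I/O type. Reads without a data buffer get one via
 :  * spdk_bdev_io_get_buf() before splitting starts.
 :  */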
3630 : static void
3631 59 : bdev_io_split(struct spdk_bdev_io *bdev_io)
3632 : {
3633 59 : assert(bdev_io_should_split(bdev_io));
3634 59 : assert(bdev_io->internal.f.split);
3635 :
3636 59 : bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks;
3637 59 : bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks;
3638 59 : bdev_io->internal.split.outstanding = 0;
3639 59 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
3640 :
3641 59 : switch (bdev_io->type) {
3642 : case SPDK_BDEV_IO_TYPE_READ:
3643 : case SPDK_BDEV_IO_TYPE_WRITE:
3644 49 : if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
3645 49 : _bdev_rw_split(bdev_io);
3646 49 : } else {
3647 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3648 0 : spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
3649 0 : bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io));
3650 : }
3651 49 : break;
3652 : case SPDK_BDEV_IO_TYPE_UNMAP:
3653 2 : bdev_unmap_split(bdev_io);
3654 2 : break;
3655 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3656 5 : bdev_write_zeroes_split(bdev_io);
3657 5 : break;
3658 : case SPDK_BDEV_IO_TYPE_COPY:
3659 3 : bdev_copy_split(bdev_io);
3660 3 : break;
3661 : default:
3662 0 : assert(false);
3663 : break;
3664 : }
3665 59 : }
3666 :
3667 : static void
3668 0 : bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
3669 : {
3670 0 : if (!success) {
3671 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3672 0 : return;
3673 : }
3674 :
3675 0 : _bdev_rw_split(bdev_io);
3676 0 : }
3677 :
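 : /* Final submission step. On the fast path (no channel flags) hand the I/O to the bdev
 :  * module; during a reset complete it as ABORTED; with QoS enabled, queue it (or satisfy
 :  * a matching ABORT) and run the QoS submission path.
 :  */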
3678 : static inline void
3679 595 : _bdev_io_submit(struct spdk_bdev_io *bdev_io)
3680 : {
3681 595 : struct spdk_bdev *bdev = bdev_io->bdev;
3682 595 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3683 :
3684 595 : if (spdk_likely(bdev_ch->flags == 0)) {
3685 570 : bdev_io_do_submit(bdev_ch, bdev_io);
3686 570 : return;
3687 : }
3688 :
3689 25 : if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
3690 2 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3691 25 : } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
3692 23 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
3693 2 : bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) {
3694 0 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3695 0 : } else {
3696 23 : TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link);
3697 23 : bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3698 : }
3699 23 : } else {
3700 0 : SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
3701 0 : _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3702 : }
3703 595 : }
3704 :
3705 : bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
3706 :
3707 : bool
3708 23 : bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
3709 : {
3710 23 : if (range1->length == 0 || range2->length == 0) {
3711 1 : return false;
3712 : }
3713 :
3714 22 : if (range1->offset + range1->length <= range2->offset) {
3715 1 : return false;
3716 : }
3717 :
3718 21 : if (range2->offset + range2->length <= range1->offset) {
3719 3 : return false;
3720 : }
3721 :
3722 18 : return true;
3723 23 : }
3724 :
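 : /* Decide whether this I/O conflicts with a locked LBA range. NVMe passthru is
 :  * conservatively assumed to overlap; reads only conflict with quiesce ranges; otherwise
 :  * the I/O conflicts when it overlaps the range and was not issued by the channel and
 :  * context that hold the lock.
 :  */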
3725 : static bool
3726 11 : bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
3727 : {
3728 11 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3729 : struct lba_range r;
3730 :
3731 11 : switch (bdev_io->type) {
3732 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3733 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3734 : /* Don't try to decode the NVMe command - just assume worst-case and that
3735 : * it overlaps a locked range.
3736 : */
3737 0 : return true;
3738 : case SPDK_BDEV_IO_TYPE_READ:
3739 6 : if (!range->quiesce) {
3740 4 : return false;
3741 : }
3742 : /* fallthrough */
3743 : case SPDK_BDEV_IO_TYPE_WRITE:
3744 : case SPDK_BDEV_IO_TYPE_UNMAP:
3745 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3746 : case SPDK_BDEV_IO_TYPE_ZCOPY:
3747 : case SPDK_BDEV_IO_TYPE_COPY:
3748 7 : r.offset = bdev_io->u.bdev.offset_blocks;
3749 7 : r.length = bdev_io->u.bdev.num_blocks;
3750 7 : if (!bdev_lba_range_overlapped(range, &r)) {
3751 : /* This I/O doesn't overlap the specified LBA range. */
3752 0 : return false;
3753 7 : } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
3754 : /* This I/O overlaps, but the I/O is on the same channel that locked this
3755 : * range, and the caller_ctx is the same as the locked_ctx. This means
3756 : * that this I/O is associated with the lock, and is allowed to execute.
3757 : */
3758 2 : return false;
3759 : } else {
3760 5 : return true;
3761 : }
3762 : default:
3763 0 : return false;
3764 : }
3765 11 : }
3766 :
3767 : void
3768 655 : bdev_io_submit(struct spdk_bdev_io *bdev_io)
3769 : {
3770 655 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3771 :
3772 655 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3773 :
3774 655 : if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3775 : struct lba_range *range;
3776 :
3777 13 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3778 8 : if (bdev_io_range_is_locked(bdev_io, range)) {
3779 3 : TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3780 3 : return;
3781 : }
3782 5 : }
3783 5 : }
3784 :
3785 652 : bdev_ch_add_to_io_submitted(bdev_io);
3786 :
3787 652 : bdev_io->internal.submit_tsc = spdk_get_ticks();
3788 652 : spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START,
3789 : ch->trace_id, bdev_io->u.bdev.num_blocks,
3790 : (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3791 : bdev_io->u.bdev.offset_blocks, ch->queue_depth);
3792 :
3793 652 : if (bdev_io->internal.f.split) {
3794 59 : bdev_io_split(bdev_io);
3795 59 : return;
3796 : }
3797 :
3798 593 : _bdev_io_submit(bdev_io);
3799 655 : }
3800 :
3801 : static inline int
3802 2 : bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io)
3803 : {
3804 2 : struct spdk_bdev *bdev = bdev_io->bdev;
3805 : struct spdk_dif_ctx_init_ext_opts dif_opts;
3806 :
3807 2 : memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error));
3808 :
3809 2 : dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
3810 2 : dif_opts.dif_pi_format = bdev->dif_pi_format;
3811 :
3812 4 : return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx,
3813 2 : bdev->blocklen,
3814 2 : bdev->md_len,
3815 2 : bdev->md_interleave,
3816 2 : bdev->dif_is_head_of_md,
3817 2 : bdev->dif_type,
3818 2 : bdev_io->u.bdev.dif_check_flags,
3819 2 : bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
3820 : 0xFFFF, 0, 0, 0, &dif_opts);
3821 : }
3822 :
3823 : static void
3824 4 : _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3825 : bool success)
3826 : {
3827 4 : if (!success) {
3828 0 : SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
3829 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3830 0 : bdev_io_complete_unsubmitted(bdev_io);
3831 0 : return;
3832 : }
3833 :
3834 4 : if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
3835 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3836 0 : bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3837 0 : return;
3838 : }
3839 : /* For reads we'll execute the sequence after the data is read, so, for now, only
3840 : * clear out accel_sequence pointer and submit the IO */
3841 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3842 0 : bdev_io->u.bdev.accel_sequence = NULL;
3843 0 : }
3844 :
3845 4 : bdev_io_submit(bdev_io);
3846 4 : }
3847 :
3848 : static inline void
3849 4 : _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3850 : {
3851 : /* The bdev doesn't support memory domains, so the buffers in this I/O request can't
3852 : * be accessed directly. We need to allocate bounce buffers before issuing the I/O.
3853 : * For a write, we need to pull the data out of the memory domain before submitting the I/O.
3854 : * Once a read completes, we need to use the memory domain push functionality to
3855 : * update the data in the original memory domain I/O buffer.
3856 : *
3857 : * Likewise, if this I/O request is not aware of metadata, its buffers can't be
3858 : * accessed directly either, so we need to allocate buffers before issuing the I/O.
3859 : * For a write, we need to insert metadata before submitting the I/O. Once a read
3860 : * completes, we need to strip the metadata from the original I/O buffer.
3861 : *
3862 : * This I/O request will go through the regular I/O flow, so clear the memory domain pointers. */
3863 4 : assert(bdev_io_use_memory_domain(bdev_io) ||
3864 : bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3865 :
3866 4 : bdev_io->u.bdev.memory_domain = NULL;
3867 4 : bdev_io->u.bdev.memory_domain_ctx = NULL;
3868 8 : _bdev_io_get_bounce_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3869 4 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3870 4 : }
3871 :
3872 : static inline void
3873 0 : _bdev_io_ext_use_accel_buffer(struct spdk_bdev_io *bdev_io)
3874 : {
3875 0 : assert(bdev_io_use_memory_domain(bdev_io));
3876 0 : assert(bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3877 :
3878 0 : bdev_io->u.bdev.memory_domain = NULL;
3879 0 : bdev_io->u.bdev.memory_domain_ctx = NULL;
3880 0 : bdev_io_get_accel_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3881 0 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3882 0 : }
3883 :
3884 : /* We need to allocate bounce buffer
3885 : * - if bdev doesn't support memory domains,
3886 : * - if it does support them, but we need to execute an accel sequence and the data buffer is
3887 : * from accel memory domain (to avoid doing a push/pull from that domain), or
3888 : * - if IO is not aware of metadata.
3889 : */
3890 : static inline bool
3891 292 : bdev_io_needs_bounce_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3892 : {
3893 292 : if (bdev_io_use_memory_domain(bdev_io)) {
3894 4 : if (!desc->memory_domains_supported ||
3895 0 : (bdev_io_needs_sequence_exec(desc, bdev_io) &&
3896 0 : (bdev_io->internal.memory_domain == spdk_accel_get_memory_domain() ||
3897 0 : bdev_io_needs_metadata(desc, bdev_io)))) {
3898 4 : return true;
3899 : }
3900 :
3901 0 : return false;
3902 : }
3903 :
3904 288 : if (bdev_io_needs_metadata(desc, bdev_io)) {
3905 0 : return true;
3906 : }
3907 :
3908 288 : return false;
3909 292 : }
3910 :
3911 : /* We need to allocate fake accel buffer if bdev supports memory domains but IO is not
3912 : * aware of metadata.
3913 : */
3914 : static inline bool
3915 288 : bdev_io_needs_accel_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3916 : {
3917 288 : if (bdev_io_needs_metadata(desc, bdev_io)) {
3918 0 : assert(bdev_io_use_memory_domain(bdev_io));
3919 0 : return true;
3920 : }
3921 :
3922 288 : return false;
3923 288 : }
3924 :
3925 : static inline void
3926 292 : _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3927 : {
3928 292 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3929 : int rc;
3930 :
3931 292 : if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3932 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3933 0 : bdev_io_complete_unsubmitted(bdev_io);
3934 0 : return;
3935 : }
3936 :
3937 292 : if (bdev_io_needs_metadata(desc, bdev_io)) {
3938 0 : rc = bdev_io_init_dif_ctx(bdev_io);
3939 0 : if (spdk_unlikely(rc != 0)) {
3940 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3941 0 : bdev_io_complete_unsubmitted(bdev_io);
3942 0 : return;
3943 : }
3944 0 : }
3945 :
3946 292 : if (bdev_io_needs_bounce_buffer(desc, bdev_io)) {
3947 4 : _bdev_io_ext_use_bounce_buffer(bdev_io);
3948 4 : return;
3949 : }
3950 :
3951 288 : if (bdev_io_needs_accel_buffer(desc, bdev_io)) {
3952 0 : _bdev_io_ext_use_accel_buffer(bdev_io);
3953 0 : return;
3954 : }
3955 :
3956 288 : if (bdev_io_needs_sequence_exec(desc, bdev_io)) {
3957 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3958 0 : bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3959 0 : return;
3960 : }
3961 : /* For reads we'll execute the sequence after the data is read, so, for now, only
3962 : * clear out accel_sequence pointer and submit the IO */
3963 0 : assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3964 0 : bdev_io->u.bdev.accel_sequence = NULL;
3965 0 : }
3966 :
3967 288 : bdev_io_submit(bdev_io);
3968 292 : }
3969 :
3970 : static void
3971 13 : bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
3972 : {
3973 13 : struct spdk_bdev *bdev = bdev_io->bdev;
3974 13 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3975 13 : struct spdk_io_channel *ch = bdev_ch->channel;
3976 :
3977 13 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3978 :
3979 13 : bdev_io->internal.f.in_submit_request = true;
3980 13 : bdev_submit_request(bdev, ch, bdev_io);
3981 13 : bdev_io->internal.f.in_submit_request = false;
3982 13 : }
3983 :
3984 : void
3985 710 : bdev_io_init(struct spdk_bdev_io *bdev_io,
3986 : struct spdk_bdev *bdev, void *cb_arg,
3987 : spdk_bdev_io_completion_cb cb)
3988 : {
3989 710 : bdev_io->bdev = bdev;
3990 710 : bdev_io->internal.f.raw = 0;
3991 710 : bdev_io->internal.caller_ctx = cb_arg;
3992 710 : bdev_io->internal.cb = cb;
3993 710 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3994 710 : bdev_io->internal.f.in_submit_request = false;
3995 710 : bdev_io->internal.error.nvme.cdw0 = 0;
3996 710 : bdev_io->num_retries = 0;
3997 710 : bdev_io->internal.get_buf_cb = NULL;
3998 710 : bdev_io->internal.get_aux_buf_cb = NULL;
3999 710 : bdev_io->internal.data_transfer_cpl = NULL;
4000 710 : bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
4001 710 : }
4002 :
4003 : static bool
4004 543 : bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
4005 : {
4006 543 : return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
4007 : }
4008 :
4009 : bool
4010 179 : spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
4011 : {
4012 : bool supported;
4013 :
4014 179 : supported = bdev_io_type_supported(bdev, io_type);
4015 :
4016 179 : if (!supported) {
4017 7 : switch (io_type) {
4018 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
4019 : /* The bdev layer will emulate write zeroes as long as write is supported. */
4020 0 : supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
4021 0 : break;
4022 : default:
4023 7 : break;
4024 : }
4025 7 : }
4026 :
4027 179 : return supported;
4028 : }
4029 :
4030 : static const char *g_io_type_strings[] = {
4031 : [SPDK_BDEV_IO_TYPE_READ] = "read",
4032 : [SPDK_BDEV_IO_TYPE_WRITE] = "write",
4033 : [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap",
4034 : [SPDK_BDEV_IO_TYPE_FLUSH] = "flush",
4035 : [SPDK_BDEV_IO_TYPE_RESET] = "reset",
4036 : [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin",
4037 : [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io",
4038 : [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md",
4039 : [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes",
4040 : [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy",
4041 : [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info",
4042 : [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management",
4043 : [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append",
4044 : [SPDK_BDEV_IO_TYPE_COMPARE] = "compare",
4045 : [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write",
4046 : [SPDK_BDEV_IO_TYPE_ABORT] = "abort",
4047 : [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole",
4048 : [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data",
4049 : [SPDK_BDEV_IO_TYPE_COPY] = "copy",
4050 : [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md",
4051 : };
4052 :
4053 : const char *
4054 0 : spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type)
4055 : {
4056 0 : if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) {
4057 0 : return NULL;
4058 : }
4059 :
4060 0 : return g_io_type_strings[io_type];
4061 0 : }
4062 :
4063 : int
4064 0 : spdk_bdev_get_io_type(const char *io_type_string)
4065 : {
4066 : int i;
4067 :
4068 0 : for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
4069 0 : if (!strcmp(io_type_string, g_io_type_strings[i])) {
4070 0 : return i;
4071 : }
4072 0 : }
4073 :
4074 0 : return -1;
4075 0 : }
4076 :
4077 : uint64_t
4078 0 : spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
4079 : {
4080 0 : return bdev_io->internal.submit_tsc;
4081 : }
4082 :
4083 : bool
4084 0 : spdk_bdev_io_hide_metadata(struct spdk_bdev_io *bdev_io)
4085 : {
4086 0 : return bdev_io->internal.desc->opts.hide_metadata;
4087 : }
4088 :
4089 : int
4090 0 : spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
4091 : {
4092 0 : if (bdev->fn_table->dump_info_json) {
4093 0 : return bdev->fn_table->dump_info_json(bdev->ctxt, w);
4094 : }
4095 :
4096 0 : return 0;
4097 0 : }
4098 :
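 : /* Convert each configured per-second rate limit into a per-timeslice quota (no smaller
 :  * than the per-type minimum), publish it as the remaining budget for the current
 :  * timeslice, and refresh the QoS ops.
 :  */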
4099 : static void
4100 10 : bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
4101 : {
4102 10 : uint32_t max_per_timeslice = 0;
4103 : int i;
4104 :
4105 50 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4106 40 : if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4107 15 : qos->rate_limits[i].max_per_timeslice = 0;
4108 15 : continue;
4109 : }
4110 :
4111 50 : max_per_timeslice = qos->rate_limits[i].limit *
4112 25 : SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
4113 :
4114 25 : qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
4115 : qos->rate_limits[i].min_per_timeslice);
4116 :
4117 50 : __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4118 25 : qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
4119 25 : }
4120 :
4121 10 : bdev_qos_set_ops(qos);
4122 10 : }
4123 :
4124 : static void
4125 4 : bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4126 : struct spdk_io_channel *io_ch, void *ctx)
4127 : {
4128 4 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4129 : int status;
4130 :
4131 4 : bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
4132 :
4133 : /* if all IOs were sent then continue the iteration, otherwise - stop it */
4134 : /* TODO: channels round robing */
4135 4 : status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
4136 :
4137 4 : spdk_bdev_for_each_channel_continue(i, status);
4138 4 : }
4139 :
4140 :
4141 : static void
4142 2 : bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
4143 : {
4144 :
4145 2 : }
4146 :
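 : /* QoS poller, runs once per timeslice. Zeroes each limit's remaining budget (carrying
 :  * any overrun from the previous timeslice forward as a debt), credits the per-timeslice
 :  * quota for every elapsed timeslice, then walks all channels to submit queued QoS I/O.
 :  */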
4147 : static int
4148 3 : bdev_channel_poll_qos(void *arg)
4149 : {
4150 3 : struct spdk_bdev *bdev = arg;
4151 3 : struct spdk_bdev_qos *qos = bdev->internal.qos;
4152 3 : uint64_t now = spdk_get_ticks();
4153 : int i;
4154 : int64_t remaining_last_timeslice;
4155 :
4156 3 : if (spdk_unlikely(qos->thread == NULL)) {
4157 : /* Old QoS was unbound to remove and new QoS is not enabled yet. */
4158 1 : return SPDK_POLLER_IDLE;
4159 : }
4160 :
4161 2 : if (now < (qos->last_timeslice + qos->timeslice_size)) {
4162 : /* We received our callback earlier than expected - return
4163 : * immediately and wait to do accounting until at least one
4164 : * timeslice has actually expired. This should never happen
4165 : * with a well-behaved timer implementation.
4166 : */
4167 0 : return SPDK_POLLER_IDLE;
4168 : }
4169 :
4170 : /* Reset for next round of rate limiting */
4171 10 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4172 : /* We may have allowed the IOs or bytes to slightly overrun in the last
4173 : * timeslice. remaining_this_timeslice is signed, so if it's negative
4174 : * here, we'll account for the overrun so that the next timeslice will
4175 : * be appropriately reduced.
4176 : */
4177 8 : remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
4178 : 0, __ATOMIC_RELAXED);
4179 8 : if (remaining_last_timeslice < 0) {
4180 : /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
4181 : * potentially use 2 atomic ops each, so they can intertwine.
4182 : * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
4183 : */
4184 0 : __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4185 0 : remaining_last_timeslice, __ATOMIC_RELAXED);
4186 0 : }
4187 8 : }
4188 :
4189 4 : while (now >= (qos->last_timeslice + qos->timeslice_size)) {
4190 2 : qos->last_timeslice += qos->timeslice_size;
4191 10 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4192 16 : __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice,
4193 8 : qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED);
4194 8 : }
4195 : }
4196 :
4197 2 : spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos,
4198 : bdev_channel_submit_qos_io_done);
4199 :
4200 2 : return SPDK_POLLER_BUSY;
4201 3 : }
4202 :
4203 : static void
4204 76 : bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
4205 : {
4206 : struct spdk_bdev_shared_resource *shared_resource;
4207 : struct lba_range *range;
4208 :
4209 76 : bdev_free_io_stat(ch->stat);
4210 : #ifdef SPDK_CONFIG_VTUNE
4211 : bdev_free_io_stat(ch->prev_stat);
4212 : #endif
4213 :
4214 76 : while (!TAILQ_EMPTY(&ch->locked_ranges)) {
4215 0 : range = TAILQ_FIRST(&ch->locked_ranges);
4216 0 : TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
4217 0 : free(range);
4218 : }
4219 :
4220 76 : spdk_put_io_channel(ch->channel);
4221 76 : spdk_put_io_channel(ch->accel_channel);
4222 :
4223 76 : shared_resource = ch->shared_resource;
4224 :
4225 76 : assert(TAILQ_EMPTY(&ch->io_locked));
4226 76 : assert(TAILQ_EMPTY(&ch->io_submitted));
4227 76 : assert(TAILQ_EMPTY(&ch->io_accel_exec));
4228 76 : assert(TAILQ_EMPTY(&ch->io_memory_domain));
4229 76 : assert(ch->io_outstanding == 0);
4230 76 : assert(shared_resource->ref > 0);
4231 76 : shared_resource->ref--;
4232 76 : if (shared_resource->ref == 0) {
4233 75 : assert(shared_resource->io_outstanding == 0);
4234 75 : TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
4235 75 : spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
4236 75 : spdk_poller_unregister(&shared_resource->nomem_poller);
4237 75 : free(shared_resource);
4238 75 : }
4239 76 : }
4240 :
4241 : static void
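 : /* Called with bdev->internal.spinlock held. If rate limits are configured and no QoS
 :  * channel has been selected yet, elect this channel as the QoS channel, size the
 :  * per-timeslice quotas and register the QoS poller; in all QoS cases mark the channel
 :  * BDEV_CH_QOS_ENABLED.
 :  */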
4242 85 : bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
4243 : {
4244 85 : struct spdk_bdev_qos *qos = bdev->internal.qos;
4245 : int i;
4246 :
4247 85 : assert(spdk_spin_held(&bdev->internal.spinlock));
4248 :
4249 : /* Rate limiting on this bdev enabled */
4250 85 : if (qos) {
4251 17 : if (qos->ch == NULL) {
4252 : struct spdk_io_channel *io_ch;
4253 :
4254 9 : SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
4255 : bdev->name, spdk_get_thread());
4256 :
4257 : /* No qos channel has been selected, so set one up */
4258 :
4259 : /* Take another reference to ch */
4260 9 : io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
4261 9 : assert(io_ch != NULL);
4262 9 : qos->ch = ch;
4263 :
4264 9 : qos->thread = spdk_io_channel_get_thread(io_ch);
4265 :
4266 45 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4267 36 : if (bdev_qos_is_iops_rate_limit(i) == true) {
4268 9 : qos->rate_limits[i].min_per_timeslice =
4269 : SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
4270 9 : } else {
4271 27 : qos->rate_limits[i].min_per_timeslice =
4272 : SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
4273 : }
4274 :
4275 36 : if (qos->rate_limits[i].limit == 0) {
4276 2 : qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
4277 2 : }
4278 36 : }
4279 9 : bdev_qos_update_max_quota_per_timeslice(qos);
4280 9 : qos->timeslice_size =
4281 9 : SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
4282 9 : qos->last_timeslice = spdk_get_ticks();
4283 9 : qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
4284 : bdev,
4285 : SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
4286 9 : }
4287 :
4288 17 : ch->flags |= BDEV_CH_QOS_ENABLED;
4289 17 : }
4290 85 : }
4291 :
4292 : struct poll_timeout_ctx {
4293 : struct spdk_bdev_desc *desc;
4294 : uint64_t timeout_in_sec;
4295 : spdk_bdev_io_timeout_cb cb_fn;
4296 : void *cb_arg;
4297 : };
4298 :
4299 : static void
4300 280 : bdev_desc_free(struct spdk_bdev_desc *desc)
4301 : {
4302 280 : spdk_spin_destroy(&desc->spinlock);
4303 280 : free(desc->media_events_buffer);
4304 280 : free(desc);
4305 280 : }
4306 :
4307 : static void
4308 8 : bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
4309 : {
4310 8 : struct poll_timeout_ctx *ctx = _ctx;
4311 8 : struct spdk_bdev_desc *desc = ctx->desc;
4312 :
4313 8 : free(ctx);
4314 :
4315 8 : spdk_spin_lock(&desc->spinlock);
4316 8 : desc->refs--;
4317 8 : if (desc->closed == true && desc->refs == 0) {
4318 1 : spdk_spin_unlock(&desc->spinlock);
4319 1 : bdev_desc_free(desc);
4320 1 : return;
4321 : }
4322 7 : spdk_spin_unlock(&desc->spinlock);
4323 8 : }
4324 :
4325 : static void
4326 13 : bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4327 : struct spdk_io_channel *io_ch, void *_ctx)
4328 : {
4329 13 : struct poll_timeout_ctx *ctx = _ctx;
4330 13 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4331 13 : struct spdk_bdev_desc *desc = ctx->desc;
4332 : struct spdk_bdev_io *bdev_io;
4333 : uint64_t now;
4334 :
4335 13 : spdk_spin_lock(&desc->spinlock);
4336 13 : if (desc->closed == true) {
4337 1 : spdk_spin_unlock(&desc->spinlock);
4338 1 : spdk_bdev_for_each_channel_continue(i, -1);
4339 1 : return;
4340 : }
4341 12 : spdk_spin_unlock(&desc->spinlock);
4342 :
4343 12 : now = spdk_get_ticks();
4344 22 : TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
4345 : /* Exclude any I/O that are generated via splitting. */
4346 15 : if (bdev_io->internal.cb == bdev_io_split_done) {
4347 3 : continue;
4348 : }
4349 :
4350 : /* Once we find an I/O that has not timed out, we can immediately
4351 : * exit the loop.
4352 : */
4353 24 : if (now < (bdev_io->internal.submit_tsc +
4354 12 : ctx->timeout_in_sec * spdk_get_ticks_hz())) {
4355 5 : goto end;
4356 : }
4357 :
4358 7 : if (bdev_io->internal.desc == desc) {
4359 7 : ctx->cb_fn(ctx->cb_arg, bdev_io);
4360 7 : }
4361 14 : }
4362 :
4363 : end:
4364 12 : spdk_bdev_for_each_channel_continue(i, 0);
4365 13 : }
4366 :
4367 : static int
4368 8 : bdev_poll_timeout_io(void *arg)
4369 : {
4370 8 : struct spdk_bdev_desc *desc = arg;
4371 8 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4372 : struct poll_timeout_ctx *ctx;
4373 :
4374 8 : ctx = calloc(1, sizeof(struct poll_timeout_ctx));
4375 8 : if (!ctx) {
4376 0 : SPDK_ERRLOG("failed to allocate memory\n");
4377 0 : return SPDK_POLLER_BUSY;
4378 : }
4379 8 : ctx->desc = desc;
4380 8 : ctx->cb_arg = desc->cb_arg;
4381 8 : ctx->cb_fn = desc->cb_fn;
4382 8 : ctx->timeout_in_sec = desc->timeout_in_sec;
4383 :
4384 : /* Take a ref on the descriptor in case it gets closed while we are checking
4385 : * all of the channels.
4386 : */
4387 8 : spdk_spin_lock(&desc->spinlock);
4388 8 : desc->refs++;
4389 8 : spdk_spin_unlock(&desc->spinlock);
4390 :
4391 8 : spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
4392 : bdev_channel_poll_timeout_io_done);
4393 :
4394 8 : return SPDK_POLLER_BUSY;
4395 8 : }
4396 :
4397 : int
4398 5 : spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
4399 : spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
4400 : {
4401 5 : assert(desc->thread == spdk_get_thread());
4402 :
4403 5 : spdk_poller_unregister(&desc->io_timeout_poller);
4404 :
4405 5 : if (timeout_in_sec) {
4406 4 : assert(cb_fn != NULL);
4407 4 : desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
4408 : desc,
4409 : SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
4410 : 1000);
4411 4 : if (desc->io_timeout_poller == NULL) {
4412 0 : SPDK_ERRLOG("can not register the desc timeout IO poller\n");
4413 0 : return -1;
4414 : }
4415 4 : }
4416 :
4417 5 : desc->cb_fn = cb_fn;
4418 5 : desc->cb_arg = cb_arg;
4419 5 : desc->timeout_in_sec = timeout_in_sec;
4420 :
4421 5 : return 0;
4422 5 : }
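 : /* Illustrative usage sketch (not part of bdev.c; "g_desc" and "my_timeout_cb" are
 :  * hypothetical names): a caller on the descriptor's thread could arm a 30-second
 :  * I/O timeout like this:
 :  *
 :  *     static void
 :  *     my_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
 :  *     {
 :  *             SPDK_ERRLOG("bdev I/O exceeded the configured timeout\n");
 :  *     }
 :  *
 :  *     if (spdk_bdev_set_timeout(g_desc, 30, my_timeout_cb, NULL) != 0) {
 :  *             SPDK_ERRLOG("failed to register the I/O timeout poller\n");
 :  *     }
 :  */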
4423 :
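 : /* I/O channel constructor for a bdev. Acquires the module's channel and an accel
 :  * channel, finds or allocates the shared_resource tied to the management channel,
 :  * initializes the per-channel queues and stats, enables QoS if configured, and clones
 :  * any bdev-level locked LBA ranges into this channel.
 :  */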
4424 : static int
4425 78 : bdev_channel_create(void *io_device, void *ctx_buf)
4426 : {
4427 78 : struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
4428 78 : struct spdk_bdev_channel *ch = ctx_buf;
4429 : struct spdk_io_channel *mgmt_io_ch;
4430 : struct spdk_bdev_mgmt_channel *mgmt_ch;
4431 : struct spdk_bdev_shared_resource *shared_resource;
4432 : struct lba_range *range;
4433 :
4434 78 : ch->bdev = bdev;
4435 78 : ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
4436 78 : if (!ch->channel) {
4437 2 : return -1;
4438 : }
4439 :
4440 76 : ch->accel_channel = spdk_accel_get_io_channel();
4441 76 : if (!ch->accel_channel) {
4442 0 : spdk_put_io_channel(ch->channel);
4443 0 : return -1;
4444 : }
4445 :
4446 76 : spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0,
4447 : spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4448 :
4449 76 : assert(ch->histogram == NULL);
4450 76 : if (bdev->internal.histogram_enabled) {
4451 0 : ch->histogram = spdk_histogram_data_alloc();
4452 0 : if (ch->histogram == NULL) {
4453 0 : SPDK_ERRLOG("Could not allocate histogram\n");
4454 0 : }
4455 0 : }
4456 :
4457 76 : mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
4458 76 : if (!mgmt_io_ch) {
4459 0 : spdk_put_io_channel(ch->channel);
4460 0 : spdk_put_io_channel(ch->accel_channel);
4461 0 : return -1;
4462 : }
4463 :
4464 76 : mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
4465 78 : TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
4466 3 : if (shared_resource->shared_ch == ch->channel) {
4467 1 : spdk_put_io_channel(mgmt_io_ch);
4468 1 : shared_resource->ref++;
4469 1 : break;
4470 : }
4471 2 : }
4472 :
4473 76 : if (shared_resource == NULL) {
4474 75 : shared_resource = calloc(1, sizeof(*shared_resource));
4475 75 : if (shared_resource == NULL) {
4476 0 : spdk_put_io_channel(ch->channel);
4477 0 : spdk_put_io_channel(ch->accel_channel);
4478 0 : spdk_put_io_channel(mgmt_io_ch);
4479 0 : return -1;
4480 : }
4481 :
4482 75 : shared_resource->mgmt_ch = mgmt_ch;
4483 75 : shared_resource->io_outstanding = 0;
4484 75 : TAILQ_INIT(&shared_resource->nomem_io);
4485 75 : shared_resource->nomem_threshold = 0;
4486 75 : shared_resource->shared_ch = ch->channel;
4487 75 : shared_resource->ref = 1;
4488 75 : TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
4489 75 : }
4490 :
4491 76 : ch->io_outstanding = 0;
4492 76 : TAILQ_INIT(&ch->locked_ranges);
4493 76 : TAILQ_INIT(&ch->qos_queued_io);
4494 76 : ch->flags = 0;
4495 76 : ch->trace_id = bdev->internal.trace_id;
4496 76 : ch->shared_resource = shared_resource;
4497 :
4498 76 : TAILQ_INIT(&ch->io_submitted);
4499 76 : TAILQ_INIT(&ch->io_locked);
4500 76 : TAILQ_INIT(&ch->io_accel_exec);
4501 76 : TAILQ_INIT(&ch->io_memory_domain);
4502 :
4503 76 : ch->stat = bdev_alloc_io_stat(false);
4504 76 : if (ch->stat == NULL) {
4505 0 : bdev_channel_destroy_resource(ch);
4506 0 : return -1;
4507 : }
4508 :
4509 76 : ch->stat->ticks_rate = spdk_get_ticks_hz();
4510 :
4511 : #ifdef SPDK_CONFIG_VTUNE
4512 : {
4513 : char *name;
4514 : __itt_init_ittlib(NULL, 0);
4515 : name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
4516 : if (!name) {
4517 : bdev_channel_destroy_resource(ch);
4518 : return -1;
4519 : }
4520 : ch->handle = __itt_string_handle_create(name);
4521 : free(name);
4522 : ch->start_tsc = spdk_get_ticks();
4523 : ch->interval_tsc = spdk_get_ticks_hz() / 100;
4524 : ch->prev_stat = bdev_alloc_io_stat(false);
4525 : if (ch->prev_stat == NULL) {
4526 : bdev_channel_destroy_resource(ch);
4527 : return -1;
4528 : }
4529 : }
4530 : #endif
4531 :
4532 76 : spdk_spin_lock(&bdev->internal.spinlock);
4533 76 : bdev_enable_qos(bdev, ch);
4534 :
4535 77 : TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
4536 : struct lba_range *new_range;
4537 :
4538 1 : new_range = calloc(1, sizeof(*new_range));
4539 1 : if (new_range == NULL) {
4540 0 : spdk_spin_unlock(&bdev->internal.spinlock);
4541 0 : bdev_channel_destroy_resource(ch);
4542 0 : return -1;
4543 : }
4544 1 : new_range->length = range->length;
4545 1 : new_range->offset = range->offset;
4546 1 : new_range->locked_ctx = range->locked_ctx;
4547 1 : TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
4548 1 : }
4549 :
4550 76 : spdk_spin_unlock(&bdev->internal.spinlock);
4551 :
4552 76 : return 0;
4553 78 : }
4554 :
4555 : static int
4556 0 : bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
4557 : void *cb_ctx)
4558 : {
4559 0 : struct spdk_bdev_channel *bdev_ch = cb_ctx;
4560 : struct spdk_bdev_io *bdev_io;
4561 : uint64_t buf_len;
4562 :
4563 0 : bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4564 0 : if (bdev_io->internal.ch == bdev_ch) {
4565 0 : buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4566 0 : spdk_iobuf_entry_abort(ch, entry, buf_len);
4567 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4568 0 : }
4569 :
4570 0 : return 0;
4571 : }
4572 :
4573 : /*
4574 : * Abort I/O that are waiting on a data buffer.
4575 : */
4576 : static void
4577 100 : bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
4578 : {
4579 100 : spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch);
4580 100 : }
4581 :
4582 : /*
4583 : * Abort I/O that are queued waiting for submission. These types of I/O are
4584 : * linked using the spdk_bdev_io link TAILQ_ENTRY.
4585 : */
4586 : static void
4587 102 : bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
4588 : {
4589 : struct spdk_bdev_io *bdev_io, *tmp;
4590 :
4591 157 : TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
4592 55 : if (bdev_io->internal.ch == ch) {
4593 55 : TAILQ_REMOVE(queue, bdev_io, internal.link);
4594 : /*
4595 : * spdk_bdev_io_complete() assumes that the completed I/O had
4596 : * been submitted to the bdev module. Since in this case it
4597 : * hadn't, bump io_outstanding to account for the decrement
4598 : * that spdk_bdev_io_complete() will do.
4599 : */
4600 55 : if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
4601 55 : bdev_io_increment_outstanding(ch, ch->shared_resource);
4602 55 : }
4603 55 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4604 55 : }
4605 55 : }
4606 102 : }
4607 :
4608 : static inline void
4609 100 : bdev_abort_all_nomem_io(struct spdk_bdev_channel *ch)
4610 : {
4611 100 : struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4612 :
4613 100 : shared_resource->abort_in_progress = true;
4614 100 : bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
4615 100 : shared_resource->abort_in_progress = false;
4616 100 : }
4617 :
4618 : static bool
4619 18 : bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
4620 : {
4621 : struct spdk_bdev_io *bdev_io;
4622 :
4623 18 : TAILQ_FOREACH(bdev_io, queue, internal.link) {
4624 0 : if (bdev_io == bio_to_abort) {
4625 0 : TAILQ_REMOVE(queue, bio_to_abort, internal.link);
4626 0 : spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4627 0 : return true;
4628 : }
4629 0 : }
4630 :
4631 18 : return false;
4632 18 : }
4633 :
4634 : static int
4635 0 : bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
4636 : {
4637 0 : struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
4638 : uint64_t buf_len;
4639 :
4640 0 : bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4641 0 : if (bdev_io == bio_to_abort) {
4642 0 : buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4643 0 : spdk_iobuf_entry_abort(ch, entry, buf_len);
4644 0 : spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4645 0 : return 1;
4646 : }
4647 :
4648 0 : return 0;
4649 0 : }
4650 :
4651 : static bool
4652 16 : bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
4653 : {
4654 : int rc;
4655 :
4656 16 : rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort);
4657 16 : return rc == 1;
4658 : }
4659 :
4660 : static void
4661 7 : bdev_qos_channel_destroy(void *cb_arg)
4662 : {
4663 7 : struct spdk_bdev_qos *qos = cb_arg;
4664 :
4665 7 : spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
4666 7 : spdk_poller_unregister(&qos->poller);
4667 :
4668 7 : SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
4669 :
4670 7 : free(qos);
4671 7 : }
4672 :
4673 : static int
4674 7 : bdev_qos_destroy(struct spdk_bdev *bdev)
4675 : {
4676 : int i;
4677 :
4678 : /*
4679 : * Cleanly shutting down the QoS poller is tricky, because
4680 : * during the asynchronous operation the user could open
4681 : * a new descriptor and create a new channel, spawning
4682 : * a new QoS poller.
4683 : *
4684 : * The strategy is to create a new QoS structure here and swap it
4685 : * in. The shutdown path then continues to refer to the old one
4686 : * until it completes and then releases it.
4687 : */
4688 : struct spdk_bdev_qos *new_qos, *old_qos;
4689 :
4690 7 : old_qos = bdev->internal.qos;
4691 :
4692 7 : new_qos = calloc(1, sizeof(*new_qos));
4693 7 : if (!new_qos) {
4694 0 : SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
4695 0 : return -ENOMEM;
4696 : }
4697 :
4698 : /* Copy the old QoS data into the newly allocated structure */
4699 7 : memcpy(new_qos, old_qos, sizeof(*new_qos));
4700 :
4701 : /* Zero out the key parts of the QoS structure */
4702 7 : new_qos->ch = NULL;
4703 7 : new_qos->thread = NULL;
4704 7 : new_qos->poller = NULL;
4705 : /*
4706 : * The limit member of spdk_bdev_qos_limit structure is not zeroed.
4707 : * It will be used later for the new QoS structure.
4708 : */
4709 35 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4710 28 : new_qos->rate_limits[i].remaining_this_timeslice = 0;
4711 28 : new_qos->rate_limits[i].min_per_timeslice = 0;
4712 28 : new_qos->rate_limits[i].max_per_timeslice = 0;
4713 28 : }
4714 :
4715 7 : bdev->internal.qos = new_qos;
4716 :
4717 7 : if (old_qos->thread == NULL) {
4718 0 : free(old_qos);
4719 0 : } else {
4720 7 : spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
4721 : }
4722 :
4723 : /* It is safe to continue with destroying the bdev even though the QoS channel hasn't
4724 : * been destroyed yet. The destruction path will end up waiting for the final
4725 : * channel to be put before it releases resources. */
4726 :
4727 7 : return 0;
4728 7 : }
4729 :
4730 : void
4731 80 : spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
4732 : {
4733 80 : total->bytes_read += add->bytes_read;
4734 80 : total->num_read_ops += add->num_read_ops;
4735 80 : total->bytes_written += add->bytes_written;
4736 80 : total->num_write_ops += add->num_write_ops;
4737 80 : total->bytes_unmapped += add->bytes_unmapped;
4738 80 : total->num_unmap_ops += add->num_unmap_ops;
4739 80 : total->bytes_copied += add->bytes_copied;
4740 80 : total->num_copy_ops += add->num_copy_ops;
4741 80 : total->read_latency_ticks += add->read_latency_ticks;
4742 80 : total->write_latency_ticks += add->write_latency_ticks;
4743 80 : total->unmap_latency_ticks += add->unmap_latency_ticks;
4744 80 : total->copy_latency_ticks += add->copy_latency_ticks;
4745 80 : if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
4746 7 : total->max_read_latency_ticks = add->max_read_latency_ticks;
4747 7 : }
4748 80 : if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
4749 39 : total->min_read_latency_ticks = add->min_read_latency_ticks;
4750 39 : }
4751 80 : if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
4752 4 : total->max_write_latency_ticks = add->max_write_latency_ticks;
4753 4 : }
4754 80 : if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
4755 24 : total->min_write_latency_ticks = add->min_write_latency_ticks;
4756 24 : }
4757 80 : if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
4758 0 : total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
4759 0 : }
4760 80 : if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
4761 3 : total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
4762 3 : }
4763 80 : if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
4764 0 : total->max_copy_latency_ticks = add->max_copy_latency_ticks;
4765 0 : }
4766 80 : if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
4767 4 : total->min_copy_latency_ticks = add->min_copy_latency_ticks;
4768 4 : }
4769 80 : }
4770 :
4771 : static void
4772 5 : bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
4773 : {
4774 5 : memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error));
4775 :
4776 5 : if (to_stat->io_error != NULL && from_stat->io_error != NULL) {
4777 0 : memcpy(to_stat->io_error, from_stat->io_error,
4778 : sizeof(struct spdk_bdev_io_error_stat));
4779 0 : }
4780 5 : }
4781 :
4782 : void
4783 218 : spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode)
4784 : {
4785 218 : if (mode == SPDK_BDEV_RESET_STAT_NONE) {
4786 5 : return;
4787 : }
4788 :
4789 213 : stat->max_read_latency_ticks = 0;
4790 213 : stat->min_read_latency_ticks = UINT64_MAX;
4791 213 : stat->max_write_latency_ticks = 0;
4792 213 : stat->min_write_latency_ticks = UINT64_MAX;
4793 213 : stat->max_unmap_latency_ticks = 0;
4794 213 : stat->min_unmap_latency_ticks = UINT64_MAX;
4795 213 : stat->max_copy_latency_ticks = 0;
4796 213 : stat->min_copy_latency_ticks = UINT64_MAX;
4797 :
4798 213 : if (mode != SPDK_BDEV_RESET_STAT_ALL) {
4799 2 : return;
4800 : }
4801 :
4802 211 : stat->bytes_read = 0;
4803 211 : stat->num_read_ops = 0;
4804 211 : stat->bytes_written = 0;
4805 211 : stat->num_write_ops = 0;
4806 211 : stat->bytes_unmapped = 0;
4807 211 : stat->num_unmap_ops = 0;
4808 211 : stat->bytes_copied = 0;
4809 211 : stat->num_copy_ops = 0;
4810 211 : stat->read_latency_ticks = 0;
4811 211 : stat->write_latency_ticks = 0;
4812 211 : stat->unmap_latency_ticks = 0;
4813 211 : stat->copy_latency_ticks = 0;
4814 :
4815 211 : if (stat->io_error != NULL) {
4816 134 : memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat));
4817 134 : }
4818 218 : }
4819 :
4820 : struct spdk_bdev_io_stat *
4821 209 : bdev_alloc_io_stat(bool io_error_stat)
4822 : {
4823 : struct spdk_bdev_io_stat *stat;
4824 :
4825 209 : stat = malloc(sizeof(struct spdk_bdev_io_stat));
4826 209 : if (stat == NULL) {
4827 0 : return NULL;
4828 : }
4829 :
4830 209 : if (io_error_stat) {
4831 133 : stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat));
4832 133 : if (stat->io_error == NULL) {
4833 0 : free(stat);
4834 0 : return NULL;
4835 : }
4836 133 : } else {
4837 76 : stat->io_error = NULL;
4838 : }
4839 :
4840 209 : spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL);
4841 :
4842 209 : return stat;
4843 209 : }
4844 :
4845 : void
4846 209 : bdev_free_io_stat(struct spdk_bdev_io_stat *stat)
4847 : {
4848 209 : if (stat != NULL) {
4849 209 : free(stat->io_error);
4850 209 : free(stat);
4851 209 : }
4852 209 : }
4853 :
4854 : void
4855 0 : spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
4856 : {
4857 : int i;
4858 :
4859 0 : spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
4860 0 : spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
4861 0 : spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
4862 0 : spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
4863 0 : spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
4864 0 : spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
4865 0 : spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
4866 0 : spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
4867 0 : spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
4868 0 : spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
4869 0 : spdk_json_write_named_uint64(w, "min_read_latency_ticks",
4870 0 : stat->min_read_latency_ticks != UINT64_MAX ?
4871 0 : stat->min_read_latency_ticks : 0);
4872 0 : spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
4873 0 : spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
4874 0 : spdk_json_write_named_uint64(w, "min_write_latency_ticks",
4875 0 : stat->min_write_latency_ticks != UINT64_MAX ?
4876 0 : stat->min_write_latency_ticks : 0);
4877 0 : spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
4878 0 : spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
4879 0 : spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
4880 0 : stat->min_unmap_latency_ticks != UINT64_MAX ?
4881 0 : stat->min_unmap_latency_ticks : 0);
4882 0 : spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
4883 0 : spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
4884 0 : spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
4885 0 : stat->min_copy_latency_ticks != UINT64_MAX ?
4886 0 : stat->min_copy_latency_ticks : 0);
4887 :
4888 0 : if (stat->io_error != NULL) {
4889 0 : spdk_json_write_named_object_begin(w, "io_error");
4890 0 : for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
4891 0 : if (stat->io_error->error_status[i] != 0) {
4892 0 : spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
4893 0 : stat->io_error->error_status[i]);
4894 0 : }
4895 0 : }
4896 0 : spdk_json_write_object_end(w);
4897 0 : }
4898 0 : }
4899 :
4900 : static void
4901 80 : bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
4902 : {
4903 80 : struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4904 80 : struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
4905 :
4906 80 : bdev_abort_all_nomem_io(ch);
4907 80 : bdev_abort_all_buf_io(mgmt_ch, ch);
4908 80 : }
4909 :
4910 : static void
4911 76 : bdev_channel_destroy(void *io_device, void *ctx_buf)
4912 : {
4913 76 : struct spdk_bdev_channel *ch = ctx_buf;
4914 :
4915 76 : SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
4916 : spdk_get_thread());
4917 :
4918 76 : spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0,
4919 : spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4920 :
4921 : /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
4922 76 : spdk_spin_lock(&ch->bdev->internal.spinlock);
4923 76 : spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
4924 76 : spdk_spin_unlock(&ch->bdev->internal.spinlock);
4925 :
4926 76 : bdev_channel_abort_queued_ios(ch);
4927 :
4928 76 : if (ch->histogram) {
4929 0 : spdk_histogram_data_free(ch->histogram);
4930 0 : }
4931 :
4932 76 : bdev_channel_destroy_resource(ch);
4933 76 : }
4934 :
4935 : /*
4936 : * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
4937 : * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4938 : */
4939 : static int
4940 269 : bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
4941 : {
4942 : struct spdk_bdev_name *tmp;
4943 :
4944 269 : bdev_name->name = strdup(name);
4945 269 : if (bdev_name->name == NULL) {
4946 0 : SPDK_ERRLOG("Unable to allocate bdev name\n");
4947 0 : return -ENOMEM;
4948 : }
4949 :
4950 269 : bdev_name->bdev = bdev;
4951 :
4952 269 : spdk_spin_lock(&g_bdev_mgr.spinlock);
4953 269 : tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4954 269 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
4955 :
4956 269 : if (tmp != NULL) {
4957 4 : SPDK_ERRLOG("Bdev name %s already exists\n", name);
4958 4 : free(bdev_name->name);
4959 4 : return -EEXIST;
4960 : }
4961 :
4962 265 : return 0;
4963 269 : }
4964 :
4965 : static void
4966 265 : bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
4967 : {
4968 265 : RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4969 265 : free(bdev_name->name);
4970 265 : }
4971 :
4972 : static void
4973 5 : bdev_name_del(struct spdk_bdev_name *bdev_name)
4974 : {
4975 5 : spdk_spin_lock(&g_bdev_mgr.spinlock);
4976 5 : bdev_name_del_unsafe(bdev_name);
4977 5 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
4978 5 : }
4979 :
4980 : int
4981 139 : spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
4982 : {
4983 : struct spdk_bdev_alias *tmp;
4984 : int ret;
4985 :
4986 139 : if (alias == NULL) {
4987 1 : SPDK_ERRLOG("Empty alias passed\n");
4988 1 : return -EINVAL;
4989 : }
4990 :
4991 138 : tmp = calloc(1, sizeof(*tmp));
4992 138 : if (tmp == NULL) {
4993 0 : SPDK_ERRLOG("Unable to allocate alias\n");
4994 0 : return -ENOMEM;
4995 : }
4996 :
4997 138 : ret = bdev_name_add(&tmp->alias, bdev, alias);
4998 138 : if (ret != 0) {
4999 4 : free(tmp);
5000 4 : return ret;
5001 : }
5002 :
5003 134 : TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
5004 :
5005 134 : return 0;
5006 139 : }
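
A short usage sketch for the alias API. The return values follow directly from the code above: -EINVAL for a NULL alias, -EEXIST when the name already exists in the global name tree, and (for deletion) -ENOENT when the alias was never added. The helper name is hypothetical:

    /* Hypothetical helper: give a registered bdev a second name. */
    static int
    add_friendly_name(struct spdk_bdev *bdev, const char *name)
    {
            int rc;

            rc = spdk_bdev_alias_add(bdev, name);
            if (rc == -EEXIST) {
                    SPDK_ERRLOG("name %s already belongs to a bdev or alias\n", name);
            }

            return rc;
    }

Dropping the alias later is symmetric: spdk_bdev_alias_del(bdev, name), or spdk_bdev_alias_del_all(bdev) to remove every alias at once.
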
5007 :
5008 : static int
5009 135 : bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
5010 : void (*alias_del_fn)(struct spdk_bdev_name *n))
5011 : {
5012 : struct spdk_bdev_alias *tmp;
5013 :
5014 140 : TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
5015 136 : if (strcmp(alias, tmp->alias.name) == 0) {
5016 131 : TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
5017 131 : alias_del_fn(&tmp->alias);
5018 131 : free(tmp);
5019 131 : return 0;
5020 : }
5021 5 : }
5022 :
5023 4 : return -ENOENT;
5024 135 : }
5025 :
5026 : int
5027 4 : spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
5028 : {
5029 : int rc;
5030 :
5031 4 : rc = bdev_alias_del(bdev, alias, bdev_name_del);
5032 4 : if (rc == -ENOENT) {
5033 2 : SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
5034 2 : }
5035 :
5036 4 : return rc;
5037 : }
5038 :
5039 : void
5040 2 : spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
5041 : {
5042 : struct spdk_bdev_alias *p, *tmp;
5043 :
5044 5 : TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
5045 3 : TAILQ_REMOVE(&bdev->aliases, p, tailq);
5046 3 : bdev_name_del(&p->alias);
5047 3 : free(p);
5048 3 : }
5049 2 : }
5050 :
5051 : struct spdk_io_channel *
5052 78 : spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
5053 : {
5054 78 : return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
5055 : }
5056 :
5057 : void *
5058 0 : spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
5059 : {
5060 0 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5061 0 : void *ctx = NULL;
5062 :
5063 0 : if (bdev->fn_table->get_module_ctx) {
5064 0 : ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
5065 0 : }
5066 :
5067 0 : return ctx;
5068 : }
5069 :
5070 : const char *
5071 0 : spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
5072 : {
5073 0 : return bdev->module->name;
5074 : }
5075 :
5076 : const char *
5077 265 : spdk_bdev_get_name(const struct spdk_bdev *bdev)
5078 : {
5079 265 : return bdev->name;
5080 : }
5081 :
5082 : const char *
5083 0 : spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
5084 : {
5085 0 : return bdev->product_name;
5086 : }
5087 :
5088 : const struct spdk_bdev_aliases_list *
5089 0 : spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
5090 : {
5091 0 : return &bdev->aliases;
5092 : }
5093 :
5094 : uint32_t
5095 5 : spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
5096 : {
5097 5 : return bdev->blocklen;
5098 : }
5099 :
5100 : uint32_t
5101 0 : spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
5102 : {
5103 0 : return bdev->write_unit_size;
5104 : }
5105 :
5106 : uint64_t
5107 0 : spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
5108 : {
5109 0 : return bdev->blockcnt;
5110 : }
5111 :
5112 : const char *
5113 0 : spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
5114 : {
5115 0 : return qos_rpc_type[type];
5116 : }
5117 :
5118 : void
5119 0 : spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
5120 : {
5121 : int i;
5122 :
5123 0 : memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
5124 :
5125 0 : spdk_spin_lock(&bdev->internal.spinlock);
5126 0 : if (bdev->internal.qos) {
5127 0 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
5128 0 : if (bdev->internal.qos->rate_limits[i].limit !=
5129 : SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
5130 0 : limits[i] = bdev->internal.qos->rate_limits[i].limit;
5131 0 : if (bdev_qos_is_iops_rate_limit(i) == false) {
5132 : /* Convert from bytes to megabytes, the unit visible to the user. */
5133 0 : limits[i] = limits[i] / 1024 / 1024;
5134 0 : }
5135 0 : }
5136 0 : }
5137 0 : }
5138 0 : spdk_spin_unlock(&bdev->internal.spinlock);
5139 0 : }
5140 :
5141 : size_t
5142 281 : spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
5143 : {
5144 281 : return 1 << bdev->required_alignment;
5145 : }
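
The returned alignment is always a power of two (1 << required_alignment), so it can be handed directly to an aligned allocator. A sketch of allocating a one-block read buffer, assuming the SPDK env allocator spdk_dma_zmalloc() from spdk/env.h; the helper name is made up:

    /* Hypothetical: allocate a zeroed, properly aligned buffer for one block. */
    static void *
    alloc_block_buf(struct spdk_bdev *bdev)
    {
            return spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev),
                                    spdk_bdev_get_buf_align(bdev), NULL);
    }
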
5146 :
5147 : uint32_t
5148 0 : spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
5149 : {
5150 0 : return bdev->optimal_io_boundary;
5151 : }
5152 :
5153 : bool
5154 0 : spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
5155 : {
5156 0 : return bdev->write_cache;
5157 : }
5158 :
5159 : const struct spdk_uuid *
5160 0 : spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
5161 : {
5162 0 : return &bdev->uuid;
5163 : }
5164 :
5165 : uint16_t
5166 0 : spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
5167 : {
5168 0 : return bdev->acwu;
5169 : }
5170 :
5171 : uint32_t
5172 29 : spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
5173 : {
5174 29 : return bdev->md_len;
5175 : }
5176 :
5177 : bool
5178 136 : spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
5179 : {
5180 136 : return (bdev->md_len != 0) && bdev->md_interleave;
5181 : }
5182 :
5183 : bool
5184 117 : spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
5185 : {
5186 117 : return (bdev->md_len != 0) && !bdev->md_interleave;
5187 : }
5188 :
5189 : bool
5190 0 : spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
5191 : {
5192 0 : return bdev->zoned;
5193 : }
5194 :
5195 : uint32_t
5196 127 : spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
5197 : {
5198 127 : if (spdk_bdev_is_md_interleaved(bdev)) {
5199 0 : return bdev->blocklen - bdev->md_len;
5200 : } else {
5201 127 : return bdev->blocklen;
5202 : }
5203 127 : }
5204 :
5205 : uint32_t
5206 0 : spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
5207 : {
5208 0 : return bdev->phys_blocklen;
5209 : }
5210 :
5211 : static uint32_t
5212 9 : _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
5213 : {
5214 9 : if (!spdk_bdev_is_md_interleaved(bdev)) {
5215 6 : return bdev->blocklen + bdev->md_len;
5216 : } else {
5217 3 : return bdev->blocklen;
5218 : }
5219 9 : }
5220 :
5221 : /* We have to use the typedef in the function declaration to appease astyle. */
5222 : typedef enum spdk_dif_type spdk_dif_type_t;
5223 : typedef enum spdk_dif_pi_format spdk_dif_pi_format_t;
5224 :
5225 : spdk_dif_type_t
5226 0 : spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
5227 : {
5228 0 : if (bdev->md_len != 0) {
5229 0 : return bdev->dif_type;
5230 : } else {
5231 0 : return SPDK_DIF_DISABLE;
5232 : }
5233 0 : }
5234 :
5235 : spdk_dif_pi_format_t
5236 0 : spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev)
5237 : {
5238 0 : return bdev->dif_pi_format;
5239 : }
5240 :
5241 : bool
5242 0 : spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
5243 : {
5244 0 : if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
5245 0 : return bdev->dif_is_head_of_md;
5246 : } else {
5247 0 : return false;
5248 : }
5249 0 : }
5250 :
5251 : bool
5252 0 : spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
5253 : enum spdk_dif_check_type check_type)
5254 : {
5255 0 : if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
5256 0 : return false;
5257 : }
5258 :
5259 0 : switch (check_type) {
5260 : case SPDK_DIF_CHECK_TYPE_REFTAG:
5261 0 : return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
5262 : case SPDK_DIF_CHECK_TYPE_APPTAG:
5263 0 : return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
5264 : case SPDK_DIF_CHECK_TYPE_GUARD:
5265 0 : return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
5266 : default:
5267 0 : return false;
5268 : }
5269 0 : }
5270 :
5271 : static uint32_t
5272 3 : bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes)
5273 : {
5274 : uint64_t aligned_length, max_write_blocks;
5275 :
5276 3 : aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1);
5277 3 : max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev);
5278 3 : max_write_blocks -= max_write_blocks % bdev->write_unit_size;
5279 :
5280 3 : return max_write_blocks;
5281 : }
5282 :
5283 : uint32_t
5284 1 : spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
5285 : {
5286 1 : return bdev->max_copy;
5287 : }
5288 :
5289 : uint64_t
5290 0 : spdk_bdev_get_qd(const struct spdk_bdev *bdev)
5291 : {
5292 0 : return bdev->internal.measured_queue_depth;
5293 : }
5294 :
5295 : uint64_t
5296 0 : spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
5297 : {
5298 0 : return bdev->internal.period;
5299 : }
5300 :
5301 : uint64_t
5302 0 : spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
5303 : {
5304 0 : return bdev->internal.weighted_io_time;
5305 : }
5306 :
5307 : uint64_t
5308 0 : spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
5309 : {
5310 0 : return bdev->internal.io_time;
5311 : }
5312 :
5313 0 : union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev)
5314 : {
5315 0 : return bdev->ctratt;
5316 : }
5317 :
5318 : uint32_t
5319 0 : spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev)
5320 : {
5321 0 : return bdev->nsid;
5322 : }
5323 :
5324 : uint32_t
5325 0 : spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
5326 : {
5327 0 : struct spdk_bdev *bdev = desc->bdev;
5328 :
5329 0 : return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen;
5330 : }
5331 :
5332 : uint32_t
5333 0 : spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc)
5334 : {
5335 0 : struct spdk_bdev *bdev = desc->bdev;
5336 :
5337 0 : return desc->opts.hide_metadata ? 0 : bdev->md_len;
5338 : }
5339 :
5340 : bool
5341 0 : spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc)
5342 : {
5343 0 : struct spdk_bdev *bdev = desc->bdev;
5344 :
5345 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev);
5346 : }
5347 :
5348 : bool
5349 0 : spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc)
5350 : {
5351 0 : struct spdk_bdev *bdev = desc->bdev;
5352 :
5353 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_md_separate(bdev);
5354 : }
5355 :
5356 : spdk_dif_type_t
5357 0 : spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc)
5358 : {
5359 0 : struct spdk_bdev *bdev = desc->bdev;
5360 :
5361 0 : return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev);
5362 : }
5363 :
5364 : spdk_dif_pi_format_t
5365 0 : spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc)
5366 : {
5367 0 : struct spdk_bdev *bdev = desc->bdev;
5368 :
5369 0 : return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev);
5370 : }
5371 :
5372 : bool
5373 0 : spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc)
5374 : {
5375 0 : struct spdk_bdev *bdev = desc->bdev;
5376 :
5377 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev);
5378 : }
5379 :
5380 : bool
5381 0 : spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc,
5382 : enum spdk_dif_check_type check_type)
5383 : {
5384 0 : struct spdk_bdev *bdev = desc->bdev;
5385 :
5386 0 : return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type);
5387 : }
5388 :
5389 : static void bdev_update_qd_sampling_period(void *ctx);
5390 :
5391 : static void
5392 1 : _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
5393 : {
5394 1 : bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
5395 :
5396 1 : if (bdev->internal.measured_queue_depth) {
5397 0 : bdev->internal.io_time += bdev->internal.period;
5398 0 : bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
5399 0 : }
5400 :
5401 1 : bdev->internal.qd_poll_in_progress = false;
5402 :
5403 1 : bdev_update_qd_sampling_period(bdev);
5404 1 : }
5405 :
5406 : static void
5407 1 : _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5408 : struct spdk_io_channel *io_ch, void *_ctx)
5409 : {
5410 1 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
5411 :
5412 1 : bdev->internal.temporary_queue_depth += ch->io_outstanding;
5413 1 : spdk_bdev_for_each_channel_continue(i, 0);
5414 1 : }
5415 :
5416 : static int
5417 1 : bdev_calculate_measured_queue_depth(void *ctx)
5418 : {
5419 1 : struct spdk_bdev *bdev = ctx;
5420 :
5421 1 : bdev->internal.qd_poll_in_progress = true;
5422 1 : bdev->internal.temporary_queue_depth = 0;
5423 1 : spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
5424 1 : return SPDK_POLLER_BUSY;
5425 : }
5426 :
5427 : static void
5428 5 : bdev_update_qd_sampling_period(void *ctx)
5429 : {
5430 5 : struct spdk_bdev *bdev = ctx;
5431 :
5432 5 : if (bdev->internal.period == bdev->internal.new_period) {
5433 0 : return;
5434 : }
5435 :
5436 5 : if (bdev->internal.qd_poll_in_progress) {
5437 1 : return;
5438 : }
5439 :
5440 4 : bdev->internal.period = bdev->internal.new_period;
5441 :
5442 4 : spdk_poller_unregister(&bdev->internal.qd_poller);
5443 4 : if (bdev->internal.period != 0) {
5444 2 : bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5445 : bdev, bdev->internal.period);
5446 2 : } else {
5447 2 : spdk_bdev_close(bdev->internal.qd_desc);
5448 2 : bdev->internal.qd_desc = NULL;
5449 : }
5450 5 : }
5451 :
5452 : static void
5453 0 : _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
5454 : {
5455 0 : SPDK_NOTICELOG("Unexpected event type: %d\n", type);
5456 0 : }
5457 :
5458 : void
5459 136 : spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
5460 : {
5461 : int rc;
5462 :
5463 136 : if (bdev->internal.new_period == period) {
5464 130 : return;
5465 : }
5466 :
5467 6 : bdev->internal.new_period = period;
5468 :
5469 6 : if (bdev->internal.qd_desc != NULL) {
5470 4 : assert(bdev->internal.period != 0);
5471 :
5472 8 : spdk_thread_send_msg(bdev->internal.qd_desc->thread,
5473 4 : bdev_update_qd_sampling_period, bdev);
5474 4 : return;
5475 : }
5476 :
5477 2 : assert(bdev->internal.period == 0);
5478 :
5479 4 : rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
5480 2 : NULL, &bdev->internal.qd_desc);
5481 2 : if (rc != 0) {
5482 0 : return;
5483 : }
5484 :
5485 2 : bdev->internal.period = period;
5486 2 : bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5487 : bdev, period);
5488 136 : }
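
Queue-depth sampling is off until a non-zero period is set; the call opens an internal descriptor and registers a poller that sums io_outstanding across all channels. A sketch under the assumption that the period is expressed in microseconds (that is how it is fed to SPDK_POLLER_REGISTER above) and that the result is read back with spdk_bdev_get_qd(); the helper names are hypothetical:

    /* Hypothetical helpers: enable sampling, later read the measurement and stop. */
    static void
    qd_sampling_start(struct spdk_bdev *bdev)
    {
            spdk_bdev_set_qd_sampling_period(bdev, 1000);   /* assumed microseconds */
    }

    static uint64_t
    qd_sampling_read_and_stop(struct spdk_bdev *bdev)
    {
            uint64_t qd = spdk_bdev_get_qd(bdev);   /* last measured queue depth */

            spdk_bdev_set_qd_sampling_period(bdev, 0);      /* stops poller, closes internal desc */
            return qd;
    }
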
5489 :
5490 : struct bdev_get_current_qd_ctx {
5491 : uint64_t current_qd;
5492 : spdk_bdev_get_current_qd_cb cb_fn;
5493 : void *cb_arg;
5494 : };
5495 :
5496 : static void
5497 0 : bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
5498 : {
5499 0 : struct bdev_get_current_qd_ctx *ctx = _ctx;
5500 :
5501 0 : ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
5502 :
5503 0 : free(ctx);
5504 0 : }
5505 :
5506 : static void
5507 0 : bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5508 : struct spdk_io_channel *io_ch, void *_ctx)
5509 : {
5510 0 : struct bdev_get_current_qd_ctx *ctx = _ctx;
5511 0 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
5512 :
5513 0 : ctx->current_qd += bdev_ch->io_outstanding;
5514 :
5515 0 : spdk_bdev_for_each_channel_continue(i, 0);
5516 0 : }
5517 :
5518 : void
5519 0 : spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
5520 : void *cb_arg)
5521 : {
5522 : struct bdev_get_current_qd_ctx *ctx;
5523 :
5524 0 : assert(cb_fn != NULL);
5525 :
5526 0 : ctx = calloc(1, sizeof(*ctx));
5527 0 : if (ctx == NULL) {
5528 0 : cb_fn(bdev, 0, cb_arg, -ENOMEM);
5529 0 : return;
5530 : }
5531 :
5532 0 : ctx->cb_fn = cb_fn;
5533 0 : ctx->cb_arg = cb_arg;
5534 :
5535 0 : spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
5536 0 : }
5537 :
5538 : static void
5539 25 : _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type)
5540 : {
5541 25 : assert(desc->thread == spdk_get_thread());
5542 :
5543 25 : spdk_spin_lock(&desc->spinlock);
5544 25 : desc->refs--;
5545 25 : if (!desc->closed) {
5546 14 : spdk_spin_unlock(&desc->spinlock);
5547 28 : desc->callback.event_fn(type,
5548 14 : desc->bdev,
5549 14 : desc->callback.ctx);
5550 14 : return;
5551 11 : } else if (desc->refs == 0) {
5552 : /* This descriptor was closed after this event_notify message was sent.
5553 : * spdk_bdev_close() could not free the descriptor since this message was
5554 : * in flight, so we free it now using bdev_desc_free().
5555 : */
5556 10 : spdk_spin_unlock(&desc->spinlock);
5557 10 : bdev_desc_free(desc);
5558 10 : return;
5559 : }
5560 1 : spdk_spin_unlock(&desc->spinlock);
5561 25 : }
5562 :
5563 : static void
5564 25 : event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn)
5565 : {
5566 25 : spdk_spin_lock(&desc->spinlock);
5567 25 : desc->refs++;
5568 25 : spdk_thread_send_msg(desc->thread, event_notify_fn, desc);
5569 25 : spdk_spin_unlock(&desc->spinlock);
5570 25 : }
5571 :
5572 : static void
5573 6 : _resize_notify(void *ctx)
5574 : {
5575 6 : struct spdk_bdev_desc *desc = ctx;
5576 :
5577 6 : _event_notify(desc, SPDK_BDEV_EVENT_RESIZE);
5578 6 : }
5579 :
5580 : int
5581 11 : spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
5582 : {
5583 : struct spdk_bdev_desc *desc;
5584 : int ret;
5585 :
5586 11 : if (size == bdev->blockcnt) {
5587 0 : return 0;
5588 : }
5589 :
5590 11 : spdk_spin_lock(&bdev->internal.spinlock);
5591 :
5592 : /* bdev has open descriptors */
5593 11 : if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
5594 7 : bdev->blockcnt > size) {
5595 1 : ret = -EBUSY;
5596 1 : } else {
5597 10 : bdev->blockcnt = size;
5598 16 : TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
5599 6 : event_notify(desc, _resize_notify);
5600 6 : }
5601 10 : ret = 0;
5602 : }
5603 :
5604 11 : spdk_spin_unlock(&bdev->internal.spinlock);
5605 :
5606 11 : return ret;
5607 11 : }
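
Growing the bdev notifies every open descriptor through its event callback on the descriptor's own thread, while shrinking is refused with -EBUSY as long as descriptors are open. A sketch of an event callback that reacts to the resize; it follows the signature used with spdk_bdev_open_ext() elsewhere in this file, and the function name is hypothetical:

    /* Hypothetical event callback passed to spdk_bdev_open_ext(). */
    static void
    my_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
    {
            if (type == SPDK_BDEV_EVENT_RESIZE) {
                    SPDK_NOTICELOG("bdev %s resized to %" PRIu64 " blocks\n",
                                   spdk_bdev_get_name(bdev), spdk_bdev_get_num_blocks(bdev));
            }
    }
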
5608 :
5609 : /*
5610 : * Convert I/O offset and length from bytes to blocks.
5611 : *
5612 : * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
5613 : */
5614 : static uint64_t
5615 20 : bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes,
5616 : uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks)
5617 : {
5618 20 : uint32_t block_size = bdev_desc_get_block_size(desc);
5619 : uint8_t shift_cnt;
5620 :
5621 : /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
5622 20 : if (spdk_likely(spdk_u32_is_pow2(block_size))) {
5623 17 : shift_cnt = spdk_u32log2(block_size);
5624 17 : *offset_blocks = offset_bytes >> shift_cnt;
5625 17 : *num_blocks = num_bytes >> shift_cnt;
5626 34 : return (offset_bytes - (*offset_blocks << shift_cnt)) |
5627 17 : (num_bytes - (*num_blocks << shift_cnt));
5628 : } else {
5629 3 : *offset_blocks = offset_bytes / block_size;
5630 3 : *num_blocks = num_bytes / block_size;
5631 3 : return (offset_bytes % block_size) | (num_bytes % block_size);
5632 : }
5633 20 : }
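
The fast path divides by shifting and detects misalignment by shifting back and subtracting, which avoids two 64-bit divisions per I/O. As a standalone illustration (not the static helper itself), the same trick with a hardcoded 512-byte block: offset 4096 maps to block 8 with remainder 0, while offset 4100 leaves remainder 4, so the conversion reports failure.

    /* Illustrative only: power-of-two bytes-to-blocks conversion for a 512-byte block. */
    static uint64_t
    bytes_to_blocks_512(uint64_t offset_bytes, uint64_t *offset_blocks)
    {
            const uint8_t shift = 9;        /* spdk_u32log2(512) */

            *offset_blocks = offset_bytes >> shift;
            /* A non-zero result means offset_bytes was not a multiple of the block size. */
            return offset_bytes - (*offset_blocks << shift);
    }
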
5634 :
5635 : static bool
5636 705 : bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
5637 : {
5638 : /* Return failure if offset_blocks + num_blocks is less than offset_blocks; this indicates
5639 : * an overflow, i.e. the offset has wrapped around */
5640 705 : if (offset_blocks + num_blocks < offset_blocks) {
5641 1 : return false;
5642 : }
5643 :
5644 : /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
5645 704 : if (offset_blocks + num_blocks > bdev->blockcnt) {
5646 2 : return false;
5647 : }
5648 :
5649 702 : return true;
5650 705 : }
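
The first test guards against 64-bit wrap-around before the size comparison: with offset_blocks = UINT64_MAX - 1 and num_blocks = 4, the sum wraps to 2, which would otherwise slip past the blockcnt check. A small illustration of that wrapped comparison, assuming assert.h and stdint.h:

    /* Illustrative only: the wrapped sum compares smaller than offset_blocks,
     * which is exactly the condition bdev_io_valid_blocks() rejects. */
    static void
    show_block_range_overflow(void)
    {
            uint64_t offset_blocks = UINT64_MAX - 1, num_blocks = 4;

            assert(offset_blocks + num_blocks < offset_blocks);     /* sum wraps to 2 */
    }
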
5651 :
5652 : static void
5653 2 : bdev_seek_complete_cb(void *ctx)
5654 : {
5655 2 : struct spdk_bdev_io *bdev_io = ctx;
5656 :
5657 2 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5658 2 : bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
5659 2 : }
5660 :
5661 : static int
5662 4 : bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5663 : uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
5664 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5665 : {
5666 4 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5667 : struct spdk_bdev_io *bdev_io;
5668 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5669 :
5670 4 : assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
5671 :
5672 : /* Check that offset_blocks is valid by validating a single block at that offset */
5673 4 : if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
5674 0 : return -EINVAL;
5675 : }
5676 :
5677 4 : bdev_io = bdev_channel_get_io(channel);
5678 4 : if (!bdev_io) {
5679 0 : return -ENOMEM;
5680 : }
5681 :
5682 4 : bdev_io->internal.ch = channel;
5683 4 : bdev_io->internal.desc = desc;
5684 4 : bdev_io->type = io_type;
5685 4 : bdev_io->u.bdev.offset_blocks = offset_blocks;
5686 4 : bdev_io->u.bdev.memory_domain = NULL;
5687 4 : bdev_io->u.bdev.memory_domain_ctx = NULL;
5688 4 : bdev_io->u.bdev.accel_sequence = NULL;
5689 4 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
5690 :
5691 4 : if (!spdk_bdev_io_type_supported(bdev, io_type)) {
5692 : /* If the bdev doesn't support seeking to the next data/hole offset,
5693 : * assume that only data and no holes are present */
5694 2 : if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
5695 1 : bdev_io->u.bdev.seek.offset = offset_blocks;
5696 1 : } else {
5697 1 : bdev_io->u.bdev.seek.offset = UINT64_MAX;
5698 : }
5699 :
5700 2 : spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
5701 2 : return 0;
5702 : }
5703 :
5704 2 : bdev_io_submit(bdev_io);
5705 2 : return 0;
5706 4 : }
5707 :
5708 : int
5709 2 : spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5710 : uint64_t offset_blocks,
5711 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5712 : {
5713 2 : return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
5714 : }
5715 :
5716 : int
5717 2 : spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5718 : uint64_t offset_blocks,
5719 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5720 : {
5721 2 : return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
5722 : }
5723 :
5724 : uint64_t
5725 4 : spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
5726 : {
5727 4 : return bdev_io->u.bdev.seek.offset;
5728 : }
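
For bdevs without native seek support, the emulation above completes SEEK_DATA with the queried offset itself and SEEK_HOLE with UINT64_MAX, i.e. "all data, no holes". A sketch of issuing a data seek and collecting the answer in the completion callback; the callback name and the submission context (desc, ch, start_block) are hypothetical:

    /* Hypothetical completion callback for spdk_bdev_seek_data(). */
    static void
    seek_data_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            if (success) {
                    SPDK_NOTICELOG("seek result: block %" PRIu64 "\n",
                                   spdk_bdev_io_get_seek_offset(bdev_io));
            }

            spdk_bdev_free_io(bdev_io);
    }

    /* Submission: rc = spdk_bdev_seek_data(desc, ch, start_block, seek_data_done, NULL); */
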
5729 :
5730 : static int
5731 220 : bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
5732 : void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5733 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5734 : {
5735 220 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5736 : struct spdk_bdev_io *bdev_io;
5737 220 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5738 :
5739 220 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5740 0 : return -EINVAL;
5741 : }
5742 :
5743 220 : bdev_io = bdev_channel_get_io(channel);
5744 220 : if (!bdev_io) {
5745 1 : return -ENOMEM;
5746 : }
5747 :
5748 219 : bdev_io->internal.ch = channel;
5749 219 : bdev_io->internal.desc = desc;
5750 219 : bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5751 219 : bdev_io->u.bdev.iovs = &bdev_io->iov;
5752 219 : bdev_io->u.bdev.iovs[0].iov_base = buf;
5753 219 : bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
5754 219 : bdev_io->u.bdev.iovcnt = 1;
5755 219 : bdev_io->u.bdev.md_buf = md_buf;
5756 219 : bdev_io->u.bdev.num_blocks = num_blocks;
5757 219 : bdev_io->u.bdev.offset_blocks = offset_blocks;
5758 219 : bdev_io->u.bdev.memory_domain = NULL;
5759 219 : bdev_io->u.bdev.memory_domain_ctx = NULL;
5760 219 : bdev_io->u.bdev.accel_sequence = NULL;
5761 219 : bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
5762 219 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
5763 :
5764 219 : bdev_io_submit(bdev_io);
5765 219 : return 0;
5766 220 : }
5767 :
5768 : int
5769 3 : spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5770 : void *buf, uint64_t offset, uint64_t nbytes,
5771 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5772 : {
5773 : uint64_t offset_blocks, num_blocks;
5774 :
5775 3 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
5776 0 : return -EINVAL;
5777 : }
5778 :
5779 3 : return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5780 3 : }
5781 :
5782 : int
5783 216 : spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5784 : void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5785 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5786 : {
5787 216 : return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
5788 : }
5789 :
5790 : int
5791 4 : spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5792 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5793 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5794 : {
5795 8 : struct iovec iov = {
5796 4 : .iov_base = buf,
5797 : };
5798 :
5799 4 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5800 0 : return -EINVAL;
5801 : }
5802 :
5803 4 : if ((md_buf || desc->opts.hide_metadata) && !_is_buf_allocated(&iov)) {
5804 0 : return -EINVAL;
5805 : }
5806 :
5807 8 : return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5808 4 : cb, cb_arg);
5809 4 : }
5810 :
5811 : int
5812 5 : spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5813 : struct iovec *iov, int iovcnt,
5814 : uint64_t offset, uint64_t nbytes,
5815 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5816 : {
5817 : uint64_t offset_blocks, num_blocks;
5818 :
5819 5 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
5820 0 : return -EINVAL;
5821 : }
5822 :
5823 5 : return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5824 5 : }
5825 :
5826 : static int
5827 226 : bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5828 : struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
5829 : uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5830 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
5831 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5832 : {
5833 226 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5834 : struct spdk_bdev_io *bdev_io;
5835 226 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5836 :
5837 226 : if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
5838 0 : return -EINVAL;
5839 : }
5840 :
5841 226 : bdev_io = bdev_channel_get_io(channel);
5842 226 : if (spdk_unlikely(!bdev_io)) {
5843 2 : return -ENOMEM;
5844 : }
5845 :
5846 224 : bdev_io->internal.ch = channel;
5847 224 : bdev_io->internal.desc = desc;
5848 224 : bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5849 224 : bdev_io->u.bdev.iovs = iov;
5850 224 : bdev_io->u.bdev.iovcnt = iovcnt;
5851 224 : bdev_io->u.bdev.md_buf = md_buf;
5852 224 : bdev_io->u.bdev.num_blocks = num_blocks;
5853 224 : bdev_io->u.bdev.offset_blocks = offset_blocks;
5854 224 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
5855 :
5856 224 : if (seq != NULL) {
5857 0 : bdev_io->internal.f.has_accel_sequence = true;
5858 0 : bdev_io->internal.accel_sequence = seq;
5859 0 : }
5860 :
5861 224 : if (domain != NULL) {
5862 2 : bdev_io->internal.f.has_memory_domain = true;
5863 2 : bdev_io->internal.memory_domain = domain;
5864 2 : bdev_io->internal.memory_domain_ctx = domain_ctx;
5865 2 : }
5866 :
5867 224 : bdev_io->u.bdev.memory_domain = domain;
5868 224 : bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5869 224 : bdev_io->u.bdev.accel_sequence = seq;
5870 224 : bdev_io->u.bdev.dif_check_flags = dif_check_flags;
5871 :
5872 224 : _bdev_io_submit_ext(desc, bdev_io);
5873 :
5874 224 : return 0;
5875 226 : }
5876 :
5877 : int
5878 21 : spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5879 : struct iovec *iov, int iovcnt,
5880 : uint64_t offset_blocks, uint64_t num_blocks,
5881 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5882 : {
5883 21 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5884 :
5885 42 : return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5886 21 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5887 : }
5888 :
5889 : int
5890 4 : spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5891 : struct iovec *iov, int iovcnt, void *md_buf,
5892 : uint64_t offset_blocks, uint64_t num_blocks,
5893 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5894 : {
5895 4 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5896 :
5897 4 : if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
5898 0 : return -EINVAL;
5899 : }
5900 :
5901 4 : if (md_buf && !_is_buf_allocated(iov)) {
5902 0 : return -EINVAL;
5903 : }
5904 :
5905 8 : return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5906 4 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5907 4 : }
5908 :
5909 : static inline bool
5910 14 : _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5911 : {
5912 : /*
5913 : * We check that opts->size is at least the size the structure had when
5914 : * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since accesses to
5915 : * those members are not checked internally.
5916 : */
5917 24 : return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5918 14 : sizeof(opts->metadata) &&
5919 10 : opts->size <= sizeof(*opts) &&
5920 : /* When memory domain is used, the user must provide data buffers */
5921 8 : (!opts->memory_domain || (iov && iov[0].iov_base));
5922 : }
5923 :
5924 : int
5925 8 : spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5926 : struct iovec *iov, int iovcnt,
5927 : uint64_t offset_blocks, uint64_t num_blocks,
5928 : spdk_bdev_io_completion_cb cb, void *cb_arg,
5929 : struct spdk_bdev_ext_io_opts *opts)
5930 : {
5931 8 : struct spdk_memory_domain *domain = NULL;
5932 8 : struct spdk_accel_sequence *seq = NULL;
5933 8 : void *domain_ctx = NULL, *md = NULL;
5934 8 : uint32_t dif_check_flags = 0;
5935 : uint32_t nvme_cdw12_raw;
5936 8 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5937 :
5938 8 : if (opts) {
5939 7 : if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5940 3 : return -EINVAL;
5941 : }
5942 :
5943 4 : md = opts->metadata;
5944 4 : domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
5945 4 : domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
5946 4 : seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
5947 4 : nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0);
5948 4 : if (md) {
5949 4 : if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
5950 0 : return -EINVAL;
5951 : }
5952 :
5953 4 : if (spdk_unlikely(!_is_buf_allocated(iov))) {
5954 0 : return -EINVAL;
5955 : }
5956 :
5957 4 : if (spdk_unlikely(seq != NULL)) {
5958 0 : return -EINVAL;
5959 : }
5960 :
5961 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
5962 0 : SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n");
5963 0 : return -ENOTSUP;
5964 : }
5965 4 : }
5966 :
5967 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
5968 0 : dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT;
5969 0 : }
5970 4 : }
5971 :
5972 10 : dif_check_flags |= bdev->dif_check_flags &
5973 5 : ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
5974 :
5975 10 : return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5976 5 : num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg);
5977 8 : }
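
The _ext variants validate opts->size before reading any other member, so callers must fill it in; metadata, memory domain and accel sequence are all optional and constrained as checked above. A sketch of an extended read with a separate metadata buffer; desc, ch, iov/iovcnt, the buffers and read_done_cb are assumed to come from the caller:

    /* Hypothetical submission helper; read_done_cb is a caller-supplied completion. */
    static int
    submit_ext_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                    struct iovec *iov, int iovcnt, void *md_buf,
                    uint64_t offset_blocks, uint64_t num_blocks)
    {
            struct spdk_bdev_ext_io_opts opts = { 0 };

            opts.size = sizeof(opts);       /* must cover at least the metadata member */
            opts.metadata = md_buf;         /* only valid with separate (non-interleaved) metadata */

            return spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks, num_blocks,
                                              read_done_cb, NULL, &opts);
    }
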
5978 :
5979 : static int
5980 36 : bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5981 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5982 : spdk_bdev_io_completion_cb cb, void *cb_arg)
5983 : {
5984 36 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5985 : struct spdk_bdev_io *bdev_io;
5986 36 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5987 :
5988 36 : if (!desc->write) {
5989 0 : return -EBADF;
5990 : }
5991 :
5992 36 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5993 0 : return -EINVAL;
5994 : }
5995 :
5996 36 : bdev_io = bdev_channel_get_io(channel);
5997 36 : if (!bdev_io) {
5998 0 : return -ENOMEM;
5999 : }
6000 :
6001 36 : bdev_io->internal.ch = channel;
6002 36 : bdev_io->internal.desc = desc;
6003 36 : bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
6004 36 : bdev_io->u.bdev.iovs = &bdev_io->iov;
6005 36 : bdev_io->u.bdev.iovs[0].iov_base = buf;
6006 36 : bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
6007 36 : bdev_io->u.bdev.iovcnt = 1;
6008 36 : bdev_io->u.bdev.md_buf = md_buf;
6009 36 : bdev_io->u.bdev.num_blocks = num_blocks;
6010 36 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6011 36 : bdev_io->u.bdev.memory_domain = NULL;
6012 36 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6013 36 : bdev_io->u.bdev.accel_sequence = NULL;
6014 36 : bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
6015 36 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6016 :
6017 36 : bdev_io_submit(bdev_io);
6018 36 : return 0;
6019 36 : }
6020 :
6021 : int
6022 3 : spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6023 : void *buf, uint64_t offset, uint64_t nbytes,
6024 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6025 : {
6026 : uint64_t offset_blocks, num_blocks;
6027 :
6028 3 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
6029 0 : return -EINVAL;
6030 : }
6031 :
6032 3 : return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
6033 3 : }
6034 :
6035 : int
6036 27 : spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6037 : void *buf, uint64_t offset_blocks, uint64_t num_blocks,
6038 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6039 : {
6040 54 : return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
6041 27 : cb, cb_arg);
6042 : }
6043 :
6044 : int
6045 3 : spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6046 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6047 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6048 : {
6049 6 : struct iovec iov = {
6050 3 : .iov_base = buf,
6051 : };
6052 :
6053 3 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6054 0 : return -EINVAL;
6055 : }
6056 :
6057 3 : if (md_buf && !_is_buf_allocated(&iov)) {
6058 0 : return -EINVAL;
6059 : }
6060 :
6061 6 : return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
6062 3 : cb, cb_arg);
6063 3 : }
6064 :
6065 : static int
6066 70 : bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6067 : struct iovec *iov, int iovcnt, void *md_buf,
6068 : uint64_t offset_blocks, uint64_t num_blocks,
6069 : struct spdk_memory_domain *domain, void *domain_ctx,
6070 : struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
6071 : uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
6072 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6073 : {
6074 70 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6075 : struct spdk_bdev_io *bdev_io;
6076 70 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6077 :
6078 70 : if (spdk_unlikely(!desc->write)) {
6079 0 : return -EBADF;
6080 : }
6081 :
6082 70 : if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
6083 0 : return -EINVAL;
6084 : }
6085 :
6086 70 : bdev_io = bdev_channel_get_io(channel);
6087 70 : if (spdk_unlikely(!bdev_io)) {
6088 2 : return -ENOMEM;
6089 : }
6090 :
6091 68 : bdev_io->internal.ch = channel;
6092 68 : bdev_io->internal.desc = desc;
6093 68 : bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
6094 68 : bdev_io->u.bdev.iovs = iov;
6095 68 : bdev_io->u.bdev.iovcnt = iovcnt;
6096 68 : bdev_io->u.bdev.md_buf = md_buf;
6097 68 : bdev_io->u.bdev.num_blocks = num_blocks;
6098 68 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6099 68 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6100 68 : if (seq != NULL) {
6101 0 : bdev_io->internal.f.has_accel_sequence = true;
6102 0 : bdev_io->internal.accel_sequence = seq;
6103 0 : }
6104 :
6105 68 : if (domain != NULL) {
6106 2 : bdev_io->internal.f.has_memory_domain = true;
6107 2 : bdev_io->internal.memory_domain = domain;
6108 2 : bdev_io->internal.memory_domain_ctx = domain_ctx;
6109 2 : }
6110 :
6111 68 : bdev_io->u.bdev.memory_domain = domain;
6112 68 : bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
6113 68 : bdev_io->u.bdev.accel_sequence = seq;
6114 68 : bdev_io->u.bdev.dif_check_flags = dif_check_flags;
6115 68 : bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw;
6116 68 : bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw;
6117 :
6118 68 : _bdev_io_submit_ext(desc, bdev_io);
6119 :
6120 68 : return 0;
6121 70 : }
6122 :
6123 : int
6124 3 : spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6125 : struct iovec *iov, int iovcnt,
6126 : uint64_t offset, uint64_t len,
6127 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6128 : {
6129 : uint64_t offset_blocks, num_blocks;
6130 :
6131 3 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
6132 0 : return -EINVAL;
6133 : }
6134 :
6135 3 : return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
6136 3 : }
6137 :
6138 : int
6139 14 : spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6140 : struct iovec *iov, int iovcnt,
6141 : uint64_t offset_blocks, uint64_t num_blocks,
6142 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6143 : {
6144 14 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6145 :
6146 28 : return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
6147 14 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
6148 14 : cb, cb_arg);
6149 : }
6150 :
6151 : int
6152 1 : spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6153 : struct iovec *iov, int iovcnt, void *md_buf,
6154 : uint64_t offset_blocks, uint64_t num_blocks,
6155 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6156 : {
6157 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6158 :
6159 1 : if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
6160 0 : return -EINVAL;
6161 : }
6162 :
6163 1 : if (md_buf && !_is_buf_allocated(iov)) {
6164 0 : return -EINVAL;
6165 : }
6166 :
6167 2 : return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
6168 1 : num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
6169 1 : cb, cb_arg);
6170 1 : }
6171 :
6172 : int
6173 8 : spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6174 : struct iovec *iov, int iovcnt,
6175 : uint64_t offset_blocks, uint64_t num_blocks,
6176 : spdk_bdev_io_completion_cb cb, void *cb_arg,
6177 : struct spdk_bdev_ext_io_opts *opts)
6178 : {
6179 8 : struct spdk_memory_domain *domain = NULL;
6180 8 : struct spdk_accel_sequence *seq = NULL;
6181 8 : void *domain_ctx = NULL, *md = NULL;
6182 8 : uint32_t dif_check_flags = 0;
6183 8 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6184 8 : uint32_t nvme_cdw12_raw = 0;
6185 8 : uint32_t nvme_cdw13_raw = 0;
6186 :
6187 8 : if (opts) {
6188 7 : if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
6189 3 : return -EINVAL;
6190 : }
6191 4 : md = opts->metadata;
6192 4 : domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
6193 4 : domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
6194 4 : seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
6195 4 : nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0);
6196 4 : nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0);
6197 4 : if (md) {
6198 4 : if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
6199 0 : return -EINVAL;
6200 : }
6201 :
6202 4 : if (spdk_unlikely(!_is_buf_allocated(iov))) {
6203 0 : return -EINVAL;
6204 : }
6205 :
6206 4 : if (spdk_unlikely(seq != NULL)) {
6207 0 : return -EINVAL;
6208 : }
6209 :
6210 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
6211 0 : SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n");
6212 0 : return -ENOTSUP;
6213 : }
6214 4 : }
6215 :
6216 4 : if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
6217 0 : dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT;
6218 0 : }
6219 4 : }
6220 :
6221 10 : dif_check_flags |= bdev->dif_check_flags &
6222 5 : ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
6223 :
6224 10 : return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks,
6225 5 : domain, domain_ctx, seq, dif_check_flags,
6226 5 : nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg);
6227 8 : }
6228 :
6229 : static void
6230 11 : bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6231 : {
6232 11 : struct spdk_bdev_io *parent_io = cb_arg;
6233 11 : struct spdk_bdev *bdev = parent_io->bdev;
6234 11 : uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
6235 11 : int i, rc = 0;
6236 :
6237 11 : if (!success) {
6238 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6239 0 : parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
6240 0 : spdk_bdev_free_io(bdev_io);
6241 0 : return;
6242 : }
6243 :
6244 17 : for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
6245 22 : rc = memcmp(read_buf,
6246 11 : parent_io->u.bdev.iovs[i].iov_base,
6247 11 : parent_io->u.bdev.iovs[i].iov_len);
6248 11 : if (rc) {
6249 5 : break;
6250 : }
6251 6 : read_buf += parent_io->u.bdev.iovs[i].iov_len;
6252 6 : }
6253 :
6254 11 : if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
6255 4 : rc = memcmp(bdev_io->u.bdev.md_buf,
6256 2 : parent_io->u.bdev.md_buf,
6257 2 : spdk_bdev_get_md_size(bdev));
6258 2 : }
6259 :
6260 11 : spdk_bdev_free_io(bdev_io);
6261 :
6262 11 : if (rc == 0) {
6263 5 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6264 5 : parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
6265 5 : } else {
6266 6 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
6267 6 : parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
6268 : }
6269 11 : }
6270 :
6271 : static void
6272 11 : bdev_compare_do_read(void *_bdev_io)
6273 : {
6274 11 : struct spdk_bdev_io *bdev_io = _bdev_io;
6275 : int rc;
6276 :
6277 22 : rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
6278 11 : spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
6279 11 : bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6280 11 : bdev_compare_do_read_done, bdev_io);
6281 :
6282 11 : if (rc == -ENOMEM) {
6283 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
6284 11 : } else if (rc != 0) {
6285 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6286 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
6287 0 : }
6288 11 : }
6289 :
6290 : static int
6291 16 : bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6292 : struct iovec *iov, int iovcnt, void *md_buf,
6293 : uint64_t offset_blocks, uint64_t num_blocks,
6294 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6295 : {
6296 16 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6297 : struct spdk_bdev_io *bdev_io;
6298 16 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6299 :
6300 16 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6301 0 : return -EINVAL;
6302 : }
6303 :
6304 16 : bdev_io = bdev_channel_get_io(channel);
6305 16 : if (!bdev_io) {
6306 0 : return -ENOMEM;
6307 : }
6308 :
6309 16 : bdev_io->internal.ch = channel;
6310 16 : bdev_io->internal.desc = desc;
6311 16 : bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
6312 16 : bdev_io->u.bdev.iovs = iov;
6313 16 : bdev_io->u.bdev.iovcnt = iovcnt;
6314 16 : bdev_io->u.bdev.md_buf = md_buf;
6315 16 : bdev_io->u.bdev.num_blocks = num_blocks;
6316 16 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6317 16 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6318 16 : bdev_io->u.bdev.memory_domain = NULL;
6319 16 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6320 16 : bdev_io->u.bdev.accel_sequence = NULL;
6321 :
6322 16 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
6323 7 : bdev_io_submit(bdev_io);
6324 7 : return 0;
6325 : }
6326 :
6327 9 : bdev_compare_do_read(bdev_io);
6328 :
6329 9 : return 0;
6330 16 : }
6331 :
6332 : int
6333 10 : spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6334 : struct iovec *iov, int iovcnt,
6335 : uint64_t offset_blocks, uint64_t num_blocks,
6336 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6337 : {
6338 20 : return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
6339 10 : num_blocks, cb, cb_arg);
6340 : }
6341 :
6342 : int
6343 6 : spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6344 : struct iovec *iov, int iovcnt, void *md_buf,
6345 : uint64_t offset_blocks, uint64_t num_blocks,
6346 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6347 : {
6348 6 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6349 0 : return -EINVAL;
6350 : }
6351 :
6352 6 : if (md_buf && !_is_buf_allocated(iov)) {
6353 0 : return -EINVAL;
6354 : }
6355 :
6356 12 : return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
6357 6 : num_blocks, cb, cb_arg);
6358 6 : }
6359 :
6360 : static int
6361 4 : bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6362 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6363 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6364 : {
6365 4 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6366 : struct spdk_bdev_io *bdev_io;
6367 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6368 :
6369 4 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6370 0 : return -EINVAL;
6371 : }
6372 :
6373 4 : bdev_io = bdev_channel_get_io(channel);
6374 4 : if (!bdev_io) {
6375 0 : return -ENOMEM;
6376 : }
6377 :
6378 4 : bdev_io->internal.ch = channel;
6379 4 : bdev_io->internal.desc = desc;
6380 4 : bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
6381 4 : bdev_io->u.bdev.iovs = &bdev_io->iov;
6382 4 : bdev_io->u.bdev.iovs[0].iov_base = buf;
6383 4 : bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
6384 4 : bdev_io->u.bdev.iovcnt = 1;
6385 4 : bdev_io->u.bdev.md_buf = md_buf;
6386 4 : bdev_io->u.bdev.num_blocks = num_blocks;
6387 4 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6388 4 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6389 4 : bdev_io->u.bdev.memory_domain = NULL;
6390 4 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6391 4 : bdev_io->u.bdev.accel_sequence = NULL;
6392 :
6393 4 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
6394 2 : bdev_io_submit(bdev_io);
6395 2 : return 0;
6396 : }
6397 :
6398 2 : bdev_compare_do_read(bdev_io);
6399 :
6400 2 : return 0;
6401 4 : }
6402 :
6403 : int
6404 4 : spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6405 : void *buf, uint64_t offset_blocks, uint64_t num_blocks,
6406 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6407 : {
6408 8 : return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
6409 4 : cb, cb_arg);
6410 : }
6411 :
6412 : int
6413 0 : spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6414 : void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6415 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6416 : {
6417 0 : struct iovec iov = {
6418 0 : .iov_base = buf,
6419 : };
6420 :
6421 0 : if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6422 0 : return -EINVAL;
6423 : }
6424 :
6425 0 : if (md_buf && !_is_buf_allocated(&iov)) {
6426 0 : return -EINVAL;
6427 : }
6428 :
6429 0 : return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
6430 0 : cb, cb_arg);
6431 0 : }
6432 :
6433 : static void
6434 2 : bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status)
6435 : {
6436 2 : struct spdk_bdev_io *bdev_io = ctx;
6437 :
6438 2 : if (unlock_status) {
6439 0 : SPDK_ERRLOG("LBA range unlock failed\n");
6440 0 : }
6441 :
6442 4 : bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
6443 2 : false, bdev_io->internal.caller_ctx);
6444 2 : }
6445 :
6446 : static void
6447 2 : bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
6448 : {
6449 2 : bdev_io->internal.status = status;
6450 :
6451 4 : bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
6452 2 : bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6453 2 : bdev_comparev_and_writev_blocks_unlocked, bdev_io);
6454 2 : }
6455 :
6456 : static void
6457 1 : bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6458 : {
6459 1 : struct spdk_bdev_io *parent_io = cb_arg;
6460 :
6461 1 : if (!success) {
6462 0 : SPDK_ERRLOG("Compare and write operation failed\n");
6463 0 : }
6464 :
6465 1 : spdk_bdev_free_io(bdev_io);
6466 :
6467 2 : bdev_comparev_and_writev_blocks_unlock(parent_io,
6468 1 : success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
6469 1 : }
6470 :
6471 : static void
6472 1 : bdev_compare_and_write_do_write(void *_bdev_io)
6473 : {
6474 1 : struct spdk_bdev_io *bdev_io = _bdev_io;
6475 : int rc;
6476 :
6477 2 : rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
6478 1 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
6479 1 : bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
6480 1 : bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6481 1 : bdev_compare_and_write_do_write_done, bdev_io);
6482 :
6483 :
6484 1 : if (rc == -ENOMEM) {
6485 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
6486 1 : } else if (rc != 0) {
6487 0 : bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6488 0 : }
6489 1 : }
6490 :
6491 : static void
6492 2 : bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6493 : {
6494 2 : struct spdk_bdev_io *parent_io = cb_arg;
6495 :
6496 2 : spdk_bdev_free_io(bdev_io);
6497 :
6498 2 : if (!success) {
6499 1 : bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
6500 1 : return;
6501 : }
6502 :
6503 1 : bdev_compare_and_write_do_write(parent_io);
6504 2 : }
6505 :
6506 : static void
6507 2 : bdev_compare_and_write_do_compare(void *_bdev_io)
6508 : {
6509 2 : struct spdk_bdev_io *bdev_io = _bdev_io;
6510 : int rc;
6511 :
6512 4 : rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
6513 2 : spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
6514 2 : bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6515 2 : bdev_compare_and_write_do_compare_done, bdev_io);
6516 :
6517 2 : if (rc == -ENOMEM) {
6518 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
6519 2 : } else if (rc != 0) {
6520 0 : bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
6521 0 : }
6522 2 : }
6523 :
6524 : static void
6525 2 : bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status)
6526 : {
6527 2 : struct spdk_bdev_io *bdev_io = ctx;
6528 :
6529 2 : if (status) {
6530 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
6531 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
6532 0 : return;
6533 : }
6534 :
6535 2 : bdev_compare_and_write_do_compare(bdev_io);
6536 2 : }
6537 :
6538 : int
6539 2 : spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6540 : struct iovec *compare_iov, int compare_iovcnt,
6541 : struct iovec *write_iov, int write_iovcnt,
6542 : uint64_t offset_blocks, uint64_t num_blocks,
6543 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6544 : {
6545 2 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6546 : struct spdk_bdev_io *bdev_io;
6547 2 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6548 :
6549 2 : if (!desc->write) {
6550 0 : return -EBADF;
6551 : }
6552 :
6553 2 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6554 0 : return -EINVAL;
6555 : }
6556 :
6557 2 : if (num_blocks > bdev->acwu) {
6558 0 : return -EINVAL;
6559 : }
6560 :
6561 2 : bdev_io = bdev_channel_get_io(channel);
6562 2 : if (!bdev_io) {
6563 0 : return -ENOMEM;
6564 : }
6565 :
6566 2 : bdev_io->internal.ch = channel;
6567 2 : bdev_io->internal.desc = desc;
6568 2 : bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
6569 2 : bdev_io->u.bdev.iovs = compare_iov;
6570 2 : bdev_io->u.bdev.iovcnt = compare_iovcnt;
6571 2 : bdev_io->u.bdev.fused_iovs = write_iov;
6572 2 : bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
6573 2 : bdev_io->u.bdev.md_buf = NULL;
6574 2 : bdev_io->u.bdev.num_blocks = num_blocks;
6575 2 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6576 2 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6577 2 : bdev_io->u.bdev.memory_domain = NULL;
6578 2 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6579 2 : bdev_io->u.bdev.accel_sequence = NULL;
6580 :
6581 2 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
6582 0 : bdev_io_submit(bdev_io);
6583 0 : return 0;
6584 : }
6585 :
6586 4 : return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
6587 2 : bdev_comparev_and_writev_blocks_locked, bdev_io);
6588 2 : }
6589 :
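
A hedged sketch of a caller of the fused compare-and-write above; my_desc/my_ch are assumed to be an open writable descriptor and its channel, caw_done an application callback, and both iovecs must describe exactly num_blocks blocks (num_blocks may not exceed the bdev's acwu):

	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(my_desc);
	struct iovec cmp_iov = { .iov_base = expected_buf, .iov_len = spdk_bdev_get_block_size(bdev) };
	struct iovec wr_iov  = { .iov_base = new_buf,      .iov_len = spdk_bdev_get_block_size(bdev) };
	int rc;

	rc = spdk_bdev_comparev_and_writev_blocks(my_desc, my_ch, &cmp_iov, 1, &wr_iov, 1,
						  0 /* offset_blocks */, 1 /* num_blocks */,
						  caw_done, NULL);

On bdevs without native COMPARE_AND_WRITE support the request is emulated exactly as implemented above: the LBA range is locked, the compare is issued, and the write follows only if the compare matched.
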
6590 : int
6591 2 : spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6592 : struct iovec *iov, int iovcnt,
6593 : uint64_t offset_blocks, uint64_t num_blocks,
6594 : bool populate,
6595 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6596 : {
6597 2 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6598 : struct spdk_bdev_io *bdev_io;
6599 2 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6600 :
6601 2 : if (!desc->write) {
6602 0 : return -EBADF;
6603 : }
6604 :
6605 2 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6606 0 : return -EINVAL;
6607 : }
6608 :
6609 2 : if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
6610 0 : return -ENOTSUP;
6611 : }
6612 :
6613 2 : bdev_io = bdev_channel_get_io(channel);
6614 2 : if (!bdev_io) {
6615 0 : return -ENOMEM;
6616 : }
6617 :
6618 2 : bdev_io->internal.ch = channel;
6619 2 : bdev_io->internal.desc = desc;
6620 2 : bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
6621 2 : bdev_io->u.bdev.num_blocks = num_blocks;
6622 2 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6623 2 : bdev_io->u.bdev.iovs = iov;
6624 2 : bdev_io->u.bdev.iovcnt = iovcnt;
6625 2 : bdev_io->u.bdev.md_buf = NULL;
6626 2 : bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
6627 2 : bdev_io->u.bdev.zcopy.commit = 0;
6628 2 : bdev_io->u.bdev.zcopy.start = 1;
6629 2 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6630 2 : bdev_io->u.bdev.memory_domain = NULL;
6631 2 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6632 2 : bdev_io->u.bdev.accel_sequence = NULL;
6633 :
6634 2 : bdev_io_submit(bdev_io);
6635 :
6636 2 : return 0;
6637 2 : }
6638 :
6639 : int
6640 2 : spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
6641 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6642 : {
6643 2 : if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
6644 0 : return -EINVAL;
6645 : }
6646 :
6647 2 : bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
6648 2 : bdev_io->u.bdev.zcopy.start = 0;
6649 2 : bdev_io->internal.caller_ctx = cb_arg;
6650 2 : bdev_io->internal.cb = cb;
6651 2 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6652 :
6653 2 : bdev_io_submit(bdev_io);
6654 :
6655 2 : return 0;
6656 2 : }
6657 :
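
spdk_bdev_zcopy_start() and spdk_bdev_zcopy_end() above form a two-phase API: start exposes (and optionally populates) the module's internal buffers, end commits or abandons them, reusing the same spdk_bdev_io. A hedged sketch of the populate-modify-commit flow, with my_desc/my_ch assumed open and the bdev assumed to support SPDK_BDEV_IO_TYPE_ZCOPY:

	static void
	zcopy_end_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	{
		spdk_bdev_free_io(bdev_io);
	}

	static void
	zcopy_start_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	{
		if (!success) {
			spdk_bdev_free_io(bdev_io);
			return;
		}
		/* bdev_io->u.bdev.iovs now describes the module's own buffers; modify the
		 * data in place, then commit it with the end call on the same bdev_io. */
		spdk_bdev_zcopy_end(bdev_io, true /* commit */, zcopy_end_done, NULL);
	}

	spdk_bdev_zcopy_start(my_desc, my_ch, NULL, 0, 0 /* offset_blocks */,
			      1 /* num_blocks */, true /* populate */, zcopy_start_done, NULL);
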
6658 : int
6659 0 : spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6660 : uint64_t offset, uint64_t len,
6661 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6662 : {
6663 : uint64_t offset_blocks, num_blocks;
6664 :
6665 0 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
6666 0 : return -EINVAL;
6667 : }
6668 :
6669 0 : return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6670 0 : }
6671 :
6672 : int
6673 33 : spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6674 : uint64_t offset_blocks, uint64_t num_blocks,
6675 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6676 : {
6677 33 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6678 : struct spdk_bdev_io *bdev_io;
6679 33 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6680 :
6681 33 : if (!desc->write) {
6682 0 : return -EBADF;
6683 : }
6684 :
6685 33 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6686 0 : return -EINVAL;
6687 : }
6688 :
6689 33 : if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6690 10 : !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6691 1 : return -ENOTSUP;
6692 : }
6693 :
6694 32 : bdev_io = bdev_channel_get_io(channel);
6695 :
6696 32 : if (!bdev_io) {
6697 0 : return -ENOMEM;
6698 : }
6699 :
6700 32 : bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6701 32 : bdev_io->internal.ch = channel;
6702 32 : bdev_io->internal.desc = desc;
6703 32 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6704 32 : bdev_io->u.bdev.num_blocks = num_blocks;
6705 32 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6706 32 : bdev_io->u.bdev.memory_domain = NULL;
6707 32 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6708 32 : bdev_io->u.bdev.accel_sequence = NULL;
6709 :
6710 : /* If the write_zeroes size is large and should be split, use the generic split
6711 :           * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6712 :           *
6713 :           * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6714 :           * or emulate it using a regular write request otherwise.
6715 : */
6716 32 : if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) ||
6717 9 : bdev_io->internal.f.split) {
6718 26 : bdev_io_submit(bdev_io);
6719 26 : return 0;
6720 : }
6721 :
6722 6 : assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
6723 :
6724 6 : return bdev_write_zero_buffer(bdev_io);
6725 33 : }
6726 :
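
As the comment above notes, spdk_bdev_write_zeroes_blocks() transparently falls back to writing an internal zero buffer when the module lacks WRITE_ZEROES support, so callers do not need to probe for the capability themselves. A minimal hedged sketch (my_desc/my_ch assumed to be an open writable descriptor and its channel):

	static void
	zeroes_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	{
		if (!success) {
			SPDK_ERRLOG("write_zeroes failed\n");
		}
		spdk_bdev_free_io(bdev_io);
	}

	int rc = spdk_bdev_write_zeroes_blocks(my_desc, my_ch, 0 /* offset_blocks */,
					       1024 /* num_blocks */, zeroes_done, NULL);
	if (rc != 0) {
		/* -EINVAL: range out of bounds, -EBADF: read-only descriptor,
		 * -ENOTSUP: neither WRITE_ZEROES nor WRITE supported,
		 * -ENOMEM: no spdk_bdev_io available right now. */
	}
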
6727 : int
6728 0 : spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6729 : uint64_t offset, uint64_t nbytes,
6730 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6731 : {
6732 : uint64_t offset_blocks, num_blocks;
6733 :
6734 0 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
6735 0 : return -EINVAL;
6736 : }
6737 :
6738 0 : return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6739 0 : }
6740 :
6741 : static void
6742 0 : bdev_io_complete_cb(void *ctx)
6743 : {
6744 0 : struct spdk_bdev_io *bdev_io = ctx;
6745 :
6746 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6747 0 : bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
6748 0 : }
6749 :
6750 : int
6751 22 : spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6752 : uint64_t offset_blocks, uint64_t num_blocks,
6753 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6754 : {
6755 22 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6756 : struct spdk_bdev_io *bdev_io;
6757 22 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6758 :
6759 22 : if (!desc->write) {
6760 0 : return -EBADF;
6761 : }
6762 :
6763 22 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6764 0 : return -EINVAL;
6765 : }
6766 :
6767 22 : bdev_io = bdev_channel_get_io(channel);
6768 22 : if (!bdev_io) {
6769 0 : return -ENOMEM;
6770 : }
6771 :
6772 22 : bdev_io->internal.ch = channel;
6773 22 : bdev_io->internal.desc = desc;
6774 22 : bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
6775 :
6776 22 : bdev_io->u.bdev.iovs = &bdev_io->iov;
6777 22 : bdev_io->u.bdev.iovs[0].iov_base = NULL;
6778 22 : bdev_io->u.bdev.iovs[0].iov_len = 0;
6779 22 : bdev_io->u.bdev.iovcnt = 1;
6780 :
6781 22 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6782 22 : bdev_io->u.bdev.num_blocks = num_blocks;
6783 22 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6784 22 : bdev_io->u.bdev.memory_domain = NULL;
6785 22 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6786 22 : bdev_io->u.bdev.accel_sequence = NULL;
6787 :
6788 22 : if (num_blocks == 0) {
6789 0 : spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
6790 0 : return 0;
6791 : }
6792 :
6793 22 : bdev_io_submit(bdev_io);
6794 22 : return 0;
6795 22 : }
6796 :
6797 : int
6798 0 : spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6799 : uint64_t offset, uint64_t length,
6800 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6801 : {
6802 : uint64_t offset_blocks, num_blocks;
6803 :
6804 0 : if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) {
6805 0 : return -EINVAL;
6806 : }
6807 :
6808 0 : return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6809 0 : }
6810 :
6811 : int
6812 2 : spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6813 : uint64_t offset_blocks, uint64_t num_blocks,
6814 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6815 : {
6816 2 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6817 : struct spdk_bdev_io *bdev_io;
6818 2 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6819 :
6820 2 : if (!desc->write) {
6821 0 : return -EBADF;
6822 : }
6823 :
6824 2 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) {
6825 0 : return -ENOTSUP;
6826 : }
6827 :
6828 2 : if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6829 0 : return -EINVAL;
6830 : }
6831 :
6832 2 : bdev_io = bdev_channel_get_io(channel);
6833 2 : if (!bdev_io) {
6834 0 : return -ENOMEM;
6835 : }
6836 :
6837 2 : bdev_io->internal.ch = channel;
6838 2 : bdev_io->internal.desc = desc;
6839 2 : bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6840 2 : bdev_io->u.bdev.iovs = NULL;
6841 2 : bdev_io->u.bdev.iovcnt = 0;
6842 2 : bdev_io->u.bdev.offset_blocks = offset_blocks;
6843 2 : bdev_io->u.bdev.num_blocks = num_blocks;
6844 2 : bdev_io->u.bdev.memory_domain = NULL;
6845 2 : bdev_io->u.bdev.memory_domain_ctx = NULL;
6846 2 : bdev_io->u.bdev.accel_sequence = NULL;
6847 2 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
6848 :
6849 2 : bdev_io_submit(bdev_io);
6850 2 : return 0;
6851 2 : }
6852 :
6853 : static int bdev_reset_poll_for_outstanding_io(void *ctx);
6854 :
6855 : static void
6856 13 : bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6857 : {
6858 13 : struct spdk_bdev_io *bdev_io = _ctx;
6859 13 : struct spdk_bdev_channel *ch = bdev_io->internal.ch;
6860 :
6861 13 : if (status == -EBUSY) {
6862 9 : if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6863 8 : bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6864 : bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6865 8 : } else {
6866 1 : if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6867 : /* If outstanding IOs are still present and reset_io_drain_timeout
6868 :                            * seconds have passed, start the reset. */
6869 1 : bdev_io_submit_reset(bdev_io);
6870 1 : } else {
6871 :                          /* We still have an in-progress memory domain pull/push or we're
6872 :                           * executing an accel sequence. Since we cannot abort either of those
6873 : * operations, fail the reset request. */
6874 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6875 : }
6876 : }
6877 9 : } else {
6878 4 : SPDK_DEBUGLOG(bdev,
6879 : "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6880 : ch->bdev->name);
6881 : /* Mark the completion status as a SUCCESS and complete the reset. */
6882 4 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6883 : }
6884 13 : }
6885 :
6886 : static void
6887 13 : bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6888 : struct spdk_io_channel *io_ch, void *_ctx)
6889 : {
6890 13 : struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6891 13 : int status = 0;
6892 :
6893 17 : if (cur_ch->io_outstanding > 0 ||
6894 4 : !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6895 4 : !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6896 : /* If a channel has outstanding IO, set status to -EBUSY code. This will stop
6897 : * further iteration over the rest of the channels and pass non-zero status
6898 : * to the callback function. */
6899 9 : status = -EBUSY;
6900 9 : }
6901 13 : spdk_bdev_for_each_channel_continue(i, status);
6902 13 : }
6903 :
6904 : static int
6905 8 : bdev_reset_poll_for_outstanding_io(void *ctx)
6906 : {
6907 8 : struct spdk_bdev_io *bdev_io = ctx;
6908 :
6909 8 : spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
6910 8 : spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io,
6911 : bdev_reset_check_outstanding_io_done);
6912 :
6913 8 : return SPDK_POLLER_BUSY;
6914 : }
6915 :
6916 : static void
6917 17 : bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
6918 : {
6919 17 : struct spdk_bdev_io *bdev_io = _ctx;
6920 :
6921 17 : if (bdev->reset_io_drain_timeout == 0) {
6922 12 : bdev_io_submit_reset(bdev_io);
6923 12 : return;
6924 : }
6925 :
6926 10 : bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
6927 5 : (bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
6928 :
6929 : /* In case bdev->reset_io_drain_timeout is not equal to zero,
6930 :          * submit the reset to the underlying module only if outstanding I/Os
6931 : * remain after reset_io_drain_timeout seconds have passed. */
6932 5 : spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io,
6933 : bdev_reset_check_outstanding_io_done);
6934 17 : }
6935 :
6936 : static void
6937 20 : bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6938 : struct spdk_io_channel *ch, void *_ctx)
6939 : {
6940 : struct spdk_bdev_channel *channel;
6941 : struct spdk_bdev_mgmt_channel *mgmt_channel;
6942 : struct spdk_bdev_shared_resource *shared_resource;
6943 :
6944 20 : channel = __io_ch_to_bdev_ch(ch);
6945 20 : shared_resource = channel->shared_resource;
6946 20 : mgmt_channel = shared_resource->mgmt_ch;
6947 :
6948 20 : channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
6949 :
6950 : /**
6951 : * Abort nomem I/Os first so that aborting other queued I/Os won't resubmit
6952 : * nomem I/Os of this channel.
6953 : */
6954 20 : bdev_abort_all_nomem_io(channel);
6955 :
6956 20 : if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
6957 2 : bdev_abort_all_queued_io(&channel->qos_queued_io, channel);
6958 2 : }
6959 :
6960 20 : bdev_abort_all_buf_io(mgmt_channel, channel);
6961 :
6962 20 : spdk_bdev_for_each_channel_continue(i, 0);
6963 20 : }
6964 :
6965 : static void
6966 19 : bdev_start_reset(struct spdk_bdev_io *bdev_io)
6967 : {
6968 19 : struct spdk_bdev *bdev = bdev_io->bdev;
6969 19 : bool freeze_channel = false;
6970 :
6971 19 : bdev_ch_add_to_io_submitted(bdev_io);
6972 :
6973 : /**
6974 : * Take a channel reference for the target bdev for the life of this
6975 : * reset. This guards against the channel getting destroyed before
6976 : * the reset is completed. We will release the reference when this
6977 : * reset is completed.
6978 : */
6979 19 : bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
6980 :
6981 19 : spdk_spin_lock(&bdev->internal.spinlock);
6982 19 : if (bdev->internal.reset_in_progress == NULL) {
6983 17 : bdev->internal.reset_in_progress = bdev_io;
6984 17 : freeze_channel = true;
6985 17 : } else {
6986 2 : TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link);
6987 : }
6988 19 : spdk_spin_unlock(&bdev->internal.spinlock);
6989 :
6990 19 : if (freeze_channel) {
6991 17 : spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io,
6992 : bdev_reset_freeze_channel_done);
6993 17 : }
6994 19 : }
6995 :
6996 : int
6997 19 : spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6998 : spdk_bdev_io_completion_cb cb, void *cb_arg)
6999 : {
7000 19 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7001 : struct spdk_bdev_io *bdev_io;
7002 19 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7003 :
7004 19 : bdev_io = bdev_channel_get_io(channel);
7005 19 : if (!bdev_io) {
7006 0 : return -ENOMEM;
7007 : }
7008 :
7009 19 : bdev_io->internal.ch = channel;
7010 19 : bdev_io->internal.desc = desc;
7011 19 : bdev_io->internal.submit_tsc = spdk_get_ticks();
7012 19 : bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
7013 19 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7014 :
7015 19 : bdev_start_reset(bdev_io);
7016 19 : return 0;
7017 19 : }
7018 :
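
A reset first freezes and drains every channel of the bdev (aborting nomem, QoS-queued and buffer-wait I/O, as implemented above) and, when reset_io_drain_timeout is non-zero, only reaches the module if I/O is still outstanding after that many seconds. Submitting one is a single call; a hedged sketch with my_desc/my_ch assumed open:

	static void
	reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	{
		SPDK_NOTICELOG("bdev reset %s\n", success ? "completed" : "failed");
		spdk_bdev_free_io(bdev_io);
	}

	int rc = spdk_bdev_reset(my_desc, my_ch, reset_done, NULL);
	if (rc == -ENOMEM) {
		/* No spdk_bdev_io available; retry later, e.g. via spdk_bdev_queue_io_wait(). */
	}
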
7019 : void
7020 0 : spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
7021 : struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode)
7022 : {
7023 0 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7024 :
7025 0 : bdev_get_io_stat(stat, channel->stat);
7026 0 : spdk_bdev_reset_io_stat(channel->stat, reset_mode);
7027 0 : }
7028 :
7029 : static void
7030 5 : bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
7031 : {
7032 5 : struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
7033 :
7034 10 : bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
7035 5 : bdev_iostat_ctx->cb_arg, 0);
7036 5 : free(bdev_iostat_ctx);
7037 5 : }
7038 :
7039 : static void
7040 4 : bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7041 : struct spdk_io_channel *ch, void *_ctx)
7042 : {
7043 4 : struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
7044 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7045 :
7046 4 : spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat);
7047 4 : spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode);
7048 4 : spdk_bdev_for_each_channel_continue(i, 0);
7049 4 : }
7050 :
7051 : void
7052 5 : spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
7053 : enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg)
7054 : {
7055 : struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
7056 :
7057 5 : assert(bdev != NULL);
7058 5 : assert(stat != NULL);
7059 5 : assert(cb != NULL);
7060 :
7061 5 : bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
7062 5 : if (bdev_iostat_ctx == NULL) {
7063 0 : SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
7064 0 : cb(bdev, stat, cb_arg, -ENOMEM);
7065 0 : return;
7066 : }
7067 :
7068 5 : bdev_iostat_ctx->stat = stat;
7069 5 : bdev_iostat_ctx->cb = cb;
7070 5 : bdev_iostat_ctx->cb_arg = cb_arg;
7071 5 : bdev_iostat_ctx->reset_mode = reset_mode;
7072 :
7073 : /* Start with the statistics from previously deleted channels. */
7074 5 : spdk_spin_lock(&bdev->internal.spinlock);
7075 5 : bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat);
7076 5 : spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode);
7077 5 : spdk_spin_unlock(&bdev->internal.spinlock);
7078 :
7079 : /* Then iterate and add the statistics from each existing channel. */
7080 5 : spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
7081 : bdev_get_device_stat_done);
7082 5 : }
7083 :
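
spdk_bdev_get_device_stat() above is asynchronous: it seeds the caller-provided structure with the statistics of already-deleted channels and then walks every live channel before firing the callback. A sketch under the assumption that the caller simply allocates and owns the spdk_bdev_io_stat structure for the duration of the call:

	static void
	stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, void *cb_arg, int rc)
	{
		if (rc == 0) {
			SPDK_NOTICELOG("%s: %" PRIu64 " reads, %" PRIu64 " writes\n",
				       spdk_bdev_get_name(bdev), stat->num_read_ops,
				       stat->num_write_ops);
		}
		free(stat);
	}

	struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));
	if (stat != NULL) {
		spdk_bdev_get_device_stat(bdev, stat, SPDK_BDEV_RESET_STAT_NONE, stat_done, NULL);
	}
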
7084 : struct bdev_iostat_reset_ctx {
7085 : enum spdk_bdev_reset_stat_mode mode;
7086 : bdev_reset_device_stat_cb cb;
7087 : void *cb_arg;
7088 : };
7089 :
7090 : static void
7091 0 : bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
7092 : {
7093 0 : struct bdev_iostat_reset_ctx *ctx = _ctx;
7094 :
7095 0 : ctx->cb(bdev, ctx->cb_arg, 0);
7096 :
7097 0 : free(ctx);
7098 0 : }
7099 :
7100 : static void
7101 0 : bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7102 : struct spdk_io_channel *ch, void *_ctx)
7103 : {
7104 0 : struct bdev_iostat_reset_ctx *ctx = _ctx;
7105 0 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7106 :
7107 0 : spdk_bdev_reset_io_stat(channel->stat, ctx->mode);
7108 :
7109 0 : spdk_bdev_for_each_channel_continue(i, 0);
7110 0 : }
7111 :
7112 : void
7113 0 : bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode,
7114 : bdev_reset_device_stat_cb cb, void *cb_arg)
7115 : {
7116 : struct bdev_iostat_reset_ctx *ctx;
7117 :
7118 0 : assert(bdev != NULL);
7119 0 : assert(cb != NULL);
7120 :
7121 0 : ctx = calloc(1, sizeof(*ctx));
7122 0 : if (ctx == NULL) {
7123 0 : SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
7124 0 : cb(bdev, cb_arg, -ENOMEM);
7125 0 : return;
7126 : }
7127 :
7128 0 : ctx->mode = mode;
7129 0 : ctx->cb = cb;
7130 0 : ctx->cb_arg = cb_arg;
7131 :
7132 0 : spdk_spin_lock(&bdev->internal.spinlock);
7133 0 : spdk_bdev_reset_io_stat(bdev->internal.stat, mode);
7134 0 : spdk_spin_unlock(&bdev->internal.spinlock);
7135 :
7136 0 : spdk_bdev_for_each_channel(bdev,
7137 : bdev_reset_each_channel_stat,
7138 0 : ctx,
7139 : bdev_reset_device_stat_done);
7140 0 : }
7141 :
7142 : int
7143 1 : spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7144 : const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
7145 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7146 : {
7147 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7148 : struct spdk_bdev_io *bdev_io;
7149 1 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7150 :
7151 1 : if (!desc->write) {
7152 0 : return -EBADF;
7153 : }
7154 :
7155 1 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
7156 1 : return -ENOTSUP;
7157 : }
7158 :
7159 0 : bdev_io = bdev_channel_get_io(channel);
7160 0 : if (!bdev_io) {
7161 0 : return -ENOMEM;
7162 : }
7163 :
7164 0 : bdev_io->internal.ch = channel;
7165 0 : bdev_io->internal.desc = desc;
7166 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
7167 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7168 0 : bdev_io->u.nvme_passthru.buf = buf;
7169 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7170 0 : bdev_io->u.nvme_passthru.md_buf = NULL;
7171 0 : bdev_io->u.nvme_passthru.md_len = 0;
7172 :
7173 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7174 :
7175 0 : bdev_io_submit(bdev_io);
7176 0 : return 0;
7177 1 : }
7178 :
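
The passthru entry points above hand a raw NVMe command to an NVMe-backed bdev and fail with -ENOTSUP otherwise. A hedged sketch of an admin passthru issuing IDENTIFY CONTROLLER; my_desc/my_ch, the admin_done callback and the 4 KiB DMA-able buffer buf (e.g. from spdk_dma_zmalloc()) are assumed to be provided by the caller:

	struct spdk_nvme_cmd cmd = {0};

	cmd.opc = SPDK_NVME_OPC_IDENTIFY;
	cmd.cdw10 = 1;	/* CNS 0x01: identify controller */

	int rc = spdk_bdev_nvme_admin_passthru(my_desc, my_ch, &cmd, buf, 4096,
					       admin_done, NULL);
	if (rc == -ENOTSUP) {
		/* The bdev does not support SPDK_BDEV_IO_TYPE_NVME_ADMIN. */
	}
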
7179 : int
7180 1 : spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7181 : const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
7182 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7183 : {
7184 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7185 : struct spdk_bdev_io *bdev_io;
7186 1 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7187 :
7188 1 : if (!desc->write) {
7189 : /*
7190 : * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7191 : * to easily determine if the command is a read or write, but for now just
7192 : * do not allow io_passthru with a read-only descriptor.
7193 : */
7194 0 : return -EBADF;
7195 : }
7196 :
7197 1 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
7198 1 : return -ENOTSUP;
7199 : }
7200 :
7201 0 : bdev_io = bdev_channel_get_io(channel);
7202 0 : if (!bdev_io) {
7203 0 : return -ENOMEM;
7204 : }
7205 :
7206 0 : bdev_io->internal.ch = channel;
7207 0 : bdev_io->internal.desc = desc;
7208 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
7209 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7210 0 : bdev_io->u.nvme_passthru.buf = buf;
7211 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7212 0 : bdev_io->u.nvme_passthru.md_buf = NULL;
7213 0 : bdev_io->u.nvme_passthru.md_len = 0;
7214 :
7215 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7216 :
7217 0 : bdev_io_submit(bdev_io);
7218 0 : return 0;
7219 1 : }
7220 :
7221 : int
7222 1 : spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7223 : const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
7224 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7225 : {
7226 1 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7227 : struct spdk_bdev_io *bdev_io;
7228 1 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7229 :
7230 1 : if (!desc->write) {
7231 : /*
7232 : * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7233 : * to easily determine if the command is a read or write, but for now just
7234 : * do not allow io_passthru with a read-only descriptor.
7235 : */
7236 0 : return -EBADF;
7237 : }
7238 :
7239 1 : if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
7240 1 : return -ENOTSUP;
7241 : }
7242 :
7243 0 : bdev_io = bdev_channel_get_io(channel);
7244 0 : if (!bdev_io) {
7245 0 : return -ENOMEM;
7246 : }
7247 :
7248 0 : bdev_io->internal.ch = channel;
7249 0 : bdev_io->internal.desc = desc;
7250 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
7251 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7252 0 : bdev_io->u.nvme_passthru.buf = buf;
7253 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7254 0 : bdev_io->u.nvme_passthru.md_buf = md_buf;
7255 0 : bdev_io->u.nvme_passthru.md_len = md_len;
7256 :
7257 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7258 :
7259 0 : bdev_io_submit(bdev_io);
7260 0 : return 0;
7261 1 : }
7262 :
7263 : int
7264 0 : spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc,
7265 : struct spdk_io_channel *ch,
7266 : const struct spdk_nvme_cmd *cmd,
7267 : struct iovec *iov, int iovcnt, size_t nbytes,
7268 : void *md_buf, size_t md_len,
7269 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7270 : {
7271 0 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7272 : struct spdk_bdev_io *bdev_io;
7273 0 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7274 :
7275 0 : if (!desc->write) {
7276 : /*
7277 : * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7278 : * to easily determine if the command is a read or write, but for now just
7279 : * do not allow io_passthru with a read-only descriptor.
7280 : */
7281 0 : return -EBADF;
7282 : }
7283 :
7284 0 : if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
7285 0 : return -ENOTSUP;
7286 0 : } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
7287 0 : return -ENOTSUP;
7288 : }
7289 :
7290 0 : bdev_io = bdev_channel_get_io(channel);
7291 0 : if (!bdev_io) {
7292 0 : return -ENOMEM;
7293 : }
7294 :
7295 0 : bdev_io->internal.ch = channel;
7296 0 : bdev_io->internal.desc = desc;
7297 0 : bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD;
7298 0 : bdev_io->u.nvme_passthru.cmd = *cmd;
7299 0 : bdev_io->u.nvme_passthru.iovs = iov;
7300 0 : bdev_io->u.nvme_passthru.iovcnt = iovcnt;
7301 0 : bdev_io->u.nvme_passthru.nbytes = nbytes;
7302 0 : bdev_io->u.nvme_passthru.md_buf = md_buf;
7303 0 : bdev_io->u.nvme_passthru.md_len = md_len;
7304 :
7305 0 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7306 :
7307 0 : bdev_io_submit(bdev_io);
7308 0 : return 0;
7309 0 : }
7310 :
7311 : static void bdev_abort_retry(void *ctx);
7312 : static void bdev_abort(struct spdk_bdev_io *parent_io);
7313 :
7314 : static void
7315 22 : bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
7316 : {
7317 22 : struct spdk_bdev_channel *channel = bdev_io->internal.ch;
7318 22 : struct spdk_bdev_io *parent_io = cb_arg;
7319 : struct spdk_bdev_io *bio_to_abort, *tmp_io;
7320 :
7321 22 : bio_to_abort = bdev_io->u.abort.bio_to_abort;
7322 :
7323 22 : spdk_bdev_free_io(bdev_io);
7324 :
7325 22 : if (!success) {
7326 : /* Check if the target I/O completed in the meantime. */
7327 2 : TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
7328 1 : if (tmp_io == bio_to_abort) {
7329 0 : break;
7330 : }
7331 1 : }
7332 :
7333 : /* If the target I/O still exists, set the parent to failed. */
7334 1 : if (tmp_io != NULL) {
7335 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7336 0 : }
7337 1 : }
7338 :
7339 22 : assert(parent_io->internal.f.split);
7340 :
7341 22 : parent_io->internal.split.outstanding--;
7342 22 : if (parent_io->internal.split.outstanding == 0) {
7343 16 : if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7344 0 : bdev_abort_retry(parent_io);
7345 0 : } else {
7346 16 : bdev_io_complete(parent_io);
7347 : }
7348 16 : }
7349 22 : }
7350 :
7351 : static int
7352 23 : bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
7353 : struct spdk_bdev_io *bio_to_abort,
7354 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7355 : {
7356 23 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7357 : struct spdk_bdev_io *bdev_io;
7358 :
7359 23 : if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
7360 23 : bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
7361 : /* TODO: Abort reset or abort request. */
7362 0 : return -ENOTSUP;
7363 : }
7364 :
7365 23 : bdev_io = bdev_channel_get_io(channel);
7366 23 : if (bdev_io == NULL) {
7367 1 : return -ENOMEM;
7368 : }
7369 :
7370 22 : bdev_io->internal.ch = channel;
7371 22 : bdev_io->internal.desc = desc;
7372 22 : bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7373 22 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7374 :
7375 22 : if (bio_to_abort->internal.f.split) {
7376 6 : assert(bdev_io_should_split(bio_to_abort));
7377 6 : bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
7378 :
7379 : /* Parent abort request is not submitted directly, but to manage its
7380 :                  * execution, add it to the submitted list here.
7381 : */
7382 6 : bdev_io->internal.submit_tsc = spdk_get_ticks();
7383 6 : bdev_ch_add_to_io_submitted(bdev_io);
7384 :
7385 6 : bdev_abort(bdev_io);
7386 :
7387 6 : return 0;
7388 : }
7389 :
7390 16 : bdev_io->u.abort.bio_to_abort = bio_to_abort;
7391 :
7392 : /* Submit the abort request to the underlying bdev module. */
7393 16 : bdev_io_submit(bdev_io);
7394 :
7395 16 : return 0;
7396 23 : }
7397 :
7398 : static bool
7399 46 : bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
7400 : {
7401 : struct spdk_bdev_io *iter;
7402 :
7403 46 : TAILQ_FOREACH(iter, tailq, internal.link) {
7404 0 : if (iter == bdev_io) {
7405 0 : return true;
7406 : }
7407 0 : }
7408 :
7409 46 : return false;
7410 46 : }
7411 :
7412 : static uint32_t
7413 18 : _bdev_abort(struct spdk_bdev_io *parent_io)
7414 : {
7415 18 : struct spdk_bdev_desc *desc = parent_io->internal.desc;
7416 18 : struct spdk_bdev_channel *channel = parent_io->internal.ch;
7417 : void *bio_cb_arg;
7418 : struct spdk_bdev_io *bio_to_abort;
7419 : uint32_t matched_ios;
7420 : int rc;
7421 :
7422 18 : bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
7423 :
7424 : /* matched_ios is returned and will be kept by the caller.
7425 : *
7426 :          * This function is used in two cases: 1) the same cb_arg is used for
7427 :          * multiple I/Os, and 2) a single large I/O is split into smaller ones.
7428 :          * Incrementing split_outstanding directly here may confuse readers, especially
7429 :          * in the 1st case.
7430 : *
7431 : * Completion of I/O abort is processed after stack unwinding. Hence this trick
7432 : * works as expected.
7433 : */
7434 18 : matched_ios = 0;
7435 18 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7436 :
7437 105 : TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
7438 88 : if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
7439 65 : continue;
7440 : }
7441 :
7442 23 : if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
7443 : /* Any I/O which was submitted after this abort command should be excluded. */
7444 0 : continue;
7445 : }
7446 :
7447 : /* We can't abort a request that's being pushed/pulled or executed by accel */
7448 23 : if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
7449 23 : bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
7450 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7451 0 : break;
7452 : }
7453 :
7454 23 : rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
7455 23 : if (rc != 0) {
7456 1 : if (rc == -ENOMEM) {
7457 1 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
7458 1 : } else {
7459 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7460 : }
7461 1 : break;
7462 : }
7463 22 : matched_ios++;
7464 22 : }
7465 :
7466 18 : return matched_ios;
7467 : }
7468 :
7469 : static void
7470 1 : bdev_abort_retry(void *ctx)
7471 : {
7472 1 : struct spdk_bdev_io *parent_io = ctx;
7473 : uint32_t matched_ios;
7474 :
7475 1 : matched_ios = _bdev_abort(parent_io);
7476 :
7477 1 : if (matched_ios == 0) {
7478 0 : if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7479 0 : bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7480 0 : } else {
7481 :                          /* For retry, the case where no target I/O was found is a success
7482 :                           * because it means the target I/Os completed in the meantime.
7483 : */
7484 0 : bdev_io_complete(parent_io);
7485 : }
7486 0 : return;
7487 : }
7488 :
7489 : /* Use split_outstanding to manage the progress of aborting I/Os. */
7490 1 : parent_io->internal.f.split = true;
7491 1 : parent_io->internal.split.outstanding = matched_ios;
7492 1 : }
7493 :
7494 : static void
7495 17 : bdev_abort(struct spdk_bdev_io *parent_io)
7496 : {
7497 : uint32_t matched_ios;
7498 :
7499 17 : matched_ios = _bdev_abort(parent_io);
7500 :
7501 17 : if (matched_ios == 0) {
7502 2 : if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7503 1 : bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7504 1 : } else {
7505 :                          /* The case where no target I/O was found is a failure. */
7506 1 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7507 1 : bdev_io_complete(parent_io);
7508 : }
7509 2 : return;
7510 : }
7511 :
7512 : /* Use split_outstanding to manage the progress of aborting I/Os. */
7513 15 : parent_io->internal.f.split = true;
7514 15 : parent_io->internal.split.outstanding = matched_ios;
7515 17 : }
7516 :
7517 : int
7518 12 : spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7519 : void *bio_cb_arg,
7520 : spdk_bdev_io_completion_cb cb, void *cb_arg)
7521 : {
7522 12 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7523 12 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7524 : struct spdk_bdev_io *bdev_io;
7525 :
7526 12 : if (bio_cb_arg == NULL) {
7527 0 : return -EINVAL;
7528 : }
7529 :
7530 12 : if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
7531 1 : return -ENOTSUP;
7532 : }
7533 :
7534 11 : bdev_io = bdev_channel_get_io(channel);
7535 11 : if (bdev_io == NULL) {
7536 0 : return -ENOMEM;
7537 : }
7538 :
7539 11 : bdev_io->internal.ch = channel;
7540 11 : bdev_io->internal.desc = desc;
7541 11 : bdev_io->internal.submit_tsc = spdk_get_ticks();
7542 11 : bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7543 11 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
7544 :
7545 11 : bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
7546 :
7547 : /* Parent abort request is not submitted directly, but to manage its execution,
7548 : * add it to the submitted list here.
7549 : */
7550 11 : bdev_ch_add_to_io_submitted(bdev_io);
7551 :
7552 11 : bdev_abort(bdev_io);
7553 :
7554 11 : return 0;
7555 12 : }
7556 :
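
spdk_bdev_abort() above targets every outstanding I/O on the same channel whose completion cb_arg equals bio_cb_arg, which is why callers typically tag a group of related I/Os with one shared context pointer. A hedged sketch, with my_request_ctx being whatever cb_arg the original I/Os were submitted with:

	static void
	abort_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	{
		/* success means every matched I/O was aborted or had already completed;
		 * an abort that matches nothing completes as failed. */
		spdk_bdev_free_io(bdev_io);
	}

	int rc = spdk_bdev_abort(my_desc, my_ch, my_request_ctx, abort_done, NULL);
	if (rc == -ENOTSUP) {
		/* The underlying module does not support SPDK_BDEV_IO_TYPE_ABORT. */
	}
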
7557 : int
7558 4 : spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
7559 : struct spdk_bdev_io_wait_entry *entry)
7560 : {
7561 4 : struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7562 4 : struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
7563 :
7564 4 : if (bdev != entry->bdev) {
7565 0 : SPDK_ERRLOG("bdevs do not match\n");
7566 0 : return -EINVAL;
7567 : }
7568 :
7569 4 : if (mgmt_ch->per_thread_cache_count > 0) {
7570 0 : SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
7571 0 : return -EINVAL;
7572 : }
7573 :
7574 4 : TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
7575 4 : return 0;
7576 4 : }
7577 :
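
spdk_bdev_queue_io_wait() above is the standard recovery path for the -ENOMEM returns sprinkled through this file: the entry's callback runs on this thread as soon as an spdk_bdev_io is freed back to the channel. A hedged retry sketch, assuming the caller keeps the wait entry embedded in its own context and that my_read_done is its normal completion callback:

	struct my_io_ctx {
		struct spdk_bdev_desc		*desc;
		struct spdk_io_channel		*ch;
		void				*buf;
		struct spdk_bdev_io_wait_entry	wait_entry;
	};

	static void
	my_submit_read(void *arg)
	{
		struct my_io_ctx *ctx = arg;
		int rc;

		rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, 0 /* offset_blocks */,
					   1 /* num_blocks */, my_read_done, ctx);
		if (rc == -ENOMEM) {
			/* Re-run this function once an spdk_bdev_io becomes available. */
			ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
			ctx->wait_entry.cb_fn = my_submit_read;
			ctx->wait_entry.cb_arg = ctx;
			spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
		}
	}
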
7578 : static inline void
7579 629 : bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
7580 : {
7581 629 : enum spdk_bdev_io_status io_status = bdev_io->internal.status;
7582 629 : struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
7583 629 : uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
7584 629 : uint32_t blocklen = bdev_io->bdev->blocklen;
7585 :
7586 629 : if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7587 520 : switch (bdev_io->type) {
7588 : case SPDK_BDEV_IO_TYPE_READ:
7589 321 : io_stat->bytes_read += num_blocks * blocklen;
7590 321 : io_stat->num_read_ops++;
7591 321 : io_stat->read_latency_ticks += tsc_diff;
7592 321 : if (io_stat->max_read_latency_ticks < tsc_diff) {
7593 7 : io_stat->max_read_latency_ticks = tsc_diff;
7594 7 : }
7595 321 : if (io_stat->min_read_latency_ticks > tsc_diff) {
7596 42 : io_stat->min_read_latency_ticks = tsc_diff;
7597 42 : }
7598 321 : break;
7599 : case SPDK_BDEV_IO_TYPE_WRITE:
7600 75 : io_stat->bytes_written += num_blocks * blocklen;
7601 75 : io_stat->num_write_ops++;
7602 75 : io_stat->write_latency_ticks += tsc_diff;
7603 75 : if (io_stat->max_write_latency_ticks < tsc_diff) {
7604 4 : io_stat->max_write_latency_ticks = tsc_diff;
7605 4 : }
7606 75 : if (io_stat->min_write_latency_ticks > tsc_diff) {
7607 25 : io_stat->min_write_latency_ticks = tsc_diff;
7608 25 : }
7609 75 : break;
7610 : case SPDK_BDEV_IO_TYPE_UNMAP:
7611 20 : io_stat->bytes_unmapped += num_blocks * blocklen;
7612 20 : io_stat->num_unmap_ops++;
7613 20 : io_stat->unmap_latency_ticks += tsc_diff;
7614 20 : if (io_stat->max_unmap_latency_ticks < tsc_diff) {
7615 0 : io_stat->max_unmap_latency_ticks = tsc_diff;
7616 0 : }
7617 20 : if (io_stat->min_unmap_latency_ticks > tsc_diff) {
7618 3 : io_stat->min_unmap_latency_ticks = tsc_diff;
7619 3 : }
7620 20 : break;
7621 : case SPDK_BDEV_IO_TYPE_ZCOPY:
7622 : /* Track the data in the start phase only */
7623 4 : if (bdev_io->u.bdev.zcopy.start) {
7624 2 : if (bdev_io->u.bdev.zcopy.populate) {
7625 1 : io_stat->bytes_read += num_blocks * blocklen;
7626 1 : io_stat->num_read_ops++;
7627 1 : io_stat->read_latency_ticks += tsc_diff;
7628 1 : if (io_stat->max_read_latency_ticks < tsc_diff) {
7629 0 : io_stat->max_read_latency_ticks = tsc_diff;
7630 0 : }
7631 1 : if (io_stat->min_read_latency_ticks > tsc_diff) {
7632 1 : io_stat->min_read_latency_ticks = tsc_diff;
7633 1 : }
7634 1 : } else {
7635 1 : io_stat->bytes_written += num_blocks * blocklen;
7636 1 : io_stat->num_write_ops++;
7637 1 : io_stat->write_latency_ticks += tsc_diff;
7638 1 : if (io_stat->max_write_latency_ticks < tsc_diff) {
7639 0 : io_stat->max_write_latency_ticks = tsc_diff;
7640 0 : }
7641 1 : if (io_stat->min_write_latency_ticks > tsc_diff) {
7642 1 : io_stat->min_write_latency_ticks = tsc_diff;
7643 1 : }
7644 : }
7645 2 : }
7646 4 : break;
7647 : case SPDK_BDEV_IO_TYPE_COPY:
7648 21 : io_stat->bytes_copied += num_blocks * blocklen;
7649 21 : io_stat->num_copy_ops++;
7650          21 :                 io_stat->copy_latency_ticks += tsc_diff;
7651 21 : if (io_stat->max_copy_latency_ticks < tsc_diff) {
7652 0 : io_stat->max_copy_latency_ticks = tsc_diff;
7653 0 : }
7654 21 : if (io_stat->min_copy_latency_ticks > tsc_diff) {
7655 4 : io_stat->min_copy_latency_ticks = tsc_diff;
7656 4 : }
7657 21 : break;
7658 : default:
7659 79 : break;
7660 : }
7661 629 : } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) {
7662 109 : io_stat = bdev_io->bdev->internal.stat;
7663 109 : assert(io_stat->io_error != NULL);
7664 :
7665 109 : spdk_spin_lock(&bdev_io->bdev->internal.spinlock);
7666 109 : io_stat->io_error->error_status[-io_status - 1]++;
7667 109 : spdk_spin_unlock(&bdev_io->bdev->internal.spinlock);
7668 109 : }
7669 :
7670 : #ifdef SPDK_CONFIG_VTUNE
7671 : uint64_t now_tsc = spdk_get_ticks();
7672 : if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
7673 : uint64_t data[5];
7674 : struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
7675 :
7676 : data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
7677 : data[1] = io_stat->bytes_read - prev_stat->bytes_read;
7678 : data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
7679 : data[3] = io_stat->bytes_written - prev_stat->bytes_written;
7680 : data[4] = bdev_io->bdev->fn_table->get_spin_time ?
7681 : bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
7682 :
7683 : __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
7684 : __itt_metadata_u64, 5, data);
7685 :
7686 : memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
7687 : bdev_io->internal.ch->start_tsc = now_tsc;
7688 : }
7689 : #endif
7690 629 : }
7691 :
7692 : static inline void
7693 629 : _bdev_io_complete(void *ctx)
7694 : {
7695 629 : struct spdk_bdev_io *bdev_io = ctx;
7696 :
7697 629 : if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) {
7698 0 : assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7699 0 : spdk_accel_sequence_abort(bdev_io->internal.accel_sequence);
7700 0 : }
7701 :
7702 629 : assert(bdev_io->internal.cb != NULL);
7703 629 : assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
7704 :
7705 1258 : bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
7706 629 : bdev_io->internal.caller_ctx);
7707 629 : }
7708 :
7709 : static inline void
7710 637 : bdev_io_complete(void *ctx)
7711 : {
7712 637 : struct spdk_bdev_io *bdev_io = ctx;
7713 637 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7714 : uint64_t tsc, tsc_diff;
7715 :
7716 637 : if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) {
7717 : /*
7718 : * Defer completion to avoid potential infinite recursion if the
7719 : * user's completion callback issues a new I/O.
7720 : */
7721 16 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7722 8 : bdev_io_complete, bdev_io);
7723 8 : return;
7724 : }
7725 :
7726 629 : tsc = spdk_get_ticks();
7727 629 : tsc_diff = tsc - bdev_io->internal.submit_tsc;
7728 :
7729 629 : bdev_ch_remove_from_io_submitted(bdev_io);
7730 629 : spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io,
7731 : bdev_io->internal.caller_ctx, bdev_ch->queue_depth);
7732 :
7733 629 : if (bdev_ch->histogram) {
7734 4 : if (bdev_io->bdev->internal.histogram_io_type == 0 ||
7735 0 : bdev_io->bdev->internal.histogram_io_type == bdev_io->type) {
7736 : /*
7737 : * Tally all I/O types if the histogram_io_type is set to 0.
7738 : */
7739 4 : spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff);
7740 4 : }
7741 4 : }
7742 :
7743 629 : bdev_io_update_io_stat(bdev_io, tsc_diff);
7744 629 : _bdev_io_complete(bdev_io);
7745 637 : }
7746 :
7747 : /* The difference between this function and bdev_io_complete() is that this should be called to
7748 : * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the
7749 : * io_submitted list and don't have submit_tsc updated.
7750 : */
7751 : static inline void
7752 0 : bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)
7753 : {
7754 : /* Since the IO hasn't been submitted it's bound to be failed */
7755 0 : assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7756 :
7757 : /* At this point we don't know if the IO is completed from submission context or not, but,
7758 : * since this is an error path, we can always do an spdk_thread_send_msg(). */
7759 0 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7760 0 : _bdev_io_complete, bdev_io);
7761 0 : }
7762 :
7763 : static void bdev_destroy_cb(void *io_device);
7764 :
7765 : static inline void
7766 19 : _bdev_reset_complete(void *ctx)
7767 : {
7768 19 : struct spdk_bdev_io *bdev_io = ctx;
7769 :
7770 : /* Put the channel reference we got in submission. */
7771 19 : assert(bdev_io->u.reset.ch_ref != NULL);
7772 19 : spdk_put_io_channel(bdev_io->u.reset.ch_ref);
7773 19 : bdev_io->u.reset.ch_ref = NULL;
7774 :
7775 19 : bdev_io_complete(bdev_io);
7776 19 : }
7777 :
7778 : static void
7779 17 : bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
7780 : {
7781 17 : struct spdk_bdev_io *bdev_io = _ctx;
7782 : bdev_io_tailq_t queued_resets;
7783 : struct spdk_bdev_io *queued_reset;
7784 :
7785 17 : assert(bdev_io == bdev->internal.reset_in_progress);
7786 :
7787 17 : TAILQ_INIT(&queued_resets);
7788 :
7789 17 : spdk_spin_lock(&bdev->internal.spinlock);
7790 17 : TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets,
7791 : spdk_bdev_io, internal.link);
7792 17 : bdev->internal.reset_in_progress = NULL;
7793 17 : spdk_spin_unlock(&bdev->internal.spinlock);
7794 :
7795 19 : while (!TAILQ_EMPTY(&queued_resets)) {
7796 2 : queued_reset = TAILQ_FIRST(&queued_resets);
7797 2 : TAILQ_REMOVE(&queued_resets, queued_reset, internal.link);
7798 2 : queued_reset->internal.status = bdev_io->internal.status;
7799 4 : spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset),
7800 2 : _bdev_reset_complete, queued_reset);
7801 : }
7802 :
7803 17 : _bdev_reset_complete(bdev_io);
7804 :
7805 17 : if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
7806 1 : TAILQ_EMPTY(&bdev->internal.open_descs)) {
7807 1 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7808 1 : }
7809 17 : }
7810 :
7811 : static void
7812 21 : bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7813 : struct spdk_io_channel *_ch, void *_ctx)
7814 : {
7815 21 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7816 :
7817 21 : ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
7818 :
7819 21 : spdk_bdev_for_each_channel_continue(i, 0);
7820 21 : }
7821 :
7822 : static void
7823 0 : bdev_io_complete_sequence_cb(void *ctx, int status)
7824 : {
7825 0 : struct spdk_bdev_io *bdev_io = ctx;
7826 :
7827 : /* u.bdev.accel_sequence should have already been cleared at this point */
7828 0 : assert(bdev_io->u.bdev.accel_sequence == NULL);
7829 0 : assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
7830 0 : bdev_io->internal.f.has_accel_sequence = false;
7831 :
7832 0 : if (spdk_unlikely(status != 0)) {
7833 0 : SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
7834 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7835 0 : }
7836 :
7837 0 : bdev_io_complete(bdev_io);
7838 0 : }
7839 :
7840 : void
7841 631 : spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
7842 : {
7843 631 : struct spdk_bdev *bdev = bdev_io->bdev;
7844 631 : struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7845 631 : struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
7846 :
7847 631 : if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) {
7848 0 : SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n",
7849 : spdk_bdev_get_module_name(bdev),
7850 : bdev_io_status_get_string(bdev_io->internal.status));
7851 0 : assert(false);
7852 : }
7853 631 : bdev_io->internal.status = status;
7854 :
7855 631 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
7856 17 : assert(bdev_io == bdev->internal.reset_in_progress);
7857 17 : spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
7858 : bdev_reset_complete);
7859 17 : return;
7860 : } else {
7861 614 : bdev_io_decrement_outstanding(bdev_ch, shared_resource);
7862 614 : if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7863 485 : if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
7864 0 : bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
7865 0 : return;
7866 485 : } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf &&
7867 : !bdev_io_use_accel_sequence(bdev_io))) {
7868 26 : _bdev_io_push_bounce_data_buffer(bdev_io,
7869 : _bdev_io_complete_push_bounce_done);
7870 : /* bdev IO will be completed in the callback */
7871 26 : return;
7872 : }
7873 459 : }
7874 :
7875 588 : if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) {
7876 21 : return;
7877 : }
7878 : }
7879 :
7880 567 : bdev_io_complete(bdev_io);
7881 631 : }
7882 :
7883 : void
7884 0 : spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
7885 : enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
7886 : {
7887 : enum spdk_bdev_io_status status;
7888 :
7889 0 : if (sc == SPDK_SCSI_STATUS_GOOD) {
7890 0 : status = SPDK_BDEV_IO_STATUS_SUCCESS;
7891 0 : } else {
7892 0 : status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
7893 0 : bdev_io->internal.error.scsi.sc = sc;
7894 0 : bdev_io->internal.error.scsi.sk = sk;
7895 0 : bdev_io->internal.error.scsi.asc = asc;
7896 0 : bdev_io->internal.error.scsi.ascq = ascq;
7897 : }
7898 :
7899 0 : spdk_bdev_io_complete(bdev_io, status);
7900 0 : }
7901 :
7902 : void
7903 0 : spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
7904 : int *sc, int *sk, int *asc, int *ascq)
7905 : {
7906 0 : assert(sc != NULL);
7907 0 : assert(sk != NULL);
7908 0 : assert(asc != NULL);
7909 0 : assert(ascq != NULL);
7910 :
7911 0 : switch (bdev_io->internal.status) {
7912 : case SPDK_BDEV_IO_STATUS_SUCCESS:
7913 0 : *sc = SPDK_SCSI_STATUS_GOOD;
7914 0 : *sk = SPDK_SCSI_SENSE_NO_SENSE;
7915 0 : *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7916 0 : *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7917 0 : break;
7918 : case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7919 0 : spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
7920 0 : break;
7921 : case SPDK_BDEV_IO_STATUS_MISCOMPARE:
7922 0 : *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7923 0 : *sk = SPDK_SCSI_SENSE_MISCOMPARE;
7924 0 : *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
7925 0 : *ascq = bdev_io->internal.error.scsi.ascq;
7926 0 : break;
7927 : case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7928 0 : *sc = bdev_io->internal.error.scsi.sc;
7929 0 : *sk = bdev_io->internal.error.scsi.sk;
7930 0 : *asc = bdev_io->internal.error.scsi.asc;
7931 0 : *ascq = bdev_io->internal.error.scsi.ascq;
7932 0 : break;
7933 : default:
7934 0 : *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7935 0 : *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
7936 0 : *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7937 0 : *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7938 0 : break;
7939 : }
7940 0 : }
7941 :
7942 : void
7943 0 : spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
7944 : {
7945 : enum spdk_bdev_io_status status;
7946 :
7947 0 : if (aio_result == 0) {
7948 0 : status = SPDK_BDEV_IO_STATUS_SUCCESS;
7949 0 : } else {
7950 0 : status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
7951 : }
7952 :
7953 0 : bdev_io->internal.error.aio_result = aio_result;
7954 :
7955 0 : spdk_bdev_io_complete(bdev_io, status);
7956 0 : }
7957 :
7958 : void
7959 0 : spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
7960 : {
7961 0 : assert(aio_result != NULL);
7962 :
7963 0 : if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
7964 0 : *aio_result = bdev_io->internal.error.aio_result;
7965 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7966 0 : *aio_result = 0;
7967 0 : } else {
7968 0 : *aio_result = -EIO;
7969 : }
7970 0 : }
7971 :
7972 : void
7973 0 : spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
7974 : {
7975 : enum spdk_bdev_io_status status;
7976 :
7977 0 : if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) {
7978 0 : status = SPDK_BDEV_IO_STATUS_SUCCESS;
7979 0 : } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
7980 0 : status = SPDK_BDEV_IO_STATUS_ABORTED;
7981 0 : } else {
7982 0 : status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
7983 : }
7984 :
7985 0 : bdev_io->internal.error.nvme.cdw0 = cdw0;
7986 0 : bdev_io->internal.error.nvme.sct = sct;
7987 0 : bdev_io->internal.error.nvme.sc = sc;
7988 :
7989 0 : spdk_bdev_io_complete(bdev_io, status);
7990 0 : }
7991 :
7992 : void
7993 0 : spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
7994 : {
7995 0 : assert(sct != NULL);
7996 0 : assert(sc != NULL);
7997 0 : assert(cdw0 != NULL);
7998 :
7999 0 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
8000 0 : *sct = SPDK_NVME_SCT_GENERIC;
8001 0 : *sc = SPDK_NVME_SC_SUCCESS;
8002 0 : if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
8003 0 : *cdw0 = 0;
8004 0 : } else {
8005 0 : *cdw0 = 1U;
8006 : }
8007 0 : return;
8008 : }
8009 :
8010 0 : if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
8011 0 : *sct = SPDK_NVME_SCT_GENERIC;
8012 0 : *sc = SPDK_NVME_SC_SUCCESS;
8013 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
8014 0 : *sct = bdev_io->internal.error.nvme.sct;
8015 0 : *sc = bdev_io->internal.error.nvme.sc;
8016 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
8017 0 : *sct = SPDK_NVME_SCT_GENERIC;
8018 0 : *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8019 0 : } else {
8020 0 : *sct = SPDK_NVME_SCT_GENERIC;
8021 0 : *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8022 : }
8023 :
8024 0 : *cdw0 = bdev_io->internal.error.nvme.cdw0;
8025 0 : }
8026 :
8027 : void
8028 0 : spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
8029 : int *first_sct, int *first_sc, int *second_sct, int *second_sc)
8030 : {
8031 0 : assert(first_sct != NULL);
8032 0 : assert(first_sc != NULL);
8033 0 : assert(second_sct != NULL);
8034 0 : assert(second_sc != NULL);
8035 0 : assert(cdw0 != NULL);
8036 :
8037 0 : if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
8038 0 : if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
8039 0 : bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
8040 0 : *first_sct = bdev_io->internal.error.nvme.sct;
8041 0 : *first_sc = bdev_io->internal.error.nvme.sc;
8042 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8043 0 : *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8044 0 : } else {
8045 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8046 0 : *first_sc = SPDK_NVME_SC_SUCCESS;
8047 0 : *second_sct = bdev_io->internal.error.nvme.sct;
8048 0 : *second_sc = bdev_io->internal.error.nvme.sc;
8049 : }
8050 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
8051 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8052 0 : *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8053 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8054 0 : *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8055 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
8056 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8057 0 : *first_sc = SPDK_NVME_SC_SUCCESS;
8058 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8059 0 : *second_sc = SPDK_NVME_SC_SUCCESS;
8060 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
8061 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8062 0 : *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8063 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8064 0 : *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8065 0 : } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
8066 0 : *first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
8067 0 : *first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
8068 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8069 0 : *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8070 0 : } else {
8071 0 : *first_sct = SPDK_NVME_SCT_GENERIC;
8072 0 : *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8073 0 : *second_sct = SPDK_NVME_SCT_GENERIC;
8074 0 : *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8075 : }
8076 :
8077 0 : *cdw0 = bdev_io->internal.error.nvme.cdw0;
8078 0 : }
8079 :
8080 : void
8081 0 : spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io,
8082 : const struct spdk_bdev_io *base_io)
8083 : {
8084 0 : switch (base_io->internal.status) {
8085 : case SPDK_BDEV_IO_STATUS_NVME_ERROR:
8086 0 : spdk_bdev_io_complete_nvme_status(bdev_io,
8087 0 : base_io->internal.error.nvme.cdw0,
8088 0 : base_io->internal.error.nvme.sct,
8089 0 : base_io->internal.error.nvme.sc);
8090 0 : break;
8091 : case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
8092 0 : spdk_bdev_io_complete_scsi_status(bdev_io,
8093 0 : base_io->internal.error.scsi.sc,
8094 0 : base_io->internal.error.scsi.sk,
8095 0 : base_io->internal.error.scsi.asc,
8096 0 : base_io->internal.error.scsi.ascq);
8097 0 : break;
8098 : case SPDK_BDEV_IO_STATUS_AIO_ERROR:
8099 0 : spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result);
8100 0 : break;
8101 : default:
8102 0 : spdk_bdev_io_complete(bdev_io, base_io->internal.status);
8103 0 : break;
8104 : }
8105 0 : }
8106 :
8107 : struct spdk_thread *
8108 681 : spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
8109 : {
8110 681 : return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
8111 : }
8112 :
8113 : struct spdk_io_channel *
8114 85 : spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
8115 : {
8116 85 : return bdev_io->internal.ch->channel;
8117 : }
8118 :
8119 : static int
8120 133 : bdev_register(struct spdk_bdev *bdev)
8121 : {
8122 : char *bdev_name;
8123 : char uuid[SPDK_UUID_STRING_LEN];
8124 : struct spdk_iobuf_opts iobuf_opts;
8125 : int ret;
8126 :
8127 133 : assert(bdev->module != NULL);
8128 :
8129 133 : if (!bdev->name) {
8130 0 : SPDK_ERRLOG("Bdev name is NULL\n");
8131 0 : return -EINVAL;
8132 : }
8133 :
8134 133 : if (!strlen(bdev->name)) {
8135 0 : SPDK_ERRLOG("Bdev name must not be an empty string\n");
8136 0 : return -EINVAL;
8137 : }
8138 :
8139 : /* Users often register their own I/O devices using the bdev name. In
8140 : * order to avoid conflicts, prepend bdev_. */
8141 133 : bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
8142 133 : if (!bdev_name) {
8143 0 : SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
8144 0 : return -ENOMEM;
8145 : }
8146 :
8147 133 : bdev->internal.stat = bdev_alloc_io_stat(true);
8148 133 : if (!bdev->internal.stat) {
8149 0 : SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
8150 0 : free(bdev_name);
8151 0 : return -ENOMEM;
8152 : }
8153 :
8154 133 : bdev->internal.status = SPDK_BDEV_STATUS_READY;
8155 133 : bdev->internal.measured_queue_depth = UINT64_MAX;
8156 133 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8157 133 : memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
8158 133 : bdev->internal.qd_poller = NULL;
8159 133 : bdev->internal.qos = NULL;
8160 :
8161 133 : TAILQ_INIT(&bdev->internal.open_descs);
8162 133 : TAILQ_INIT(&bdev->internal.locked_ranges);
8163 133 : TAILQ_INIT(&bdev->internal.pending_locked_ranges);
8164 133 : TAILQ_INIT(&bdev->internal.queued_resets);
8165 133 : TAILQ_INIT(&bdev->aliases);
8166 :
8167 : /* UUID may be specified by the user or defined by bdev itself.
8168 : * Otherwise it will be generated here, so this field will never be empty. */
8169 133 : if (spdk_uuid_is_null(&bdev->uuid)) {
8170 44 : spdk_uuid_generate(&bdev->uuid);
8171 44 : }
8172 :
8173 : /* Add the UUID alias only if it's different than the name */
8174 133 : spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
8175 133 : if (strcmp(bdev->name, uuid) != 0) {
8176 132 : ret = spdk_bdev_alias_add(bdev, uuid);
8177 132 : if (ret != 0) {
8178 2 : SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
8179 2 : bdev_free_io_stat(bdev->internal.stat);
8180 2 : free(bdev_name);
8181 2 : return ret;
8182 : }
8183 130 : }
8184 :
8185 131 : spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts));
8186 131 : if (spdk_bdev_get_buf_align(bdev) > 1) {
8187 0 : bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX,
8188 : iobuf_opts.large_bufsize / bdev->blocklen);
8189 0 : }
8190 :
8191 : /* If the user didn't specify a write unit size, set it to one. */
8192 131 : if (bdev->write_unit_size == 0) {
8193 127 : bdev->write_unit_size = 1;
8194 127 : }
8195 :
8196 : /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */
8197 131 : if (bdev->acwu == 0) {
8198 127 : bdev->acwu = bdev->write_unit_size;
8199 127 : }
8200 :
8201 131 : if (bdev->phys_blocklen == 0) {
8202 127 : bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
8203 127 : }
8204 :
8205 131 : if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
8206 0 : bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
8207 0 : }
8208 :
8209 131 : if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
8210 0 : bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE);
8211 0 : }
8212 :
8213 131 : bdev->internal.reset_in_progress = NULL;
8214 131 : bdev->internal.qd_poll_in_progress = false;
8215 131 : bdev->internal.period = 0;
8216 131 : bdev->internal.new_period = 0;
8217 131 : bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name);
8218 :
8219 : /*
8220 : * Initialize spinlock before registering IO device because spinlock is used in
8221 : * bdev_channel_create
8222 : */
8223 131 : spdk_spin_init(&bdev->internal.spinlock);
8224 :
8225 262 : spdk_io_device_register(__bdev_to_io_dev(bdev),
8226 : bdev_channel_create, bdev_channel_destroy,
8227 : sizeof(struct spdk_bdev_channel),
8228 131 : bdev_name);
8229 :
8230 : /*
8231 : * Register bdev name only after the bdev object is ready.
8232 : * After bdev_name_add returns, it is possible for other threads to start using the bdev
8233 : * and to create I/O channels for it.
8234 : */
8235 131 : ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
8236 131 : if (ret != 0) {
8237 0 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL);
8238 0 : bdev_free_io_stat(bdev->internal.stat);
8239 0 : spdk_spin_destroy(&bdev->internal.spinlock);
8240 0 : free(bdev_name);
8241 0 : return ret;
8242 : }
8243 :
8244 131 : free(bdev_name);
8245 :
8246 131 : SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
8247 131 : TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
8248 :
8249 131 : return 0;
8250 133 : }
8251 :
8252 : static void
8253 132 : bdev_destroy_cb(void *io_device)
8254 : {
8255 : int rc;
8256 : struct spdk_bdev *bdev;
8257 : spdk_bdev_unregister_cb cb_fn;
8258 : void *cb_arg;
8259 :
8260 132 : bdev = __bdev_from_io_dev(io_device);
8261 :
8262 132 : if (bdev->internal.unregister_td != spdk_get_thread()) {
8263 1 : spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device);
8264 1 : return;
8265 : }
8266 :
8267 131 : cb_fn = bdev->internal.unregister_cb;
8268 131 : cb_arg = bdev->internal.unregister_ctx;
8269 :
8270 131 : spdk_spin_destroy(&bdev->internal.spinlock);
8271 131 : free(bdev->internal.qos);
8272 131 : bdev_free_io_stat(bdev->internal.stat);
8273 131 : spdk_trace_unregister_owner(bdev->internal.trace_id);
8274 :
8275 131 : rc = bdev->fn_table->destruct(bdev->ctxt);
8276 131 : if (rc < 0) {
8277 0 : SPDK_ERRLOG("destruct failed\n");
8278 0 : }
8279 131 : if (rc <= 0 && cb_fn != NULL) {
8280 10 : cb_fn(cb_arg, rc);
8281 10 : }
8282 132 : }
8283 :
8284 : void
8285 2 : spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
8286 : {
8287 2 : if (bdev->internal.unregister_cb != NULL) {
8288 0 : bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
8289 0 : }
8290 2 : }
8291 :
8292 : static void
8293 19 : _remove_notify(void *arg)
8294 : {
8295 19 : struct spdk_bdev_desc *desc = arg;
8296 :
8297 19 : _event_notify(desc, SPDK_BDEV_EVENT_REMOVE);
8298 19 : }
8299 :
8300 : /* returns: 0 - bdev removed and ready to be destructed.
8301 : * -EBUSY - bdev can't be destructed yet. */
8302 : static int
8303 146 : bdev_unregister_unsafe(struct spdk_bdev *bdev)
8304 : {
8305 : struct spdk_bdev_desc *desc, *tmp;
8306 : struct spdk_bdev_alias *alias;
8307 146 : int rc = 0;
8308 : char uuid[SPDK_UUID_STRING_LEN];
8309 :
8310 146 : assert(spdk_spin_held(&g_bdev_mgr.spinlock));
8311 146 : assert(spdk_spin_held(&bdev->internal.spinlock));
8312 :
8313 : /* Notify each descriptor about hotremoval */
8314 165 : TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
8315 19 : rc = -EBUSY;
8316 : /*
8317 : * Defer invocation of the event_cb to a separate message that will
8318 : * run later on its thread. This ensures this context unwinds and
8319 : * we don't recursively unregister this bdev again if the event_cb
8320 : * immediately closes its descriptor.
8321 : */
8322 19 : event_notify(desc, _remove_notify);
8323 19 : }
8324 :
8325 : /* If there are no descriptors, proceed removing the bdev */
8326 146 : if (rc == 0) {
8327 131 : bdev_examine_allowlist_remove(bdev->name);
8328 260 : TAILQ_FOREACH(alias, &bdev->aliases, tailq) {
8329 129 : bdev_examine_allowlist_remove(alias->alias.name);
8330 129 : }
8331 131 : TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
8332 131 : SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
8333 :
8334 : /* Delete the name and the UUID alias */
8335 131 : spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
8336 131 : bdev_name_del_unsafe(&bdev->internal.bdev_name);
8337 131 : bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
8338 :
8339 131 : spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
8340 :
8341 131 : if (bdev->internal.reset_in_progress != NULL) {
8342 : /* If reset is in progress, let the completion callback for reset
8343 : * unregister the bdev.
8344 : */
8345 1 : rc = -EBUSY;
8346 1 : }
8347 131 : }
8348 :
8349 146 : return rc;
8350 : }
8351 :
8352 : static void
8353 4 : bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8354 : struct spdk_io_channel *io_ch, void *_ctx)
8355 : {
8356 4 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
8357 :
8358 4 : bdev_channel_abort_queued_ios(bdev_ch);
8359 4 : spdk_bdev_for_each_channel_continue(i, 0);
8360 4 : }
8361 :
8362 : static void
8363 131 : bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
8364 : {
8365 : int rc;
8366 :
8367 131 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8368 131 : spdk_spin_lock(&bdev->internal.spinlock);
8369 : /*
8370 : * Set the status to REMOVING only after aborting the channels has completed. Otherwise,
8371 : * the last spdk_bdev_close() may call spdk_io_device_unregister() while
8372 : * spdk_bdev_for_each_channel() is still executing, and spdk_io_device_unregister()
8373 : * may fail.
8374 : */
8375 131 : bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
8376 131 : rc = bdev_unregister_unsafe(bdev);
8377 131 : spdk_spin_unlock(&bdev->internal.spinlock);
8378 131 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8379 :
8380 131 : if (rc == 0) {
8381 115 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
8382 115 : }
8383 131 : }
8384 :
8385 : void
8386 138 : spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8387 : {
8388 : struct spdk_thread *thread;
8389 :
8390 138 : SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
8391 :
8392 138 : thread = spdk_get_thread();
8393 138 : if (!thread) {
8394 : /* The user called this from a non-SPDK thread. */
8395 0 : if (cb_fn != NULL) {
8396 0 : cb_fn(cb_arg, -ENOTSUP);
8397 0 : }
8398 0 : return;
8399 : }
8400 :
8401 138 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8402 138 : if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8403 138 : bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8404 7 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8405 7 : if (cb_fn) {
8406 0 : cb_fn(cb_arg, -EBUSY);
8407 0 : }
8408 7 : return;
8409 : }
8410 :
8411 131 : spdk_spin_lock(&bdev->internal.spinlock);
8412 131 : bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
8413 131 : bdev->internal.unregister_cb = cb_fn;
8414 131 : bdev->internal.unregister_ctx = cb_arg;
8415 131 : bdev->internal.unregister_td = thread;
8416 131 : spdk_spin_unlock(&bdev->internal.spinlock);
8417 131 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8418 :
8419 131 : spdk_bdev_set_qd_sampling_period(bdev, 0);
8420 :
8421 131 : spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
8422 : bdev_unregister);
8423 138 : }
8424 :
8425 : int
8426 4 : spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
8427 : spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8428 : {
8429 : struct spdk_bdev_desc *desc;
8430 : struct spdk_bdev *bdev;
8431 : int rc;
8432 :
8433 4 : rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
8434 4 : if (rc != 0) {
8435 1 : SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
8436 1 : return rc;
8437 : }
8438 :
8439 3 : bdev = spdk_bdev_desc_get_bdev(desc);
8440 :
8441 3 : if (bdev->module != module) {
8442 1 : spdk_bdev_close(desc);
8443 1 : SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
8444 : bdev_name);
8445 1 : return -ENODEV;
8446 : }
8447 :
8448 2 : spdk_bdev_unregister(bdev, cb_fn, cb_arg);
8449 :
8450 2 : spdk_bdev_close(desc);
8451 :
8452 2 : return 0;
8453 4 : }
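/*
 * Usage sketch: a module tearing down one of its own bdevs by name. The
 * module pointer must match the one used at registration time or -ENODEV is
 * returned. my_bdev_module, my_unregister_done and the bdev name are
 * hypothetical.
 *
 *     static void
 *     my_unregister_done(void *cb_arg, int rc)
 *     {
 *             SPDK_NOTICELOG("unregister finished: %d\n", rc);
 *     }
 *
 *     rc = spdk_bdev_unregister_by_name("my_bdev0", &my_bdev_module,
 *                                       my_unregister_done, NULL);
 */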
8454 :
8455 : static int
8456 271 : bdev_start_qos(struct spdk_bdev *bdev)
8457 : {
8458 : struct set_qos_limit_ctx *ctx;
8459 :
8460 : /* Enable QoS */
8461 271 : if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
8462 2 : ctx = calloc(1, sizeof(*ctx));
8463 2 : if (ctx == NULL) {
8464 0 : SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
8465 0 : return -ENOMEM;
8466 : }
8467 2 : ctx->bdev = bdev;
8468 2 : spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
8469 2 : }
8470 :
8471 271 : return 0;
8472 271 : }
8473 :
8474 : static void
8475 25 : log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail,
8476 : struct spdk_bdev *bdev)
8477 : {
8478 : enum spdk_bdev_claim_type type;
8479 : const char *typename, *modname;
8480 : extern struct spdk_log_flag SPDK_LOG_bdev;
8481 :
8482 25 : assert(spdk_spin_held(&bdev->internal.spinlock));
8483 :
8484 25 : if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) {
8485 0 : return;
8486 : }
8487 :
8488 25 : type = bdev->internal.claim_type;
8489 25 : typename = spdk_bdev_claim_get_name(type);
8490 :
8491 25 : if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) {
8492 6 : modname = bdev->internal.claim.v1.module->name;
8493 12 : spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8494 6 : bdev->name, detail, typename, modname);
8495 6 : return;
8496 : }
8497 :
8498 19 : if (claim_type_is_v2(type)) {
8499 : struct spdk_bdev_module_claim *claim;
8500 :
8501 38 : TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
8502 19 : modname = claim->module->name;
8503 38 : spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8504 19 : bdev->name, detail, typename, modname);
8505 19 : }
8506 19 : return;
8507 : }
8508 :
8509 0 : assert(false);
8510 25 : }
8511 :
8512 : static int
8513 280 : bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
8514 : {
8515 : struct spdk_thread *thread;
8516 280 : int rc = 0;
8517 :
8518 280 : thread = spdk_get_thread();
8519 280 : if (!thread) {
8520 0 : SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
8521 0 : return -ENOTSUP;
8522 : }
8523 :
8524 280 : SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
8525 : spdk_get_thread());
8526 :
8527 280 : desc->bdev = bdev;
8528 280 : desc->thread = thread;
8529 280 : desc->write = write;
8530 :
8531 280 : spdk_spin_lock(&bdev->internal.spinlock);
8532 280 : if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8533 280 : bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8534 3 : spdk_spin_unlock(&bdev->internal.spinlock);
8535 3 : return -ENODEV;
8536 : }
8537 :
8538 277 : if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8539 6 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8540 6 : spdk_spin_unlock(&bdev->internal.spinlock);
8541 6 : return -EPERM;
8542 : }
8543 :
8544 271 : rc = bdev_start_qos(bdev);
8545 271 : if (rc != 0) {
8546 0 : SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
8547 0 : spdk_spin_unlock(&bdev->internal.spinlock);
8548 0 : return rc;
8549 : }
8550 :
8551 271 : TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
8552 :
8553 271 : spdk_spin_unlock(&bdev->internal.spinlock);
8554 :
8555 271 : return 0;
8556 280 : }
8557 :
8558 : static void
8559 281 : bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size)
8560 : {
8561 281 : if (!opts) {
8562 0 : SPDK_ERRLOG("opts should not be NULL.\n");
8563 0 : return;
8564 : }
8565 :
8566 281 : if (!opts_size) {
8567 0 : SPDK_ERRLOG("opts_size should not be zero.\n");
8568 0 : return;
8569 : }
8570 :
8571 281 : memset(opts, 0, opts_size);
8572 281 : opts->size = opts_size;
8573 :
8574 : #define FIELD_OK(field) \
8575 : offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size
8576 :
8577 : #define SET_FIELD(field, value) \
8578 : if (FIELD_OK(field)) { \
8579 : opts->field = value; \
8580 : } \
8581 :
8582 281 : SET_FIELD(hide_metadata, false);
8583 :
8584 : #undef FIELD_OK
8585 : #undef SET_FIELD
8586 281 : }
8587 :
8588 : static void
8589 2 : bdev_open_opts_copy(struct spdk_bdev_open_opts *opts,
8590 : const struct spdk_bdev_open_opts *opts_src, size_t opts_size)
8591 : {
8592 2 : assert(opts);
8593 2 : assert(opts_src);
8594 :
8595 : #define SET_FIELD(field) \
8596 : if (offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \
8597 : opts->field = opts_src->field; \
8598 : } \
8599 :
8600 2 : SET_FIELD(hide_metadata);
8601 :
8602 2 : opts->size = opts_src->size;
8603 :
8604 : /* Do not remove this statement. When a new field is added, update the assert
8605 : * below and add a corresponding SET_FIELD statement.
8606 : */
8607 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size");
8608 :
8609 : #undef SET_FIELD
8610 2 : }
8611 :
8612 : void
8613 1 : spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size)
8614 : {
8615 : struct spdk_bdev_open_opts opts_local;
8616 :
8617 1 : bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local));
8618 1 : bdev_open_opts_copy(opts, &opts_local, opts_size);
8619 1 : }
8620 :
8621 : static int
8622 280 : bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
8623 : struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc)
8624 : {
8625 : struct spdk_bdev_desc *desc;
8626 : struct spdk_bdev_open_opts opts;
8627 : unsigned int i;
8628 :
8629 280 : bdev_open_opts_get_defaults(&opts, sizeof(opts));
8630 280 : if (user_opts != NULL) {
8631 1 : bdev_open_opts_copy(&opts, user_opts, user_opts->size);
8632 1 : }
8633 :
8634 280 : desc = calloc(1, sizeof(*desc));
8635 280 : if (desc == NULL) {
8636 0 : SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
8637 0 : return -ENOMEM;
8638 : }
8639 :
8640 280 : desc->opts = opts;
8641 :
8642 280 : TAILQ_INIT(&desc->pending_media_events);
8643 280 : TAILQ_INIT(&desc->free_media_events);
8644 :
8645 280 : desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
8646 280 : desc->callback.event_fn = event_cb;
8647 280 : desc->callback.ctx = event_ctx;
8648 280 : spdk_spin_init(&desc->spinlock);
8649 :
8650 280 : if (desc->opts.hide_metadata) {
8651 1 : if (spdk_bdev_is_md_separate(bdev)) {
8652 0 : SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n");
8653 0 : bdev_desc_free(desc);
8654 0 : return -EINVAL;
8655 : }
8656 1 : }
8657 :
8658 280 : if (bdev->media_events) {
8659 0 : desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
8660 : sizeof(*desc->media_events_buffer));
8661 0 : if (desc->media_events_buffer == NULL) {
8662 0 : SPDK_ERRLOG("Failed to initialize media event pool\n");
8663 0 : bdev_desc_free(desc);
8664 0 : return -ENOMEM;
8665 : }
8666 :
8667 0 : for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) {
8668 0 : TAILQ_INSERT_TAIL(&desc->free_media_events,
8669 : &desc->media_events_buffer[i], tailq);
8670 0 : }
8671 0 : }
8672 :
8673 280 : if (bdev->fn_table->accel_sequence_supported != NULL) {
8674 0 : for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
8675 0 : desc->accel_sequence_supported[i] =
8676 0 : bdev->fn_table->accel_sequence_supported(bdev->ctxt,
8677 0 : (enum spdk_bdev_io_type)i);
8678 0 : }
8679 0 : }
8680 :
8681 280 : *_desc = desc;
8682 :
8683 280 : return 0;
8684 280 : }
8685 :
8686 : static int
8687 137 : bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8688 : void *event_ctx, struct spdk_bdev_open_opts *opts,
8689 : struct spdk_bdev_desc **_desc)
8690 : {
8691 : struct spdk_bdev_desc *desc;
8692 : struct spdk_bdev *bdev;
8693 : int rc;
8694 :
8695 137 : bdev = bdev_get_by_name(bdev_name);
8696 :
8697 137 : if (bdev == NULL) {
8698 1 : SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
8699 1 : return -ENODEV;
8700 : }
8701 :
8702 136 : rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc);
8703 136 : if (rc != 0) {
8704 0 : return rc;
8705 : }
8706 :
8707 136 : rc = bdev_open(bdev, write, desc);
8708 136 : if (rc != 0) {
8709 7 : bdev_desc_free(desc);
8710 7 : desc = NULL;
8711 7 : }
8712 :
8713 136 : *_desc = desc;
8714 :
8715 136 : return rc;
8716 137 : }
8717 :
8718 : int
8719 139 : spdk_bdev_open_ext_v2(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8720 : void *event_ctx, struct spdk_bdev_open_opts *opts,
8721 : struct spdk_bdev_desc **_desc)
8722 : {
8723 : int rc;
8724 :
8725 139 : if (event_cb == NULL) {
8726 2 : SPDK_ERRLOG("Missing event callback function\n");
8727 2 : return -EINVAL;
8728 : }
8729 :
8730 137 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8731 137 : rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc);
8732 137 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8733 :
8734 137 : return rc;
8735 139 : }
8736 :
8737 : int
8738 137 : spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8739 : void *event_ctx, struct spdk_bdev_desc **_desc)
8740 : {
8741 137 : return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc);
8742 : }
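/*
 * Usage sketch: a consumer opens a bdev by name from an SPDK thread, passing
 * an event callback so it is notified about hot-removal, and then takes a
 * per-thread I/O channel from the descriptor. my_event_cb and the bdev name
 * are hypothetical.
 *
 *     static void
 *     my_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *     {
 *             if (type == SPDK_BDEV_EVENT_REMOVE) {
 *                     // Release channels and close the descriptor on its
 *                     // opening thread so the unregister can finish.
 *             }
 *     }
 *
 *     struct spdk_bdev_desc *desc;
 *     struct spdk_io_channel *ch;
 *     int rc;
 *
 *     rc = spdk_bdev_open_ext("my_bdev0", true, my_event_cb, NULL, &desc);
 *     if (rc == 0) {
 *             ch = spdk_bdev_get_io_channel(desc);
 *             // ... submit I/O on ch ...
 *             spdk_put_io_channel(ch);
 *             spdk_bdev_close(desc);
 *     }
 */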
8743 :
8744 : struct spdk_bdev_open_async_ctx {
8745 : char *bdev_name;
8746 : spdk_bdev_event_cb_t event_cb;
8747 : void *event_ctx;
8748 : bool write;
8749 : int rc;
8750 : spdk_bdev_open_async_cb_t cb_fn;
8751 : void *cb_arg;
8752 : struct spdk_bdev_desc *desc;
8753 : struct spdk_bdev_open_async_opts opts;
8754 : uint64_t start_ticks;
8755 : struct spdk_thread *orig_thread;
8756 : struct spdk_poller *poller;
8757 : TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq;
8758 : };
8759 :
8760 : static void
8761 0 : bdev_open_async_done(void *arg)
8762 : {
8763 0 : struct spdk_bdev_open_async_ctx *ctx = arg;
8764 :
8765 0 : ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg);
8766 :
8767 0 : free(ctx->bdev_name);
8768 0 : free(ctx);
8769 0 : }
8770 :
8771 : static void
8772 0 : bdev_open_async_cancel(void *arg)
8773 : {
8774 0 : struct spdk_bdev_open_async_ctx *ctx = arg;
8775 :
8776 0 : assert(ctx->rc == -ESHUTDOWN);
8777 :
8778 0 : spdk_poller_unregister(&ctx->poller);
8779 :
8780 0 : bdev_open_async_done(ctx);
8781 0 : }
8782 :
8783 : /* This is called when the bdev library finishes at shutdown. */
8784 : static void
8785 69 : bdev_open_async_fini(void)
8786 : {
8787 : struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx;
8788 :
8789 69 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8790 69 : TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) {
8791 0 : TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8792 : /*
8793 : * We have to move to ctx->orig_thread to unregister ctx->poller.
8794 : * However, there is a chance that ctx->poller is executed before
8795 : * the message is executed, which could result in bdev_open_async_done()
8796 : * being called twice. To avoid such a race condition, set ctx->rc to
8797 : * -ESHUTDOWN.
8798 : */
8799 0 : ctx->rc = -ESHUTDOWN;
8800 0 : spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx);
8801 0 : }
8802 69 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8803 69 : }
8804 :
8805 : static int bdev_open_async(void *arg);
8806 :
8807 : static void
8808 0 : _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx)
8809 : {
8810 : uint64_t timeout_ticks;
8811 :
8812 0 : if (ctx->rc == -ESHUTDOWN) {
8813 : /* This context is being canceled. Do nothing. */
8814 0 : return;
8815 : }
8816 :
8817 0 : ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx,
8818 0 : NULL, &ctx->desc);
8819 0 : if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) {
8820 0 : goto exit;
8821 : }
8822 :
8823 0 : timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull;
8824 0 : if (spdk_get_ticks() >= timeout_ticks) {
8825 0 : SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name);
8826 0 : ctx->rc = -ETIMEDOUT;
8827 0 : goto exit;
8828 : }
8829 :
8830 0 : return;
8831 :
8832 : exit:
8833 0 : spdk_poller_unregister(&ctx->poller);
8834 0 : TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8835 :
8836 : /* Completion callback is processed after stack unwinding. */
8837 0 : spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
8838 0 : }
8839 :
8840 : static int
8841 0 : bdev_open_async(void *arg)
8842 : {
8843 0 : struct spdk_bdev_open_async_ctx *ctx = arg;
8844 :
8845 0 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8846 :
8847 0 : _bdev_open_async(ctx);
8848 :
8849 0 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8850 :
8851 0 : return SPDK_POLLER_BUSY;
8852 : }
8853 :
8854 : static void
8855 0 : bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
8856 : struct spdk_bdev_open_async_opts *opts_src,
8857 : size_t size)
8858 : {
8859 0 : assert(opts);
8860 0 : assert(opts_src);
8861 :
8862 0 : opts->size = size;
8863 :
8864 : #define SET_FIELD(field) \
8865 : if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8866 : opts->field = opts_src->field; \
8867 : } \
8868 :
8869 0 : SET_FIELD(timeout_ms);
8870 :
8871 : /* Do not remove this statement. Always update it when adding a new field,
8872 : * and do not forget to add a SET_FIELD statement for the new field. */
8873 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size");
8874 :
8875 : #undef SET_FIELD
8876 0 : }
8877 :
8878 : static void
8879 0 : bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size)
8880 : {
8881 0 : assert(opts);
8882 :
8883 0 : opts->size = size;
8884 :
8885 : #define SET_FIELD(field, value) \
8886 : if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8887 : opts->field = value; \
8888 : } \
8889 :
8890 0 : SET_FIELD(timeout_ms, 0);
8891 :
8892 : #undef SET_FIELD
8893 0 : }
8894 :
8895 : int
8896 0 : spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8897 : void *event_ctx, struct spdk_bdev_open_async_opts *opts,
8898 : spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg)
8899 : {
8900 : struct spdk_bdev_open_async_ctx *ctx;
8901 :
8902 0 : if (event_cb == NULL) {
8903 0 : SPDK_ERRLOG("Missing event callback function\n");
8904 0 : return -EINVAL;
8905 : }
8906 :
8907 0 : if (open_cb == NULL) {
8908 0 : SPDK_ERRLOG("Missing open callback function\n");
8909 0 : return -EINVAL;
8910 : }
8911 :
8912 0 : if (opts != NULL && opts->size == 0) {
8913 0 : SPDK_ERRLOG("size in the options structure should not be zero\n");
8914 0 : return -EINVAL;
8915 : }
8916 :
8917 0 : ctx = calloc(1, sizeof(*ctx));
8918 0 : if (ctx == NULL) {
8919 0 : SPDK_ERRLOG("Failed to allocate open context\n");
8920 0 : return -ENOMEM;
8921 : }
8922 :
8923 0 : ctx->bdev_name = strdup(bdev_name);
8924 0 : if (ctx->bdev_name == NULL) {
8925 0 : SPDK_ERRLOG("Failed to duplicate bdev_name\n");
8926 0 : free(ctx);
8927 0 : return -ENOMEM;
8928 : }
8929 :
8930 0 : ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000);
8931 0 : if (ctx->poller == NULL) {
8932 0 : SPDK_ERRLOG("Failed to register bdev_open_async poller\n");
8933 0 : free(ctx->bdev_name);
8934 0 : free(ctx);
8935 0 : return -ENOMEM;
8936 : }
8937 :
8938 0 : ctx->cb_fn = open_cb;
8939 0 : ctx->cb_arg = open_cb_arg;
8940 0 : ctx->write = write;
8941 0 : ctx->event_cb = event_cb;
8942 0 : ctx->event_ctx = event_ctx;
8943 0 : ctx->orig_thread = spdk_get_thread();
8944 0 : ctx->start_ticks = spdk_get_ticks();
8945 :
8946 0 : bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts));
8947 0 : if (opts != NULL) {
8948 0 : bdev_open_async_opts_copy(&ctx->opts, opts, opts->size);
8949 0 : }
8950 :
8951 0 : spdk_spin_lock(&g_bdev_mgr.spinlock);
8952 :
8953 0 : TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8954 0 : _bdev_open_async(ctx);
8955 :
8956 0 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
8957 :
8958 0 : return 0;
8959 0 : }
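/*
 * Usage sketch: spdk_bdev_open_async() keeps retrying the open from a poller
 * until the bdev appears or the optional timeout expires, then reports the
 * result through the open callback. my_open_done, my_event_cb and the timeout
 * value are hypothetical.
 *
 *     static void
 *     my_open_done(struct spdk_bdev_desc *desc, int rc, void *cb_arg)
 *     {
 *             // rc == 0: desc is open; rc == -ETIMEDOUT: the bdev never appeared.
 *     }
 *
 *     struct spdk_bdev_open_async_opts opts = {};
 *
 *     opts.size = sizeof(opts);
 *     opts.timeout_ms = 5000;
 *     rc = spdk_bdev_open_async("my_bdev0", true, my_event_cb, NULL, &opts,
 *                               my_open_done, NULL);
 */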
8960 :
8961 : static void
8962 271 : bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
8963 : {
8964 : int rc;
8965 :
8966 271 : spdk_spin_lock(&bdev->internal.spinlock);
8967 271 : spdk_spin_lock(&desc->spinlock);
8968 :
8969 271 : TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
8970 :
8971 271 : desc->closed = true;
8972 :
8973 271 : if (desc->claim != NULL) {
8974 20 : bdev_desc_release_claims(desc);
8975 20 : }
8976 :
8977 271 : if (0 == desc->refs) {
8978 260 : spdk_spin_unlock(&desc->spinlock);
8979 260 : bdev_desc_free(desc);
8980 260 : } else {
8981 11 : spdk_spin_unlock(&desc->spinlock);
8982 : }
8983 :
8984 : /* If no more descriptors, kill QoS channel */
8985 271 : if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8986 7 : SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
8987 : bdev->name, spdk_get_thread());
8988 :
8989 7 : if (bdev_qos_destroy(bdev)) {
8990 : /* There isn't anything we can do to recover here. Just let the
8991 : * old QoS poller keep running. The QoS handling won't change
8992 : * cores when the user allocates a new channel, but it won't break. */
8993 0 : SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
8994 0 : }
8995 7 : }
8996 :
8997 271 : if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8998 15 : rc = bdev_unregister_unsafe(bdev);
8999 15 : spdk_spin_unlock(&bdev->internal.spinlock);
9000 :
9001 15 : if (rc == 0) {
9002 15 : spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
9003 15 : }
9004 15 : } else {
9005 256 : spdk_spin_unlock(&bdev->internal.spinlock);
9006 : }
9007 271 : }
9008 :
9009 : void
9010 129 : spdk_bdev_close(struct spdk_bdev_desc *desc)
9011 : {
9012 129 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9013 :
9014 129 : SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
9015 : spdk_get_thread());
9016 :
9017 129 : assert(desc->thread == spdk_get_thread());
9018 :
9019 129 : spdk_poller_unregister(&desc->io_timeout_poller);
9020 :
9021 129 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9022 :
9023 129 : bdev_close(bdev, desc);
9024 :
9025 129 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9026 129 : }
9027 :
9028 : int32_t
9029 3 : spdk_bdev_get_numa_id(struct spdk_bdev *bdev)
9030 : {
9031 3 : if (bdev->numa.id_valid) {
9032 2 : return bdev->numa.id;
9033 : } else {
9034 1 : return SPDK_ENV_NUMA_ID_ANY;
9035 : }
9036 3 : }
9037 :
9038 : static void
9039 131 : bdev_register_finished(void *arg)
9040 : {
9041 131 : struct spdk_bdev_desc *desc = arg;
9042 131 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9043 :
9044 131 : spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
9045 :
9046 131 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9047 :
9048 131 : bdev_close(bdev, desc);
9049 :
9050 131 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9051 131 : }
9052 :
9053 : int
9054 134 : spdk_bdev_register(struct spdk_bdev *bdev)
9055 : {
9056 : struct spdk_bdev_desc *desc;
9057 134 : struct spdk_thread *thread = spdk_get_thread();
9058 : int rc;
9059 :
9060 134 : if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) {
9061 1 : SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread,
9062 : thread ? spdk_thread_get_name(thread) : "null");
9063 1 : return -EINVAL;
9064 : }
9065 :
9066 133 : rc = bdev_register(bdev);
9067 133 : if (rc != 0) {
9068 2 : return rc;
9069 : }
9070 :
9071 : /* A descriptor is opened to prevent bdev deletion during examination */
9072 131 : rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9073 131 : if (rc != 0) {
9074 0 : spdk_bdev_unregister(bdev, NULL, NULL);
9075 0 : return rc;
9076 : }
9077 :
9078 131 : rc = bdev_open(bdev, false, desc);
9079 131 : if (rc != 0) {
9080 0 : bdev_desc_free(desc);
9081 0 : spdk_bdev_unregister(bdev, NULL, NULL);
9082 0 : return rc;
9083 : }
9084 :
9085 : /* Examine configuration before initializing I/O */
9086 131 : bdev_examine(bdev);
9087 :
9088 131 : rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
9089 131 : if (rc != 0) {
9090 0 : bdev_close(bdev, desc);
9091 0 : spdk_bdev_unregister(bdev, NULL, NULL);
9092 0 : }
9093 :
9094 131 : return rc;
9095 134 : }
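/*
 * Usage sketch of the registration path from a backend module's point of
 * view. Only the commonly required fields are shown; everything prefixed with
 * my_ is hypothetical and the I/O path is stubbed out. Registration must run
 * on the app thread.
 *
 *     static int my_destruct(void *ctx) { return 0; }
 *
 *     static void my_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *io)
 *     {
 *             // Hand the request to the backend; failed here only for brevity.
 *             spdk_bdev_io_complete(io, SPDK_BDEV_IO_STATUS_FAILED);
 *     }
 *
 *     static bool my_io_type_supported(void *ctx, enum spdk_bdev_io_type type)
 *     {
 *             return type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE;
 *     }
 *
 *     static struct spdk_io_channel *my_get_io_channel(void *ctx)
 *     {
 *             // Assumes ctx was registered earlier with spdk_io_device_register().
 *             return spdk_get_io_channel(ctx);
 *     }
 *
 *     static const struct spdk_bdev_fn_table my_fn_table = {
 *             .destruct          = my_destruct,
 *             .submit_request    = my_submit_request,
 *             .io_type_supported = my_io_type_supported,
 *             .get_io_channel    = my_get_io_channel,
 *     };
 *
 *     bdev->name         = "my_bdev0";
 *     bdev->product_name = "My bdev";
 *     bdev->blocklen     = 512;
 *     bdev->blockcnt     = 1024 * 1024;
 *     bdev->module       = &my_bdev_module;
 *     bdev->fn_table     = &my_fn_table;
 *     rc = spdk_bdev_register(bdev);
 */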
9096 :
9097 : int
9098 26 : spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
9099 : struct spdk_bdev_module *module)
9100 : {
9101 26 : spdk_spin_lock(&bdev->internal.spinlock);
9102 :
9103 26 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
9104 6 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9105 6 : spdk_spin_unlock(&bdev->internal.spinlock);
9106 6 : return -EPERM;
9107 : }
9108 :
9109 20 : if (desc && !desc->write) {
9110 5 : desc->write = true;
9111 5 : }
9112 :
9113 20 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE;
9114 20 : bdev->internal.claim.v1.module = module;
9115 :
9116 20 : spdk_spin_unlock(&bdev->internal.spinlock);
9117 20 : return 0;
9118 26 : }
9119 :
9120 : void
9121 8 : spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
9122 : {
9123 8 : spdk_spin_lock(&bdev->internal.spinlock);
9124 :
9125 8 : assert(bdev->internal.claim.v1.module != NULL);
9126 8 : assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE);
9127 8 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
9128 8 : bdev->internal.claim.v1.module = NULL;
9129 :
9130 8 : spdk_spin_unlock(&bdev->internal.spinlock);
9131 8 : }
9132 :
9133 : /*
9134 : * Start claims v2
9135 : */
9136 :
9137 : const char *
9138 25 : spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type)
9139 : {
9140 25 : switch (type) {
9141 : case SPDK_BDEV_CLAIM_NONE:
9142 0 : return "not_claimed";
9143 : case SPDK_BDEV_CLAIM_EXCL_WRITE:
9144 6 : return "exclusive_write";
9145 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9146 8 : return "read_many_write_one";
9147 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9148 5 : return "read_many_write_none";
9149 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9150 6 : return "read_many_write_many";
9151 : default:
9152 0 : break;
9153 : }
9154 0 : return "invalid_claim";
9155 25 : }
9156 :
9157 : static bool
9158 115 : claim_type_is_v2(enum spdk_bdev_claim_type type)
9159 : {
9160 115 : switch (type) {
9161 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9162 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9163 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9164 115 : return true;
9165 : default:
9166 0 : break;
9167 : }
9168 0 : return false;
9169 115 : }
9170 :
9171 : /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */
9172 : static bool
9173 17 : claim_type_promotes_to_write(enum spdk_bdev_claim_type type)
9174 : {
9175 17 : switch (type) {
9176 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9177 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9178 6 : return true;
9179 : default:
9180 11 : break;
9181 : }
9182 11 : return false;
9183 17 : }
9184 :
9185 : void
9186 57 : spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size)
9187 : {
9188 57 : if (opts == NULL) {
9189 0 : SPDK_ERRLOG("opts should not be NULL\n");
9190 0 : assert(opts != NULL);
9191 0 : return;
9192 : }
9193 57 : if (size == 0) {
9194 0 : SPDK_ERRLOG("size should not be zero\n");
9195 0 : assert(size != 0);
9196 0 : return;
9197 : }
9198 :
9199 57 : memset(opts, 0, size);
9200 57 : opts->opts_size = size;
9201 :
9202 : #define FIELD_OK(field) \
9203 : offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size
9204 :
9205 : #define SET_FIELD(field, value) \
9206 : if (FIELD_OK(field)) { \
9207 : opts->field = value; \
9208 : } \
9209 :
9210 57 : SET_FIELD(shared_claim_key, 0);
9211 :
9212 : #undef FIELD_OK
9213 : #undef SET_FIELD
9214 57 : }
9215 :
9216 : static int
9217 22 : claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst)
9218 : {
9219 22 : if (src->opts_size == 0) {
9220 0 : SPDK_ERRLOG("size should not be zero\n");
9221 0 : return -1;
9222 : }
9223 :
9224 22 : memset(dst, 0, sizeof(*dst));
9225 22 : dst->opts_size = src->opts_size;
9226 :
9227 : #define FIELD_OK(field) \
9228 : offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size
9229 :
9230 : #define SET_FIELD(field) \
9231 : if (FIELD_OK(field)) { \
9232 : dst->field = src->field; \
9233 : } \
9234 :
9235 22 : if (FIELD_OK(name)) {
9236 22 : snprintf(dst->name, sizeof(dst->name), "%s", src->name);
9237 22 : }
9238 :
9239 22 : SET_FIELD(shared_claim_key);
9240 :
9241 : /* Do not remove this statement. When a new field is added, update the assert
9242 : * below and add a corresponding SET_FIELD statement. */
9243 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size");
9244 :
9245 : #undef FIELD_OK
9246 : #undef SET_FIELD
9247 22 : return 0;
9248 22 : }
9249 :
9250 : /* Returns 0 if a read-write-once claim can be taken. */
9251 : static int
9252 10 : claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9253 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9254 : {
9255 10 : struct spdk_bdev *bdev = desc->bdev;
9256 : struct spdk_bdev_desc *open_desc;
9257 :
9258 10 : assert(spdk_spin_held(&bdev->internal.spinlock));
9259 10 : assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);
9260 :
9261 10 : if (opts->shared_claim_key != 0) {
9262 1 : SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
9263 : bdev->name);
9264 1 : return -EINVAL;
9265 : }
9266 9 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
9267 1 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9268 1 : return -EPERM;
9269 : }
9270 8 : if (desc->claim != NULL) {
9271 0 : SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
9272 : bdev->name, desc->claim->module->name);
9273 0 : return -EPERM;
9274 : }
9275 16 : TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9276 10 : if (desc != open_desc && open_desc->write) {
9277 2 : SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
9278 : "another descriptor is open for writing\n",
9279 : bdev->name);
9280 2 : return -EPERM;
9281 : }
9282 8 : }
9283 :
9284 6 : return 0;
9285 10 : }
9286 :
9287 : /* Returns 0 if a read-only-many claim can be taken. */
9288 : static int
9289 15 : claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9290 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9291 : {
9292 15 : struct spdk_bdev *bdev = desc->bdev;
9293 : struct spdk_bdev_desc *open_desc;
9294 :
9295 15 : assert(spdk_spin_held(&bdev->internal.spinlock));
9296 15 : assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
9297 15 : assert(desc->claim == NULL);
9298 :
9299 15 : if (desc->write) {
9300 3 : SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
9301 : bdev->name);
9302 3 : return -EINVAL;
9303 : }
9304 12 : if (opts->shared_claim_key != 0) {
9305 1 : SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
9306 1 : return -EINVAL;
9307 : }
9308 11 : if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9309 19 : TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9310 11 : if (open_desc->write) {
9311 0 : SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
9312 : "another descriptor is open for writing\n",
9313 : bdev->name);
9314 0 : return -EPERM;
9315 : }
9316 11 : }
9317 8 : }
9318 :
9319 11 : return 0;
9320 15 : }
9321 :
9322 : /* Returns 0 if a read-write-many claim can be taken. */
9323 : static int
9324 8 : claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9325 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9326 : {
9327 8 : struct spdk_bdev *bdev = desc->bdev;
9328 : struct spdk_bdev_desc *open_desc;
9329 :
9330 8 : assert(spdk_spin_held(&bdev->internal.spinlock));
9331 8 : assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
9332 8 : assert(desc->claim == NULL);
9333 :
9334 8 : if (opts->shared_claim_key == 0) {
9335 2 : SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
9336 : bdev->name);
9337 2 : return -EINVAL;
9338 : }
9339 6 : switch (bdev->internal.claim_type) {
9340 : case SPDK_BDEV_CLAIM_NONE:
9341 7 : TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9342 5 : if (open_desc == desc) {
9343 3 : continue;
9344 : }
9345 2 : if (open_desc->write) {
9346 2 : SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
9347 : "another descriptor is open for writing without a "
9348 : "claim\n", bdev->name);
9349 2 : return -EPERM;
9350 : }
9351 0 : }
9352 2 : break;
9353 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9354 2 : if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
9355 1 : LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
9356 1 : return -EPERM;
9357 : }
9358 1 : break;
9359 : default:
9360 0 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9361 0 : return -EBUSY;
9362 : }
9363 :
9364 3 : return 0;
9365 8 : }
9366 :
9367 : /* Updates desc and its bdev with a v2 claim. */
9368 : static int
9369 20 : claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9370 : struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9371 : {
9372 20 : struct spdk_bdev *bdev = desc->bdev;
9373 : struct spdk_bdev_module_claim *claim;
9374 :
9375 20 : assert(spdk_spin_held(&bdev->internal.spinlock));
9376 20 : assert(claim_type_is_v2(type));
9377 20 : assert(desc->claim == NULL);
9378 :
9379 20 : claim = calloc(1, sizeof(*desc->claim));
9380 20 : if (claim == NULL) {
9381 0 : SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
9382 0 : return -ENOMEM;
9383 : }
9384 20 : claim->module = module;
9385 20 : claim->desc = desc;
9386 : SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
9387 20 : memcpy(claim->name, opts->name, sizeof(claim->name));
9388 20 : desc->claim = claim;
9389 :
9390 20 : if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9391 16 : bdev->internal.claim_type = type;
9392 16 : TAILQ_INIT(&bdev->internal.claim.v2.claims);
9393 16 : bdev->internal.claim.v2.key = opts->shared_claim_key;
9394 16 : }
9395 20 : assert(type == bdev->internal.claim_type);
9396 :
9397 20 : TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);
9398 :
9399 20 : if (!desc->write && claim_type_promotes_to_write(type)) {
9400 6 : desc->write = true;
9401 6 : }
9402 :
9403 20 : return 0;
9404 20 : }
9405 :
9406 : int
9407 44 : spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9408 : struct spdk_bdev_claim_opts *_opts,
9409 : struct spdk_bdev_module *module)
9410 : {
9411 : struct spdk_bdev *bdev;
9412 : struct spdk_bdev_claim_opts opts;
9413 44 : int rc = 0;
9414 :
9415 44 : if (desc == NULL) {
9416 0 : SPDK_ERRLOG("descriptor must not be NULL\n");
9417 0 : return -EINVAL;
9418 : }
9419 :
9420 44 : bdev = desc->bdev;
9421 :
9422 44 : if (_opts == NULL) {
9423 22 : spdk_bdev_claim_opts_init(&opts, sizeof(opts));
9424 44 : } else if (claim_opts_copy(_opts, &opts) != 0) {
9425 0 : return -EINVAL;
9426 : }
9427 :
9428 44 : spdk_spin_lock(&bdev->internal.spinlock);
9429 :
9430 44 : if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
9431 17 : bdev->internal.claim_type != type) {
9432 11 : LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9433 11 : spdk_spin_unlock(&bdev->internal.spinlock);
9434 11 : return -EPERM;
9435 : }
9436 :
9437 33 : if (claim_type_is_v2(type) && desc->claim != NULL) {
9438 0 : SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
9439 : bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
9440 0 : spdk_spin_unlock(&bdev->internal.spinlock);
9441 0 : return -EPERM;
9442 : }
9443 :
9444 33 : switch (type) {
9445 : case SPDK_BDEV_CLAIM_EXCL_WRITE:
9446 0 : spdk_spin_unlock(&bdev->internal.spinlock);
9447 0 : return spdk_bdev_module_claim_bdev(bdev, desc, module);
9448 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9449 10 : rc = claim_verify_rwo(desc, type, &opts, module);
9450 10 : break;
9451 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9452 15 : rc = claim_verify_rom(desc, type, &opts, module);
9453 15 : break;
9454 : case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9455 8 : rc = claim_verify_rwm(desc, type, &opts, module);
9456 8 : break;
9457 : default:
9458 0 : SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
9459 0 : rc = -ENOTSUP;
9460 0 : }
9461 :
9462 33 : if (rc == 0) {
9463 20 : rc = claim_bdev(desc, type, &opts, module);
9464 20 : }
9465 :
9466 33 : spdk_spin_unlock(&bdev->internal.spinlock);
9467 33 : return rc;
9468 44 : }
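/*
 * Usage sketch: a virtual bdev module taking a read-many-write-one claim
 * through a descriptor it already opened. On success the descriptor is
 * promoted to writable and other writers are rejected until the descriptor
 * is closed. my_bdev_module and the claim name are hypothetical.
 *
 *     struct spdk_bdev_claim_opts opts;
 *     int rc;
 *
 *     spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *     snprintf(opts.name, sizeof(opts.name), "my_virtual_bdev");
 *     rc = spdk_bdev_module_claim_bdev_desc(desc,
 *                                           SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                           &opts, &my_bdev_module);
 */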
9469 :
9470 : static void
9471 16 : claim_reset(struct spdk_bdev *bdev)
9472 : {
9473 16 : assert(spdk_spin_held(&bdev->internal.spinlock));
9474 16 : assert(claim_type_is_v2(bdev->internal.claim_type));
9475 16 : assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));
9476 :
9477 16 : memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
9478 16 : bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
9479 16 : }
9480 :
9481 : static void
9482 20 : bdev_desc_release_claims(struct spdk_bdev_desc *desc)
9483 : {
9484 20 : struct spdk_bdev *bdev = desc->bdev;
9485 :
9486 20 : assert(spdk_spin_held(&bdev->internal.spinlock));
9487 20 : assert(claim_type_is_v2(bdev->internal.claim_type));
9488 :
9489 20 : if (bdev->internal.examine_in_progress == 0) {
9490 20 : TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
9491 20 : free(desc->claim);
9492 20 : if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
9493 16 : claim_reset(bdev);
9494 16 : }
9495 20 : } else {
9496 : /* This is a dead claim that will be cleaned up when bdev_examine() is done. */
9497 0 : desc->claim->module = NULL;
9498 0 : desc->claim->desc = NULL;
9499 : }
9500 20 : desc->claim = NULL;
9501 20 : }
9502 :
9503 : /*
9504 : * End claims v2
9505 : */
9506 :
9507 : struct spdk_bdev *
9508 1590 : spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
9509 : {
9510 1590 : assert(desc != NULL);
9511 1590 : return desc->bdev;
9512 : }
9513 :
9514 : int
9515 1 : spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
9516 : {
9517 : struct spdk_bdev *bdev, *tmp;
9518 : struct spdk_bdev_desc *desc;
9519 1 : int rc = 0;
9520 :
9521 1 : assert(fn != NULL);
9522 :
9523 1 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9524 1 : bdev = spdk_bdev_first();
9525 9 : while (bdev != NULL) {
9526 8 : rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9527 8 : if (rc != 0) {
9528 0 : break;
9529 : }
9530 8 : rc = bdev_open(bdev, false, desc);
9531 8 : if (rc != 0) {
9532 1 : bdev_desc_free(desc);
9533 1 : if (rc == -ENODEV) {
9534 : /* Ignore the error and move to the next bdev. */
9535 1 : rc = 0;
9536 1 : bdev = spdk_bdev_next(bdev);
9537 1 : continue;
9538 : }
9539 0 : break;
9540 : }
9541 7 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9542 :
9543 7 : rc = fn(ctx, bdev);
9544 :
9545 7 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9546 7 : tmp = spdk_bdev_next(bdev);
9547 7 : bdev_close(bdev, desc);
9548 7 : if (rc != 0) {
9549 0 : break;
9550 : }
9551 7 : bdev = tmp;
9552 : }
9553 1 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9554 :
9555 1 : return rc;
9556 : }
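/*
 * Usage sketch: spdk_for_each_bdev() opens each bdev around the callback so
 * it cannot be unregistered while the callback runs; returning non-zero from
 * the callback stops the iteration. my_count_bdevs is hypothetical.
 *
 *     static int
 *     my_count_bdevs(void *ctx, struct spdk_bdev *bdev)
 *     {
 *             (*(int *)ctx)++;
 *             SPDK_NOTICELOG("found %s\n", spdk_bdev_get_name(bdev));
 *             return 0;
 *     }
 *
 *     int count = 0;
 *
 *     spdk_for_each_bdev(&count, my_count_bdevs);
 */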
9557 :
9558 : int
9559 1 : spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
9560 : {
9561 : struct spdk_bdev *bdev, *tmp;
9562 : struct spdk_bdev_desc *desc;
9563 1 : int rc = 0;
9564 :
9565 1 : assert(fn != NULL);
9566 :
9567 1 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9568 1 : bdev = spdk_bdev_first_leaf();
9569 6 : while (bdev != NULL) {
9570 5 : rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9571 5 : if (rc != 0) {
9572 0 : break;
9573 : }
9574 5 : rc = bdev_open(bdev, false, desc);
9575 5 : if (rc != 0) {
9576 1 : bdev_desc_free(desc);
9577 1 : if (rc == -ENODEV) {
9578 : /* Ignore the error and move to the next bdev. */
9579 1 : rc = 0;
9580 1 : bdev = spdk_bdev_next_leaf(bdev);
9581 1 : continue;
9582 : }
9583 0 : break;
9584 : }
9585 4 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9586 :
9587 4 : rc = fn(ctx, bdev);
9588 :
9589 4 : spdk_spin_lock(&g_bdev_mgr.spinlock);
9590 4 : tmp = spdk_bdev_next_leaf(bdev);
9591 4 : bdev_close(bdev, desc);
9592 4 : if (rc != 0) {
9593 0 : break;
9594 : }
9595 4 : bdev = tmp;
9596 : }
9597 1 : spdk_spin_unlock(&g_bdev_mgr.spinlock);
9598 :
9599 1 : return rc;
9600 : }
9601 :
9602 : void
9603 0 : spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
9604 : {
9605 : struct iovec *iovs;
9606 : int iovcnt;
9607 :
9608 0 : if (bdev_io == NULL) {
9609 0 : return;
9610 : }
9611 :
9612 0 : switch (bdev_io->type) {
9613 : case SPDK_BDEV_IO_TYPE_READ:
9614 : case SPDK_BDEV_IO_TYPE_WRITE:
9615 : case SPDK_BDEV_IO_TYPE_ZCOPY:
9616 0 : iovs = bdev_io->u.bdev.iovs;
9617 0 : iovcnt = bdev_io->u.bdev.iovcnt;
9618 0 : break;
9619 : default:
9620 0 : iovs = NULL;
9621 0 : iovcnt = 0;
9622 0 : break;
9623 : }
9624 :
9625 0 : if (iovp) {
9626 0 : *iovp = iovs;
9627 0 : }
9628 0 : if (iovcntp) {
9629 0 : *iovcntp = iovcnt;
9630 0 : }
9631 0 : }
9632 :
9633 : void *
9634 0 : spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
9635 : {
9636 0 : if (bdev_io == NULL) {
9637 0 : return NULL;
9638 : }
9639 :
9640 0 : if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
9641 0 : return NULL;
9642 : }
9643 :
9644 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
9645 0 : bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
9646 0 : return bdev_io->u.bdev.md_buf;
9647 : }
9648 :
9649 0 : return NULL;
9650 0 : }
9651 :
9652 : void *
9653 0 : spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
9654 : {
9655 0 : if (bdev_io == NULL) {
9656 0 : assert(false);
9657 : return NULL;
9658 : }
9659 :
9660 0 : return bdev_io->internal.caller_ctx;
9661 : }
9662 :
9663 : void
9664 7 : spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
9665 : {
9666 :
9667 7 : if (spdk_bdev_module_list_find(bdev_module->name)) {
9668 0 : SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
9669 0 : assert(false);
9670 : }
9671 :
9672 7 : spdk_spin_init(&bdev_module->internal.spinlock);
9673 7 : TAILQ_INIT(&bdev_module->internal.quiesced_ranges);
9674 :
9675 : /*
9676 : * Modules with examine callbacks must be initialized first, so they are
9677 : * ready to handle examine callbacks from later modules that will
9678 : * register physical bdevs.
9679 : */
9680 7 : if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
9681 4 : TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9682 4 : } else {
9683 3 : TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9684 : }
9685 7 : }
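/*
 * Usage sketch: backend modules normally reach spdk_bdev_module_list_add()
 * through the SPDK_BDEV_MODULE_REGISTER() macro in spdk/bdev_module.h rather
 * than calling it directly. The minimal module below is hypothetical.
 *
 *     static int my_module_init(void) { return 0; }
 *
 *     static struct spdk_bdev_module my_bdev_module = {
 *             .name        = "my_module",
 *             .module_init = my_module_init,
 *     };
 *
 *     SPDK_BDEV_MODULE_REGISTER(my_module, &my_bdev_module)
 */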
9686 :
9687 : struct spdk_bdev_module *
9688 7 : spdk_bdev_module_list_find(const char *name)
9689 : {
9690 : struct spdk_bdev_module *bdev_module;
9691 :
9692 14 : TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
9693 7 : if (strcmp(name, bdev_module->name) == 0) {
9694 0 : break;
9695 : }
9696 7 : }
9697 :
9698 7 : return bdev_module;
9699 : }
9700 :
9701 : static int
9702 6 : bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io)
9703 : {
9704 : uint64_t num_blocks;
9705 6 : void *md_buf = NULL;
9706 :
9707 6 : num_blocks = bdev_io->u.bdev.num_blocks;
9708 :
9709 6 : if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
9710 4 : md_buf = (char *)g_bdev_mgr.zero_buffer +
9711 2 : spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
9712 2 : }
9713 :
9714 12 : return bdev_write_blocks_with_md(bdev_io->internal.desc,
9715 6 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
9716 6 : g_bdev_mgr.zero_buffer, md_buf,
9717 6 : bdev_io->u.bdev.offset_blocks, num_blocks,
9718 6 : bdev_write_zero_buffer_done, bdev_io);
9719 : }
9720 :
9721 : static void
9722 6 : bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9723 : {
9724 6 : struct spdk_bdev_io *parent_io = cb_arg;
9725 :
9726 6 : spdk_bdev_free_io(bdev_io);
9727 :
9728 6 : parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
9729 6 : parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
9730 6 : }
9731 :
9732 : static void
9733 10 : bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
9734 : {
9735 10 : spdk_spin_lock(&ctx->bdev->internal.spinlock);
9736 10 : ctx->bdev->internal.qos_mod_in_progress = false;
9737 10 : spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9738 :
9739 10 : if (ctx->cb_fn) {
9740 8 : ctx->cb_fn(ctx->cb_arg, status);
9741 8 : }
9742 10 : free(ctx);
9743 10 : }
9744 :
9745 : static void
9746 2 : bdev_disable_qos_done(void *cb_arg)
9747 : {
9748 2 : struct set_qos_limit_ctx *ctx = cb_arg;
9749 2 : struct spdk_bdev *bdev = ctx->bdev;
9750 : struct spdk_bdev_qos *qos;
9751 :
9752 2 : spdk_spin_lock(&bdev->internal.spinlock);
9753 2 : qos = bdev->internal.qos;
9754 2 : bdev->internal.qos = NULL;
9755 2 : spdk_spin_unlock(&bdev->internal.spinlock);
9756 :
9757 2 : if (qos->thread != NULL) {
9758 2 : spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
9759 2 : spdk_poller_unregister(&qos->poller);
9760 2 : }
9761 :
9762 2 : free(qos);
9763 :
9764 2 : bdev_set_qos_limit_done(ctx, 0);
9765 2 : }
9766 :
9767 : static void
9768 2 : bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
9769 : {
9770 2 : struct set_qos_limit_ctx *ctx = _ctx;
9771 : struct spdk_thread *thread;
9772 :
9773 2 : spdk_spin_lock(&bdev->internal.spinlock);
9774 2 : thread = bdev->internal.qos->thread;
9775 2 : spdk_spin_unlock(&bdev->internal.spinlock);
9776 :
9777 2 : if (thread != NULL) {
9778 2 : spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
9779 2 : } else {
9780 0 : bdev_disable_qos_done(ctx);
9781 : }
9782 2 : }
9783 :
9784 : static void
9785 4 : bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9786 : struct spdk_io_channel *ch, void *_ctx)
9787 : {
9788 4 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9789 : struct spdk_bdev_io *bdev_io;
9790 :
9791 4 : bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
9792 :
9793 6 : while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) {
9794 : /* Re-submit the queued I/O. */
9795 2 : bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io);
9796 2 : TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link);
9797 2 : _bdev_io_submit(bdev_io);
9798 : }
9799 :
9800 4 : spdk_bdev_for_each_channel_continue(i, 0);
9801 4 : }
9802 :
9803 : static void
9804 1 : bdev_update_qos_rate_limit_msg(void *cb_arg)
9805 : {
9806 1 : struct set_qos_limit_ctx *ctx = cb_arg;
9807 1 : struct spdk_bdev *bdev = ctx->bdev;
9808 :
9809 1 : spdk_spin_lock(&bdev->internal.spinlock);
9810 1 : bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
9811 1 : spdk_spin_unlock(&bdev->internal.spinlock);
9812 :
9813 1 : bdev_set_qos_limit_done(ctx, 0);
9814 1 : }
9815 :
9816 : static void
9817 9 : bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9818 : struct spdk_io_channel *ch, void *_ctx)
9819 : {
9820 9 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9821 :
9822 9 : spdk_spin_lock(&bdev->internal.spinlock);
9823 9 : bdev_enable_qos(bdev, bdev_ch);
9824 9 : spdk_spin_unlock(&bdev->internal.spinlock);
9825 9 : spdk_bdev_for_each_channel_continue(i, 0);
9826 9 : }
9827 :
9828 : static void
9829 6 : bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
9830 : {
9831 6 : struct set_qos_limit_ctx *ctx = _ctx;
9832 :
9833 6 : bdev_set_qos_limit_done(ctx, status);
9834 6 : }
9835 :
9836 : static void
9837 7 : bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
9838 : {
9839 : int i;
9840 :
9841 7 : assert(bdev->internal.qos != NULL);
9842 :
9843 35 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9844 28 : if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9845 28 : bdev->internal.qos->rate_limits[i].limit = limits[i];
9846 :
9847 28 : if (limits[i] == 0) {
9848 19 : bdev->internal.qos->rate_limits[i].limit =
9849 : SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
9850 19 : }
9851 28 : }
9852 28 : }
9853 7 : }
9854 :
9855 : void
9856 9 : spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
9857 : void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
9858 : {
9859 : struct set_qos_limit_ctx *ctx;
9860 : uint32_t limit_set_complement;
9861 : uint64_t min_limit_per_sec;
9862 : int i;
9863 9 : bool disable_rate_limit = true;
9864 :
9865 45 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9866 36 : if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9867 0 : continue;
9868 : }
9869 :
9870 36 : if (limits[i] > 0) {
9871 10 : disable_rate_limit = false;
9872 10 : }
9873 :
9874 36 : if (bdev_qos_is_iops_rate_limit(i) == true) {
9875 9 : min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
9876 9 : } else {
9877 27 : if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) {
9878 0 : SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, "
9879 : "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC);
9880 0 : limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC;
9881 0 : }
9882 : /* Change from megabyte to byte rate limit */
9883 27 : limits[i] = limits[i] * 1024 * 1024;
9884 27 : min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
9885 : }
9886 :
9887 36 : limit_set_complement = limits[i] % min_limit_per_sec;
9888 36 : if (limit_set_complement) {
9889 0 : SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
9890 : limits[i], min_limit_per_sec);
9891 0 : limits[i] += min_limit_per_sec - limit_set_complement;
9892 0 :			SPDK_ERRLOG("Rounding up the rate limit to %" PRIu64 "\n", limits[i]);
9893 0 : }
9894 36 : }
9895 :
9896 9 : ctx = calloc(1, sizeof(*ctx));
9897 9 : if (ctx == NULL) {
9898 0 : cb_fn(cb_arg, -ENOMEM);
9899 0 : return;
9900 : }
9901 :
9902 9 : ctx->cb_fn = cb_fn;
9903 9 : ctx->cb_arg = cb_arg;
9904 9 : ctx->bdev = bdev;
9905 :
9906 9 : spdk_spin_lock(&bdev->internal.spinlock);
9907 9 : if (bdev->internal.qos_mod_in_progress) {
9908 1 : spdk_spin_unlock(&bdev->internal.spinlock);
9909 1 : free(ctx);
9910 1 : cb_fn(cb_arg, -EAGAIN);
9911 1 : return;
9912 : }
9913 8 : bdev->internal.qos_mod_in_progress = true;
9914 :
9915 8 : if (disable_rate_limit == true && bdev->internal.qos) {
9916 10 : for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9917 8 : if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
9918 0 : (bdev->internal.qos->rate_limits[i].limit > 0 &&
9919 0 : bdev->internal.qos->rate_limits[i].limit !=
9920 : SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
9921 0 : disable_rate_limit = false;
9922 0 : break;
9923 : }
9924 8 : }
9925 2 : }
9926 :
9927 8 : if (disable_rate_limit == false) {
9928 5 : if (bdev->internal.qos == NULL) {
9929 4 : bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
9930 4 : if (!bdev->internal.qos) {
9931 0 : spdk_spin_unlock(&bdev->internal.spinlock);
9932 0 : SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
9933 0 : bdev_set_qos_limit_done(ctx, -ENOMEM);
9934 0 : return;
9935 : }
9936 4 : }
9937 :
9938 5 : if (bdev->internal.qos->thread == NULL) {
9939 : /* Enabling */
9940 4 : bdev_set_qos_rate_limits(bdev, limits);
9941 :
9942 4 : spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
9943 : bdev_enable_qos_done);
9944 4 : } else {
9945 : /* Updating */
9946 1 : bdev_set_qos_rate_limits(bdev, limits);
9947 :
9948 2 : spdk_thread_send_msg(bdev->internal.qos->thread,
9949 1 : bdev_update_qos_rate_limit_msg, ctx);
9950 : }
9951 5 : } else {
9952 3 : if (bdev->internal.qos != NULL) {
9953 2 : bdev_set_qos_rate_limits(bdev, limits);
9954 :
9955 : /* Disabling */
9956 2 : spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
9957 : bdev_disable_qos_msg_done);
9958 2 : } else {
9959 1 : spdk_spin_unlock(&bdev->internal.spinlock);
9960 1 : bdev_set_qos_limit_done(ctx, 0);
9961 1 : return;
9962 : }
9963 : }
9964 :
9965 7 : spdk_spin_unlock(&bdev->internal.spinlock);
9966 9 : }
9967 :
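 : /*
 :  * Illustrative caller sketch (not part of this file). Assuming the rate-limit
 :  * type enum from spdk/bdev.h, all four limits are supplied at once; a value of
 :  * 0 disables a limit and the byte limits are given in MB/s:
 :  *
 :  *	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {
 :  *		[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000,	(10k IO/s, multiple of 1000)
 :  *		[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT]  = 100,	(100 MB/s)
 :  *		[SPDK_BDEV_QOS_R_BPS_RATE_LIMIT]   = 0,		(disable read-only limit)
 :  *		[SPDK_BDEV_QOS_W_BPS_RATE_LIMIT]   = 0,		(disable write-only limit)
 :  *	};
 :  *	spdk_bdev_set_qos_rate_limits(bdev, limits, rpc_done_cb, request);
 :  *
 :  * rpc_done_cb and request are placeholder names; the callback receives 0 on
 :  * success, -EAGAIN if another QoS change is already in progress, or -ENOMEM on
 :  * allocation failure.
 :  */
 :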
9968 : struct spdk_bdev_histogram_ctx {
9969 : spdk_bdev_histogram_status_cb cb_fn;
9970 : void *cb_arg;
9971 : struct spdk_bdev *bdev;
9972 : int status;
9973 : };
9974 :
9975 : static void
9976 2 : bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9977 : {
9978 2 : struct spdk_bdev_histogram_ctx *ctx = _ctx;
9979 :
9980 2 : spdk_spin_lock(&ctx->bdev->internal.spinlock);
9981 2 : ctx->bdev->internal.histogram_in_progress = false;
9982 2 : spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9983 2 : ctx->cb_fn(ctx->cb_arg, ctx->status);
9984 2 : free(ctx);
9985 2 : }
9986 :
9987 : static void
9988 3 : bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9989 : struct spdk_io_channel *_ch, void *_ctx)
9990 : {
9991 3 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9992 :
9993 3 : if (ch->histogram != NULL) {
9994 3 : spdk_histogram_data_free(ch->histogram);
9995 3 : ch->histogram = NULL;
9996 3 : }
9997 3 : spdk_bdev_for_each_channel_continue(i, 0);
9998 3 : }
9999 :
10000 : static void
10001 2 : bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10002 : {
10003 2 : struct spdk_bdev_histogram_ctx *ctx = _ctx;
10004 :
10005 2 : if (status != 0) {
10006 0 : ctx->status = status;
10007 0 : ctx->bdev->internal.histogram_enabled = false;
10008 0 : spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
10009 : bdev_histogram_disable_channel_cb);
10010 0 : } else {
10011 2 : spdk_spin_lock(&ctx->bdev->internal.spinlock);
10012 2 : ctx->bdev->internal.histogram_in_progress = false;
10013 2 : spdk_spin_unlock(&ctx->bdev->internal.spinlock);
10014 2 : ctx->cb_fn(ctx->cb_arg, ctx->status);
10015 2 : free(ctx);
10016 : }
10017 2 : }
10018 :
10019 : static void
10020 3 : bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10021 : struct spdk_io_channel *_ch, void *_ctx)
10022 : {
10023 3 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10024 3 : int status = 0;
10025 :
10026 3 : if (ch->histogram == NULL) {
10027 3 : ch->histogram = spdk_histogram_data_alloc();
10028 3 : if (ch->histogram == NULL) {
10029 0 : status = -ENOMEM;
10030 0 : }
10031 3 : }
10032 :
10033 3 : spdk_bdev_for_each_channel_continue(i, status);
10034 3 : }
10035 :
10036 : void
10037 4 : spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
10038 : void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts)
10039 : {
10040 : struct spdk_bdev_histogram_ctx *ctx;
10041 :
10042 4 : ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
10043 4 : if (ctx == NULL) {
10044 0 : cb_fn(cb_arg, -ENOMEM);
10045 0 : return;
10046 : }
10047 :
10048 4 : ctx->bdev = bdev;
10049 4 : ctx->status = 0;
10050 4 : ctx->cb_fn = cb_fn;
10051 4 : ctx->cb_arg = cb_arg;
10052 :
10053 4 : spdk_spin_lock(&bdev->internal.spinlock);
10054 4 : if (bdev->internal.histogram_in_progress) {
10055 0 : spdk_spin_unlock(&bdev->internal.spinlock);
10056 0 : free(ctx);
10057 0 : cb_fn(cb_arg, -EAGAIN);
10058 0 : return;
10059 : }
10060 :
10061 4 : bdev->internal.histogram_in_progress = true;
10062 4 : spdk_spin_unlock(&bdev->internal.spinlock);
10063 :
10064 4 : bdev->internal.histogram_enabled = enable;
10065 4 : bdev->internal.histogram_io_type = opts->io_type;
10066 :
10067 4 : if (enable) {
10068 : /* Allocate histogram for each channel */
10069 2 : spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
10070 : bdev_histogram_enable_channel_cb);
10071 2 : } else {
10072 2 : spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
10073 : bdev_histogram_disable_channel_cb);
10074 : }
10075 4 : }
10076 :
10077 : void
10078 4 : spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size)
10079 : {
10080 4 : if (opts == NULL) {
10081 0 : SPDK_ERRLOG("opts should not be NULL\n");
10082 0 : assert(opts != NULL);
10083 0 : return;
10084 : }
10085 4 : if (size == 0) {
10086 0 : SPDK_ERRLOG("size should not be zero\n");
10087 0 : assert(size != 0);
10088 0 : return;
10089 : }
10090 :
10091 4 : memset(opts, 0, size);
10092 4 : opts->size = size;
10093 :
10094 : #define FIELD_OK(field) \
10095 : offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size
10096 :
10097 : #define SET_FIELD(field, value) \
10098 : if (FIELD_OK(field)) { \
10099 : opts->field = value; \
10100 : } \
10101 :
10102 4 : SET_FIELD(io_type, 0);
10103 :
10104 :	/* Do not remove this assert. If you add a new field, update the size it
10105 :	 * checks and add a corresponding SET_FIELD statement above. */
10106 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size");
10107 :
10108 : #undef FIELD_OK
10109 : #undef SET_FIELD
10110 4 : }
10111 :
10112 : void
10113 4 : spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
10114 : void *cb_arg, bool enable)
10115 : {
10116 : struct spdk_bdev_enable_histogram_opts opts;
10117 :
10118 4 : spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts));
10119 4 : spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts);
10120 4 : }
10121 :
10122 : struct spdk_bdev_histogram_data_ctx {
10123 : spdk_bdev_histogram_data_cb cb_fn;
10124 : void *cb_arg;
10125 : struct spdk_bdev *bdev;
10126 : /** merged histogram data from all channels */
10127 : struct spdk_histogram_data *histogram;
10128 : };
10129 :
10130 : static void
10131 5 : bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10132 : {
10133 5 : struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
10134 :
10135 5 : ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
10136 5 : free(ctx);
10137 5 : }
10138 :
10139 : static void
10140 7 : bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10141 : struct spdk_io_channel *_ch, void *_ctx)
10142 : {
10143 7 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10144 7 : struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
10145 7 : int status = 0;
10146 :
10147 7 : if (ch->histogram == NULL) {
10148 1 : status = -EFAULT;
10149 1 : } else {
10150 6 : spdk_histogram_data_merge(ctx->histogram, ch->histogram);
10151 : }
10152 :
10153 7 : spdk_bdev_for_each_channel_continue(i, status);
10154 7 : }
10155 :
10156 : void
10157 5 : spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
10158 : spdk_bdev_histogram_data_cb cb_fn,
10159 : void *cb_arg)
10160 : {
10161 : struct spdk_bdev_histogram_data_ctx *ctx;
10162 :
10163 5 : ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
10164 5 : if (ctx == NULL) {
10165 0 : cb_fn(cb_arg, -ENOMEM, NULL);
10166 0 : return;
10167 : }
10168 :
10169 5 : ctx->bdev = bdev;
10170 5 : ctx->cb_fn = cb_fn;
10171 5 : ctx->cb_arg = cb_arg;
10172 :
10173 5 : ctx->histogram = histogram;
10174 :
10175 5 : spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
10176 : bdev_histogram_get_channel_cb);
10177 5 : }
10178 :
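 : /*
 :  * Illustrative flow (not part of this file): enable the histogram, let I/O run,
 :  * then merge the per-channel data into a caller-allocated histogram:
 :  *
 :  *	spdk_bdev_histogram_enable(bdev, enable_done_cb, NULL, true);
 :  *	...
 :  *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
 :  *	spdk_bdev_histogram_get(bdev, h, histogram_done_cb, NULL);
 :  *
 :  * enable_done_cb and histogram_done_cb are placeholder names; the data callback
 :  * receives -EFAULT if any channel has no histogram allocated.
 :  */
 :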
10179 : void
10180 2 : spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
10181 : void *cb_arg)
10182 : {
10183 2 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
10184 2 : int status = 0;
10185 :
10186 2 : assert(cb_fn != NULL);
10187 :
10188 2 : if (bdev_ch->histogram == NULL) {
10189 1 : status = -EFAULT;
10190 1 : }
10191 2 : cb_fn(cb_arg, status, bdev_ch->histogram);
10192 2 : }
10193 :
10194 : size_t
10195 0 : spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
10196 : size_t max_events)
10197 : {
10198 : struct media_event_entry *entry;
10199 0 : size_t num_events = 0;
10200 :
10201 0 : for (; num_events < max_events; ++num_events) {
10202 0 : entry = TAILQ_FIRST(&desc->pending_media_events);
10203 0 : if (entry == NULL) {
10204 0 : break;
10205 : }
10206 :
10207 0 : events[num_events] = entry->event;
10208 0 : TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
10209 0 : TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
10210 0 : }
10211 :
10212 0 : return num_events;
10213 : }
10214 :
10215 : int
10216 0 : spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
10217 : size_t num_events)
10218 : {
10219 : struct spdk_bdev_desc *desc;
10220 : struct media_event_entry *entry;
10221 : size_t event_id;
10222 0 : int rc = 0;
10223 :
10224 0 : assert(bdev->media_events);
10225 :
10226 0 : spdk_spin_lock(&bdev->internal.spinlock);
10227 0 : TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
10228 0 : if (desc->write) {
10229 0 : break;
10230 : }
10231 0 : }
10232 :
10233 0 : if (desc == NULL || desc->media_events_buffer == NULL) {
10234 0 : rc = -ENODEV;
10235 0 : goto out;
10236 : }
10237 :
10238 0 : for (event_id = 0; event_id < num_events; ++event_id) {
10239 0 : entry = TAILQ_FIRST(&desc->free_media_events);
10240 0 : if (entry == NULL) {
10241 0 : break;
10242 : }
10243 :
10244 0 : TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
10245 0 : TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
10246 0 : entry->event = events[event_id];
10247 0 : }
10248 :
10249 0 : rc = event_id;
10250 : out:
10251 0 : spdk_spin_unlock(&bdev->internal.spinlock);
10252 0 : return rc;
10253 : }
10254 :
10255 : static void
10256 0 : _media_management_notify(void *arg)
10257 : {
10258 0 : struct spdk_bdev_desc *desc = arg;
10259 :
10260 0 : _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
10261 0 : }
10262 :
10263 : void
10264 0 : spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
10265 : {
10266 : struct spdk_bdev_desc *desc;
10267 :
10268 0 : spdk_spin_lock(&bdev->internal.spinlock);
10269 0 : TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
10270 0 : if (!TAILQ_EMPTY(&desc->pending_media_events)) {
10271 0 : event_notify(desc, _media_management_notify);
10272 0 : }
10273 0 : }
10274 0 : spdk_spin_unlock(&bdev->internal.spinlock);
10275 0 : }
10276 :
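 : /*
 :  * Illustrative flow (not part of this file): a bdev module buffers events with
 :  * spdk_bdev_push_media_events() and then calls spdk_bdev_notify_media_management();
 :  * an application that receives SPDK_BDEV_EVENT_MEDIA_MANAGEMENT on its descriptor
 :  * drains them like this:
 :  *
 :  *	struct spdk_bdev_media_event events[8];
 :  *	size_t n = spdk_bdev_get_media_events(desc, events, SPDK_COUNTOF(events));
 :  *
 :  * Only events queued for that particular descriptor are returned; n may be 0.
 :  */
 :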
10277 : struct locked_lba_range_ctx {
10278 : struct lba_range range;
10279 : struct lba_range *current_range;
10280 : struct lba_range *owner_range;
10281 : struct spdk_poller *poller;
10282 : lock_range_cb cb_fn;
10283 : void *cb_arg;
10284 : };
10285 :
10286 : static void
10287 0 : bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10288 : {
10289 0 : struct locked_lba_range_ctx *ctx = _ctx;
10290 :
10291 0 : ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
10292 0 : free(ctx);
10293 0 : }
10294 :
10295 : static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
10296 : struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
10297 :
10298 : static void
10299 14 : bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10300 : {
10301 14 : struct locked_lba_range_ctx *ctx = _ctx;
10302 :
10303 14 : if (status == -ENOMEM) {
10304 : /* One of the channels could not allocate a range object.
10305 : * So we have to go back and clean up any ranges that were
10306 : * allocated successfully before we return error status to
10307 : * the caller. We can reuse the unlock function to do that
10308 : * clean up.
10309 : */
10310 0 : spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
10311 : bdev_lock_error_cleanup_cb);
10312 0 : return;
10313 : }
10314 :
10315 :	/* All channels have locked this range and no I/O overlapping the range
10316 :	 * is outstanding. Set the owner_ch for the range object for the
10317 : * locking channel, so that this channel will know that it is allowed
10318 : * to write to this range.
10319 : */
10320 14 : if (ctx->owner_range != NULL) {
10321 10 : ctx->owner_range->owner_ch = ctx->range.owner_ch;
10322 10 : }
10323 :
10324 14 : ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
10325 :
10326 : /* Don't free the ctx here. Its range is in the bdev's global list of
10327 : * locked ranges still, and will be removed and freed when this range
10328 : * is later unlocked.
10329 : */
10330 14 : }
10331 :
10332 : static int
10333 17 : bdev_lock_lba_range_check_io(void *_i)
10334 : {
10335 17 : struct spdk_bdev_channel_iter *i = _i;
10336 17 : struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
10337 17 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10338 17 : struct locked_lba_range_ctx *ctx = i->ctx;
10339 17 : struct lba_range *range = ctx->current_range;
10340 : struct spdk_bdev_io *bdev_io;
10341 :
10342 17 : spdk_poller_unregister(&ctx->poller);
10343 :
10344 :	/* The range is now in the locked_ranges, so no new I/O can be submitted to this
10345 :	 * range. But we need to wait until all outstanding I/O overlapping with this range
10346 :	 * has completed.
10347 :	 */
10348 18 : TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
10349 3 : if (bdev_io_range_is_locked(bdev_io, range)) {
10350 2 : ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
10351 2 : return SPDK_POLLER_BUSY;
10352 : }
10353 1 : }
10354 :
10355 15 : spdk_bdev_for_each_channel_continue(i, 0);
10356 15 : return SPDK_POLLER_BUSY;
10357 17 : }
10358 :
10359 : static void
10360 15 : bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10361 : struct spdk_io_channel *_ch, void *_ctx)
10362 : {
10363 15 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10364 15 : struct locked_lba_range_ctx *ctx = _ctx;
10365 : struct lba_range *range;
10366 :
10367 16 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10368 1 : if (range->length == ctx->range.length &&
10369 0 : range->offset == ctx->range.offset &&
10370 0 : range->locked_ctx == ctx->range.locked_ctx) {
10371 : /* This range already exists on this channel, so don't add
10372 : * it again. This can happen when a new channel is created
10373 : * while the for_each_channel operation is in progress.
10374 : * Do not check for outstanding I/O in that case, since the
10375 : * range was locked before any I/O could be submitted to the
10376 : * new channel.
10377 : */
10378 0 : spdk_bdev_for_each_channel_continue(i, 0);
10379 0 : return;
10380 : }
10381 1 : }
10382 :
10383 15 : range = calloc(1, sizeof(*range));
10384 15 : if (range == NULL) {
10385 0 : spdk_bdev_for_each_channel_continue(i, -ENOMEM);
10386 0 : return;
10387 : }
10388 :
10389 15 : range->length = ctx->range.length;
10390 15 : range->offset = ctx->range.offset;
10391 15 : range->locked_ctx = ctx->range.locked_ctx;
10392 15 : range->quiesce = ctx->range.quiesce;
10393 15 : ctx->current_range = range;
10394 15 : if (ctx->range.owner_ch == ch) {
10395 : /* This is the range object for the channel that will hold
10396 : * the lock. Store it in the ctx object so that we can easily
10397 : * set its owner_ch after the lock is finally acquired.
10398 : */
10399 10 : ctx->owner_range = range;
10400 10 : }
10401 15 : TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
10402 15 : bdev_lock_lba_range_check_io(i);
10403 15 : }
10404 :
10405 : static void
10406 14 : bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
10407 : {
10408 14 : assert(spdk_get_thread() == ctx->range.owner_thread);
10409 14 : assert(ctx->range.owner_ch == NULL ||
10410 : spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);
10411 :
10412 : /* We will add a copy of this range to each channel now. */
10413 14 : spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
10414 : bdev_lock_lba_range_cb);
10415 14 : }
10416 :
10417 : static bool
10418 17 : bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
10419 : {
10420 : struct lba_range *r;
10421 :
10422 18 : TAILQ_FOREACH(r, tailq, tailq) {
10423 4 : if (bdev_lba_range_overlapped(range, r)) {
10424 3 : return true;
10425 : }
10426 1 : }
10427 14 : return false;
10428 17 : }
10429 :
10430 : static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status);
10431 :
10432 : static int
10433 14 : _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
10434 : uint64_t offset, uint64_t length,
10435 : lock_range_cb cb_fn, void *cb_arg)
10436 : {
10437 : struct locked_lba_range_ctx *ctx;
10438 :
10439 14 : ctx = calloc(1, sizeof(*ctx));
10440 14 : if (ctx == NULL) {
10441 0 : return -ENOMEM;
10442 : }
10443 :
10444 14 : ctx->range.offset = offset;
10445 14 : ctx->range.length = length;
10446 14 : ctx->range.owner_thread = spdk_get_thread();
10447 14 : ctx->range.owner_ch = ch;
10448 14 : ctx->range.locked_ctx = cb_arg;
10449 14 : ctx->range.bdev = bdev;
10450 14 : ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked);
10451 14 : ctx->cb_fn = cb_fn;
10452 14 : ctx->cb_arg = cb_arg;
10453 :
10454 14 : spdk_spin_lock(&bdev->internal.spinlock);
10455 14 : if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
10456 : /* There is an active lock overlapping with this range.
10457 : * Put it on the pending list until this range no
10458 : * longer overlaps with another.
10459 : */
10460 2 : TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
10461 2 : } else {
10462 12 : TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
10463 12 : bdev_lock_lba_range_ctx(bdev, ctx);
10464 : }
10465 14 : spdk_spin_unlock(&bdev->internal.spinlock);
10466 14 : return 0;
10467 14 : }
10468 :
10469 : static int
10470 10 : bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10471 : uint64_t offset, uint64_t length,
10472 : lock_range_cb cb_fn, void *cb_arg)
10473 : {
10474 10 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10475 10 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10476 :
10477 10 : if (cb_arg == NULL) {
10478 0 : SPDK_ERRLOG("cb_arg must not be NULL\n");
10479 0 : return -EINVAL;
10480 : }
10481 :
10482 10 : return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
10483 10 : }
10484 :
10485 : static void
10486 2 : bdev_lock_lba_range_ctx_msg(void *_ctx)
10487 : {
10488 2 : struct locked_lba_range_ctx *ctx = _ctx;
10489 :
10490 2 : bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
10491 2 : }
10492 :
10493 : static void
10494 14 : bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10495 : {
10496 14 : struct locked_lba_range_ctx *ctx = _ctx;
10497 : struct locked_lba_range_ctx *pending_ctx;
10498 : struct lba_range *range, *tmp;
10499 :
10500 14 : spdk_spin_lock(&bdev->internal.spinlock);
10501 : /* Check if there are any pending locked ranges that overlap with this range
10502 :	 * that was just unlocked. If there are, check that each of them does not overlap
10503 :	 * with any other locked range before calling bdev_lock_lba_range_ctx, which will start
10504 : * the lock process.
10505 : */
10506 17 : TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
10507 3 : if (bdev_lba_range_overlapped(range, &ctx->range) &&
10508 3 : !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
10509 2 : TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
10510 2 : pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10511 2 : TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
10512 4 : spdk_thread_send_msg(pending_ctx->range.owner_thread,
10513 2 : bdev_lock_lba_range_ctx_msg, pending_ctx);
10514 2 : }
10515 3 : }
10516 14 : spdk_spin_unlock(&bdev->internal.spinlock);
10517 :
10518 14 : ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
10519 14 : free(ctx);
10520 14 : }
10521 :
10522 : static void
10523 16 : bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10524 : struct spdk_io_channel *_ch, void *_ctx)
10525 : {
10526 16 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10527 16 : struct locked_lba_range_ctx *ctx = _ctx;
10528 : TAILQ_HEAD(, spdk_bdev_io) io_locked;
10529 : struct spdk_bdev_io *bdev_io;
10530 : struct lba_range *range;
10531 :
10532 16 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10533 32 : if (ctx->range.offset == range->offset &&
10534 16 : ctx->range.length == range->length &&
10535 16 : ctx->range.locked_ctx == range->locked_ctx) {
10536 16 : TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
10537 16 : free(range);
10538 16 : break;
10539 : }
10540 0 : }
10541 :
10542 : /* Note: we should almost always be able to assert that the range specified
10543 : * was found. But there are some very rare corner cases where a new channel
10544 : * gets created simultaneously with a range unlock, where this function
10545 : * would execute on that new channel and wouldn't have the range.
10546 : * We also use this to clean up range allocations when a later allocation
10547 : * fails in the locking path.
10548 : * So we can't actually assert() here.
10549 : */
10550 :
10551 :	/* Swap the locked I/Os into a temporary list, and then try to submit them again.
10552 :	 * We could hyper-optimize this to only resubmit the locked I/Os that overlap
10553 : * with the range that was just unlocked, but this isn't a performance path so
10554 : * we go for simplicity here.
10555 : */
10556 16 : TAILQ_INIT(&io_locked);
10557 16 : TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
10558 19 : while (!TAILQ_EMPTY(&io_locked)) {
10559 3 : bdev_io = TAILQ_FIRST(&io_locked);
10560 3 : TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
10561 3 : bdev_io_submit(bdev_io);
10562 : }
10563 :
10564 16 : spdk_bdev_for_each_channel_continue(i, 0);
10565 16 : }
10566 :
10567 : static int
10568 14 : _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
10569 : lock_range_cb cb_fn, void *cb_arg)
10570 : {
10571 : struct locked_lba_range_ctx *ctx;
10572 : struct lba_range *range;
10573 :
10574 14 : spdk_spin_lock(&bdev->internal.spinlock);
10575 :	/* To start the unlock process, we find the range in the bdev's locked_ranges
10576 : * and remove it. This ensures new channels don't inherit the locked range.
10577 : * Then we will send a message to each channel to remove the range from its
10578 : * per-channel list.
10579 : */
10580 14 : TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
10581 24 : if (range->offset == offset && range->length == length &&
10582 14 : (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
10583 14 : break;
10584 : }
10585 0 : }
10586 14 : if (range == NULL) {
10587 0 : assert(false);
10588 : spdk_spin_unlock(&bdev->internal.spinlock);
10589 : return -EINVAL;
10590 : }
10591 14 : TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
10592 14 : ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10593 14 : spdk_spin_unlock(&bdev->internal.spinlock);
10594 :
10595 14 : ctx->cb_fn = cb_fn;
10596 14 : ctx->cb_arg = cb_arg;
10597 :
10598 14 : spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
10599 : bdev_unlock_lba_range_cb);
10600 14 : return 0;
10601 : }
10602 :
10603 : static int
10604 12 : bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10605 : uint64_t offset, uint64_t length,
10606 : lock_range_cb cb_fn, void *cb_arg)
10607 : {
10608 12 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10609 12 : struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10610 : struct lba_range *range;
10611 12 : bool range_found = false;
10612 :
10613 : /* Let's make sure the specified channel actually has a lock on
10614 : * the specified range. Note that the range must match exactly.
10615 : */
10616 14 : TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10617 22 : if (range->offset == offset && range->length == length &&
10618 11 : range->owner_ch == ch && range->locked_ctx == cb_arg) {
10619 10 : range_found = true;
10620 10 : break;
10621 : }
10622 2 : }
10623 :
10624 12 : if (!range_found) {
10625 2 : return -EINVAL;
10626 : }
10627 :
10628 10 : return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
10629 12 : }
10630 :
10631 : struct bdev_quiesce_ctx {
10632 : spdk_bdev_quiesce_cb cb_fn;
10633 : void *cb_arg;
10634 : };
10635 :
10636 : static void
10637 4 : bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
10638 : {
10639 4 : struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10640 :
10641 4 : if (quiesce_ctx->cb_fn != NULL) {
10642 4 : quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10643 4 : }
10644 :
10645 4 : free(quiesce_ctx);
10646 4 : }
10647 :
10648 : static void
10649 4 : bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
10650 : {
10651 4 : struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10652 4 : struct spdk_bdev_module *module = range->bdev->module;
10653 :
10654 4 : if (status != 0) {
10655 0 : if (quiesce_ctx->cb_fn != NULL) {
10656 0 : quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10657 0 : }
10658 0 : free(quiesce_ctx);
10659 0 : return;
10660 : }
10661 :
10662 4 : spdk_spin_lock(&module->internal.spinlock);
10663 4 : TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
10664 4 : spdk_spin_unlock(&module->internal.spinlock);
10665 :
10666 4 : if (quiesce_ctx->cb_fn != NULL) {
10667 : /* copy the context in case the range is unlocked by the callback */
10668 4 : struct bdev_quiesce_ctx tmp = *quiesce_ctx;
10669 :
10670 4 : quiesce_ctx->cb_fn = NULL;
10671 4 : quiesce_ctx->cb_arg = NULL;
10672 :
10673 4 : tmp.cb_fn(tmp.cb_arg, status);
10674 4 : }
10675 : /* quiesce_ctx will be freed on unquiesce */
10676 4 : }
10677 :
10678 : static int
10679 9 : _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10680 : uint64_t offset, uint64_t length,
10681 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
10682 : bool unquiesce)
10683 : {
10684 : struct bdev_quiesce_ctx *quiesce_ctx;
10685 : int rc;
10686 :
10687 9 : if (module != bdev->module) {
10688 0 : SPDK_ERRLOG("Bdev does not belong to specified module.\n");
10689 0 : return -EINVAL;
10690 : }
10691 :
10692 9 : if (!bdev_io_valid_blocks(bdev, offset, length)) {
10693 0 : return -EINVAL;
10694 : }
10695 :
10696 9 : if (unquiesce) {
10697 : struct lba_range *range;
10698 :
10699 : /* Make sure the specified range is actually quiesced in the specified module and
10700 : * then remove it from the list. Note that the range must match exactly.
10701 : */
10702 5 : spdk_spin_lock(&module->internal.spinlock);
10703 6 : TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
10704 5 : if (range->bdev == bdev && range->offset == offset && range->length == length) {
10705 4 : TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
10706 4 : break;
10707 : }
10708 1 : }
10709 5 : spdk_spin_unlock(&module->internal.spinlock);
10710 :
10711 5 : if (range == NULL) {
10712 1 : SPDK_ERRLOG("The range to unquiesce was not found.\n");
10713 1 : return -EINVAL;
10714 : }
10715 :
10716 4 : quiesce_ctx = range->locked_ctx;
10717 4 : quiesce_ctx->cb_fn = cb_fn;
10718 4 : quiesce_ctx->cb_arg = cb_arg;
10719 :
10720 4 : rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
10721 4 : } else {
10722 4 : quiesce_ctx = malloc(sizeof(*quiesce_ctx));
10723 4 : if (quiesce_ctx == NULL) {
10724 0 : return -ENOMEM;
10725 : }
10726 :
10727 4 : quiesce_ctx->cb_fn = cb_fn;
10728 4 : quiesce_ctx->cb_arg = cb_arg;
10729 :
10730 4 : rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
10731 4 : if (rc != 0) {
10732 0 : free(quiesce_ctx);
10733 0 : }
10734 : }
10735 :
10736 8 : return rc;
10737 9 : }
10738 :
10739 : int
10740 3 : spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10741 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10742 : {
10743 3 : return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
10744 : }
10745 :
10746 : int
10747 3 : spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10748 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10749 : {
10750 3 : return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
10751 : }
10752 :
10753 : int
10754 1 : spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10755 : uint64_t offset, uint64_t length,
10756 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10757 : {
10758 1 : return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
10759 : }
10760 :
10761 : int
10762 2 : spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10763 : uint64_t offset, uint64_t length,
10764 : spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10765 : {
10766 2 : return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
10767 : }
10768 :
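 : /*
 :  * Illustrative sketch (not part of this file): only the module that owns the
 :  * bdev may quiesce it, e.g. around an on-disk metadata update:
 :  *
 :  *	spdk_bdev_quiesce(bdev, &my_module, quiesce_done_cb, ctx);
 :  *	... perform the update once quiesce_done_cb() fires ...
 :  *	spdk_bdev_unquiesce(bdev, &my_module, unquiesce_done_cb, ctx);
 :  *
 :  * my_module, quiesce_done_cb, unquiesce_done_cb and ctx are placeholders; a
 :  * module pointer that does not match bdev->module results in -EINVAL.
 :  */
 :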
10769 : int
10770 285 : spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
10771 : int array_size)
10772 : {
10773 285 : if (!bdev) {
10774 1 : return -EINVAL;
10775 : }
10776 :
10777 284 : if (bdev->fn_table->get_memory_domains) {
10778 3 : return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
10779 : }
10780 :
10781 281 : return 0;
10782 285 : }
10783 :
10784 : struct spdk_bdev_for_each_io_ctx {
10785 : void *ctx;
10786 : spdk_bdev_io_fn fn;
10787 : spdk_bdev_for_each_io_cb cb;
10788 : };
10789 :
10790 : static void
10791 0 : bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10792 : struct spdk_io_channel *io_ch, void *_ctx)
10793 : {
10794 0 : struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10795 0 : struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
10796 : struct spdk_bdev_io *bdev_io;
10797 0 : int rc = 0;
10798 :
10799 0 : TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
10800 0 : rc = ctx->fn(ctx->ctx, bdev_io);
10801 0 : if (rc != 0) {
10802 0 : break;
10803 : }
10804 0 : }
10805 :
10806 0 : spdk_bdev_for_each_channel_continue(i, rc);
10807 0 : }
10808 :
10809 : static void
10810 0 : bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
10811 : {
10812 0 : struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10813 :
10814 0 : ctx->cb(ctx->ctx, status);
10815 :
10816 0 : free(ctx);
10817 0 : }
10818 :
10819 : void
10820 0 : spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
10821 : spdk_bdev_for_each_io_cb cb)
10822 : {
10823 : struct spdk_bdev_for_each_io_ctx *ctx;
10824 :
10825 0 : assert(fn != NULL && cb != NULL);
10826 :
10827 0 : ctx = calloc(1, sizeof(*ctx));
10828 0 : if (ctx == NULL) {
10829 0 : SPDK_ERRLOG("Failed to allocate context.\n");
10830 0 : cb(_ctx, -ENOMEM);
10831 0 : return;
10832 : }
10833 :
10834 0 : ctx->ctx = _ctx;
10835 0 : ctx->fn = fn;
10836 0 : ctx->cb = cb;
10837 :
10838 0 : spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
10839 : bdev_for_each_io_done);
10840 0 : }
10841 :
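 : /*
 :  * Illustrative sketch (not part of this file): fn is called for every submitted
 :  * I/O on every channel; a non-zero return value stops visiting further I/O on
 :  * that channel and its status is propagated to cb:
 :  *
 :  *	static int
 :  *	inspect_io(void *ctx, struct spdk_bdev_io *bdev_io)
 :  *	{
 :  *		... examine bdev_io, return 0 to keep iterating ...
 :  *	}
 :  *
 :  *	spdk_bdev_for_each_bdev_io(bdev, ctx, inspect_io, walk_done_cb);
 :  *
 :  * inspect_io and walk_done_cb are placeholder names.
 :  */
 :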
10842 : void
10843 137 : spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
10844 : {
10845 137 : spdk_for_each_channel_continue(iter->i, status);
10846 137 : }
10847 :
10848 : static struct spdk_bdev *
10849 376 : io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
10850 : {
10851 376 : void *io_device = spdk_io_channel_iter_get_io_device(i);
10852 :
10853 376 : return __bdev_from_io_dev(io_device);
10854 : }
10855 :
10856 : static void
10857 137 : bdev_each_channel_msg(struct spdk_io_channel_iter *i)
10858 : {
10859 137 : struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10860 137 : struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10861 137 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
10862 :
10863 137 : iter->i = i;
10864 137 : iter->fn(iter, bdev, ch, iter->ctx);
10865 137 : }
10866 :
10867 : static void
10868 239 : bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
10869 : {
10870 239 : struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10871 239 : struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10872 :
10873 239 : iter->i = i;
10874 239 : iter->cpl(bdev, iter->ctx, status);
10875 :
10876 239 : free(iter);
10877 239 : }
10878 :
10879 : void
10880 239 : spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
10881 : void *ctx, spdk_bdev_for_each_channel_done cpl)
10882 : {
10883 : struct spdk_bdev_channel_iter *iter;
10884 :
10885 239 : assert(bdev != NULL && fn != NULL && ctx != NULL);
10886 :
10887 239 : iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
10888 239 : if (iter == NULL) {
10889 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
10890 0 : assert(false);
10891 : return;
10892 : }
10893 :
10894 239 : iter->fn = fn;
10895 239 : iter->cpl = cpl;
10896 239 : iter->ctx = ctx;
10897 :
10898 478 : spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
10899 239 : iter, bdev_each_channel_cpl);
10900 239 : }
10901 :
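 : /*
 :  * Illustrative pattern (not part of this file): channels are visited one at a
 :  * time on their owning threads, so the per-channel callback may update shared
 :  * context without extra locking, and it must finish by calling
 :  * spdk_bdev_for_each_channel_continue():
 :  *
 :  *	static void
 :  *	count_ch(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 :  *		 struct spdk_io_channel *ch, void *ctx)
 :  *	{
 :  *		(*(uint32_t *)ctx)++;
 :  *		spdk_bdev_for_each_channel_continue(i, 0);
 :  *	}
 :  *
 :  *	spdk_bdev_for_each_channel(bdev, count_ch, &count, count_done_cb);
 :  *
 :  * count_ch, count and count_done_cb are placeholders; the completion callback
 :  * runs on the thread that started the iteration.
 :  */
 :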
10902 : static void
10903 3 : bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10904 : {
10905 3 : struct spdk_bdev_io *parent_io = cb_arg;
10906 :
10907 3 : spdk_bdev_free_io(bdev_io);
10908 :
10909 : /* Check return status of write */
10910 3 : parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
10911 3 : parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
10912 3 : }
10913 :
10914 : static void
10915 3 : bdev_copy_do_write(void *_bdev_io)
10916 : {
10917 3 : struct spdk_bdev_io *bdev_io = _bdev_io;
10918 : int rc;
10919 :
10920 : /* Write blocks */
10921 6 : rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
10922 3 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
10923 3 : bdev_io->u.bdev.iovs[0].iov_base,
10924 3 : bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
10925 3 : bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);
10926 :
10927 3 : if (rc == -ENOMEM) {
10928 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
10929 3 : } else if (rc != 0) {
10930 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10931 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10932 0 : }
10933 3 : }
10934 :
10935 : static void
10936 3 : bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10937 : {
10938 3 : struct spdk_bdev_io *parent_io = cb_arg;
10939 :
10940 3 : spdk_bdev_free_io(bdev_io);
10941 :
10942 : /* Check return status of read */
10943 3 : if (!success) {
10944 0 : parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10945 0 : parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
10946 0 : return;
10947 : }
10948 :
10949 : /* Do write */
10950 3 : bdev_copy_do_write(parent_io);
10951 3 : }
10952 :
10953 : static void
10954 3 : bdev_copy_do_read(void *_bdev_io)
10955 : {
10956 3 : struct spdk_bdev_io *bdev_io = _bdev_io;
10957 : int rc;
10958 :
10959 : /* Read blocks */
10960 6 : rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
10961 3 : spdk_io_channel_from_ctx(bdev_io->internal.ch),
10962 3 : bdev_io->u.bdev.iovs[0].iov_base,
10963 3 : bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
10964 3 : bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);
10965 :
10966 3 : if (rc == -ENOMEM) {
10967 0 : bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
10968 3 : } else if (rc != 0) {
10969 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10970 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10971 0 : }
10972 3 : }
10973 :
10974 : static void
10975 3 : bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
10976 : {
10977 3 : if (!success) {
10978 0 : bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10979 0 : bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10980 0 : return;
10981 : }
10982 :
10983 3 : bdev_copy_do_read(bdev_io);
10984 3 : }
10985 :
10986 : int
10987 27 : spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
10988 : uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
10989 : spdk_bdev_io_completion_cb cb, void *cb_arg)
10990 : {
10991 27 : struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10992 : struct spdk_bdev_io *bdev_io;
10993 27 : struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
10994 :
10995 27 : if (!desc->write) {
10996 0 : return -EBADF;
10997 : }
10998 :
10999 27 : if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
11000 27 : !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
11001 0 : SPDK_DEBUGLOG(bdev,
11002 :			      "Invalid offset or number of blocks: dst %" PRIu64 ", src %" PRIu64 ", count %" PRIu64 "\n",
11003 : dst_offset_blocks, src_offset_blocks, num_blocks);
11004 0 : return -EINVAL;
11005 : }
11006 :
11007 27 : bdev_io = bdev_channel_get_io(channel);
11008 27 : if (!bdev_io) {
11009 0 : return -ENOMEM;
11010 : }
11011 :
11012 27 : bdev_io->internal.ch = channel;
11013 27 : bdev_io->internal.desc = desc;
11014 27 : bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
11015 :
11016 27 : bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
11017 27 : bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
11018 27 : bdev_io->u.bdev.num_blocks = num_blocks;
11019 27 : bdev_io->u.bdev.memory_domain = NULL;
11020 27 : bdev_io->u.bdev.memory_domain_ctx = NULL;
11021 27 : bdev_io->u.bdev.iovs = NULL;
11022 27 : bdev_io->u.bdev.iovcnt = 0;
11023 27 : bdev_io->u.bdev.md_buf = NULL;
11024 27 : bdev_io->u.bdev.accel_sequence = NULL;
11025 27 : bdev_io_init(bdev_io, bdev, cb_arg, cb);
11026 :
11027 27 : if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) {
11028 0 : spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
11029 0 : return 0;
11030 : }
11031 :
11032 :
11033 : /* If the copy size is large and should be split, use the generic split logic
11034 : * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
11035 : *
11036 : * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
11037 : * emulate it using regular read and write requests otherwise.
11038 : */
11039 27 : if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
11040 4 : bdev_io->internal.f.split) {
11041 24 : bdev_io_submit(bdev_io);
11042 24 : return 0;
11043 : }
11044 :
11045 3 : spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));
11046 :
11047 3 : return 0;
11048 27 : }
11049 :
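 : /*
 :  * Illustrative caller sketch (not part of this file): copy num_blocks blocks
 :  * from src to dst on the same bdev; the descriptor must be open for writing:
 :  *
 :  *	rc = spdk_bdev_copy_blocks(desc, io_ch, dst_lba, src_lba, num_blocks,
 :  *				   copy_done_cb, cb_ctx);
 :  *	if (rc == -ENOMEM) {
 :  *		... retry later, e.g. via spdk_bdev_queue_io_wait() ...
 :  *	}
 :  *
 :  * copy_done_cb and cb_ctx are placeholder names. When the bdev does not support
 :  * SPDK_BDEV_IO_TYPE_COPY, the copy is emulated with a read followed by a write.
 :  */
 :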
11050 3 : SPDK_LOG_REGISTER_COMPONENT(bdev)
11051 :
11052 : static void
11053 0 : bdev_trace(void)
11054 : {
11055 0 : struct spdk_trace_tpoint_opts opts[] = {
11056 : {
11057 : "BDEV_IO_START", TRACE_BDEV_IO_START,
11058 : OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1,
11059 : {
11060 : { "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
11061 : { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
11062 : { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
11063 : { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
11064 : }
11065 : },
11066 : {
11067 : "BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
11068 : OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0,
11069 : {
11070 : { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
11071 : { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
11072 : }
11073 : },
11074 : {
11075 : "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
11076 : OWNER_TYPE_BDEV, OBJECT_NONE, 0,
11077 : {
11078 : { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
11079 : }
11080 : },
11081 : {
11082 : "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
11083 : OWNER_TYPE_BDEV, OBJECT_NONE, 0,
11084 : {
11085 : { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
11086 : }
11087 : },
11088 : };
11089 :
11090 :
11091 0 : spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b');
11092 0 : spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
11093 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
11094 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
11095 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
11096 0 : spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0);
11097 0 : spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0);
11098 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0);
11099 0 : spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0);
11100 0 : }
11101 3 : SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
|