Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2016 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
6 : */
7 :
8 : #include "spdk/stdinc.h"
9 :
10 : #include "bdev_nvme.h"
11 :
12 : #include "spdk/accel.h"
13 : #include "spdk/config.h"
14 : #include "spdk/endian.h"
15 : #include "spdk/bdev.h"
16 : #include "spdk/json.h"
17 : #include "spdk/keyring.h"
18 : #include "spdk/likely.h"
19 : #include "spdk/nvme.h"
20 : #include "spdk/nvme_ocssd.h"
21 : #include "spdk/nvme_zns.h"
22 : #include "spdk/opal.h"
23 : #include "spdk/thread.h"
24 : #include "spdk/trace.h"
25 : #include "spdk/string.h"
26 : #include "spdk/util.h"
27 : #include "spdk/uuid.h"
28 :
29 : #include "spdk/bdev_module.h"
30 : #include "spdk/log.h"
31 :
32 : #include "spdk_internal/usdt.h"
33 : #include "spdk_internal/trace_defs.h"
34 :
35 : #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
36 : #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000)
37 :
38 : #define NSID_STR_LEN 10
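 : /* A uint32_t namespace ID is at most 10 decimal digits (UINT32_MAX is 4294967295), which is
 :  * presumably why NSID_STR_LEN is 10.
 :  */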
39 :
40 : #define SPDK_CONTROLLER_NAME_MAX 512
41 :
42 : static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
43 :
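 : /* Per-I/O context: this module's get_ctx_size() callback returns sizeof(struct nvme_bdev_io),
 :  * so one of these lives in the driver_ctx area of every spdk_bdev_io submitted to an NVMe bdev.
 :  */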
44 : struct nvme_bdev_io {
45 : /** array of iovecs to transfer. */
46 : struct iovec *iovs;
47 :
48 : /** Number of iovecs in iovs array. */
49 : int iovcnt;
50 :
51 : /** Current iovec position. */
52 : int iovpos;
53 :
54 : /** Offset in current iovec. */
55 : uint32_t iov_offset;
56 :
57 : /** Offset in the current fused iovec. */
58 : uint32_t fused_iov_offset;
59 :
60 : /** Array of iovecs to transfer for the fused command. */
61 : struct iovec *fused_iovs;
62 :
63 : /** Number of iovecs in the fused_iovs array. */
64 : int fused_iovcnt;
65 :
66 : /** Current position in the fused_iovs array. */
67 : int fused_iovpos;
68 :
69 : /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
70 : * being reset in a reset I/O.
71 : */
72 : struct nvme_io_path *io_path;
73 :
74 : /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
75 : struct spdk_nvme_cpl cpl;
76 :
77 : /** Extended I/O options passed by the user to the bdev layer and mapped to the NVMe format */
78 : struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
79 :
80 : /** Tracks whether the first of the fused commands has been submitted */
81 : bool first_fused_submitted;
82 :
83 : /** Tracks whether the first of the fused commands has completed */
84 : bool first_fused_completed;
85 :
86 : /* How many times the current I/O was retried. */
87 : int32_t retry_count;
88 :
89 : /** Expiration value in ticks to retry the current I/O. */
90 : uint64_t retry_ticks;
91 :
92 : /** Temporary pointer to zone report buffer */
93 : struct spdk_nvme_zns_zone_report *zone_report_buf;
94 :
95 : /** Number of zones that have been copied to the spdk_bdev_zone_info struct */
96 : uint64_t handled_zones;
97 :
98 : /* Current tsc at submit time. */
99 : uint64_t submit_tsc;
100 :
101 : /* Used to link the nvme_bdev_io into the per-channel retry list */
102 : TAILQ_ENTRY(nvme_bdev_io) retry_link;
103 : };
104 :
105 : struct nvme_probe_skip_entry {
106 : struct spdk_nvme_transport_id trid;
107 : TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
108 : };
109 : /* Controllers deleted by users via RPC are skipped by the hotplug monitor */
110 : static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
111 : g_skipped_nvme_ctrlrs);
112 :
113 : #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
114 : SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
115 : SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))
116 :
117 : #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
118 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
119 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
120 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
121 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
122 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))
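 : /* Default bitmasks of DH-HMAC-CHAP hash functions and Diffie-Hellman groups allowed for
 :  * in-band authentication; they seed the dhchap_digests and dhchap_dhgroups options below.
 :  */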
123 :
124 : static struct spdk_bdev_nvme_opts g_opts = {
125 : .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
126 : .timeout_us = 0,
127 : .timeout_admin_us = 0,
128 : .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
129 : .transport_retry_count = 4,
130 : .arbitration_burst = 0,
131 : .low_priority_weight = 0,
132 : .medium_priority_weight = 0,
133 : .high_priority_weight = 0,
134 : .nvme_adminq_poll_period_us = 10000ULL,
135 : .nvme_ioq_poll_period_us = 0,
136 : .io_queue_requests = 0,
137 : .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
138 : .bdev_retry_count = 3,
139 : .transport_ack_timeout = 0,
140 : .ctrlr_loss_timeout_sec = 0,
141 : .reconnect_delay_sec = 0,
142 : .fast_io_fail_timeout_sec = 0,
143 : .disable_auto_failback = false,
144 : .generate_uuids = false,
145 : .transport_tos = 0,
146 : .nvme_error_stat = false,
147 : .io_path_stat = false,
148 : .allow_accel_sequence = false,
149 : .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
150 : .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
151 : };
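 : /* These are module-wide defaults. They can normally be overridden before any controller is
 :  * attached (for example via the bdev_nvme_set_options RPC in a typical SPDK deployment; that
 :  * RPC is mentioned here only as illustration and is not defined in this file).
 :  */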
152 :
153 : #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
154 : #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL
155 :
156 : static int g_hot_insert_nvme_controller_index = 0;
157 : static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
158 : static bool g_nvme_hotplug_enabled = false;
159 : struct spdk_thread *g_bdev_nvme_init_thread;
160 : static struct spdk_poller *g_hotplug_poller;
161 : static struct spdk_poller *g_hotplug_probe_poller;
162 : static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
163 :
164 : static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
165 : struct nvme_async_probe_ctx *ctx);
166 : static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
167 : struct nvme_async_probe_ctx *ctx);
168 : static int bdev_nvme_library_init(void);
169 : static void bdev_nvme_library_fini(void);
170 : static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
171 : struct spdk_bdev_io *bdev_io);
172 : static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
173 : struct spdk_bdev_io *bdev_io);
174 : static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
175 : void *md, uint64_t lba_count, uint64_t lba,
176 : uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
177 : struct spdk_accel_sequence *seq);
178 : static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
179 : void *md, uint64_t lba_count, uint64_t lba);
180 : static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
181 : void *md, uint64_t lba_count, uint64_t lba,
182 : uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
183 : struct spdk_accel_sequence *seq,
184 : union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
185 : static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
186 : void *md, uint64_t lba_count,
187 : uint64_t zslba, uint32_t flags);
188 : static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
189 : void *md, uint64_t lba_count, uint64_t lba,
190 : uint32_t flags);
191 : static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
192 : struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
193 : int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
194 : uint32_t flags);
195 : static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
196 : uint32_t num_zones, struct spdk_bdev_zone_info *info);
197 : static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
198 : enum spdk_bdev_zone_action action);
199 : static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
200 : struct nvme_bdev_io *bio,
201 : struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
202 : static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
203 : void *buf, size_t nbytes);
204 : static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
205 : void *buf, size_t nbytes, void *md_buf, size_t md_len);
206 : static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
207 : struct iovec *iov, int iovcnt, size_t nbytes,
208 : void *md_buf, size_t md_len);
209 : static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
210 : struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
211 : static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio);
212 : static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
213 : static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
214 : static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
215 : static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);
216 :
217 : static struct nvme_ns *nvme_ns_alloc(void);
218 : static void nvme_ns_free(struct nvme_ns *ns);
219 :
220 : static int
221 175 : nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
222 : {
223 175 : return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
224 : }
225 :
226 914 : RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
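 : /* A controller's namespaces are kept in a red-black tree keyed by namespace ID, so lookup by
 :  * nsid (nvme_ctrlr_get_ns) is logarithmic and nvme_ctrlr_get_first/next_active_ns iterate the
 :  * namespaces in ascending nsid order.
 :  */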
227 :
228 : struct spdk_nvme_qpair *
229 1 : bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
230 : {
231 : struct nvme_ctrlr_channel *ctrlr_ch;
232 :
233 1 : assert(ctrlr_io_ch != NULL);
234 :
235 1 : ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
236 :
237 1 : return ctrlr_ch->qpair->qpair;
238 : }
239 :
240 : static int
241 0 : bdev_nvme_get_ctx_size(void)
242 : {
243 0 : return sizeof(struct nvme_bdev_io);
244 : }
245 :
246 : static struct spdk_bdev_module nvme_if = {
247 : .name = "nvme",
248 : .async_fini = true,
249 : .module_init = bdev_nvme_library_init,
250 : .module_fini = bdev_nvme_library_fini,
251 : .config_json = bdev_nvme_config_json,
252 : .get_ctx_size = bdev_nvme_get_ctx_size,
253 :
254 : };
255 1 : SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
256 :
257 : struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
258 : pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
259 : bool g_bdev_nvme_module_finish;
260 :
261 : struct nvme_bdev_ctrlr *
262 275 : nvme_bdev_ctrlr_get_by_name(const char *name)
263 : {
264 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
265 :
266 275 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
267 150 : if (strcmp(name, nbdev_ctrlr->name) == 0) {
268 150 : break;
269 : }
270 : }
271 :
272 275 : return nbdev_ctrlr;
273 : }
274 :
275 : static struct nvme_ctrlr *
276 58 : nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
277 : const struct spdk_nvme_transport_id *trid, const char *hostnqn)
278 : {
279 : const struct spdk_nvme_ctrlr_opts *opts;
280 : struct nvme_ctrlr *nvme_ctrlr;
281 :
282 99 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
283 74 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
284 74 : if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
285 33 : strcmp(hostnqn, opts->hostnqn) == 0) {
286 33 : break;
287 : }
288 : }
289 :
290 58 : return nvme_ctrlr;
291 : }
292 :
293 : struct nvme_ctrlr *
294 0 : nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
295 : uint16_t cntlid)
296 : {
297 : struct nvme_ctrlr *nvme_ctrlr;
298 : const struct spdk_nvme_ctrlr_data *cdata;
299 :
300 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
301 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
302 0 : if (cdata->cntlid == cntlid) {
303 0 : break;
304 : }
305 : }
306 :
307 0 : return nvme_ctrlr;
308 : }
309 :
310 : static struct nvme_bdev *
311 73 : nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
312 : {
313 : struct nvme_bdev *bdev;
314 :
315 73 : pthread_mutex_lock(&g_bdev_nvme_mutex);
316 107 : TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
317 68 : if (bdev->nsid == nsid) {
318 34 : break;
319 : }
320 : }
321 73 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
322 :
323 73 : return bdev;
324 : }
325 :
326 : struct nvme_ns *
327 143 : nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
328 : {
329 143 : struct nvme_ns ns;
330 :
331 143 : assert(nsid > 0);
332 :
333 143 : ns.id = nsid;
334 143 : return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
335 : }
336 :
337 : struct nvme_ns *
338 155 : nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
339 : {
340 155 : return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
341 : }
342 :
343 : struct nvme_ns *
344 65 : nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
345 : {
346 65 : if (ns == NULL) {
347 0 : return NULL;
348 : }
349 :
350 65 : return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
351 : }
352 :
353 : static struct nvme_ctrlr *
354 52 : nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
355 : {
356 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
357 52 : struct nvme_ctrlr *nvme_ctrlr = NULL;
358 :
359 52 : pthread_mutex_lock(&g_bdev_nvme_mutex);
360 71 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
361 19 : nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
362 19 : if (nvme_ctrlr != NULL) {
363 0 : break;
364 : }
365 : }
366 52 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
367 :
368 52 : return nvme_ctrlr;
369 : }
370 :
371 : struct nvme_ctrlr *
372 73 : nvme_ctrlr_get_by_name(const char *name)
373 : {
374 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
375 73 : struct nvme_ctrlr *nvme_ctrlr = NULL;
376 :
377 73 : if (name == NULL) {
378 0 : return NULL;
379 : }
380 :
381 73 : pthread_mutex_lock(&g_bdev_nvme_mutex);
382 73 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
383 73 : if (nbdev_ctrlr != NULL) {
384 41 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
385 : }
386 73 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
387 :
388 73 : return nvme_ctrlr;
389 : }
390 :
391 : void
392 0 : nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
393 : {
394 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
395 :
396 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
397 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
398 0 : fn(nbdev_ctrlr, ctx);
399 : }
400 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
401 0 : }
402 :
403 : struct nvme_ctrlr_channel_iter {
404 : nvme_ctrlr_for_each_channel_msg fn;
405 : nvme_ctrlr_for_each_channel_done cpl;
406 : struct spdk_io_channel_iter *i;
407 : void *ctx;
408 : };
409 :
410 : void
411 194 : nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status)
412 : {
413 194 : spdk_for_each_channel_continue(iter->i, status);
414 194 : }
415 :
416 : static void
417 194 : nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i)
418 : {
419 194 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
420 194 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
421 194 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
422 194 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
423 :
424 194 : iter->i = i;
425 194 : iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx);
426 194 : }
427 :
428 : static void
429 120 : nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
430 : {
431 120 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
432 120 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
433 :
434 120 : iter->i = i;
435 120 : iter->cpl(nvme_ctrlr, iter->ctx, status);
436 :
437 120 : free(iter);
438 120 : }
439 :
440 : void
441 120 : nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr,
442 : nvme_ctrlr_for_each_channel_msg fn, void *ctx,
443 : nvme_ctrlr_for_each_channel_done cpl)
444 : {
445 : struct nvme_ctrlr_channel_iter *iter;
446 :
447 120 : assert(nvme_ctrlr != NULL && fn != NULL);
448 :
449 120 : iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter));
450 120 : if (iter == NULL) {
451 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
452 0 : assert(false);
453 : return;
454 : }
455 :
456 120 : iter->fn = fn;
457 120 : iter->cpl = cpl;
458 120 : iter->ctx = ctx;
459 :
460 120 : spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg,
461 : iter, nvme_ctrlr_each_channel_cpl);
462 : }
463 :
464 : struct nvme_bdev_channel_iter {
465 : nvme_bdev_for_each_channel_msg fn;
466 : nvme_bdev_for_each_channel_done cpl;
467 : struct spdk_io_channel_iter *i;
468 : void *ctx;
469 : };
470 :
471 : void
472 39 : nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status)
473 : {
474 39 : spdk_for_each_channel_continue(iter->i, status);
475 39 : }
476 :
477 : static void
478 39 : nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i)
479 : {
480 39 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
481 39 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
482 39 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
483 39 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
484 :
485 39 : iter->i = i;
486 39 : iter->fn(iter, nbdev, nbdev_ch, iter->ctx);
487 39 : }
488 :
489 : static void
490 45 : nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
491 : {
492 45 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
493 45 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
494 :
495 45 : iter->i = i;
496 45 : iter->cpl(nbdev, iter->ctx, status);
497 :
498 45 : free(iter);
499 45 : }
500 :
501 : void
502 45 : nvme_bdev_for_each_channel(struct nvme_bdev *nbdev,
503 : nvme_bdev_for_each_channel_msg fn, void *ctx,
504 : nvme_bdev_for_each_channel_done cpl)
505 : {
506 : struct nvme_bdev_channel_iter *iter;
507 :
508 45 : assert(nbdev != NULL && fn != NULL);
509 :
510 45 : iter = calloc(1, sizeof(struct nvme_bdev_channel_iter));
511 45 : if (iter == NULL) {
512 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
513 0 : assert(false);
514 : return;
515 : }
516 :
517 45 : iter->fn = fn;
518 45 : iter->cpl = cpl;
519 45 : iter->ctx = ctx;
520 :
521 45 : spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter,
522 : nvme_bdev_each_channel_cpl);
523 : }
524 :
525 : void
526 0 : nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
527 : {
528 : const char *trtype_str;
529 : const char *adrfam_str;
530 :
531 0 : trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
532 0 : if (trtype_str) {
533 0 : spdk_json_write_named_string(w, "trtype", trtype_str);
534 : }
535 :
536 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
537 0 : if (adrfam_str) {
538 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
539 : }
540 :
541 0 : if (trid->traddr[0] != '\0') {
542 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
543 : }
544 :
545 0 : if (trid->trsvcid[0] != '\0') {
546 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
547 : }
548 :
549 0 : if (trid->subnqn[0] != '\0') {
550 0 : spdk_json_write_named_string(w, "subnqn", trid->subnqn);
551 : }
552 0 : }
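 : /* For an illustrative TCP transport ID (all values made up), the writer calls above would emit
 :  * JSON fields such as:
 :  *   "trtype": "TCP", "adrfam": "IPv4", "traddr": "192.168.0.10",
 :  *   "trsvcid": "4420", "subnqn": "nqn.2016-06.io.spdk:cnode1"
 :  */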
553 :
554 : static void
555 60 : nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
556 : struct nvme_ctrlr *nvme_ctrlr)
557 : {
558 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
559 60 : pthread_mutex_lock(&g_bdev_nvme_mutex);
560 :
561 60 : TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
562 60 : if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
563 15 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
564 :
565 15 : return;
566 : }
567 45 : TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
568 :
569 45 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
570 :
571 45 : assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
572 :
573 45 : free(nbdev_ctrlr->name);
574 45 : free(nbdev_ctrlr);
575 : }
576 :
577 : static void
578 61 : _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
579 : {
580 : struct nvme_path_id *path_id, *tmp_path;
581 : struct nvme_ns *ns, *tmp_ns;
582 :
583 61 : free(nvme_ctrlr->copied_ana_desc);
584 61 : spdk_free(nvme_ctrlr->ana_log_page);
585 :
586 61 : if (nvme_ctrlr->opal_dev) {
587 0 : spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
588 0 : nvme_ctrlr->opal_dev = NULL;
589 : }
590 :
591 61 : if (nvme_ctrlr->nbdev_ctrlr) {
592 60 : nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
593 : }
594 :
595 61 : RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
596 0 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
597 0 : nvme_ns_free(ns);
598 : }
599 :
600 122 : TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
601 61 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
602 61 : free(path_id);
603 : }
604 :
605 61 : pthread_mutex_destroy(&nvme_ctrlr->mutex);
606 61 : spdk_keyring_put_key(nvme_ctrlr->psk);
607 61 : spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
608 61 : spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
609 61 : free(nvme_ctrlr);
610 :
611 61 : pthread_mutex_lock(&g_bdev_nvme_mutex);
612 61 : if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
613 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
614 0 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
615 0 : spdk_bdev_module_fini_done();
616 0 : return;
617 : }
618 61 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
619 : }
620 :
621 : static int
622 61 : nvme_detach_poller(void *arg)
623 : {
624 61 : struct nvme_ctrlr *nvme_ctrlr = arg;
625 : int rc;
626 :
627 61 : rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
628 61 : if (rc != -EAGAIN) {
629 61 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
630 61 : _nvme_ctrlr_delete(nvme_ctrlr);
631 : }
632 :
633 61 : return SPDK_POLLER_BUSY;
634 : }
635 :
636 : static void
637 61 : nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
638 : {
639 : int rc;
640 :
641 61 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
642 :
643 : /* First, unregister the adminq poller, as the driver will poll adminq if necessary */
644 61 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
645 :
646 : /* If we got here, the reset/detach poller cannot be active */
647 61 : assert(nvme_ctrlr->reset_detach_poller == NULL);
648 61 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
649 : nvme_ctrlr, 1000);
650 61 : if (nvme_ctrlr->reset_detach_poller == NULL) {
651 0 : SPDK_ERRLOG("Failed to register detach poller\n");
652 0 : goto error;
653 : }
654 :
655 61 : rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
656 61 : if (rc != 0) {
657 0 : SPDK_ERRLOG("Failed to detach the NVMe controller\n");
658 0 : goto error;
659 : }
660 :
661 61 : return;
662 0 : error:
663 : /* We don't have a good way to handle errors here, so just do what we can and delete the
664 : * controller without detaching the underlying NVMe device.
665 : */
666 0 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
667 0 : _nvme_ctrlr_delete(nvme_ctrlr);
668 : }
669 :
670 : static void
671 60 : nvme_ctrlr_unregister_cb(void *io_device)
672 : {
673 60 : struct nvme_ctrlr *nvme_ctrlr = io_device;
674 :
675 60 : nvme_ctrlr_delete(nvme_ctrlr);
676 60 : }
677 :
678 : static void
679 60 : nvme_ctrlr_unregister(void *ctx)
680 : {
681 60 : struct nvme_ctrlr *nvme_ctrlr = ctx;
682 :
683 60 : spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
684 60 : }
685 :
686 : static bool
687 225 : nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
688 : {
689 225 : if (!nvme_ctrlr->destruct) {
690 109 : return false;
691 : }
692 :
693 116 : if (nvme_ctrlr->ref > 0) {
694 56 : return false;
695 : }
696 :
697 60 : if (nvme_ctrlr->resetting) {
698 0 : return false;
699 : }
700 :
701 60 : if (nvme_ctrlr->ana_log_page_updating) {
702 0 : return false;
703 : }
704 :
705 60 : if (nvme_ctrlr->io_path_cache_clearing) {
706 0 : return false;
707 : }
708 :
709 60 : return true;
710 : }
711 :
712 : static void
713 168 : nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
714 : {
715 168 : pthread_mutex_lock(&nvme_ctrlr->mutex);
716 : SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);
717 :
718 168 : assert(nvme_ctrlr->ref > 0);
719 168 : nvme_ctrlr->ref--;
720 :
721 168 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
722 108 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
723 108 : return;
724 : }
725 :
726 60 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
727 :
728 60 : spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
729 : }
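 : /* Dropping the last reference does not free the controller inline: once 'destruct' is set and
 :  * no resets, ANA log page updates, or io_path cache clears are in flight (see
 :  * nvme_ctrlr_can_be_unregistered() above), unregistration is scheduled on the controller's
 :  * thread, which eventually detaches the underlying device and frees the structure.
 :  */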
730 :
731 : static void
732 171 : bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
733 : {
734 171 : nbdev_ch->current_io_path = NULL;
735 171 : nbdev_ch->rr_counter = 0;
736 171 : }
737 :
738 : static struct nvme_io_path *
739 8 : _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
740 : {
741 : struct nvme_io_path *io_path;
742 :
743 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
744 15 : if (io_path->nvme_ns == nvme_ns) {
745 7 : break;
746 : }
747 : }
748 :
749 8 : return io_path;
750 : }
751 :
752 : static struct nvme_io_path *
753 37 : nvme_io_path_alloc(void)
754 : {
755 : struct nvme_io_path *io_path;
756 :
757 37 : io_path = calloc(1, sizeof(*io_path));
758 37 : if (io_path == NULL) {
759 0 : SPDK_ERRLOG("Failed to alloc io_path.\n");
760 0 : return NULL;
761 : }
762 :
763 37 : if (g_opts.io_path_stat) {
764 0 : io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
765 0 : if (io_path->stat == NULL) {
766 0 : free(io_path);
767 0 : SPDK_ERRLOG("Failed to alloc io_path stat.\n");
768 0 : return NULL;
769 : }
770 0 : spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
771 : }
772 :
773 37 : return io_path;
774 : }
775 :
776 : static void
777 37 : nvme_io_path_free(struct nvme_io_path *io_path)
778 : {
779 37 : free(io_path->stat);
780 37 : free(io_path);
781 37 : }
782 :
783 : static int
784 37 : _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
785 : {
786 : struct nvme_io_path *io_path;
787 : struct spdk_io_channel *ch;
788 : struct nvme_ctrlr_channel *ctrlr_ch;
789 : struct nvme_qpair *nvme_qpair;
790 :
791 37 : io_path = nvme_io_path_alloc();
792 37 : if (io_path == NULL) {
793 0 : return -ENOMEM;
794 : }
795 :
796 37 : io_path->nvme_ns = nvme_ns;
797 :
798 37 : ch = spdk_get_io_channel(nvme_ns->ctrlr);
799 37 : if (ch == NULL) {
800 0 : nvme_io_path_free(io_path);
801 0 : SPDK_ERRLOG("Failed to alloc io_channel.\n");
802 0 : return -ENOMEM;
803 : }
804 :
805 37 : ctrlr_ch = spdk_io_channel_get_ctx(ch);
806 :
807 37 : nvme_qpair = ctrlr_ch->qpair;
808 37 : assert(nvme_qpair != NULL);
809 :
810 37 : io_path->qpair = nvme_qpair;
811 37 : TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);
812 :
813 37 : io_path->nbdev_ch = nbdev_ch;
814 37 : STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
815 :
816 37 : bdev_nvme_clear_current_io_path(nbdev_ch);
817 :
818 37 : return 0;
819 : }
820 :
821 : static void
822 37 : bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
823 : struct nvme_io_path *io_path)
824 : {
825 : struct nvme_bdev_io *bio;
826 :
827 38 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
828 1 : if (bio->io_path == io_path) {
829 1 : bio->io_path = NULL;
830 : }
831 : }
832 37 : }
833 :
834 : static void
835 37 : _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
836 : {
837 : struct spdk_io_channel *ch;
838 : struct nvme_qpair *nvme_qpair;
839 : struct nvme_ctrlr_channel *ctrlr_ch;
840 : struct nvme_bdev *nbdev;
841 :
842 37 : nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));
843 :
844 : /* Add the statistics to nvme_ns before this path is destroyed. */
845 37 : pthread_mutex_lock(&nbdev->mutex);
846 37 : if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
847 0 : spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
848 : }
849 37 : pthread_mutex_unlock(&nbdev->mutex);
850 :
851 37 : bdev_nvme_clear_current_io_path(nbdev_ch);
852 37 : bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);
853 :
854 37 : STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
855 37 : io_path->nbdev_ch = NULL;
856 :
857 37 : nvme_qpair = io_path->qpair;
858 37 : assert(nvme_qpair != NULL);
859 :
860 37 : ctrlr_ch = nvme_qpair->ctrlr_ch;
861 37 : assert(ctrlr_ch != NULL);
862 :
863 37 : ch = spdk_io_channel_from_ctx(ctrlr_ch);
864 37 : spdk_put_io_channel(ch);
865 :
866 : /* After an io_path is removed, I/Os submitted to it may still complete and update its
867 : * statistics. To avoid a heap-use-after-free in that case, do not free the io_path here;
868 : * free it when the associated qpair is freed. All I/Os submitted to the io_path are
869 : * guaranteed to have completed by the time the associated qpair is freed.
870 : */
871 37 : }
872 :
873 : static void
874 24 : _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
875 : {
876 : struct nvme_io_path *io_path, *tmp_io_path;
877 :
878 59 : STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
879 35 : _bdev_nvme_delete_io_path(nbdev_ch, io_path);
880 : }
881 24 : }
882 :
883 : static int
884 24 : bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
885 : {
886 24 : struct nvme_bdev_channel *nbdev_ch = ctx_buf;
887 24 : struct nvme_bdev *nbdev = io_device;
888 : struct nvme_ns *nvme_ns;
889 : int rc;
890 :
891 24 : STAILQ_INIT(&nbdev_ch->io_path_list);
892 24 : TAILQ_INIT(&nbdev_ch->retry_io_list);
893 :
894 24 : pthread_mutex_lock(&nbdev->mutex);
895 :
896 24 : nbdev_ch->mp_policy = nbdev->mp_policy;
897 24 : nbdev_ch->mp_selector = nbdev->mp_selector;
898 24 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
899 :
900 59 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
901 35 : rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
902 35 : if (rc != 0) {
903 0 : pthread_mutex_unlock(&nbdev->mutex);
904 :
905 0 : _bdev_nvme_delete_io_paths(nbdev_ch);
906 0 : return rc;
907 : }
908 : }
909 24 : pthread_mutex_unlock(&nbdev->mutex);
910 :
911 24 : return 0;
912 : }
913 :
914 : /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
915 : * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
916 : */
917 : static inline void
918 50 : __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
919 : const struct spdk_nvme_cpl *cpl)
920 : {
921 50 : spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
922 : (uintptr_t)bdev_io);
923 50 : if (cpl) {
924 29 : spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
925 : } else {
926 21 : spdk_bdev_io_complete(bdev_io, status);
927 : }
928 50 : }
929 :
930 : static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);
931 :
932 : static void
933 24 : bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
934 : {
935 24 : struct nvme_bdev_channel *nbdev_ch = ctx_buf;
936 :
937 24 : bdev_nvme_abort_retry_ios(nbdev_ch);
938 24 : _bdev_nvme_delete_io_paths(nbdev_ch);
939 24 : }
940 :
941 : static inline bool
942 62 : bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
943 : {
944 62 : switch (io_type) {
945 5 : case SPDK_BDEV_IO_TYPE_RESET:
946 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
947 : case SPDK_BDEV_IO_TYPE_ABORT:
948 5 : return true;
949 57 : default:
950 57 : break;
951 : }
952 :
953 57 : return false;
954 : }
955 :
956 : static inline bool
957 91 : nvme_ns_is_active(struct nvme_ns *nvme_ns)
958 : {
959 91 : if (spdk_unlikely(nvme_ns->ana_state_updating)) {
960 1 : return false;
961 : }
962 :
963 90 : if (spdk_unlikely(nvme_ns->ns == NULL)) {
964 0 : return false;
965 : }
966 :
967 90 : return true;
968 : }
969 :
970 : static inline bool
971 79 : nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
972 : {
973 79 : if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
974 1 : return false;
975 : }
976 :
977 78 : switch (nvme_ns->ana_state) {
978 69 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
979 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
980 69 : return true;
981 9 : default:
982 9 : break;
983 : }
984 :
985 9 : return false;
986 : }
987 :
988 : static inline bool
989 121 : nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
990 : {
991 121 : if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
992 23 : return false;
993 : }
994 :
995 98 : if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
996 : SPDK_NVME_QPAIR_FAILURE_NONE)) {
997 2 : return false;
998 : }
999 :
1000 96 : if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
1001 0 : return false;
1002 : }
1003 :
1004 96 : return true;
1005 : }
1006 :
1007 : static inline bool
1008 95 : nvme_io_path_is_available(struct nvme_io_path *io_path)
1009 : {
1010 95 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1011 16 : return false;
1012 : }
1013 :
1014 79 : if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
1015 10 : return false;
1016 : }
1017 :
1018 69 : return true;
1019 : }
1020 :
1021 : static inline bool
1022 9 : nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
1023 : {
1024 9 : if (nvme_ctrlr->destruct) {
1025 0 : return true;
1026 : }
1027 :
1028 9 : if (nvme_ctrlr->fast_io_fail_timedout) {
1029 2 : return true;
1030 : }
1031 :
1032 7 : if (nvme_ctrlr->resetting) {
1033 5 : if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
1034 5 : return false;
1035 : } else {
1036 0 : return true;
1037 : }
1038 : }
1039 :
1040 2 : if (nvme_ctrlr->reconnect_is_delayed) {
1041 2 : return false;
1042 : }
1043 :
1044 0 : if (nvme_ctrlr->disabled) {
1045 0 : return true;
1046 : }
1047 :
1048 0 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1049 0 : return true;
1050 : } else {
1051 0 : return false;
1052 : }
1053 : }
1054 :
1055 : static bool
1056 20 : nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
1057 : {
1058 20 : if (nvme_ctrlr->destruct) {
1059 0 : return false;
1060 : }
1061 :
1062 20 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1063 3 : return false;
1064 : }
1065 :
1066 17 : if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
1067 1 : return false;
1068 : }
1069 :
1070 16 : if (nvme_ctrlr->disabled) {
1071 0 : return false;
1072 : }
1073 :
1074 16 : return true;
1075 : }
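 : /* Note the asymmetry: nvme_ctrlr_is_failed() above decides whether retrying I/O is pointless,
 :  * while nvme_ctrlr_is_available() asks whether the ctrlr can accept commands right now. A
 :  * resetting ctrlr is unavailable but, if a reconnect is configured or pending, not yet failed.
 :  */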
1076 :
1077 : /* Simulate a circular linked list: after the last io_path, wrap around to the first. */
1078 : static inline struct nvme_io_path *
1079 92 : nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
1080 : {
1081 : struct nvme_io_path *next_path;
1082 :
1083 92 : if (prev_path != NULL) {
1084 39 : next_path = STAILQ_NEXT(prev_path, stailq);
1085 39 : if (next_path != NULL) {
1086 14 : return next_path;
1087 : }
1088 : }
1089 :
1090 78 : return STAILQ_FIRST(&nbdev_ch->io_path_list);
1091 : }
1092 :
1093 : static struct nvme_io_path *
1094 60 : _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1095 : {
1096 60 : struct nvme_io_path *io_path, *start, *non_optimized = NULL;
1097 :
1098 60 : start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);
1099 :
1100 60 : io_path = start;
1101 : do {
1102 72 : if (spdk_likely(nvme_io_path_is_available(io_path))) {
1103 50 : switch (io_path->nvme_ns->ana_state) {
1104 40 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1105 40 : nbdev_ch->current_io_path = io_path;
1106 40 : return io_path;
1107 10 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1108 10 : if (non_optimized == NULL) {
1109 7 : non_optimized = io_path;
1110 : }
1111 10 : break;
1112 0 : default:
1113 0 : assert(false);
1114 : break;
1115 : }
1116 : }
1117 32 : io_path = nvme_io_path_get_next(nbdev_ch, io_path);
1118 32 : } while (io_path != start);
1119 :
1120 20 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
1121 : /* We get here only if there is no optimized path. Cache even a non-optimized
1122 : * path to load balance across multiple non-optimized paths.
1123 : */
1124 1 : nbdev_ch->current_io_path = non_optimized;
1125 : }
1126 :
1127 20 : return non_optimized;
1128 : }
1129 :
1130 : static struct nvme_io_path *
1131 4 : _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
1132 : {
1133 : struct nvme_io_path *io_path;
1134 4 : struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
1135 4 : uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
1136 : uint32_t num_outstanding_reqs;
1137 :
1138 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1139 12 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1140 : /* The device is currently resetting. */
1141 0 : continue;
1142 : }
1143 :
1144 12 : if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
1145 0 : continue;
1146 : }
1147 :
1148 12 : num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
1149 12 : switch (io_path->nvme_ns->ana_state) {
1150 6 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1151 6 : if (num_outstanding_reqs < opt_min_qd) {
1152 5 : opt_min_qd = num_outstanding_reqs;
1153 5 : optimized = io_path;
1154 : }
1155 6 : break;
1156 3 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1157 3 : if (num_outstanding_reqs < non_opt_min_qd) {
1158 3 : non_opt_min_qd = num_outstanding_reqs;
1159 3 : non_optimized = io_path;
1160 : }
1161 3 : break;
1162 3 : default:
1163 3 : break;
1164 : }
1165 : }
1166 :
1167 : /* Do not cache the io_path for the BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector. */
1168 4 : if (optimized != NULL) {
1169 3 : return optimized;
1170 : }
1171 :
1172 1 : return non_optimized;
1173 : }
1174 :
1175 : static inline struct nvme_io_path *
1176 98 : bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1177 : {
1178 98 : if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
1179 41 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
1180 31 : return nbdev_ch->current_io_path;
1181 10 : } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1182 10 : if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
1183 3 : return nbdev_ch->current_io_path;
1184 : }
1185 7 : nbdev_ch->rr_counter = 0;
1186 : }
1187 : }
1188 :
1189 64 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
1190 14 : nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1191 60 : return _bdev_nvme_find_io_path(nbdev_ch);
1192 : } else {
1193 4 : return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
1194 : }
1195 : }
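 : /* Path selection summary: ACTIVE_PASSIVE reuses the cached current_io_path whenever it is set;
 :  * ACTIVE_ACTIVE with the ROUND_ROBIN selector reuses it for rr_min_io consecutive I/Os before
 :  * re-evaluating; the QUEUE_DEPTH selector re-evaluates on every I/O, never caches, and picks
 :  * the connected path with the fewest outstanding requests.
 :  */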
1196 :
1197 : /* Return true if there is any io_path whose qpair is connected or whose ctrlr is not
1198 : * failed, and false otherwise.
1199 : *
1200 : * If any io_path has a connected qpair but find_io_path() returned NULL, its namespace
1201 : * is probably inaccessible now but may become accessible again later.
1202 : *
1203 : * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr is
1204 : * probably resetting and the reset may succeed. A ctrlr is marked unfailed when a reset
1205 : * starts and is marked failed only if the reset fails. Hence, an unfailed ctrlr is
1206 : * either working fine or resetting.
1207 : */
1208 : static bool
1209 15 : any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
1210 : {
1211 : struct nvme_io_path *io_path;
1212 :
1213 15 : if (nbdev_ch->resetting) {
1214 1 : return false;
1215 : }
1216 :
1217 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1218 14 : if (io_path->nvme_ns->ana_transition_timedout) {
1219 0 : continue;
1220 : }
1221 :
1222 14 : if (nvme_qpair_is_connected(io_path->qpair) ||
1223 9 : !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
1224 12 : return true;
1225 : }
1226 : }
1227 :
1228 2 : return false;
1229 : }
1230 :
1231 : static void
1232 14 : bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
1233 : {
1234 14 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1235 : struct spdk_io_channel *ch;
1236 :
1237 14 : if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
1238 3 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
1239 : } else {
1240 11 : ch = spdk_io_channel_from_ctx(nbdev_ch);
1241 11 : bdev_nvme_submit_request(ch, bdev_io);
1242 : }
1243 14 : }
1244 :
1245 : static int
1246 14 : bdev_nvme_retry_ios(void *arg)
1247 : {
1248 14 : struct nvme_bdev_channel *nbdev_ch = arg;
1249 : struct nvme_bdev_io *bio, *tmp_bio;
1250 : uint64_t now, delay_us;
1251 :
1252 14 : now = spdk_get_ticks();
1253 :
1254 28 : TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
1255 15 : if (bio->retry_ticks > now) {
1256 1 : break;
1257 : }
1258 :
1259 14 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1260 :
1261 14 : bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
1262 : }
1263 :
1264 14 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1265 :
1266 14 : bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
1267 14 : if (bio != NULL) {
1268 4 : delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
1269 :
1270 4 : nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
1271 : delay_us);
1272 : }
1273 :
1274 14 : return SPDK_POLLER_BUSY;
1275 : }
1276 :
1277 : static void
1278 16 : bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
1279 : struct nvme_bdev_io *bio, uint64_t delay_ms)
1280 : {
1281 : struct nvme_bdev_io *tmp_bio;
1282 :
1283 16 : bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
1284 :
1285 16 : TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
1286 1 : if (tmp_bio->retry_ticks <= bio->retry_ticks) {
1287 1 : TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
1288 : retry_link);
1289 1 : return;
1290 : }
1291 : }
1292 :
1293 : /* No earlier I/Os were found. This I/O must be the new head. */
1294 15 : TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);
1295 :
1296 15 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1297 :
1298 15 : nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
1299 : delay_ms * 1000ULL);
1300 : }
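 : /* The retry list is kept sorted by retry_ticks (the insertion above walks from the tail), so
 :  * bdev_nvme_retry_ios() can stop at the first entry whose deadline is still in the future and
 :  * re-arm its poller exactly for that entry.
 :  */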
1301 :
1302 : static void
1303 40 : bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
1304 : {
1305 : struct nvme_bdev_io *bio, *tmp_bio;
1306 :
1307 41 : TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
1308 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1309 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1310 : }
1311 :
1312 40 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1313 40 : }
1314 :
1315 : static int
1316 6 : bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
1317 : struct nvme_bdev_io *bio_to_abort)
1318 : {
1319 : struct nvme_bdev_io *bio;
1320 :
1321 6 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
1322 1 : if (bio == bio_to_abort) {
1323 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1324 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1325 1 : return 0;
1326 : }
1327 : }
1328 :
1329 5 : return -ENOENT;
1330 : }
1331 :
1332 : static void
1333 12 : bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
1334 : {
1335 : struct nvme_bdev *nbdev;
1336 : uint16_t sct, sc;
1337 :
1338 12 : assert(spdk_nvme_cpl_is_error(cpl));
1339 :
1340 12 : nbdev = bdev_io->bdev->ctxt;
1341 :
1342 12 : if (nbdev->err_stat == NULL) {
1343 12 : return;
1344 : }
1345 :
1346 0 : sct = cpl->status.sct;
1347 0 : sc = cpl->status.sc;
1348 :
1349 0 : pthread_mutex_lock(&nbdev->mutex);
1350 :
1351 0 : nbdev->err_stat->status_type[sct]++;
1352 0 : switch (sct) {
1353 0 : case SPDK_NVME_SCT_GENERIC:
1354 : case SPDK_NVME_SCT_COMMAND_SPECIFIC:
1355 : case SPDK_NVME_SCT_MEDIA_ERROR:
1356 : case SPDK_NVME_SCT_PATH:
1357 0 : nbdev->err_stat->status[sct][sc]++;
1358 0 : break;
1359 0 : default:
1360 0 : break;
1361 : }
1362 :
1363 0 : pthread_mutex_unlock(&nbdev->mutex);
1364 : }
1365 :
1366 : static inline void
1367 20 : bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
1368 : {
1369 20 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1370 20 : uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
1371 20 : uint32_t blocklen = bdev_io->bdev->blocklen;
1372 : struct spdk_bdev_io_stat *stat;
1373 : uint64_t tsc_diff;
1374 :
1375 20 : if (bio->io_path->stat == NULL) {
1376 20 : return;
1377 : }
1378 :
1379 0 : tsc_diff = spdk_get_ticks() - bio->submit_tsc;
1380 0 : stat = bio->io_path->stat;
1381 :
1382 0 : switch (bdev_io->type) {
1383 0 : case SPDK_BDEV_IO_TYPE_READ:
1384 0 : stat->bytes_read += num_blocks * blocklen;
1385 0 : stat->num_read_ops++;
1386 0 : stat->read_latency_ticks += tsc_diff;
1387 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1388 0 : stat->max_read_latency_ticks = tsc_diff;
1389 : }
1390 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1391 0 : stat->min_read_latency_ticks = tsc_diff;
1392 : }
1393 0 : break;
1394 0 : case SPDK_BDEV_IO_TYPE_WRITE:
1395 0 : stat->bytes_written += num_blocks * blocklen;
1396 0 : stat->num_write_ops++;
1397 0 : stat->write_latency_ticks += tsc_diff;
1398 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1399 0 : stat->max_write_latency_ticks = tsc_diff;
1400 : }
1401 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1402 0 : stat->min_write_latency_ticks = tsc_diff;
1403 : }
1404 0 : break;
1405 0 : case SPDK_BDEV_IO_TYPE_UNMAP:
1406 0 : stat->bytes_unmapped += num_blocks * blocklen;
1407 0 : stat->num_unmap_ops++;
1408 0 : stat->unmap_latency_ticks += tsc_diff;
1409 0 : if (stat->max_unmap_latency_ticks < tsc_diff) {
1410 0 : stat->max_unmap_latency_ticks = tsc_diff;
1411 : }
1412 0 : if (stat->min_unmap_latency_ticks > tsc_diff) {
1413 0 : stat->min_unmap_latency_ticks = tsc_diff;
1414 : }
1415 0 : break;
1416 0 : case SPDK_BDEV_IO_TYPE_ZCOPY:
1417 : /* Track the data in the start phase only */
1418 0 : if (!bdev_io->u.bdev.zcopy.start) {
1419 0 : break;
1420 : }
1421 0 : if (bdev_io->u.bdev.zcopy.populate) {
1422 0 : stat->bytes_read += num_blocks * blocklen;
1423 0 : stat->num_read_ops++;
1424 0 : stat->read_latency_ticks += tsc_diff;
1425 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1426 0 : stat->max_read_latency_ticks = tsc_diff;
1427 : }
1428 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1429 0 : stat->min_read_latency_ticks = tsc_diff;
1430 : }
1431 : } else {
1432 0 : stat->bytes_written += num_blocks * blocklen;
1433 0 : stat->num_write_ops++;
1434 0 : stat->write_latency_ticks += tsc_diff;
1435 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1436 0 : stat->max_write_latency_ticks = tsc_diff;
1437 : }
1438 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1439 0 : stat->min_write_latency_ticks = tsc_diff;
1440 : }
1441 : }
1442 0 : break;
1443 0 : case SPDK_BDEV_IO_TYPE_COPY:
1444 0 : stat->bytes_copied += num_blocks * blocklen;
1445 0 : stat->num_copy_ops++;
1446 0 : stat->copy_latency_ticks += tsc_diff;
1447 0 : if (stat->max_copy_latency_ticks < tsc_diff) {
1448 0 : stat->max_copy_latency_ticks = tsc_diff;
1449 : }
1450 0 : if (stat->min_copy_latency_ticks > tsc_diff) {
1451 0 : stat->min_copy_latency_ticks = tsc_diff;
1452 : }
1453 0 : break;
1454 0 : default:
1455 0 : break;
1456 : }
1457 : }
1458 :
1459 : static bool
1460 7 : bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
1461 : const struct spdk_nvme_cpl *cpl,
1462 : struct nvme_bdev_channel *nbdev_ch,
1463 : uint64_t *_delay_ms)
1464 : {
1465 7 : struct nvme_io_path *io_path = bio->io_path;
1466 7 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
1467 : const struct spdk_nvme_ctrlr_data *cdata;
1468 :
1469 7 : if (spdk_nvme_cpl_is_path_error(cpl) ||
1470 5 : spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
1471 4 : !nvme_io_path_is_available(io_path) ||
1472 4 : !nvme_ctrlr_is_available(nvme_ctrlr)) {
1473 3 : bdev_nvme_clear_current_io_path(nbdev_ch);
1474 3 : bio->io_path = NULL;
1475 3 : if (spdk_nvme_cpl_is_ana_error(cpl)) {
1476 1 : if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
1477 1 : io_path->nvme_ns->ana_state_updating = true;
1478 : }
1479 : }
1480 3 : if (!any_io_path_may_become_available(nbdev_ch)) {
1481 0 : return false;
1482 : }
1483 3 : *_delay_ms = 0;
1484 : } else {
1485 4 : bio->retry_count++;
1486 :
1487 4 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
1488 :
1489 4 : if (cpl->status.crd != 0) {
1490 1 : *_delay_ms = cdata->crdt[cpl->status.crd] * 100;
1491 : } else {
1492 3 : *_delay_ms = 0;
1493 : }
1494 : }
1495 :
1496 7 : return true;
1497 : }
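 : /* cdata->crdt[] holds the controller's Command Retry Delay Times, which the NVMe specification
 :  * defines in units of 100 milliseconds; hence the delay above is crdt[crd] * 100 ms.
 :  */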
1498 :
1499 : static inline void
1500 32 : bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
1501 : const struct spdk_nvme_cpl *cpl)
1502 : {
1503 32 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1504 : struct nvme_bdev_channel *nbdev_ch;
1505 32 : uint64_t delay_ms;
1506 :
1507 32 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1508 :
1509 32 : if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
1510 20 : bdev_nvme_update_io_path_stat(bio);
1511 20 : goto complete;
1512 : }
1513 :
1514 : /* Update the error counts before deciding whether a retry is needed.
1515 : * Hence, the error counts may exceed the number of errors reported to the upper layer.
1516 : */
1517 12 : bdev_nvme_update_nvme_error_stat(bdev_io, cpl);
1518 :
1519 12 : if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
1520 8 : (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
1521 5 : goto complete;
1522 : }
1523 :
1524 : /* At this point we don't know whether the sequence was successfully executed or not, so we
1525 : * cannot retry the IO */
1526 7 : if (bdev_io->u.bdev.accel_sequence != NULL) {
1527 0 : goto complete;
1528 : }
1529 :
1530 7 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1531 :
1532 7 : if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
1533 7 : bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
1534 7 : return;
1535 : }
1536 :
1537 25 : complete:
1538 25 : bio->retry_count = 0;
1539 25 : bio->submit_tsc = 0;
1540 25 : bdev_io->u.bdev.accel_sequence = NULL;
1541 25 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
1542 : }
1543 :
1544 : static inline void
1545 13 : bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
1546 : {
1547 13 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1548 : struct nvme_bdev_channel *nbdev_ch;
1549 : enum spdk_bdev_io_status io_status;
1550 :
1551 13 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1552 :
1553 13 : switch (rc) {
1554 1 : case 0:
1555 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1556 1 : break;
1557 0 : case -ENOMEM:
1558 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1559 0 : break;
1560 12 : case -ENXIO:
1561 12 : if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
1562 12 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1563 :
1564 12 : bdev_nvme_clear_current_io_path(nbdev_ch);
1565 12 : bio->io_path = NULL;
1566 :
1567 12 : if (any_io_path_may_become_available(nbdev_ch)) {
1568 9 : bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1569 9 : return;
1570 : }
1571 : }
1572 :
1573 : /* fallthrough */
1574 : default:
1575 3 : spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
1576 3 : bdev_io->u.bdev.accel_sequence = NULL;
1577 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1578 3 : break;
1579 : }
1580 :
1581 4 : bio->retry_count = 0;
1582 4 : bio->submit_tsc = 0;
1583 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1584 : }
1585 :
1586 : static inline void
1587 4 : bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
1588 : {
1589 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1590 : enum spdk_bdev_io_status io_status;
1591 :
1592 4 : switch (rc) {
1593 1 : case 0:
1594 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1595 1 : break;
1596 0 : case -ENOMEM:
1597 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1598 0 : break;
1599 3 : case -ENXIO:
1600 : /* fallthrough */
1601 : default:
1602 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1603 3 : break;
1604 : }
1605 :
1606 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1607 4 : }
1608 :
1609 : static void
1610 3 : bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr,
1611 : void *ctx, int status)
1612 : {
1613 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1614 :
1615 3 : assert(nvme_ctrlr->io_path_cache_clearing == true);
1616 3 : nvme_ctrlr->io_path_cache_clearing = false;
1617 :
1618 3 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
1619 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1620 3 : return;
1621 : }
1622 :
1623 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1624 :
1625 0 : nvme_ctrlr_unregister(nvme_ctrlr);
1626 : }
1627 :
1628 : static void
1629 328 : _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
1630 : {
1631 : struct nvme_io_path *io_path;
1632 :
1633 475 : TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
1634 147 : if (io_path->nbdev_ch == NULL) {
1635 68 : continue;
1636 : }
1637 79 : bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
1638 : }
1639 328 : }
1640 :
1641 : static void
1642 1 : bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i,
1643 : struct nvme_ctrlr *nvme_ctrlr,
1644 : struct nvme_ctrlr_channel *ctrlr_ch,
1645 : void *ctx)
1646 : {
1647 1 : assert(ctrlr_ch->qpair != NULL);
1648 :
1649 1 : _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);
1650 :
1651 1 : nvme_ctrlr_for_each_channel_continue(i, 0);
1652 1 : }
1653 :
1654 : static void
1655 3 : bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
1656 : {
1657 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1658 3 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
1659 : nvme_ctrlr->io_path_cache_clearing) {
1660 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1661 0 : return;
1662 : }
1663 :
1664 3 : nvme_ctrlr->io_path_cache_clearing = true;
1665 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1666 :
1667 3 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
1668 : bdev_nvme_clear_io_path_cache,
1669 : NULL,
1670 : bdev_nvme_clear_io_path_caches_done);
1671 : }
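 : /* Clearing the per-channel current_io_path caches forces the next I/O on every channel to rerun
 :  * path selection, e.g. after a qpair disconnect or an ANA state change invalidates the
 :  * previously cached path.
 :  */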
1672 :
1673 : static struct nvme_qpair *
1674 103 : nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
1675 : {
1676 : struct nvme_qpair *nvme_qpair;
1677 :
1678 112 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1679 112 : if (nvme_qpair->qpair == qpair) {
1680 103 : break;
1681 : }
1682 : }
1683 :
1684 103 : return nvme_qpair;
1685 : }
1686 :
1687 : static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);
1688 :
1689 : static void
1690 103 : bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
1691 : {
1692 103 : struct nvme_poll_group *group = poll_group_ctx;
1693 : struct nvme_qpair *nvme_qpair;
1694 : struct nvme_ctrlr_channel *ctrlr_ch;
1695 : int status;
1696 :
1697 103 : nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
1698 103 : if (nvme_qpair == NULL) {
1699 0 : return;
1700 : }
1701 :
1702 103 : if (nvme_qpair->qpair != NULL) {
1703 103 : spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
1704 103 : nvme_qpair->qpair = NULL;
1705 : }
1706 :
1707 103 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1708 :
1709 103 : ctrlr_ch = nvme_qpair->ctrlr_ch;
1710 :
1711 103 : if (ctrlr_ch != NULL) {
1712 58 : if (ctrlr_ch->reset_iter != NULL) {
1713 : /* We are in a full reset sequence. */
1714 53 : if (ctrlr_ch->connect_poller != NULL) {
1715 : /* The qpair failed to connect. Abort the reset sequence. */
1716 0 : SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
1717 : qpair);
1718 0 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
1719 0 : status = -1;
1720 : } else {
1721 : /* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
1722 53 : SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
1723 : qpair);
1724 53 : status = 0;
1725 : }
1726 53 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status);
1727 53 : ctrlr_ch->reset_iter = NULL;
1728 : } else {
1729 : /* qpair was disconnected unexpectedly. Reset controller for recovery. */
1730 5 : SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
1731 5 : bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr);
1732 : }
1733 : } else {
1734 : /* In this case, ctrlr_channel is already deleted. */
1735 45 : SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
1736 45 : nvme_qpair_delete(nvme_qpair);
1737 : }
1738 : }
1739 :
1740 : static void
1741 0 : bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
1742 : {
1743 : struct nvme_qpair *nvme_qpair;
1744 :
1745 0 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1746 0 : if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
1747 0 : continue;
1748 : }
1749 :
1750 0 : if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
1751 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1752 0 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1753 : }
1754 : }
1755 0 : }
1756 :
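     : /* Poller for the thread's NVMe poll group. Processes I/O completions and, if the
     : * poll group reports an error, clears the I/O path cache of any failed qpair.
     : */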
1757 : static int
1758 1089 : bdev_nvme_poll(void *arg)
1759 : {
1760 1089 : struct nvme_poll_group *group = arg;
1761 : int64_t num_completions;
1762 :
1763 1089 : if (group->collect_spin_stat && group->start_ticks == 0) {
1764 0 : group->start_ticks = spdk_get_ticks();
1765 : }
1766 :
1767 1089 : num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1768 : bdev_nvme_disconnected_qpair_cb);
1769 1089 : if (group->collect_spin_stat) {
1770 0 : if (num_completions > 0) {
1771 0 : if (group->end_ticks != 0) {
1772 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
1773 0 : group->end_ticks = 0;
1774 : }
1775 0 : group->start_ticks = 0;
1776 : } else {
1777 0 : group->end_ticks = spdk_get_ticks();
1778 : }
1779 : }
1780 :
1781 1089 : if (spdk_unlikely(num_completions < 0)) {
1782 0 : bdev_nvme_check_io_qpairs(group);
1783 : }
1784 :
1785 1089 : return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1786 : }
1787 :
1788 : static int bdev_nvme_poll_adminq(void *arg);
1789 :
1790 : static void
1791 102 : bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
1792 : {
1793 102 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
1794 :
1795 102 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
1796 : nvme_ctrlr, new_period_us);
1797 102 : }
1798 :
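     : /* Admin queue poller. On a completion error, run the deferred disconnect callback
     : * (restoring the normal adminq poll period) if one was registered, otherwise start
     : * a controller failover. If the admin qpair instead reports a failure reason,
     : * clear the I/O path caches.
     : */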
1799 : static int
1800 148 : bdev_nvme_poll_adminq(void *arg)
1801 : {
1802 : int32_t rc;
1803 148 : struct nvme_ctrlr *nvme_ctrlr = arg;
1804 : nvme_ctrlr_disconnected_cb disconnected_cb;
1805 :
1806 148 : assert(nvme_ctrlr != NULL);
1807 :
1808 148 : rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1809 148 : if (rc < 0) {
1810 54 : disconnected_cb = nvme_ctrlr->disconnected_cb;
1811 54 : nvme_ctrlr->disconnected_cb = NULL;
1812 :
1813 54 : if (disconnected_cb != NULL) {
1814 51 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
1815 : g_opts.nvme_adminq_poll_period_us);
1816 51 : disconnected_cb(nvme_ctrlr);
1817 : } else {
1818 3 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
1819 : }
1820 94 : } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
1821 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1822 0 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
1823 : }
1824 :
1825 148 : return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1826 : }
1827 :
1828 : static void
1829 38 : nvme_bdev_free(void *io_device)
1830 : {
1831 38 : struct nvme_bdev *nvme_disk = io_device;
1832 :
1833 38 : pthread_mutex_destroy(&nvme_disk->mutex);
1834 38 : free(nvme_disk->disk.name);
1835 38 : free(nvme_disk->err_stat);
1836 38 : free(nvme_disk);
1837 38 : }
1838 :
1839 : static int
1840 37 : bdev_nvme_destruct(void *ctx)
1841 : {
1842 37 : struct nvme_bdev *nvme_disk = ctx;
1843 : struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1844 :
1845 : SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);
1846 :
1847 75 : TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
1848 38 : pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1849 :
1850 38 : nvme_ns->bdev = NULL;
1851 :
1852 38 : assert(nvme_ns->id > 0);
1853 :
1854 38 : if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1855 0 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1856 :
1857 0 : nvme_ctrlr_release(nvme_ns->ctrlr);
1858 0 : nvme_ns_free(nvme_ns);
1859 : } else {
1860 38 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1861 : }
1862 : }
1863 :
1864 37 : pthread_mutex_lock(&g_bdev_nvme_mutex);
1865 37 : TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
1866 37 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
1867 :
1868 37 : spdk_io_device_unregister(nvme_disk, nvme_bdev_free);
1869 :
1870 37 : return 0;
1871 : }
1872 :
1873 : static int
1874 104 : bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
1875 : {
1876 : struct nvme_ctrlr *nvme_ctrlr;
1877 104 : struct spdk_nvme_io_qpair_opts opts;
1878 : struct spdk_nvme_qpair *qpair;
1879 : int rc;
1880 :
1881 104 : nvme_ctrlr = nvme_qpair->ctrlr;
1882 :
1883 104 : spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1884 104 : opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1885 104 : opts.create_only = true;
1886 104 : opts.async_mode = true;
1887 104 : opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1888 104 : g_opts.io_queue_requests = opts.io_queue_requests;
1889 :
1890 104 : qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1891 104 : if (qpair == NULL) {
1892 0 : return -1;
1893 : }
1894 :
1895 : SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
1896 : spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));
1897 :
1898 104 : assert(nvme_qpair->group != NULL);
1899 :
1900 104 : rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
1901 104 : if (rc != 0) {
1902 0 : SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
1903 0 : goto err;
1904 : }
1905 :
1906 104 : rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1907 104 : if (rc != 0) {
1908 0 : SPDK_ERRLOG("Unable to connect I/O qpair.\n");
1909 0 : goto err;
1910 : }
1911 :
1912 104 : nvme_qpair->qpair = qpair;
1913 :
1914 104 : if (!g_opts.disable_auto_failback) {
1915 71 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1916 : }
1917 :
1918 104 : return 0;
1919 :
1920 0 : err:
1921 0 : spdk_nvme_ctrlr_free_io_qpair(qpair);
1922 :
1923 0 : return rc;
1924 : }
1925 :
1926 : static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);
1927 :
1928 : static void
1929 84 : bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel_iter *i,
1930 : struct nvme_ctrlr *nvme_ctrlr,
1931 : struct nvme_ctrlr_channel *ctrlr_ch,
1932 : void *ctx)
1933 : {
1934 84 : int rc = 0;
1935 : struct nvme_bdev_io *bio;
1936 :
1937 84 : if (ctx != NULL) {
1938 35 : rc = -1;
1939 : }
1940 :
1941 89 : while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
1942 5 : bio = TAILQ_FIRST(&ctrlr_ch->pending_resets);
1943 5 : TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link);
1944 :
1945 5 : bdev_nvme_reset_io_continue(bio, rc);
1946 : }
1947 :
1948 84 : nvme_ctrlr_for_each_channel_continue(i, 0);
1949 84 : }
1950 :
1951 : /* This function marks the current trid as failed by storing the current ticks
1952 : * and then, if an alternate trid exists, makes it the active trid of the controller.
1953 : *
1954 : * A true return value requests that the caller disconnect the current trid now
1955 : * so that the next trid can be tried.
1956 : */
1957 : static bool
1958 37 : bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
1959 : {
1960 : struct nvme_path_id *path_id, *next_path;
1961 : int rc __attribute__((unused));
1962 :
1963 37 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1964 37 : assert(path_id);
1965 37 : assert(path_id == nvme_ctrlr->active_path_id);
1966 37 : next_path = TAILQ_NEXT(path_id, link);
1967 :
1968 : /* Update the last failed time. A trid is considered failed if its last
1969 : * failed time is non-zero.
1970 : */
1971 37 : path_id->last_failed_tsc = spdk_get_ticks();
1972 :
1973 37 : if (next_path == NULL) {
1974 : /* There is no alternate trid within a controller. */
1975 26 : return false;
1976 : }
1977 :
1978 11 : if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
1979 : /* Connect is not retried in a controller reset sequence. Connecting
1980 : * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
1981 : */
1982 3 : return false;
1983 : }
1984 :
1985 8 : assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
1986 :
1987 8 : SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
1988 : path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);
1989 :
1990 8 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
1991 8 : nvme_ctrlr->active_path_id = next_path;
1992 8 : rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
1993 8 : assert(rc == 0);
1994 8 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
1995 8 : if (!remove) {
1996 : /** Shuffle the old trid to the end of the list and use the new one.
1997 : * Allows for round robin through multiple connections.
1998 : */
1999 6 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
2000 : } else {
2001 2 : free(path_id);
2002 : }
2003 :
2004 8 : if (start || next_path->last_failed_tsc == 0) {
2005 : /* bdev_nvme_failover_ctrlr() was just called, or the next trid has not failed
2006 : * or been used yet. Try the next trid now.
2007 : */
2008 7 : return true;
2009 : }
2010 :
2011 1 : if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
2012 1 : nvme_ctrlr->opts.reconnect_delay_sec) {
2013 : /* Enough backoff passed since the next trid failed. Try the next trid now. */
2014 0 : return true;
2015 : }
2016 :
2017 : /* The next trid will be tried after reconnect_delay_sec seconds. */
2018 1 : return false;
2019 : }
2020 :
2021 : static bool
2022 69 : bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
2023 : {
2024 : int32_t elapsed;
2025 :
2026 69 : if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
2027 37 : nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
2028 43 : return false;
2029 : }
2030 :
2031 26 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2032 26 : if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
2033 6 : return true;
2034 : } else {
2035 20 : return false;
2036 : }
2037 : }
2038 :
2039 : static bool
2040 12 : bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
2041 : {
2042 : uint32_t elapsed;
2043 :
2044 12 : if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
2045 8 : return false;
2046 : }
2047 :
2048 4 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2049 4 : if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
2050 2 : return true;
2051 : } else {
2052 2 : return false;
2053 : }
2054 : }
2055 :
2056 : static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);
2057 :
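     : /* Start disconnecting the controller asynchronously. cb_fn is saved in
     : * nvme_ctrlr->disconnected_cb and invoked from the adminq poller once the
     : * controller is fully disconnected; until then the adminq is polled without delay.
     : */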
2058 : static void
2059 52 : nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
2060 : {
2061 : int rc;
2062 :
2063 52 : rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
2064 52 : if (rc != 0) {
2065 : /* Disconnect fails if ctrlr is already resetting or removed. In this case,
2066 : * fail the reset sequence immediately.
2067 : */
2068 1 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2069 1 : return;
2070 : }
2071 :
2072 : /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling the adminq.
2073 : * Set the callback here to execute the specified operation once the ctrlr is actually disconnected.
2074 : */
2075 51 : assert(nvme_ctrlr->disconnected_cb == NULL);
2076 51 : nvme_ctrlr->disconnected_cb = cb_fn;
2077 :
2078 : /* During disconnection, reduce the period to poll adminq more often. */
2079 51 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
2080 : }
2081 :
2082 : enum bdev_nvme_op_after_reset {
2083 : OP_NONE,
2084 : OP_COMPLETE_PENDING_DESTRUCT,
2085 : OP_DESTRUCT,
2086 : OP_DELAYED_RECONNECT,
2087 : OP_FAILOVER,
2088 : };
2089 :
2090 : typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;
2091 :
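     : /* Decide what to do after a reset or disable completes: finish a pending
     : * unregister, perform a deferred failover, destruct the controller if the
     : * ctrlr loss timeout has expired, schedule a delayed reconnect, or nothing.
     : */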
2092 : static _bdev_nvme_op_after_reset
2093 51 : bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
2094 : {
2095 51 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
2096 : /* Complete pending destruct after reset completes. */
2097 0 : return OP_COMPLETE_PENDING_DESTRUCT;
2098 51 : } else if (nvme_ctrlr->pending_failover) {
2099 3 : nvme_ctrlr->pending_failover = false;
2100 3 : nvme_ctrlr->reset_start_tsc = 0;
2101 3 : return OP_FAILOVER;
2102 48 : } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
2103 34 : nvme_ctrlr->reset_start_tsc = 0;
2104 34 : return OP_NONE;
2105 14 : } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2106 2 : return OP_DESTRUCT;
2107 : } else {
2108 12 : if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
2109 2 : nvme_ctrlr->fast_io_fail_timedout = true;
2110 : }
2111 12 : return OP_DELAYED_RECONNECT;
2112 : }
2113 : }
2114 :
2115 : static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
2116 : static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
2117 :
2118 : static int
2119 9 : bdev_nvme_reconnect_delay_timer_expired(void *ctx)
2120 : {
2121 9 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2122 :
2123 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name);
2124 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2125 :
2126 9 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2127 :
2128 9 : if (!nvme_ctrlr->reconnect_is_delayed) {
2129 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2130 0 : return SPDK_POLLER_BUSY;
2131 : }
2132 :
2133 9 : nvme_ctrlr->reconnect_is_delayed = false;
2134 :
2135 9 : if (nvme_ctrlr->destruct) {
2136 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2137 0 : return SPDK_POLLER_BUSY;
2138 : }
2139 :
2140 9 : assert(nvme_ctrlr->resetting == false);
2141 9 : nvme_ctrlr->resetting = true;
2142 :
2143 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2144 :
2145 9 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2146 :
2147 9 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2148 9 : return SPDK_POLLER_BUSY;
2149 : }
2150 :
2151 : static void
2152 12 : bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
2153 : {
2154 12 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2155 :
2156 12 : assert(nvme_ctrlr->reconnect_is_delayed == false);
2157 12 : nvme_ctrlr->reconnect_is_delayed = true;
2158 :
2159 12 : assert(nvme_ctrlr->reconnect_delay_timer == NULL);
2160 12 : nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
2161 : nvme_ctrlr,
2162 : nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC);
2163 12 : }
2164 :
2165 : static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr);
2166 :
2167 : static void
2168 49 : _bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2169 : {
2170 49 : bool success = (ctx == NULL);
2171 49 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2172 49 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2173 : enum bdev_nvme_op_after_reset op_after_reset;
2174 :
2175 49 : assert(nvme_ctrlr->thread == spdk_get_thread());
2176 :
2177 49 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2178 49 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2179 :
2180 49 : if (!success) {
2181 21 : SPDK_ERRLOG("Resetting controller failed.\n");
2182 : } else {
2183 28 : SPDK_NOTICELOG("Resetting controller successful.\n");
2184 : }
2185 :
2186 49 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2187 49 : nvme_ctrlr->resetting = false;
2188 49 : nvme_ctrlr->dont_retry = false;
2189 49 : nvme_ctrlr->in_failover = false;
2190 :
2191 49 : op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);
2192 49 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2193 :
2194 : /* Delay callbacks when the next operation is a failover. */
2195 49 : if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) {
2196 10 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1);
2197 : }
2198 :
2199 49 : switch (op_after_reset) {
2200 0 : case OP_COMPLETE_PENDING_DESTRUCT:
2201 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2202 0 : break;
2203 2 : case OP_DESTRUCT:
2204 2 : bdev_nvme_delete_ctrlr(nvme_ctrlr, false);
2205 2 : remove_discovery_entry(nvme_ctrlr);
2206 2 : break;
2207 12 : case OP_DELAYED_RECONNECT:
2208 12 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
2209 12 : break;
2210 3 : case OP_FAILOVER:
2211 3 : nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn;
2212 3 : nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg;
2213 3 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
2214 3 : break;
2215 32 : default:
2216 32 : break;
2217 : }
2218 49 : }
2219 :
2220 : static void
2221 51 : bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
2222 : {
2223 51 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2224 51 : if (!success) {
2225 : /* Connecting the active trid failed. If an alternate trid exists, make it
2226 : * the active trid.
2227 : */
2228 23 : if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) {
2229 : /* The next alternate trid exists and is ready to try. Try it now. */
2230 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2231 :
2232 2 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2233 2 : return;
2234 : }
2235 :
2236 : /* We get here when there is no alternate trid, or when the next trid exists but
2237 : * is not ready to try yet. The active trid will be retried after reconnect_delay_sec
2238 : * seconds if that is non-zero, or at the next reset call otherwise.
2239 : */
2240 : } else {
2241 : /* Connecting the active trid succeeded. Clear the last failed time, since a
2242 : * non-zero last failed time marks the trid as failed.
2243 : */
2244 28 : nvme_ctrlr->active_path_id->last_failed_tsc = 0;
2245 : }
2246 49 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2247 :
2248 : /* Make sure we clear any pending resets before returning. */
2249 49 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2250 : bdev_nvme_complete_pending_resets,
2251 : success ? NULL : (void *)0x1,
2252 : _bdev_nvme_reset_ctrlr_complete);
2253 : }
2254 :
2255 : static void
2256 0 : bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2257 : {
2258 0 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2259 0 : }
2260 :
2261 : static void
2262 64 : bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i,
2263 : struct nvme_ctrlr *nvme_ctrlr,
2264 : struct nvme_ctrlr_channel *ctrlr_ch, void *ctx)
2265 : {
2266 : struct nvme_qpair *nvme_qpair;
2267 :
2268 64 : nvme_qpair = ctrlr_ch->qpair;
2269 64 : assert(nvme_qpair != NULL);
2270 :
2271 64 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
2272 :
2273 64 : if (nvme_qpair->qpair != NULL) {
2274 53 : if (nvme_qpair->ctrlr->dont_retry) {
2275 39 : spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true);
2276 : }
2277 53 : spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair);
2278 :
2279 : /* The current full reset sequence will move to the next
2280 : * ctrlr_channel after the qpair is actually disconnected.
2281 : */
2282 53 : assert(ctrlr_ch->reset_iter == NULL);
2283 53 : ctrlr_ch->reset_iter = i;
2284 : } else {
2285 11 : nvme_ctrlr_for_each_channel_continue(i, 0);
2286 : }
2287 64 : }
2288 :
2289 : static void
2290 28 : bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2291 : {
2292 28 : if (status == 0) {
2293 28 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true);
2294 : } else {
2295 : /* Delete the added qpairs and quiesce ctrlr to make the states clean. */
2296 0 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2297 : bdev_nvme_reset_destroy_qpair,
2298 : NULL,
2299 : bdev_nvme_reset_create_qpairs_failed);
2300 : }
2301 28 : }
2302 :
2303 : static int
2304 45 : bdev_nvme_reset_check_qpair_connected(void *ctx)
2305 : {
2306 45 : struct nvme_ctrlr_channel *ctrlr_ch = ctx;
2307 :
2308 45 : if (ctrlr_ch->reset_iter == NULL) {
2309 : /* qpair was already failed to connect and the reset sequence is being aborted. */
2310 : /* The qpair already failed to connect and the reset sequence is being aborted. */
2311 0 : assert(ctrlr_ch->qpair->qpair == NULL);
2312 0 : return SPDK_POLLER_BUSY;
2313 : }
2314 :
2315 45 : assert(ctrlr_ch->qpair->qpair != NULL);
2316 :
2317 45 : if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) {
2318 0 : return SPDK_POLLER_BUSY;
2319 : }
2320 :
2321 45 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
2322 :
2323 : /* qpair was completed to connect. Move to the next ctrlr_channel */
2324 : /* The qpair finished connecting. Move to the next ctrlr_channel. */
2325 45 : ctrlr_ch->reset_iter = NULL;
2326 :
2327 45 : if (!g_opts.disable_auto_failback) {
2328 30 : _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);
2329 : }
2330 :
2331 45 : return SPDK_POLLER_BUSY;
2332 : }
2333 :
2334 : static void
2335 45 : bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i,
2336 : struct nvme_ctrlr *nvme_ctrlr,
2337 : struct nvme_ctrlr_channel *ctrlr_ch,
2338 : void *ctx)
2339 : {
2340 : int rc;
2341 :
2342 45 : rc = bdev_nvme_create_qpair(ctrlr_ch->qpair);
2343 45 : if (rc == 0) {
2344 45 : ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected,
2345 : ctrlr_ch, 0);
2346 :
2347 : /* The current full reset sequence will move to the next
2348 : * ctrlr_channel after the qpair is actually connected.
2349 : */
2350 45 : assert(ctrlr_ch->reset_iter == NULL);
2351 45 : ctrlr_ch->reset_iter = i;
2352 : } else {
2353 0 : nvme_ctrlr_for_each_channel_continue(i, rc);
2354 : }
2355 45 : }
2356 :
2357 : static void
2358 28 : nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr)
2359 : {
2360 28 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2361 : struct nvme_ns *nvme_ns;
2362 :
2363 28 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2364 41 : nvme_ns != NULL;
2365 13 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
2366 13 : if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2367 1 : SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id);
2368 : /* NS can be added again. Just nullify nvme_ns->ns. */
2369 1 : nvme_ns->ns = NULL;
2370 : }
2371 : }
2372 28 : }
2373 :
2374 :
2375 : static int
2376 50 : bdev_nvme_reconnect_ctrlr_poll(void *arg)
2377 : {
2378 50 : struct nvme_ctrlr *nvme_ctrlr = arg;
2379 50 : int rc = -ETIMEDOUT;
2380 :
2381 50 : if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2382 : /* Mark the ctrlr as failed. The next call to
2383 : * spdk_nvme_ctrlr_reconnect_poll_async() will then
2384 : * do the necessary cleanup and return failure.
2385 : */
2386 2 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
2387 : }
2388 :
2389 50 : rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
2390 50 : if (rc == -EAGAIN) {
2391 0 : return SPDK_POLLER_BUSY;
2392 : }
2393 :
2394 50 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
2395 50 : if (rc == 0) {
2396 28 : nvme_ctrlr_check_namespaces(nvme_ctrlr);
2397 :
2398 : /* Recreate all of the I/O queue pairs */
2399 28 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2400 : bdev_nvme_reset_create_qpair,
2401 : NULL,
2402 : bdev_nvme_reset_create_qpairs_done);
2403 : } else {
2404 22 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2405 : }
2406 50 : return SPDK_POLLER_BUSY;
2407 : }
2408 :
2409 : static void
2410 50 : bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2411 : {
2412 50 : spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
2413 :
2414 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name);
2415 50 : assert(nvme_ctrlr->reset_detach_poller == NULL);
2416 50 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
2417 : nvme_ctrlr, 0);
2418 50 : }
2419 :
2420 : static void
2421 37 : bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2422 : {
2423 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name);
2424 37 : assert(status == 0);
2425 :
2426 37 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2427 0 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2428 : } else {
2429 37 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2430 : }
2431 37 : }
2432 :
2433 : static void
2434 37 : bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2435 : {
2436 37 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2437 : bdev_nvme_reset_destroy_qpair,
2438 : NULL,
2439 : bdev_nvme_reset_destroy_qpair_done);
2440 37 : }
2441 :
2442 : static void
2443 3 : bdev_nvme_reconnect_ctrlr_now(void *ctx)
2444 : {
2445 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2446 :
2447 3 : assert(nvme_ctrlr->resetting == true);
2448 3 : assert(nvme_ctrlr->thread == spdk_get_thread());
2449 :
2450 3 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2451 :
2452 3 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2453 :
2454 3 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2455 3 : }
2456 :
2457 : static void
2458 37 : _bdev_nvme_reset_ctrlr(void *ctx)
2459 : {
2460 37 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2461 :
2462 37 : assert(nvme_ctrlr->resetting == true);
2463 37 : assert(nvme_ctrlr->thread == spdk_get_thread());
2464 :
2465 37 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2466 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs);
2467 : } else {
2468 37 : bdev_nvme_reset_destroy_qpairs(nvme_ctrlr);
2469 : }
2470 37 : }
2471 :
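     : /* Begin a full controller reset. Returns -ENXIO if the controller is being
     : * destructed, -EBUSY if a reset is already in progress, and -EALREADY if the
     : * controller is disabled. A pending delayed reconnect is promoted to an
     : * immediate reconnect instead of a fresh reset.
     : */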
2472 : static int
2473 35 : bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2474 : {
2475 : spdk_msg_fn msg_fn;
2476 :
2477 35 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2478 35 : if (nvme_ctrlr->destruct) {
2479 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2480 3 : return -ENXIO;
2481 : }
2482 :
2483 32 : if (nvme_ctrlr->resetting) {
2484 7 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2485 7 : SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
2486 7 : return -EBUSY;
2487 : }
2488 :
2489 25 : if (nvme_ctrlr->disabled) {
2490 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2491 0 : SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n");
2492 0 : return -EALREADY;
2493 : }
2494 :
2495 25 : nvme_ctrlr->resetting = true;
2496 25 : nvme_ctrlr->dont_retry = true;
2497 :
2498 25 : if (nvme_ctrlr->reconnect_is_delayed) {
2499 1 : SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n");
2500 1 : msg_fn = bdev_nvme_reconnect_ctrlr_now;
2501 1 : nvme_ctrlr->reconnect_is_delayed = false;
2502 : } else {
2503 24 : msg_fn = _bdev_nvme_reset_ctrlr;
2504 24 : assert(nvme_ctrlr->reset_start_tsc == 0);
2505 : }
2506 :
2507 25 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2508 :
2509 25 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2510 :
2511 25 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2512 25 : return 0;
2513 : }
2514 :
2515 : static int
2516 3 : bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2517 : {
2518 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2519 3 : if (nvme_ctrlr->destruct) {
2520 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2521 0 : return -ENXIO;
2522 : }
2523 :
2524 3 : if (nvme_ctrlr->resetting) {
2525 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2526 0 : return -EBUSY;
2527 : }
2528 :
2529 3 : if (!nvme_ctrlr->disabled) {
2530 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2531 1 : return -EALREADY;
2532 : }
2533 :
2534 2 : nvme_ctrlr->disabled = false;
2535 2 : nvme_ctrlr->resetting = true;
2536 :
2537 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2538 :
2539 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2540 :
2541 2 : spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr);
2542 2 : return 0;
2543 : }
2544 :
2545 : static void
2546 2 : _bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2547 : {
2548 2 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2549 2 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2550 : enum bdev_nvme_op_after_reset op_after_disable;
2551 :
2552 2 : assert(nvme_ctrlr->thread == spdk_get_thread());
2553 :
2554 2 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2555 2 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2556 :
2557 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2558 :
2559 2 : nvme_ctrlr->resetting = false;
2560 2 : nvme_ctrlr->dont_retry = false;
2561 :
2562 2 : op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true);
2563 :
2564 2 : nvme_ctrlr->disabled = true;
2565 2 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2566 :
2567 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2568 :
2569 2 : if (ctrlr_op_cb_fn) {
2570 0 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0);
2571 : }
2572 :
2573 2 : switch (op_after_disable) {
2574 0 : case OP_COMPLETE_PENDING_DESTRUCT:
2575 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2576 0 : break;
2577 2 : default:
2578 2 : break;
2579 : }
2580 :
2581 2 : }
2582 :
2583 : static void
2584 2 : bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr)
2585 : {
2586 : /* Make sure we clear any pending resets before returning. */
2587 2 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2588 : bdev_nvme_complete_pending_resets,
2589 : NULL,
2590 : _bdev_nvme_disable_ctrlr_complete);
2591 2 : }
2592 :
2593 : static void
2594 1 : bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2595 : {
2596 1 : assert(status == 0);
2597 :
2598 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2599 0 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2600 : } else {
2601 1 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete);
2602 : }
2603 1 : }
2604 :
2605 : static void
2606 1 : bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2607 : {
2608 1 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2609 : bdev_nvme_reset_destroy_qpair,
2610 : NULL,
2611 : bdev_nvme_disable_destroy_qpairs_done);
2612 1 : }
2613 :
2614 : static void
2615 1 : _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx)
2616 : {
2617 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2618 :
2619 1 : assert(nvme_ctrlr->resetting == true);
2620 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2621 :
2622 1 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2623 :
2624 1 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2625 1 : }
2626 :
2627 : static void
2628 1 : _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx)
2629 : {
2630 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2631 :
2632 1 : assert(nvme_ctrlr->resetting == true);
2633 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2634 :
2635 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2636 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs);
2637 : } else {
2638 1 : bdev_nvme_disable_destroy_qpairs(nvme_ctrlr);
2639 : }
2640 1 : }
2641 :
2642 : static int
2643 5 : bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2644 : {
2645 : spdk_msg_fn msg_fn;
2646 :
2647 5 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2648 5 : if (nvme_ctrlr->destruct) {
2649 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2650 1 : return -ENXIO;
2651 : }
2652 :
2653 4 : if (nvme_ctrlr->resetting) {
2654 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2655 1 : return -EBUSY;
2656 : }
2657 :
2658 3 : if (nvme_ctrlr->disabled) {
2659 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2660 1 : return -EALREADY;
2661 : }
2662 :
2663 2 : nvme_ctrlr->resetting = true;
2664 2 : nvme_ctrlr->dont_retry = true;
2665 :
2666 2 : if (nvme_ctrlr->reconnect_is_delayed) {
2667 1 : msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr;
2668 1 : nvme_ctrlr->reconnect_is_delayed = false;
2669 : } else {
2670 1 : msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr;
2671 : }
2672 :
2673 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2674 :
2675 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2676 :
2677 2 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2678 2 : return 0;
2679 : }
2680 :
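     : /* Dispatch a controller operation (reset, enable, or disable). The completion
     : * callback is registered only when the operation was successfully started.
     : */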
2681 : static int
2682 17 : nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2683 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2684 : {
2685 : int rc;
2686 :
2687 17 : switch (op) {
2688 16 : case NVME_CTRLR_OP_RESET:
2689 16 : rc = bdev_nvme_reset_ctrlr(nvme_ctrlr);
2690 16 : break;
2691 0 : case NVME_CTRLR_OP_ENABLE:
2692 0 : rc = bdev_nvme_enable_ctrlr(nvme_ctrlr);
2693 0 : break;
2694 0 : case NVME_CTRLR_OP_DISABLE:
2695 0 : rc = bdev_nvme_disable_ctrlr(nvme_ctrlr);
2696 0 : break;
2697 1 : default:
2698 1 : rc = -EINVAL;
2699 1 : break;
2700 : }
2701 :
2702 17 : if (rc == 0) {
2703 9 : assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL);
2704 9 : assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL);
2705 9 : nvme_ctrlr->ctrlr_op_cb_fn = cb_fn;
2706 9 : nvme_ctrlr->ctrlr_op_cb_arg = cb_arg;
2707 : }
2708 17 : return rc;
2709 : }
2710 :
2711 : struct nvme_ctrlr_op_rpc_ctx {
2712 : struct nvme_ctrlr *nvme_ctrlr;
2713 : struct spdk_thread *orig_thread;
2714 : enum nvme_ctrlr_op op;
2715 : int rc;
2716 : bdev_nvme_ctrlr_op_cb cb_fn;
2717 : void *cb_arg;
2718 : };
2719 :
2720 : static void
2721 4 : _nvme_ctrlr_op_rpc_complete(void *_ctx)
2722 : {
2723 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2724 :
2725 4 : assert(ctx != NULL);
2726 4 : assert(ctx->cb_fn != NULL);
2727 :
2728 4 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2729 :
2730 4 : free(ctx);
2731 4 : }
2732 :
2733 : static void
2734 4 : nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc)
2735 : {
2736 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2737 :
2738 4 : ctx->rc = rc;
2739 :
2740 4 : spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx);
2741 4 : }
2742 :
2743 : void
2744 4 : nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2745 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2746 : {
2747 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2748 : int rc;
2749 :
2750 4 : assert(cb_fn != NULL);
2751 :
2752 4 : ctx = calloc(1, sizeof(*ctx));
2753 4 : if (ctx == NULL) {
2754 0 : SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2755 0 : cb_fn(cb_arg, -ENOMEM);
2756 0 : return;
2757 : }
2758 :
2759 4 : ctx->orig_thread = spdk_get_thread();
2760 4 : ctx->cb_fn = cb_fn;
2761 4 : ctx->cb_arg = cb_arg;
2762 :
2763 4 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx);
2764 4 : if (rc == 0) {
2765 1 : return;
2766 3 : } else if (rc == -EALREADY) {
2767 0 : rc = 0;
2768 : }
2769 :
2770 3 : nvme_ctrlr_op_rpc_complete(ctx, rc);
2771 : }
2772 :
2773 : static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc);
2774 :
2775 : static void
2776 2 : _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx)
2777 : {
2778 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2779 : struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr;
2780 : int rc;
2781 :
2782 2 : prev_nvme_ctrlr = ctx->nvme_ctrlr;
2783 2 : ctx->nvme_ctrlr = NULL;
2784 :
2785 2 : if (ctx->rc != 0) {
2786 0 : goto complete;
2787 : }
2788 :
2789 2 : next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq);
2790 2 : if (next_nvme_ctrlr == NULL) {
2791 1 : goto complete;
2792 : }
2793 :
2794 1 : rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2795 1 : if (rc == 0) {
2796 1 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2797 1 : return;
2798 0 : } else if (rc == -EALREADY) {
2799 0 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2800 0 : rc = 0;
2801 : }
2802 :
2803 0 : ctx->rc = rc;
2804 :
2805 1 : complete:
2806 1 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2807 1 : free(ctx);
2808 : }
2809 :
2810 : static void
2811 2 : nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc)
2812 : {
2813 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2814 :
2815 2 : ctx->rc = rc;
2816 :
2817 2 : spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx);
2818 2 : }
2819 :
2820 : void
2821 1 : nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op,
2822 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2823 : {
2824 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2825 : struct nvme_ctrlr *nvme_ctrlr;
2826 : int rc;
2827 :
2828 1 : assert(cb_fn != NULL);
2829 :
2830 1 : ctx = calloc(1, sizeof(*ctx));
2831 1 : if (ctx == NULL) {
2832 0 : SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2833 0 : cb_fn(cb_arg, -ENOMEM);
2834 0 : return;
2835 : }
2836 :
2837 1 : ctx->orig_thread = spdk_get_thread();
2838 1 : ctx->op = op;
2839 1 : ctx->cb_fn = cb_fn;
2840 1 : ctx->cb_arg = cb_arg;
2841 :
2842 1 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
2843 1 : assert(nvme_ctrlr != NULL);
2844 :
2845 1 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2846 1 : if (rc == 0) {
2847 1 : ctx->nvme_ctrlr = nvme_ctrlr;
2848 1 : return;
2849 0 : } else if (rc == -EALREADY) {
2850 0 : ctx->nvme_ctrlr = nvme_ctrlr;
2851 0 : rc = 0;
2852 : }
2853 :
2854 0 : nvme_bdev_ctrlr_op_rpc_continue(ctx, rc);
2855 : }
2856 :
2857 : static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
2858 :
2859 : static void
2860 8 : bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
2861 : {
2862 8 : struct nvme_bdev_io *bio = ctx;
2863 : enum spdk_bdev_io_status io_status;
2864 :
2865 8 : if (bio->cpl.cdw0 == 0) {
2866 6 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
2867 : } else {
2868 2 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
2869 : }
2870 :
2871 8 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL);
2872 8 : }
2873 :
2874 : static void
2875 16 : bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i,
2876 : struct nvme_bdev *nbdev,
2877 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
2878 : {
2879 16 : bdev_nvme_abort_retry_ios(nbdev_ch);
2880 16 : nbdev_ch->resetting = false;
2881 :
2882 16 : nvme_bdev_for_each_channel_continue(i, 0);
2883 16 : }
2884 :
2885 : static void
2886 8 : bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
2887 : {
2888 8 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2889 8 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2890 :
2891 : /* Abort all queued I/Os for retry. */
2892 8 : nvme_bdev_for_each_channel(nbdev,
2893 : bdev_nvme_unfreeze_bdev_channel,
2894 : bio,
2895 : bdev_nvme_unfreeze_bdev_channel_done);
2896 8 : }
2897 :
2898 : static void
2899 11 : _bdev_nvme_reset_io_continue(void *ctx)
2900 : {
2901 11 : struct nvme_bdev_io *bio = ctx;
2902 : struct nvme_io_path *prev_io_path, *next_io_path;
2903 : int rc;
2904 :
2905 11 : prev_io_path = bio->io_path;
2906 11 : bio->io_path = NULL;
2907 :
2908 11 : if (bio->cpl.cdw0 != 0) {
2909 2 : goto complete;
2910 : }
2911 :
2912 9 : next_io_path = STAILQ_NEXT(prev_io_path, stailq);
2913 9 : if (next_io_path == NULL) {
2914 6 : goto complete;
2915 : }
2916 :
2917 3 : rc = _bdev_nvme_reset_io(next_io_path, bio);
2918 3 : if (rc == 0) {
2919 3 : return;
2920 : }
2921 :
2922 0 : bio->cpl.cdw0 = 1;
2923 :
2924 8 : complete:
2925 8 : bdev_nvme_reset_io_complete(bio);
2926 : }
2927 :
2928 : static void
2929 11 : bdev_nvme_reset_io_continue(void *cb_arg, int rc)
2930 : {
2931 11 : struct nvme_bdev_io *bio = cb_arg;
2932 11 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2933 :
2934 11 : bio->cpl.cdw0 = (rc == 0) ? 0 : 1;
2935 :
2936 11 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio);
2937 11 : }
2938 :
2939 : static int
2940 11 : _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
2941 : {
2942 : struct nvme_ctrlr_channel *ctrlr_ch;
2943 : int rc;
2944 :
2945 11 : rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET,
2946 : bdev_nvme_reset_io_continue, bio);
2947 11 : if (rc != 0 && rc != -EBUSY) {
2948 0 : return rc;
2949 : }
2950 :
2951 11 : assert(bio->io_path == NULL);
2952 11 : bio->io_path = io_path;
2953 :
2954 11 : if (rc == -EBUSY) {
2955 5 : ctrlr_ch = io_path->qpair->ctrlr_ch;
2956 5 : assert(ctrlr_ch != NULL);
2957 : /*
2958 : * A reset call is queued only if it came from the app framework. This is intentional so
2959 : * that we do not interfere with the app framework's reset strategy, i.e. we defer to the
2960 : * upper level. If it is in the middle of a reset, we will not try to schedule another one.
2961 : */
2962 5 : TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link);
2963 : }
2964 :
2965 11 : return 0;
2966 : }
2967 :
2968 : static void
2969 8 : bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
2970 : {
2971 8 : struct nvme_bdev_io *bio = ctx;
2972 8 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2973 : struct nvme_bdev_channel *nbdev_ch;
2974 : struct nvme_io_path *io_path;
2975 : int rc;
2976 :
2977 8 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2978 :
2979 8 : bio->cpl.cdw0 = 0;
2980 :
2981 : /* Reset all nvme_ctrlrs of a bdev controller sequentially. */
2982 8 : io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
2983 8 : assert(io_path != NULL);
2984 :
2985 8 : rc = _bdev_nvme_reset_io(io_path, bio);
2986 8 : if (rc != 0) {
2987 : /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */
2988 0 : rc = (rc == -EALREADY) ? 0 : rc;
2989 :
2990 0 : bdev_nvme_reset_io_continue(bio, rc);
2991 : }
2992 8 : }
2993 :
2994 : static void
2995 16 : bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i,
2996 : struct nvme_bdev *nbdev,
2997 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
2998 : {
2999 16 : nbdev_ch->resetting = true;
3000 :
3001 16 : nvme_bdev_for_each_channel_continue(i, 0);
3002 16 : }
3003 :
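     : /* Entry point for a reset bdev_io. Freeze every nvme_bdev_channel first; the
     : * controllers are then reset one by one from bdev_nvme_freeze_bdev_channel_done(),
     : * and the channels are unfrozen when the last reset completes.
     : */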
3004 : static void
3005 8 : bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio)
3006 : {
3007 8 : nvme_bdev_for_each_channel(nbdev,
3008 : bdev_nvme_freeze_bdev_channel,
3009 : bio,
3010 : bdev_nvme_freeze_bdev_channel_done);
3011 8 : }
3012 :
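     : /* Prepare a failover while holding nvme_ctrlr->mutex: mark the active trid as
     : * failed and switch to the next trid if one exists. Returns -EINPROGRESS if a
     : * reset is running (the failover is deferred until it completes) and -EALREADY
     : * if a scheduled reconnect or a later enablement will handle the failover.
     : */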
3013 : static int
3014 19 : bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove)
3015 : {
3016 19 : if (nvme_ctrlr->destruct) {
3017 : /* Don't bother resetting if the controller is in the process of being destructed. */
3018 2 : return -ENXIO;
3019 : }
3020 :
3021 17 : if (nvme_ctrlr->resetting) {
3022 3 : if (!nvme_ctrlr->in_failover) {
3023 3 : SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n");
3024 :
3025 : /* Defer failover until reset completes. */
3026 3 : nvme_ctrlr->pending_failover = true;
3027 3 : return -EINPROGRESS;
3028 : } else {
3029 0 : SPDK_NOTICELOG("Unable to perform failover, already in progress.\n");
3030 0 : return -EBUSY;
3031 : }
3032 : }
3033 :
3034 14 : bdev_nvme_failover_trid(nvme_ctrlr, remove, true);
3035 :
3036 14 : if (nvme_ctrlr->reconnect_is_delayed) {
3037 1 : SPDK_NOTICELOG("Reconnect is already scheduled.\n");
3038 :
3039 : /* We rely on the next reconnect for the failover. */
3040 1 : return -EALREADY;
3041 : }
3042 :
3043 13 : if (nvme_ctrlr->disabled) {
3044 0 : SPDK_NOTICELOG("Controller is disabled.\n");
3045 :
3046 : /* We rely on the enablement for the failover. */
3047 0 : return -EALREADY;
3048 : }
3049 :
3050 13 : nvme_ctrlr->resetting = true;
3051 13 : nvme_ctrlr->in_failover = true;
3052 :
3053 13 : assert(nvme_ctrlr->reset_start_tsc == 0);
3054 13 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
3055 :
3056 13 : return 0;
3057 : }
3058 :
3059 : static int
3060 17 : bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
3061 : {
3062 : int rc;
3063 :
3064 17 : pthread_mutex_lock(&nvme_ctrlr->mutex);
3065 17 : rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false);
3066 17 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
3067 :
3068 17 : if (rc == 0) {
3069 12 : spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr);
3070 5 : } else if (rc == -EALREADY) {
3071 0 : rc = 0;
3072 : }
3073 :
3074 17 : return rc;
3075 : }
3076 :
3077 : static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3078 : uint64_t num_blocks);
3079 :
3080 : static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3081 : uint64_t num_blocks);
3082 :
3083 : static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks,
3084 : uint64_t src_offset_blocks,
3085 : uint64_t num_blocks);
3086 :
3087 : static void
3088 1 : bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3089 : bool success)
3090 : {
3091 1 : struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3092 : int ret;
3093 :
3094 1 : if (!success) {
3095 0 : ret = -EINVAL;
3096 0 : goto exit;
3097 : }
3098 :
3099 1 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
3100 0 : ret = -ENXIO;
3101 0 : goto exit;
3102 : }
3103 :
3104 1 : ret = bdev_nvme_readv(bio,
3105 : bdev_io->u.bdev.iovs,
3106 : bdev_io->u.bdev.iovcnt,
3107 : bdev_io->u.bdev.md_buf,
3108 : bdev_io->u.bdev.num_blocks,
3109 : bdev_io->u.bdev.offset_blocks,
3110 : bdev_io->u.bdev.dif_check_flags,
3111 : bdev_io->u.bdev.memory_domain,
3112 : bdev_io->u.bdev.memory_domain_ctx,
3113 : bdev_io->u.bdev.accel_sequence);
3114 :
3115 1 : exit:
3116 1 : if (spdk_unlikely(ret != 0)) {
3117 0 : bdev_nvme_io_complete(bio, ret);
3118 : }
3119 1 : }
3120 :
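     : /* Translate a generic bdev_io into the matching NVMe submission. Reset, flush,
     : * admin passthrough, and abort requests return early; for all other types a
     : * non-zero submission result completes the I/O with that error code.
     : */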
3121 : static inline void
3122 52 : _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
3123 : {
3124 52 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3125 52 : struct spdk_bdev *bdev = bdev_io->bdev;
3126 : struct nvme_bdev_io *nbdev_io_to_abort;
3127 52 : int rc = 0;
3128 :
3129 52 : switch (bdev_io->type) {
3130 3 : case SPDK_BDEV_IO_TYPE_READ:
3131 3 : if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
3132 :
3133 2 : rc = bdev_nvme_readv(nbdev_io,
3134 : bdev_io->u.bdev.iovs,
3135 : bdev_io->u.bdev.iovcnt,
3136 : bdev_io->u.bdev.md_buf,
3137 : bdev_io->u.bdev.num_blocks,
3138 : bdev_io->u.bdev.offset_blocks,
3139 : bdev_io->u.bdev.dif_check_flags,
3140 : bdev_io->u.bdev.memory_domain,
3141 : bdev_io->u.bdev.memory_domain_ctx,
3142 : bdev_io->u.bdev.accel_sequence);
3143 : } else {
3144 1 : spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
3145 1 : bdev_io->u.bdev.num_blocks * bdev->blocklen);
3146 1 : rc = 0;
3147 : }
3148 3 : break;
3149 25 : case SPDK_BDEV_IO_TYPE_WRITE:
3150 25 : rc = bdev_nvme_writev(nbdev_io,
3151 : bdev_io->u.bdev.iovs,
3152 : bdev_io->u.bdev.iovcnt,
3153 : bdev_io->u.bdev.md_buf,
3154 : bdev_io->u.bdev.num_blocks,
3155 : bdev_io->u.bdev.offset_blocks,
3156 : bdev_io->u.bdev.dif_check_flags,
3157 : bdev_io->u.bdev.memory_domain,
3158 : bdev_io->u.bdev.memory_domain_ctx,
3159 : bdev_io->u.bdev.accel_sequence,
3160 : bdev_io->u.bdev.nvme_cdw12,
3161 : bdev_io->u.bdev.nvme_cdw13);
3162 25 : break;
3163 1 : case SPDK_BDEV_IO_TYPE_COMPARE:
3164 1 : rc = bdev_nvme_comparev(nbdev_io,
3165 : bdev_io->u.bdev.iovs,
3166 : bdev_io->u.bdev.iovcnt,
3167 : bdev_io->u.bdev.md_buf,
3168 : bdev_io->u.bdev.num_blocks,
3169 : bdev_io->u.bdev.offset_blocks,
3170 : bdev_io->u.bdev.dif_check_flags);
3171 1 : break;
3172 2 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3173 2 : rc = bdev_nvme_comparev_and_writev(nbdev_io,
3174 : bdev_io->u.bdev.iovs,
3175 : bdev_io->u.bdev.iovcnt,
3176 : bdev_io->u.bdev.fused_iovs,
3177 : bdev_io->u.bdev.fused_iovcnt,
3178 : bdev_io->u.bdev.md_buf,
3179 : bdev_io->u.bdev.num_blocks,
3180 : bdev_io->u.bdev.offset_blocks,
3181 : bdev_io->u.bdev.dif_check_flags);
3182 2 : break;
3183 1 : case SPDK_BDEV_IO_TYPE_UNMAP:
3184 1 : rc = bdev_nvme_unmap(nbdev_io,
3185 : bdev_io->u.bdev.offset_blocks,
3186 : bdev_io->u.bdev.num_blocks);
3187 1 : break;
3188 0 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3189 0 : rc = bdev_nvme_write_zeroes(nbdev_io,
3190 : bdev_io->u.bdev.offset_blocks,
3191 : bdev_io->u.bdev.num_blocks);
3192 0 : break;
3193 8 : case SPDK_BDEV_IO_TYPE_RESET:
3194 8 : nbdev_io->io_path = NULL;
3195 8 : bdev_nvme_reset_io(bdev->ctxt, nbdev_io);
3196 8 : return;
3197 :
3198 1 : case SPDK_BDEV_IO_TYPE_FLUSH:
3199 1 : bdev_nvme_io_complete(nbdev_io, 0);
3200 1 : return;
3201 :
3202 0 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
3203 0 : rc = bdev_nvme_zone_appendv(nbdev_io,
3204 : bdev_io->u.bdev.iovs,
3205 : bdev_io->u.bdev.iovcnt,
3206 : bdev_io->u.bdev.md_buf,
3207 : bdev_io->u.bdev.num_blocks,
3208 : bdev_io->u.bdev.offset_blocks,
3209 : bdev_io->u.bdev.dif_check_flags);
3210 0 : break;
3211 0 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3212 0 : rc = bdev_nvme_get_zone_info(nbdev_io,
3213 : bdev_io->u.zone_mgmt.zone_id,
3214 : bdev_io->u.zone_mgmt.num_zones,
3215 0 : bdev_io->u.zone_mgmt.buf);
3216 0 : break;
3217 0 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3218 0 : rc = bdev_nvme_zone_management(nbdev_io,
3219 : bdev_io->u.zone_mgmt.zone_id,
3220 : bdev_io->u.zone_mgmt.zone_action);
3221 0 : break;
3222 5 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3223 5 : nbdev_io->io_path = NULL;
3224 5 : bdev_nvme_admin_passthru(nbdev_ch,
3225 : nbdev_io,
3226 : &bdev_io->u.nvme_passthru.cmd,
3227 : bdev_io->u.nvme_passthru.buf,
3228 : bdev_io->u.nvme_passthru.nbytes);
3229 5 : return;
3230 :
3231 0 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3232 0 : rc = bdev_nvme_io_passthru(nbdev_io,
3233 : &bdev_io->u.nvme_passthru.cmd,
3234 : bdev_io->u.nvme_passthru.buf,
3235 : bdev_io->u.nvme_passthru.nbytes);
3236 0 : break;
3237 0 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3238 0 : rc = bdev_nvme_io_passthru_md(nbdev_io,
3239 : &bdev_io->u.nvme_passthru.cmd,
3240 : bdev_io->u.nvme_passthru.buf,
3241 : bdev_io->u.nvme_passthru.nbytes,
3242 : bdev_io->u.nvme_passthru.md_buf,
3243 : bdev_io->u.nvme_passthru.md_len);
3244 0 : break;
3245 0 : case SPDK_BDEV_IO_TYPE_NVME_IOV_MD:
3246 0 : rc = bdev_nvme_iov_passthru_md(nbdev_io,
3247 : &bdev_io->u.nvme_passthru.cmd,
3248 : bdev_io->u.nvme_passthru.iovs,
3249 : bdev_io->u.nvme_passthru.iovcnt,
3250 : bdev_io->u.nvme_passthru.nbytes,
3251 : bdev_io->u.nvme_passthru.md_buf,
3252 : bdev_io->u.nvme_passthru.md_len);
3253 0 : break;
3254 6 : case SPDK_BDEV_IO_TYPE_ABORT:
3255 6 : nbdev_io->io_path = NULL;
3256 6 : nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
3257 6 : bdev_nvme_abort(nbdev_ch,
3258 : nbdev_io,
3259 : nbdev_io_to_abort);
3260 6 : return;
3261 :
3262 0 : case SPDK_BDEV_IO_TYPE_COPY:
3263 0 : rc = bdev_nvme_copy(nbdev_io,
3264 : bdev_io->u.bdev.offset_blocks,
3265 : bdev_io->u.bdev.copy.src_offset_blocks,
3266 : bdev_io->u.bdev.num_blocks);
3267 0 : break;
3268 0 : default:
3269 0 : rc = -EINVAL;
3270 0 : break;
3271 : }
3272 :
3273 32 : if (spdk_unlikely(rc != 0)) {
3274 0 : bdev_nvme_io_complete(nbdev_io, rc);
3275 : }
3276 : }
3277 :
3278 : static void
3279 61 : bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
3280 : {
3281 61 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3282 61 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3283 :
3284 61 : if (spdk_likely(nbdev_io->submit_tsc == 0)) {
3285 61 : nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
3286 : } else {
3287 : /* There are cases where submit_tsc != 0, i.e. retry I/O.
3288 : * We need to update submit_tsc here.
3289 : */
3290 0 : nbdev_io->submit_tsc = spdk_get_ticks();
3291 : }
3292 :
3293 61 : spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
3294 61 : nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
3295 61 : if (spdk_unlikely(!nbdev_io->io_path)) {
3296 13 : if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
3297 12 : bdev_nvme_io_complete(nbdev_io, -ENXIO);
3298 12 : return;
3299 : }
3300 :
3301 : /* Admin commands do not use the optimal I/O path.
3302 : * Simply fall through even if it is not found.
3303 : */
3304 : }
3305 :
3306 49 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
3307 : }
3308 :
3309 : static bool
3310 0 : bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi)
3311 : {
3312 0 : switch (csi) {
3313 0 : case SPDK_NVME_CSI_NVM:
3314 0 : return true;
3315 0 : case SPDK_NVME_CSI_ZNS:
3316 0 : return true;
3317 0 : default:
3318 0 : return false;
3319 : }
3320 : }
3321 :
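     : /* Report whether the namespace supports a given bdev I/O type, based on its
     : * command set identifier and the controller's ONCS bits and feature flags.
     : */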
3322 : static bool
3323 0 : bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
3324 : {
3325 0 : struct nvme_bdev *nbdev = ctx;
3326 : struct nvme_ns *nvme_ns;
3327 : struct spdk_nvme_ns *ns;
3328 : struct spdk_nvme_ctrlr *ctrlr;
3329 : const struct spdk_nvme_ctrlr_data *cdata;
3330 :
3331 0 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
3332 0 : assert(nvme_ns != NULL);
3333 0 : ns = nvme_ns->ns;
3334 0 : if (ns == NULL) {
3335 0 : return false;
3336 : }
3337 :
3338 0 : if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) {
3339 0 : switch (io_type) {
3340 0 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3341 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3342 0 : return true;
3343 :
3344 0 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3345 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3346 :
3347 0 : default:
3348 0 : return false;
3349 : }
3350 : }
3351 :
3352 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3353 :
3354 0 : switch (io_type) {
3355 0 : case SPDK_BDEV_IO_TYPE_READ:
3356 : case SPDK_BDEV_IO_TYPE_WRITE:
3357 : case SPDK_BDEV_IO_TYPE_RESET:
3358 : case SPDK_BDEV_IO_TYPE_FLUSH:
3359 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3360 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3361 : case SPDK_BDEV_IO_TYPE_ABORT:
3362 0 : return true;
3363 :
3364 0 : case SPDK_BDEV_IO_TYPE_COMPARE:
3365 0 : return spdk_nvme_ns_supports_compare(ns);
3366 :
3367 0 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3368 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3369 :
3370 0 : case SPDK_BDEV_IO_TYPE_UNMAP:
3371 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3372 0 : return cdata->oncs.dsm;
3373 :
3374 0 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3375 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3376 0 : return cdata->oncs.write_zeroes;
3377 :
3378 0 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3379 0 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3380 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
3381 0 : return true;
3382 : }
3383 0 : return false;
3384 :
3385 0 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3386 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3387 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
3388 :
3389 0 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
3390 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
3391 0 : spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
3392 :
3393 0 : case SPDK_BDEV_IO_TYPE_COPY:
3394 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3395 0 : return cdata->oncs.copy;
3396 :
3397 0 : default:
3398 0 : return false;
3399 : }
3400 : }
3401 :
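     : /* Allocate an nvme_qpair for a new ctrlr_channel, attach it to this thread's
     : * poll group, create the I/O qpair unless the controller is disabled, and take
     : * a reference on the controller.
     : */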
3402 : static int
3403 59 : nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch)
3404 : {
3405 : struct nvme_qpair *nvme_qpair;
3406 : struct spdk_io_channel *pg_ch;
3407 : int rc;
3408 :
3409 59 : nvme_qpair = calloc(1, sizeof(*nvme_qpair));
3410 59 : if (!nvme_qpair) {
3411 0 : SPDK_ERRLOG("Failed to alloc nvme_qpair.\n");
3412 0 : return -1;
3413 : }
3414 :
3415 59 : TAILQ_INIT(&nvme_qpair->io_path_list);
3416 :
3417 59 : nvme_qpair->ctrlr = nvme_ctrlr;
3418 59 : nvme_qpair->ctrlr_ch = ctrlr_ch;
3419 :
3420 59 : pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
3421 59 : if (!pg_ch) {
3422 0 : free(nvme_qpair);
3423 0 : return -1;
3424 : }
3425 :
3426 59 : nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch);
3427 :
3428 : #ifdef SPDK_CONFIG_VTUNE
3429 : nvme_qpair->group->collect_spin_stat = true;
3430 : #else
3431 59 : nvme_qpair->group->collect_spin_stat = false;
3432 : #endif
3433 :
3434 59 : if (!nvme_ctrlr->disabled) {
3435 : /* If a nvme_ctrlr is disabled, don't try to create a qpair for it. The qpair
3436 : * will be created when the ctrlr is enabled.
3437 : */
3438 59 : rc = bdev_nvme_create_qpair(nvme_qpair);
3439 59 : if (rc != 0) {
3440 : /* nvme_ctrlr can't create an IO qpair if the connection is down.
3441 : * If reconnect_delay_sec is non-zero, creating the IO qpair is retried
3442 : * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero,
3443 : * submitted IO will be queued until the IO qpair is successfully created.
3444 : *
3445 : * Hence, if both are satisfied, ignore the failure.
3446 : */
3447 0 : if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) {
3448 0 : spdk_put_io_channel(pg_ch);
3449 0 : free(nvme_qpair);
3450 0 : return rc;
3451 : }
3452 : }
3453 : }
3454 :
3455 59 : TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3456 :
3457 59 : ctrlr_ch->qpair = nvme_qpair;
3458 :
3459 59 : pthread_mutex_lock(&nvme_qpair->ctrlr->mutex);
3460 59 : nvme_qpair->ctrlr->ref++;
3461 59 : pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex);
3462 :
3463 59 : return 0;
3464 : }
3465 :
3466 : static int
3467 59 : bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
3468 : {
3469 59 : struct nvme_ctrlr *nvme_ctrlr = io_device;
3470 59 : struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
3471 :
3472 59 : TAILQ_INIT(&ctrlr_ch->pending_resets);
3473 :
3474 59 : return nvme_qpair_create(nvme_ctrlr, ctrlr_ch);
3475 : }
3476 :
3477 : static void
3478 59 : nvme_qpair_delete(struct nvme_qpair *nvme_qpair)
3479 : {
3480 : struct nvme_io_path *io_path, *next;
3481 :
3482 59 : assert(nvme_qpair->group != NULL);
3483 :
3484 96 : TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) {
3485 37 : TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);
3486 37 : nvme_io_path_free(io_path);
3487 : }
3488 :
3489 59 : TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3490 :
3491 59 : spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group));
3492 :
3493 59 : nvme_ctrlr_release(nvme_qpair->ctrlr);
3494 :
3495 59 : free(nvme_qpair);
3496 59 : }
3497 :
3498 : static void
3499 59 : bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
3500 : {
3501 59 : struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
3502 : struct nvme_qpair *nvme_qpair;
3503 :
3504 59 : nvme_qpair = ctrlr_ch->qpair;
3505 59 : assert(nvme_qpair != NULL);
3506 :
3507 59 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
3508 :
3509 59 : if (nvme_qpair->qpair != NULL) {
3510 45 : if (ctrlr_ch->reset_iter == NULL) {
3511 45 : spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair);
3512 : } else {
3513 : /* Skip current ctrlr_channel in a full reset sequence because
3514 : * it is being deleted now. The qpair is already being disconnected.
3515 : * We do not have to restart disconnecting it.
3516 : */
3517 0 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
3518 : }
3519 :
3520 : /* We cannot release a reference to the poll group now.
3521 : * The qpair may be disconnected asynchronously later.
3522 : * We need to poll it until it is actually disconnected.
3523 : * Just detach the qpair from the deleting ctrlr_channel.
3524 : */
3525 45 : nvme_qpair->ctrlr_ch = NULL;
3526 : } else {
3527 14 : assert(ctrlr_ch->reset_iter == NULL);
3528 :
3529 14 : nvme_qpair_delete(nvme_qpair);
3530 : }
3531 59 : }
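/* Lifecycle annotation (summary added for readability, not driver logic): each
 * per-thread ctrlr_channel owns exactly one nvme_qpair. nvme_qpair_create()
 * takes a reference on the shared poll group channel and on the nvme_ctrlr,
 * and nvme_qpair_delete() releases both. On channel destruction the qpair is
 * freed immediately only if the underlying spdk_nvme_qpair is already gone;
 * otherwise it is just detached (nvme_qpair->ctrlr_ch = NULL) and the poll
 * group completes the asynchronous disconnect before freeing it.
 */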
3532 :
3533 : static inline struct spdk_io_channel *
3534 0 : bdev_nvme_get_accel_channel(struct nvme_poll_group *group)
3535 : {
3536 0 : if (spdk_unlikely(!group->accel_channel)) {
3537 0 : group->accel_channel = spdk_accel_get_io_channel();
3538 0 : if (!group->accel_channel) {
3539 0 : SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
3540 : group);
3541 0 : return NULL;
3542 : }
3543 : }
3544 :
3545 0 : return group->accel_channel;
3546 : }
3547 :
3548 : static void
3549 0 : bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
3550 : {
3551 0 : spdk_accel_sequence_finish(seq, cb_fn, cb_arg);
3552 0 : }
3553 :
3554 : static void
3555 0 : bdev_nvme_abort_sequence(void *seq)
3556 : {
3557 0 : spdk_accel_sequence_abort(seq);
3558 0 : }
3559 :
3560 : static void
3561 0 : bdev_nvme_reverse_sequence(void *seq)
3562 : {
3563 0 : spdk_accel_sequence_reverse(seq);
3564 0 : }
3565 :
3566 : static int
3567 0 : bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt,
3568 : struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed,
3569 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3570 : {
3571 : struct spdk_io_channel *ch;
3572 0 : struct nvme_poll_group *group = ctx;
3573 :
3574 0 : ch = bdev_nvme_get_accel_channel(group);
3575 0 : if (spdk_unlikely(ch == NULL)) {
3576 0 : return -ENOMEM;
3577 : }
3578 :
3579 0 : return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt,
3580 : domain, domain_ctx, seed, cb_fn, cb_arg);
3581 : }
3582 :
3583 : static int
3584 0 : bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt,
3585 : struct spdk_memory_domain *dst_domain, void *dst_domain_ctx,
3586 : struct iovec *src_iovs, uint32_t src_iovcnt,
3587 : struct spdk_memory_domain *src_domain, void *src_domain_ctx,
3588 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3589 : {
3590 : struct spdk_io_channel *ch;
3591 0 : struct nvme_poll_group *group = ctx;
3592 :
3593 0 : ch = bdev_nvme_get_accel_channel(group);
3594 0 : if (spdk_unlikely(ch == NULL)) {
3595 0 : return -ENOMEM;
3596 : }
3597 :
3598 0 : return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch,
3599 : dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx,
3600 : src_iovs, src_iovcnt, src_domain, src_domain_ctx,
3601 : cb_fn, cb_arg);
3602 : }
3603 :
3604 : static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
3605 : .table_size = sizeof(struct spdk_nvme_accel_fn_table),
3606 : .append_crc32c = bdev_nvme_append_crc32c,
3607 : .append_copy = bdev_nvme_append_copy,
3608 : .finish_sequence = bdev_nvme_finish_sequence,
3609 : .reverse_sequence = bdev_nvme_reverse_sequence,
3610 : .abort_sequence = bdev_nvme_abort_sequence,
3611 : };
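/* Annotation: this table hooks the NVMe driver into the accel framework so
 * that data transformations issued by the driver (CRC32C, copy) become steps
 * of an accel sequence. The accel channel is acquired lazily per poll group
 * in bdev_nvme_get_accel_channel() and released when the poll group is
 * destroyed.
 */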
3612 :
3613 : static int
3614 44 : bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
3615 : {
3616 44 : struct nvme_poll_group *group = ctx_buf;
3617 :
3618 44 : TAILQ_INIT(&group->qpair_list);
3619 :
3620 44 : group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
3621 44 : if (group->group == NULL) {
3622 0 : return -1;
3623 : }
3624 :
3625 44 : group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
3626 :
3627 44 : if (group->poller == NULL) {
3628 0 : spdk_nvme_poll_group_destroy(group->group);
3629 0 : return -1;
3630 : }
3631 :
3632 44 : return 0;
3633 : }
3634 :
3635 : static void
3636 44 : bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
3637 : {
3638 44 : struct nvme_poll_group *group = ctx_buf;
3639 :
3640 44 : assert(TAILQ_EMPTY(&group->qpair_list));
3641 :
3642 44 : if (group->accel_channel) {
3643 0 : spdk_put_io_channel(group->accel_channel);
3644 : }
3645 :
3646 44 : spdk_poller_unregister(&group->poller);
3647 44 : if (spdk_nvme_poll_group_destroy(group->group)) {
3648 0 : SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
3649 0 : assert(false);
3650 : }
3651 44 : }
3652 :
3653 : static struct spdk_io_channel *
3654 0 : bdev_nvme_get_io_channel(void *ctx)
3655 : {
3656 0 : struct nvme_bdev *nvme_bdev = ctx;
3657 :
3658 0 : return spdk_get_io_channel(nvme_bdev);
3659 : }
3660 :
3661 : static void *
3662 0 : bdev_nvme_get_module_ctx(void *ctx)
3663 : {
3664 0 : struct nvme_bdev *nvme_bdev = ctx;
3665 : struct nvme_ns *nvme_ns;
3666 :
3667 0 : if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) {
3668 0 : return NULL;
3669 : }
3670 :
3671 0 : nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list);
3672 0 : if (!nvme_ns) {
3673 0 : return NULL;
3674 : }
3675 :
3676 0 : return nvme_ns->ns;
3677 : }
3678 :
3679 : static const char *
3680 0 : _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
3681 : {
3682 0 : switch (ana_state) {
3683 0 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
3684 0 : return "optimized";
3685 0 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
3686 0 : return "non_optimized";
3687 0 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
3688 0 : return "inaccessible";
3689 0 : case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
3690 0 : return "persistent_loss";
3691 0 : case SPDK_NVME_ANA_CHANGE_STATE:
3692 0 : return "change";
3693 0 : default:
3694 0 : return NULL;
3695 : }
3696 : }
3697 :
3698 : static int
3699 8 : bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
3700 : {
3701 8 : struct spdk_memory_domain **_domains = NULL;
3702 8 : struct nvme_bdev *nbdev = ctx;
3703 : struct nvme_ns *nvme_ns;
3704 8 : int i = 0, _array_size = array_size;
3705 8 : int rc = 0;
3706 :
3707 22 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
3708 14 : if (domains && array_size >= i) {
3709 11 : _domains = &domains[i];
3710 : } else {
3711 3 : _domains = NULL;
3712 : }
3713 14 : rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size);
3714 14 : if (rc > 0) {
3715 13 : i += rc;
3716 13 : if (_array_size >= rc) {
3717 9 : _array_size -= rc;
3718 : } else {
3719 4 : _array_size = 0;
3720 : }
3721 1 : } else if (rc < 0) {
3722 0 : return rc;
3723 : }
3724 : }
3725 :
3726 8 : return i;
3727 : }
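/* Usage sketch (assumed caller behavior, e.g. through spdk_bdev_get_memory_domains()):
 * the return value is the total number of domains across all namespaces even when it
 * exceeds array_size, so a caller can size the output array in two passes.
 *
 *   int cnt = bdev_nvme_get_memory_domains(nbdev, NULL, 0);           // count only
 *   struct spdk_memory_domain **doms = calloc(cnt, sizeof(*doms));    // hypothetical buffer
 *   bdev_nvme_get_memory_domains(nbdev, doms, cnt);                   // fill the array
 */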
3728 :
3729 : static const char *
3730 0 : nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr)
3731 : {
3732 0 : if (nvme_ctrlr->destruct) {
3733 0 : return "deleting";
3734 0 : } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
3735 0 : return "failed";
3736 0 : } else if (nvme_ctrlr->resetting) {
3737 0 : return "resetting";
3738 0 : } else if (nvme_ctrlr->reconnect_is_delayed > 0) {
3739 0 : return "reconnect_is_delayed";
3740 0 : } else if (nvme_ctrlr->disabled) {
3741 0 : return "disabled";
3742 : } else {
3743 0 : return "enabled";
3744 : }
3745 : }
3746 :
3747 : void
3748 0 : nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr)
3749 0 : {
3750 : struct spdk_nvme_transport_id *trid;
3751 : const struct spdk_nvme_ctrlr_opts *opts;
3752 : const struct spdk_nvme_ctrlr_data *cdata;
3753 : struct nvme_path_id *path_id;
3754 : int32_t numa_id;
3755 :
3756 0 : spdk_json_write_object_begin(w);
3757 :
3758 0 : spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr));
3759 :
3760 : #ifdef SPDK_CONFIG_NVME_CUSE
3761 0 : size_t cuse_name_size = 128;
3762 0 : char cuse_name[cuse_name_size];
3763 :
3764 0 : int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size);
3765 0 : if (rc == 0) {
3766 0 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
3767 : }
3768 : #endif
3769 0 : trid = &nvme_ctrlr->active_path_id->trid;
3770 0 : spdk_json_write_named_object_begin(w, "trid");
3771 0 : nvme_bdev_dump_trid_json(trid, w);
3772 0 : spdk_json_write_object_end(w);
3773 :
3774 0 : path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link);
3775 0 : if (path_id != NULL) {
3776 0 : spdk_json_write_named_array_begin(w, "alternate_trids");
3777 : do {
3778 0 : trid = &path_id->trid;
3779 0 : spdk_json_write_object_begin(w);
3780 0 : nvme_bdev_dump_trid_json(trid, w);
3781 0 : spdk_json_write_object_end(w);
3782 :
3783 0 : path_id = TAILQ_NEXT(path_id, link);
3784 0 : } while (path_id != NULL);
3785 0 : spdk_json_write_array_end(w);
3786 : }
3787 :
3788 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
3789 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
3790 :
3791 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
3792 0 : spdk_json_write_named_object_begin(w, "host");
3793 0 : spdk_json_write_named_string(w, "nqn", opts->hostnqn);
3794 0 : spdk_json_write_named_string(w, "addr", opts->src_addr);
3795 0 : spdk_json_write_named_string(w, "svcid", opts->src_svcid);
3796 0 : spdk_json_write_object_end(w);
3797 :
3798 0 : numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr);
3799 0 : if (numa_id != SPDK_ENV_NUMA_ID_ANY) {
3800 0 : spdk_json_write_named_uint32(w, "numa_id", numa_id);
3801 : }
3802 0 : spdk_json_write_object_end(w);
3803 0 : }
3804 :
3805 : static void
3806 0 : nvme_namespace_info_json(struct spdk_json_write_ctx *w,
3807 : struct nvme_ns *nvme_ns)
3808 0 : {
3809 : struct spdk_nvme_ns *ns;
3810 : struct spdk_nvme_ctrlr *ctrlr;
3811 : const struct spdk_nvme_ctrlr_data *cdata;
3812 : const struct spdk_nvme_transport_id *trid;
3813 : union spdk_nvme_vs_register vs;
3814 : const struct spdk_nvme_ns_data *nsdata;
3815 0 : char buf[128];
3816 :
3817 0 : ns = nvme_ns->ns;
3818 0 : if (ns == NULL) {
3819 0 : return;
3820 : }
3821 :
3822 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3823 :
3824 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3825 0 : trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
3826 0 : vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
3827 :
3828 0 : spdk_json_write_object_begin(w);
3829 :
3830 0 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3831 0 : spdk_json_write_named_string(w, "pci_address", trid->traddr);
3832 : }
3833 :
3834 0 : spdk_json_write_named_object_begin(w, "trid");
3835 :
3836 0 : nvme_bdev_dump_trid_json(trid, w);
3837 :
3838 0 : spdk_json_write_object_end(w);
3839 :
3840 : #ifdef SPDK_CONFIG_NVME_CUSE
3841 0 : size_t cuse_name_size = 128;
3842 0 : char cuse_name[cuse_name_size];
3843 :
3844 0 : int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
3845 : cuse_name, &cuse_name_size);
3846 0 : if (rc == 0) {
3847 0 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
3848 : }
3849 : #endif
3850 :
3851 0 : spdk_json_write_named_object_begin(w, "ctrlr_data");
3852 :
3853 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
3854 :
3855 0 : spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
3856 :
3857 0 : snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
3858 0 : spdk_str_trim(buf);
3859 0 : spdk_json_write_named_string(w, "model_number", buf);
3860 :
3861 0 : snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
3862 0 : spdk_str_trim(buf);
3863 0 : spdk_json_write_named_string(w, "serial_number", buf);
3864 :
3865 0 : snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
3866 0 : spdk_str_trim(buf);
3867 0 : spdk_json_write_named_string(w, "firmware_revision", buf);
3868 :
3869 0 : if (cdata->subnqn[0] != '\0') {
3870 0 : spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
3871 : }
3872 :
3873 0 : spdk_json_write_named_object_begin(w, "oacs");
3874 :
3875 0 : spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
3876 0 : spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
3877 0 : spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
3878 0 : spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
3879 :
3880 0 : spdk_json_write_object_end(w);
3881 :
3882 0 : spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr);
3883 0 : spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting);
3884 :
3885 0 : spdk_json_write_object_end(w);
3886 :
3887 0 : spdk_json_write_named_object_begin(w, "vs");
3888 :
3889 0 : spdk_json_write_name(w, "nvme_version");
3890 0 : if (vs.bits.ter) {
3891 0 : spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
3892 : } else {
3893 0 : spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
3894 : }
3895 :
3896 0 : spdk_json_write_object_end(w);
3897 :
3898 0 : nsdata = spdk_nvme_ns_get_data(ns);
3899 :
3900 0 : spdk_json_write_named_object_begin(w, "ns_data");
3901 :
3902 0 : spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
3903 :
3904 0 : if (cdata->cmic.ana_reporting) {
3905 0 : spdk_json_write_named_string(w, "ana_state",
3906 : _nvme_ana_state_str(nvme_ns->ana_state));
3907 : }
3908 :
3909 0 : spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share);
3910 :
3911 0 : spdk_json_write_object_end(w);
3912 :
3913 0 : if (cdata->oacs.security) {
3914 0 : spdk_json_write_named_object_begin(w, "security");
3915 :
3916 0 : spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
3917 :
3918 0 : spdk_json_write_object_end(w);
3919 : }
3920 :
3921 0 : spdk_json_write_object_end(w);
3922 : }
3923 :
3924 : static const char *
3925 0 : nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev)
3926 : {
3927 0 : switch (nbdev->mp_policy) {
3928 0 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
3929 0 : return "active_passive";
3930 0 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
3931 0 : return "active_active";
3932 0 : default:
3933 0 : assert(false);
3934 : return "invalid";
3935 : }
3936 : }
3937 :
3938 : static const char *
3939 0 : nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev)
3940 : {
3941 0 : switch (nbdev->mp_selector) {
3942 0 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
3943 0 : return "round_robin";
3944 0 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
3945 0 : return "queue_depth";
3946 0 : default:
3947 0 : assert(false);
3948 : return "invalid";
3949 : }
3950 : }
3951 :
3952 : static int
3953 0 : bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
3954 : {
3955 0 : struct nvme_bdev *nvme_bdev = ctx;
3956 : struct nvme_ns *nvme_ns;
3957 :
3958 0 : pthread_mutex_lock(&nvme_bdev->mutex);
3959 0 : spdk_json_write_named_array_begin(w, "nvme");
3960 0 : TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) {
3961 0 : nvme_namespace_info_json(w, nvme_ns);
3962 : }
3963 0 : spdk_json_write_array_end(w);
3964 0 : spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev));
3965 0 : if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
3966 0 : spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev));
3967 0 : if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
3968 0 : spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io);
3969 : }
3970 : }
3971 0 : pthread_mutex_unlock(&nvme_bdev->mutex);
3972 :
3973 0 : return 0;
3974 : }
3975 :
3976 : static void
3977 0 : bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3978 : {
3979 : /* No config per bdev needed */
3980 0 : }
3981 :
3982 : static uint64_t
3983 0 : bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
3984 : {
3985 0 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3986 : struct nvme_io_path *io_path;
3987 : struct nvme_poll_group *group;
3988 0 : uint64_t spin_time = 0;
3989 :
3990 0 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
3991 0 : group = io_path->qpair->group;
3992 :
3993 0 : if (!group || !group->collect_spin_stat) {
3994 0 : continue;
3995 : }
3996 :
3997 0 : if (group->end_ticks != 0) {
3998 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
3999 0 : group->end_ticks = 0;
4000 : }
4001 :
4002 0 : spin_time += group->spin_ticks;
4003 0 : group->start_ticks = 0;
4004 0 : group->spin_ticks = 0;
4005 : }
4006 :
4007 0 : return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
4008 : }
4009 :
4010 : static void
4011 0 : bdev_nvme_reset_device_stat(void *ctx)
4012 : {
4013 0 : struct nvme_bdev *nbdev = ctx;
4014 :
4015 0 : if (nbdev->err_stat != NULL) {
4016 0 : memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat));
4017 : }
4018 0 : }
4019 :
4020 : /* The JSON string should be a lowercase, underscore-delimited string. */
4021 : static void
4022 0 : bdev_nvme_format_nvme_status(char *dst, const char *src)
4023 : {
4024 0 : char tmp[256];
4025 :
4026 0 : spdk_strcpy_replace(dst, 256, src, " - ", "_");
4027 0 : spdk_strcpy_replace(tmp, 256, dst, "-", "_");
4028 0 : spdk_strcpy_replace(dst, 256, tmp, " ", "_");
4029 0 : spdk_strlwr(dst);
4030 0 : }
4031 :
4032 : static void
4033 0 : bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w)
4034 : {
4035 0 : struct nvme_bdev *nbdev = ctx;
4036 0 : struct spdk_nvme_status status = {};
4037 : uint16_t sct, sc;
4038 0 : char status_json[256];
4039 : const char *status_str;
4040 :
4041 0 : if (nbdev->err_stat == NULL) {
4042 0 : return;
4043 : }
4044 :
4045 0 : spdk_json_write_named_object_begin(w, "nvme_error");
4046 :
4047 0 : spdk_json_write_named_object_begin(w, "status_type");
4048 0 : for (sct = 0; sct < 8; sct++) {
4049 0 : if (nbdev->err_stat->status_type[sct] == 0) {
4050 0 : continue;
4051 : }
4052 0 : status.sct = sct;
4053 :
4054 0 : status_str = spdk_nvme_cpl_get_status_type_string(&status);
4055 0 : assert(status_str != NULL);
4056 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4057 :
4058 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]);
4059 : }
4060 0 : spdk_json_write_object_end(w);
4061 :
4062 0 : spdk_json_write_named_object_begin(w, "status_code");
4063 0 : for (sct = 0; sct < 4; sct++) {
4064 0 : status.sct = sct;
4065 0 : for (sc = 0; sc < 256; sc++) {
4066 0 : if (nbdev->err_stat->status[sct][sc] == 0) {
4067 0 : continue;
4068 : }
4069 0 : status.sc = sc;
4070 :
4071 0 : status_str = spdk_nvme_cpl_get_status_string(&status);
4072 0 : assert(status_str != NULL);
4073 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4074 :
4075 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]);
4076 : }
4077 : }
4078 0 : spdk_json_write_object_end(w);
4079 :
4080 0 : spdk_json_write_object_end(w);
4081 : }
4082 :
4083 : static bool
4084 0 : bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
4085 : {
4086 0 : struct nvme_bdev *nbdev = ctx;
4087 : struct spdk_nvme_ctrlr *ctrlr;
4088 :
4089 0 : if (!g_opts.allow_accel_sequence) {
4090 0 : return false;
4091 : }
4092 :
4093 0 : switch (type) {
4094 0 : case SPDK_BDEV_IO_TYPE_WRITE:
4095 : case SPDK_BDEV_IO_TYPE_READ:
4096 0 : break;
4097 0 : default:
4098 0 : return false;
4099 : }
4100 :
4101 0 : ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk);
4102 0 : assert(ctrlr != NULL);
4103 :
4104 0 : return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED;
4105 : }
4106 :
4107 : static const struct spdk_bdev_fn_table nvmelib_fn_table = {
4108 : .destruct = bdev_nvme_destruct,
4109 : .submit_request = bdev_nvme_submit_request,
4110 : .io_type_supported = bdev_nvme_io_type_supported,
4111 : .get_io_channel = bdev_nvme_get_io_channel,
4112 : .dump_info_json = bdev_nvme_dump_info_json,
4113 : .write_config_json = bdev_nvme_write_config_json,
4114 : .get_spin_time = bdev_nvme_get_spin_time,
4115 : .get_module_ctx = bdev_nvme_get_module_ctx,
4116 : .get_memory_domains = bdev_nvme_get_memory_domains,
4117 : .accel_sequence_supported = bdev_nvme_accel_sequence_supported,
4118 : .reset_device_stat = bdev_nvme_reset_device_stat,
4119 : .dump_device_stat_json = bdev_nvme_dump_device_stat_json,
4120 : };
4121 :
4122 : typedef int (*bdev_nvme_parse_ana_log_page_cb)(
4123 : const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
4124 :
4125 : static int
4126 41 : bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
4127 : bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
4128 : {
4129 : struct spdk_nvme_ana_group_descriptor *copied_desc;
4130 : uint8_t *orig_desc;
4131 : uint32_t i, desc_size, copy_len;
4132 41 : int rc = 0;
4133 :
4134 41 : if (nvme_ctrlr->ana_log_page == NULL) {
4135 0 : return -EINVAL;
4136 : }
4137 :
4138 41 : copied_desc = nvme_ctrlr->copied_ana_desc;
4139 :
4140 41 : orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
4141 41 : copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
4142 :
4143 71 : for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
4144 66 : memcpy(copied_desc, orig_desc, copy_len);
4145 :
4146 66 : rc = cb_fn(copied_desc, cb_arg);
4147 66 : if (rc != 0) {
4148 36 : break;
4149 : }
4150 :
4151 30 : desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
4152 30 : copied_desc->num_of_nsid * sizeof(uint32_t);
4153 30 : orig_desc += desc_size;
4154 30 : copy_len -= desc_size;
4155 : }
4156 :
4157 41 : return rc;
4158 : }
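/* Iteration contract (annotation): each ANA group descriptor is copied into
 * the pre-allocated copied_ana_desc buffer before being handed to cb_fn,
 * since descriptors in the raw log page are variable-length and not
 * necessarily aligned. A non-zero return from cb_fn stops the walk and is
 * propagated to the caller; for example, nvme_ns_set_ana_state() returns 1 as
 * soon as it finds the descriptor that contains its NSID, while
 * nvme_ctrlr_set_ana_states() always returns 0 so that every descriptor is
 * visited.
 */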
4159 :
4160 : static int
4161 5 : nvme_ns_ana_transition_timedout(void *ctx)
4162 : {
4163 5 : struct nvme_ns *nvme_ns = ctx;
4164 :
4165 5 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4166 5 : nvme_ns->ana_transition_timedout = true;
4167 :
4168 5 : return SPDK_POLLER_BUSY;
4169 : }
4170 :
4171 : static void
4172 45 : _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns,
4173 : const struct spdk_nvme_ana_group_descriptor *desc)
4174 : {
4175 : const struct spdk_nvme_ctrlr_data *cdata;
4176 :
4177 45 : nvme_ns->ana_group_id = desc->ana_group_id;
4178 45 : nvme_ns->ana_state = desc->ana_state;
4179 45 : nvme_ns->ana_state_updating = false;
4180 :
4181 45 : switch (nvme_ns->ana_state) {
4182 38 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
4183 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
4184 38 : nvme_ns->ana_transition_timedout = false;
4185 38 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4186 38 : break;
4187 :
4188 6 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
4189 : case SPDK_NVME_ANA_CHANGE_STATE:
4190 6 : if (nvme_ns->anatt_timer != NULL) {
4191 1 : break;
4192 : }
4193 :
4194 5 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
4195 5 : nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout,
4196 : nvme_ns,
4197 : cdata->anatt * SPDK_SEC_TO_USEC);
4198 5 : break;
4199 1 : default:
4200 1 : break;
4201 : }
4202 45 : }
4203 :
4204 : static int
4205 59 : nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
4206 : {
4207 59 : struct nvme_ns *nvme_ns = cb_arg;
4208 : uint32_t i;
4209 :
4210 59 : assert(nvme_ns->ns != NULL);
4211 :
4212 81 : for (i = 0; i < desc->num_of_nsid; i++) {
4213 58 : if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
4214 22 : continue;
4215 : }
4216 :
4217 36 : _nvme_ns_set_ana_state(nvme_ns, desc);
4218 36 : return 1;
4219 : }
4220 :
4221 23 : return 0;
4222 : }
4223 :
4224 : static int
4225 5 : nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid)
4226 : {
4227 5 : int rc = 0;
4228 5 : struct spdk_uuid new_uuid, namespace_uuid;
4229 5 : char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'};
4230 : /* This namespace UUID was generated using the uuid_generate() method. */
4231 5 : const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"};
4232 : int size;
4233 :
4234 5 : assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN);
4235 :
4236 5 : spdk_uuid_set_null(&new_uuid);
4237 5 : spdk_uuid_set_null(&namespace_uuid);
4238 :
4239 5 : size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid);
4240 5 : if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) {
4241 0 : return -EINVAL;
4242 : }
4243 :
4244 5 : spdk_uuid_parse(&namespace_uuid, namespace_str);
4245 :
4246 5 : rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size);
4247 5 : if (rc == 0) {
4248 5 : memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid));
4249 : }
4250 :
4251 5 : return rc;
4252 : }
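/* Usage sketch (hypothetical serial number and NSID): the result is deterministic,
 * so re-attaching the same namespace yields the same bdev UUID whenever
 * generate_uuids is enabled and the namespace reports neither an NGUID nor a UUID.
 *
 *   struct spdk_uuid uuid;
 *   if (nvme_generate_uuid("S64GNE0R605486", 1, &uuid) == 0) {
 *       // uuid is the SHA-1-based (RFC 4122 version-5 style) hash of the
 *       // merged string "S64GNE0R6054861" under the fixed namespace UUID above.
 *   }
 */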
4253 :
4254 : static int
4255 38 : nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
4256 : struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
4257 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx)
4258 : {
4259 : const struct spdk_uuid *uuid;
4260 : const uint8_t *nguid;
4261 : const struct spdk_nvme_ctrlr_data *cdata;
4262 : const struct spdk_nvme_ns_data *nsdata;
4263 : const struct spdk_nvme_ctrlr_opts *opts;
4264 : enum spdk_nvme_csi csi;
4265 : uint32_t atomic_bs, phys_bs, bs;
4266 38 : char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'};
4267 : int rc;
4268 :
4269 38 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
4270 38 : csi = spdk_nvme_ns_get_csi(ns);
4271 38 : opts = spdk_nvme_ctrlr_get_opts(ctrlr);
4272 :
4273 38 : switch (csi) {
4274 38 : case SPDK_NVME_CSI_NVM:
4275 38 : disk->product_name = "NVMe disk";
4276 38 : break;
4277 0 : case SPDK_NVME_CSI_ZNS:
4278 0 : disk->product_name = "NVMe ZNS disk";
4279 0 : disk->zoned = true;
4280 0 : disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4281 0 : disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
4282 0 : spdk_nvme_ns_get_extended_sector_size(ns);
4283 0 : disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
4284 0 : disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
4285 0 : break;
4286 0 : default:
4287 0 : if (bdev_opts->allow_unrecognized_csi) {
4288 0 : disk->product_name = "NVMe Passthrough disk";
4289 0 : break;
4290 : }
4291 0 : SPDK_ERRLOG("unsupported CSI: %u\n", csi);
4292 0 : return -ENOTSUP;
4293 : }
4294 :
4295 38 : nguid = spdk_nvme_ns_get_nguid(ns);
4296 38 : if (!nguid) {
4297 38 : uuid = spdk_nvme_ns_get_uuid(ns);
4298 38 : if (uuid) {
4299 12 : disk->uuid = *uuid;
4300 26 : } else if (g_opts.generate_uuids) {
4301 0 : spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0');
4302 0 : rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid);
4303 0 : if (rc < 0) {
4304 0 : SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc));
4305 0 : return rc;
4306 : }
4307 : }
4308 : } else {
4309 0 : memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
4310 : }
4311 :
4312 38 : disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
4313 38 : if (!disk->name) {
4314 0 : return -ENOMEM;
4315 : }
4316 :
4317 38 : disk->write_cache = 0;
4318 38 : if (cdata->vwc.present) {
4319 : /* Enable if the Volatile Write Cache exists */
4320 0 : disk->write_cache = 1;
4321 : }
4322 38 : if (cdata->oncs.write_zeroes) {
4323 0 : disk->max_write_zeroes = UINT16_MAX + 1;
4324 : }
4325 38 : disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
4326 38 : disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
4327 38 : disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr);
4328 38 : disk->ctratt.raw = cdata->ctratt.raw;
4329 : /* The NVMe driver will split one request into multiple requests
4330 : * based on MDTS and the stripe boundary. The bdev layer will use
4331 : * max_segment_size and max_num_segments to split one big I/O
4332 : * into multiple requests first, so the resulting small requests
4333 : * cannot run out of the NVMe driver's internal request data structures.
4334 : */
4335 38 : if (opts && opts->io_queue_requests) {
4336 0 : disk->max_num_segments = opts->io_queue_requests / 2;
4337 : }
4338 38 : if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
4339 : /* The nvme driver will try to split I/Os that have too many
4340 : * SGEs, but it doesn't work if the last SGE doesn't end on
4341 : * an aggregate total that is block aligned. The bdev layer has
4342 : * a more robust splitting framework, so use that instead for
4343 : * this case. (See issue #3269.)
4344 : */
4345 0 : uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr);
4346 :
4347 0 : if (disk->max_num_segments == 0) {
4348 0 : disk->max_num_segments = max_sges;
4349 : } else {
4350 0 : disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges);
4351 : }
4352 : }
4353 38 : disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
4354 :
4355 38 : nsdata = spdk_nvme_ns_get_data(ns);
4356 38 : bs = spdk_nvme_ns_get_sector_size(ns);
4357 38 : atomic_bs = bs;
4358 38 : phys_bs = bs;
4359 38 : if (nsdata->nabo == 0) {
4360 38 : if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
4361 0 : atomic_bs = bs * (1 + nsdata->nawupf);
4362 : } else {
4363 38 : atomic_bs = bs * (1 + cdata->awupf);
4364 : }
4365 : }
4366 38 : if (nsdata->nsfeat.optperf) {
4367 0 : phys_bs = bs * (1 + nsdata->npwg);
4368 : }
4369 38 : disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
4370 :
4371 38 : disk->md_len = spdk_nvme_ns_get_md_size(ns);
4372 38 : if (disk->md_len != 0) {
4373 0 : disk->md_interleave = nsdata->flbas.extended;
4374 0 : disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
4375 0 : if (disk->dif_type != SPDK_DIF_DISABLE) {
4376 0 : disk->dif_is_head_of_md = nsdata->dps.md_start;
4377 0 : disk->dif_check_flags = bdev_opts->prchk_flags;
4378 0 : disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns);
4379 : }
4380 : }
4381 :
4382 38 : if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
4383 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
4384 38 : disk->acwu = 0;
4385 0 : } else if (nsdata->nsfeat.ns_atomic_write_unit) {
4386 0 : disk->acwu = nsdata->nacwu + 1; /* 0-based */
4387 : } else {
4388 0 : disk->acwu = cdata->acwu + 1; /* 0-based */
4389 : }
4390 :
4391 38 : if (cdata->oncs.copy) {
4392 : /* For now, the bdev interface allows only single-segment copy. */
4393 0 : disk->max_copy = nsdata->mssrl;
4394 : }
4395 :
4396 38 : disk->ctxt = ctx;
4397 38 : disk->fn_table = &nvmelib_fn_table;
4398 38 : disk->module = &nvme_if;
4399 :
4400 38 : disk->numa.id_valid = 1;
4401 38 : disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
4402 :
4403 38 : return 0;
4404 : }
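/* Worked example (hypothetical identify data): with a 512-byte sector size,
 * nabo == 0, no per-namespace atomic write unit, cdata->awupf == 7, and
 * nsdata->npwg == 7 with the optperf bit set, the computation above gives
 * atomic_bs = 512 * (1 + 7) = 4096, phys_bs = 512 * (1 + 7) = 4096, and thus
 * disk->phys_blocklen = spdk_min(4096, 4096) = 4096.
 */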
4405 :
4406 : static struct nvme_bdev *
4407 38 : nvme_bdev_alloc(void)
4408 : {
4409 : struct nvme_bdev *bdev;
4410 : int rc;
4411 :
4412 38 : bdev = calloc(1, sizeof(*bdev));
4413 38 : if (!bdev) {
4414 0 : SPDK_ERRLOG("bdev calloc() failed\n");
4415 0 : return NULL;
4416 : }
4417 :
4418 38 : if (g_opts.nvme_error_stat) {
4419 0 : bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat));
4420 0 : if (!bdev->err_stat) {
4421 0 : SPDK_ERRLOG("err_stat calloc() failed\n");
4422 0 : free(bdev);
4423 0 : return NULL;
4424 : }
4425 : }
4426 :
4427 38 : rc = pthread_mutex_init(&bdev->mutex, NULL);
4428 38 : if (rc != 0) {
4429 0 : free(bdev->err_stat);
4430 0 : free(bdev);
4431 0 : return NULL;
4432 : }
4433 :
4434 38 : bdev->ref = 1;
4435 38 : bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
4436 38 : bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
4437 38 : bdev->rr_min_io = UINT32_MAX;
4438 38 : TAILQ_INIT(&bdev->nvme_ns_list);
4439 :
4440 38 : return bdev;
4441 : }
4442 :
4443 : static int
4444 38 : nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4445 : {
4446 : struct nvme_bdev *bdev;
4447 38 : struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
4448 : int rc;
4449 :
4450 38 : bdev = nvme_bdev_alloc();
4451 38 : if (bdev == NULL) {
4452 0 : SPDK_ERRLOG("Failed to allocate NVMe bdev\n");
4453 0 : return -ENOMEM;
4454 : }
4455 :
4456 38 : bdev->opal = nvme_ctrlr->opal_dev != NULL;
4457 :
4458 38 : rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
4459 : nvme_ns->ns, &nvme_ctrlr->opts, bdev);
4460 38 : if (rc != 0) {
4461 0 : SPDK_ERRLOG("Failed to create NVMe disk\n");
4462 0 : nvme_bdev_free(bdev);
4463 0 : return rc;
4464 : }
4465 :
4466 38 : spdk_io_device_register(bdev,
4467 : bdev_nvme_create_bdev_channel_cb,
4468 : bdev_nvme_destroy_bdev_channel_cb,
4469 : sizeof(struct nvme_bdev_channel),
4470 38 : bdev->disk.name);
4471 :
4472 38 : nvme_ns->bdev = bdev;
4473 38 : bdev->nsid = nvme_ns->id;
4474 38 : TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
4475 :
4476 38 : bdev->nbdev_ctrlr = nbdev_ctrlr;
4477 38 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq);
4478 :
4479 38 : rc = spdk_bdev_register(&bdev->disk);
4480 38 : if (rc != 0) {
4481 1 : SPDK_ERRLOG("spdk_bdev_register() failed\n");
4482 1 : spdk_io_device_unregister(bdev, NULL);
4483 1 : nvme_ns->bdev = NULL;
4484 1 : TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq);
4485 1 : nvme_bdev_free(bdev);
4486 1 : return rc;
4487 : }
4488 :
4489 37 : return 0;
4490 : }
4491 :
4492 : static bool
4493 23 : bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
4494 : {
4495 : const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
4496 : const struct spdk_uuid *uuid1, *uuid2;
4497 :
4498 23 : nsdata1 = spdk_nvme_ns_get_data(ns1);
4499 23 : nsdata2 = spdk_nvme_ns_get_data(ns2);
4500 23 : uuid1 = spdk_nvme_ns_get_uuid(ns1);
4501 23 : uuid2 = spdk_nvme_ns_get_uuid(ns2);
4502 :
4503 45 : return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
4504 22 : nsdata1->eui64 == nsdata2->eui64 &&
4505 21 : ((uuid1 == NULL && uuid2 == NULL) ||
4506 59 : (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) &&
4507 18 : spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2);
4508 : }
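/* Annotation: this identity check is what lets a namespace reached through
 * multiple controllers be folded into a single nvme_bdev. Two namespaces are
 * treated as identical only if their NGUID, EUI64, UUID (either both absent or
 * both equal), and command set identifier all match; otherwise
 * nvme_bdev_add_ns() rejects the additional path with "Namespaces are not
 * identical."
 */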
4509 :
4510 : static bool
4511 0 : hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
4512 : struct spdk_nvme_ctrlr_opts *opts)
4513 : {
4514 : struct nvme_probe_skip_entry *entry;
4515 :
4516 0 : TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
4517 0 : if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
4518 0 : return false;
4519 : }
4520 : }
4521 :
4522 0 : opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
4523 0 : opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
4524 0 : opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
4525 0 : opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
4526 0 : opts->disable_read_ana_log_page = true;
4527 :
4528 0 : SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
4529 :
4530 0 : return true;
4531 : }
4532 :
4533 : static void
4534 0 : nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
4535 : {
4536 0 : struct nvme_ctrlr *nvme_ctrlr = ctx;
4537 :
4538 0 : if (spdk_nvme_cpl_is_error(cpl)) {
4539 0 : SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
4540 : cpl->status.sct);
4541 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4542 0 : } else if (cpl->cdw0 & 0x1) {
4543 0 : SPDK_WARNLOG("Specified command could not be aborted.\n");
4544 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4545 : }
4546 0 : }
4547 :
4548 : static void
4549 0 : timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
4550 : struct spdk_nvme_qpair *qpair, uint16_t cid)
4551 : {
4552 0 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
4553 : union spdk_nvme_csts_register csts;
4554 : int rc;
4555 :
4556 0 : assert(nvme_ctrlr->ctrlr == ctrlr);
4557 :
4558 0 : SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
4559 :
4560 : /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
4561 : * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we
4562 : * would submit another fabrics cmd on the admin queue to read CSTS and check for its
4563 : * completion recursively.
4564 : */
4565 0 : if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
4566 0 : csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
4567 0 : if (csts.bits.cfs) {
4568 0 : SPDK_ERRLOG("Controller Fatal Status, reset required\n");
4569 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4570 0 : return;
4571 : }
4572 : }
4573 :
4574 0 : switch (g_opts.action_on_timeout) {
4575 0 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
4576 0 : if (qpair) {
4577 : /* Don't send abort to ctrlr when ctrlr is not available. */
4578 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4579 0 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
4580 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4581 0 : SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n");
4582 0 : return;
4583 : }
4584 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4585 :
4586 0 : rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
4587 : nvme_abort_cpl, nvme_ctrlr);
4588 0 : if (rc == 0) {
4589 0 : return;
4590 : }
4591 :
4592 0 : SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
4593 : }
4594 :
4595 : /* FALLTHROUGH */
4596 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
4597 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4598 0 : break;
4599 0 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
4600 0 : SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
4601 0 : break;
4602 0 : default:
4603 0 : SPDK_ERRLOG("An invalid timeout action value is found.\n");
4604 0 : break;
4605 : }
4606 : }
4607 :
4608 : static struct nvme_ns *
4609 51 : nvme_ns_alloc(void)
4610 : {
4611 : struct nvme_ns *nvme_ns;
4612 :
4613 51 : nvme_ns = calloc(1, sizeof(struct nvme_ns));
4614 51 : if (nvme_ns == NULL) {
4615 0 : return NULL;
4616 : }
4617 :
4618 51 : if (g_opts.io_path_stat) {
4619 0 : nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
4620 0 : if (nvme_ns->stat == NULL) {
4621 0 : free(nvme_ns);
4622 0 : return NULL;
4623 : }
4624 0 : spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
4625 : }
4626 :
4627 51 : return nvme_ns;
4628 : }
4629 :
4630 : static void
4631 51 : nvme_ns_free(struct nvme_ns *nvme_ns)
4632 : {
4633 51 : free(nvme_ns->stat);
4634 51 : free(nvme_ns);
4635 51 : }
4636 :
4637 : static void
4638 51 : nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
4639 : {
4640 51 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4641 51 : struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
4642 :
4643 51 : if (rc == 0) {
4644 49 : nvme_ns->probe_ctx = NULL;
4645 49 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4646 49 : nvme_ctrlr->ref++;
4647 49 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4648 : } else {
4649 2 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4650 2 : nvme_ns_free(nvme_ns);
4651 : }
4652 :
4653 51 : if (ctx) {
4654 50 : ctx->populates_in_progress--;
4655 50 : if (ctx->populates_in_progress == 0) {
4656 12 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
4657 : }
4658 : }
4659 51 : }
4660 :
4661 : static void
4662 2 : bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i,
4663 : struct nvme_bdev *nbdev,
4664 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
4665 : {
4666 2 : struct nvme_ns *nvme_ns = ctx;
4667 : int rc;
4668 :
4669 2 : rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
4670 2 : if (rc != 0) {
4671 0 : SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
4672 : }
4673 :
4674 2 : nvme_bdev_for_each_channel_continue(i, rc);
4675 2 : }
4676 :
4677 : static void
4678 2 : bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i,
4679 : struct nvme_bdev *nbdev,
4680 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
4681 : {
4682 2 : struct nvme_ns *nvme_ns = ctx;
4683 : struct nvme_io_path *io_path;
4684 :
4685 2 : io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns);
4686 2 : if (io_path != NULL) {
4687 2 : _bdev_nvme_delete_io_path(nbdev_ch, io_path);
4688 : }
4689 :
4690 2 : nvme_bdev_for_each_channel_continue(i, 0);
4691 2 : }
4692 :
4693 : static void
4694 0 : bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status)
4695 : {
4696 0 : struct nvme_ns *nvme_ns = ctx;
4697 :
4698 0 : nvme_ctrlr_populate_namespace_done(nvme_ns, -1);
4699 0 : }
4700 :
4701 : static void
4702 12 : bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
4703 : {
4704 12 : struct nvme_ns *nvme_ns = ctx;
4705 :
4706 12 : if (status == 0) {
4707 12 : nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
4708 : } else {
4709 : /* Delete the added io_paths and fail populating the namespace. */
4710 0 : nvme_bdev_for_each_channel(nbdev,
4711 : bdev_nvme_delete_io_path,
4712 : nvme_ns,
4713 : bdev_nvme_add_io_path_failed);
4714 : }
4715 12 : }
4716 :
4717 : static int
4718 13 : nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns)
4719 : {
4720 : struct nvme_ns *tmp_ns;
4721 : const struct spdk_nvme_ns_data *nsdata;
4722 :
4723 13 : nsdata = spdk_nvme_ns_get_data(nvme_ns->ns);
4724 13 : if (!nsdata->nmic.can_share) {
4725 0 : SPDK_ERRLOG("Namespace cannot be shared.\n");
4726 0 : return -EINVAL;
4727 : }
4728 :
4729 13 : pthread_mutex_lock(&bdev->mutex);
4730 :
4731 13 : tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list);
4732 13 : assert(tmp_ns != NULL);
4733 :
4734 13 : if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
4735 1 : pthread_mutex_unlock(&bdev->mutex);
4736 1 : SPDK_ERRLOG("Namespaces are not identical.\n");
4737 1 : return -EINVAL;
4738 : }
4739 :
4740 12 : bdev->ref++;
4741 12 : TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
4742 12 : nvme_ns->bdev = bdev;
4743 :
4744 12 : pthread_mutex_unlock(&bdev->mutex);
4745 :
4746 : /* Add nvme_io_path to nvme_bdev_channels dynamically. */
4747 12 : nvme_bdev_for_each_channel(bdev,
4748 : bdev_nvme_add_io_path,
4749 : nvme_ns,
4750 : bdev_nvme_add_io_path_done);
4751 :
4752 12 : return 0;
4753 : }
4754 :
4755 : static void
4756 51 : nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4757 : {
4758 : struct spdk_nvme_ns *ns;
4759 : struct nvme_bdev *bdev;
4760 51 : int rc = 0;
4761 :
4762 51 : ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
4763 51 : if (!ns) {
4764 0 : SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
4765 0 : rc = -EINVAL;
4766 0 : goto done;
4767 : }
4768 :
4769 51 : nvme_ns->ns = ns;
4770 51 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
4771 :
4772 51 : if (nvme_ctrlr->ana_log_page != NULL) {
4773 37 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
4774 : }
4775 :
4776 51 : bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
4777 51 : if (bdev == NULL) {
4778 38 : rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
4779 : } else {
4780 13 : rc = nvme_bdev_add_ns(bdev, nvme_ns);
4781 13 : if (rc == 0) {
4782 12 : return;
4783 : }
4784 : }
4785 1 : done:
4786 39 : nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
4787 : }
4788 :
4789 : static void
4790 49 : nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
4791 : {
4792 49 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4793 :
4794 49 : assert(nvme_ctrlr != NULL);
4795 :
4796 49 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4797 :
4798 49 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4799 :
4800 49 : if (nvme_ns->bdev != NULL) {
4801 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4802 0 : return;
4803 : }
4804 :
4805 49 : nvme_ns_free(nvme_ns);
4806 49 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4807 :
4808 49 : nvme_ctrlr_release(nvme_ctrlr);
4809 : }
4810 :
4811 : static void
4812 11 : bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
4813 : {
4814 11 : struct nvme_ns *nvme_ns = ctx;
4815 :
4816 11 : nvme_ctrlr_depopulate_namespace_done(nvme_ns);
4817 11 : }
4818 :
4819 : static void
4820 49 : nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4821 : {
4822 : struct nvme_bdev *bdev;
4823 :
4824 49 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4825 :
4826 49 : bdev = nvme_ns->bdev;
4827 49 : if (bdev != NULL) {
4828 45 : pthread_mutex_lock(&bdev->mutex);
4829 :
4830 45 : assert(bdev->ref > 0);
4831 45 : bdev->ref--;
4832 45 : if (bdev->ref == 0) {
4833 34 : pthread_mutex_unlock(&bdev->mutex);
4834 :
4835 34 : spdk_bdev_unregister(&bdev->disk, NULL, NULL);
4836 : } else {
4837 : /* spdk_bdev_unregister() is not called until the last nvme_ns is
4838 : * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
4839 : * and clear nvme_ns->bdev here.
4840 : */
4841 11 : TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq);
4842 11 : nvme_ns->bdev = NULL;
4843 :
4844 11 : pthread_mutex_unlock(&bdev->mutex);
4845 :
4846 : /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
4847 : * we call depopulate_namespace_done() to avoid use-after-free.
4848 : */
4849 11 : nvme_bdev_for_each_channel(bdev,
4850 : bdev_nvme_delete_io_path,
4851 : nvme_ns,
4852 : bdev_nvme_delete_io_path_done);
4853 11 : return;
4854 : }
4855 : }
4856 :
4857 38 : nvme_ctrlr_depopulate_namespace_done(nvme_ns);
4858 : }
4859 :
4860 : static void
4861 62 : nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
4862 : struct nvme_async_probe_ctx *ctx)
4863 : {
4864 62 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
4865 : struct nvme_ns *nvme_ns, *next;
4866 : struct spdk_nvme_ns *ns;
4867 : struct nvme_bdev *bdev;
4868 : uint32_t nsid;
4869 : int rc;
4870 : uint64_t num_sectors;
4871 :
4872 62 : if (ctx) {
4873 : /* Initialize this count to 1 to handle the populate functions
4874 : * calling nvme_ctrlr_populate_namespace_done() immediately.
4875 : */
4876 46 : ctx->populates_in_progress = 1;
4877 : }
4878 :
4879 : /* First loop over our existing namespaces and see if they have been
4880 : * removed. */
4881 62 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
4882 66 : while (nvme_ns != NULL) {
4883 4 : next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
4884 :
4885 4 : if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
4886 : /* NS is still there or added again. Its attributes may have changed. */
4887 3 : ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
4888 3 : if (nvme_ns->ns != ns) {
4889 1 : assert(nvme_ns->ns == NULL);
4890 1 : nvme_ns->ns = ns;
4891 1 : SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id);
4892 : }
4893 :
4894 3 : num_sectors = spdk_nvme_ns_get_num_sectors(ns);
4895 3 : bdev = nvme_ns->bdev;
4896 3 : assert(bdev != NULL);
4897 3 : if (bdev->disk.blockcnt != num_sectors) {
4898 1 : SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
4899 : nvme_ns->id,
4900 : bdev->disk.name,
4901 : bdev->disk.blockcnt,
4902 : num_sectors);
4903 1 : rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
4904 1 : if (rc != 0) {
4905 0 : SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
4906 : bdev->disk.name, rc);
4907 : }
4908 : }
4909 : } else {
4910 : /* Namespace was removed */
4911 1 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
4912 : }
4913 :
4914 4 : nvme_ns = next;
4915 : }
4916 :
4917 : /* Loop through all of the namespaces at the nvme level and see if any of them are new */
4918 62 : nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
4919 116 : while (nsid != 0) {
4920 54 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
4921 :
4922 54 : if (nvme_ns == NULL) {
4923 : /* Found a new one */
4924 51 : nvme_ns = nvme_ns_alloc();
4925 51 : if (nvme_ns == NULL) {
4926 0 : SPDK_ERRLOG("Failed to allocate namespace\n");
4927 : /* This just fails to attach the namespace. It may work on a future attempt. */
4928 0 : continue;
4929 : }
4930 :
4931 51 : nvme_ns->id = nsid;
4932 51 : nvme_ns->ctrlr = nvme_ctrlr;
4933 :
4934 51 : nvme_ns->bdev = NULL;
4935 :
4936 51 : if (ctx) {
4937 50 : ctx->populates_in_progress++;
4938 : }
4939 51 : nvme_ns->probe_ctx = ctx;
4940 :
4941 51 : RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4942 :
4943 51 : nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
4944 : }
4945 :
4946 54 : nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
4947 : }
4948 :
4949 62 : if (ctx) {
4950 : /* Decrement this count now that the loop is over to account
4951 : * for the one we started with. If the count is then 0, we
4952 : * know any populate_namespace functions completed immediately,
4953 : * so we'll kick the callback here.
4954 : */
4955 46 : ctx->populates_in_progress--;
4956 46 : if (ctx->populates_in_progress == 0) {
4957 34 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
4958 : }
4959 : }
4960 :
4961 62 : }
4962 :
4963 : static void
4964 61 : nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
4965 : {
4966 : struct nvme_ns *nvme_ns, *tmp;
4967 :
4968 109 : RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
4969 48 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
4970 : }
4971 61 : }
4972 :
4973 : static uint32_t
4974 36 : nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr)
4975 : {
4976 36 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
4977 : const struct spdk_nvme_ctrlr_data *cdata;
4978 36 : uint32_t nsid, ns_count = 0;
4979 :
4980 36 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
4981 :
4982 36 : for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
4983 80 : nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
4984 44 : ns_count++;
4985 : }
4986 :
4987 36 : return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
4988 36 : sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count *
4989 : sizeof(uint32_t);
4990 : }
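/* Worked example (hypothetical controller, sizes taken from the NVMe ANA log
 * layout: 16-byte log header, 32-byte group descriptor, 4-byte NSID entries):
 * with cdata->nanagrpid == 2 and 4 active namespaces the buffer must hold
 * 16 + 2 * 32 + 4 * 4 = 96 bytes. The size is computed from nanagrpid (the
 * maximum number of ANA groups), not the number currently reported, so it is
 * an upper bound.
 */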
4991 :
4992 : static int
4993 7 : nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
4994 : void *cb_arg)
4995 : {
4996 7 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
4997 : struct nvme_ns *nvme_ns;
4998 : uint32_t i, nsid;
4999 :
5000 13 : for (i = 0; i < desc->num_of_nsid; i++) {
5001 6 : nsid = desc->nsid[i];
5002 6 : if (nsid == 0) {
5003 0 : continue;
5004 : }
5005 :
5006 6 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
5007 :
5008 6 : if (nvme_ns == NULL) {
5009 : /* Target told us that an inactive namespace had an ANA change */
5010 1 : continue;
5011 : }
5012 :
5013 5 : _nvme_ns_set_ana_state(nvme_ns, desc);
5014 : }
5015 :
5016 7 : return 0;
5017 : }
5018 :
5019 : static void
5020 0 : bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5021 : {
5022 : struct nvme_ns *nvme_ns;
5023 :
5024 0 : spdk_free(nvme_ctrlr->ana_log_page);
5025 0 : nvme_ctrlr->ana_log_page = NULL;
5026 :
5027 0 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
5028 0 : nvme_ns != NULL;
5029 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
5030 0 : nvme_ns->ana_state_updating = false;
5031 0 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
5032 : }
5033 0 : }
5034 :
5035 : static void
5036 3 : nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
5037 : {
5038 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
5039 :
5040 3 : if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
5041 3 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
5042 : nvme_ctrlr);
5043 : } else {
5044 0 : bdev_nvme_disable_read_ana_log_page(nvme_ctrlr);
5045 : }
5046 :
5047 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5048 :
5049 3 : assert(nvme_ctrlr->ana_log_page_updating == true);
5050 3 : nvme_ctrlr->ana_log_page_updating = false;
5051 :
5052 3 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
5053 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5054 :
5055 0 : nvme_ctrlr_unregister(nvme_ctrlr);
5056 : } else {
5057 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5058 :
5059 3 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
5060 : }
5061 3 : }
5062 :
5063 : static int
5064 6 : nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5065 : {
5066 : uint32_t ana_log_page_size;
5067 : int rc;
5068 :
5069 6 : if (nvme_ctrlr->ana_log_page == NULL) {
5070 0 : return -EINVAL;
5071 : }
5072 :
5073 6 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5074 :
5075 6 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5076 0 : SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5077 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5078 0 : return -EINVAL;
5079 : }
5080 :
5081 6 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5082 6 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
5083 : nvme_ctrlr->ana_log_page_updating) {
5084 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5085 3 : return -EBUSY;
5086 : }
5087 :
5088 3 : nvme_ctrlr->ana_log_page_updating = true;
5089 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5090 :
5091 3 : rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
5092 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5093 : SPDK_NVME_GLOBAL_NS_TAG,
5094 3 : nvme_ctrlr->ana_log_page,
5095 : ana_log_page_size, 0,
5096 : nvme_ctrlr_read_ana_log_page_done,
5097 : nvme_ctrlr);
5098 3 : if (rc != 0) {
5099 0 : nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
5100 : }
5101 :
5102 3 : return rc;
5103 : }
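/* Annotation: the log is read into the ana_log_page buffer that was allocated
 * up front from max_ana_log_page_size, which is why a larger computed size is
 * rejected with -EINVAL. -EBUSY means the controller is unavailable or another
 * read is already in flight, and the update is simply skipped in that case.
 */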
5104 :
5105 : static void
5106 0 : dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
5107 : {
5108 0 : }
5109 :
5110 : struct bdev_nvme_set_preferred_path_ctx {
5111 : struct spdk_bdev_desc *desc;
5112 : struct nvme_ns *nvme_ns;
5113 : bdev_nvme_set_preferred_path_cb cb_fn;
5114 : void *cb_arg;
5115 : };
5116 :
5117 : static void
5118 3 : bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5119 : {
5120 3 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5121 :
5122 3 : assert(ctx != NULL);
5123 3 : assert(ctx->desc != NULL);
5124 3 : assert(ctx->cb_fn != NULL);
5125 :
5126 3 : spdk_bdev_close(ctx->desc);
5127 :
5128 3 : ctx->cb_fn(ctx->cb_arg, status);
5129 :
5130 3 : free(ctx);
5131 3 : }
5132 :
5133 : static void
5134 2 : _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i,
5135 : struct nvme_bdev *nbdev,
5136 : struct nvme_bdev_channel *nbdev_ch, void *_ctx)
5137 : {
5138 2 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5139 : struct nvme_io_path *io_path, *prev;
5140 :
5141 2 : prev = NULL;
5142 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5143 3 : if (io_path->nvme_ns == ctx->nvme_ns) {
5144 2 : break;
5145 : }
5146 1 : prev = io_path;
5147 : }
5148 :
5149 2 : if (io_path != NULL) {
5150 2 : if (prev != NULL) {
5151 1 : STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq);
5152 1 : STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq);
5153 : }
5154 :
5155 : /* We could set nbdev_ch->current_io_path to io_path directly here, but
5156 : * that assignment would need to be conditional. To simplify the code,
5157 : * just clear nbdev_ch->current_io_path and let find_io_path()
5158 : * fill it.
5159 : *
5160 : * Automatic failback may be disabled. Hence, even if the io_path is
5161 : * already at the head, clear nbdev_ch->current_io_path.
5162 : */
5163 2 : bdev_nvme_clear_current_io_path(nbdev_ch);
5164 : }
5165 :
5166 2 : nvme_bdev_for_each_channel_continue(i, 0);
5167 2 : }
5168 :
5169 : static struct nvme_ns *
5170 3 : bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid)
5171 : {
5172 : struct nvme_ns *nvme_ns, *prev;
5173 : const struct spdk_nvme_ctrlr_data *cdata;
5174 :
5175 3 : prev = NULL;
5176 6 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
5177 6 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
5178 :
5179 6 : if (cdata->cntlid == cntlid) {
5180 3 : break;
5181 : }
5182 3 : prev = nvme_ns;
5183 : }
5184 :
5185 3 : if (nvme_ns != NULL && prev != NULL) {
5186 2 : TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq);
5187 2 : TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq);
5188 : }
5189 :
5190 3 : return nvme_ns;
5191 : }
5192 :
5193 : /* This function supports only multipath mode. There is only a single I/O path
5194 : * for each NVMe-oF controller. Hence, just move the matched I/O path to the
5195 : * head of the I/O path list for each NVMe bdev channel.
5196 : *
5197 : * An NVMe bdev channel may be acquired after this function completes, so also
5198 : * move the matched namespace to the head of the namespace list for the NVMe bdev.
5199 : */
5200 : void
5201 3 : bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
5202 : bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg)
5203 : {
5204 : struct bdev_nvme_set_preferred_path_ctx *ctx;
5205 : struct spdk_bdev *bdev;
5206 : struct nvme_bdev *nbdev;
5207 3 : int rc = 0;
5208 :
5209 3 : assert(cb_fn != NULL);
5210 :
5211 3 : ctx = calloc(1, sizeof(*ctx));
5212 3 : if (ctx == NULL) {
5213 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5214 0 : rc = -ENOMEM;
5215 0 : goto err_alloc;
5216 : }
5217 :
5218 3 : ctx->cb_fn = cb_fn;
5219 3 : ctx->cb_arg = cb_arg;
5220 :
5221 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5222 3 : if (rc != 0) {
5223 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5224 0 : goto err_open;
5225 : }
5226 :
5227 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5228 :
5229 3 : if (bdev->module != &nvme_if) {
5230 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5231 0 : rc = -ENODEV;
5232 0 : goto err_bdev;
5233 : }
5234 :
5235 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5236 :
5237 3 : pthread_mutex_lock(&nbdev->mutex);
5238 :
5239 3 : ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid);
5240 3 : if (ctx->nvme_ns == NULL) {
5241 0 : pthread_mutex_unlock(&nbdev->mutex);
5242 :
5243 0 : SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid);
5244 0 : rc = -ENODEV;
5245 0 : goto err_bdev;
5246 : }
5247 :
5248 3 : pthread_mutex_unlock(&nbdev->mutex);
5249 :
5250 3 : nvme_bdev_for_each_channel(nbdev,
5251 : _bdev_nvme_set_preferred_path,
5252 : ctx,
5253 : bdev_nvme_set_preferred_path_done);
5254 3 : return;
5255 :
5256 0 : err_bdev:
5257 0 : spdk_bdev_close(ctx->desc);
5258 0 : err_open:
5259 0 : free(ctx);
5260 0 : err_alloc:
5261 0 : cb_fn(cb_arg, rc);
5262 : }
5263 :
5264 : struct bdev_nvme_set_multipath_policy_ctx {
5265 : struct spdk_bdev_desc *desc;
5266 : spdk_bdev_nvme_set_multipath_policy_cb cb_fn;
5267 : void *cb_arg;
5268 : };
5269 :
5270 : static void
5271 3 : bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5272 : {
5273 3 : struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx;
5274 :
5275 3 : assert(ctx != NULL);
5276 3 : assert(ctx->desc != NULL);
5277 3 : assert(ctx->cb_fn != NULL);
5278 :
5279 3 : spdk_bdev_close(ctx->desc);
5280 :
5281 3 : ctx->cb_fn(ctx->cb_arg, status);
5282 :
5283 3 : free(ctx);
5284 3 : }
5285 :
5286 : static void
5287 1 : _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i,
5288 : struct nvme_bdev *nbdev,
5289 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
5290 : {
5291 1 : nbdev_ch->mp_policy = nbdev->mp_policy;
5292 1 : nbdev_ch->mp_selector = nbdev->mp_selector;
5293 1 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
5294 1 : bdev_nvme_clear_current_io_path(nbdev_ch);
5295 :
5296 1 : nvme_bdev_for_each_channel_continue(i, 0);
5297 1 : }
5298 :
5299 : void
5300 3 : spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy,
5301 : enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io,
5302 : spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
5303 : {
5304 : struct bdev_nvme_set_multipath_policy_ctx *ctx;
5305 : struct spdk_bdev *bdev;
5306 : struct nvme_bdev *nbdev;
5307 : int rc;
5308 :
5309 3 : assert(cb_fn != NULL);
5310 :
5311 3 : switch (policy) {
5312 1 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
5313 1 : break;
5314 2 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
5315 : switch (selector) {
5316 1 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
5317 1 : if (rr_min_io == UINT32_MAX) {
5318 0 : rr_min_io = 1;
5319 1 : } else if (rr_min_io == 0) {
5320 0 : rc = -EINVAL;
5321 0 : goto exit;
5322 : }
5323 1 : break;
5324 1 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
5325 1 : break;
5326 0 : default:
5327 0 : rc = -EINVAL;
5328 0 : goto exit;
5329 : }
5330 2 : break;
5331 0 : default:
5332 0 : rc = -EINVAL;
5333 0 : goto exit;
5334 : }
5335 :
5336 3 : ctx = calloc(1, sizeof(*ctx));
5337 3 : if (ctx == NULL) {
5338 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5339 0 : rc = -ENOMEM;
5340 0 : goto exit;
5341 : }
5342 :
5343 3 : ctx->cb_fn = cb_fn;
5344 3 : ctx->cb_arg = cb_arg;
5345 :
5346 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5347 3 : if (rc != 0) {
5348 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5349 0 : rc = -ENODEV;
5350 0 : goto err_open;
5351 : }
5352 :
5353 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5354 3 : if (bdev->module != &nvme_if) {
5355 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5356 0 : rc = -ENODEV;
5357 0 : goto err_module;
5358 : }
5359 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5360 :
5361 3 : pthread_mutex_lock(&nbdev->mutex);
5362 3 : nbdev->mp_policy = policy;
5363 3 : nbdev->mp_selector = selector;
5364 3 : nbdev->rr_min_io = rr_min_io;
5365 3 : pthread_mutex_unlock(&nbdev->mutex);
5366 :
5367 3 : nvme_bdev_for_each_channel(nbdev,
5368 : _bdev_nvme_set_multipath_policy,
5369 : ctx,
5370 : bdev_nvme_set_multipath_policy_done);
5371 3 : return;
5372 :
5373 0 : err_module:
5374 0 : spdk_bdev_close(ctx->desc);
5375 0 : err_open:
5376 0 : free(ctx);
5377 0 : exit:
5378 0 : cb_fn(cb_arg, rc);
5379 : }
5380 :
5381 : static void
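 : /* Asynchronous Event Request completion callback. A namespace attribute
 : * change triggers namespace repopulation; an ANA change triggers a re-read
 : * of the ANA log page.
 : */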
5382 3 : aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
5383 : {
5384 3 : struct nvme_ctrlr *nvme_ctrlr = arg;
5385 : union spdk_nvme_async_event_completion event;
5386 :
5387 3 : if (spdk_nvme_cpl_is_error(cpl)) {
5388 0 : SPDK_WARNLOG("AER request execution failed\n");
5389 0 : return;
5390 : }
5391 :
5392 3 : event.raw = cpl->cdw0;
5393 3 : if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5394 3 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
5395 2 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
5396 1 : } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5397 1 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
5398 1 : nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
5399 : }
5400 : }
5401 :
5402 : static void
5403 52 : free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx)
5404 : {
5405 52 : spdk_keyring_put_key(ctx->drv_opts.tls_psk);
5406 52 : spdk_keyring_put_key(ctx->drv_opts.dhchap_key);
5407 52 : spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key);
5408 52 : free(ctx);
5409 52 : }
5410 :
5411 : static void
5412 52 : populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc)
5413 : {
5414 52 : if (ctx->cb_fn) {
5415 52 : ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc);
5416 : }
5417 :
5418 52 : ctx->namespaces_populated = true;
5419 52 : if (ctx->probe_done) {
5420 : /* The probe was already completed, so we need to free the context
5421 : * here. This can happen for cases like OCSSD, where we need to
5422 : * send additional commands to the SSD after attach.
5423 : */
5424 31 : free_nvme_async_probe_ctx(ctx);
5425 : }
5426 52 : }
5427 :
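 : /* Poller that detects removal of attached PCIe controllers via
 : * spdk_nvme_scan_attached(). It unregisters itself once no NVMe bdev
 : * controllers remain.
 : */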
5428 : static int
5429 19 : bdev_nvme_remove_poller(void *ctx)
5430 : {
5431 19 : struct spdk_nvme_transport_id trid_pcie;
5432 :
5433 19 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
5434 1 : spdk_poller_unregister(&g_hotplug_poller);
5435 1 : return SPDK_POLLER_IDLE;
5436 : }
5437 :
5438 18 : memset(&trid_pcie, 0, sizeof(trid_pcie));
5439 18 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
5440 :
5441 18 : if (spdk_nvme_scan_attached(&trid_pcie)) {
5442 0 : SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n");
5443 : }
5444 :
5445 18 : return SPDK_POLLER_BUSY;
5446 : }
5447 :
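 : /* Final step of controller creation: register the per-controller io_device,
 : * populate its namespaces, and start the remove poller if it is not running.
 : */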
5448 : static void
5449 60 : nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
5450 : struct nvme_async_probe_ctx *ctx)
5451 : {
5452 60 : spdk_io_device_register(nvme_ctrlr,
5453 : bdev_nvme_create_ctrlr_channel_cb,
5454 : bdev_nvme_destroy_ctrlr_channel_cb,
5455 : sizeof(struct nvme_ctrlr_channel),
5456 60 : nvme_ctrlr->nbdev_ctrlr->name);
5457 :
5458 60 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
5459 :
5460 60 : if (g_hotplug_poller == NULL) {
5461 2 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
5462 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
5463 : }
5464 60 : }
5465 :
5466 : static void
5467 30 : nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
5468 : {
5469 30 : struct nvme_ctrlr *nvme_ctrlr = _ctx;
5470 30 : struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
5471 :
5472 30 : nvme_ctrlr->probe_ctx = NULL;
5473 :
5474 30 : if (spdk_nvme_cpl_is_error(cpl)) {
5475 0 : nvme_ctrlr_delete(nvme_ctrlr);
5476 :
5477 0 : if (ctx != NULL) {
5478 0 : ctx->reported_bdevs = 0;
5479 0 : populate_namespaces_cb(ctx, -1);
5480 : }
5481 0 : return;
5482 : }
5483 :
5484 30 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
5485 : }
5486 :
5487 : static int
5488 30 : nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
5489 : struct nvme_async_probe_ctx *ctx)
5490 : {
5491 30 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5492 : const struct spdk_nvme_ctrlr_data *cdata;
5493 : uint32_t ana_log_page_size;
5494 :
5495 30 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5496 :
5497 : /* Set the buffer size large enough to include the maximum number of allowed namespaces. */
5498 30 : ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
5499 30 : sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan *
5500 : sizeof(uint32_t);
5501 :
5502 30 : nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
5503 : SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
5504 30 : if (nvme_ctrlr->ana_log_page == NULL) {
5505 0 : SPDK_ERRLOG("could not allocate ANA log page buffer\n");
5506 0 : return -ENXIO;
5507 : }
5508 :
5509 : /* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned.
5510 : * Hence copy each descriptor to a temporary area when parsing it.
5511 : *
5512 : * Allocate a buffer whose size is as large as ANA log page buffer because
5513 : * we do not know the size of a descriptor until actually reading it.
5514 : */
5515 30 : nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
5516 30 : if (nvme_ctrlr->copied_ana_desc == NULL) {
5517 0 : SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
5518 0 : return -ENOMEM;
5519 : }
5520 :
5521 30 : nvme_ctrlr->max_ana_log_page_size = ana_log_page_size;
5522 :
5523 30 : nvme_ctrlr->probe_ctx = ctx;
5524 :
5525 : /* Then, set the read size to include only the currently active namespaces. */
5526 30 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5527 :
5528 30 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5529 0 : SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5530 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5531 0 : return -EINVAL;
5532 : }
5533 :
5534 30 : return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
5535 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5536 : SPDK_NVME_GLOBAL_NS_TAG,
5537 30 : nvme_ctrlr->ana_log_page,
5538 : ana_log_page_size, 0,
5539 : nvme_ctrlr_init_ana_log_page_done,
5540 : nvme_ctrlr);
5541 : }
5542 :
5543 : /* hostnqn and subnqn were already verified before attaching a controller.
5544 : * Hence check only the multipath capability and cntlid here.
5545 : */
5546 : static bool
5547 16 : bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
5548 : {
5549 : struct nvme_ctrlr *tmp;
5550 : const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
5551 :
5552 16 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5553 :
5554 16 : if (!cdata->cmic.multi_ctrlr) {
5555 0 : SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
5556 0 : return false;
5557 : }
5558 :
5559 33 : TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
5560 18 : tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
5561 :
5562 18 : if (!tmp_cdata->cmic.multi_ctrlr) {
5563 0 : SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
5564 0 : return false;
5565 : }
5566 18 : if (cdata->cntlid == tmp_cdata->cntlid) {
5567 1 : SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid);
5568 1 : return false;
5569 : }
5570 : }
5571 :
5572 15 : return true;
5573 : }
5574 :
5575 1 : SPDK_LOG_DEPRECATION_REGISTER(multipath_config,
5576 : "bdev_nvme_attach_controller.multipath configuration mismatch", "v25.01", 0);
5577 :
5578 : static int
5579 61 : nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
5580 : {
5581 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
5582 61 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5583 : struct nvme_ctrlr *nctrlr;
5584 61 : int rc = 0;
5585 :
5586 61 : pthread_mutex_lock(&g_bdev_nvme_mutex);
5587 :
5588 61 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
5589 61 : if (nbdev_ctrlr != NULL) {
5590 16 : if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
5591 1 : rc = -EINVAL;
5592 1 : goto exit;
5593 : }
5594 32 : TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
5595 17 : if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) {
5596 : /* All controllers created with the same name must be configured either
5597 : * for multipath or for failover. Otherwise we have a configuration mismatch.
5598 : * While this is currently still supported, support for configurations in which
5599 : * some controllers with the same name are configured for multipath while others
5600 : * are configured for failover will be removed in release 25.01.
5601 : * Default mode change: starting from SPDK 25.01, if the user does not provide
5602 : * the '-x <mode>' parameter in the bdev_nvme_attach_controller RPC call, the
5603 : * default mode assigned to the controller will be 'multipath'.
5604 : */
5605 0 : SPDK_LOG_DEPRECATED(multipath_config);
5606 0 : break;
5607 : }
5608 : }
5609 : } else {
5610 45 : nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
5611 45 : if (nbdev_ctrlr == NULL) {
5612 0 : SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
5613 0 : rc = -ENOMEM;
5614 0 : goto exit;
5615 : }
5616 45 : nbdev_ctrlr->name = strdup(name);
5617 45 : if (nbdev_ctrlr->name == NULL) {
5618 0 : SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
5619 0 : free(nbdev_ctrlr);
 : rc = -ENOMEM;
5620 0 : goto exit;
5621 : }
5622 45 : TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
5623 45 : TAILQ_INIT(&nbdev_ctrlr->bdevs);
5624 45 : TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
5625 : }
5626 60 : nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
5627 60 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
5628 61 : exit:
5629 61 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
5630 61 : return rc;
5631 : }
5632 :
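 : /* Allocate and initialize an nvme_ctrlr for a newly attached spdk_nvme_ctrlr:
 : * take key references, set up the path list, register pollers and callbacks,
 : * add the controller to its nvme_bdev_ctrlr, and read the ANA log page first
 : * if the controller reports ANA support.
 : */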
5633 : static int
5634 61 : nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
5635 : const char *name,
5636 : const struct spdk_nvme_transport_id *trid,
5637 : struct nvme_async_probe_ctx *ctx)
5638 : {
5639 : struct nvme_ctrlr *nvme_ctrlr;
5640 : struct nvme_path_id *path_id;
5641 : const struct spdk_nvme_ctrlr_data *cdata;
5642 : int rc;
5643 :
5644 61 : nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
5645 61 : if (nvme_ctrlr == NULL) {
5646 0 : SPDK_ERRLOG("Failed to allocate device struct\n");
5647 0 : return -ENOMEM;
5648 : }
5649 :
5650 61 : rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
5651 61 : if (rc != 0) {
5652 0 : free(nvme_ctrlr);
5653 0 : return rc;
5654 : }
5655 :
5656 61 : TAILQ_INIT(&nvme_ctrlr->trids);
5657 61 : RB_INIT(&nvme_ctrlr->namespaces);
5658 :
5659 : /* Get another reference to the key, so the first one can be released from probe_ctx */
5660 61 : if (ctx != NULL) {
5661 47 : if (ctx->drv_opts.tls_psk != NULL) {
5662 0 : nvme_ctrlr->psk = spdk_keyring_get_key(
5663 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5664 0 : if (nvme_ctrlr->psk == NULL) {
5665 : /* Could only happen if the key was removed in the meantime */
5666 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5667 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5668 0 : rc = -ENOKEY;
5669 0 : goto err;
5670 : }
5671 : }
5672 :
5673 47 : if (ctx->drv_opts.dhchap_key != NULL) {
5674 0 : nvme_ctrlr->dhchap_key = spdk_keyring_get_key(
5675 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5676 0 : if (nvme_ctrlr->dhchap_key == NULL) {
5677 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5678 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5679 0 : rc = -ENOKEY;
5680 0 : goto err;
5681 : }
5682 : }
5683 :
5684 47 : if (ctx->drv_opts.dhchap_ctrlr_key != NULL) {
5685 0 : nvme_ctrlr->dhchap_ctrlr_key =
5686 0 : spdk_keyring_get_key(
5687 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5688 0 : if (nvme_ctrlr->dhchap_ctrlr_key == NULL) {
5689 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5690 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5691 0 : rc = -ENOKEY;
5692 0 : goto err;
5693 : }
5694 : }
5695 : }
5696 :
5697 61 : path_id = calloc(1, sizeof(*path_id));
5698 61 : if (path_id == NULL) {
5699 0 : SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
5700 0 : rc = -ENOMEM;
5701 0 : goto err;
5702 : }
5703 :
5704 61 : path_id->trid = *trid;
5705 61 : if (ctx != NULL) {
5706 47 : memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
5707 47 : memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
5708 : }
5709 61 : nvme_ctrlr->active_path_id = path_id;
5710 61 : TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
5711 :
5712 61 : nvme_ctrlr->thread = spdk_get_thread();
5713 61 : nvme_ctrlr->ctrlr = ctrlr;
5714 61 : nvme_ctrlr->ref = 1;
5715 :
5716 61 : if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
5717 0 : SPDK_ERRLOG("OCSSDs are not supported");
5718 0 : rc = -ENOTSUP;
5719 0 : goto err;
5720 : }
5721 :
5722 61 : if (ctx != NULL) {
5723 47 : memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts));
5724 : } else {
5725 14 : spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts);
5726 : }
5727 :
5728 61 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
5729 : g_opts.nvme_adminq_poll_period_us);
5730 :
5731 61 : if (g_opts.timeout_us > 0) {
5732 : /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
5733 : /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
5734 0 : uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
5735 0 : g_opts.timeout_us : g_opts.timeout_admin_us;
5736 0 : spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
5737 : adm_timeout_us, timeout_cb, nvme_ctrlr);
5738 : }
5739 :
5740 61 : spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
5741 61 : spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
5742 :
5743 61 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
5744 : SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
5745 0 : nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
5746 : }
5747 :
5748 61 : rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
5749 61 : if (rc != 0) {
5750 1 : goto err;
5751 : }
5752 :
5753 60 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5754 :
5755 60 : if (cdata->cmic.ana_reporting) {
5756 30 : rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
5757 30 : if (rc == 0) {
5758 30 : return 0;
5759 : }
5760 : } else {
5761 30 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
5762 30 : return 0;
5763 : }
5764 :
5765 1 : err:
5766 1 : nvme_ctrlr_delete(nvme_ctrlr);
5767 1 : return rc;
5768 : }
5769 :
5770 : void
5771 56 : spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts)
5772 : {
5773 56 : opts->prchk_flags = 0;
5774 56 : opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec;
5775 56 : opts->reconnect_delay_sec = g_opts.reconnect_delay_sec;
5776 56 : opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec;
5777 56 : opts->multipath = false;
5778 56 : }
5779 :
5780 : static void
5781 0 : attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
5782 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts)
5783 : {
5784 : char *name;
5785 :
5786 0 : name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
5787 0 : if (!name) {
5788 0 : SPDK_ERRLOG("Failed to assign name to NVMe device\n");
5789 0 : return;
5790 : }
5791 :
5792 0 : if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) {
5793 0 : SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
5794 : } else {
5795 0 : SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name);
5796 : }
5797 :
5798 0 : free(name);
5799 : }
5800 :
5801 : static void
5802 60 : _nvme_ctrlr_destruct(void *ctx)
5803 : {
5804 60 : struct nvme_ctrlr *nvme_ctrlr = ctx;
5805 :
5806 60 : nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
5807 60 : nvme_ctrlr_release(nvme_ctrlr);
5808 60 : }
5809 :
5810 : static int
5811 57 : bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
5812 : {
5813 : struct nvme_probe_skip_entry *entry;
5814 :
5815 : /* The controller's destruction was already started */
5816 57 : if (nvme_ctrlr->destruct) {
5817 0 : return -EALREADY;
5818 : }
5819 :
5820 57 : if (!hotplug &&
5821 57 : nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
5822 0 : entry = calloc(1, sizeof(*entry));
5823 0 : if (!entry) {
5824 0 : return -ENOMEM;
5825 : }
5826 0 : entry->trid = nvme_ctrlr->active_path_id->trid;
5827 0 : TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
5828 : }
5829 :
5830 57 : nvme_ctrlr->destruct = true;
5831 57 : return 0;
5832 : }
5833 :
5834 : static int
5835 2 : bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
5836 : {
5837 : int rc;
5838 :
5839 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5840 2 : rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug);
5841 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5842 :
5843 2 : if (rc == 0) {
5844 2 : _nvme_ctrlr_destruct(nvme_ctrlr);
5845 0 : } else if (rc == -EALREADY) {
5846 0 : rc = 0;
5847 : }
5848 :
5849 2 : return rc;
5850 : }
5851 :
5852 : static void
5853 0 : remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
5854 : {
5855 0 : struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
5856 :
5857 0 : bdev_nvme_delete_ctrlr(nvme_ctrlr, true);
5858 0 : }
5859 :
5860 : static int
5861 0 : bdev_nvme_hotplug_probe(void *arg)
5862 : {
5863 0 : if (g_hotplug_probe_ctx == NULL) {
5864 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
5865 0 : return SPDK_POLLER_IDLE;
5866 : }
5867 :
5868 0 : if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
5869 0 : g_hotplug_probe_ctx = NULL;
5870 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
5871 : }
5872 :
5873 0 : return SPDK_POLLER_BUSY;
5874 : }
5875 :
5876 : static int
5877 0 : bdev_nvme_hotplug(void *arg)
5878 : {
5879 0 : struct spdk_nvme_transport_id trid_pcie;
5880 :
5881 0 : if (g_hotplug_probe_ctx) {
5882 0 : return SPDK_POLLER_BUSY;
5883 : }
5884 :
5885 0 : memset(&trid_pcie, 0, sizeof(trid_pcie));
5886 0 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
5887 :
5888 0 : g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
5889 : hotplug_probe_cb, attach_cb, NULL);
5890 :
5891 0 : if (g_hotplug_probe_ctx) {
5892 0 : assert(g_hotplug_probe_poller == NULL);
5893 0 : g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
5894 : }
5895 :
5896 0 : return SPDK_POLLER_BUSY;
5897 : }
5898 :
5899 : void
5900 0 : bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
5901 : {
5902 0 : *opts = g_opts;
5903 0 : }
5904 :
5905 : static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
5906 : uint32_t reconnect_delay_sec,
5907 : uint32_t fast_io_fail_timeout_sec);
5908 :
5909 : static int
5910 0 : bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
5911 : {
5912 0 : if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
5913 : /* Can't set timeout_admin_us without also setting timeout_us */
5914 0 : SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
5915 0 : return -EINVAL;
5916 : }
5917 :
5918 0 : if (opts->bdev_retry_count < -1) {
5919 0 : SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
5920 0 : return -EINVAL;
5921 : }
5922 :
5923 0 : if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec,
5924 0 : opts->reconnect_delay_sec,
5925 0 : opts->fast_io_fail_timeout_sec)) {
5926 0 : return -EINVAL;
5927 : }
5928 :
5929 0 : return 0;
5930 : }
5931 :
5932 : int
5933 0 : bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
5934 : {
5935 : int ret;
5936 :
5937 0 : ret = bdev_nvme_validate_opts(opts);
5938 0 : if (ret) {
5939 0 : SPDK_WARNLOG("Failed to set nvme opts.\n");
5940 0 : return ret;
5941 : }
5942 :
5943 0 : if (g_bdev_nvme_init_thread != NULL) {
5944 0 : if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
5945 0 : return -EPERM;
5946 : }
5947 : }
5948 :
5949 0 : if (opts->rdma_srq_size != 0 ||
5950 0 : opts->rdma_max_cq_size != 0 ||
5951 0 : opts->rdma_cm_event_timeout_ms != 0) {
5952 0 : struct spdk_nvme_transport_opts drv_opts;
5953 :
5954 0 : spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts));
5955 0 : if (opts->rdma_srq_size != 0) {
5956 0 : drv_opts.rdma_srq_size = opts->rdma_srq_size;
5957 : }
5958 0 : if (opts->rdma_max_cq_size != 0) {
5959 0 : drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size;
5960 : }
5961 0 : if (opts->rdma_cm_event_timeout_ms != 0) {
5962 0 : drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms;
5963 : }
5964 :
5965 0 : ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts));
5966 0 : if (ret) {
5967 0 : SPDK_ERRLOG("Failed to set NVMe transport opts.\n");
5968 0 : return ret;
5969 : }
5970 : }
5971 :
5972 0 : g_opts = *opts;
5973 :
5974 0 : return 0;
5975 : }
5976 :
5977 : struct set_nvme_hotplug_ctx {
5978 : uint64_t period_us;
5979 : bool enabled;
5980 : spdk_msg_fn fn;
5981 : void *fn_ctx;
5982 : };
5983 :
5984 : static void
5985 0 : set_nvme_hotplug_period_cb(void *_ctx)
5986 : {
5987 0 : struct set_nvme_hotplug_ctx *ctx = _ctx;
5988 :
5989 0 : spdk_poller_unregister(&g_hotplug_poller);
5990 0 : if (ctx->enabled) {
5991 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
5992 : } else {
5993 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
5994 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
5995 : }
5996 :
5997 0 : g_nvme_hotplug_poll_period_us = ctx->period_us;
5998 0 : g_nvme_hotplug_enabled = ctx->enabled;
5999 0 : if (ctx->fn) {
6000 0 : ctx->fn(ctx->fn_ctx);
6001 : }
6002 :
6003 0 : free(ctx);
6004 0 : }
6005 :
6006 : int
6007 0 : bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
6008 : {
6009 : struct set_nvme_hotplug_ctx *ctx;
6010 :
6011 0 : if (enabled == true && !spdk_process_is_primary()) {
6012 0 : return -EPERM;
6013 : }
6014 :
6015 0 : ctx = calloc(1, sizeof(*ctx));
6016 0 : if (ctx == NULL) {
6017 0 : return -ENOMEM;
6018 : }
6019 :
6020 0 : period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
6021 0 : ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
6022 0 : ctx->enabled = enabled;
6023 0 : ctx->fn = cb;
6024 0 : ctx->fn_ctx = cb_ctx;
6025 :
6026 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
6027 0 : return 0;
6028 : }
6029 :
6030 : static void
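 : /* Collect the names of the bdevs created for this controller's active
 : * namespaces and report them back through the probe context.
 : */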
6031 46 : nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
6032 : struct nvme_async_probe_ctx *ctx)
6033 : {
6034 : struct nvme_ns *nvme_ns;
6035 : struct nvme_bdev *nvme_bdev;
6036 : size_t j;
6037 :
6038 46 : assert(nvme_ctrlr != NULL);
6039 :
6040 46 : if (ctx->names == NULL) {
6041 0 : ctx->reported_bdevs = 0;
6042 0 : populate_namespaces_cb(ctx, 0);
6043 0 : return;
6044 : }
6045 :
6046 : /*
6047 : * Report the new bdevs that were created in this call.
6048 : * There can be more than one bdev per NVMe controller.
6049 : */
6050 46 : j = 0;
6051 46 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6052 94 : while (nvme_ns != NULL) {
6053 48 : nvme_bdev = nvme_ns->bdev;
6054 48 : if (j < ctx->max_bdevs) {
6055 48 : ctx->names[j] = nvme_bdev->disk.name;
6056 48 : j++;
6057 : } else {
6058 0 : SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n",
6059 : ctx->max_bdevs);
6060 0 : ctx->reported_bdevs = 0;
6061 0 : populate_namespaces_cb(ctx, -ERANGE);
6062 0 : return;
6063 : }
6064 :
6065 48 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6066 : }
6067 :
6068 46 : ctx->reported_bdevs = j;
6069 46 : populate_namespaces_cb(ctx, 0);
6070 : }
6071 :
6072 : static int
6073 9 : bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6074 : struct spdk_nvme_ctrlr *new_ctrlr,
6075 : struct spdk_nvme_transport_id *trid)
6076 : {
6077 : struct nvme_path_id *tmp_trid;
6078 :
6079 9 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6080 0 : SPDK_ERRLOG("PCIe failover is not supported.\n");
6081 0 : return -ENOTSUP;
6082 : }
6083 :
6084 : /* Currently we only support failover to the same transport type. */
6085 9 : if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
6086 0 : SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n",
6087 : spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
6088 : spdk_nvme_transport_id_trtype_str(trid->trtype));
6089 0 : return -EINVAL;
6090 : }
6091 :
6092 :
6093 : /* Currently we only support failover to the same NQN. */
6094 9 : if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
6095 0 : SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
6096 : nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
6097 0 : return -EINVAL;
6098 : }
6099 :
6100 : /* Skip all the other checks if we've already registered this path. */
6101 21 : TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
6102 12 : if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
6103 0 : SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr,
6104 : trid->subnqn);
6105 0 : return -EALREADY;
6106 : }
6107 : }
6108 :
6109 9 : return 0;
6110 : }
6111 :
6112 : static int
6113 9 : bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
6114 : struct spdk_nvme_ctrlr *new_ctrlr)
6115 : {
6116 : struct nvme_ns *nvme_ns;
6117 : struct spdk_nvme_ns *new_ns;
6118 :
6119 9 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6120 9 : while (nvme_ns != NULL) {
6121 0 : new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
6122 0 : assert(new_ns != NULL);
6123 :
6124 0 : if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
6125 0 : return -EINVAL;
6126 : }
6127 :
6128 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6129 : }
6130 :
6131 9 : return 0;
6132 : }
6133 :
6134 : static int
6135 9 : _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6136 : struct spdk_nvme_transport_id *trid)
6137 : {
6138 : struct nvme_path_id *active_id, *new_trid, *tmp_trid;
6139 :
6140 9 : new_trid = calloc(1, sizeof(*new_trid));
6141 9 : if (new_trid == NULL) {
6142 0 : return -ENOMEM;
6143 : }
6144 9 : new_trid->trid = *trid;
6145 :
6146 9 : active_id = nvme_ctrlr->active_path_id;
6147 9 : assert(active_id != NULL);
6148 9 : assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));
6149 :
6150 : /* Skip the active trid not to replace it until it is failed. */
6151 9 : tmp_trid = TAILQ_NEXT(active_id, link);
6152 9 : if (tmp_trid == NULL) {
6153 6 : goto add_tail;
6154 : }
6155 :
6156 : /* A trid is considered failed if its last failed time (last_failed_tsc) is non-zero.
6157 : * Insert the new alternate trid before any failed trid.
6158 : */
6159 5 : TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
6160 3 : if (tmp_trid->last_failed_tsc != 0) {
6161 1 : TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
6162 1 : return 0;
6163 : }
6164 : }
6165 :
6166 2 : add_tail:
6167 8 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
6168 8 : return 0;
6169 : }
6170 :
6171 : /* This is the case that a secondary path is added to an existing
6172 : * nvme_ctrlr for failover. After checking if it can access the same
6173 : * namespaces as the primary path, it is disconnected until failover occurs.
6174 : */
6175 : static int
6176 9 : bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6177 : struct spdk_nvme_ctrlr *new_ctrlr,
6178 : struct spdk_nvme_transport_id *trid)
6179 : {
6180 : int rc;
6181 :
6182 9 : assert(nvme_ctrlr != NULL);
6183 :
6184 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6185 :
6186 9 : rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid);
6187 9 : if (rc != 0) {
6188 0 : goto exit;
6189 : }
6190 :
6191 9 : rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr);
6192 9 : if (rc != 0) {
6193 0 : goto exit;
6194 : }
6195 :
6196 9 : rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
6197 :
6198 9 : exit:
6199 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6200 :
6201 9 : spdk_nvme_detach(new_ctrlr);
6202 :
6203 9 : return rc;
6204 : }
6205 :
6206 : static void
6207 47 : connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6208 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
6209 : {
6210 47 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6211 : struct nvme_async_probe_ctx *ctx;
6212 : int rc;
6213 :
6214 47 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6215 47 : ctx->ctrlr_attached = true;
6216 :
6217 47 : rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx);
6218 47 : if (rc != 0) {
6219 1 : ctx->reported_bdevs = 0;
6220 1 : populate_namespaces_cb(ctx, rc);
6221 : }
6222 47 : }
6223 :
6224 1 : SPDK_LOG_DEPRECATION_REGISTER(failover_config,
6225 : "bdev_nvme_attach_controller.failover configuration mismatch", "v25.01", 0);
6226 :
6227 : static void
6228 4 : connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6229 : struct spdk_nvme_ctrlr *ctrlr,
6230 : const struct spdk_nvme_ctrlr_opts *opts)
6231 : {
6232 4 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6233 : struct nvme_ctrlr *nvme_ctrlr;
6234 : struct nvme_async_probe_ctx *ctx;
6235 : int rc;
6236 :
6237 4 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6238 4 : ctx->ctrlr_attached = true;
6239 :
6240 4 : nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
6241 4 : if (nvme_ctrlr) {
6242 4 : if (nvme_ctrlr->opts.multipath) {
6243 : /* All controllers created with the same name must be configured either
6244 : * for multipath or for failover. Otherwise we have a configuration mismatch.
6245 : * While this is currently still supported, support for configurations in which
6246 : * some controllers with the same name are configured for multipath while others
6247 : * are configured for failover will be removed in release 25.01.
6248 : * Default mode change: starting from SPDK 25.01, if the user does not provide
6249 : * the '-x <mode>' parameter in the bdev_nvme_attach_controller RPC call, the
6250 : * default mode assigned to the controller will be 'multipath'.
6251 : */
6252 0 : SPDK_LOG_DEPRECATED(failover_config);
6253 : }
6254 4 : rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
6255 : } else {
6256 0 : rc = -ENODEV;
6257 : }
6258 :
6259 4 : ctx->reported_bdevs = 0;
6260 4 : populate_namespaces_cb(ctx, rc);
6261 4 : }
6262 :
6263 : static int
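 : /* Poller that drives the asynchronous connect started by
 : * spdk_bdev_nvme_create() until the probe completes.
 : */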
6264 52 : bdev_nvme_async_poll(void *arg)
6265 : {
6266 52 : struct nvme_async_probe_ctx *ctx = arg;
6267 : int rc;
6268 :
6269 52 : rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
6270 52 : if (spdk_unlikely(rc != -EAGAIN)) {
6271 52 : ctx->probe_done = true;
6272 52 : spdk_poller_unregister(&ctx->poller);
6273 52 : if (!ctx->ctrlr_attached) {
6274 : /* The probe is done, but no controller was attached.
6275 : * That means we had a failure, so report -EIO back to
6276 : * the caller (usually the RPC). populate_namespaces_cb()
6277 : * will take care of freeing the nvme_async_probe_ctx.
6278 : */
6279 1 : ctx->reported_bdevs = 0;
6280 1 : populate_namespaces_cb(ctx, -EIO);
6281 51 : } else if (ctx->namespaces_populated) {
6282 : /* The namespaces for the attached controller were all
6283 : * populated and the response was already sent to the
6284 : * caller (usually the RPC). So free the context here.
6285 : */
6286 21 : free_nvme_async_probe_ctx(ctx);
6287 : }
6288 : }
6289 :
6290 52 : return SPDK_POLLER_BUSY;
6291 : }
6292 :
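 : /* Validate the combination of ctrlr_loss_timeout_sec, reconnect_delay_sec and
 : * fast_io_fail_timeout_sec. Each rejected combination is logged in detail on
 : * its failure path below.
 : */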
6293 : static bool
6294 29 : bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
6295 : uint32_t reconnect_delay_sec,
6296 : uint32_t fast_io_fail_timeout_sec)
6297 : {
6298 29 : if (ctrlr_loss_timeout_sec < -1) {
6299 1 : SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
6300 1 : return false;
6301 28 : } else if (ctrlr_loss_timeout_sec == -1) {
6302 14 : if (reconnect_delay_sec == 0) {
6303 1 : SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
6304 1 : return false;
6305 13 : } else if (fast_io_fail_timeout_sec != 0 &&
6306 : fast_io_fail_timeout_sec < reconnect_delay_sec) {
6307 1 : SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n");
6308 1 : return false;
6309 : }
6310 14 : } else if (ctrlr_loss_timeout_sec != 0) {
6311 11 : if (reconnect_delay_sec == 0) {
6312 1 : SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
6313 1 : return false;
6314 10 : } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
6315 1 : SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
6316 1 : return false;
6317 9 : } else if (fast_io_fail_timeout_sec != 0) {
6318 6 : if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
6319 1 : SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
6320 1 : return false;
6321 5 : } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
6322 1 : SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
6323 1 : return false;
6324 : }
6325 : }
6326 3 : } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
6327 2 : SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
6328 2 : return false;
6329 : }
6330 :
6331 20 : return true;
6332 : }
6333 :
6334 : int
6335 52 : spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
6336 : const char *base_name,
6337 : const char **names,
6338 : uint32_t count,
6339 : spdk_bdev_nvme_create_cb cb_fn,
6340 : void *cb_ctx,
6341 : struct spdk_nvme_ctrlr_opts *drv_opts,
6342 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts,
6343 : bool multipath)
6344 : {
6345 : struct nvme_probe_skip_entry *entry, *tmp;
6346 : struct nvme_async_probe_ctx *ctx;
6347 : spdk_nvme_attach_cb attach_cb;
6348 : int len;
6349 :
6350 : /* TODO expand this check to include both the host and target TRIDs.
6351 : * Only if both are the same should we fail.
6352 : */
6353 52 : if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) {
6354 0 : SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) "
6355 : "already exists.\n", trid->traddr, drv_opts->hostnqn);
6356 0 : return -EEXIST;
6357 : }
6358 :
6359 52 : len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX);
6360 :
6361 52 : if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) {
6362 0 : SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1);
6363 0 : return -EINVAL;
6364 : }
6365 :
6366 52 : if (bdev_opts != NULL &&
6367 10 : !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec,
6368 : bdev_opts->reconnect_delay_sec,
6369 : bdev_opts->fast_io_fail_timeout_sec)) {
6370 0 : return -EINVAL;
6371 : }
6372 :
6373 52 : ctx = calloc(1, sizeof(*ctx));
6374 52 : if (!ctx) {
6375 0 : return -ENOMEM;
6376 : }
6377 52 : ctx->base_name = base_name;
6378 52 : ctx->names = names;
6379 52 : ctx->max_bdevs = count;
6380 52 : ctx->cb_fn = cb_fn;
6381 52 : ctx->cb_ctx = cb_ctx;
6382 52 : ctx->trid = *trid;
6383 :
6384 52 : if (bdev_opts) {
6385 10 : memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
6386 : } else {
6387 42 : spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts);
6388 : }
6389 :
6390 52 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6391 0 : TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
6392 0 : if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
6393 0 : TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
6394 0 : free(entry);
6395 0 : break;
6396 : }
6397 : }
6398 : }
6399 :
6400 52 : memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
6401 52 : ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count;
6402 52 : ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout;
6403 52 : ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
6404 52 : ctx->drv_opts.disable_read_ana_log_page = true;
6405 52 : ctx->drv_opts.transport_tos = g_opts.transport_tos;
6406 :
6407 52 : if (ctx->bdev_opts.psk != NULL) {
6408 0 : ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk);
6409 0 : if (ctx->drv_opts.tls_psk == NULL) {
6410 0 : SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk);
6411 0 : free_nvme_async_probe_ctx(ctx);
6412 0 : return -ENOKEY;
6413 : }
6414 : }
6415 :
6416 52 : if (ctx->bdev_opts.dhchap_key != NULL) {
6417 0 : ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key);
6418 0 : if (ctx->drv_opts.dhchap_key == NULL) {
6419 0 : SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n",
6420 : ctx->bdev_opts.dhchap_key);
6421 0 : free_nvme_async_probe_ctx(ctx);
6422 0 : return -ENOKEY;
6423 : }
6424 :
6425 0 : ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests;
6426 0 : ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups;
6427 : }
6428 52 : if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) {
6429 0 : ctx->drv_opts.dhchap_ctrlr_key =
6430 0 : spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key);
6431 0 : if (ctx->drv_opts.dhchap_ctrlr_key == NULL) {
6432 0 : SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n",
6433 : ctx->bdev_opts.dhchap_ctrlr_key);
6434 0 : free_nvme_async_probe_ctx(ctx);
6435 0 : return -ENOKEY;
6436 : }
6437 : }
6438 :
6439 52 : if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) {
6440 48 : attach_cb = connect_attach_cb;
6441 : } else {
6442 4 : attach_cb = connect_set_failover_cb;
6443 : }
6444 :
6445 52 : ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb);
6446 52 : if (ctx->probe_ctx == NULL) {
6447 0 : SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
6448 0 : free_nvme_async_probe_ctx(ctx);
6449 0 : return -ENODEV;
6450 : }
6451 52 : ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
6452 :
6453 52 : return 0;
6454 : }
6455 :
6456 : struct bdev_nvme_delete_ctx {
6457 : char *name;
6458 : struct nvme_path_id path_id;
6459 : bdev_nvme_delete_done_fn delete_done;
6460 : void *delete_done_ctx;
6461 : uint64_t timeout_ticks;
6462 : struct spdk_poller *poller;
6463 : };
6464 :
6465 : static void
6466 2 : free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx)
6467 : {
6468 2 : if (ctx != NULL) {
6469 1 : free(ctx->name);
6470 1 : free(ctx);
6471 : }
6472 2 : }
6473 :
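 : /* Compare a registered path against a user-specified path_id. Fields that
 : * are zeroed in path_id act as wildcards and match any value.
 : */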
6474 : static bool
6475 75 : nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id)
6476 : {
6477 75 : if (path_id->trid.trtype != 0) {
6478 21 : if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
6479 0 : if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
6480 0 : return false;
6481 : }
6482 : } else {
6483 21 : if (path_id->trid.trtype != p->trid.trtype) {
6484 0 : return false;
6485 : }
6486 : }
6487 : }
6488 :
6489 75 : if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
6490 21 : if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
6491 11 : return false;
6492 : }
6493 : }
6494 :
6495 64 : if (path_id->trid.adrfam != 0) {
6496 0 : if (path_id->trid.adrfam != p->trid.adrfam) {
6497 0 : return false;
6498 : }
6499 : }
6500 :
6501 64 : if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
6502 10 : if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
6503 0 : return false;
6504 : }
6505 : }
6506 :
6507 64 : if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
6508 10 : if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
6509 0 : return false;
6510 : }
6511 : }
6512 :
6513 64 : if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
6514 0 : if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
6515 0 : return false;
6516 : }
6517 : }
6518 :
6519 64 : if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
6520 0 : if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
6521 0 : return false;
6522 : }
6523 : }
6524 :
6525 64 : return true;
6526 : }
6527 :
6528 : static bool
6529 2 : nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id)
6530 : {
6531 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
6532 : struct nvme_ctrlr *ctrlr;
6533 : struct nvme_path_id *p;
6534 :
6535 2 : pthread_mutex_lock(&g_bdev_nvme_mutex);
6536 2 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
6537 2 : if (!nbdev_ctrlr) {
6538 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6539 1 : return false;
6540 : }
6541 :
6542 1 : TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
6543 1 : pthread_mutex_lock(&ctrlr->mutex);
6544 1 : TAILQ_FOREACH(p, &ctrlr->trids, link) {
6545 1 : if (nvme_path_id_compare(p, path_id)) {
6546 1 : pthread_mutex_unlock(&ctrlr->mutex);
6547 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6548 1 : return true;
6549 : }
6550 : }
6551 0 : pthread_mutex_unlock(&ctrlr->mutex);
6552 : }
6553 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6554 :
6555 0 : return false;
6556 : }
6557 :
6558 : static int
6559 2 : bdev_nvme_delete_complete_poll(void *arg)
6560 : {
6561 2 : struct bdev_nvme_delete_ctx *ctx = arg;
6562 2 : int rc = 0;
6563 :
6564 2 : if (nvme_path_id_exists(ctx->name, &ctx->path_id)) {
6565 1 : if (ctx->timeout_ticks > spdk_get_ticks()) {
6566 1 : return SPDK_POLLER_BUSY;
6567 : }
6568 :
6569 0 : SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name);
6570 0 : rc = -ETIMEDOUT;
6571 : }
6572 :
6573 1 : spdk_poller_unregister(&ctx->poller);
6574 :
6575 1 : ctx->delete_done(ctx->delete_done_ctx, rc);
6576 1 : free_bdev_nvme_delete_ctx(ctx);
6577 :
6578 1 : return SPDK_POLLER_BUSY;
6579 : }
6580 :
6581 : static int
6582 64 : _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id)
6583 : {
6584 : struct nvme_path_id *p, *t;
6585 : spdk_msg_fn msg_fn;
6586 64 : int rc = -ENXIO;
6587 :
6588 64 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6589 :
6590 74 : TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
6591 74 : if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) {
6592 64 : break;
6593 : }
6594 :
6595 10 : if (!nvme_path_id_compare(p, path_id)) {
6596 3 : continue;
6597 : }
6598 :
6599 : /* We are not using the specified path. */
6600 7 : TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
6601 7 : free(p);
6602 7 : rc = 0;
6603 : }
6604 :
6605 64 : if (p == NULL || !nvme_path_id_compare(p, path_id)) {
6606 8 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6607 8 : return rc;
6608 : }
6609 :
6610 : /* If we made it here, then this path is a match! Now we need to remove it. */
6611 :
6612 : /* This is the active path in use right now. The active path is always the first in the list. */
6613 56 : assert(p == nvme_ctrlr->active_path_id);
6614 :
6615 56 : if (!TAILQ_NEXT(p, link)) {
6616 : /* The current path is the only path. */
6617 55 : msg_fn = _nvme_ctrlr_destruct;
6618 55 : rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false);
6619 : } else {
6620 : /* There is an alternative path. */
6621 1 : msg_fn = _bdev_nvme_reset_ctrlr;
6622 1 : rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true);
6623 : }
6624 :
6625 56 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6626 :
6627 56 : if (rc == 0) {
6628 56 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
6629 0 : } else if (rc == -EALREADY) {
6630 0 : rc = 0;
6631 : }
6632 :
6633 56 : return rc;
6634 : }
6635 :
6636 : int
6637 49 : bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
6638 : bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx)
6639 : {
6640 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
6641 : struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr;
6642 49 : struct bdev_nvme_delete_ctx *ctx = NULL;
6643 49 : int rc = -ENXIO, _rc;
6644 :
6645 49 : if (name == NULL || path_id == NULL) {
6646 0 : rc = -EINVAL;
6647 0 : goto exit;
6648 : }
6649 :
6650 49 : pthread_mutex_lock(&g_bdev_nvme_mutex);
6651 :
6652 49 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
6653 49 : if (nbdev_ctrlr == NULL) {
6654 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6655 :
6656 0 : SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
6657 0 : rc = -ENODEV;
6658 0 : goto exit;
6659 : }
6660 :
6661 113 : TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
6662 64 : _rc = _bdev_nvme_delete(nvme_ctrlr, path_id);
6663 64 : if (_rc < 0 && _rc != -ENXIO) {
6664 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6665 0 : rc = _rc;
6666 0 : goto exit;
6667 64 : } else if (_rc == 0) {
6668 : /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr
6669 : * was deleted successfully. To remember the successful deletion,
6670 : * overwrite rc only if _rc is zero.
6671 : */
6672 58 : rc = 0;
6673 : }
6674 : }
6675 :
6676 49 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6677 :
6678 49 : if (rc != 0 || delete_done == NULL) {
6679 48 : goto exit;
6680 : }
6681 :
6682 1 : ctx = calloc(1, sizeof(*ctx));
6683 1 : if (ctx == NULL) {
6684 0 : SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n");
6685 0 : rc = -ENOMEM;
6686 0 : goto exit;
6687 : }
6688 :
6689 1 : ctx->name = strdup(name);
6690 1 : if (ctx->name == NULL) {
6691 0 : SPDK_ERRLOG("Failed to copy controller name for deletion\n");
6692 0 : rc = -ENOMEM;
6693 0 : goto exit;
6694 : }
6695 :
6696 1 : ctx->delete_done = delete_done;
6697 1 : ctx->delete_done_ctx = delete_done_ctx;
6698 1 : ctx->path_id = *path_id;
6699 1 : ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz();
6700 1 : ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000);
6701 1 : if (ctx->poller == NULL) {
6702 0 : SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n");
6703 0 : rc = -ENOMEM;
6704 0 : goto exit;
6705 : }
6706 :
6707 1 : exit:
6708 49 : if (rc != 0) {
6709 1 : free_bdev_nvme_delete_ctx(ctx);
6710 : }
6711 :
6712 49 : return rc;
6713 : }
6714 :
6715 : #define DISCOVERY_INFOLOG(ctx, format, ...) \
6716 : SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__);
6717 :
6718 : #define DISCOVERY_ERRLOG(ctx, format, ...) \
6719 : SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__);
6720 :
6721 : struct discovery_entry_ctx {
6722 : char name[128];
6723 : struct spdk_nvme_transport_id trid;
6724 : struct spdk_nvme_ctrlr_opts drv_opts;
6725 : struct spdk_nvmf_discovery_log_page_entry entry;
6726 : TAILQ_ENTRY(discovery_entry_ctx) tailq;
6727 : struct discovery_ctx *ctx;
6728 : };
6729 :
6730 : struct discovery_ctx {
6731 : char *name;
6732 : spdk_bdev_nvme_start_discovery_fn start_cb_fn;
6733 : spdk_bdev_nvme_stop_discovery_fn stop_cb_fn;
6734 : void *cb_ctx;
6735 : struct spdk_nvme_probe_ctx *probe_ctx;
6736 : struct spdk_nvme_detach_ctx *detach_ctx;
6737 : struct spdk_nvme_ctrlr *ctrlr;
6738 : struct spdk_nvme_transport_id trid;
6739 : struct discovery_entry_ctx *entry_ctx_in_use;
6740 : struct spdk_poller *poller;
6741 : struct spdk_nvme_ctrlr_opts drv_opts;
6742 : struct spdk_bdev_nvme_ctrlr_opts bdev_opts;
6743 : struct spdk_nvmf_discovery_log_page *log_page;
6744 : TAILQ_ENTRY(discovery_ctx) tailq;
6745 : TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs;
6746 : TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs;
6747 : int rc;
6748 : bool wait_for_attach;
6749 : uint64_t timeout_ticks;
6750 : /* Denotes that the discovery service is being started. We're waiting
6751 : * for the initial connection to the discovery controller to be
6752 : * established and for the discovered NVM ctrlrs to be attached.
6753 : */
6754 : bool initializing;
6755 : /* Denotes if a discovery is currently in progress for this context.
6756 : * That includes connecting to newly discovered subsystems. Used to
6757 : * ensure we do not start a new discovery until an existing one is
6758 : * complete.
6759 : */
6760 : bool in_progress;
6761 :
6762 : /* Denotes if another discovery is needed after the one in progress
6763 : * completes. Set when we receive an AER completion while a discovery
6764 : * is already in progress.
6765 : */
6766 : bool pending;
6767 :
6768 : /* Signal to the discovery context poller that it should stop the
6769 : * discovery service, including detaching from the current discovery
6770 : * controller.
6771 : */
6772 : bool stop;
6773 :
6774 : struct spdk_thread *calling_thread;
6775 : uint32_t index;
6776 : uint32_t attach_in_progress;
6777 : char *hostnqn;
6778 :
6779 : /* Denotes if the discovery service was started by the mdns discovery.
6780 : */
6781 : bool from_mdns_discovery_service;
6782 : };
6783 :
6784 : TAILQ_HEAD(discovery_ctxs, discovery_ctx);
6785 : static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs);
6786 :
6787 : static void get_discovery_log_page(struct discovery_ctx *ctx);
6788 :
6789 : static void
6790 0 : free_discovery_ctx(struct discovery_ctx *ctx)
6791 : {
6792 0 : free(ctx->log_page);
6793 0 : free(ctx->hostnqn);
6794 0 : free(ctx->name);
6795 0 : free(ctx);
6796 0 : }
6797 :
6798 : static void
6799 0 : discovery_complete(struct discovery_ctx *ctx)
6800 : {
6801 0 : ctx->initializing = false;
6802 0 : ctx->in_progress = false;
6803 0 : if (ctx->pending) {
6804 0 : ctx->pending = false;
6805 0 : get_discovery_log_page(ctx);
6806 : }
6807 0 : }
6808 :
6809 : static void
6810 0 : build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid,
6811 : struct spdk_nvmf_discovery_log_page_entry *entry)
6812 : {
6813 : char *space;
6814 :
6815 0 : trid->trtype = entry->trtype;
6816 0 : trid->adrfam = entry->adrfam;
6817 0 : memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr));
6818 0 : memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid));
6819 : /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and
6820 : * before call to this function trid->subnqn is zeroed out, we need
6821 : * to copy sizeof(trid->subnqn) minus one byte to make sure the last character
6822 : * remains 0. Then we can shorten the string (replace ' ' with 0) if required
6823 : */
6824 0 : memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1);
6825 :
6826 : /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated.
6827 : * But the log page entries typically pad them with spaces, not zeroes.
6828 : * So add a NULL terminator to each of these fields at the appropriate
6829 : * location.
6830 : */
6831 0 : space = strchr(trid->traddr, ' ');
6832 0 : if (space) {
6833 0 : *space = 0;
6834 : }
6835 0 : space = strchr(trid->trsvcid, ' ');
6836 0 : if (space) {
6837 0 : *space = 0;
6838 : }
6839 0 : space = strchr(trid->subnqn, ' ');
6840 0 : if (space) {
6841 0 : *space = 0;
6842 : }
6843 0 : }
6844 :
6845 : static void
6846 0 : _stop_discovery(void *_ctx)
6847 : {
6848 0 : struct discovery_ctx *ctx = _ctx;
6849 :
6850 0 : if (ctx->attach_in_progress > 0) {
6851 0 : spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx);
6852 0 : return;
6853 : }
6854 :
6855 0 : ctx->stop = true;
6856 :
6857 0 : while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) {
6858 : struct discovery_entry_ctx *entry_ctx;
6859 0 : struct nvme_path_id path = {};
6860 :
6861 0 : entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs);
6862 0 : path.trid = entry_ctx->trid;
6863 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
6864 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
6865 0 : free(entry_ctx);
6866 : }
6867 :
6868 0 : while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) {
6869 : struct discovery_entry_ctx *entry_ctx;
6870 :
6871 0 : entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
6872 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
6873 0 : free(entry_ctx);
6874 : }
6875 :
6876 0 : free(ctx->entry_ctx_in_use);
6877 0 : ctx->entry_ctx_in_use = NULL;
6878 : }
6879 :
6880 : static void
6881 0 : stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
6882 : {
6883 0 : ctx->stop_cb_fn = cb_fn;
6884 0 : ctx->cb_ctx = cb_ctx;
6885 :
6886 0 : if (ctx->attach_in_progress > 0) {
6887 0 : DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n",
6888 : ctx->attach_in_progress);
6889 : }
6890 :
6891 0 : _stop_discovery(ctx);
6892 0 : }
6893 :
6894 : static void
6895 2 : remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr)
6896 : {
6897 : struct discovery_ctx *d_ctx;
6898 : struct nvme_path_id *path_id;
6899 2 : struct spdk_nvme_transport_id trid = {};
6900 : struct discovery_entry_ctx *entry_ctx, *tmp;
6901 :
6902 2 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
6903 :
6904 2 : TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
6905 0 : TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) {
6906 0 : build_trid_from_log_page_entry(&trid, &entry_ctx->entry);
6907 0 : if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) {
6908 0 : continue;
6909 : }
6910 :
6911 0 : TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq);
6912 0 : free(entry_ctx);
6913 0 : DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n",
6914 : trid.subnqn, trid.traddr, trid.trsvcid);
6915 :
6916 : /* Fail discovery ctrlr to force reattach attempt */
6917 0 : spdk_nvme_ctrlr_fail(d_ctx->ctrlr);
6918 : }
6919 : }
6920 2 : }
6921 :
6922 : static void
6923 0 : discovery_remove_controllers(struct discovery_ctx *ctx)
6924 : {
6925 0 : struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page;
6926 : struct discovery_entry_ctx *entry_ctx, *tmp;
6927 : struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
6928 0 : struct spdk_nvme_transport_id old_trid = {};
6929 : uint64_t numrec, i;
6930 : bool found;
6931 :
6932 0 : numrec = from_le64(&log_page->numrec);
6933 0 : TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) {
6934 0 : found = false;
6935 0 : old_entry = &entry_ctx->entry;
6936 0 : build_trid_from_log_page_entry(&old_trid, old_entry);
6937 0 : for (i = 0; i < numrec; i++) {
6938 0 : new_entry = &log_page->entries[i];
6939 0 : if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) {
6940 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n",
6941 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
6942 0 : found = true;
6943 0 : break;
6944 : }
6945 : }
6946 0 : if (!found) {
6947 0 : struct nvme_path_id path = {};
6948 :
6949 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n",
6950 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
6951 :
6952 0 : path.trid = entry_ctx->trid;
6953 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
6954 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
6955 0 : free(entry_ctx);
6956 : }
6957 : }
6958 0 : free(log_page);
6959 0 : ctx->log_page = NULL;
6960 0 : discovery_complete(ctx);
6961 0 : }
6962 :
6963 : static void
6964 0 : complete_discovery_start(struct discovery_ctx *ctx, int status)
6965 : {
6966 0 : ctx->timeout_ticks = 0;
6967 0 : ctx->rc = status;
6968 0 : if (ctx->start_cb_fn) {
6969 0 : ctx->start_cb_fn(ctx->cb_ctx, status);
6970 0 : ctx->start_cb_fn = NULL;
6971 0 : ctx->cb_ctx = NULL;
6972 : }
6973 0 : }
6974 :
6975 : static void
6976 0 : discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc)
6977 : {
6978 0 : struct discovery_entry_ctx *entry_ctx = cb_ctx;
6979 0 : struct discovery_ctx *ctx = entry_ctx->ctx;
6980 :
6981 0 : DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name);
6982 0 : ctx->attach_in_progress--;
6983 0 : if (ctx->attach_in_progress == 0) {
6984 0 : complete_discovery_start(ctx, ctx->rc);
6985 0 : if (ctx->initializing && ctx->rc != 0) {
6986 0 : DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc);
6987 0 : stop_discovery(ctx, NULL, ctx->cb_ctx);
6988 : } else {
6989 0 : discovery_remove_controllers(ctx);
6990 : }
6991 : }
6992 0 : }
6993 :
6994 : static struct discovery_entry_ctx *
6995 0 : create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid)
6996 : {
6997 : struct discovery_entry_ctx *new_ctx;
6998 :
6999 0 : new_ctx = calloc(1, sizeof(*new_ctx));
7000 0 : if (new_ctx == NULL) {
7001 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7002 0 : return NULL;
7003 : }
7004 :
7005 0 : new_ctx->ctx = ctx;
7006 0 : memcpy(&new_ctx->trid, trid, sizeof(*trid));
7007 0 : spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
7008 0 : snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
7009 0 : return new_ctx;
7010 : }
7011 :
7012 : static void
7013 0 : discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl,
7014 : struct spdk_nvmf_discovery_log_page *log_page)
7015 : {
7016 0 : struct discovery_ctx *ctx = cb_arg;
7017 : struct discovery_entry_ctx *entry_ctx, *tmp;
7018 : struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
7019 : uint64_t numrec, i;
7020 : bool found;
7021 :
7022 0 : if (rc || spdk_nvme_cpl_is_error(cpl)) {
7023 0 : DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
7024 0 : return;
7025 : }
7026 :
7027 0 : ctx->log_page = log_page;
7028 0 : assert(ctx->attach_in_progress == 0);
7029 0 : numrec = from_le64(&log_page->numrec);
7030 0 : TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) {
7031 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
7032 0 : free(entry_ctx);
7033 : }
7034 0 : for (i = 0; i < numrec; i++) {
7035 0 : found = false;
7036 0 : new_entry = &log_page->entries[i];
7037 0 : if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT ||
7038 0 : new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
7039 : struct discovery_entry_ctx *new_ctx;
7040 0 : struct spdk_nvme_transport_id trid = {};
7041 :
7042 0 : build_trid_from_log_page_entry(&trid, new_entry);
7043 0 : new_ctx = create_discovery_entry_ctx(ctx, &trid);
7044 0 : if (new_ctx == NULL) {
7045 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7046 0 : break;
7047 : }
7048 :
7049 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq);
7050 0 : continue;
7051 : }
7052 0 : TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) {
7053 0 : old_entry = &entry_ctx->entry;
7054 0 : if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) {
7055 0 : found = true;
7056 0 : break;
7057 : }
7058 : }
7059 0 : if (!found) {
7060 0 : struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx;
7061 : struct discovery_ctx *d_ctx;
7062 :
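     : /* If any discovery context already attached a controller for this subsystem NQN,
     :  * reuse its name below so this entry becomes another path to the same controller;
     :  * otherwise a new name is generated from the base name plus a running index. */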
7063 0 : TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
7064 0 : TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) {
7065 0 : if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn,
7066 : sizeof(new_entry->subnqn))) {
7067 0 : break;
7068 : }
7069 : }
7070 0 : if (subnqn_ctx) {
7071 0 : break;
7072 : }
7073 : }
7074 :
7075 0 : new_ctx = calloc(1, sizeof(*new_ctx));
7076 0 : if (new_ctx == NULL) {
7077 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7078 0 : break;
7079 : }
7080 :
7081 0 : new_ctx->ctx = ctx;
7082 0 : memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry));
7083 0 : build_trid_from_log_page_entry(&new_ctx->trid, new_entry);
7084 0 : if (subnqn_ctx) {
7085 0 : snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name);
7086 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n",
7087 : new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
7088 : new_ctx->name);
7089 : } else {
7090 0 : snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++);
7091 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n",
7092 : new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
7093 : new_ctx->name);
7094 : }
7095 0 : spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
7096 0 : snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
7097 0 : rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0,
7098 : discovery_attach_controller_done, new_ctx,
7099 : &new_ctx->drv_opts, &ctx->bdev_opts, true);
7100 0 : if (rc == 0) {
7101 0 : TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq);
7102 0 : ctx->attach_in_progress++;
7103 : } else {
7104 0 : DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", spdk_strerror(-rc));
7105 : }
7106 : }
7107 : }
7108 :
7109 0 : if (ctx->attach_in_progress == 0) {
7110 0 : discovery_remove_controllers(ctx);
7111 : }
7112 : }
7113 :
7114 : static void
7115 0 : get_discovery_log_page(struct discovery_ctx *ctx)
7116 : {
7117 : int rc;
7118 :
7119 0 : assert(ctx->in_progress == false);
7120 0 : ctx->in_progress = true;
7121 0 : rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx);
7122 0 : if (rc != 0) {
7123 0 : DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
7124 : }
7125 0 : DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n");
7126 0 : }
7127 :
7128 : static void
7129 0 : discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
7130 : {
7131 0 : struct discovery_ctx *ctx = arg;
7132 0 : uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
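     : /* For an Asynchronous Event Request completion, dword 0 bits 23:16 carry the Log
     :  * Page Identifier associated with the event; only the Discovery log page matters here. */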
7133 :
7134 0 : if (spdk_nvme_cpl_is_error(cpl)) {
7135 0 : DISCOVERY_ERRLOG(ctx, "aer failed\n");
7136 0 : return;
7137 : }
7138 :
7139 0 : if (log_page_id != SPDK_NVME_LOG_DISCOVERY) {
7140 0 : DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id);
7141 0 : return;
7142 : }
7143 :
7144 0 : DISCOVERY_INFOLOG(ctx, "got aer\n");
7145 0 : if (ctx->in_progress) {
7146 0 : ctx->pending = true;
7147 0 : return;
7148 : }
7149 :
7150 0 : get_discovery_log_page(ctx);
7151 : }
7152 :
7153 : static void
7154 0 : discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
7155 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
7156 : {
7157 0 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
7158 : struct discovery_ctx *ctx;
7159 :
7160 0 : ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts);
7161 :
7162 0 : DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n");
7163 0 : ctx->probe_ctx = NULL;
7164 0 : ctx->ctrlr = ctrlr;
7165 :
7166 0 : if (ctx->rc != 0) {
7167 0 : DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n",
7168 : ctx->rc);
7169 0 : return;
7170 : }
7171 :
7172 0 : spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx);
7173 : }
7174 :
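     : /* Single poller driving the discovery state machine: finish an in-flight detach first,
     :  * then handle a stop request, then (re)connect to the next discovery entry when no
     :  * controller is attached, then poll an in-progress probe, and otherwise process admin
     :  * completions so AERs and log page reads make progress. */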
7175 : static int
7176 0 : discovery_poller(void *arg)
7177 : {
7178 0 : struct discovery_ctx *ctx = arg;
7179 : struct spdk_nvme_transport_id *trid;
7180 : int rc;
7181 :
7182 0 : if (ctx->detach_ctx) {
7183 0 : rc = spdk_nvme_detach_poll_async(ctx->detach_ctx);
7184 0 : if (rc != -EAGAIN) {
7185 0 : ctx->detach_ctx = NULL;
7186 0 : ctx->ctrlr = NULL;
7187 : }
7188 0 : } else if (ctx->stop) {
7189 0 : if (ctx->ctrlr != NULL) {
7190 0 : rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
7191 0 : if (rc == 0) {
7192 0 : return SPDK_POLLER_BUSY;
7193 : }
7194 0 : DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
7195 : }
7196 0 : spdk_poller_unregister(&ctx->poller);
7197 0 : TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
7198 0 : assert(ctx->start_cb_fn == NULL);
7199 0 : if (ctx->stop_cb_fn != NULL) {
7200 0 : ctx->stop_cb_fn(ctx->cb_ctx);
7201 : }
7202 0 : free_discovery_ctx(ctx);
7203 0 : } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) {
7204 0 : if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
7205 0 : DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
7206 0 : assert(ctx->initializing);
7207 0 : spdk_poller_unregister(&ctx->poller);
7208 0 : TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
7209 0 : complete_discovery_start(ctx, -ETIMEDOUT);
7210 0 : stop_discovery(ctx, NULL, NULL);
7211 0 : free_discovery_ctx(ctx);
7212 0 : return SPDK_POLLER_BUSY;
7213 : }
7214 :
7215 0 : assert(ctx->entry_ctx_in_use == NULL);
7216 0 : ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
7217 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
7218 0 : trid = &ctx->entry_ctx_in_use->trid;
7219 0 : ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb);
7220 0 : if (ctx->probe_ctx) {
7221 0 : spdk_poller_unregister(&ctx->poller);
7222 0 : ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000);
7223 : } else {
7224 0 : DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n");
7225 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
7226 0 : ctx->entry_ctx_in_use = NULL;
7227 : }
7228 0 : } else if (ctx->probe_ctx) {
7229 0 : if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
7230 0 : DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
7231 0 : complete_discovery_start(ctx, -ETIMEDOUT);
7232 0 : return SPDK_POLLER_BUSY;
7233 : }
7234 :
7235 0 : rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
7236 0 : if (rc != -EAGAIN) {
7237 0 : if (ctx->rc != 0) {
7238 0 : assert(ctx->initializing);
7239 0 : stop_discovery(ctx, NULL, ctx->cb_ctx);
7240 : } else {
7241 0 : assert(rc == 0);
7242 0 : DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n");
7243 0 : ctx->rc = rc;
7244 0 : get_discovery_log_page(ctx);
7245 : }
7246 : }
7247 : } else {
7248 0 : if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
7249 0 : DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n");
7250 0 : complete_discovery_start(ctx, -ETIMEDOUT);
7251 : /* We need to wait until all NVM ctrlrs are attached before we stop the
7252 : * discovery service to make sure we don't detach a ctrlr that is still
7253 : * being attached.
7254 : */
7255 0 : if (ctx->attach_in_progress == 0) {
7256 0 : stop_discovery(ctx, NULL, ctx->cb_ctx);
7257 0 : return SPDK_POLLER_BUSY;
7258 : }
7259 : }
7260 :
7261 0 : rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr);
7262 0 : if (rc < 0) {
7263 0 : spdk_poller_unregister(&ctx->poller);
7264 0 : ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
7265 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
7266 0 : ctx->entry_ctx_in_use = NULL;
7267 :
7268 0 : rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
7269 0 : if (rc != 0) {
7270 0 : DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
7271 0 : ctx->ctrlr = NULL;
7272 : }
7273 : }
7274 : }
7275 :
7276 0 : return SPDK_POLLER_BUSY;
7277 : }
7278 :
7279 : static void
7280 0 : start_discovery_poller(void *arg)
7281 : {
7282 0 : struct discovery_ctx *ctx = arg;
7283 :
7284 0 : TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq);
7285 0 : ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
7286 0 : }
7287 :
7288 : int
7289 0 : bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid,
7290 : const char *base_name,
7291 : struct spdk_nvme_ctrlr_opts *drv_opts,
7292 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts,
7293 : uint64_t attach_timeout,
7294 : bool from_mdns,
7295 : spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx)
7296 : {
7297 : struct discovery_ctx *ctx;
7298 : struct discovery_entry_ctx *discovery_entry_ctx;
7299 :
7300 0 : snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
7301 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7302 0 : if (strcmp(ctx->name, base_name) == 0) {
7303 0 : return -EEXIST;
7304 : }
7305 :
7306 0 : if (ctx->entry_ctx_in_use != NULL) {
7307 0 : if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) {
7308 0 : return -EEXIST;
7309 : }
7310 : }
7311 :
7312 0 : TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
7313 0 : if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) {
7314 0 : return -EEXIST;
7315 : }
7316 : }
7317 : }
7318 :
7319 0 : ctx = calloc(1, sizeof(*ctx));
7320 0 : if (ctx == NULL) {
7321 0 : return -ENOMEM;
7322 : }
7323 :
7324 0 : ctx->name = strdup(base_name);
7325 0 : if (ctx->name == NULL) {
7326 0 : free_discovery_ctx(ctx);
7327 0 : return -ENOMEM;
7328 : }
7329 0 : memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
7330 0 : memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
7331 0 : ctx->from_mdns_discovery_service = from_mdns;
7332 0 : ctx->bdev_opts.from_discovery_service = true;
7333 0 : ctx->calling_thread = spdk_get_thread();
7334 0 : ctx->start_cb_fn = cb_fn;
7335 0 : ctx->cb_ctx = cb_ctx;
7336 0 : ctx->initializing = true;
7337 0 : if (ctx->start_cb_fn) {
7338 : /* Record this so that, when dumping the JSON config, we can denote whether
7339 : * this RPC parameter was specified.
7340 : */
7341 0 : ctx->wait_for_attach = true;
7342 : }
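     : /* attach_timeout is given in milliseconds; convert it into an absolute deadline in ticks. */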
7343 0 : if (attach_timeout != 0) {
7344 0 : ctx->timeout_ticks = spdk_get_ticks() + attach_timeout *
7345 0 : spdk_get_ticks_hz() / 1000ull;
7346 : }
7347 0 : TAILQ_INIT(&ctx->nvm_entry_ctxs);
7348 0 : TAILQ_INIT(&ctx->discovery_entry_ctxs);
7349 0 : memcpy(&ctx->trid, trid, sizeof(*trid));
7350 : /* Even if the user did not specify hostnqn, drv_opts.hostnqn is a valid (possibly empty) string, so strdup() is safe. */
7351 0 : ctx->hostnqn = strdup(ctx->drv_opts.hostnqn);
7352 0 : if (ctx->hostnqn == NULL) {
7353 0 : free_discovery_ctx(ctx);
7354 0 : return -ENOMEM;
7355 : }
7356 0 : discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid);
7357 0 : if (discovery_entry_ctx == NULL) {
7358 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7359 0 : free_discovery_ctx(ctx);
7360 0 : return -ENOMEM;
7361 : }
7362 :
7363 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq);
7364 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx);
7365 0 : return 0;
7366 : }
7367 :
7368 : int
7369 0 : bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
7370 : {
7371 : struct discovery_ctx *ctx;
7372 :
7373 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7374 0 : if (strcmp(name, ctx->name) == 0) {
7375 0 : if (ctx->stop) {
7376 0 : return -EALREADY;
7377 : }
7378 : /* If we're still starting the discovery service and ->rc is non-zero, we're
7379 : * going to stop it as soon as we can.
7380 : */
7381 0 : if (ctx->initializing && ctx->rc != 0) {
7382 0 : return -EALREADY;
7383 : }
7384 0 : stop_discovery(ctx, cb_fn, cb_ctx);
7385 0 : return 0;
7386 : }
7387 : }
7388 :
7389 0 : return -ENOENT;
7390 : }
7391 :
7392 : static int
7393 1 : bdev_nvme_library_init(void)
7394 : {
7395 1 : g_bdev_nvme_init_thread = spdk_get_thread();
7396 :
7397 1 : spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
7398 : bdev_nvme_destroy_poll_group_cb,
7399 : sizeof(struct nvme_poll_group), "nvme_poll_groups");
7400 :
7401 1 : return 0;
7402 : }
7403 :
7404 : static void
7405 1 : bdev_nvme_fini_destruct_ctrlrs(void)
7406 : {
7407 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
7408 : struct nvme_ctrlr *nvme_ctrlr;
7409 :
7410 1 : pthread_mutex_lock(&g_bdev_nvme_mutex);
7411 1 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
7412 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
7413 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
7414 0 : if (nvme_ctrlr->destruct) {
7415 : /* This controller's destruction was already started
7416 : * before the application started shutting down
7417 : */
7418 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7419 0 : continue;
7420 : }
7421 0 : nvme_ctrlr->destruct = true;
7422 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7423 :
7424 0 : spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
7425 : nvme_ctrlr);
7426 : }
7427 : }
7428 :
7429 1 : g_bdev_nvme_module_finish = true;
7430 1 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
7431 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7432 1 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
7433 1 : spdk_bdev_module_fini_done();
7434 1 : return;
7435 : }
7436 :
7437 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7438 : }
7439 :
7440 : static void
7441 0 : check_discovery_fini(void *arg)
7442 : {
7443 0 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7444 0 : bdev_nvme_fini_destruct_ctrlrs();
7445 : }
7446 0 : }
7447 :
7448 : static void
7449 1 : bdev_nvme_library_fini(void)
7450 : {
7451 : struct nvme_probe_skip_entry *entry, *entry_tmp;
7452 : struct discovery_ctx *ctx;
7453 :
7454 1 : spdk_poller_unregister(&g_hotplug_poller);
7455 1 : free(g_hotplug_probe_ctx);
7456 1 : g_hotplug_probe_ctx = NULL;
7457 :
7458 1 : TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
7459 0 : TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
7460 0 : free(entry);
7461 : }
7462 :
7463 1 : assert(spdk_get_thread() == g_bdev_nvme_init_thread);
7464 1 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7465 1 : bdev_nvme_fini_destruct_ctrlrs();
7466 : } else {
7467 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7468 0 : stop_discovery(ctx, check_discovery_fini, NULL);
7469 : }
7470 : }
7471 1 : }
7472 :
7473 : static void
7474 0 : bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
7475 : {
7476 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7477 0 : struct spdk_bdev *bdev = bdev_io->bdev;
7478 0 : struct spdk_dif_ctx dif_ctx;
7479 0 : struct spdk_dif_error err_blk = {};
7480 : int rc;
7481 0 : struct spdk_dif_ctx_init_ext_opts dif_opts;
7482 :
7483 0 : dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
7484 0 : dif_opts.dif_pi_format = bdev->dif_pi_format;
7485 0 : rc = spdk_dif_ctx_init(&dif_ctx,
7486 0 : bdev->blocklen, bdev->md_len, bdev->md_interleave,
7487 0 : bdev->dif_is_head_of_md, bdev->dif_type,
7488 : bdev_io->u.bdev.dif_check_flags,
7489 0 : bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts);
7490 0 : if (rc != 0) {
7491 0 : SPDK_ERRLOG("Initialization of DIF context failed\n");
7492 0 : return;
7493 : }
7494 :
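     : /* With interleaved metadata the protection information lives inside the data buffer and
     :  * is checked with spdk_dif_verify(); with a separate metadata buffer (DIX) the PI is in
     :  * md_buf and spdk_dix_verify() is used instead. */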
7495 0 : if (bdev->md_interleave) {
7496 0 : rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7497 0 : bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7498 : } else {
7499 0 : struct iovec md_iov = {
7500 0 : .iov_base = bdev_io->u.bdev.md_buf,
7501 0 : .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
7502 : };
7503 :
7504 0 : rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7505 0 : &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7506 : }
7507 :
7508 0 : if (rc != 0) {
7509 0 : SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
7510 : err_blk.err_type, err_blk.err_offset);
7511 : } else {
7512 0 : SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
7513 : }
7514 : }
7515 :
7516 : static void
7517 0 : bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7518 : {
7519 0 : struct nvme_bdev_io *bio = ref;
7520 :
7521 0 : if (spdk_nvme_cpl_is_success(cpl)) {
7522 : /* Run PI verification for read data buffer. */
7523 0 : bdev_nvme_verify_pi_error(bio);
7524 : }
7525 :
7526 : /* Return original completion status */
7527 0 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7528 0 : }
7529 :
7530 : static void
7531 3 : bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7532 : {
7533 3 : struct nvme_bdev_io *bio = ref;
7534 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7535 : int ret;
7536 :
7537 3 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7538 0 : SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
7539 : cpl->status.sct, cpl->status.sc);
7540 :
7541 : /* Save completion status to use after verifying PI error. */
7542 0 : bio->cpl = *cpl;
7543 :
7544 0 : if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
7545 : /* Read without PI checking to verify PI error. */
7546 0 : ret = bdev_nvme_no_pi_readv(bio,
7547 : bdev_io->u.bdev.iovs,
7548 : bdev_io->u.bdev.iovcnt,
7549 : bdev_io->u.bdev.md_buf,
7550 : bdev_io->u.bdev.num_blocks,
7551 : bdev_io->u.bdev.offset_blocks);
7552 0 : if (ret == 0) {
7553 0 : return;
7554 : }
7555 : }
7556 : }
7557 :
7558 3 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7559 : }
7560 :
7561 : static void
7562 25 : bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7563 : {
7564 25 : struct nvme_bdev_io *bio = ref;
7565 :
7566 25 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7567 0 : SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
7568 : cpl->status.sct, cpl->status.sc);
7569 : /* Run PI verification for write data buffer if PI error is detected. */
7570 0 : bdev_nvme_verify_pi_error(bio);
7571 : }
7572 :
7573 25 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7574 25 : }
7575 :
7576 : static void
7577 0 : bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7578 : {
7579 0 : struct nvme_bdev_io *bio = ref;
7580 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7581 :
7582 : /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
7583 : * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
7584 : */
7585 0 : bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
7586 :
7587 0 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7588 0 : SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
7589 : cpl->status.sct, cpl->status.sc);
7590 : /* Run PI verification for zone append data buffer if PI error is detected. */
7591 0 : bdev_nvme_verify_pi_error(bio);
7592 : }
7593 :
7594 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7595 0 : }
7596 :
7597 : static void
7598 1 : bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7599 : {
7600 1 : struct nvme_bdev_io *bio = ref;
7601 :
7602 1 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7603 0 : SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
7604 : cpl->status.sct, cpl->status.sc);
7605 : /* Run PI verification for compare data buffer if PI error is detected. */
7606 0 : bdev_nvme_verify_pi_error(bio);
7607 : }
7608 :
7609 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7610 1 : }
7611 :
7612 : static void
7613 4 : bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7614 : {
7615 4 : struct nvme_bdev_io *bio = ref;
7616 :
7617 : /* Compare operation completion */
7618 4 : if (!bio->first_fused_completed) {
7619 : /* Save compare result for write callback */
7620 2 : bio->cpl = *cpl;
7621 2 : bio->first_fused_completed = true;
7622 2 : return;
7623 : }
7624 :
7625 : /* Write operation completion */
7626 2 : if (spdk_nvme_cpl_is_error(&bio->cpl)) {
7627 : /* If bio->cpl is already an error, it means the compare operation failed. In that case,
7628 : * complete the IO with the compare operation's status.
7629 : */
7630 1 : if (!spdk_nvme_cpl_is_error(cpl)) {
7631 1 : SPDK_ERRLOG("Unexpected write success after compare failure.\n");
7632 : }
7633 :
7634 1 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7635 : } else {
7636 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7637 : }
7638 : }
7639 :
7640 : static void
7641 1 : bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
7642 : {
7643 1 : struct nvme_bdev_io *bio = ref;
7644 :
7645 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7646 1 : }
7647 :
7648 : static int
7649 0 : fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
7650 : {
7651 0 : switch (desc->zt) {
7652 0 : case SPDK_NVME_ZONE_TYPE_SEQWR:
7653 0 : info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
7654 0 : break;
7655 0 : default:
7656 0 : SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt);
7657 0 : return -EIO;
7658 : }
7659 :
7660 0 : switch (desc->zs) {
7661 0 : case SPDK_NVME_ZONE_STATE_EMPTY:
7662 0 : info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
7663 0 : break;
7664 0 : case SPDK_NVME_ZONE_STATE_IOPEN:
7665 0 : info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
7666 0 : break;
7667 0 : case SPDK_NVME_ZONE_STATE_EOPEN:
7668 0 : info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
7669 0 : break;
7670 0 : case SPDK_NVME_ZONE_STATE_CLOSED:
7671 0 : info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
7672 0 : break;
7673 0 : case SPDK_NVME_ZONE_STATE_RONLY:
7674 0 : info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
7675 0 : break;
7676 0 : case SPDK_NVME_ZONE_STATE_FULL:
7677 0 : info->state = SPDK_BDEV_ZONE_STATE_FULL;
7678 0 : break;
7679 0 : case SPDK_NVME_ZONE_STATE_OFFLINE:
7680 0 : info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
7681 0 : break;
7682 0 : default:
7683 0 : SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
7684 0 : return -EIO;
7685 : }
7686 :
7687 0 : info->zone_id = desc->zslba;
7688 0 : info->write_pointer = desc->wp;
7689 0 : info->capacity = desc->zcap;
7690 :
7691 0 : return 0;
7692 : }
7693 :
7694 : static void
7695 0 : bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
7696 : {
7697 0 : struct nvme_bdev_io *bio = ref;
7698 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7699 0 : uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
7700 0 : uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
7701 0 : struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
7702 : uint64_t max_zones_per_buf, i;
7703 : uint32_t zone_report_bufsize;
7704 : struct spdk_nvme_ns *ns;
7705 : struct spdk_nvme_qpair *qpair;
7706 : int ret;
7707 :
7708 0 : if (spdk_nvme_cpl_is_error(cpl)) {
7709 0 : goto out_complete_io_nvme_cpl;
7710 : }
7711 :
7712 0 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
7713 0 : ret = -ENXIO;
7714 0 : goto out_complete_io_ret;
7715 : }
7716 :
7717 0 : ns = bio->io_path->nvme_ns->ns;
7718 0 : qpair = bio->io_path->qpair->qpair;
7719 :
7720 0 : zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
7721 0 : max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
7722 : sizeof(bio->zone_report_buf->descs[0]);
7723 :
7724 0 : if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
7725 0 : ret = -EINVAL;
7726 0 : goto out_complete_io_ret;
7727 : }
7728 :
7729 0 : if (!bio->zone_report_buf->nr_zones) {
7730 0 : ret = -EINVAL;
7731 0 : goto out_complete_io_ret;
7732 : }
7733 :
7734 0 : for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
7735 0 : ret = fill_zone_from_report(&info[bio->handled_zones],
7736 0 : &bio->zone_report_buf->descs[i]);
7737 0 : if (ret) {
7738 0 : goto out_complete_io_ret;
7739 : }
7740 0 : bio->handled_zones++;
7741 : }
7742 :
7743 0 : if (bio->handled_zones < zones_to_copy) {
7744 0 : uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
7745 0 : uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
7746 :
7747 0 : memset(bio->zone_report_buf, 0, zone_report_bufsize);
7748 0 : ret = spdk_nvme_zns_report_zones(ns, qpair,
7749 0 : bio->zone_report_buf, zone_report_bufsize,
7750 : slba, SPDK_NVME_ZRA_LIST_ALL, true,
7751 : bdev_nvme_get_zone_info_done, bio);
7752 0 : if (!ret) {
7753 0 : return;
7754 : } else {
7755 0 : goto out_complete_io_ret;
7756 : }
7757 : }
7758 :
7759 0 : out_complete_io_nvme_cpl:
7760 0 : free(bio->zone_report_buf);
7761 0 : bio->zone_report_buf = NULL;
7762 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7763 0 : return;
7764 :
7765 0 : out_complete_io_ret:
7766 0 : free(bio->zone_report_buf);
7767 0 : bio->zone_report_buf = NULL;
7768 0 : bdev_nvme_io_complete(bio, ret);
7769 : }
7770 :
7771 : static void
7772 0 : bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
7773 : {
7774 0 : struct nvme_bdev_io *bio = ref;
7775 :
7776 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7777 0 : }
7778 :
7779 : static void
7780 4 : bdev_nvme_admin_passthru_complete_nvme_status(void *ctx)
7781 : {
7782 4 : struct nvme_bdev_io *bio = ctx;
7783 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7784 4 : const struct spdk_nvme_cpl *cpl = &bio->cpl;
7785 :
7786 4 : assert(bdev_nvme_io_type_is_admin(bdev_io->type));
7787 :
7788 4 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
7789 4 : }
7790 :
7791 : static void
7792 3 : bdev_nvme_abort_complete(void *ctx)
7793 : {
7794 3 : struct nvme_bdev_io *bio = ctx;
7795 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7796 :
7797 3 : if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
7798 3 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL);
7799 : } else {
7800 0 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL);
7801 : }
7802 3 : }
7803 :
7804 : static void
7805 3 : bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
7806 : {
7807 3 : struct nvme_bdev_io *bio = ref;
7808 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7809 :
7810 3 : bio->cpl = *cpl;
7811 3 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio);
7812 3 : }
7813 :
7814 : static void
7815 4 : bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
7816 : {
7817 4 : struct nvme_bdev_io *bio = ref;
7818 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7819 :
7820 4 : bio->cpl = *cpl;
7821 4 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7822 : bdev_nvme_admin_passthru_complete_nvme_status, bio);
7823 4 : }
7824 :
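     : /* The reset_sgl/next_sge callbacks below form the SGL pair handed to the
     :  * spdk_nvme_ns_cmd_*v submission functions; they walk the I/O's iovec array one
     :  * segment at a time, with iov_offset tracking progress inside the current iovec. */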
7825 : static void
7826 0 : bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
7827 : {
7828 0 : struct nvme_bdev_io *bio = ref;
7829 : struct iovec *iov;
7830 :
7831 0 : bio->iov_offset = sgl_offset;
7832 0 : for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
7833 0 : iov = &bio->iovs[bio->iovpos];
7834 0 : if (bio->iov_offset < iov->iov_len) {
7835 0 : break;
7836 : }
7837 :
7838 0 : bio->iov_offset -= iov->iov_len;
7839 : }
7840 0 : }
7841 :
7842 : static int
7843 0 : bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
7844 : {
7845 0 : struct nvme_bdev_io *bio = ref;
7846 : struct iovec *iov;
7847 :
7848 0 : assert(bio->iovpos < bio->iovcnt);
7849 :
7850 0 : iov = &bio->iovs[bio->iovpos];
7851 :
7852 0 : *address = iov->iov_base;
7853 0 : *length = iov->iov_len;
7854 :
7855 0 : if (bio->iov_offset) {
7856 0 : assert(bio->iov_offset <= iov->iov_len);
7857 0 : *address += bio->iov_offset;
7858 0 : *length -= bio->iov_offset;
7859 : }
7860 :
7861 0 : bio->iov_offset += *length;
7862 0 : if (bio->iov_offset == iov->iov_len) {
7863 0 : bio->iovpos++;
7864 0 : bio->iov_offset = 0;
7865 : }
7866 :
7867 0 : return 0;
7868 : }
7869 :
7870 : static void
7871 0 : bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
7872 : {
7873 0 : struct nvme_bdev_io *bio = ref;
7874 : struct iovec *iov;
7875 :
7876 0 : bio->fused_iov_offset = sgl_offset;
7877 0 : for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
7878 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
7879 0 : if (bio->fused_iov_offset < iov->iov_len) {
7880 0 : break;
7881 : }
7882 :
7883 0 : bio->fused_iov_offset -= iov->iov_len;
7884 : }
7885 0 : }
7886 :
7887 : static int
7888 0 : bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
7889 : {
7890 0 : struct nvme_bdev_io *bio = ref;
7891 : struct iovec *iov;
7892 :
7893 0 : assert(bio->fused_iovpos < bio->fused_iovcnt);
7894 :
7895 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
7896 :
7897 0 : *address = iov->iov_base;
7898 0 : *length = iov->iov_len;
7899 :
7900 0 : if (bio->fused_iov_offset) {
7901 0 : assert(bio->fused_iov_offset <= iov->iov_len);
7902 0 : *address += bio->fused_iov_offset;
7903 0 : *length -= bio->fused_iov_offset;
7904 : }
7905 :
7906 0 : bio->fused_iov_offset += *length;
7907 0 : if (bio->fused_iov_offset == iov->iov_len) {
7908 0 : bio->fused_iovpos++;
7909 0 : bio->fused_iov_offset = 0;
7910 : }
7911 :
7912 0 : return 0;
7913 : }
7914 :
7915 : static int
7916 0 : bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
7917 : void *md, uint64_t lba_count, uint64_t lba)
7918 : {
7919 : int rc;
7920 :
7921 0 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
7922 : lba_count, lba);
7923 :
7924 0 : bio->iovs = iov;
7925 0 : bio->iovcnt = iovcnt;
7926 0 : bio->iovpos = 0;
7927 0 : bio->iov_offset = 0;
7928 :
7929 0 : rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
7930 0 : bio->io_path->qpair->qpair,
7931 : lba, lba_count,
7932 : bdev_nvme_no_pi_readv_done, bio, 0,
7933 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
7934 : md, 0, 0);
7935 :
7936 0 : if (rc != 0 && rc != -ENOMEM) {
7937 0 : SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
7938 : }
7939 0 : return rc;
7940 : }
7941 :
7942 : static int
7943 3 : bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
7944 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
7945 : struct spdk_memory_domain *domain, void *domain_ctx,
7946 : struct spdk_accel_sequence *seq)
7947 : {
7948 3 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
7949 3 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
7950 : int rc;
7951 :
7952 3 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
7953 : lba_count, lba);
7954 :
7955 3 : bio->iovs = iov;
7956 3 : bio->iovcnt = iovcnt;
7957 3 : bio->iovpos = 0;
7958 3 : bio->iov_offset = 0;
7959 :
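     : /* A memory domain or accel sequence can only be passed through the extended (_ext)
     :  * read APIs via spdk_nvme_ns_cmd_ext_io_opts; otherwise the *_with_md variants are used. */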
7960 3 : if (domain != NULL || seq != NULL) {
7961 1 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
7962 1 : bio->ext_opts.memory_domain = domain;
7963 1 : bio->ext_opts.memory_domain_ctx = domain_ctx;
7964 1 : bio->ext_opts.io_flags = flags;
7965 1 : bio->ext_opts.metadata = md;
7966 1 : bio->ext_opts.accel_sequence = seq;
7967 :
7968 1 : if (iovcnt == 1) {
7969 1 : rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done,
7970 : bio, &bio->ext_opts);
7971 : } else {
7972 0 : rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
7973 : bdev_nvme_readv_done, bio,
7974 : bdev_nvme_queued_reset_sgl,
7975 : bdev_nvme_queued_next_sge,
7976 : &bio->ext_opts);
7977 : }
7978 2 : } else if (iovcnt == 1) {
7979 2 : rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base,
7980 : md, lba, lba_count, bdev_nvme_readv_done,
7981 : bio, flags, 0, 0);
7982 : } else {
7983 0 : rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
7984 : bdev_nvme_readv_done, bio, flags,
7985 : bdev_nvme_queued_reset_sgl,
7986 : bdev_nvme_queued_next_sge, md, 0, 0);
7987 : }
7988 :
7989 3 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
7990 0 : SPDK_ERRLOG("readv failed: rc = %d\n", rc);
7991 : }
7992 3 : return rc;
7993 : }
7994 :
7995 : static int
7996 25 : bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
7997 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
7998 : struct spdk_memory_domain *domain, void *domain_ctx,
7999 : struct spdk_accel_sequence *seq,
8000 : union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13)
8001 : {
8002 25 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8003 25 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8004 : int rc;
8005 :
8006 25 : SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8007 : lba_count, lba);
8008 :
8009 25 : bio->iovs = iov;
8010 25 : bio->iovcnt = iovcnt;
8011 25 : bio->iovpos = 0;
8012 25 : bio->iov_offset = 0;
8013 :
8014 25 : if (domain != NULL || seq != NULL) {
8015 0 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
8016 0 : bio->ext_opts.memory_domain = domain;
8017 0 : bio->ext_opts.memory_domain_ctx = domain_ctx;
8018 0 : bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype);
8019 0 : bio->ext_opts.cdw13 = cdw13.raw;
8020 0 : bio->ext_opts.metadata = md;
8021 0 : bio->ext_opts.accel_sequence = seq;
8022 :
8023 0 : if (iovcnt == 1) {
8024 0 : rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done,
8025 : bio, &bio->ext_opts);
8026 : } else {
8027 0 : rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
8028 : bdev_nvme_writev_done, bio,
8029 : bdev_nvme_queued_reset_sgl,
8030 : bdev_nvme_queued_next_sge,
8031 : &bio->ext_opts);
8032 : }
8033 25 : } else if (iovcnt == 1) {
8034 25 : rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base,
8035 : md, lba, lba_count, bdev_nvme_writev_done,
8036 : bio, flags, 0, 0);
8037 : } else {
8038 0 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8039 : bdev_nvme_writev_done, bio, flags,
8040 : bdev_nvme_queued_reset_sgl,
8041 : bdev_nvme_queued_next_sge, md, 0, 0);
8042 : }
8043 :
8044 25 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
8045 0 : SPDK_ERRLOG("writev failed: rc = %d\n", rc);
8046 : }
8047 25 : return rc;
8048 : }
8049 :
8050 : static int
8051 0 : bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8052 : void *md, uint64_t lba_count, uint64_t zslba,
8053 : uint32_t flags)
8054 : {
8055 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8056 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8057 : int rc;
8058 :
8059 0 : SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
8060 : lba_count, zslba);
8061 :
8062 0 : bio->iovs = iov;
8063 0 : bio->iovcnt = iovcnt;
8064 0 : bio->iovpos = 0;
8065 0 : bio->iov_offset = 0;
8066 :
8067 0 : if (iovcnt == 1) {
8068 0 : rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
8069 : lba_count,
8070 : bdev_nvme_zone_appendv_done, bio,
8071 : flags,
8072 : 0, 0);
8073 : } else {
8074 0 : rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
8075 : bdev_nvme_zone_appendv_done, bio, flags,
8076 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8077 : md, 0, 0);
8078 : }
8079 :
8080 0 : if (rc != 0 && rc != -ENOMEM) {
8081 0 : SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
8082 : }
8083 0 : return rc;
8084 : }
8085 :
8086 : static int
8087 1 : bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8088 : void *md, uint64_t lba_count, uint64_t lba,
8089 : uint32_t flags)
8090 : {
8091 : int rc;
8092 :
8093 1 : SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8094 : lba_count, lba);
8095 :
8096 1 : bio->iovs = iov;
8097 1 : bio->iovcnt = iovcnt;
8098 1 : bio->iovpos = 0;
8099 1 : bio->iov_offset = 0;
8100 :
8101 1 : rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
8102 1 : bio->io_path->qpair->qpair,
8103 : lba, lba_count,
8104 : bdev_nvme_comparev_done, bio, flags,
8105 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8106 : md, 0, 0);
8107 :
8108 1 : if (rc != 0 && rc != -ENOMEM) {
8109 0 : SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
8110 : }
8111 1 : return rc;
8112 : }
8113 :
8114 : static int
8115 2 : bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
8116 : struct iovec *write_iov, int write_iovcnt,
8117 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
8118 : {
8119 2 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8120 2 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8121 2 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8122 : int rc;
8123 :
8124 2 : SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8125 : lba_count, lba);
8126 :
8127 2 : bio->iovs = cmp_iov;
8128 2 : bio->iovcnt = cmp_iovcnt;
8129 2 : bio->iovpos = 0;
8130 2 : bio->iov_offset = 0;
8131 2 : bio->fused_iovs = write_iov;
8132 2 : bio->fused_iovcnt = write_iovcnt;
8133 2 : bio->fused_iovpos = 0;
8134 2 : bio->fused_iov_offset = 0;
8135 :
8136 2 : if (bdev_io->num_retries == 0) {
8137 2 : bio->first_fused_submitted = false;
8138 2 : bio->first_fused_completed = false;
8139 : }
8140 :
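     : /* Compare-and-write is submitted as an NVMe fused pair: the compare carries
     :  * SPDK_NVME_IO_FLAGS_FUSE_FIRST and the write carries SPDK_NVME_IO_FLAGS_FUSE_SECOND,
     :  * so the controller executes the write only if the compare succeeds. */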
8141 2 : if (!bio->first_fused_submitted) {
8142 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8143 2 : memset(&bio->cpl, 0, sizeof(bio->cpl));
8144 :
8145 2 : rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
8146 : bdev_nvme_comparev_and_writev_done, bio, flags,
8147 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
8148 2 : if (rc == 0) {
8149 2 : bio->first_fused_submitted = true;
8150 2 : flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8151 : } else {
8152 0 : if (rc != -ENOMEM) {
8153 0 : SPDK_ERRLOG("compare failed: rc = %d\n", rc);
8154 : }
8155 0 : return rc;
8156 : }
8157 : }
8158 :
8159 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
8160 :
8161 2 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8162 : bdev_nvme_comparev_and_writev_done, bio, flags,
8163 : bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
8164 2 : if (rc != 0 && rc != -ENOMEM) {
8165 0 : SPDK_ERRLOG("write failed: rc = %d\n", rc);
8166 0 : rc = 0;
8167 : }
8168 :
8169 2 : return rc;
8170 : }
8171 :
8172 : static int
8173 1 : bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8174 : {
8175 1 : struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
8176 : struct spdk_nvme_dsm_range *range;
8177 : uint64_t offset, remaining;
8178 : uint64_t num_ranges_u64;
8179 : uint16_t num_ranges;
8180 : int rc;
8181 :
8182 1 : num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
8183 : SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
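     : /* Ceiling division: for example, if each DSM range can describe at most R blocks, an
     :  * unmap of 2 * R + 1 blocks needs three ranges -- two full ranges plus one for the last block. */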
8184 1 : if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
8185 0 : SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
8186 0 : return -EINVAL;
8187 : }
8188 1 : num_ranges = (uint16_t)num_ranges_u64;
8189 :
8190 1 : offset = offset_blocks;
8191 1 : remaining = num_blocks;
8192 1 : range = &dsm_ranges[0];
8193 :
8194 : /* Fill max-size ranges until the remaining blocks fit into one range */
8195 1 : while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
8196 0 : range->attributes.raw = 0;
8197 0 : range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8198 0 : range->starting_lba = offset;
8199 :
8200 0 : offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8201 0 : remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8202 0 : range++;
8203 : }
8204 :
8205 : /* Final range describes the remaining blocks */
8206 1 : range->attributes.raw = 0;
8207 1 : range->length = remaining;
8208 1 : range->starting_lba = offset;
8209 :
8210 1 : rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
8211 1 : bio->io_path->qpair->qpair,
8212 : SPDK_NVME_DSM_ATTR_DEALLOCATE,
8213 : dsm_ranges, num_ranges,
8214 : bdev_nvme_queued_done, bio);
8215 :
8216 1 : return rc;
8217 : }
8218 :
8219 : static int
8220 0 : bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8221 : {
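     : /* The Write Zeroes command carries a zero-based 16-bit NLB field, so a single command
     :  * can cover at most UINT16_MAX + 1 (65536) blocks. */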
8222 0 : if (num_blocks > UINT16_MAX + 1) {
8223 0 : SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
8224 0 : return -EINVAL;
8225 : }
8226 :
8227 0 : return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
8228 0 : bio->io_path->qpair->qpair,
8229 : offset_blocks, num_blocks,
8230 : bdev_nvme_queued_done, bio,
8231 : 0);
8232 : }
8233 :
8234 : static int
8235 0 : bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
8236 : struct spdk_bdev_zone_info *info)
8237 : {
8238 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8239 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8240 0 : uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
8241 0 : uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
8242 0 : uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
8243 :
8244 0 : if (zone_id % zone_size != 0) {
8245 0 : return -EINVAL;
8246 : }
8247 :
8248 0 : if (num_zones > total_zones || !num_zones) {
8249 0 : return -EINVAL;
8250 : }
8251 :
8252 0 : assert(!bio->zone_report_buf);
8253 0 : bio->zone_report_buf = calloc(1, zone_report_bufsize);
8254 0 : if (!bio->zone_report_buf) {
8255 0 : return -ENOMEM;
8256 : }
8257 :
8258 0 : bio->handled_zones = 0;
8259 :
8260 0 : return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
8261 : zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
8262 : bdev_nvme_get_zone_info_done, bio);
8263 : }
8264 :
8265 : static int
8266 0 : bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
8267 : enum spdk_bdev_zone_action action)
8268 : {
8269 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8270 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8271 :
8272 0 : switch (action) {
8273 0 : case SPDK_BDEV_ZONE_CLOSE:
8274 0 : return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
8275 : bdev_nvme_zone_management_done, bio);
8276 0 : case SPDK_BDEV_ZONE_FINISH:
8277 0 : return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
8278 : bdev_nvme_zone_management_done, bio);
8279 0 : case SPDK_BDEV_ZONE_OPEN:
8280 0 : return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
8281 : bdev_nvme_zone_management_done, bio);
8282 0 : case SPDK_BDEV_ZONE_RESET:
8283 0 : return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
8284 : bdev_nvme_zone_management_done, bio);
8285 0 : case SPDK_BDEV_ZONE_OFFLINE:
8286 0 : return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
8287 : bdev_nvme_zone_management_done, bio);
8288 0 : default:
8289 0 : return -EINVAL;
8290 : }
8291 : }
8292 :
8293 : static void
8294 5 : bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8295 : struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
8296 : {
8297 : struct nvme_io_path *io_path;
8298 : struct nvme_ctrlr *nvme_ctrlr;
8299 : uint32_t max_xfer_size;
8300 5 : int rc = -ENXIO;
8301 :
8302 : /* Choose the first ctrlr which is not failed. */
8303 8 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8304 7 : nvme_ctrlr = io_path->qpair->ctrlr;
8305 :
8306 : /* We should skip any unavailable nvme_ctrlr rather than checking
8307 : * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
8308 : */
8309 7 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
8310 3 : continue;
8311 : }
8312 :
8313 4 : max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
8314 :
8315 4 : if (nbytes > max_xfer_size) {
8316 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8317 0 : rc = -EINVAL;
8318 0 : goto err;
8319 : }
8320 :
8321 4 : rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
8322 : bdev_nvme_admin_passthru_done, bio);
8323 4 : if (rc == 0) {
8324 4 : return;
8325 : }
8326 : }
8327 :
8328 1 : err:
8329 1 : bdev_nvme_admin_complete(bio, rc);
8330 : }
8331 :
8332 : static int
8333 0 : bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8334 : void *buf, size_t nbytes)
8335 : {
8336 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8337 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8338 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8339 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8340 :
8341 0 : if (nbytes > max_xfer_size) {
8342 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8343 0 : return -EINVAL;
8344 : }
8345 :
8346 : /*
8347 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8348 : * so fill it out automatically.
8349 : */
8350 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8351 :
8352 0 : return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
8353 : (uint32_t)nbytes, bdev_nvme_queued_done, bio);
8354 : }
8355 :
8356 : static int
8357 0 : bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8358 : void *buf, size_t nbytes, void *md_buf, size_t md_len)
8359 : {
8360 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8361 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8362 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8363 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8364 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8365 :
8366 0 : if (nbytes > max_xfer_size) {
8367 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8368 0 : return -EINVAL;
8369 : }
8370 :
8371 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8372 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8373 0 : return -EINVAL;
8374 : }
8375 :
8376 : /*
8377 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8378 : * so fill it out automatically.
8379 : */
8380 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8381 :
8382 0 : return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
8383 : (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
8384 : }
8385 :
8386 : static int
8387 0 : bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio,
8388 : struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt,
8389 : size_t nbytes, void *md_buf, size_t md_len)
8390 : {
8391 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8392 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8393 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8394 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8395 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8396 :
8397 0 : bio->iovs = iov;
8398 0 : bio->iovcnt = iovcnt;
8399 0 : bio->iovpos = 0;
8400 0 : bio->iov_offset = 0;
8401 :
8402 0 : if (nbytes > max_xfer_size) {
8403 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8404 0 : return -EINVAL;
8405 : }
8406 :
8407 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8408 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8409 0 : return -EINVAL;
8410 : }
8411 :
8412 : /*
8413 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands
8414 : * require a nsid, so fill it out automatically.
8415 : */
8416 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8417 :
8418 0 : return spdk_nvme_ctrlr_cmd_iov_raw_with_md(
8419 : ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio,
8420 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
8421 : }
8422 :
8423 : static void
8424 6 : bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8425 : struct nvme_bdev_io *bio_to_abort)
8426 : {
8427 : struct nvme_io_path *io_path;
8428 6 : int rc = 0;
8429 :
8430 6 : rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
8431 6 : if (rc == 0) {
8432 1 : bdev_nvme_admin_complete(bio, 0);
8433 1 : return;
8434 : }
8435 :
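     : /* The I/O was not waiting on the retry queue, so it has already been submitted to the
     :  * controller. Issue an NVMe Abort command; if the submitting path is unknown, try each
     :  * I/O path in turn until one recognizes the command. */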
8436 5 : io_path = bio_to_abort->io_path;
8437 5 : if (io_path != NULL) {
8438 3 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8439 3 : io_path->qpair->qpair,
8440 : bio_to_abort,
8441 : bdev_nvme_abort_done, bio);
8442 : } else {
8443 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8444 2 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8445 : NULL,
8446 : bio_to_abort,
8447 : bdev_nvme_abort_done, bio);
8448 :
8449 2 : if (rc != -ENOENT) {
8450 1 : break;
8451 : }
8452 : }
8453 : }
8454 :
8455 5 : if (rc != 0) {
8456 : /* If no command was found or there was any error, complete the abort
8457 : * request with failure.
8458 : */
8459 2 : bdev_nvme_admin_complete(bio, rc);
8460 : }
8461 : }
8462 :
8463 : static int
8464 0 : bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
8465 : uint64_t num_blocks)
8466 : {
8467 0 : struct spdk_nvme_scc_source_range range = {
8468 : .slba = src_offset_blocks,
8469 0 : .nlb = num_blocks - 1
8470 : };
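     : /* The source range NLB field is zero-based (a value of 0 means one block), hence
     :  * num_blocks - 1. */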
8471 :
8472 0 : return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
8473 0 : bio->io_path->qpair->qpair,
8474 : &range, 1, dst_offset_blocks,
8475 : bdev_nvme_queued_done, bio);
8476 : }
8477 :
8478 : static void
8479 0 : bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
8480 : {
8481 : const char *action;
8482 : uint32_t i;
8483 :
8484 0 : if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
8485 0 : action = "reset";
8486 0 : } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
8487 0 : action = "abort";
8488 : } else {
8489 0 : action = "none";
8490 : }
8491 :
8492 0 : spdk_json_write_object_begin(w);
8493 :
8494 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
8495 :
8496 0 : spdk_json_write_named_object_begin(w, "params");
8497 0 : spdk_json_write_named_string(w, "action_on_timeout", action);
8498 0 : spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
8499 0 : spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
8500 0 : spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
8501 0 : spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
8502 0 : spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
8503 0 : spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
8504 0 : spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
8505 0 : spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
8506 0 : spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
8507 0 : spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
8508 0 : spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
8509 0 : spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
8510 0 : spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
8511 0 : spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
8512 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
8513 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
8514 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
8515 0 : spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback);
8516 0 : spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
8517 0 : spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
8518 0 : spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat);
8519 0 : spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size);
8520 0 : spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
8521 0 : spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
8522 0 : spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size);
8523 0 : spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms);
8524 0 : spdk_json_write_named_array_begin(w, "dhchap_digests");
8525 0 : for (i = 0; i < 32; ++i) {
8526 0 : if (g_opts.dhchap_digests & SPDK_BIT(i)) {
8527 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i));
8528 : }
8529 : }
8530 0 : spdk_json_write_array_end(w);
8531 0 : spdk_json_write_named_array_begin(w, "dhchap_dhgroups");
8532 0 : for (i = 0; i < 32; ++i) {
8533 0 : if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) {
8534 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i));
8535 : }
8536 : }
8537 :
8538 0 : spdk_json_write_array_end(w);
8539 0 : spdk_json_write_object_end(w);
8540 :
8541 0 : spdk_json_write_object_end(w);
8542 0 : }
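
/*
 * The object written above replays the current global options as a single
 * "bdev_nvme_set_options" call when the configuration is saved. Abbreviated
 * example of the emitted JSON (values illustrative):
 *
 *   {
 *     "method": "bdev_nvme_set_options",
 *     "params": {
 *       "action_on_timeout": "none",
 *       "timeout_us": 0,
 *       "keep_alive_timeout_ms": 10000,
 *       ...
 *       "dhchap_digests": [ ... ],
 *       "dhchap_dhgroups": [ ... ]
 *     }
 *   }
 */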
8543 :
8544 : static void
8545 0 : bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
8546 : {
8547 0 : struct spdk_nvme_transport_id trid;
8548 :
8549 0 : spdk_json_write_object_begin(w);
8550 :
8551 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");
8552 :
8553 0 : spdk_json_write_named_object_begin(w, "params");
8554 0 : spdk_json_write_named_string(w, "name", ctx->name);
8555 0 : spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);
8556 :
8557 0 : trid = ctx->trid;
8558 0 : memset(trid.subnqn, 0, sizeof(trid.subnqn));
8559 0 : nvme_bdev_dump_trid_json(&trid, w);
8560 :
8561 0 : spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
8562 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
8563 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
8564 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8565 : ctx->bdev_opts.fast_io_fail_timeout_sec);
8566 0 : spdk_json_write_object_end(w);
8567 :
8568 0 : spdk_json_write_object_end(w);
8569 0 : }
8570 :
8571 : #ifdef SPDK_CONFIG_NVME_CUSE
8572 : static void
8573 0 : nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
8574 : struct nvme_ctrlr *nvme_ctrlr)
8575 0 : {
8576 0 : size_t cuse_name_size = 128;
8577 0 : char cuse_name[cuse_name_size];
8578 :
8579 0 : if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
8580 : cuse_name, &cuse_name_size) != 0) {
8581 0 : return;
8582 : }
8583 :
8584 0 : spdk_json_write_object_begin(w);
8585 :
8586 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");
8587 :
8588 0 : spdk_json_write_named_object_begin(w, "params");
8589 0 : spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
8590 0 : spdk_json_write_object_end(w);
8591 :
8592 0 : spdk_json_write_object_end(w);
8593 : }
8594 : #endif
8595 :
8596 : static void
8597 0 : nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
8598 : struct nvme_ctrlr *nvme_ctrlr,
8599 : struct nvme_path_id *path_id)
8600 : {
8601 : struct spdk_nvme_transport_id *trid;
8602 : const struct spdk_nvme_ctrlr_opts *opts;
8603 :
8604 0 : if (nvme_ctrlr->opts.from_discovery_service) {
8605 : /* Do not emit an RPC for this - it will be implicitly
8606 : * covered by a separate bdev_nvme_start_discovery or
8607 : * bdev_nvme_start_mdns_discovery RPC.
8608 : */
8609 0 : return;
8610 : }
8611 :
8612 0 : trid = &path_id->trid;
8613 :
8614 0 : spdk_json_write_object_begin(w);
8615 :
8616 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
8617 :
8618 0 : spdk_json_write_named_object_begin(w, "params");
8619 0 : spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
8620 0 : nvme_bdev_dump_trid_json(trid, w);
8621 0 : spdk_json_write_named_bool(w, "prchk_reftag",
8622 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
8623 0 : spdk_json_write_named_bool(w, "prchk_guard",
8624 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
8625 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
8626 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
8627 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8628 : nvme_ctrlr->opts.fast_io_fail_timeout_sec);
8629 0 : if (nvme_ctrlr->psk != NULL) {
8630 0 : spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk));
8631 : }
8632 0 : if (nvme_ctrlr->dhchap_key != NULL) {
8633 0 : spdk_json_write_named_string(w, "dhchap_key",
8634 : spdk_key_get_name(nvme_ctrlr->dhchap_key));
8635 : }
8636 0 : if (nvme_ctrlr->dhchap_ctrlr_key != NULL) {
8637 0 : spdk_json_write_named_string(w, "dhchap_ctrlr_key",
8638 : spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key));
8639 : }
8640 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
8641 0 : spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
8642 0 : spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
8643 0 : spdk_json_write_named_bool(w, "ddgst", opts->data_digest);
8644 0 : if (opts->src_addr[0] != '\0') {
8645 0 : spdk_json_write_named_string(w, "hostaddr", opts->src_addr);
8646 : }
8647 0 : if (opts->src_svcid[0] != '\0') {
8648 0 : spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid);
8649 : }
8650 :
8651 0 : if (nvme_ctrlr->opts.multipath) {
8652 0 : spdk_json_write_named_string(w, "multipath", "multipath");
8653 : }
8654 0 : spdk_json_write_object_end(w);
8655 :
8656 0 : spdk_json_write_object_end(w);
8657 : }
8658 :
8659 : static void
8660 0 : bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
8661 : {
8662 0 : spdk_json_write_object_begin(w);
8663 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
8664 :
8665 0 : spdk_json_write_named_object_begin(w, "params");
8666 0 : spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
8667 0 : spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
8668 0 : spdk_json_write_object_end(w);
8669 :
8670 0 : spdk_json_write_object_end(w);
8671 0 : }
8672 :
8673 : static int
8674 0 : bdev_nvme_config_json(struct spdk_json_write_ctx *w)
8675 : {
8676 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
8677 : struct nvme_ctrlr *nvme_ctrlr;
8678 : struct discovery_ctx *ctx;
8679 : struct nvme_path_id *path_id;
8680 :
8681 0 : bdev_nvme_opts_config_json(w);
8682 :
8683 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
8684 :
8685 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
8686 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
8687 0 : path_id = nvme_ctrlr->active_path_id;
8688 0 : assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids));
8689 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
8690 :
8691 0 : path_id = TAILQ_NEXT(path_id, link);
8692 0 : while (path_id != NULL) {
8693 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
8694 0 : path_id = TAILQ_NEXT(path_id, link);
8695 : }
8696 :
8697 : #ifdef SPDK_CONFIG_NVME_CUSE
8698 0 : nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
8699 : #endif
8700 : }
8701 : }
8702 :
8703 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
8704 0 : if (!ctx->from_mdns_discovery_service) {
8705 0 : bdev_nvme_discovery_config_json(w, ctx);
8706 : }
8707 : }
8708 :
8709 0 : bdev_nvme_mdns_discovery_config_json(w);
8710 :
8711 : /* Dump this as the last parameter to give all NVMe bdevs a chance to be
8712 : * constructed before enabling the hotplug poller.
8713 : */
8714 0 : bdev_nvme_hotplug_config_json(w);
8715 :
8716 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
8717 0 : return 0;
8718 : }
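
/*
 * bdev_nvme_config_json() is this module's save-config hook. In the bdev
 * framework such a hook is wired up through struct spdk_bdev_module; a
 * generic sketch of that wiring follows (hypothetical module name, the real
 * registration for this driver lives elsewhere in this file):
 */
#if 0
static int
example_module_init(void)
{
	return 0;
}

static int
example_config_json(struct spdk_json_write_ctx *w)
{
	/* Nothing to persist for this hypothetical module. */
	return 0;
}

static struct spdk_bdev_module example_if = {
	.name		= "example",
	.module_init	= example_module_init,
	.config_json	= example_config_json,
};

SPDK_BDEV_MODULE_REGISTER(example, &example_if)
#endif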
8719 :
8720 : struct spdk_nvme_ctrlr *
8721 1 : bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
8722 : {
8723 : struct nvme_bdev *nbdev;
8724 : struct nvme_ns *nvme_ns;
8725 :
8726 1 : if (!bdev || bdev->module != &nvme_if) {
8727 0 : return NULL;
8728 : }
8729 :
8730 1 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
8731 1 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
8732 1 : assert(nvme_ns != NULL);
8733 :
8734 1 : return nvme_ns->ctrlr->ctrlr;
8735 : }
8736 :
8737 : static bool
8738 12 : nvme_io_path_is_current(struct nvme_io_path *io_path)
8739 : {
8740 : const struct nvme_bdev_channel *nbdev_ch;
8741 : bool current;
8742 :
8743 12 : if (!nvme_io_path_is_available(io_path)) {
8744 4 : return false;
8745 : }
8746 :
8747 8 : nbdev_ch = io_path->nbdev_ch;
8748 8 : if (nbdev_ch == NULL) {
8749 1 : current = false;
8750 7 : } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
8751 3 : struct nvme_io_path *optimized_io_path = NULL;
8752 :
8753 6 : STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) {
8754 5 : if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
8755 2 : break;
8756 : }
8757 : }
8758 :
8759 : /* A non-optimized path is only current if there are no optimized paths. */
8760 3 : current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) ||
8761 : (optimized_io_path == NULL);
8762 : } else {
8763 4 : if (nbdev_ch->current_io_path) {
8764 1 : current = (io_path == nbdev_ch->current_io_path);
8765 : } else {
8766 : struct nvme_io_path *first_path;
8767 :
8768 : /* We arrived here as there are no optimized paths for active-passive
8769 : * mode. Check if this io_path is the first one available on the list.
8770 : */
8771 3 : current = false;
8772 3 : STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) {
8773 3 : if (nvme_io_path_is_available(first_path)) {
8774 3 : current = (io_path == first_path);
8775 3 : break;
8776 : }
8777 : }
8778 : }
8779 : }
8780 :
8781 8 : return current;
8782 : }
8783 :
8784 : static struct nvme_ctrlr *
8785 0 : bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev)
8786 : {
8787 : struct nvme_ctrlr *next;
8788 :
8789 : /* Must be called under g_bdev_nvme_mutex */
8790 0 : next = prev != NULL ? TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
8791 0 : while (next != NULL) {
8792 : /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */
8793 0 : pthread_mutex_lock(&next->mutex);
8794 0 : if (next->ref > 0) {
8795 0 : next->ref++;
8796 0 : pthread_mutex_unlock(&next->mutex);
8797 0 : return next;
8798 : }
8799 :
8800 0 : pthread_mutex_unlock(&next->mutex);
8801 0 : next = TAILQ_NEXT(next, tailq);
8802 : }
8803 :
8804 0 : return NULL;
8805 : }
8806 :
8807 : struct bdev_nvme_set_keys_ctx {
8808 : struct nvme_ctrlr *nctrlr;
8809 : struct spdk_key *dhchap_key;
8810 : struct spdk_key *dhchap_ctrlr_key;
8811 : struct spdk_thread *thread;
8812 : bdev_nvme_set_keys_cb cb_fn;
8813 : void *cb_ctx;
8814 : int status;
8815 : };
8816 :
8817 : static void
8818 0 : bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx)
8819 : {
8820 0 : if (ctx == NULL) {
8821 0 : return;
8822 : }
8823 :
8824 0 : spdk_keyring_put_key(ctx->dhchap_key);
8825 0 : spdk_keyring_put_key(ctx->dhchap_ctrlr_key);
8826 0 : free(ctx);
8827 : }
8828 :
8829 : static void
8830 0 : _bdev_nvme_set_keys_done(void *_ctx)
8831 : {
8832 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
8833 :
8834 0 : ctx->cb_fn(ctx->cb_ctx, ctx->status);
8835 :
8836 0 : if (ctx->nctrlr != NULL) {
8837 0 : nvme_ctrlr_release(ctx->nctrlr);
8838 : }
8839 0 : bdev_nvme_free_set_keys_ctx(ctx);
8840 0 : }
8841 :
8842 : static void
8843 0 : bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status)
8844 : {
8845 0 : ctx->status = status;
8846 0 : spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx);
8847 0 : }
8848 :
8849 : static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx);
8850 :
8851 : static void
8852 0 : bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx)
8853 : {
8854 : struct nvme_ctrlr *next;
8855 :
8856 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
8857 0 : next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr);
8858 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
8859 :
8860 0 : nvme_ctrlr_release(ctx->nctrlr);
8861 0 : ctx->nctrlr = next;
8862 :
8863 0 : if (next == NULL) {
8864 0 : bdev_nvme_set_keys_done(ctx, 0);
8865 : } else {
8866 0 : bdev_nvme_authenticate_ctrlr(ctx);
8867 : }
8868 0 : }
8869 :
8870 : static void
8871 0 : bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status)
8872 : {
8873 0 : struct bdev_nvme_set_keys_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
8874 :
8875 0 : if (status != 0) {
8876 0 : bdev_nvme_set_keys_done(ctx, status);
8877 0 : return;
8878 : }
8879 0 : bdev_nvme_authenticate_ctrlr_continue(ctx);
8880 : }
8881 :
8882 : static void
8883 0 : bdev_nvme_authenticate_qpair_done(void *ctx, int status)
8884 : {
8885 0 : spdk_for_each_channel_continue(ctx, status);
8886 0 : }
8887 :
8888 : static void
8889 0 : bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i)
8890 : {
8891 0 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
8892 0 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
8893 0 : struct nvme_qpair *qpair = ctrlr_ch->qpair;
8894 : int rc;
8895 :
8896 0 : if (!nvme_qpair_is_connected(qpair)) {
8897 0 : spdk_for_each_channel_continue(i, 0);
8898 0 : return;
8899 : }
8900 :
8901 0 : rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i);
8902 0 : if (rc != 0) {
8903 0 : spdk_for_each_channel_continue(i, rc);
8904 : }
8905 : }
8906 :
8907 : static void
8908 0 : bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status)
8909 : {
8910 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
8911 :
8912 0 : if (status != 0) {
8913 0 : bdev_nvme_set_keys_done(ctx, status);
8914 0 : return;
8915 : }
8916 :
8917 0 : spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx,
8918 : bdev_nvme_authenticate_qpairs_done);
8919 : }
8920 :
8921 : static void
8922 0 : bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx)
8923 : {
8924 0 : struct spdk_nvme_ctrlr_key_opts opts = {};
8925 0 : struct nvme_ctrlr *nctrlr = ctx->nctrlr;
8926 : int rc;
8927 :
8928 0 : opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key);
8929 0 : opts.dhchap_key = ctx->dhchap_key;
8930 0 : opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key;
8931 0 : rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts);
8932 0 : if (rc != 0) {
8933 0 : bdev_nvme_set_keys_done(ctx, rc);
8934 0 : return;
8935 : }
8936 :
8937 0 : if (ctx->dhchap_key != NULL) {
8938 0 : rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr,
8939 : bdev_nvme_authenticate_ctrlr_done, ctx);
8940 0 : if (rc != 0) {
8941 0 : bdev_nvme_set_keys_done(ctx, rc);
8942 : }
8943 : } else {
8944 0 : bdev_nvme_authenticate_ctrlr_continue(ctx);
8945 : }
8946 : }
8947 :
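/*
 * DH-HMAC-CHAP re-key flow driven by the helpers above: for each nvme_ctrlr
 * of the named bdev_ctrlr, the new keys are installed with
 * spdk_nvme_ctrlr_set_keys(); if a host key is set, the admin connection is
 * re-authenticated with spdk_nvme_ctrlr_authenticate() and then every
 * connected I/O qpair is re-authenticated via spdk_for_each_channel(). Any
 * failure completes the request early through bdev_nvme_set_keys_done().
 */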
8948 : int
8949 0 : bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key,
8950 : bdev_nvme_set_keys_cb cb_fn, void *cb_ctx)
8951 : {
8952 : struct bdev_nvme_set_keys_ctx *ctx;
8953 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
8954 : struct nvme_ctrlr *nctrlr;
8955 :
8956 0 : ctx = calloc(1, sizeof(*ctx));
8957 0 : if (ctx == NULL) {
8958 0 : return -ENOMEM;
8959 : }
8960 :
8961 0 : if (dhchap_key != NULL) {
8962 0 : ctx->dhchap_key = spdk_keyring_get_key(dhchap_key);
8963 0 : if (ctx->dhchap_key == NULL) {
8964 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name);
8965 0 : bdev_nvme_free_set_keys_ctx(ctx);
8966 0 : return -ENOKEY;
8967 : }
8968 : }
8969 0 : if (dhchap_ctrlr_key != NULL) {
8970 0 : ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key);
8971 0 : if (ctx->dhchap_ctrlr_key == NULL) {
8972 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name);
8973 0 : bdev_nvme_free_set_keys_ctx(ctx);
8974 0 : return -ENOKEY;
8975 : }
8976 : }
8977 :
8978 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
8979 0 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
8980 0 : if (nbdev_ctrlr == NULL) {
8981 0 : SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name);
8982 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
8983 0 : bdev_nvme_free_set_keys_ctx(ctx);
8984 0 : return -ENODEV;
8985 : }
8986 0 : nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL);
8987 0 : if (nctrlr == NULL) {
8988 0 : SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name);
8989 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
8990 0 : bdev_nvme_free_set_keys_ctx(ctx);
8991 0 : return -ENODEV;
8992 : }
8993 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
8994 :
8995 0 : ctx->nctrlr = nctrlr;
8996 0 : ctx->cb_fn = cb_fn;
8997 0 : ctx->cb_ctx = cb_ctx;
8998 0 : ctx->thread = spdk_get_thread();
8999 :
9000 0 : bdev_nvme_authenticate_ctrlr(ctx);
9001 :
9002 0 : return 0;
9003 : }
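
/*
 * Minimal caller sketch for bdev_nvme_set_keys() (illustrative only; this is
 * typically driven by an RPC handler). "Nvme0" and the key names are
 * hypothetical keyring entries.
 */
#if 0
static void
example_set_keys_done(void *ctx, int status)
{
	SPDK_NOTICELOG("re-key finished with status %d\n", status);
}

static void
example_rekey(void)
{
	int rc;

	rc = bdev_nvme_set_keys("Nvme0", "host-key", "ctrlr-key",
				example_set_keys_done, NULL);
	if (rc != 0) {
		SPDK_ERRLOG("failed to start re-key: %s\n", spdk_strerror(-rc));
	}
}
#endif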
9004 :
9005 : void
9006 0 : nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
9007 : {
9008 0 : struct nvme_ns *nvme_ns = io_path->nvme_ns;
9009 0 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
9010 : const struct spdk_nvme_ctrlr_data *cdata;
9011 : const struct spdk_nvme_transport_id *trid;
9012 : const char *adrfam_str;
9013 :
9014 0 : spdk_json_write_object_begin(w);
9015 :
9016 0 : spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);
9017 :
9018 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
9019 0 : trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);
9020 :
9021 0 : spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
9022 0 : spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path));
9023 0 : spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
9024 0 : spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));
9025 :
9026 0 : spdk_json_write_named_object_begin(w, "transport");
9027 0 : spdk_json_write_named_string(w, "trtype", trid->trstring);
9028 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
9029 0 : if (trid->trsvcid[0] != '\0') {
9030 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
9031 : }
9032 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
9033 0 : if (adrfam_str) {
9034 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
9035 : }
9036 0 : spdk_json_write_object_end(w);
9037 :
9038 0 : spdk_json_write_object_end(w);
9039 0 : }
9040 :
9041 : void
9042 0 : bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
9043 : {
9044 : struct discovery_ctx *ctx;
9045 : struct discovery_entry_ctx *entry_ctx;
9046 :
9047 0 : spdk_json_write_array_begin(w);
9048 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
9049 0 : spdk_json_write_object_begin(w);
9050 0 : spdk_json_write_named_string(w, "name", ctx->name);
9051 :
9052 0 : spdk_json_write_named_object_begin(w, "trid");
9053 0 : nvme_bdev_dump_trid_json(&ctx->trid, w);
9054 0 : spdk_json_write_object_end(w);
9055 :
9056 0 : spdk_json_write_named_array_begin(w, "referrals");
9057 0 : TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
9058 0 : spdk_json_write_object_begin(w);
9059 0 : spdk_json_write_named_object_begin(w, "trid");
9060 0 : nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
9061 0 : spdk_json_write_object_end(w);
9062 0 : spdk_json_write_object_end(w);
9063 : }
9064 0 : spdk_json_write_array_end(w);
9065 :
9066 0 : spdk_json_write_object_end(w);
9067 : }
9068 0 : spdk_json_write_array_end(w);
9069 0 : }
9070 :
9071 1 : SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
9072 :
9073 : static void
9074 0 : bdev_nvme_trace(void)
9075 : {
9076 0 : struct spdk_trace_tpoint_opts opts[] = {
9077 : {
9078 : "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
9079 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1,
9080 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9081 : },
9082 : {
9083 : "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
9084 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0,
9085 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9086 : }
9087 : };
9088 :
9089 :
9090 0 : spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
9091 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
9092 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9093 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9094 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9095 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9096 0 : }
9097 1 : SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)