Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2016 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
6 : */
7 :
8 : #include "spdk/stdinc.h"
9 :
10 : #include "bdev_nvme.h"
11 :
12 : #include "spdk/accel.h"
13 : #include "spdk/config.h"
14 : #include "spdk/endian.h"
15 : #include "spdk/bdev.h"
16 : #include "spdk/json.h"
17 : #include "spdk/keyring.h"
18 : #include "spdk/likely.h"
19 : #include "spdk/nvme.h"
20 : #include "spdk/nvme_ocssd.h"
21 : #include "spdk/nvme_zns.h"
22 : #include "spdk/opal.h"
23 : #include "spdk/thread.h"
24 : #include "spdk/trace.h"
25 : #include "spdk/string.h"
26 : #include "spdk/util.h"
27 : #include "spdk/uuid.h"
28 :
29 : #include "spdk/bdev_module.h"
30 : #include "spdk/log.h"
31 :
32 : #include "spdk_internal/usdt.h"
33 : #include "spdk_internal/trace_defs.h"
34 :
/* Identity string used as a log prefix for a controller: the subsystem NQN when the
 * active path uses a fabrics transport, otherwise the transport address of the active path.
 * The macro parameter is parenthesized so that any pointer expression can be passed.
 */
#define CTRLR_STRING(nvme_ctrlr) \
	(spdk_nvme_trtype_is_fabrics((nvme_ctrlr)->active_path_id->trid.trtype) ? \
	(nvme_ctrlr)->active_path_id->trid.subnqn : (nvme_ctrlr)->active_path_id->trid.traddr)

/* Numeric ID of the underlying NVMe controller, used as the second part of the log prefix. */
#define CTRLR_ID(nvme_ctrlr)	(spdk_nvme_ctrlr_get_id((nvme_ctrlr)->ctrlr))

/* Controller-scoped logging helpers. Each one prepends "[<ctrlr string>, <ctrlr id>] " to
 * the caller's format string.
 *
 * NOTE(review): these definitions end with ';', so "MACRO(...);" expands to two statements
 * and must only be used inside braces. The ';' is kept here because call sites outside this
 * view may depend on it.
 */
#define NVME_CTRLR_ERRLOG(ctrlr, format, ...) \
	SPDK_ERRLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_WARNLOG(ctrlr, format, ...) \
	SPDK_WARNLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_NOTICELOG(ctrlr, format, ...) \
	SPDK_NOTICELOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_INFOLOG(ctrlr, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

/* Debug logging compiles to an empty statement unless DEBUG is defined. */
#ifdef DEBUG
#define NVME_CTRLR_DEBUGLOG(ctrlr, format, ...) \
	SPDK_DEBUGLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);
#else
#define NVME_CTRLR_DEBUGLOG(ctrlr, ...) do { } while (0)
#endif

/* Name of the bdev, used as the log prefix for bdev-scoped messages. */
#define BDEV_STRING(nbdev) ((nbdev)->disk.name)

/* Bdev-scoped logging helpers. Each one prepends "[<bdev name>] " to the caller's format string. */
#define NVME_BDEV_ERRLOG(nbdev, format, ...) \
	SPDK_ERRLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_WARNLOG(nbdev, format, ...) \
	SPDK_WARNLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_NOTICELOG(nbdev, format, ...) \
	SPDK_NOTICELOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_INFOLOG(nbdev, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);
73 :
/* Default values used to initialize the corresponding fields of g_opts. */
#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

/* NOTE(review): presumably the buffer length for a namespace ID rendered as text - confirm at the use sites. */
#define NSID_STR_LEN 10

/* NOTE(review): presumably the maximum length of a controller name - confirm at the use sites. */
#define SPDK_CONTROLLER_NAME_MAX 512

/* Forward declaration; referenced by the nvme_if module descriptor as its config_json callback. */
static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
82 :
/* Per-I/O driver context. bdev_nvme_get_ctx_size() reports sizeof() of this structure to
 * the bdev layer.
 */
struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** Offset in current iovec of the fused_iovs array. */
	uint32_t fused_iov_offset;

	/** array of iovecs to transfer for the fused command.
	 *  NOTE(review): presumably the write half of compare-and-write - confirm.
	 */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position in fused_iovs array. */
	int fused_iovpos;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 *  being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** How many times the current I/O was retried. */
	int32_t retry_count;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Current tsc at submit time. */
	uint64_t submit_tsc;

	/** Link used to put this nvme_bdev_io into a list (the channel's retry_io_list). */
	TAILQ_ENTRY(nvme_bdev_io) retry_link;
};
143 :
/* One entry of the hotplug skip list: the transport ID of a controller to be skipped. */
struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id		trid;
	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);
151 :
/* Bitmask of the DH-HMAC-CHAP hash functions assigned to g_opts.dhchap_digests by default. */
#define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))

/* Bitmask of the DH-HMAC-CHAP Diffie-Hellman groups assigned to g_opts.dhchap_dhgroups by default. */
#define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))

/* Module-wide options with their initial values.
 * NOTE(review): presumably overridable at runtime through an options RPC - confirm.
 */
static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.io_queue_requests = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.transport_ack_timeout = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	/* When true, nvme_io_path_alloc() allocates a per-path statistics block. */
	.io_path_stat = false,
	.allow_accel_sequence = false,
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
};
191 :
/* NOTE(review): presumably the upper bound accepted for the hotplug poll period (in
 * microseconds) - confirm where the period is set.
 */
#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
/* Initial value of g_nvme_hotplug_poll_period_us. */
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL

/* Hotplug monitor state. */
static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
/* NOTE(review): presumably the thread the module was initialized on - confirm. */
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
201 : static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
202 :
/* Forward declarations of file-local functions that are referenced before they are defined. */
static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
/* Module init/fini entry points installed in the nvme_if module descriptor. */
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
/* Per-I/O-type submission helpers. */
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq,
			    union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
/* Reset, failover, hot-remove and ANA log page helpers. */
static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

/* Namespace object allocation and release. */
static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);
258 :
259 : static int
260 176 : nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
261 : {
262 176 : return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
263 : }
264 :
265 1089 : RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
266 :
267 : struct spdk_nvme_qpair *
268 1 : bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
269 : {
270 1 : struct nvme_ctrlr_channel *ctrlr_ch;
271 :
272 1 : assert(ctrlr_io_ch != NULL);
273 :
274 1 : ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
275 :
276 1 : return ctrlr_ch->qpair->qpair;
277 1 : }
278 :
279 : static int
280 0 : bdev_nvme_get_ctx_size(void)
281 : {
282 0 : return sizeof(struct nvme_bdev_io);
283 : }
284 :
/* Module descriptor of the "nvme" bdev module, registered below. */
static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	/* NOTE(review): presumably means module_fini completes asynchronously;
	 * _nvme_ctrlr_delete() calls spdk_bdev_module_fini_done() - confirm.
	 */
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
295 :
/* Global list of nvme_bdev_ctrlr objects. */
struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
/* Mutex taken around accesses to the lists above and the per-nbdev_ctrlr lists in this file. */
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
/* When set and the global list has become empty, _nvme_ctrlr_delete() completes module fini. */
bool g_bdev_nvme_module_finish;
299 :
300 : struct nvme_bdev_ctrlr *
301 333 : nvme_bdev_ctrlr_get_by_name(const char *name)
302 : {
303 333 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
304 :
305 333 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
306 171 : if (strcmp(name, nbdev_ctrlr->name) == 0) {
307 171 : break;
308 : }
309 0 : }
310 :
311 666 : return nbdev_ctrlr;
312 333 : }
313 :
314 : static struct nvme_ctrlr *
315 59 : nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
316 : const struct spdk_nvme_transport_id *trid, const char *hostnqn)
317 : {
318 59 : const struct spdk_nvme_ctrlr_opts *opts;
319 59 : struct nvme_ctrlr *nvme_ctrlr;
320 :
321 100 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
322 75 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
323 109 : if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
324 34 : strcmp(hostnqn, opts->hostnqn) == 0) {
325 34 : break;
326 : }
327 41 : }
328 :
329 118 : return nvme_ctrlr;
330 59 : }
331 :
332 : struct nvme_ctrlr *
333 0 : nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
334 : uint16_t cntlid)
335 : {
336 0 : struct nvme_ctrlr *nvme_ctrlr;
337 0 : const struct spdk_nvme_ctrlr_data *cdata;
338 :
339 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
340 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
341 0 : if (cdata->cntlid == cntlid) {
342 0 : break;
343 : }
344 0 : }
345 :
346 0 : return nvme_ctrlr;
347 0 : }
348 :
349 : static struct nvme_bdev *
350 75 : nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
351 : {
352 75 : struct nvme_bdev *bdev;
353 :
354 75 : pthread_mutex_lock(&g_bdev_nvme_mutex);
355 109 : TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
356 69 : if (bdev->nsid == nsid) {
357 35 : break;
358 : }
359 34 : }
360 75 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
361 :
362 150 : return bdev;
363 75 : }
364 :
365 : struct nvme_ns *
366 145 : nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
367 : {
368 145 : struct nvme_ns ns;
369 :
370 145 : assert(nsid > 0);
371 :
372 145 : ns.id = nsid;
373 145 : return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
374 145 : }
375 :
376 : struct nvme_ns *
377 165 : nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
378 : {
379 165 : return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
380 : }
381 :
382 : struct nvme_ns *
383 74 : nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
384 : {
385 74 : if (ns == NULL) {
386 0 : return NULL;
387 : }
388 :
389 74 : return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
390 74 : }
391 :
392 : static struct nvme_ctrlr *
393 53 : nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
394 : {
395 53 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
396 53 : struct nvme_ctrlr *nvme_ctrlr = NULL;
397 :
398 53 : pthread_mutex_lock(&g_bdev_nvme_mutex);
399 72 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
400 19 : nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
401 19 : if (nvme_ctrlr != NULL) {
402 0 : break;
403 : }
404 19 : }
405 53 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
406 :
407 106 : return nvme_ctrlr;
408 53 : }
409 :
410 : struct nvme_ctrlr *
411 126 : nvme_ctrlr_get_by_name(const char *name)
412 : {
413 126 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
414 126 : struct nvme_ctrlr *nvme_ctrlr = NULL;
415 :
416 126 : if (name == NULL) {
417 0 : return NULL;
418 : }
419 :
420 126 : pthread_mutex_lock(&g_bdev_nvme_mutex);
421 126 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
422 126 : if (nbdev_ctrlr != NULL) {
423 60 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
424 60 : }
425 126 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
426 :
427 126 : return nvme_ctrlr;
428 126 : }
429 :
430 : void
431 0 : nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
432 : {
433 0 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
434 :
435 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
436 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
437 0 : fn(nbdev_ctrlr, ctx);
438 0 : }
439 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
440 0 : }
441 :
/* State of one nvme_ctrlr_for_each_channel() traversal. Allocated by
 * nvme_ctrlr_for_each_channel() and freed by nvme_ctrlr_each_channel_cpl().
 */
struct nvme_ctrlr_channel_iter {
	/* Called once per controller channel. */
	nvme_ctrlr_for_each_channel_msg fn;
	/* Called once when the traversal has finished. */
	nvme_ctrlr_for_each_channel_done cpl;
	/* Underlying SPDK iterator; refreshed before each fn/cpl invocation. */
	struct spdk_io_channel_iter *i;
	/* Caller context handed to fn and cpl. */
	void *ctx;
};
448 :
449 : void
450 166 : nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status)
451 : {
452 166 : spdk_for_each_channel_continue(iter->i, status);
453 166 : }
454 :
455 : static void
456 166 : nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i)
457 : {
458 166 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
459 166 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
460 166 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
461 166 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
462 :
463 166 : iter->i = i;
464 166 : iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx);
465 166 : }
466 :
467 : static void
468 97 : nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
469 : {
470 97 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
471 97 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
472 :
473 97 : iter->i = i;
474 97 : iter->cpl(nvme_ctrlr, iter->ctx, status);
475 :
476 97 : free(iter);
477 97 : }
478 :
479 : void
480 97 : nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr,
481 : nvme_ctrlr_for_each_channel_msg fn, void *ctx,
482 : nvme_ctrlr_for_each_channel_done cpl)
483 : {
484 97 : struct nvme_ctrlr_channel_iter *iter;
485 :
486 97 : assert(nvme_ctrlr != NULL && fn != NULL);
487 :
488 97 : iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter));
489 97 : if (iter == NULL) {
490 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
491 0 : assert(false);
492 : return;
493 : }
494 :
495 97 : iter->fn = fn;
496 97 : iter->cpl = cpl;
497 97 : iter->ctx = ctx;
498 :
499 194 : spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg,
500 97 : iter, nvme_ctrlr_each_channel_cpl);
501 97 : }
502 :
/* State of one nvme_bdev_for_each_channel() traversal. Allocated by
 * nvme_bdev_for_each_channel() and freed by nvme_bdev_each_channel_cpl().
 */
struct nvme_bdev_channel_iter {
	/* Called once per bdev channel. */
	nvme_bdev_for_each_channel_msg fn;
	/* Called once when the traversal has finished. */
	nvme_bdev_for_each_channel_done cpl;
	/* Underlying SPDK iterator; refreshed before each fn/cpl invocation. */
	struct spdk_io_channel_iter *i;
	/* Caller context handed to fn and cpl. */
	void *ctx;
};
509 :
510 : void
511 69 : nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status)
512 : {
513 69 : spdk_for_each_channel_continue(iter->i, status);
514 69 : }
515 :
516 : static void
517 69 : nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i)
518 : {
519 69 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
520 69 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
521 69 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
522 69 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
523 :
524 69 : iter->i = i;
525 69 : iter->fn(iter, nbdev, nbdev_ch, iter->ctx);
526 69 : }
527 :
528 : static void
529 60 : nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
530 : {
531 60 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
532 60 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
533 :
534 60 : iter->i = i;
535 60 : iter->cpl(nbdev, iter->ctx, status);
536 :
537 60 : free(iter);
538 60 : }
539 :
540 : void
541 60 : nvme_bdev_for_each_channel(struct nvme_bdev *nbdev,
542 : nvme_bdev_for_each_channel_msg fn, void *ctx,
543 : nvme_bdev_for_each_channel_done cpl)
544 : {
545 60 : struct nvme_bdev_channel_iter *iter;
546 :
547 60 : assert(nbdev != NULL && fn != NULL);
548 :
549 60 : iter = calloc(1, sizeof(struct nvme_bdev_channel_iter));
550 60 : if (iter == NULL) {
551 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
552 0 : assert(false);
553 : return;
554 : }
555 :
556 60 : iter->fn = fn;
557 60 : iter->cpl = cpl;
558 60 : iter->ctx = ctx;
559 :
560 60 : spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter,
561 : nvme_bdev_each_channel_cpl);
562 60 : }
563 :
564 : void
565 0 : nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
566 : {
567 0 : const char *trtype_str;
568 0 : const char *adrfam_str;
569 :
570 0 : trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
571 0 : if (trtype_str) {
572 0 : spdk_json_write_named_string(w, "trtype", trtype_str);
573 0 : }
574 :
575 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
576 0 : if (adrfam_str) {
577 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
578 0 : }
579 :
580 0 : if (trid->traddr[0] != '\0') {
581 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
582 0 : }
583 :
584 0 : if (trid->trsvcid[0] != '\0') {
585 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
586 0 : }
587 :
588 0 : if (trid->subnqn[0] != '\0') {
589 0 : spdk_json_write_named_string(w, "subnqn", trid->subnqn);
590 0 : }
591 0 : }
592 :
593 : static void
594 61 : nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
595 : struct nvme_ctrlr *nvme_ctrlr)
596 : {
597 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
598 61 : pthread_mutex_lock(&g_bdev_nvme_mutex);
599 :
600 61 : TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
601 61 : if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
602 15 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
603 :
604 15 : return;
605 : }
606 46 : TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
607 :
608 46 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
609 :
610 46 : assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
611 :
612 46 : free(nbdev_ctrlr->name);
613 46 : free(nbdev_ctrlr);
614 61 : }
615 :
616 : static void
617 62 : _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
618 : {
619 62 : struct nvme_path_id *path_id, *tmp_path;
620 62 : struct nvme_ns *ns, *tmp_ns;
621 :
622 62 : free(nvme_ctrlr->copied_ana_desc);
623 62 : spdk_free(nvme_ctrlr->ana_log_page);
624 :
625 62 : if (nvme_ctrlr->opal_dev) {
626 0 : spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
627 0 : nvme_ctrlr->opal_dev = NULL;
628 0 : }
629 :
630 62 : if (nvme_ctrlr->nbdev_ctrlr) {
631 61 : nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
632 61 : }
633 :
634 62 : RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
635 0 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
636 0 : nvme_ns_free(ns);
637 0 : }
638 :
639 124 : TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
640 62 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
641 62 : free(path_id);
642 62 : }
643 :
644 62 : pthread_mutex_destroy(&nvme_ctrlr->mutex);
645 62 : spdk_keyring_put_key(nvme_ctrlr->psk);
646 62 : spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
647 62 : spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
648 62 : free(nvme_ctrlr);
649 :
650 62 : pthread_mutex_lock(&g_bdev_nvme_mutex);
651 62 : if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
652 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
653 0 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
654 0 : spdk_bdev_module_fini_done();
655 0 : return;
656 : }
657 62 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
658 62 : }
659 :
660 : static int
661 62 : nvme_detach_poller(void *arg)
662 : {
663 62 : struct nvme_ctrlr *nvme_ctrlr = arg;
664 62 : int rc;
665 :
666 62 : rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
667 62 : if (rc != -EAGAIN) {
668 62 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
669 62 : _nvme_ctrlr_delete(nvme_ctrlr);
670 62 : }
671 :
672 62 : return SPDK_POLLER_BUSY;
673 62 : }
674 :
675 : static void
676 62 : nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
677 : {
678 62 : int rc;
679 :
680 62 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
681 :
682 62 : if (spdk_interrupt_mode_is_enabled()) {
683 0 : spdk_interrupt_unregister(&nvme_ctrlr->intr);
684 0 : }
685 :
686 : /* First, unregister the adminq poller, as the driver will poll adminq if necessary */
687 62 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
688 :
689 : /* If we got here, the reset/detach poller cannot be active */
690 62 : assert(nvme_ctrlr->reset_detach_poller == NULL);
691 62 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
692 : nvme_ctrlr, 1000);
693 62 : if (nvme_ctrlr->reset_detach_poller == NULL) {
694 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to register detach poller\n");
695 0 : goto error;
696 : }
697 :
698 62 : rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
699 62 : if (rc != 0) {
700 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to detach the NVMe controller\n");
701 0 : goto error;
702 : }
703 :
704 62 : return;
705 : error:
706 : /* We don't have a good way to handle errors here, so just do what we can and delete the
707 : * controller without detaching the underlying NVMe device.
708 : */
709 0 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
710 0 : _nvme_ctrlr_delete(nvme_ctrlr);
711 62 : }
712 :
/* io_device unregister callback: the io_device is the nvme_ctrlr itself, so deleting
 * it is all that is left to do.
 */
static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	nvme_ctrlr_delete((struct nvme_ctrlr *)io_device);
}
720 :
721 : static void
722 61 : nvme_ctrlr_unregister(void *ctx)
723 : {
724 61 : struct nvme_ctrlr *nvme_ctrlr = ctx;
725 :
726 61 : spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
727 61 : }
728 :
729 : static bool
730 249 : nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
731 : {
732 249 : if (!nvme_ctrlr->destruct) {
733 131 : return false;
734 : }
735 :
736 118 : if (nvme_ctrlr->ref > 0) {
737 57 : return false;
738 : }
739 :
740 61 : if (nvme_ctrlr->resetting) {
741 0 : return false;
742 : }
743 :
744 61 : if (nvme_ctrlr->ana_log_page_updating) {
745 0 : return false;
746 : }
747 :
748 61 : if (nvme_ctrlr->io_path_cache_clearing) {
749 0 : return false;
750 : }
751 :
752 61 : return true;
753 249 : }
754 :
755 : static void
756 172 : nvme_ctrlr_put_ref(struct nvme_ctrlr *nvme_ctrlr)
757 : {
758 172 : pthread_mutex_lock(&nvme_ctrlr->mutex);
759 : SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);
760 :
761 172 : assert(nvme_ctrlr->ref > 0);
762 172 : nvme_ctrlr->ref--;
763 :
764 172 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
765 111 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
766 111 : return;
767 : }
768 :
769 61 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
770 :
771 61 : spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
772 172 : }
773 :
774 : static void
775 111 : nvme_ctrlr_get_ref(struct nvme_ctrlr *nvme_ctrlr)
776 : {
777 111 : pthread_mutex_lock(&nvme_ctrlr->mutex);
778 111 : nvme_ctrlr->ref++;
779 111 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
780 111 : }
781 :
782 : static void
783 259 : bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
784 : {
785 259 : nbdev_ch->current_io_path = NULL;
786 259 : nbdev_ch->rr_counter = 0;
787 259 : }
788 :
789 : static struct nvme_io_path *
790 8 : _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
791 : {
792 8 : struct nvme_io_path *io_path;
793 :
794 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
795 15 : if (io_path->nvme_ns == nvme_ns) {
796 7 : break;
797 : }
798 8 : }
799 :
800 16 : return io_path;
801 8 : }
802 :
803 : static struct nvme_io_path *
804 39 : nvme_io_path_alloc(void)
805 : {
806 39 : struct nvme_io_path *io_path;
807 :
808 39 : io_path = calloc(1, sizeof(*io_path));
809 39 : if (io_path == NULL) {
810 0 : SPDK_ERRLOG("Failed to alloc io_path.\n");
811 0 : return NULL;
812 : }
813 :
814 39 : if (g_opts.io_path_stat) {
815 0 : io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
816 0 : if (io_path->stat == NULL) {
817 0 : free(io_path);
818 0 : SPDK_ERRLOG("Failed to alloc io_path stat.\n");
819 0 : return NULL;
820 : }
821 0 : spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
822 0 : }
823 :
824 39 : return io_path;
825 39 : }
826 :
827 : static void
828 39 : nvme_io_path_free(struct nvme_io_path *io_path)
829 : {
830 39 : free(io_path->stat);
831 39 : free(io_path);
832 39 : }
833 :
834 : static int
835 39 : _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
836 : {
837 39 : struct nvme_io_path *io_path;
838 39 : struct spdk_io_channel *ch;
839 39 : struct nvme_ctrlr_channel *ctrlr_ch;
840 39 : struct nvme_qpair *nvme_qpair;
841 :
842 39 : io_path = nvme_io_path_alloc();
843 39 : if (io_path == NULL) {
844 0 : return -ENOMEM;
845 : }
846 :
847 39 : io_path->nvme_ns = nvme_ns;
848 :
849 39 : ch = spdk_get_io_channel(nvme_ns->ctrlr);
850 39 : if (ch == NULL) {
851 0 : nvme_io_path_free(io_path);
852 0 : SPDK_ERRLOG("Failed to alloc io_channel.\n");
853 0 : return -ENOMEM;
854 : }
855 :
856 39 : ctrlr_ch = spdk_io_channel_get_ctx(ch);
857 :
858 39 : nvme_qpair = ctrlr_ch->qpair;
859 39 : assert(nvme_qpair != NULL);
860 :
861 39 : io_path->qpair = nvme_qpair;
862 39 : TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);
863 :
864 39 : io_path->nbdev_ch = nbdev_ch;
865 39 : STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
866 :
867 39 : bdev_nvme_clear_current_io_path(nbdev_ch);
868 :
869 39 : return 0;
870 39 : }
871 :
872 : static void
873 39 : bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
874 : struct nvme_io_path *io_path)
875 : {
876 39 : struct nvme_bdev_io *bio;
877 :
878 40 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
879 1 : if (bio->io_path == io_path) {
880 1 : bio->io_path = NULL;
881 1 : }
882 1 : }
883 39 : }
884 :
885 : static void
886 39 : _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
887 : {
888 39 : struct spdk_io_channel *ch;
889 39 : struct nvme_qpair *nvme_qpair;
890 39 : struct nvme_ctrlr_channel *ctrlr_ch;
891 39 : struct nvme_bdev *nbdev;
892 :
893 39 : nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));
894 :
895 : /* Add the statistics to nvme_ns before this path is destroyed. */
896 39 : pthread_mutex_lock(&nbdev->mutex);
897 39 : if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
898 0 : spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
899 0 : }
900 39 : pthread_mutex_unlock(&nbdev->mutex);
901 :
902 39 : bdev_nvme_clear_current_io_path(nbdev_ch);
903 39 : bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);
904 :
905 39 : STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
906 39 : io_path->nbdev_ch = NULL;
907 :
908 39 : nvme_qpair = io_path->qpair;
909 39 : assert(nvme_qpair != NULL);
910 :
911 39 : ctrlr_ch = nvme_qpair->ctrlr_ch;
912 39 : assert(ctrlr_ch != NULL);
913 :
914 39 : ch = spdk_io_channel_from_ctx(ctrlr_ch);
915 39 : spdk_put_io_channel(ch);
916 :
917 : /* After an io_path is removed, I/Os submitted to it may complete and update statistics
918 : * of the io_path. To avoid heap-use-after-free error from this case, do not free the
919 : * io_path here but free the io_path when the associated qpair is freed. It is ensured
920 : * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
921 : */
922 39 : }
923 :
924 : static void
925 26 : _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
926 : {
927 26 : struct nvme_io_path *io_path, *tmp_io_path;
928 :
929 63 : STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
930 37 : _bdev_nvme_delete_io_path(nbdev_ch, io_path);
931 37 : }
932 26 : }
933 :
934 : static int
935 26 : bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
936 : {
937 26 : struct nvme_bdev_channel *nbdev_ch = ctx_buf;
938 26 : struct nvme_bdev *nbdev = io_device;
939 26 : struct nvme_ns *nvme_ns;
940 26 : int rc;
941 :
942 26 : STAILQ_INIT(&nbdev_ch->io_path_list);
943 26 : TAILQ_INIT(&nbdev_ch->retry_io_list);
944 :
945 26 : pthread_mutex_lock(&nbdev->mutex);
946 :
947 26 : nbdev_ch->mp_policy = nbdev->mp_policy;
948 26 : nbdev_ch->mp_selector = nbdev->mp_selector;
949 26 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
950 :
951 63 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
952 37 : rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
953 37 : if (rc != 0) {
954 0 : pthread_mutex_unlock(&nbdev->mutex);
955 :
956 0 : _bdev_nvme_delete_io_paths(nbdev_ch);
957 0 : return rc;
958 : }
959 37 : }
960 26 : pthread_mutex_unlock(&nbdev->mutex);
961 :
962 26 : return 0;
963 26 : }
964 :
965 : /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
966 : * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
967 : */
968 : static inline void
969 58 : __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
970 : const struct spdk_nvme_cpl *cpl)
971 : {
972 58 : spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
973 : (uintptr_t)bdev_io);
974 58 : if (cpl) {
975 29 : spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
976 29 : } else {
977 29 : spdk_bdev_io_complete(bdev_io, status);
978 : }
979 58 : }
980 :
981 : static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);
982 :
/* Tear down an nvme_bdev_channel: abort all I/Os still waiting for retry, then
 * delete every io_path of the channel. 'io_device' is not used.
 */
static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *channel = ctx_buf;

	bdev_nvme_abort_retry_ios(channel);
	_bdev_nvme_delete_io_paths(channel);
}
991 :
992 : static inline bool
993 62 : bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
994 : {
995 62 : switch (io_type) {
996 : case SPDK_BDEV_IO_TYPE_RESET:
997 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
998 : case SPDK_BDEV_IO_TYPE_ABORT:
999 5 : return true;
1000 : default:
1001 57 : break;
1002 : }
1003 :
1004 57 : return false;
1005 62 : }
1006 :
1007 : static inline bool
1008 98 : nvme_ns_is_active(struct nvme_ns *nvme_ns)
1009 : {
1010 98 : if (spdk_unlikely(nvme_ns->ana_state_updating)) {
1011 1 : return false;
1012 : }
1013 :
1014 97 : if (spdk_unlikely(nvme_ns->ns == NULL)) {
1015 0 : return false;
1016 : }
1017 :
1018 97 : return true;
1019 98 : }
1020 :
1021 : static inline bool
1022 86 : nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
1023 : {
1024 86 : if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
1025 1 : return false;
1026 : }
1027 :
1028 85 : switch (nvme_ns->ana_state) {
1029 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1030 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1031 76 : return true;
1032 : default:
1033 9 : break;
1034 : }
1035 :
1036 9 : return false;
1037 86 : }
1038 :
1039 : static inline bool
1040 128 : nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
1041 : {
1042 128 : if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
1043 23 : return false;
1044 : }
1045 :
1046 105 : if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
1047 : SPDK_NVME_QPAIR_FAILURE_NONE)) {
1048 2 : return false;
1049 : }
1050 :
1051 103 : if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
1052 0 : return false;
1053 : }
1054 :
1055 103 : return true;
1056 128 : }
1057 :
1058 : static inline bool
1059 102 : nvme_io_path_is_available(struct nvme_io_path *io_path)
1060 : {
1061 102 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1062 16 : return false;
1063 : }
1064 :
1065 86 : if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
1066 10 : return false;
1067 : }
1068 :
1069 76 : return true;
1070 102 : }
1071 :
1072 : static inline bool
1073 9 : nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
1074 : {
1075 9 : if (nvme_ctrlr->destruct) {
1076 0 : return true;
1077 : }
1078 :
1079 9 : if (nvme_ctrlr->fast_io_fail_timedout) {
1080 2 : return true;
1081 : }
1082 :
1083 7 : if (nvme_ctrlr->resetting) {
1084 5 : if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
1085 5 : return false;
1086 : } else {
1087 0 : return true;
1088 : }
1089 : }
1090 :
1091 2 : if (nvme_ctrlr->reconnect_is_delayed) {
1092 2 : return false;
1093 : }
1094 :
1095 0 : if (nvme_ctrlr->disabled) {
1096 0 : return true;
1097 : }
1098 :
1099 0 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1100 0 : return true;
1101 : } else {
1102 0 : return false;
1103 : }
1104 9 : }
1105 :
1106 : static bool
1107 20 : nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
1108 : {
1109 20 : if (nvme_ctrlr->destruct) {
1110 0 : return false;
1111 : }
1112 :
1113 20 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1114 3 : return false;
1115 : }
1116 :
1117 17 : if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
1118 1 : return false;
1119 : }
1120 :
1121 16 : if (nvme_ctrlr->disabled) {
1122 0 : return false;
1123 : }
1124 :
1125 16 : return true;
1126 20 : }
1127 :
1128 : /* Simulate circular linked list. */
1129 : static inline struct nvme_io_path *
1130 99 : nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
1131 : {
1132 99 : struct nvme_io_path *next_path;
1133 :
1134 99 : if (prev_path != NULL) {
1135 39 : next_path = STAILQ_NEXT(prev_path, stailq);
1136 39 : if (next_path != NULL) {
1137 14 : return next_path;
1138 : }
1139 25 : }
1140 :
1141 85 : return STAILQ_FIRST(&nbdev_ch->io_path_list);
1142 99 : }
1143 :
1144 : static struct nvme_io_path *
1145 67 : _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1146 : {
1147 67 : struct nvme_io_path *io_path, *start, *non_optimized = NULL;
1148 :
1149 67 : start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);
1150 :
1151 67 : io_path = start;
1152 67 : do {
1153 79 : if (spdk_likely(nvme_io_path_is_available(io_path))) {
1154 57 : switch (io_path->nvme_ns->ana_state) {
1155 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1156 47 : nbdev_ch->current_io_path = io_path;
1157 47 : return io_path;
1158 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1159 10 : if (non_optimized == NULL) {
1160 7 : non_optimized = io_path;
1161 7 : }
1162 10 : break;
1163 : default:
1164 0 : assert(false);
1165 : break;
1166 : }
1167 10 : }
1168 32 : io_path = nvme_io_path_get_next(nbdev_ch, io_path);
1169 32 : } while (io_path != start);
1170 :
1171 20 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
1172 : /* We come here only if there is no optimized path. Cache even non_optimized
1173 : * path for load balance across multiple non_optimized paths.
1174 : */
1175 1 : nbdev_ch->current_io_path = non_optimized;
1176 1 : }
1177 :
1178 20 : return non_optimized;
1179 67 : }
1180 :
1181 : static struct nvme_io_path *
1182 4 : _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
1183 : {
1184 4 : struct nvme_io_path *io_path;
1185 4 : struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
1186 4 : uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
1187 4 : uint32_t num_outstanding_reqs;
1188 :
1189 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1190 12 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1191 : /* The device is currently resetting. */
1192 0 : continue;
1193 : }
1194 :
1195 12 : if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
1196 0 : continue;
1197 : }
1198 :
1199 12 : num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
1200 12 : switch (io_path->nvme_ns->ana_state) {
1201 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1202 6 : if (num_outstanding_reqs < opt_min_qd) {
1203 5 : opt_min_qd = num_outstanding_reqs;
1204 5 : optimized = io_path;
1205 5 : }
1206 6 : break;
1207 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1208 3 : if (num_outstanding_reqs < non_opt_min_qd) {
1209 3 : non_opt_min_qd = num_outstanding_reqs;
1210 3 : non_optimized = io_path;
1211 3 : }
1212 3 : break;
1213 : default:
1214 3 : break;
1215 : }
1216 12 : }
1217 :
1218 : /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
1219 4 : if (optimized != NULL) {
1220 3 : return optimized;
1221 : }
1222 :
1223 1 : return non_optimized;
1224 4 : }
1225 :
1226 : static inline struct nvme_io_path *
1227 105 : bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1228 : {
1229 105 : if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
1230 41 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
1231 31 : return nbdev_ch->current_io_path;
1232 10 : } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1233 10 : if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
1234 3 : return nbdev_ch->current_io_path;
1235 : }
1236 7 : nbdev_ch->rr_counter = 0;
1237 7 : }
1238 7 : }
1239 :
1240 85 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
1241 14 : nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1242 67 : return _bdev_nvme_find_io_path(nbdev_ch);
1243 : } else {
1244 4 : return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
1245 : }
1246 105 : }
1247 :
1248 : /* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
1249 : * or false otherwise.
1250 : *
1251 : * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
1252 : * is likely to be non-accessible now but may become accessible.
1253 : *
1254 : * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
1255 : * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
1256 : * when starting to reset it but it is set to failed when the reset failed. Hence, if
1257 : * a ctrlr is unfailed, it is likely that it works fine or is resetting.
1258 : */
1259 : static bool
1260 15 : any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
1261 : {
1262 15 : struct nvme_io_path *io_path;
1263 :
1264 15 : if (nbdev_ch->resetting) {
1265 1 : return false;
1266 : }
1267 :
1268 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1269 14 : if (io_path->nvme_ns->ana_transition_timedout) {
1270 0 : continue;
1271 : }
1272 :
1273 23 : if (nvme_qpair_is_connected(io_path->qpair) ||
1274 9 : !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
1275 12 : return true;
1276 : }
1277 2 : }
1278 :
1279 2 : return false;
1280 15 : }
1281 :
1282 : static void
1283 14 : bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
1284 : {
1285 14 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1286 14 : struct spdk_io_channel *ch;
1287 :
1288 14 : if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
1289 3 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
1290 3 : } else {
1291 11 : ch = spdk_io_channel_from_ctx(nbdev_ch);
1292 11 : bdev_nvme_submit_request(ch, bdev_io);
1293 : }
1294 14 : }
1295 :
1296 : static int
1297 14 : bdev_nvme_retry_ios(void *arg)
1298 : {
1299 14 : struct nvme_bdev_channel *nbdev_ch = arg;
1300 14 : struct nvme_bdev_io *bio, *tmp_bio;
1301 14 : uint64_t now, delay_us;
1302 :
1303 14 : now = spdk_get_ticks();
1304 :
1305 28 : TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
1306 15 : if (bio->retry_ticks > now) {
1307 1 : break;
1308 : }
1309 :
1310 14 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1311 :
1312 14 : bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
1313 14 : }
1314 :
1315 14 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1316 :
1317 14 : bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
1318 14 : if (bio != NULL) {
1319 4 : delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
1320 :
1321 4 : nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
1322 : delay_us);
1323 4 : }
1324 :
1325 14 : return SPDK_POLLER_BUSY;
1326 14 : }
1327 :
1328 : static void
1329 16 : bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
1330 : struct nvme_bdev_io *bio, uint64_t delay_ms)
1331 : {
1332 16 : struct nvme_bdev_io *tmp_bio;
1333 :
1334 16 : bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
1335 :
1336 16 : TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
1337 1 : if (tmp_bio->retry_ticks <= bio->retry_ticks) {
1338 1 : TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
1339 : retry_link);
1340 1 : return;
1341 : }
1342 0 : }
1343 :
1344 : /* No earlier I/Os were found. This I/O must be the new head. */
1345 15 : TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);
1346 :
1347 15 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1348 :
1349 15 : nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
1350 : delay_ms * 1000ULL);
1351 16 : }
1352 :
1353 : static void
1354 58 : bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
1355 : {
1356 58 : struct nvme_bdev_io *bio, *tmp_bio;
1357 :
1358 59 : TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
1359 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1360 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1361 1 : }
1362 :
1363 58 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1364 58 : }
1365 :
1366 : static int
1367 6 : bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
1368 : struct nvme_bdev_io *bio_to_abort)
1369 : {
1370 6 : struct nvme_bdev_io *bio;
1371 :
1372 6 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
1373 1 : if (bio == bio_to_abort) {
1374 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1375 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1376 1 : return 0;
1377 : }
1378 0 : }
1379 :
1380 5 : return -ENOENT;
1381 6 : }
1382 :
1383 : static void
1384 12 : bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
1385 : {
1386 12 : struct nvme_bdev *nbdev;
1387 12 : uint16_t sct, sc;
1388 :
1389 12 : assert(spdk_nvme_cpl_is_error(cpl));
1390 :
1391 12 : nbdev = bdev_io->bdev->ctxt;
1392 :
1393 12 : if (nbdev->err_stat == NULL) {
1394 12 : return;
1395 : }
1396 :
1397 0 : sct = cpl->status.sct;
1398 0 : sc = cpl->status.sc;
1399 :
1400 0 : pthread_mutex_lock(&nbdev->mutex);
1401 :
1402 0 : nbdev->err_stat->status_type[sct]++;
1403 0 : switch (sct) {
1404 : case SPDK_NVME_SCT_GENERIC:
1405 : case SPDK_NVME_SCT_COMMAND_SPECIFIC:
1406 : case SPDK_NVME_SCT_MEDIA_ERROR:
1407 : case SPDK_NVME_SCT_PATH:
1408 0 : nbdev->err_stat->status[sct][sc]++;
1409 0 : break;
1410 : default:
1411 0 : break;
1412 : }
1413 :
1414 0 : pthread_mutex_unlock(&nbdev->mutex);
1415 12 : }
1416 :
1417 : static inline void
1418 20 : bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
1419 : {
1420 20 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1421 20 : uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
1422 20 : uint32_t blocklen = bdev_io->bdev->blocklen;
1423 20 : struct spdk_bdev_io_stat *stat;
1424 20 : uint64_t tsc_diff;
1425 :
1426 20 : if (bio->io_path->stat == NULL) {
1427 20 : return;
1428 : }
1429 :
1430 0 : tsc_diff = spdk_get_ticks() - bio->submit_tsc;
1431 0 : stat = bio->io_path->stat;
1432 :
1433 0 : switch (bdev_io->type) {
1434 : case SPDK_BDEV_IO_TYPE_READ:
1435 0 : stat->bytes_read += num_blocks * blocklen;
1436 0 : stat->num_read_ops++;
1437 0 : stat->read_latency_ticks += tsc_diff;
1438 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1439 0 : stat->max_read_latency_ticks = tsc_diff;
1440 0 : }
1441 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1442 0 : stat->min_read_latency_ticks = tsc_diff;
1443 0 : }
1444 0 : break;
1445 : case SPDK_BDEV_IO_TYPE_WRITE:
1446 0 : stat->bytes_written += num_blocks * blocklen;
1447 0 : stat->num_write_ops++;
1448 0 : stat->write_latency_ticks += tsc_diff;
1449 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1450 0 : stat->max_write_latency_ticks = tsc_diff;
1451 0 : }
1452 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1453 0 : stat->min_write_latency_ticks = tsc_diff;
1454 0 : }
1455 0 : break;
1456 : case SPDK_BDEV_IO_TYPE_UNMAP:
1457 0 : stat->bytes_unmapped += num_blocks * blocklen;
1458 0 : stat->num_unmap_ops++;
1459 0 : stat->unmap_latency_ticks += tsc_diff;
1460 0 : if (stat->max_unmap_latency_ticks < tsc_diff) {
1461 0 : stat->max_unmap_latency_ticks = tsc_diff;
1462 0 : }
1463 0 : if (stat->min_unmap_latency_ticks > tsc_diff) {
1464 0 : stat->min_unmap_latency_ticks = tsc_diff;
1465 0 : }
1466 0 : break;
1467 : case SPDK_BDEV_IO_TYPE_ZCOPY:
1468 : /* Track the data in the start phase only */
1469 0 : if (!bdev_io->u.bdev.zcopy.start) {
1470 0 : break;
1471 : }
1472 0 : if (bdev_io->u.bdev.zcopy.populate) {
1473 0 : stat->bytes_read += num_blocks * blocklen;
1474 0 : stat->num_read_ops++;
1475 0 : stat->read_latency_ticks += tsc_diff;
1476 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1477 0 : stat->max_read_latency_ticks = tsc_diff;
1478 0 : }
1479 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1480 0 : stat->min_read_latency_ticks = tsc_diff;
1481 0 : }
1482 0 : } else {
1483 0 : stat->bytes_written += num_blocks * blocklen;
1484 0 : stat->num_write_ops++;
1485 0 : stat->write_latency_ticks += tsc_diff;
1486 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1487 0 : stat->max_write_latency_ticks = tsc_diff;
1488 0 : }
1489 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1490 0 : stat->min_write_latency_ticks = tsc_diff;
1491 0 : }
1492 : }
1493 0 : break;
1494 : case SPDK_BDEV_IO_TYPE_COPY:
1495 0 : stat->bytes_copied += num_blocks * blocklen;
1496 0 : stat->num_copy_ops++;
1497 0 : stat->copy_latency_ticks += tsc_diff;
1498 0 : if (stat->max_copy_latency_ticks < tsc_diff) {
1499 0 : stat->max_copy_latency_ticks = tsc_diff;
1500 0 : }
1501 0 : if (stat->min_copy_latency_ticks > tsc_diff) {
1502 0 : stat->min_copy_latency_ticks = tsc_diff;
1503 0 : }
1504 0 : break;
1505 : default:
1506 0 : break;
1507 : }
1508 20 : }
1509 :
1510 : static bool
1511 11 : bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
1512 : const struct spdk_nvme_cpl *cpl,
1513 : struct nvme_bdev_channel *nbdev_ch,
1514 : uint64_t *_delay_ms)
1515 : {
1516 11 : struct nvme_io_path *io_path = bio->io_path;
1517 11 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
1518 11 : const struct spdk_nvme_ctrlr_data *cdata;
1519 :
1520 15 : if (spdk_nvme_cpl_is_path_error(cpl) ||
1521 5 : spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
1522 0 : !nvme_io_path_is_available(io_path) ||
1523 4 : !nvme_ctrlr_is_available(nvme_ctrlr)) {
1524 15 : bdev_nvme_clear_current_io_path(nbdev_ch);
1525 15 : bio->io_path = NULL;
1526 3 : if (spdk_nvme_cpl_is_ana_error(cpl)) {
1527 1 : if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
1528 1 : io_path->nvme_ns->ana_state_updating = true;
1529 1 : }
1530 1 : }
1531 3 : if (!any_io_path_may_become_available(nbdev_ch)) {
1532 0 : return false;
1533 : }
1534 3 : *_delay_ms = 0;
1535 3 : } else {
1536 4 : bio->retry_count++;
1537 :
1538 4 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
1539 :
1540 4 : if (cpl->status.crd != 0) {
1541 1 : *_delay_ms = cdata->crdt[cpl->status.crd] * 100;
1542 1 : } else {
1543 3 : *_delay_ms = 0;
1544 : }
1545 : }
1546 :
1547 7 : return true;
1548 7 : }
1549 :
1550 : static inline void
1551 40 : bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
1552 : const struct spdk_nvme_cpl *cpl)
1553 : {
1554 40 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1555 40 : struct nvme_bdev_channel *nbdev_ch;
1556 40 : uint64_t delay_ms;
1557 :
1558 40 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1559 :
1560 40 : if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
1561 20 : bdev_nvme_update_io_path_stat(bio);
1562 20 : goto complete;
1563 : }
1564 :
1565 : /* Update error counts before deciding if retry is needed.
1566 : * Hence, error counts may be more than the number of I/O errors.
1567 : */
1568 20 : bdev_nvme_update_nvme_error_stat(bdev_io, cpl);
1569 :
1570 27 : if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
1571 2 : (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
1572 23 : goto complete;
1573 : }
1574 :
1575 : /* At this point we don't know whether the sequence was successfully executed or not, so we
1576 : * cannot retry the IO */
1577 7 : if (bdev_io->u.bdev.accel_sequence != NULL) {
1578 0 : goto complete;
1579 : }
1580 :
1581 7 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1582 :
1583 7 : if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
1584 7 : bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
1585 7 : return;
1586 : }
1587 :
1588 : complete:
1589 25 : bio->retry_count = 0;
1590 25 : bio->submit_tsc = 0;
1591 25 : bdev_io->u.bdev.accel_sequence = NULL;
1592 25 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
1593 32 : }
1594 :
1595 : static inline void
1596 13 : bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
1597 : {
1598 13 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1599 13 : struct nvme_bdev_channel *nbdev_ch;
1600 13 : enum spdk_bdev_io_status io_status;
1601 :
1602 13 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1603 :
1604 13 : switch (rc) {
1605 : case 0:
1606 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1607 1 : break;
1608 : case -ENOMEM:
1609 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1610 0 : break;
1611 : case -ENXIO:
1612 15 : if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
1613 12 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1614 :
1615 12 : bdev_nvme_clear_current_io_path(nbdev_ch);
1616 12 : bio->io_path = NULL;
1617 :
1618 12 : if (any_io_path_may_become_available(nbdev_ch)) {
1619 9 : bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1620 9 : return;
1621 : }
1622 3 : }
1623 :
1624 : /* fallthrough */
1625 : default:
1626 3 : spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
1627 3 : bdev_io->u.bdev.accel_sequence = NULL;
1628 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1629 3 : break;
1630 : }
1631 :
1632 4 : bio->retry_count = 0;
1633 4 : bio->submit_tsc = 0;
1634 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1635 13 : }
1636 :
1637 : static inline void
1638 4 : bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
1639 : {
1640 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1641 4 : enum spdk_bdev_io_status io_status;
1642 :
1643 4 : switch (rc) {
1644 : case 0:
1645 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1646 1 : break;
1647 : case -ENOMEM:
1648 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1649 0 : break;
1650 1 : case -ENXIO:
1651 : /* fallthrough */
1652 : default:
1653 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1654 3 : break;
1655 : }
1656 :
1657 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1658 4 : }
1659 :
1660 : static void
1661 3 : bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr,
1662 : void *ctx, int status)
1663 : {
1664 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1665 :
1666 3 : assert(nvme_ctrlr->io_path_cache_clearing == true);
1667 3 : nvme_ctrlr->io_path_cache_clearing = false;
1668 :
1669 3 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
1670 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1671 3 : return;
1672 : }
1673 :
1674 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1675 :
1676 0 : nvme_ctrlr_unregister(nvme_ctrlr);
1677 3 : }
1678 :
1679 : static void
1680 416 : _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
1681 : {
1682 416 : struct nvme_io_path *io_path;
1683 :
1684 651 : TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
1685 235 : if (io_path->nbdev_ch == NULL) {
1686 72 : continue;
1687 : }
1688 163 : bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
1689 163 : }
1690 416 : }
1691 :
1692 : static void
1693 1 : bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i,
1694 : struct nvme_ctrlr *nvme_ctrlr,
1695 : struct nvme_ctrlr_channel *ctrlr_ch,
1696 : void *ctx)
1697 : {
1698 1 : assert(ctrlr_ch->qpair != NULL);
1699 :
1700 1 : _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);
1701 :
1702 1 : nvme_ctrlr_for_each_channel_continue(i, 0);
1703 1 : }
1704 :
1705 : static void
1706 3 : bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
1707 : {
1708 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1709 6 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
1710 3 : nvme_ctrlr->io_path_cache_clearing) {
1711 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1712 0 : return;
1713 : }
1714 :
1715 3 : nvme_ctrlr->io_path_cache_clearing = true;
1716 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1717 :
1718 3 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
1719 : bdev_nvme_clear_io_path_cache,
1720 : NULL,
1721 : bdev_nvme_clear_io_path_caches_done);
1722 3 : }
1723 :
1724 : static struct nvme_qpair *
1725 121 : nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
1726 : {
1727 121 : struct nvme_qpair *nvme_qpair;
1728 :
1729 138 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1730 138 : if (nvme_qpair->qpair == qpair) {
1731 121 : break;
1732 : }
1733 17 : }
1734 :
1735 242 : return nvme_qpair;
1736 121 : }
1737 :
1738 : static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);
1739 :
1740 : static void
1741 121 : bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
1742 : {
1743 121 : struct nvme_poll_group *group = poll_group_ctx;
1744 121 : struct nvme_qpair *nvme_qpair;
1745 121 : struct nvme_ctrlr *nvme_ctrlr;
1746 121 : struct nvme_ctrlr_channel *ctrlr_ch;
1747 121 : int status;
1748 :
1749 121 : nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
1750 121 : if (nvme_qpair == NULL) {
1751 0 : return;
1752 : }
1753 :
1754 121 : if (nvme_qpair->qpair != NULL) {
1755 121 : spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
1756 121 : nvme_qpair->qpair = NULL;
1757 121 : }
1758 :
1759 121 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1760 :
1761 121 : nvme_ctrlr = nvme_qpair->ctrlr;
1762 121 : ctrlr_ch = nvme_qpair->ctrlr_ch;
1763 :
1764 121 : if (ctrlr_ch != NULL) {
1765 74 : if (ctrlr_ch->reset_iter != NULL) {
1766 : /* We are in a full reset sequence. */
1767 69 : if (ctrlr_ch->connect_poller != NULL) {
1768 : /* qpair was failed to connect. Abort the reset sequence. */
1769 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr,
1770 : "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
1771 : qpair);
1772 0 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
1773 0 : status = -1;
1774 0 : } else {
1775 : /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */
1776 69 : NVME_CTRLR_INFOLOG(nvme_ctrlr,
1777 : "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
1778 : qpair);
1779 69 : status = 0;
1780 : }
1781 69 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status);
1782 69 : ctrlr_ch->reset_iter = NULL;
1783 69 : } else {
1784 : /* qpair was disconnected unexpectedly. Reset controller for recovery. */
1785 5 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. reset controller.\n",
1786 : qpair);
1787 5 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
1788 : }
1789 74 : } else {
1790 : /* In this case, ctrlr_channel is already deleted. */
1791 47 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. delete nvme_qpair.\n",
1792 : qpair);
1793 47 : nvme_qpair_delete(nvme_qpair);
1794 : }
1795 121 : }
1796 :
1797 : static void
1798 0 : bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
1799 : {
1800 0 : struct nvme_qpair *nvme_qpair;
1801 :
1802 0 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1803 0 : if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
1804 0 : continue;
1805 : }
1806 :
1807 0 : if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
1808 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1809 0 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1810 0 : }
1811 0 : }
1812 0 : }
1813 :
1814 : static int
1815 1209 : bdev_nvme_poll(void *arg)
1816 : {
1817 1209 : struct nvme_poll_group *group = arg;
1818 1209 : int64_t num_completions;
1819 :
1820 1209 : if (group->collect_spin_stat && group->start_ticks == 0) {
1821 0 : group->start_ticks = spdk_get_ticks();
1822 0 : }
1823 :
1824 1209 : num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1825 : bdev_nvme_disconnected_qpair_cb);
1826 1209 : if (group->collect_spin_stat) {
1827 0 : if (num_completions > 0) {
1828 0 : if (group->end_ticks != 0) {
1829 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
1830 0 : group->end_ticks = 0;
1831 0 : }
1832 0 : group->start_ticks = 0;
1833 0 : } else {
1834 0 : group->end_ticks = spdk_get_ticks();
1835 : }
1836 0 : }
1837 :
1838 1209 : if (spdk_unlikely(num_completions < 0)) {
1839 0 : bdev_nvme_check_io_qpairs(group);
1840 0 : }
1841 :
1842 2418 : return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1843 1209 : }
1844 :
1845 : static int bdev_nvme_poll_adminq(void *arg);
1846 :
1847 : static void
1848 142 : bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
1849 : {
1850 142 : if (spdk_interrupt_mode_is_enabled()) {
1851 0 : return;
1852 : }
1853 :
1854 142 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
1855 :
1856 142 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
1857 : nvme_ctrlr, new_period_us);
1858 142 : }
1859 :
1860 : static int
1861 191 : bdev_nvme_poll_adminq(void *arg)
1862 : {
1863 191 : int32_t rc;
1864 191 : struct nvme_ctrlr *nvme_ctrlr = arg;
1865 191 : nvme_ctrlr_disconnected_cb disconnected_cb;
1866 :
1867 191 : assert(nvme_ctrlr != NULL);
1868 :
1869 191 : rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1870 191 : if (rc < 0) {
1871 86 : disconnected_cb = nvme_ctrlr->disconnected_cb;
1872 86 : nvme_ctrlr->disconnected_cb = NULL;
1873 :
1874 86 : if (disconnected_cb != NULL) {
1875 71 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
1876 71 : g_opts.nvme_adminq_poll_period_us);
1877 71 : disconnected_cb(nvme_ctrlr);
1878 71 : } else {
1879 15 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
1880 : }
1881 191 : } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
1882 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1883 0 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
1884 0 : }
1885 :
1886 382 : return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1887 191 : }
1888 :
1889 : static void
1890 39 : nvme_bdev_free(void *io_device)
1891 : {
1892 39 : struct nvme_bdev *nvme_disk = io_device;
1893 :
1894 39 : pthread_mutex_destroy(&nvme_disk->mutex);
1895 39 : free(nvme_disk->disk.name);
1896 39 : free(nvme_disk->err_stat);
1897 39 : free(nvme_disk);
1898 39 : }
1899 :
1900 : static int
1901 38 : bdev_nvme_destruct(void *ctx)
1902 : {
1903 38 : struct nvme_bdev *nvme_disk = ctx;
1904 38 : struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1905 :
1906 : SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);
1907 :
1908 77 : TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
1909 39 : pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1910 :
1911 39 : nvme_ns->bdev = NULL;
1912 :
1913 39 : assert(nvme_ns->id > 0);
1914 :
1915 39 : if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1916 0 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1917 :
1918 0 : nvme_ctrlr_put_ref(nvme_ns->ctrlr);
1919 0 : nvme_ns_free(nvme_ns);
1920 0 : } else {
1921 39 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1922 : }
1923 39 : }
1924 :
1925 38 : pthread_mutex_lock(&g_bdev_nvme_mutex);
1926 38 : TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
1927 38 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
1928 :
1929 38 : spdk_io_device_unregister(nvme_disk, nvme_bdev_free);
1930 :
1931 38 : return 0;
1932 38 : }
1933 :
1934 : static int
1935 122 : bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
1936 : {
1937 122 : struct nvme_ctrlr *nvme_ctrlr;
1938 122 : struct spdk_nvme_io_qpair_opts opts;
1939 122 : struct spdk_nvme_qpair *qpair;
1940 122 : int rc;
1941 :
1942 122 : nvme_ctrlr = nvme_qpair->ctrlr;
1943 :
1944 122 : spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1945 122 : opts.create_only = true;
1946 : /* In interrupt mode qpairs must be created in sync mode, else it will never be connected.
1947 : * delay_cmd_submit must be false as in interrupt mode requests cannot be submitted in
1948 : * completion context.
1949 : */
1950 122 : if (!spdk_interrupt_mode_is_enabled()) {
1951 122 : opts.async_mode = true;
1952 122 : opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1953 122 : }
1954 122 : opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1955 122 : g_opts.io_queue_requests = opts.io_queue_requests;
1956 :
1957 122 : qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1958 122 : if (qpair == NULL) {
1959 0 : return -1;
1960 : }
1961 :
1962 : SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
1963 : spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));
1964 :
1965 122 : assert(nvme_qpair->group != NULL);
1966 :
1967 122 : rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
1968 122 : if (rc != 0) {
1969 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to begin polling on NVMe Channel.\n");
1970 0 : goto err;
1971 : }
1972 :
1973 122 : rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1974 122 : if (rc != 0) {
1975 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to connect I/O qpair.\n");
1976 0 : goto err;
1977 : }
1978 :
1979 122 : nvme_qpair->qpair = qpair;
1980 :
1981 122 : if (!g_opts.disable_auto_failback) {
1982 85 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1983 85 : }
1984 :
1985 122 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Connecting qpair %p:%u started.\n",
1986 : qpair, spdk_nvme_qpair_get_id(qpair));
1987 :
1988 122 : return 0;
1989 :
1990 : err:
1991 0 : spdk_nvme_ctrlr_free_io_qpair(qpair);
1992 :
1993 0 : return rc;
1994 122 : }
1995 :
1996 : static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);
1997 :
1998 : static void
1999 71 : bdev_nvme_complete_pending_resets(struct nvme_ctrlr *nvme_ctrlr, bool success)
2000 : {
2001 71 : int rc = 0;
2002 71 : struct nvme_bdev_io *bio;
2003 :
2004 71 : if (!success) {
2005 33 : rc = -1;
2006 33 : }
2007 :
2008 83 : while (!TAILQ_EMPTY(&nvme_ctrlr->pending_resets)) {
2009 12 : bio = TAILQ_FIRST(&nvme_ctrlr->pending_resets);
2010 12 : TAILQ_REMOVE(&nvme_ctrlr->pending_resets, bio, retry_link);
2011 :
2012 12 : bdev_nvme_reset_io_continue(bio, rc);
2013 : }
2014 71 : }
2015 :
2016 : /* This function marks the current trid as failed by storing the current ticks
2017 : * and then sets the next trid to the active trid within a controller if exists.
2018 : *
2019 : * The purpose of the boolean return value is to request the caller to disconnect
2020 : * the current trid now to try connecting the next trid.
2021 : */
2022 : static bool
2023 62 : bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
2024 : {
2025 62 : struct nvme_path_id *path_id, *next_path;
2026 62 : int rc __attribute__((unused));
2027 :
2028 62 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
2029 62 : assert(path_id);
2030 62 : assert(path_id == nvme_ctrlr->active_path_id);
2031 62 : next_path = TAILQ_NEXT(path_id, link);
2032 :
2033 : /* Update the last failed time. It means the trid is failed if its last
2034 : * failed time is non-zero.
2035 : */
2036 62 : path_id->last_failed_tsc = spdk_get_ticks();
2037 :
2038 62 : if (next_path == NULL) {
2039 : /* There is no alternate trid within a controller. */
2040 51 : return false;
2041 : }
2042 :
2043 11 : if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
2044 : /* Connect is not retried in a controller reset sequence. Connecting
2045 : * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
2046 : */
2047 3 : return false;
2048 : }
2049 :
2050 8 : assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
2051 :
2052 8 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Start failover from %s:%s to %s:%s\n",
2053 : path_id->trid.traddr, path_id->trid.trsvcid,
2054 : next_path->trid.traddr, next_path->trid.trsvcid);
2055 :
2056 8 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
2057 8 : nvme_ctrlr->active_path_id = next_path;
2058 8 : rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
2059 8 : assert(rc == 0);
2060 8 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
2061 8 : if (!remove) {
2062 : /** Shuffle the old trid to the end of the list and use the new one.
2063 : * Allows for round robin through multiple connections.
2064 : */
2065 6 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
2066 6 : } else {
2067 2 : free(path_id);
2068 : }
2069 :
2070 8 : if (start || next_path->last_failed_tsc == 0) {
2071 : /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed
2072 : * or used yet. Try the next trid now.
2073 : */
2074 7 : return true;
2075 : }
2076 :
2077 2 : if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
2078 1 : nvme_ctrlr->opts.reconnect_delay_sec) {
2079 : /* Enough backoff passed since the next trid failed. Try the next trid now. */
2080 0 : return true;
2081 : }
2082 :
2083 : /* The next trid will be tried after reconnect_delay_sec seconds. */
2084 1 : return false;
2085 62 : }
2086 :
2087 : static bool
2088 89 : bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
2089 : {
2090 89 : int32_t elapsed;
2091 :
2092 126 : if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
2093 37 : nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
2094 63 : return false;
2095 : }
2096 :
2097 26 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2098 26 : if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
2099 6 : return true;
2100 : } else {
2101 20 : return false;
2102 : }
2103 89 : }
2104 :
2105 : static bool
2106 12 : bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
2107 : {
2108 12 : uint32_t elapsed;
2109 :
2110 12 : if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
2111 8 : return false;
2112 : }
2113 :
2114 4 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2115 4 : if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
2116 2 : return true;
2117 : } else {
2118 2 : return false;
2119 : }
2120 12 : }
2121 :
2122 : static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);
2123 :
2124 : static void
2125 72 : nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
2126 : {
2127 72 : int rc;
2128 :
2129 72 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting ctrlr.\n");
2130 :
2131 72 : rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
2132 72 : if (rc != 0) {
2133 1 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "disconnecting ctrlr failed.\n");
2134 :
2135 : /* Disconnect fails if ctrlr is already resetting or removed. In this case,
2136 : * fail the reset sequence immediately.
2137 : */
2138 1 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2139 1 : return;
2140 : }
2141 :
2142 : /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
2143 : * Set callback here to execute the specified operation after ctrlr is really disconnected.
2144 : */
2145 71 : assert(nvme_ctrlr->disconnected_cb == NULL);
2146 71 : nvme_ctrlr->disconnected_cb = cb_fn;
2147 :
2148 : /* During disconnection, reduce the period to poll adminq more often. */
2149 71 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
2150 72 : }
2151 :
/* Follow-up operation chosen by bdev_nvme_check_op_after_reset() once a reset
 * (or disable) sequence has finished.
 */
typedef enum bdev_nvme_op_after_reset {
	/* Nothing more to do. */
	OP_NONE,
	/* A destruct was pending; unregister the controller now. */
	OP_COMPLETE_PENDING_DESTRUCT,
	/* ctrlr_loss_timeout_sec expired; delete the controller. */
	OP_DESTRUCT,
	/* Disconnect and retry the connection after reconnect_delay_sec. */
	OP_DELAYED_RECONNECT,
	/* A failover was requested during the reset; start it now. */
	OP_FAILOVER,
} _bdev_nvme_op_after_reset;
2161 :
2162 : static _bdev_nvme_op_after_reset
2163 71 : bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
2164 : {
2165 71 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
2166 : /* Complete pending destruct after reset completes. */
2167 0 : return OP_COMPLETE_PENDING_DESTRUCT;
2168 71 : } else if (nvme_ctrlr->pending_failover) {
2169 3 : nvme_ctrlr->pending_failover = false;
2170 3 : nvme_ctrlr->reset_start_tsc = 0;
2171 3 : return OP_FAILOVER;
2172 68 : } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
2173 54 : nvme_ctrlr->reset_start_tsc = 0;
2174 54 : return OP_NONE;
2175 14 : } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2176 2 : return OP_DESTRUCT;
2177 : } else {
2178 12 : if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
2179 2 : nvme_ctrlr->fast_io_fail_timedout = true;
2180 2 : }
2181 12 : return OP_DELAYED_RECONNECT;
2182 : }
2183 71 : }
2184 :
2185 : static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
2186 : static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
2187 :
2188 : static int
2189 9 : bdev_nvme_reconnect_delay_timer_expired(void *ctx)
2190 : {
2191 9 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2192 :
2193 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name);
2194 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2195 :
2196 9 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2197 :
2198 9 : if (!nvme_ctrlr->reconnect_is_delayed) {
2199 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2200 0 : return SPDK_POLLER_BUSY;
2201 : }
2202 :
2203 9 : nvme_ctrlr->reconnect_is_delayed = false;
2204 :
2205 9 : if (nvme_ctrlr->destruct) {
2206 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2207 0 : return SPDK_POLLER_BUSY;
2208 : }
2209 :
2210 9 : assert(nvme_ctrlr->resetting == false);
2211 9 : nvme_ctrlr->resetting = true;
2212 :
2213 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2214 :
2215 9 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2216 :
2217 9 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2218 9 : return SPDK_POLLER_BUSY;
2219 9 : }
2220 :
2221 : static void
2222 12 : bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
2223 : {
2224 12 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2225 :
2226 12 : assert(nvme_ctrlr->reconnect_is_delayed == false);
2227 12 : nvme_ctrlr->reconnect_is_delayed = true;
2228 :
2229 12 : assert(nvme_ctrlr->reconnect_delay_timer == NULL);
2230 12 : nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
2231 : nvme_ctrlr,
2232 : nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC);
2233 12 : }
2234 :
2235 : static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr);
2236 :
2237 : static void
2238 71 : bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
2239 : {
2240 71 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2241 71 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2242 71 : enum bdev_nvme_op_after_reset op_after_reset;
2243 :
2244 71 : assert(nvme_ctrlr->thread == spdk_get_thread());
2245 :
2246 71 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2247 71 : if (!success) {
2248 : /* Connecting the active trid failed. Set the next alternate trid to the
2249 : * active trid if it exists.
2250 : */
2251 35 : if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) {
2252 : /* The next alternate trid exists and is ready to try. Try it now. */
2253 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2254 :
2255 2 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Try the next alternate trid %s:%s now.\n",
2256 : nvme_ctrlr->active_path_id->trid.traddr,
2257 : nvme_ctrlr->active_path_id->trid.trsvcid);
2258 :
2259 2 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2260 2 : return;
2261 : }
2262 :
2263 : /* We came here if there is no alternate trid or if the next trid exists but
2264 : * is not ready to try. We will try the active trid after reconnect_delay_sec
2265 : * seconds if it is non-zero or at the next reset call otherwise.
2266 : */
2267 33 : } else {
2268 : /* Connecting the active trid succeeded. Clear the last failed time because it
2269 : * means the trid is failed if its last failed time is non-zero.
2270 : */
2271 36 : nvme_ctrlr->active_path_id->last_failed_tsc = 0;
2272 : }
2273 :
2274 69 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Clear pending resets.\n");
2275 :
2276 : /* Make sure we clear any pending resets before returning. */
2277 69 : bdev_nvme_complete_pending_resets(nvme_ctrlr, success);
2278 :
2279 69 : if (!success) {
2280 33 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Resetting controller failed.\n");
2281 33 : } else {
2282 36 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Resetting controller successful.\n");
2283 : }
2284 :
2285 69 : nvme_ctrlr->resetting = false;
2286 69 : nvme_ctrlr->dont_retry = false;
2287 69 : nvme_ctrlr->in_failover = false;
2288 :
2289 69 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2290 69 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2291 :
2292 69 : op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);
2293 69 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2294 :
2295 : /* Delay callbacks when the next operation is a failover. */
2296 69 : if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) {
2297 17 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1);
2298 17 : }
2299 :
2300 69 : switch (op_after_reset) {
2301 : case OP_COMPLETE_PENDING_DESTRUCT:
2302 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2303 0 : break;
2304 : case OP_DESTRUCT:
2305 2 : bdev_nvme_delete_ctrlr(nvme_ctrlr, false);
2306 2 : remove_discovery_entry(nvme_ctrlr);
2307 2 : break;
2308 : case OP_DELAYED_RECONNECT:
2309 12 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
2310 12 : break;
2311 : case OP_FAILOVER:
2312 3 : nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn;
2313 3 : nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg;
2314 3 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
2315 3 : break;
2316 : default:
2317 52 : break;
2318 : }
2319 71 : }
2320 :
/* Completion callback of the qpair cleanup started after qpair creation
 * failed: report the reset as failed. ctx and status are not used.
 */
static void
bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
{
	const bool reset_succeeded = false;

	bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, reset_succeeded);
}
2326 :
2327 : static void
2328 104 : bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i,
2329 : struct nvme_ctrlr *nvme_ctrlr,
2330 : struct nvme_ctrlr_channel *ctrlr_ch, void *ctx)
2331 : {
2332 104 : struct nvme_qpair *nvme_qpair;
2333 104 : struct spdk_nvme_qpair *qpair;
2334 :
2335 104 : nvme_qpair = ctrlr_ch->qpair;
2336 104 : assert(nvme_qpair != NULL);
2337 :
2338 104 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
2339 :
2340 104 : qpair = nvme_qpair->qpair;
2341 104 : if (qpair != NULL) {
2342 69 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting qpair %p:%u.\n",
2343 : qpair, spdk_nvme_qpair_get_id(qpair));
2344 :
2345 69 : if (nvme_qpair->ctrlr->dont_retry) {
2346 53 : spdk_nvme_qpair_set_abort_dnr(qpair, true);
2347 53 : }
2348 69 : spdk_nvme_ctrlr_disconnect_io_qpair(qpair);
2349 :
2350 : /* The current full reset sequence will move to the next
2351 : * ctrlr_channel after the qpair is actually disconnected.
2352 : */
2353 69 : assert(ctrlr_ch->reset_iter == NULL);
2354 69 : ctrlr_ch->reset_iter = i;
2355 69 : } else {
2356 35 : nvme_ctrlr_for_each_channel_continue(i, 0);
2357 : }
2358 104 : }
2359 :
2360 : static void
2361 36 : bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2362 : {
2363 36 : if (status == 0) {
2364 36 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were created after ctrlr reset.\n");
2365 :
2366 36 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true);
2367 36 : } else {
2368 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were failed to create after ctrlr reset.\n");
2369 :
2370 : /* Delete the added qpairs and quiesce ctrlr to make the states clean. */
2371 0 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2372 : bdev_nvme_reset_destroy_qpair,
2373 : NULL,
2374 : bdev_nvme_reset_create_qpairs_failed);
2375 : }
2376 36 : }
2377 :
2378 : static int
2379 61 : bdev_nvme_reset_check_qpair_connected(void *ctx)
2380 : {
2381 61 : struct nvme_ctrlr_channel *ctrlr_ch = ctx;
2382 61 : struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
2383 61 : struct spdk_nvme_qpair *qpair;
2384 :
2385 61 : if (ctrlr_ch->reset_iter == NULL) {
2386 : /* qpair was already failed to connect and the reset sequence is being aborted. */
2387 0 : assert(ctrlr_ch->connect_poller == NULL);
2388 0 : assert(nvme_qpair->qpair == NULL);
2389 :
2390 0 : NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr,
2391 : "qpair was already failed to connect. reset is being aborted.\n");
2392 0 : return SPDK_POLLER_BUSY;
2393 : }
2394 :
2395 61 : qpair = nvme_qpair->qpair;
2396 61 : assert(qpair != NULL);
2397 :
2398 61 : if (!spdk_nvme_qpair_is_connected(qpair)) {
2399 0 : return SPDK_POLLER_BUSY;
2400 : }
2401 :
2402 61 : NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, "qpair %p:%u was connected.\n",
2403 : qpair, spdk_nvme_qpair_get_id(qpair));
2404 :
2405 61 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
2406 :
2407 : /* qpair was completed to connect. Move to the next ctrlr_channel */
2408 61 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
2409 61 : ctrlr_ch->reset_iter = NULL;
2410 :
2411 61 : if (!g_opts.disable_auto_failback) {
2412 44 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
2413 44 : }
2414 :
2415 61 : return SPDK_POLLER_BUSY;
2416 61 : }
2417 :
2418 : static void
2419 61 : bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i,
2420 : struct nvme_ctrlr *nvme_ctrlr,
2421 : struct nvme_ctrlr_channel *ctrlr_ch,
2422 : void *ctx)
2423 : {
2424 61 : struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
2425 61 : struct spdk_nvme_qpair *qpair;
2426 61 : int rc = 0;
2427 :
2428 61 : if (nvme_qpair->qpair == NULL) {
2429 61 : rc = bdev_nvme_create_qpair(nvme_qpair);
2430 61 : }
2431 61 : if (rc == 0) {
2432 61 : ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected,
2433 : ctrlr_ch, 0);
2434 :
2435 61 : qpair = nvme_qpair->qpair;
2436 :
2437 61 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start checking qpair %p:%u to be connected.\n",
2438 : qpair, spdk_nvme_qpair_get_id(qpair));
2439 :
2440 : /* The current full reset sequence will move to the next
2441 : * ctrlr_channel after the qpair is actually connected.
2442 : */
2443 61 : assert(ctrlr_ch->reset_iter == NULL);
2444 61 : ctrlr_ch->reset_iter = i;
2445 61 : } else {
2446 0 : nvme_ctrlr_for_each_channel_continue(i, rc);
2447 : }
2448 61 : }
2449 :
2450 : static void
2451 36 : nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr)
2452 : {
2453 36 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2454 36 : struct nvme_ns *nvme_ns;
2455 :
2456 57 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2457 57 : nvme_ns != NULL;
2458 21 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
2459 21 : if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2460 1 : SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id);
2461 : /* NS can be added again. Just nullify nvme_ns->ns. */
2462 1 : nvme_ns->ns = NULL;
2463 1 : }
2464 21 : }
2465 36 : }
2466 :
2467 :
2468 : static int
2469 70 : bdev_nvme_reconnect_ctrlr_poll(void *arg)
2470 : {
2471 70 : struct nvme_ctrlr *nvme_ctrlr = arg;
2472 70 : struct spdk_nvme_transport_id *trid;
2473 70 : int rc = -ETIMEDOUT;
2474 :
2475 70 : if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2476 : /* Mark the ctrlr as failed. The next call to
2477 : * spdk_nvme_ctrlr_reconnect_poll_async() will then
2478 : * do the necessary cleanup and return failure.
2479 : */
2480 2 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
2481 2 : }
2482 :
2483 70 : rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
2484 70 : if (rc == -EAGAIN) {
2485 0 : return SPDK_POLLER_BUSY;
2486 : }
2487 :
2488 70 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
2489 70 : if (rc == 0) {
2490 36 : trid = &nvme_ctrlr->active_path_id->trid;
2491 :
2492 36 : if (spdk_nvme_trtype_is_fabrics(trid->trtype)) {
2493 36 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected to %s:%s. Create qpairs.\n",
2494 : trid->traddr, trid->trsvcid);
2495 36 : } else {
2496 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected. Create qpairs.\n");
2497 : }
2498 :
2499 36 : nvme_ctrlr_check_namespaces(nvme_ctrlr);
2500 :
2501 : /* Recreate all of the I/O queue pairs */
2502 36 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2503 : bdev_nvme_reset_create_qpair,
2504 : NULL,
2505 : bdev_nvme_reset_create_qpairs_done);
2506 36 : } else {
2507 34 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr could not be connected.\n");
2508 :
2509 34 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2510 : }
2511 70 : return SPDK_POLLER_BUSY;
2512 70 : }
2513 :
2514 : static void
2515 70 : bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2516 : {
2517 70 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start reconnecting ctrlr.\n");
2518 :
2519 70 : spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
2520 :
2521 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name);
2522 70 : assert(nvme_ctrlr->reset_detach_poller == NULL);
2523 70 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
2524 : nvme_ctrlr, 0);
2525 70 : }
2526 :
2527 : static void
2528 57 : bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2529 : {
2530 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name);
2531 57 : assert(status == 0);
2532 :
2533 57 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were deleted.\n");
2534 :
2535 57 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2536 0 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2537 0 : } else {
2538 57 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2539 : }
2540 57 : }
2541 :
2542 : static void
2543 57 : bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2544 : {
2545 57 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Delete qpairs for reset.\n");
2546 :
2547 57 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2548 : bdev_nvme_reset_destroy_qpair,
2549 : NULL,
2550 : bdev_nvme_reset_destroy_qpair_done);
2551 57 : }
2552 :
2553 : static void
2554 3 : bdev_nvme_reconnect_ctrlr_now(void *ctx)
2555 : {
2556 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2557 :
2558 3 : assert(nvme_ctrlr->resetting == true);
2559 3 : assert(nvme_ctrlr->thread == spdk_get_thread());
2560 :
2561 3 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2562 :
2563 3 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2564 :
2565 3 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2566 3 : }
2567 :
2568 : static void
2569 57 : _bdev_nvme_reset_ctrlr(void *ctx)
2570 : {
2571 57 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2572 :
2573 57 : assert(nvme_ctrlr->resetting == true);
2574 57 : assert(nvme_ctrlr->thread == spdk_get_thread());
2575 :
2576 57 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2577 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs);
2578 0 : } else {
2579 57 : bdev_nvme_reset_destroy_qpairs(nvme_ctrlr);
2580 : }
2581 57 : }
2582 :
2583 : static int
2584 50 : bdev_nvme_reset_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, spdk_msg_fn *msg_fn)
2585 : {
2586 50 : if (nvme_ctrlr->destruct) {
2587 3 : return -ENXIO;
2588 : }
2589 :
2590 47 : if (nvme_ctrlr->resetting) {
2591 14 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset, already in progress.\n");
2592 14 : return -EBUSY;
2593 : }
2594 :
2595 33 : if (nvme_ctrlr->disabled) {
2596 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset. Controller is disabled.\n");
2597 1 : return -EALREADY;
2598 : }
2599 :
2600 32 : nvme_ctrlr->resetting = true;
2601 32 : nvme_ctrlr->dont_retry = true;
2602 :
2603 32 : if (nvme_ctrlr->reconnect_is_delayed) {
2604 1 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Reconnect is already scheduled.\n");
2605 1 : *msg_fn = bdev_nvme_reconnect_ctrlr_now;
2606 1 : nvme_ctrlr->reconnect_is_delayed = false;
2607 1 : } else {
2608 31 : *msg_fn = _bdev_nvme_reset_ctrlr;
2609 31 : assert(nvme_ctrlr->reset_start_tsc == 0);
2610 : }
2611 :
2612 32 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2613 :
2614 32 : return 0;
2615 50 : }
2616 :
2617 : static int
2618 24 : bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2619 : {
2620 24 : spdk_msg_fn msg_fn;
2621 24 : int rc;
2622 :
2623 24 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2624 24 : rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn);
2625 24 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2626 :
2627 24 : if (rc == 0) {
2628 19 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2629 19 : }
2630 :
2631 48 : return rc;
2632 24 : }
2633 :
2634 : static int
2635 3 : bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2636 : {
2637 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2638 3 : if (nvme_ctrlr->destruct) {
2639 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2640 0 : return -ENXIO;
2641 : }
2642 :
2643 3 : if (nvme_ctrlr->resetting) {
2644 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2645 0 : return -EBUSY;
2646 : }
2647 :
2648 3 : if (!nvme_ctrlr->disabled) {
2649 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2650 1 : return -EALREADY;
2651 : }
2652 :
2653 2 : nvme_ctrlr->disabled = false;
2654 2 : nvme_ctrlr->resetting = true;
2655 :
2656 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2657 :
2658 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2659 :
2660 2 : spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr);
2661 2 : return 0;
2662 3 : }
2663 :
2664 : static void
2665 2 : bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr)
2666 : {
2667 2 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2668 2 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2669 2 : enum bdev_nvme_op_after_reset op_after_disable;
2670 :
2671 2 : assert(nvme_ctrlr->thread == spdk_get_thread());
2672 :
2673 2 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2674 2 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2675 :
2676 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2677 :
2678 2 : nvme_ctrlr->resetting = false;
2679 2 : nvme_ctrlr->dont_retry = false;
2680 :
2681 2 : op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true);
2682 :
2683 2 : nvme_ctrlr->disabled = true;
2684 2 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2685 :
2686 : /* Make sure we clear any pending resets before returning. */
2687 2 : bdev_nvme_complete_pending_resets(nvme_ctrlr, true);
2688 :
2689 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2690 :
2691 2 : if (ctrlr_op_cb_fn) {
2692 0 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0);
2693 0 : }
2694 :
2695 2 : switch (op_after_disable) {
2696 : case OP_COMPLETE_PENDING_DESTRUCT:
2697 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2698 0 : break;
2699 : default:
2700 2 : break;
2701 : }
2702 2 : }
2703 :
2704 : static void
2705 1 : bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2706 : {
2707 1 : assert(status == 0);
2708 :
2709 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2710 0 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2711 0 : } else {
2712 1 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete);
2713 : }
2714 1 : }
2715 :
2716 : static void
2717 1 : bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2718 : {
2719 1 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2720 : bdev_nvme_reset_destroy_qpair,
2721 : NULL,
2722 : bdev_nvme_disable_destroy_qpairs_done);
2723 1 : }
2724 :
2725 : static void
2726 1 : _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx)
2727 : {
2728 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2729 :
2730 1 : assert(nvme_ctrlr->resetting == true);
2731 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2732 :
2733 1 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2734 :
2735 1 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2736 1 : }
2737 :
2738 : static void
2739 1 : _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx)
2740 : {
2741 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2742 :
2743 1 : assert(nvme_ctrlr->resetting == true);
2744 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2745 :
2746 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2747 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs);
2748 0 : } else {
2749 1 : bdev_nvme_disable_destroy_qpairs(nvme_ctrlr);
2750 : }
2751 1 : }
2752 :
2753 : static int
2754 5 : bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2755 : {
2756 5 : spdk_msg_fn msg_fn;
2757 :
2758 5 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2759 5 : if (nvme_ctrlr->destruct) {
2760 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2761 1 : return -ENXIO;
2762 : }
2763 :
2764 4 : if (nvme_ctrlr->resetting) {
2765 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2766 1 : return -EBUSY;
2767 : }
2768 :
2769 3 : if (nvme_ctrlr->disabled) {
2770 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2771 1 : return -EALREADY;
2772 : }
2773 :
2774 2 : nvme_ctrlr->resetting = true;
2775 2 : nvme_ctrlr->dont_retry = true;
2776 :
2777 2 : if (nvme_ctrlr->reconnect_is_delayed) {
2778 1 : msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr;
2779 1 : nvme_ctrlr->reconnect_is_delayed = false;
2780 1 : } else {
2781 1 : msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr;
2782 : }
2783 :
2784 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2785 :
2786 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2787 :
2788 2 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2789 2 : return 0;
2790 5 : }
2791 :
2792 : static int
2793 6 : nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2794 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2795 : {
2796 6 : int rc;
2797 :
2798 6 : switch (op) {
2799 : case NVME_CTRLR_OP_RESET:
2800 5 : rc = bdev_nvme_reset_ctrlr(nvme_ctrlr);
2801 5 : break;
2802 : case NVME_CTRLR_OP_ENABLE:
2803 0 : rc = bdev_nvme_enable_ctrlr(nvme_ctrlr);
2804 0 : break;
2805 : case NVME_CTRLR_OP_DISABLE:
2806 0 : rc = bdev_nvme_disable_ctrlr(nvme_ctrlr);
2807 0 : break;
2808 : default:
2809 1 : rc = -EINVAL;
2810 1 : break;
2811 : }
2812 :
2813 6 : if (rc == 0) {
2814 3 : assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL);
2815 3 : assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL);
2816 3 : nvme_ctrlr->ctrlr_op_cb_fn = cb_fn;
2817 3 : nvme_ctrlr->ctrlr_op_cb_arg = cb_arg;
2818 3 : }
2819 12 : return rc;
2820 6 : }
2821 :
2822 : struct nvme_ctrlr_op_rpc_ctx {
2823 : struct nvme_ctrlr *nvme_ctrlr;
2824 : struct spdk_thread *orig_thread;
2825 : enum nvme_ctrlr_op op;
2826 : int rc;
2827 : bdev_nvme_ctrlr_op_cb cb_fn;
2828 : void *cb_arg;
2829 : };
2830 :
2831 : static void
2832 4 : _nvme_ctrlr_op_rpc_complete(void *_ctx)
2833 : {
2834 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2835 :
2836 4 : assert(ctx != NULL);
2837 4 : assert(ctx->cb_fn != NULL);
2838 :
2839 4 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2840 :
2841 4 : free(ctx);
2842 4 : }
2843 :
2844 : static void
2845 4 : nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc)
2846 : {
2847 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2848 :
2849 4 : ctx->rc = rc;
2850 :
2851 4 : spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx);
2852 4 : }
2853 :
2854 : void
2855 4 : nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2856 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2857 : {
2858 4 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2859 4 : int rc;
2860 :
2861 4 : assert(cb_fn != NULL);
2862 :
2863 4 : ctx = calloc(1, sizeof(*ctx));
2864 4 : if (ctx == NULL) {
2865 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2866 0 : cb_fn(cb_arg, -ENOMEM);
2867 0 : return;
2868 : }
2869 :
2870 4 : ctx->orig_thread = spdk_get_thread();
2871 4 : ctx->cb_fn = cb_fn;
2872 4 : ctx->cb_arg = cb_arg;
2873 :
2874 4 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx);
2875 4 : if (rc == 0) {
2876 1 : return;
2877 3 : } else if (rc == -EALREADY) {
2878 0 : rc = 0;
2879 0 : }
2880 :
2881 3 : nvme_ctrlr_op_rpc_complete(ctx, rc);
2882 4 : }
2883 :
2884 : static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc);
2885 :
2886 : static void
2887 2 : _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx)
2888 : {
2889 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2890 2 : struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr;
2891 2 : int rc;
2892 :
2893 2 : prev_nvme_ctrlr = ctx->nvme_ctrlr;
2894 2 : ctx->nvme_ctrlr = NULL;
2895 :
2896 2 : if (ctx->rc != 0) {
2897 0 : goto complete;
2898 : }
2899 :
2900 2 : next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq);
2901 2 : if (next_nvme_ctrlr == NULL) {
2902 1 : goto complete;
2903 : }
2904 :
2905 1 : rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2906 1 : if (rc == 0) {
2907 1 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2908 1 : return;
2909 0 : } else if (rc == -EALREADY) {
2910 0 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2911 0 : rc = 0;
2912 0 : }
2913 :
2914 0 : ctx->rc = rc;
2915 :
2916 : complete:
2917 1 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2918 1 : free(ctx);
2919 2 : }
2920 :
2921 : static void
2922 2 : nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc)
2923 : {
2924 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2925 :
2926 2 : ctx->rc = rc;
2927 :
2928 2 : spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx);
2929 2 : }
2930 :
2931 : void
2932 1 : nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op,
2933 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2934 : {
2935 1 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2936 1 : struct nvme_ctrlr *nvme_ctrlr;
2937 1 : int rc;
2938 :
2939 1 : assert(cb_fn != NULL);
2940 :
2941 1 : ctx = calloc(1, sizeof(*ctx));
2942 1 : if (ctx == NULL) {
2943 0 : SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2944 0 : cb_fn(cb_arg, -ENOMEM);
2945 0 : return;
2946 : }
2947 :
2948 1 : ctx->orig_thread = spdk_get_thread();
2949 1 : ctx->op = op;
2950 1 : ctx->cb_fn = cb_fn;
2951 1 : ctx->cb_arg = cb_arg;
2952 :
2953 1 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
2954 1 : assert(nvme_ctrlr != NULL);
2955 :
2956 1 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2957 1 : if (rc == 0) {
2958 1 : ctx->nvme_ctrlr = nvme_ctrlr;
2959 1 : return;
2960 0 : } else if (rc == -EALREADY) {
2961 0 : ctx->nvme_ctrlr = nvme_ctrlr;
2962 0 : rc = 0;
2963 0 : }
2964 :
2965 0 : nvme_bdev_ctrlr_op_rpc_continue(ctx, rc);
2966 1 : }
2967 :
2968 : static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
2969 :
2970 : static void
2971 16 : bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
2972 : {
2973 16 : struct nvme_bdev_io *bio = ctx;
2974 16 : enum spdk_bdev_io_status io_status;
2975 :
2976 16 : if (bio->cpl.cdw0 == 0) {
2977 12 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
2978 12 : } else {
2979 4 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
2980 : }
2981 :
2982 16 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p completed, status:%d\n", bio, io_status);
2983 :
2984 16 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL);
2985 16 : }
2986 :
2987 : static void
2988 32 : bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i,
2989 : struct nvme_bdev *nbdev,
2990 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
2991 : {
2992 32 : bdev_nvme_abort_retry_ios(nbdev_ch);
2993 32 : nbdev_ch->resetting = false;
2994 :
2995 32 : nvme_bdev_for_each_channel_continue(i, 0);
2996 32 : }
2997 :
2998 : static void
2999 16 : bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
3000 : {
3001 16 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3002 16 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3003 :
3004 : /* Abort all queued I/Os for retry. */
3005 32 : nvme_bdev_for_each_channel(nbdev,
3006 : bdev_nvme_unfreeze_bdev_channel,
3007 16 : bio,
3008 : bdev_nvme_unfreeze_bdev_channel_done);
3009 16 : }
3010 :
3011 : static void
3012 26 : _bdev_nvme_reset_io_continue(void *ctx)
3013 : {
3014 26 : struct nvme_bdev_io *bio = ctx;
3015 26 : struct nvme_io_path *prev_io_path, *next_io_path;
3016 26 : int rc;
3017 :
3018 26 : prev_io_path = bio->io_path;
3019 26 : bio->io_path = NULL;
3020 :
3021 26 : next_io_path = STAILQ_NEXT(prev_io_path, stailq);
3022 26 : if (next_io_path == NULL) {
3023 16 : goto complete;
3024 : }
3025 :
3026 10 : rc = _bdev_nvme_reset_io(next_io_path, bio);
3027 10 : if (rc == 0) {
3028 10 : return;
3029 : }
3030 :
3031 : complete:
3032 16 : bdev_nvme_reset_io_complete(bio);
3033 26 : }
3034 :
3035 : static void
3036 26 : bdev_nvme_reset_io_continue(void *cb_arg, int rc)
3037 : {
3038 26 : struct nvme_bdev_io *bio = cb_arg;
3039 26 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3040 26 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3041 :
3042 26 : NVME_BDEV_INFOLOG(nbdev, "continue reset_io %p, rc:%d\n", bio, rc);
3043 :
3044 : /* Reset status is initialized as "failed". Set to "success" once we have at least one
3045 : * successfully reset nvme_ctrlr.
3046 : */
3047 26 : if (rc == 0) {
3048 16 : bio->cpl.cdw0 = 0;
3049 16 : }
3050 :
3051 26 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio);
3052 26 : }
3053 :
3054 : static int
3055 26 : _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
3056 : {
3057 26 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3058 26 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3059 26 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
3060 26 : spdk_msg_fn msg_fn;
3061 26 : int rc;
3062 :
3063 26 : assert(bio->io_path == NULL);
3064 26 : bio->io_path = io_path;
3065 :
3066 26 : pthread_mutex_lock(&nvme_ctrlr->mutex);
3067 26 : rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn);
3068 26 : if (rc == -EBUSY) {
3069 : /*
3070 : * Reset call is queued only if it is from the app framework. This is on purpose so that
3071 : * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
3072 : * upper level. If they are in the middle of a reset, we won't try to schedule another one.
3073 : */
3074 12 : TAILQ_INSERT_TAIL(&nvme_ctrlr->pending_resets, bio, retry_link);
3075 12 : }
3076 26 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
3077 :
3078 26 : if (rc == 0) {
3079 13 : assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL);
3080 13 : assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL);
3081 13 : nvme_ctrlr->ctrlr_op_cb_fn = bdev_nvme_reset_io_continue;
3082 13 : nvme_ctrlr->ctrlr_op_cb_arg = bio;
3083 :
3084 13 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
3085 :
3086 13 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p started resetting ctrlr [%s, %u].\n",
3087 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr));
3088 26 : } else if (rc == -EBUSY) {
3089 12 : rc = 0;
3090 :
3091 12 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p was queued to ctrlr [%s, %u].\n",
3092 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr));
3093 12 : } else {
3094 1 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p could not reset ctrlr [%s, %u], rc:%d\n",
3095 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr), rc);
3096 : }
3097 :
3098 52 : return rc;
3099 26 : }
3100 :
3101 : static void
3102 16 : bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
3103 : {
3104 16 : struct nvme_bdev_io *bio = ctx;
3105 16 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3106 16 : struct nvme_bdev_channel *nbdev_ch;
3107 16 : struct nvme_io_path *io_path;
3108 16 : int rc;
3109 :
3110 16 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
3111 :
3112 : /* Initialize with failed status. With multipath it is enough to have at least one successful
3113 : * nvme_ctrlr reset. If there is none, reset status will remain failed.
3114 : */
3115 16 : bio->cpl.cdw0 = 1;
3116 :
3117 : /* Reset all nvme_ctrlrs of a bdev controller sequentially. */
3118 16 : io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
3119 16 : assert(io_path != NULL);
3120 :
3121 16 : rc = _bdev_nvme_reset_io(io_path, bio);
3122 16 : if (rc != 0) {
3123 : /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */
3124 1 : rc = (rc == -EALREADY) ? 0 : rc;
3125 :
3126 1 : bdev_nvme_reset_io_continue(bio, rc);
3127 1 : }
3128 16 : }
3129 :
3130 : static void
3131 30 : bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i,
3132 : struct nvme_bdev *nbdev,
3133 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
3134 : {
3135 30 : nbdev_ch->resetting = true;
3136 :
3137 30 : nvme_bdev_for_each_channel_continue(i, 0);
3138 30 : }
3139 :
3140 : static void
3141 15 : bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio)
3142 : {
3143 15 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p started.\n", bio);
3144 :
3145 30 : nvme_bdev_for_each_channel(nbdev,
3146 : bdev_nvme_freeze_bdev_channel,
3147 15 : bio,
3148 : bdev_nvme_freeze_bdev_channel_done);
3149 15 : }
3150 :
3151 : static int
3152 32 : bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove)
3153 : {
3154 32 : if (nvme_ctrlr->destruct) {
3155 : /* Don't bother resetting if the controller is in the process of being destructed. */
3156 2 : return -ENXIO;
3157 : }
3158 :
3159 30 : if (nvme_ctrlr->resetting) {
3160 3 : if (!nvme_ctrlr->in_failover) {
3161 3 : NVME_CTRLR_NOTICELOG(nvme_ctrlr,
3162 : "Reset is already in progress. Defer failover until reset completes.\n");
3163 :
3164 : /* Defer failover until reset completes. */
3165 3 : nvme_ctrlr->pending_failover = true;
3166 3 : return -EINPROGRESS;
3167 : } else {
3168 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform failover, already in progress.\n");
3169 0 : return -EBUSY;
3170 : }
3171 : }
3172 :
3173 27 : bdev_nvme_failover_trid(nvme_ctrlr, remove, true);
3174 :
3175 27 : if (nvme_ctrlr->reconnect_is_delayed) {
3176 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Reconnect is already scheduled.\n");
3177 :
3178 : /* We rely on the next reconnect for the failover. */
3179 1 : return -EALREADY;
3180 : }
3181 :
3182 26 : if (nvme_ctrlr->disabled) {
3183 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Controller is disabled.\n");
3184 :
3185 : /* We rely on the enablement for the failover. */
3186 0 : return -EALREADY;
3187 : }
3188 :
3189 26 : nvme_ctrlr->resetting = true;
3190 26 : nvme_ctrlr->in_failover = true;
3191 :
3192 26 : assert(nvme_ctrlr->reset_start_tsc == 0);
3193 26 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
3194 :
3195 26 : return 0;
3196 32 : }
3197 :
3198 : static int
3199 30 : bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
3200 : {
3201 30 : int rc;
3202 :
3203 30 : pthread_mutex_lock(&nvme_ctrlr->mutex);
3204 30 : rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false);
3205 30 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
3206 :
3207 30 : if (rc == 0) {
3208 25 : spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr);
3209 30 : } else if (rc == -EALREADY) {
3210 0 : rc = 0;
3211 0 : }
3212 :
3213 60 : return rc;
3214 30 : }
3215 :
3216 : static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3217 : uint64_t num_blocks);
3218 :
3219 : static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3220 : uint64_t num_blocks);
3221 :
3222 : static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks,
3223 : uint64_t src_offset_blocks,
3224 : uint64_t num_blocks);
3225 :
3226 : static void
3227 1 : bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3228 : bool success)
3229 : {
3230 1 : struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3231 1 : int ret;
3232 :
3233 1 : if (!success) {
3234 0 : ret = -EINVAL;
3235 0 : goto exit;
3236 : }
3237 :
3238 1 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
3239 0 : ret = -ENXIO;
3240 0 : goto exit;
3241 : }
3242 :
3243 2 : ret = bdev_nvme_readv(bio,
3244 1 : bdev_io->u.bdev.iovs,
3245 1 : bdev_io->u.bdev.iovcnt,
3246 1 : bdev_io->u.bdev.md_buf,
3247 1 : bdev_io->u.bdev.num_blocks,
3248 1 : bdev_io->u.bdev.offset_blocks,
3249 1 : bdev_io->u.bdev.dif_check_flags,
3250 1 : bdev_io->u.bdev.memory_domain,
3251 1 : bdev_io->u.bdev.memory_domain_ctx,
3252 1 : bdev_io->u.bdev.accel_sequence);
3253 :
3254 : exit:
3255 1 : if (spdk_unlikely(ret != 0)) {
3256 0 : bdev_nvme_io_complete(bio, ret);
3257 0 : }
3258 1 : }
3259 :
3260 : static inline void
3261 59 : _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
3262 : {
3263 59 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3264 59 : struct spdk_bdev *bdev = bdev_io->bdev;
3265 59 : struct nvme_bdev_io *nbdev_io_to_abort;
3266 59 : int rc = 0;
3267 :
3268 59 : switch (bdev_io->type) {
3269 : case SPDK_BDEV_IO_TYPE_READ:
3270 3 : if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
3271 :
3272 4 : rc = bdev_nvme_readv(nbdev_io,
3273 2 : bdev_io->u.bdev.iovs,
3274 2 : bdev_io->u.bdev.iovcnt,
3275 2 : bdev_io->u.bdev.md_buf,
3276 2 : bdev_io->u.bdev.num_blocks,
3277 2 : bdev_io->u.bdev.offset_blocks,
3278 2 : bdev_io->u.bdev.dif_check_flags,
3279 2 : bdev_io->u.bdev.memory_domain,
3280 2 : bdev_io->u.bdev.memory_domain_ctx,
3281 2 : bdev_io->u.bdev.accel_sequence);
3282 2 : } else {
3283 2 : spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
3284 1 : bdev_io->u.bdev.num_blocks * bdev->blocklen);
3285 1 : rc = 0;
3286 : }
3287 3 : break;
3288 : case SPDK_BDEV_IO_TYPE_WRITE:
3289 50 : rc = bdev_nvme_writev(nbdev_io,
3290 25 : bdev_io->u.bdev.iovs,
3291 25 : bdev_io->u.bdev.iovcnt,
3292 25 : bdev_io->u.bdev.md_buf,
3293 25 : bdev_io->u.bdev.num_blocks,
3294 25 : bdev_io->u.bdev.offset_blocks,
3295 25 : bdev_io->u.bdev.dif_check_flags,
3296 25 : bdev_io->u.bdev.memory_domain,
3297 25 : bdev_io->u.bdev.memory_domain_ctx,
3298 25 : bdev_io->u.bdev.accel_sequence,
3299 25 : bdev_io->u.bdev.nvme_cdw12,
3300 25 : bdev_io->u.bdev.nvme_cdw13);
3301 25 : break;
3302 : case SPDK_BDEV_IO_TYPE_COMPARE:
3303 2 : rc = bdev_nvme_comparev(nbdev_io,
3304 1 : bdev_io->u.bdev.iovs,
3305 1 : bdev_io->u.bdev.iovcnt,
3306 1 : bdev_io->u.bdev.md_buf,
3307 1 : bdev_io->u.bdev.num_blocks,
3308 1 : bdev_io->u.bdev.offset_blocks,
3309 1 : bdev_io->u.bdev.dif_check_flags);
3310 1 : break;
3311 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3312 4 : rc = bdev_nvme_comparev_and_writev(nbdev_io,
3313 2 : bdev_io->u.bdev.iovs,
3314 2 : bdev_io->u.bdev.iovcnt,
3315 2 : bdev_io->u.bdev.fused_iovs,
3316 2 : bdev_io->u.bdev.fused_iovcnt,
3317 2 : bdev_io->u.bdev.md_buf,
3318 2 : bdev_io->u.bdev.num_blocks,
3319 2 : bdev_io->u.bdev.offset_blocks,
3320 2 : bdev_io->u.bdev.dif_check_flags);
3321 2 : break;
3322 : case SPDK_BDEV_IO_TYPE_UNMAP:
3323 2 : rc = bdev_nvme_unmap(nbdev_io,
3324 1 : bdev_io->u.bdev.offset_blocks,
3325 1 : bdev_io->u.bdev.num_blocks);
3326 1 : break;
3327 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3328 0 : rc = bdev_nvme_write_zeroes(nbdev_io,
3329 0 : bdev_io->u.bdev.offset_blocks,
3330 0 : bdev_io->u.bdev.num_blocks);
3331 0 : break;
3332 : case SPDK_BDEV_IO_TYPE_RESET:
3333 15 : nbdev_io->io_path = NULL;
3334 15 : bdev_nvme_reset_io(bdev->ctxt, nbdev_io);
3335 15 : return;
3336 :
3337 : case SPDK_BDEV_IO_TYPE_FLUSH:
3338 1 : bdev_nvme_io_complete(nbdev_io, 0);
3339 1 : return;
3340 :
3341 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
3342 0 : rc = bdev_nvme_zone_appendv(nbdev_io,
3343 0 : bdev_io->u.bdev.iovs,
3344 0 : bdev_io->u.bdev.iovcnt,
3345 0 : bdev_io->u.bdev.md_buf,
3346 0 : bdev_io->u.bdev.num_blocks,
3347 0 : bdev_io->u.bdev.offset_blocks,
3348 0 : bdev_io->u.bdev.dif_check_flags);
3349 0 : break;
3350 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3351 0 : rc = bdev_nvme_get_zone_info(nbdev_io,
3352 0 : bdev_io->u.zone_mgmt.zone_id,
3353 0 : bdev_io->u.zone_mgmt.num_zones,
3354 0 : bdev_io->u.zone_mgmt.buf);
3355 0 : break;
3356 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3357 0 : rc = bdev_nvme_zone_management(nbdev_io,
3358 0 : bdev_io->u.zone_mgmt.zone_id,
3359 0 : bdev_io->u.zone_mgmt.zone_action);
3360 0 : break;
3361 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3362 5 : nbdev_io->io_path = NULL;
3363 10 : bdev_nvme_admin_passthru(nbdev_ch,
3364 5 : nbdev_io,
3365 5 : &bdev_io->u.nvme_passthru.cmd,
3366 5 : bdev_io->u.nvme_passthru.buf,
3367 5 : bdev_io->u.nvme_passthru.nbytes);
3368 5 : return;
3369 :
3370 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3371 0 : rc = bdev_nvme_io_passthru(nbdev_io,
3372 0 : &bdev_io->u.nvme_passthru.cmd,
3373 0 : bdev_io->u.nvme_passthru.buf,
3374 0 : bdev_io->u.nvme_passthru.nbytes);
3375 0 : break;
3376 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3377 0 : rc = bdev_nvme_io_passthru_md(nbdev_io,
3378 0 : &bdev_io->u.nvme_passthru.cmd,
3379 0 : bdev_io->u.nvme_passthru.buf,
3380 0 : bdev_io->u.nvme_passthru.nbytes,
3381 0 : bdev_io->u.nvme_passthru.md_buf,
3382 0 : bdev_io->u.nvme_passthru.md_len);
3383 0 : break;
3384 : case SPDK_BDEV_IO_TYPE_NVME_IOV_MD:
3385 0 : rc = bdev_nvme_iov_passthru_md(nbdev_io,
3386 0 : &bdev_io->u.nvme_passthru.cmd,
3387 0 : bdev_io->u.nvme_passthru.iovs,
3388 0 : bdev_io->u.nvme_passthru.iovcnt,
3389 0 : bdev_io->u.nvme_passthru.nbytes,
3390 0 : bdev_io->u.nvme_passthru.md_buf,
3391 0 : bdev_io->u.nvme_passthru.md_len);
3392 0 : break;
3393 : case SPDK_BDEV_IO_TYPE_ABORT:
3394 6 : nbdev_io->io_path = NULL;
3395 6 : nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
3396 12 : bdev_nvme_abort(nbdev_ch,
3397 6 : nbdev_io,
3398 6 : nbdev_io_to_abort);
3399 6 : return;
3400 :
3401 : case SPDK_BDEV_IO_TYPE_COPY:
3402 0 : rc = bdev_nvme_copy(nbdev_io,
3403 0 : bdev_io->u.bdev.offset_blocks,
3404 0 : bdev_io->u.bdev.copy.src_offset_blocks,
3405 0 : bdev_io->u.bdev.num_blocks);
3406 0 : break;
3407 : default:
3408 0 : rc = -EINVAL;
3409 0 : break;
3410 : }
3411 :
3412 32 : if (spdk_unlikely(rc != 0)) {
3413 0 : bdev_nvme_io_complete(nbdev_io, rc);
3414 0 : }
3415 59 : }
3416 :
3417 : static void
3418 68 : bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
3419 : {
3420 68 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3421 68 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3422 :
3423 68 : if (spdk_likely(nbdev_io->submit_tsc == 0)) {
3424 68 : nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
3425 68 : } else {
3426 : /* There are cases where submit_tsc != 0, i.e. retry I/O.
3427 : * We need to update submit_tsc here.
3428 : */
3429 0 : nbdev_io->submit_tsc = spdk_get_ticks();
3430 : }
3431 :
3432 68 : spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
3433 68 : nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
3434 68 : if (spdk_unlikely(!nbdev_io->io_path)) {
3435 13 : if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
3436 12 : bdev_nvme_io_complete(nbdev_io, -ENXIO);
3437 12 : return;
3438 : }
3439 :
3440 : /* Admin commands do not use the optimal I/O path.
3441 : * Simply fall through even if it is not found.
3442 : */
3443 1 : }
3444 :
3445 56 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
3446 68 : }
3447 :
3448 : static bool
3449 0 : bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi)
3450 : {
3451 0 : switch (csi) {
3452 : case SPDK_NVME_CSI_NVM:
3453 0 : return true;
3454 : case SPDK_NVME_CSI_ZNS:
3455 0 : return true;
3456 : default:
3457 0 : return false;
3458 : }
3459 0 : }
3460 :
3461 : static bool
3462 0 : bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
3463 : {
3464 0 : struct nvme_bdev *nbdev = ctx;
3465 0 : struct nvme_ns *nvme_ns;
3466 0 : struct spdk_nvme_ns *ns;
3467 0 : struct spdk_nvme_ctrlr *ctrlr;
3468 0 : const struct spdk_nvme_ctrlr_data *cdata;
3469 :
3470 0 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
3471 0 : assert(nvme_ns != NULL);
3472 0 : ns = nvme_ns->ns;
3473 0 : if (ns == NULL) {
3474 0 : return false;
3475 : }
3476 :
3477 0 : if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) {
3478 0 : switch (io_type) {
3479 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3480 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3481 0 : return true;
3482 :
3483 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3484 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3485 :
3486 : default:
3487 0 : return false;
3488 : }
3489 : }
3490 :
3491 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3492 :
3493 0 : switch (io_type) {
3494 : case SPDK_BDEV_IO_TYPE_READ:
3495 : case SPDK_BDEV_IO_TYPE_WRITE:
3496 : case SPDK_BDEV_IO_TYPE_RESET:
3497 : case SPDK_BDEV_IO_TYPE_FLUSH:
3498 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3499 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3500 : case SPDK_BDEV_IO_TYPE_ABORT:
3501 0 : return true;
3502 :
3503 : case SPDK_BDEV_IO_TYPE_COMPARE:
3504 0 : return spdk_nvme_ns_supports_compare(ns);
3505 :
3506 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3507 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3508 :
3509 : case SPDK_BDEV_IO_TYPE_UNMAP:
3510 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3511 0 : return cdata->oncs.dsm;
3512 :
3513 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3514 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3515 0 : return cdata->oncs.write_zeroes;
3516 :
3517 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3518 0 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3519 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
3520 0 : return true;
3521 : }
3522 0 : return false;
3523 :
3524 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3525 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3526 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
3527 :
3528 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
3529 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
3530 0 : spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
3531 :
3532 : case SPDK_BDEV_IO_TYPE_COPY:
3533 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3534 0 : return cdata->oncs.copy;
3535 :
3536 : default:
3537 0 : return false;
3538 : }
3539 0 : }
3540 :
3541 : static int
3542 61 : nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch)
3543 : {
3544 61 : struct nvme_qpair *nvme_qpair;
3545 61 : struct spdk_io_channel *pg_ch;
3546 61 : int rc;
3547 :
3548 61 : nvme_qpair = calloc(1, sizeof(*nvme_qpair));
3549 61 : if (!nvme_qpair) {
3550 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to alloc nvme_qpair.\n");
3551 0 : return -1;
3552 : }
3553 :
3554 61 : TAILQ_INIT(&nvme_qpair->io_path_list);
3555 :
3556 61 : nvme_qpair->ctrlr = nvme_ctrlr;
3557 61 : nvme_qpair->ctrlr_ch = ctrlr_ch;
3558 :
3559 61 : pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
3560 61 : if (!pg_ch) {
3561 0 : free(nvme_qpair);
3562 0 : return -1;
3563 : }
3564 :
3565 61 : nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch);
3566 :
3567 : #ifdef SPDK_CONFIG_VTUNE
3568 : nvme_qpair->group->collect_spin_stat = true;
3569 : #else
3570 61 : nvme_qpair->group->collect_spin_stat = false;
3571 : #endif
3572 :
3573 61 : if (!nvme_ctrlr->disabled) {
3574 : /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will
3575 : * be created when it's enabled.
3576 : */
3577 61 : rc = bdev_nvme_create_qpair(nvme_qpair);
3578 61 : if (rc != 0) {
3579 : /* nvme_ctrlr can't create IO qpair if connection is down.
3580 : * If reconnect_delay_sec is non-zero, creating IO qpair is retried
3581 : * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero,
3582 : * submitted IO will be queued until IO qpair is successfully created.
3583 : *
3584 : * Hence, if both are satisfied, ignore the failure.
3585 : */
3586 0 : if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) {
3587 0 : spdk_put_io_channel(pg_ch);
3588 0 : free(nvme_qpair);
3589 0 : return rc;
3590 : }
3591 0 : }
3592 61 : }
3593 :
3594 61 : TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3595 :
3596 61 : ctrlr_ch->qpair = nvme_qpair;
3597 :
3598 61 : nvme_ctrlr_get_ref(nvme_ctrlr);
3599 :
3600 61 : return 0;
3601 61 : }
3602 :
/* I/O channel create callback for an nvme_ctrlr io_device: io_device is the
 * nvme_ctrlr and ctx_buf is the nvme_ctrlr_channel to populate.
 */
static int
bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	return nvme_qpair_create(io_device, ctx_buf);
}
3611 :
3612 : static void
3613 61 : nvme_qpair_delete(struct nvme_qpair *nvme_qpair)
3614 : {
3615 61 : struct nvme_io_path *io_path, *next;
3616 :
3617 61 : assert(nvme_qpair->group != NULL);
3618 :
3619 100 : TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) {
3620 39 : TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);
3621 39 : nvme_io_path_free(io_path);
3622 39 : }
3623 :
3624 61 : TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3625 :
3626 61 : spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group));
3627 :
3628 61 : nvme_ctrlr_put_ref(nvme_qpair->ctrlr);
3629 :
3630 61 : free(nvme_qpair);
3631 61 : }
3632 :
3633 : static void
3634 61 : bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
3635 : {
3636 61 : struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
3637 61 : struct nvme_qpair *nvme_qpair;
3638 :
3639 61 : nvme_qpair = ctrlr_ch->qpair;
3640 61 : assert(nvme_qpair != NULL);
3641 :
3642 61 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
3643 :
3644 61 : if (nvme_qpair->qpair != NULL) {
3645 : /* Always try to disconnect the qpair, even if a reset is in progress.
3646 : * The qpair may have been created after the reset process started.
3647 : */
3648 47 : spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair);
3649 47 : if (ctrlr_ch->reset_iter) {
3650 : /* Skip current ctrlr_channel in a full reset sequence because
3651 : * it is being deleted now.
3652 : */
3653 0 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
3654 0 : }
3655 :
3656 : /* We cannot release a reference to the poll group now.
3657 : * The qpair may be disconnected asynchronously later.
3658 : * We need to poll it until it is actually disconnected.
3659 : * Just detach the qpair from the deleting ctrlr_channel.
3660 : */
3661 47 : nvme_qpair->ctrlr_ch = NULL;
3662 47 : } else {
3663 14 : assert(ctrlr_ch->reset_iter == NULL);
3664 :
3665 14 : nvme_qpair_delete(nvme_qpair);
3666 : }
3667 61 : }
3668 :
3669 : static inline struct spdk_io_channel *
3670 0 : bdev_nvme_get_accel_channel(struct nvme_poll_group *group)
3671 : {
3672 0 : if (spdk_unlikely(!group->accel_channel)) {
3673 0 : group->accel_channel = spdk_accel_get_io_channel();
3674 0 : if (!group->accel_channel) {
3675 0 : SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
3676 : group);
3677 0 : return NULL;
3678 : }
3679 0 : }
3680 :
3681 0 : return group->accel_channel;
3682 0 : }
3683 :
3684 : static void
3685 0 : bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
3686 : {
3687 0 : spdk_accel_sequence_finish(seq, cb_fn, cb_arg);
3688 0 : }
3689 :
/* abort_sequence entry of the NVMe accel function table: forwards to
 * spdk_accel_sequence_abort().
 */
static void
bdev_nvme_abort_sequence(void *seq)
{
	spdk_accel_sequence_abort(seq);
}
3695 :
/* reverse_sequence entry of the NVMe accel function table: forwards to
 * spdk_accel_sequence_reverse().
 */
static void
bdev_nvme_reverse_sequence(void *seq)
{
	spdk_accel_sequence_reverse(seq);
}
3701 :
3702 : static int
3703 0 : bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt,
3704 : struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed,
3705 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3706 : {
3707 0 : struct spdk_io_channel *ch;
3708 0 : struct nvme_poll_group *group = ctx;
3709 :
3710 0 : ch = bdev_nvme_get_accel_channel(group);
3711 0 : if (spdk_unlikely(ch == NULL)) {
3712 0 : return -ENOMEM;
3713 : }
3714 :
3715 0 : return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt,
3716 0 : domain, domain_ctx, seed, cb_fn, cb_arg);
3717 0 : }
3718 :
3719 : static int
3720 0 : bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt,
3721 : struct spdk_memory_domain *dst_domain, void *dst_domain_ctx,
3722 : struct iovec *src_iovs, uint32_t src_iovcnt,
3723 : struct spdk_memory_domain *src_domain, void *src_domain_ctx,
3724 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3725 : {
3726 0 : struct spdk_io_channel *ch;
3727 0 : struct nvme_poll_group *group = ctx;
3728 :
3729 0 : ch = bdev_nvme_get_accel_channel(group);
3730 0 : if (spdk_unlikely(ch == NULL)) {
3731 0 : return -ENOMEM;
3732 : }
3733 :
3734 0 : return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch,
3735 0 : dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx,
3736 0 : src_iovs, src_iovcnt, src_domain, src_domain_ctx,
3737 0 : cb_fn, cb_arg);
3738 0 : }
3739 :
3740 : static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
3741 : .table_size = sizeof(struct spdk_nvme_accel_fn_table),
3742 : .append_crc32c = bdev_nvme_append_crc32c,
3743 : .append_copy = bdev_nvme_append_copy,
3744 : .finish_sequence = bdev_nvme_finish_sequence,
3745 : .reverse_sequence = bdev_nvme_reverse_sequence,
3746 : .abort_sequence = bdev_nvme_abort_sequence,
3747 : };
3748 :
3749 : static int
3750 0 : bdev_nvme_interrupt_wrapper(void *ctx)
3751 : {
3752 0 : int num_events;
3753 0 : struct nvme_poll_group *group = ctx;
3754 :
3755 0 : num_events = spdk_nvme_poll_group_wait(group->group, bdev_nvme_disconnected_qpair_cb);
3756 0 : if (spdk_unlikely(num_events < 0)) {
3757 0 : bdev_nvme_check_io_qpairs(group);
3758 0 : }
3759 :
3760 0 : return num_events;
3761 0 : }
3762 :
3763 : static int
3764 46 : bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
3765 : {
3766 46 : struct nvme_poll_group *group = ctx_buf;
3767 46 : uint64_t period;
3768 46 : int fd;
3769 :
3770 46 : TAILQ_INIT(&group->qpair_list);
3771 :
3772 46 : group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
3773 46 : if (group->group == NULL) {
3774 0 : return -1;
3775 : }
3776 :
3777 46 : period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_ioq_poll_period_us;
3778 46 : group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, period);
3779 :
3780 46 : if (group->poller == NULL) {
3781 0 : spdk_nvme_poll_group_destroy(group->group);
3782 0 : return -1;
3783 : }
3784 :
3785 46 : if (spdk_interrupt_mode_is_enabled()) {
3786 0 : spdk_poller_register_interrupt(group->poller, NULL, NULL);
3787 :
3788 0 : fd = spdk_nvme_poll_group_get_fd(group->group);
3789 0 : if (fd < 0) {
3790 0 : spdk_nvme_poll_group_destroy(group->group);
3791 0 : return -1;
3792 : }
3793 :
3794 0 : group->intr = SPDK_INTERRUPT_REGISTER(fd, bdev_nvme_interrupt_wrapper, group);
3795 0 : if (!group->intr) {
3796 0 : spdk_nvme_poll_group_destroy(group->group);
3797 0 : return -1;
3798 : }
3799 0 : }
3800 :
3801 46 : return 0;
3802 46 : }
3803 :
3804 : static void
3805 46 : bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
3806 : {
3807 46 : struct nvme_poll_group *group = ctx_buf;
3808 :
3809 46 : assert(TAILQ_EMPTY(&group->qpair_list));
3810 :
3811 46 : if (group->accel_channel) {
3812 0 : spdk_put_io_channel(group->accel_channel);
3813 0 : }
3814 :
3815 46 : if (spdk_interrupt_mode_is_enabled()) {
3816 0 : spdk_interrupt_unregister(&group->intr);
3817 0 : }
3818 :
3819 46 : spdk_poller_unregister(&group->poller);
3820 46 : if (spdk_nvme_poll_group_destroy(group->group)) {
3821 0 : SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
3822 0 : assert(false);
3823 : }
3824 46 : }
3825 :
/* Return an I/O channel for the nvme_bdev passed as ctx; the nvme_bdev pointer
 * itself is the io_device handed to spdk_get_io_channel().
 */
static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nbdev = ctx;

	return spdk_get_io_channel(nbdev);
}
3833 :
3834 : static void *
3835 0 : bdev_nvme_get_module_ctx(void *ctx)
3836 : {
3837 0 : struct nvme_bdev *nvme_bdev = ctx;
3838 0 : struct nvme_ns *nvme_ns;
3839 :
3840 0 : if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) {
3841 0 : return NULL;
3842 : }
3843 :
3844 0 : nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list);
3845 0 : if (!nvme_ns) {
3846 0 : return NULL;
3847 : }
3848 :
3849 0 : return nvme_ns->ns;
3850 0 : }
3851 :
3852 : static const char *
3853 0 : _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
3854 : {
3855 0 : switch (ana_state) {
3856 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
3857 0 : return "optimized";
3858 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
3859 0 : return "non_optimized";
3860 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
3861 0 : return "inaccessible";
3862 : case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
3863 0 : return "persistent_loss";
3864 : case SPDK_NVME_ANA_CHANGE_STATE:
3865 0 : return "change";
3866 : default:
3867 0 : return NULL;
3868 : }
3869 0 : }
3870 :
3871 : static int
3872 8 : bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
3873 : {
3874 8 : struct spdk_memory_domain **_domains = NULL;
3875 8 : struct nvme_bdev *nbdev = ctx;
3876 8 : struct nvme_ns *nvme_ns;
3877 8 : int i = 0, _array_size = array_size;
3878 8 : int rc = 0;
3879 :
3880 22 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
3881 14 : if (domains && array_size >= i) {
3882 11 : _domains = &domains[i];
3883 11 : } else {
3884 3 : _domains = NULL;
3885 : }
3886 14 : rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size);
3887 14 : if (rc > 0) {
3888 13 : i += rc;
3889 13 : if (_array_size >= rc) {
3890 9 : _array_size -= rc;
3891 9 : } else {
3892 4 : _array_size = 0;
3893 : }
3894 14 : } else if (rc < 0) {
3895 0 : return rc;
3896 : }
3897 14 : }
3898 :
3899 8 : return i;
3900 8 : }
3901 :
3902 : static const char *
3903 0 : nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr)
3904 : {
3905 0 : if (nvme_ctrlr->destruct) {
3906 0 : return "deleting";
3907 0 : } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
3908 0 : return "failed";
3909 0 : } else if (nvme_ctrlr->resetting) {
3910 0 : return "resetting";
3911 0 : } else if (nvme_ctrlr->reconnect_is_delayed > 0) {
3912 0 : return "reconnect_is_delayed";
3913 0 : } else if (nvme_ctrlr->disabled) {
3914 0 : return "disabled";
3915 : } else {
3916 0 : return "enabled";
3917 : }
3918 0 : }
3919 :
3920 : void
3921 0 : nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr)
3922 : {
3923 0 : struct spdk_nvme_transport_id *trid;
3924 0 : const struct spdk_nvme_ctrlr_opts *opts;
3925 0 : const struct spdk_nvme_ctrlr_data *cdata;
3926 0 : struct nvme_path_id *path_id;
3927 0 : int32_t numa_id;
3928 :
3929 0 : spdk_json_write_object_begin(w);
3930 :
3931 0 : spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr));
3932 :
3933 : #ifdef SPDK_CONFIG_NVME_CUSE
3934 0 : size_t cuse_name_size = 128;
3935 0 : char cuse_name[cuse_name_size];
3936 :
3937 0 : int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size);
3938 0 : if (rc == 0) {
3939 0 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
3940 0 : }
3941 : #endif
3942 0 : trid = &nvme_ctrlr->active_path_id->trid;
3943 0 : spdk_json_write_named_object_begin(w, "trid");
3944 0 : nvme_bdev_dump_trid_json(trid, w);
3945 0 : spdk_json_write_object_end(w);
3946 :
3947 0 : path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link);
3948 0 : if (path_id != NULL) {
3949 0 : spdk_json_write_named_array_begin(w, "alternate_trids");
3950 0 : do {
3951 0 : trid = &path_id->trid;
3952 0 : spdk_json_write_object_begin(w);
3953 0 : nvme_bdev_dump_trid_json(trid, w);
3954 0 : spdk_json_write_object_end(w);
3955 :
3956 0 : path_id = TAILQ_NEXT(path_id, link);
3957 0 : } while (path_id != NULL);
3958 0 : spdk_json_write_array_end(w);
3959 0 : }
3960 :
3961 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
3962 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
3963 :
3964 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
3965 0 : spdk_json_write_named_object_begin(w, "host");
3966 0 : spdk_json_write_named_string(w, "nqn", opts->hostnqn);
3967 0 : spdk_json_write_named_string(w, "addr", opts->src_addr);
3968 0 : spdk_json_write_named_string(w, "svcid", opts->src_svcid);
3969 0 : spdk_json_write_object_end(w);
3970 :
3971 0 : numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr);
3972 0 : if (numa_id != SPDK_ENV_NUMA_ID_ANY) {
3973 0 : spdk_json_write_named_uint32(w, "numa_id", numa_id);
3974 0 : }
3975 0 : spdk_json_write_object_end(w);
3976 0 : }
3977 :
3978 : static void
3979 0 : nvme_namespace_info_json(struct spdk_json_write_ctx *w,
3980 : struct nvme_ns *nvme_ns)
3981 : {
3982 0 : struct spdk_nvme_ns *ns;
3983 0 : struct spdk_nvme_ctrlr *ctrlr;
3984 0 : const struct spdk_nvme_ctrlr_data *cdata;
3985 0 : const struct spdk_nvme_transport_id *trid;
3986 0 : union spdk_nvme_vs_register vs;
3987 0 : const struct spdk_nvme_ns_data *nsdata;
3988 0 : char buf[128];
3989 :
3990 0 : ns = nvme_ns->ns;
3991 0 : if (ns == NULL) {
3992 0 : return;
3993 : }
3994 :
3995 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3996 :
3997 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3998 0 : trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
3999 0 : vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
4000 :
4001 0 : spdk_json_write_object_begin(w);
4002 :
4003 0 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
4004 0 : spdk_json_write_named_string(w, "pci_address", trid->traddr);
4005 0 : }
4006 :
4007 0 : spdk_json_write_named_object_begin(w, "trid");
4008 :
4009 0 : nvme_bdev_dump_trid_json(trid, w);
4010 :
4011 0 : spdk_json_write_object_end(w);
4012 :
4013 : #ifdef SPDK_CONFIG_NVME_CUSE
4014 0 : size_t cuse_name_size = 128;
4015 0 : char cuse_name[cuse_name_size];
4016 :
4017 0 : int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
4018 : cuse_name, &cuse_name_size);
4019 0 : if (rc == 0) {
4020 0 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
4021 0 : }
4022 : #endif
4023 :
4024 0 : spdk_json_write_named_object_begin(w, "ctrlr_data");
4025 :
4026 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
4027 :
4028 0 : spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
4029 :
4030 0 : snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
4031 0 : spdk_str_trim(buf);
4032 0 : spdk_json_write_named_string(w, "model_number", buf);
4033 :
4034 0 : snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
4035 0 : spdk_str_trim(buf);
4036 0 : spdk_json_write_named_string(w, "serial_number", buf);
4037 :
4038 0 : snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
4039 0 : spdk_str_trim(buf);
4040 0 : spdk_json_write_named_string(w, "firmware_revision", buf);
4041 :
4042 0 : if (cdata->subnqn[0] != '\0') {
4043 0 : spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
4044 0 : }
4045 :
4046 0 : spdk_json_write_named_object_begin(w, "oacs");
4047 :
4048 0 : spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
4049 0 : spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
4050 0 : spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
4051 0 : spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
4052 :
4053 0 : spdk_json_write_object_end(w);
4054 :
4055 0 : spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr);
4056 0 : spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting);
4057 :
4058 0 : spdk_json_write_object_end(w);
4059 :
4060 0 : spdk_json_write_named_object_begin(w, "vs");
4061 :
4062 0 : spdk_json_write_name(w, "nvme_version");
4063 0 : if (vs.bits.ter) {
4064 0 : spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
4065 0 : } else {
4066 0 : spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
4067 : }
4068 :
4069 0 : spdk_json_write_object_end(w);
4070 :
4071 0 : nsdata = spdk_nvme_ns_get_data(ns);
4072 :
4073 0 : spdk_json_write_named_object_begin(w, "ns_data");
4074 :
4075 0 : spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
4076 :
4077 0 : if (cdata->cmic.ana_reporting) {
4078 0 : spdk_json_write_named_string(w, "ana_state",
4079 0 : _nvme_ana_state_str(nvme_ns->ana_state));
4080 0 : }
4081 :
4082 0 : spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share);
4083 :
4084 0 : spdk_json_write_object_end(w);
4085 :
4086 0 : if (cdata->oacs.security) {
4087 0 : spdk_json_write_named_object_begin(w, "security");
4088 :
4089 0 : spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
4090 :
4091 0 : spdk_json_write_object_end(w);
4092 0 : }
4093 :
4094 0 : spdk_json_write_object_end(w);
4095 0 : }
4096 :
4097 : static const char *
4098 0 : nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev)
4099 : {
4100 0 : switch (nbdev->mp_policy) {
4101 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
4102 0 : return "active_passive";
4103 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
4104 0 : return "active_active";
4105 : default:
4106 0 : assert(false);
4107 : return "invalid";
4108 : }
4109 0 : }
4110 :
4111 : static const char *
4112 0 : nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev)
4113 : {
4114 0 : switch (nbdev->mp_selector) {
4115 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
4116 0 : return "round_robin";
4117 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
4118 0 : return "queue_depth";
4119 : default:
4120 0 : assert(false);
4121 : return "invalid";
4122 : }
4123 0 : }
4124 :
4125 : static int
4126 0 : bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
4127 : {
4128 0 : struct nvme_bdev *nvme_bdev = ctx;
4129 0 : struct nvme_ns *nvme_ns;
4130 :
4131 0 : pthread_mutex_lock(&nvme_bdev->mutex);
4132 0 : spdk_json_write_named_array_begin(w, "nvme");
4133 0 : TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) {
4134 0 : nvme_namespace_info_json(w, nvme_ns);
4135 0 : }
4136 0 : spdk_json_write_array_end(w);
4137 0 : spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev));
4138 0 : if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
4139 0 : spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev));
4140 0 : if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
4141 0 : spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io);
4142 0 : }
4143 0 : }
4144 0 : pthread_mutex_unlock(&nvme_bdev->mutex);
4145 :
4146 0 : return 0;
4147 0 : }
4148 :
/*
 * bdev fn_table hook: write per-bdev configuration.
 *
 * Intentionally a no-op: no per-bdev configuration is needed.
 */
static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	(void)bdev;
	(void)w;
}
4154 :
4155 : static uint64_t
4156 0 : bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
4157 : {
4158 0 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
4159 0 : struct nvme_io_path *io_path;
4160 0 : struct nvme_poll_group *group;
4161 0 : uint64_t spin_time = 0;
4162 :
4163 0 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
4164 0 : group = io_path->qpair->group;
4165 :
4166 0 : if (!group || !group->collect_spin_stat) {
4167 0 : continue;
4168 : }
4169 :
4170 0 : if (group->end_ticks != 0) {
4171 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
4172 0 : group->end_ticks = 0;
4173 0 : }
4174 :
4175 0 : spin_time += group->spin_ticks;
4176 0 : group->start_ticks = 0;
4177 0 : group->spin_ticks = 0;
4178 0 : }
4179 :
4180 0 : return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
4181 0 : }
4182 :
4183 : static void
4184 0 : bdev_nvme_reset_device_stat(void *ctx)
4185 : {
4186 0 : struct nvme_bdev *nbdev = ctx;
4187 :
4188 0 : if (nbdev->err_stat != NULL) {
4189 0 : memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat));
4190 0 : }
4191 0 : }
4192 :
/*
 * Convert an NVMe status string into a JSON key: lowercase and delimited by
 * underscores.
 *
 * " - " sequences, remaining "-" characters and remaining spaces are each
 * replaced by a single "_", then the result is lowercased.
 *
 * \param dst Output buffer; treated as 256 bytes long.
 * \param src Status string to convert.
 */
static void
bdev_nvme_format_nvme_status(char *dst, const char *src)
{
	char scratch[256];

	/* Pass 1: " - " -> "_" */
	spdk_strcpy_replace(dst, 256, src, " - ", "_");
	/* Pass 2: "-" -> "_" */
	spdk_strcpy_replace(scratch, sizeof(scratch), dst, "-", "_");
	/* Pass 3: " " -> "_" */
	spdk_strcpy_replace(dst, 256, scratch, " ", "_");

	spdk_strlwr(dst);
}
4204 :
4205 : static void
4206 0 : bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w)
4207 : {
4208 0 : struct nvme_bdev *nbdev = ctx;
4209 0 : struct spdk_nvme_status status = {};
4210 0 : uint16_t sct, sc;
4211 0 : char status_json[256];
4212 0 : const char *status_str;
4213 :
4214 0 : if (nbdev->err_stat == NULL) {
4215 0 : return;
4216 : }
4217 :
4218 0 : spdk_json_write_named_object_begin(w, "nvme_error");
4219 :
4220 0 : spdk_json_write_named_object_begin(w, "status_type");
4221 0 : for (sct = 0; sct < 8; sct++) {
4222 0 : if (nbdev->err_stat->status_type[sct] == 0) {
4223 0 : continue;
4224 : }
4225 0 : status.sct = sct;
4226 :
4227 0 : status_str = spdk_nvme_cpl_get_status_type_string(&status);
4228 0 : assert(status_str != NULL);
4229 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4230 :
4231 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]);
4232 0 : }
4233 0 : spdk_json_write_object_end(w);
4234 :
4235 0 : spdk_json_write_named_object_begin(w, "status_code");
4236 0 : for (sct = 0; sct < 4; sct++) {
4237 0 : status.sct = sct;
4238 0 : for (sc = 0; sc < 256; sc++) {
4239 0 : if (nbdev->err_stat->status[sct][sc] == 0) {
4240 0 : continue;
4241 : }
4242 0 : status.sc = sc;
4243 :
4244 0 : status_str = spdk_nvme_cpl_get_status_string(&status);
4245 0 : assert(status_str != NULL);
4246 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4247 :
4248 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]);
4249 0 : }
4250 0 : }
4251 0 : spdk_json_write_object_end(w);
4252 :
4253 0 : spdk_json_write_object_end(w);
4254 0 : }
4255 :
4256 : static bool
4257 0 : bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
4258 : {
4259 0 : struct nvme_bdev *nbdev = ctx;
4260 0 : struct nvme_ns *nvme_ns;
4261 0 : struct spdk_nvme_ctrlr *ctrlr;
4262 :
4263 0 : if (!g_opts.allow_accel_sequence) {
4264 0 : return false;
4265 : }
4266 :
4267 0 : switch (type) {
4268 : case SPDK_BDEV_IO_TYPE_WRITE:
4269 : case SPDK_BDEV_IO_TYPE_READ:
4270 0 : break;
4271 : default:
4272 0 : return false;
4273 : }
4274 :
4275 0 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
4276 0 : assert(nvme_ns != NULL);
4277 :
4278 0 : ctrlr = nvme_ns->ctrlr->ctrlr;
4279 0 : assert(ctrlr != NULL);
4280 :
4281 0 : return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED;
4282 0 : }
4283 :
/*
 * bdev function table of the NVMe bdev module.  nvme_disk_create() installs
 * it as disk->fn_table on every bdev it sets up.
 */
4284 : static const struct spdk_bdev_fn_table nvmelib_fn_table = {
4285 : .destruct = bdev_nvme_destruct,
4286 : .submit_request = bdev_nvme_submit_request,
4287 : .io_type_supported = bdev_nvme_io_type_supported,
4288 : .get_io_channel = bdev_nvme_get_io_channel,
4289 : .dump_info_json = bdev_nvme_dump_info_json,
4290 : .write_config_json = bdev_nvme_write_config_json,
4291 : .get_spin_time = bdev_nvme_get_spin_time,
4292 : .get_module_ctx = bdev_nvme_get_module_ctx,
4293 : .get_memory_domains = bdev_nvme_get_memory_domains,
4294 : .accel_sequence_supported = bdev_nvme_accel_sequence_supported,
4295 : .reset_device_stat = bdev_nvme_reset_device_stat,
4296 : .dump_device_stat_json = bdev_nvme_dump_device_stat_json,
4297 : };
4298 :
/*
 * Callback invoked by bdev_nvme_parse_ana_log_page() once per ANA group
 * descriptor.  A non-zero return value stops the iteration and is passed back
 * to the caller of bdev_nvme_parse_ana_log_page().
 */
4299 : typedef int (*bdev_nvme_parse_ana_log_page_cb)(
4300 : const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
4301 :
4302 : static int
4303 42 : bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
4304 : bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
4305 : {
4306 42 : struct spdk_nvme_ana_group_descriptor *copied_desc;
4307 42 : uint8_t *orig_desc;
4308 42 : uint32_t i, desc_size, copy_len;
4309 42 : int rc = 0;
4310 :
4311 42 : if (nvme_ctrlr->ana_log_page == NULL) {
4312 0 : return -EINVAL;
4313 : }
4314 :
4315 42 : copied_desc = nvme_ctrlr->copied_ana_desc;
4316 :
4317 42 : orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
4318 42 : copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
4319 :
4320 72 : for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
4321 67 : memcpy(copied_desc, orig_desc, copy_len);
4322 :
4323 67 : rc = cb_fn(copied_desc, cb_arg);
4324 67 : if (rc != 0) {
4325 37 : break;
4326 : }
4327 :
4328 30 : desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
4329 30 : copied_desc->num_of_nsid * sizeof(uint32_t);
4330 30 : orig_desc += desc_size;
4331 30 : copy_len -= desc_size;
4332 30 : }
4333 :
4334 42 : return rc;
4335 42 : }
4336 :
4337 : static int
4338 5 : nvme_ns_ana_transition_timedout(void *ctx)
4339 : {
4340 5 : struct nvme_ns *nvme_ns = ctx;
4341 :
4342 5 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4343 5 : nvme_ns->ana_transition_timedout = true;
4344 :
4345 5 : return SPDK_POLLER_BUSY;
4346 5 : }
4347 :
4348 : static void
4349 46 : _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns,
4350 : const struct spdk_nvme_ana_group_descriptor *desc)
4351 : {
4352 46 : const struct spdk_nvme_ctrlr_data *cdata;
4353 :
4354 46 : nvme_ns->ana_group_id = desc->ana_group_id;
4355 46 : nvme_ns->ana_state = desc->ana_state;
4356 46 : nvme_ns->ana_state_updating = false;
4357 :
4358 46 : switch (nvme_ns->ana_state) {
4359 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
4360 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
4361 39 : nvme_ns->ana_transition_timedout = false;
4362 39 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4363 39 : break;
4364 :
4365 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
4366 : case SPDK_NVME_ANA_CHANGE_STATE:
4367 6 : if (nvme_ns->anatt_timer != NULL) {
4368 1 : break;
4369 : }
4370 :
4371 5 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
4372 5 : nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout,
4373 : nvme_ns,
4374 : cdata->anatt * SPDK_SEC_TO_USEC);
4375 5 : break;
4376 : default:
4377 1 : break;
4378 : }
4379 46 : }
4380 :
4381 : static int
4382 60 : nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
4383 : {
4384 60 : struct nvme_ns *nvme_ns = cb_arg;
4385 60 : uint32_t i;
4386 :
4387 60 : assert(nvme_ns->ns != NULL);
4388 :
4389 82 : for (i = 0; i < desc->num_of_nsid; i++) {
4390 59 : if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
4391 22 : continue;
4392 : }
4393 :
4394 37 : _nvme_ns_set_ana_state(nvme_ns, desc);
4395 37 : return 1;
4396 : }
4397 :
4398 23 : return 0;
4399 60 : }
4400 :
4401 : static int
4402 5 : nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid)
4403 : {
4404 5 : int rc = 0;
4405 5 : struct spdk_uuid new_uuid, namespace_uuid;
4406 5 : char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'};
4407 : /* This namespace UUID was generated using uuid_generate() method. */
4408 5 : const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"};
4409 5 : int size;
4410 :
4411 5 : assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN);
4412 :
4413 5 : spdk_uuid_set_null(&new_uuid);
4414 5 : spdk_uuid_set_null(&namespace_uuid);
4415 :
4416 5 : size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid);
4417 5 : if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) {
4418 0 : return -EINVAL;
4419 : }
4420 :
4421 5 : spdk_uuid_parse(&namespace_uuid, namespace_str);
4422 :
4423 5 : rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size);
4424 5 : if (rc == 0) {
4425 5 : memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid));
4426 5 : }
4427 :
4428 5 : return rc;
4429 5 : }
4430 :
4431 : static int
4432 39 : nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
4433 : struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
4434 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx)
4435 : {
4436 39 : const struct spdk_uuid *uuid;
4437 39 : const uint8_t *nguid;
4438 39 : const struct spdk_nvme_ctrlr_data *cdata;
4439 39 : const struct spdk_nvme_ns_data *nsdata;
4440 39 : const struct spdk_nvme_ctrlr_opts *opts;
4441 39 : enum spdk_nvme_csi csi;
4442 39 : uint32_t atomic_bs, phys_bs, bs;
4443 39 : char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'};
4444 39 : int rc;
4445 :
4446 39 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
4447 39 : csi = spdk_nvme_ns_get_csi(ns);
4448 39 : opts = spdk_nvme_ctrlr_get_opts(ctrlr);
4449 :
4450 39 : switch (csi) {
4451 : case SPDK_NVME_CSI_NVM:
4452 39 : disk->product_name = "NVMe disk";
4453 39 : break;
4454 : case SPDK_NVME_CSI_ZNS:
4455 0 : disk->product_name = "NVMe ZNS disk";
4456 0 : disk->zoned = true;
4457 0 : disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4458 0 : disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
4459 0 : spdk_nvme_ns_get_extended_sector_size(ns);
4460 0 : disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
4461 0 : disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
4462 0 : break;
4463 : default:
4464 0 : if (bdev_opts->allow_unrecognized_csi) {
4465 0 : disk->product_name = "NVMe Passthrough disk";
4466 0 : break;
4467 : }
4468 0 : SPDK_ERRLOG("unsupported CSI: %u\n", csi);
4469 0 : return -ENOTSUP;
4470 : }
4471 :
4472 39 : nguid = spdk_nvme_ns_get_nguid(ns);
4473 39 : if (!nguid) {
4474 39 : uuid = spdk_nvme_ns_get_uuid(ns);
4475 39 : if (uuid) {
4476 12 : disk->uuid = *uuid;
4477 39 : } else if (g_opts.generate_uuids) {
4478 0 : spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0');
4479 0 : rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid);
4480 0 : if (rc < 0) {
4481 0 : SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc));
4482 0 : return rc;
4483 : }
4484 0 : }
4485 39 : } else {
4486 0 : memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
4487 : }
4488 :
4489 39 : disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
4490 39 : if (!disk->name) {
4491 0 : return -ENOMEM;
4492 : }
4493 :
4494 39 : disk->write_cache = 0;
4495 39 : if (cdata->vwc.present) {
4496 : /* Enable if the Volatile Write Cache exists */
4497 0 : disk->write_cache = 1;
4498 0 : }
4499 39 : if (cdata->oncs.write_zeroes) {
4500 0 : disk->max_write_zeroes = UINT16_MAX + 1;
4501 0 : }
4502 39 : disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
4503 39 : disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
4504 39 : disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr);
4505 39 : disk->ctratt.raw = cdata->ctratt.raw;
4506 39 : disk->nsid = spdk_nvme_ns_get_id(ns);
4507 : /* NVMe driver will split one request into multiple requests
4508 : * based on MDTS and stripe boundary, the bdev layer will use
4509 : * max_segment_size and max_num_segments to split one big IO
4510 : * into multiple requests, then small request can't run out
4511 : * of NVMe internal requests data structure.
4512 : */
4513 39 : if (opts && opts->io_queue_requests) {
4514 0 : disk->max_num_segments = opts->io_queue_requests / 2;
4515 0 : }
4516 39 : if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
4517 : /* The nvme driver will try to split I/O that have too many
4518 : * SGEs, but it doesn't work if that last SGE doesn't end on
4519 : * an aggregate total that is block aligned. The bdev layer has
4520 : * a more robust splitting framework, so use that instead for
4521 : * this case. (See issue #3269.)
4522 : */
4523 0 : uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr);
4524 :
4525 0 : if (disk->max_num_segments == 0) {
4526 0 : disk->max_num_segments = max_sges;
4527 0 : } else {
4528 0 : disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges);
4529 : }
4530 0 : }
4531 39 : disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
4532 :
4533 39 : nsdata = spdk_nvme_ns_get_data(ns);
4534 39 : bs = spdk_nvme_ns_get_sector_size(ns);
4535 39 : atomic_bs = bs;
4536 39 : phys_bs = bs;
4537 39 : if (nsdata->nabo == 0) {
4538 39 : if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
4539 0 : atomic_bs = bs * (1 + nsdata->nawupf);
4540 0 : } else {
4541 39 : atomic_bs = bs * (1 + cdata->awupf);
4542 : }
4543 39 : }
4544 39 : if (nsdata->nsfeat.optperf) {
4545 0 : phys_bs = bs * (1 + nsdata->npwg);
4546 0 : }
4547 39 : disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
4548 :
4549 39 : disk->md_len = spdk_nvme_ns_get_md_size(ns);
4550 39 : if (disk->md_len != 0) {
4551 0 : disk->md_interleave = nsdata->flbas.extended;
4552 0 : disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
4553 0 : if (disk->dif_type != SPDK_DIF_DISABLE) {
4554 0 : disk->dif_is_head_of_md = nsdata->dps.md_start;
4555 0 : disk->dif_check_flags = bdev_opts->prchk_flags;
4556 0 : disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns);
4557 0 : }
4558 0 : }
4559 :
4560 39 : if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
4561 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
4562 39 : disk->acwu = 0;
4563 39 : } else if (nsdata->nsfeat.ns_atomic_write_unit) {
4564 0 : disk->acwu = nsdata->nacwu + 1; /* 0-based */
4565 0 : } else {
4566 0 : disk->acwu = cdata->acwu + 1; /* 0-based */
4567 : }
4568 :
4569 39 : if (cdata->oncs.copy) {
4570 : /* For now bdev interface allows only single segment copy */
4571 0 : disk->max_copy = nsdata->mssrl;
4572 0 : }
4573 :
4574 39 : disk->ctxt = ctx;
4575 39 : disk->fn_table = &nvmelib_fn_table;
4576 39 : disk->module = &nvme_if;
4577 :
4578 39 : disk->numa.id_valid = 1;
4579 39 : disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
4580 :
4581 39 : return 0;
4582 39 : }
4583 :
4584 : static struct nvme_bdev *
4585 39 : nvme_bdev_alloc(void)
4586 : {
4587 39 : struct nvme_bdev *bdev;
4588 39 : int rc;
4589 :
4590 39 : bdev = calloc(1, sizeof(*bdev));
4591 39 : if (!bdev) {
4592 0 : SPDK_ERRLOG("bdev calloc() failed\n");
4593 0 : return NULL;
4594 : }
4595 :
4596 39 : if (g_opts.nvme_error_stat) {
4597 0 : bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat));
4598 0 : if (!bdev->err_stat) {
4599 0 : SPDK_ERRLOG("err_stat calloc() failed\n");
4600 0 : free(bdev);
4601 0 : return NULL;
4602 : }
4603 0 : }
4604 :
4605 39 : rc = pthread_mutex_init(&bdev->mutex, NULL);
4606 39 : if (rc != 0) {
4607 0 : free(bdev->err_stat);
4608 0 : free(bdev);
4609 0 : return NULL;
4610 : }
4611 :
4612 39 : bdev->ref = 1;
4613 39 : bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
4614 39 : bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
4615 39 : bdev->rr_min_io = UINT32_MAX;
4616 39 : TAILQ_INIT(&bdev->nvme_ns_list);
4617 :
4618 39 : return bdev;
4619 39 : }
4620 :
4621 : static int
4622 39 : nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4623 : {
4624 39 : struct nvme_bdev *bdev;
4625 39 : struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
4626 39 : int rc;
4627 :
4628 39 : bdev = nvme_bdev_alloc();
4629 39 : if (bdev == NULL) {
4630 0 : SPDK_ERRLOG("Failed to allocate NVMe bdev\n");
4631 0 : return -ENOMEM;
4632 : }
4633 :
4634 39 : bdev->opal = nvme_ctrlr->opal_dev != NULL;
4635 :
4636 78 : rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
4637 39 : nvme_ns->ns, &nvme_ctrlr->opts, bdev);
4638 39 : if (rc != 0) {
4639 0 : SPDK_ERRLOG("Failed to create NVMe disk\n");
4640 0 : nvme_bdev_free(bdev);
4641 0 : return rc;
4642 : }
4643 :
4644 78 : spdk_io_device_register(bdev,
4645 : bdev_nvme_create_bdev_channel_cb,
4646 : bdev_nvme_destroy_bdev_channel_cb,
4647 : sizeof(struct nvme_bdev_channel),
4648 39 : bdev->disk.name);
4649 :
4650 39 : nvme_ns->bdev = bdev;
4651 39 : bdev->nsid = nvme_ns->id;
4652 39 : TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
4653 :
4654 39 : bdev->nbdev_ctrlr = nbdev_ctrlr;
4655 39 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq);
4656 :
4657 39 : rc = spdk_bdev_register(&bdev->disk);
4658 39 : if (rc != 0) {
4659 1 : SPDK_ERRLOG("spdk_bdev_register() failed\n");
4660 1 : spdk_io_device_unregister(bdev, NULL);
4661 1 : nvme_ns->bdev = NULL;
4662 1 : TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq);
4663 1 : nvme_bdev_free(bdev);
4664 1 : return rc;
4665 : }
4666 :
4667 38 : return 0;
4668 39 : }
4669 :
4670 : static bool
4671 23 : bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
4672 : {
4673 23 : const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
4674 23 : const struct spdk_uuid *uuid1, *uuid2;
4675 :
4676 23 : nsdata1 = spdk_nvme_ns_get_data(ns1);
4677 23 : nsdata2 = spdk_nvme_ns_get_data(ns2);
4678 23 : uuid1 = spdk_nvme_ns_get_uuid(ns1);
4679 23 : uuid2 = spdk_nvme_ns_get_uuid(ns2);
4680 :
4681 71 : return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
4682 22 : nsdata1->eui64 == nsdata2->eui64 &&
4683 21 : ((uuid1 == NULL && uuid2 == NULL) ||
4684 29 : (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) &&
4685 18 : spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2);
4686 49 : }
4687 :
4688 : static bool
4689 0 : hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
4690 : struct spdk_nvme_ctrlr_opts *opts)
4691 : {
4692 0 : struct nvme_probe_skip_entry *entry;
4693 :
4694 0 : TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
4695 0 : if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
4696 0 : return false;
4697 : }
4698 0 : }
4699 :
4700 0 : opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
4701 0 : opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
4702 0 : opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
4703 0 : opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
4704 0 : opts->disable_read_ana_log_page = true;
4705 :
4706 0 : SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
4707 :
4708 0 : return true;
4709 0 : }
4710 :
4711 : static void
4712 0 : nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
4713 : {
4714 0 : struct nvme_ctrlr *nvme_ctrlr = ctx;
4715 :
4716 0 : if (spdk_nvme_cpl_is_error(cpl)) {
4717 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Abort failed. Resetting controller. sc is %u, sct is %u.\n",
4718 : cpl->status.sc, cpl->status.sct);
4719 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4720 0 : } else if (cpl->cdw0 & 0x1) {
4721 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Specified command could not be aborted.\n");
4722 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4723 0 : }
4724 0 : }
4725 :
4726 : static void
4727 0 : timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
4728 : struct spdk_nvme_qpair *qpair, uint16_t cid)
4729 : {
4730 0 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
4731 0 : union spdk_nvme_csts_register csts;
4732 0 : int rc;
4733 :
4734 0 : assert(nvme_ctrlr->ctrlr == ctrlr);
4735 :
4736 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n",
4737 : ctrlr, qpair, cid);
4738 :
4739 : /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
4740 : * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we
4741 : * would submit another fabrics cmd on the admin queue to read CSTS and check for its
4742 : * completion recursively.
4743 : */
4744 0 : if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
4745 0 : csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
4746 0 : if (csts.bits.cfs) {
4747 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Controller Fatal Status, reset required\n");
4748 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4749 0 : return;
4750 : }
4751 0 : }
4752 :
4753 0 : switch (g_opts.action_on_timeout) {
4754 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
4755 0 : if (qpair) {
4756 : /* Don't send abort to ctrlr when ctrlr is not available. */
4757 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4758 0 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
4759 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4760 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Quit abort. Ctrlr is not available.\n");
4761 0 : return;
4762 : }
4763 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4764 :
4765 0 : rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
4766 0 : nvme_abort_cpl, nvme_ctrlr);
4767 0 : if (rc == 0) {
4768 0 : return;
4769 : }
4770 :
4771 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to send abort. Resetting, rc is %d.\n", rc);
4772 0 : }
4773 :
4774 : /* FALLTHROUGH */
4775 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
4776 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4777 0 : break;
4778 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
4779 0 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "No action for nvme controller timeout.\n");
4780 0 : break;
4781 : default:
4782 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "An invalid timeout action value is found.\n");
4783 0 : break;
4784 : }
4785 0 : }
4786 :
4787 : static struct nvme_ns *
4788 52 : nvme_ns_alloc(void)
4789 : {
4790 52 : struct nvme_ns *nvme_ns;
4791 :
4792 52 : nvme_ns = calloc(1, sizeof(struct nvme_ns));
4793 52 : if (nvme_ns == NULL) {
4794 0 : return NULL;
4795 : }
4796 :
4797 52 : if (g_opts.io_path_stat) {
4798 0 : nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
4799 0 : if (nvme_ns->stat == NULL) {
4800 0 : free(nvme_ns);
4801 0 : return NULL;
4802 : }
4803 0 : spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
4804 0 : }
4805 :
4806 52 : return nvme_ns;
4807 52 : }
4808 :
4809 : static void
4810 52 : nvme_ns_free(struct nvme_ns *nvme_ns)
4811 : {
4812 52 : free(nvme_ns->stat);
4813 52 : free(nvme_ns);
4814 52 : }
4815 :
4816 : static void
4817 52 : nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
4818 : {
4819 52 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4820 52 : struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
4821 :
4822 52 : if (rc == 0) {
4823 50 : nvme_ns->probe_ctx = NULL;
4824 50 : nvme_ctrlr_get_ref(nvme_ctrlr);
4825 50 : } else {
4826 2 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4827 2 : nvme_ns_free(nvme_ns);
4828 : }
4829 :
4830 52 : if (ctx) {
4831 51 : ctx->populates_in_progress--;
4832 51 : if (ctx->populates_in_progress == 0) {
4833 12 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
4834 12 : }
4835 51 : }
4836 52 : }
4837 :
/*
 * Per-channel step of nvme_bdev_for_each_channel(): add an I/O path for the
 * namespace passed in ctx to nbdev_ch, then continue the iteration with the
 * result so a failure is propagated to the completion callback.
 */
static void
bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i,
		      struct nvme_bdev *nbdev,
		      struct nvme_bdev_channel *nbdev_ch, void *ctx)
{
	int status = _bdev_nvme_add_io_path(nbdev_ch, (struct nvme_ns *)ctx);

	if (status != 0) {
		SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
	}

	nvme_bdev_for_each_channel_continue(i, status);
}
4853 :
/*
 * Per-channel step of nvme_bdev_for_each_channel(): delete the I/O path that
 * nbdev_ch holds for the namespace passed in ctx, if there is one. The
 * iteration always continues with status 0.
 */
static void
bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i,
			 struct nvme_bdev *nbdev,
			 struct nvme_bdev_channel *nbdev_ch, void *ctx)
{
	struct nvme_io_path *path_to_drop = _bdev_nvme_get_io_path(nbdev_ch, (struct nvme_ns *)ctx);

	if (path_to_drop != NULL) {
		_bdev_nvme_delete_io_path(nbdev_ch, path_to_drop);
	}

	nvme_bdev_for_each_channel_continue(i, 0);
}
4869 :
/*
 * Completion of the roll-back iteration started by bdev_nvme_add_io_path_done():
 * report the namespace in ctx as failed to populate. nbdev and status are not
 * used.
 */
static void
bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status)
{
	nvme_ctrlr_populate_namespace_done((struct nvme_ns *)ctx, -1);
}
4877 :
4878 : static void
4879 12 : bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
4880 : {
4881 12 : struct nvme_ns *nvme_ns = ctx;
4882 :
4883 12 : if (status == 0) {
4884 12 : nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
4885 12 : } else {
4886 : /* Delete the added io_paths and fail populating the namespace. */
4887 0 : nvme_bdev_for_each_channel(nbdev,
4888 : bdev_nvme_delete_io_path,
4889 0 : nvme_ns,
4890 : bdev_nvme_add_io_path_failed);
4891 : }
4892 12 : }
4893 :
4894 : static int
4895 13 : nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns)
4896 : {
4897 13 : struct nvme_ns *tmp_ns;
4898 13 : const struct spdk_nvme_ns_data *nsdata;
4899 :
4900 13 : nsdata = spdk_nvme_ns_get_data(nvme_ns->ns);
4901 13 : if (!nsdata->nmic.can_share) {
4902 0 : SPDK_ERRLOG("Namespace cannot be shared.\n");
4903 0 : return -EINVAL;
4904 : }
4905 :
4906 13 : pthread_mutex_lock(&bdev->mutex);
4907 :
4908 13 : tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list);
4909 13 : assert(tmp_ns != NULL);
4910 :
4911 13 : if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
4912 1 : pthread_mutex_unlock(&bdev->mutex);
4913 1 : SPDK_ERRLOG("Namespaces are not identical.\n");
4914 1 : return -EINVAL;
4915 : }
4916 :
4917 12 : bdev->ref++;
4918 12 : TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
4919 12 : nvme_ns->bdev = bdev;
4920 :
4921 12 : pthread_mutex_unlock(&bdev->mutex);
4922 :
4923 : /* Add nvme_io_path to nvme_bdev_channels dynamically. */
4924 24 : nvme_bdev_for_each_channel(bdev,
4925 : bdev_nvme_add_io_path,
4926 12 : nvme_ns,
4927 : bdev_nvme_add_io_path_done);
4928 :
4929 12 : return 0;
4930 13 : }
4931 :
4932 : static void
4933 52 : nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4934 : {
4935 52 : struct spdk_nvme_ns *ns;
4936 52 : struct nvme_bdev *bdev;
4937 52 : int rc = 0;
4938 :
4939 52 : ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
4940 52 : if (!ns) {
4941 0 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "Invalid NS %d\n", nvme_ns->id);
4942 0 : rc = -EINVAL;
4943 0 : goto done;
4944 : }
4945 :
4946 52 : nvme_ns->ns = ns;
4947 52 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
4948 :
4949 52 : if (nvme_ctrlr->ana_log_page != NULL) {
4950 38 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
4951 38 : }
4952 :
4953 52 : bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
4954 92 : if (bdev == NULL) {
4955 39 : rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
4956 39 : } else {
4957 13 : rc = nvme_bdev_add_ns(bdev, nvme_ns);
4958 13 : if (rc == 0) {
4959 12 : return;
4960 : }
4961 : }
4962 : done:
4963 40 : nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
4964 52 : }
4965 :
4966 : static void
4967 50 : nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
4968 : {
4969 50 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4970 :
4971 50 : assert(nvme_ctrlr != NULL);
4972 :
4973 50 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4974 :
4975 50 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4976 :
4977 50 : if (nvme_ns->bdev != NULL) {
4978 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4979 0 : return;
4980 : }
4981 :
4982 50 : nvme_ns_free(nvme_ns);
4983 50 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4984 :
4985 50 : nvme_ctrlr_put_ref(nvme_ctrlr);
4986 50 : }
4987 :
/*
 * Completion of the "delete I/O path" iteration: every channel has dropped its
 * path to the namespace in ctx, so the depopulation can be finished. nbdev and
 * status are not used.
 */
static void
bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
{
	nvme_ctrlr_depopulate_namespace_done((struct nvme_ns *)ctx);
}
4995 :
4996 : static void
4997 50 : nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4998 : {
4999 50 : struct nvme_bdev *bdev;
5000 :
5001 50 : spdk_poller_unregister(&nvme_ns->anatt_timer);
5002 :
5003 50 : bdev = nvme_ns->bdev;
5004 50 : if (bdev != NULL) {
5005 46 : pthread_mutex_lock(&bdev->mutex);
5006 :
5007 46 : assert(bdev->ref > 0);
5008 46 : bdev->ref--;
5009 46 : if (bdev->ref == 0) {
5010 35 : pthread_mutex_unlock(&bdev->mutex);
5011 :
5012 35 : spdk_bdev_unregister(&bdev->disk, NULL, NULL);
5013 35 : } else {
5014 : /* spdk_bdev_unregister() is not called until the last nvme_ns is
5015 : * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
5016 : * and clear nvme_ns->bdev here.
5017 : */
5018 11 : TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq);
5019 11 : nvme_ns->bdev = NULL;
5020 :
5021 11 : pthread_mutex_unlock(&bdev->mutex);
5022 :
5023 : /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
5024 : * we call depopulate_namespace_done() to avoid use-after-free.
5025 : */
5026 22 : nvme_bdev_for_each_channel(bdev,
5027 : bdev_nvme_delete_io_path,
5028 11 : nvme_ns,
5029 : bdev_nvme_delete_io_path_done);
5030 11 : return;
5031 : }
5032 35 : }
5033 :
5034 39 : nvme_ctrlr_depopulate_namespace_done(nvme_ns);
5035 50 : }
5036 :
5037 : static void
5038 63 : nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
5039 : struct nvme_async_probe_ctx *ctx)
5040 : {
5041 63 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5042 63 : struct nvme_ns *nvme_ns, *next;
5043 63 : struct spdk_nvme_ns *ns;
5044 63 : struct nvme_bdev *bdev;
5045 63 : uint32_t nsid;
5046 63 : int rc;
5047 63 : uint64_t num_sectors;
5048 :
5049 63 : if (ctx) {
5050 : /* Initialize this count to 1 to handle the populate functions
5051 : * calling nvme_ctrlr_populate_namespace_done() immediately.
5052 : */
5053 47 : ctx->populates_in_progress = 1;
5054 47 : }
5055 :
5056 : /* First loop over our existing namespaces and see if they have been
5057 : * removed. */
5058 63 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
5059 67 : while (nvme_ns != NULL) {
5060 4 : next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
5061 :
5062 4 : if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
5063 : /* NS is still there or added again. Its attributes may have changed. */
5064 3 : ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
5065 3 : if (nvme_ns->ns != ns) {
5066 1 : assert(nvme_ns->ns == NULL);
5067 1 : nvme_ns->ns = ns;
5068 1 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "NSID %u was added\n", nvme_ns->id);
5069 1 : }
5070 :
5071 3 : num_sectors = spdk_nvme_ns_get_num_sectors(ns);
5072 3 : bdev = nvme_ns->bdev;
5073 3 : assert(bdev != NULL);
5074 3 : if (bdev->disk.blockcnt != num_sectors) {
5075 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr,
5076 : "NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
5077 : nvme_ns->id,
5078 : bdev->disk.name,
5079 : bdev->disk.blockcnt,
5080 : num_sectors);
5081 1 : rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
5082 1 : if (rc != 0) {
5083 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
5084 : "Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
5085 : bdev->disk.name, rc);
5086 0 : }
5087 1 : }
5088 3 : } else {
5089 : /* Namespace was removed */
5090 1 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
5091 : }
5092 :
5093 4 : nvme_ns = next;
5094 : }
5095 :
5096 : /* Loop through all of the namespaces at the nvme level and see if any of them are new */
5097 63 : nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
5098 118 : while (nsid != 0) {
5099 55 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
5100 :
5101 55 : if (nvme_ns == NULL) {
5102 : /* Found a new one */
5103 52 : nvme_ns = nvme_ns_alloc();
5104 52 : if (nvme_ns == NULL) {
5105 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate namespace\n");
5106 : /* This just fails to attach the namespace. It may work on a future attempt. */
5107 0 : continue;
5108 : }
5109 :
5110 52 : nvme_ns->id = nsid;
5111 52 : nvme_ns->ctrlr = nvme_ctrlr;
5112 :
5113 52 : nvme_ns->bdev = NULL;
5114 :
5115 52 : if (ctx) {
5116 51 : ctx->populates_in_progress++;
5117 51 : }
5118 52 : nvme_ns->probe_ctx = ctx;
5119 :
5120 52 : RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
5121 :
5122 52 : nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
5123 52 : }
5124 :
5125 55 : nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
5126 : }
5127 :
5128 63 : if (ctx) {
5129 : /* Decrement this count now that the loop is over to account
5130 : * for the one we started with. If the count is then 0, we
5131 : * know any populate_namespace functions completed immediately,
5132 : * so we'll kick the callback here.
5133 : */
5134 47 : ctx->populates_in_progress--;
5135 47 : if (ctx->populates_in_progress == 0) {
5136 35 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
5137 35 : }
5138 47 : }
5139 :
5140 63 : }
5141 :
5142 : static void
5143 62 : nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
5144 : {
5145 62 : struct nvme_ns *nvme_ns, *tmp;
5146 :
5147 111 : RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
5148 49 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
5149 49 : }
5150 62 : }
5151 :
5152 : static uint32_t
5153 37 : nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr)
5154 : {
5155 37 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5156 37 : const struct spdk_nvme_ctrlr_data *cdata;
5157 37 : uint32_t nsid, ns_count = 0;
5158 :
5159 37 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5160 :
5161 82 : for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
5162 82 : nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
5163 45 : ns_count++;
5164 45 : }
5165 :
5166 37 : return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
5167 37 : sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count *
5168 : sizeof(uint32_t);
5169 37 : }
5170 :
5171 : static int
5172 7 : nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
5173 : void *cb_arg)
5174 : {
5175 7 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
5176 7 : struct nvme_ns *nvme_ns;
5177 7 : uint32_t i, nsid;
5178 :
5179 13 : for (i = 0; i < desc->num_of_nsid; i++) {
5180 6 : nsid = desc->nsid[i];
5181 6 : if (nsid == 0) {
5182 0 : continue;
5183 : }
5184 :
5185 6 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
5186 :
5187 6 : if (nvme_ns == NULL) {
5188 : /* Target told us that an inactive namespace had an ANA change */
5189 1 : continue;
5190 : }
5191 :
5192 5 : _nvme_ns_set_ana_state(nvme_ns, desc);
5193 5 : }
5194 :
5195 7 : return 0;
5196 7 : }
5197 :
5198 : static void
5199 0 : bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5200 : {
5201 0 : struct nvme_ns *nvme_ns;
5202 :
5203 0 : spdk_free(nvme_ctrlr->ana_log_page);
5204 0 : nvme_ctrlr->ana_log_page = NULL;
5205 :
5206 0 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
5207 0 : nvme_ns != NULL;
5208 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
5209 0 : nvme_ns->ana_state_updating = false;
5210 0 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
5211 0 : }
5212 0 : }
5213 :
5214 : static void
5215 3 : nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
5216 : {
5217 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
5218 :
5219 3 : if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
5220 6 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
5221 3 : nvme_ctrlr);
5222 3 : } else {
5223 0 : bdev_nvme_disable_read_ana_log_page(nvme_ctrlr);
5224 : }
5225 :
5226 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5227 :
5228 3 : assert(nvme_ctrlr->ana_log_page_updating == true);
5229 3 : nvme_ctrlr->ana_log_page_updating = false;
5230 :
5231 3 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
5232 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5233 :
5234 0 : nvme_ctrlr_unregister(nvme_ctrlr);
5235 0 : } else {
5236 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5237 :
5238 3 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
5239 : }
5240 3 : }
5241 :
5242 : static int
5243 6 : nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5244 : {
5245 6 : uint32_t ana_log_page_size;
5246 6 : int rc;
5247 :
5248 6 : if (nvme_ctrlr->ana_log_page == NULL) {
5249 0 : return -EINVAL;
5250 : }
5251 :
5252 6 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5253 :
5254 6 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5255 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
5256 : "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5257 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5258 0 : return -EINVAL;
5259 : }
5260 :
5261 6 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5262 11 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
5263 5 : nvme_ctrlr->ana_log_page_updating) {
5264 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5265 3 : return -EBUSY;
5266 : }
5267 :
5268 3 : nvme_ctrlr->ana_log_page_updating = true;
5269 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5270 :
5271 6 : rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
5272 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5273 : SPDK_NVME_GLOBAL_NS_TAG,
5274 3 : nvme_ctrlr->ana_log_page,
5275 3 : ana_log_page_size, 0,
5276 : nvme_ctrlr_read_ana_log_page_done,
5277 3 : nvme_ctrlr);
5278 3 : if (rc != 0) {
5279 0 : nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
5280 0 : }
5281 :
5282 3 : return rc;
5283 6 : }
5284 :
5285 : static void
5286 0 : dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
5287 : {
5288 0 : }
5289 :
5290 : struct bdev_nvme_set_preferred_path_ctx {
5291 : struct spdk_bdev_desc *desc;
5292 : struct nvme_ns *nvme_ns;
5293 : bdev_nvme_set_preferred_path_cb cb_fn;
5294 : void *cb_arg;
5295 : };
5296 :
5297 : static void
5298 3 : bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5299 : {
5300 3 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5301 :
5302 3 : assert(ctx != NULL);
5303 3 : assert(ctx->desc != NULL);
5304 3 : assert(ctx->cb_fn != NULL);
5305 :
5306 3 : spdk_bdev_close(ctx->desc);
5307 :
5308 3 : ctx->cb_fn(ctx->cb_arg, status);
5309 :
5310 3 : free(ctx);
5311 3 : }
5312 :
5313 : static void
5314 2 : _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i,
5315 : struct nvme_bdev *nbdev,
5316 : struct nvme_bdev_channel *nbdev_ch, void *_ctx)
5317 : {
5318 2 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5319 2 : struct nvme_io_path *io_path, *prev;
5320 :
5321 2 : prev = NULL;
5322 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5323 3 : if (io_path->nvme_ns == ctx->nvme_ns) {
5324 2 : break;
5325 : }
5326 1 : prev = io_path;
5327 1 : }
5328 :
5329 2 : if (io_path != NULL) {
5330 2 : if (prev != NULL) {
5331 1 : STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq);
5332 1 : STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq);
5333 1 : }
5334 :
5335 : /* We can set io_path to nbdev_ch->current_io_path directly here.
5336 : * However, it needs to be conditional. To simplify the code,
5337 : * just clear nbdev_ch->current_io_path and let find_io_path()
5338 : * fill it.
5339 : *
5340 : * Automatic failback may be disabled. Hence even if the io_path is
5341 : * already at the head, clear nbdev_ch->current_io_path.
5342 : */
5343 2 : bdev_nvme_clear_current_io_path(nbdev_ch);
5344 2 : }
5345 :
5346 2 : nvme_bdev_for_each_channel_continue(i, 0);
5347 2 : }
5348 :
5349 : static struct nvme_ns *
5350 3 : bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid)
5351 : {
5352 3 : struct nvme_ns *nvme_ns, *prev;
5353 3 : const struct spdk_nvme_ctrlr_data *cdata;
5354 :
5355 3 : prev = NULL;
5356 6 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
5357 6 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
5358 :
5359 6 : if (cdata->cntlid == cntlid) {
5360 3 : break;
5361 : }
5362 3 : prev = nvme_ns;
5363 3 : }
5364 :
5365 3 : if (nvme_ns != NULL && prev != NULL) {
5366 2 : TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq);
5367 2 : TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq);
5368 2 : }
5369 :
5370 6 : return nvme_ns;
5371 3 : }
5372 :
5373 : /* This function supports only multipath mode. There is only a single I/O path
5374 : * for each NVMe-oF controller. Hence, just move the matched I/O path to the
5375 : * head of the I/O path list for each NVMe bdev channel.
5376 : *
5377 : * NVMe bdev channel may be acquired after completing this function. move the
5378 : * matched namespace to the head of the namespace list for the NVMe bdev too.
5379 : */
5380 : void
5381 3 : bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
5382 : bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg)
5383 : {
5384 3 : struct bdev_nvme_set_preferred_path_ctx *ctx;
5385 3 : struct spdk_bdev *bdev;
5386 3 : struct nvme_bdev *nbdev;
5387 3 : int rc = 0;
5388 :
5389 3 : assert(cb_fn != NULL);
5390 :
5391 3 : ctx = calloc(1, sizeof(*ctx));
5392 3 : if (ctx == NULL) {
5393 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5394 0 : rc = -ENOMEM;
5395 0 : goto err_alloc;
5396 : }
5397 :
5398 3 : ctx->cb_fn = cb_fn;
5399 3 : ctx->cb_arg = cb_arg;
5400 :
5401 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5402 3 : if (rc != 0) {
5403 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5404 0 : goto err_open;
5405 : }
5406 :
5407 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5408 :
5409 3 : if (bdev->module != &nvme_if) {
5410 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5411 0 : rc = -ENODEV;
5412 0 : goto err_bdev;
5413 : }
5414 :
5415 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5416 :
5417 3 : pthread_mutex_lock(&nbdev->mutex);
5418 :
5419 3 : ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid);
5420 3 : if (ctx->nvme_ns == NULL) {
5421 0 : pthread_mutex_unlock(&nbdev->mutex);
5422 :
5423 0 : SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid);
5424 0 : rc = -ENODEV;
5425 0 : goto err_bdev;
5426 : }
5427 :
5428 3 : pthread_mutex_unlock(&nbdev->mutex);
5429 :
5430 6 : nvme_bdev_for_each_channel(nbdev,
5431 : _bdev_nvme_set_preferred_path,
5432 3 : ctx,
5433 : bdev_nvme_set_preferred_path_done);
5434 3 : return;
5435 :
5436 : err_bdev:
5437 0 : spdk_bdev_close(ctx->desc);
5438 : err_open:
5439 0 : free(ctx);
5440 : err_alloc:
5441 0 : cb_fn(cb_arg, rc);
5442 3 : }
5443 :
5444 : struct bdev_nvme_set_multipath_policy_ctx {
5445 : struct spdk_bdev_desc *desc;
5446 : spdk_bdev_nvme_set_multipath_policy_cb cb_fn;
5447 : void *cb_arg;
5448 : };
5449 :
5450 : static void
5451 3 : bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5452 : {
5453 3 : struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx;
5454 :
5455 3 : assert(ctx != NULL);
5456 3 : assert(ctx->desc != NULL);
5457 3 : assert(ctx->cb_fn != NULL);
5458 :
5459 3 : spdk_bdev_close(ctx->desc);
5460 :
5461 3 : ctx->cb_fn(ctx->cb_arg, status);
5462 :
5463 3 : free(ctx);
5464 3 : }
5465 :
5466 : static void
5467 1 : _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i,
5468 : struct nvme_bdev *nbdev,
5469 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
5470 : {
5471 1 : nbdev_ch->mp_policy = nbdev->mp_policy;
5472 1 : nbdev_ch->mp_selector = nbdev->mp_selector;
5473 1 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
5474 1 : bdev_nvme_clear_current_io_path(nbdev_ch);
5475 :
5476 1 : nvme_bdev_for_each_channel_continue(i, 0);
5477 1 : }
5478 :
5479 : void
5480 3 : spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy,
5481 : enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io,
5482 : spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
5483 : {
5484 3 : struct bdev_nvme_set_multipath_policy_ctx *ctx;
5485 3 : struct spdk_bdev *bdev;
5486 3 : struct nvme_bdev *nbdev;
5487 3 : int rc;
5488 :
5489 3 : assert(cb_fn != NULL);
5490 :
5491 3 : switch (policy) {
5492 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
5493 1 : break;
5494 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
5495 2 : switch (selector) {
5496 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
5497 1 : if (rr_min_io == UINT32_MAX) {
5498 0 : rr_min_io = 1;
5499 1 : } else if (rr_min_io == 0) {
5500 0 : rc = -EINVAL;
5501 0 : goto exit;
5502 : }
5503 1 : break;
5504 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
5505 1 : break;
5506 : default:
5507 0 : rc = -EINVAL;
5508 0 : goto exit;
5509 : }
5510 2 : break;
5511 : default:
5512 0 : rc = -EINVAL;
5513 0 : goto exit;
5514 : }
5515 :
5516 3 : ctx = calloc(1, sizeof(*ctx));
5517 3 : if (ctx == NULL) {
5518 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5519 0 : rc = -ENOMEM;
5520 0 : goto exit;
5521 : }
5522 :
5523 3 : ctx->cb_fn = cb_fn;
5524 3 : ctx->cb_arg = cb_arg;
5525 :
5526 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5527 3 : if (rc != 0) {
5528 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5529 0 : rc = -ENODEV;
5530 0 : goto err_open;
5531 : }
5532 :
5533 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5534 3 : if (bdev->module != &nvme_if) {
5535 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5536 0 : rc = -ENODEV;
5537 0 : goto err_module;
5538 : }
5539 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5540 :
5541 3 : pthread_mutex_lock(&nbdev->mutex);
5542 3 : nbdev->mp_policy = policy;
5543 3 : nbdev->mp_selector = selector;
5544 3 : nbdev->rr_min_io = rr_min_io;
5545 3 : pthread_mutex_unlock(&nbdev->mutex);
5546 :
5547 6 : nvme_bdev_for_each_channel(nbdev,
5548 : _bdev_nvme_set_multipath_policy,
5549 3 : ctx,
5550 : bdev_nvme_set_multipath_policy_done);
5551 3 : return;
5552 :
5553 : err_module:
5554 0 : spdk_bdev_close(ctx->desc);
5555 : err_open:
5556 0 : free(ctx);
5557 : exit:
5558 0 : cb_fn(cb_arg, rc);
5559 3 : }
5560 :
5561 : static void
5562 3 : aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
5563 : {
5564 3 : struct nvme_ctrlr *nvme_ctrlr = arg;
5565 3 : union spdk_nvme_async_event_completion event;
5566 :
5567 3 : if (spdk_nvme_cpl_is_error(cpl)) {
5568 0 : SPDK_WARNLOG("AER request execute failed\n");
5569 0 : return;
5570 : }
5571 :
5572 3 : event.raw = cpl->cdw0;
5573 3 : if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5574 3 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
5575 2 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
5576 3 : } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5577 1 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
5578 1 : nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
5579 1 : }
5580 3 : }
5581 :
5582 : static void
5583 53 : free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx)
5584 : {
5585 53 : spdk_keyring_put_key(ctx->drv_opts.tls_psk);
5586 53 : spdk_keyring_put_key(ctx->drv_opts.dhchap_key);
5587 53 : spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key);
5588 53 : free(ctx->base_name);
5589 53 : free(ctx);
5590 53 : }
5591 :
5592 : static void
5593 53 : populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc)
5594 : {
5595 53 : if (ctx->cb_fn) {
5596 53 : ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc);
5597 53 : }
5598 :
5599 53 : ctx->namespaces_populated = true;
5600 53 : if (ctx->probe_done) {
5601 : /* The probe was already completed, so we need to free the context
5602 : * here. This can happen for cases like OCSSD, where we need to
5603 : * send additional commands to the SSD after attach.
5604 : */
5605 32 : free_nvme_async_probe_ctx(ctx);
5606 32 : }
5607 53 : }
5608 :
5609 : static int
5610 20 : bdev_nvme_remove_poller(void *ctx)
5611 : {
5612 20 : struct spdk_nvme_transport_id trid_pcie;
5613 :
5614 20 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
5615 1 : spdk_poller_unregister(&g_hotplug_poller);
5616 1 : return SPDK_POLLER_IDLE;
5617 : }
5618 :
5619 19 : memset(&trid_pcie, 0, sizeof(trid_pcie));
5620 19 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
5621 :
5622 19 : if (spdk_nvme_scan_attached(&trid_pcie)) {
5623 0 : SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n");
5624 0 : }
5625 :
5626 19 : return SPDK_POLLER_BUSY;
5627 20 : }
5628 :
5629 : static void
5630 61 : nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
5631 : struct nvme_async_probe_ctx *ctx)
5632 : {
5633 61 : struct spdk_nvme_transport_id *trid = &nvme_ctrlr->active_path_id->trid;
5634 :
5635 61 : if (spdk_nvme_trtype_is_fabrics(trid->trtype)) {
5636 61 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created to %s:%s\n",
5637 : trid->traddr, trid->trsvcid);
5638 61 : } else {
5639 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created\n");
5640 : }
5641 :
5642 122 : spdk_io_device_register(nvme_ctrlr,
5643 : bdev_nvme_create_ctrlr_channel_cb,
5644 : bdev_nvme_destroy_ctrlr_channel_cb,
5645 : sizeof(struct nvme_ctrlr_channel),
5646 61 : nvme_ctrlr->nbdev_ctrlr->name);
5647 :
5648 61 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
5649 :
5650 61 : if (g_hotplug_poller == NULL) {
5651 2 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
5652 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
5653 2 : }
5654 61 : }
5655 :
5656 : static void
5657 31 : nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
5658 : {
5659 31 : struct nvme_ctrlr *nvme_ctrlr = _ctx;
5660 31 : struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
5661 :
5662 31 : nvme_ctrlr->probe_ctx = NULL;
5663 :
5664 31 : if (spdk_nvme_cpl_is_error(cpl)) {
5665 0 : nvme_ctrlr_delete(nvme_ctrlr);
5666 :
5667 0 : if (ctx != NULL) {
5668 0 : ctx->reported_bdevs = 0;
5669 0 : populate_namespaces_cb(ctx, -1);
5670 0 : }
5671 0 : return;
5672 : }
5673 :
5674 31 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
5675 31 : }
5676 :
5677 : static int
5678 31 : nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
5679 : struct nvme_async_probe_ctx *ctx)
5680 : {
5681 31 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5682 31 : const struct spdk_nvme_ctrlr_data *cdata;
5683 31 : uint32_t ana_log_page_size;
5684 :
5685 31 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5686 :
5687 : /* Set buffer size enough to include maximum number of allowed namespaces. */
5688 62 : ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
5689 31 : sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan *
5690 : sizeof(uint32_t);
5691 :
5692 31 : nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
5693 : SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
5694 31 : if (nvme_ctrlr->ana_log_page == NULL) {
5695 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate ANA log page buffer\n");
5696 0 : return -ENXIO;
5697 : }
5698 :
5699 : /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned.
5700 : * Hence copy each descriptor to a temporary area when parsing it.
5701 : *
5702 : * Allocate a buffer whose size is as large as ANA log page buffer because
5703 : * we do not know the size of a descriptor until actually reading it.
5704 : */
5705 31 : nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
5706 31 : if (nvme_ctrlr->copied_ana_desc == NULL) {
5707 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate a buffer to parse ANA descriptor\n");
5708 0 : return -ENOMEM;
5709 : }
5710 :
5711 31 : nvme_ctrlr->max_ana_log_page_size = ana_log_page_size;
5712 :
5713 31 : nvme_ctrlr->probe_ctx = ctx;
5714 :
5715 : /* Then, set the read size only to include the current active namespaces. */
5716 31 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5717 :
5718 31 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5719 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5720 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5721 0 : return -EINVAL;
5722 : }
5723 :
5724 62 : return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
5725 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5726 : SPDK_NVME_GLOBAL_NS_TAG,
5727 31 : nvme_ctrlr->ana_log_page,
5728 31 : ana_log_page_size, 0,
5729 : nvme_ctrlr_init_ana_log_page_done,
5730 31 : nvme_ctrlr);
5731 31 : }
5732 :
5733 : /* hostnqn and subnqn were already verified before attaching a controller.
5734 : * Hence check only the multipath capability and cntlid here.
5735 : */
5736 : static bool
5737 16 : bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
5738 : {
5739 16 : struct nvme_ctrlr *tmp;
5740 16 : const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
5741 :
5742 16 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5743 :
5744 16 : if (!cdata->cmic.multi_ctrlr) {
5745 0 : SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
5746 0 : return false;
5747 : }
5748 :
5749 33 : TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
5750 18 : tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
5751 :
5752 18 : if (!tmp_cdata->cmic.multi_ctrlr) {
5753 0 : NVME_CTRLR_ERRLOG(tmp, "Ctrlr%u does not support multipath.\n", cdata->cntlid);
5754 0 : return false;
5755 : }
5756 18 : if (cdata->cntlid == tmp_cdata->cntlid) {
5757 1 : NVME_CTRLR_ERRLOG(tmp, "cntlid %u are duplicated.\n", tmp_cdata->cntlid);
5758 1 : return false;
5759 : }
5760 17 : }
5761 :
5762 15 : return true;
5763 16 : }
5764 :
5765 :
5766 : static int
5767 62 : nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
5768 : {
5769 62 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
5770 62 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5771 62 : struct nvme_ctrlr *nctrlr;
5772 62 : int rc = 0;
5773 :
5774 62 : pthread_mutex_lock(&g_bdev_nvme_mutex);
5775 :
5776 62 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
5777 62 : if (nbdev_ctrlr != NULL) {
5778 16 : if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
5779 1 : rc = -EINVAL;
5780 1 : goto exit;
5781 : }
5782 32 : TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
5783 17 : if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) {
5784 : /* All controllers with the same name must be configured the same
5785 : * way, either for multipath or failover. If the configuration doesn't
5786 : * match - report error.
5787 : */
5788 0 : rc = -EINVAL;
5789 0 : goto exit;
5790 : }
5791 17 : }
5792 15 : } else {
5793 46 : nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
5794 46 : if (nbdev_ctrlr == NULL) {
5795 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_bdev_ctrlr.\n");
5796 0 : rc = -ENOMEM;
5797 0 : goto exit;
5798 : }
5799 46 : nbdev_ctrlr->name = strdup(name);
5800 46 : if (nbdev_ctrlr->name == NULL) {
5801 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate name of nvme_bdev_ctrlr.\n");
5802 0 : free(nbdev_ctrlr);
5803 0 : goto exit;
5804 : }
5805 46 : TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
5806 46 : TAILQ_INIT(&nbdev_ctrlr->bdevs);
5807 46 : TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
5808 : }
5809 61 : nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
5810 61 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
5811 : exit:
5812 62 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
5813 124 : return rc;
5814 62 : }
5815 :
5816 : static int
5817 62 : nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
5818 : const char *name,
5819 : const struct spdk_nvme_transport_id *trid,
5820 : struct nvme_async_probe_ctx *ctx)
5821 : {
5822 62 : struct nvme_ctrlr *nvme_ctrlr;
5823 62 : struct nvme_path_id *path_id;
5824 62 : const struct spdk_nvme_ctrlr_data *cdata;
5825 62 : struct spdk_event_handler_opts opts = {
5826 : .opts_size = SPDK_SIZEOF(&opts, fd_type),
5827 : };
5828 62 : uint64_t period;
5829 62 : int fd, rc;
5830 :
5831 62 : nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
5832 62 : if (nvme_ctrlr == NULL) {
5833 0 : SPDK_ERRLOG("Failed to allocate device struct\n");
5834 0 : return -ENOMEM;
5835 : }
5836 :
5837 62 : rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
5838 62 : if (rc != 0) {
5839 0 : free(nvme_ctrlr);
5840 0 : return rc;
5841 : }
5842 :
5843 62 : TAILQ_INIT(&nvme_ctrlr->trids);
5844 62 : TAILQ_INIT(&nvme_ctrlr->pending_resets);
5845 62 : RB_INIT(&nvme_ctrlr->namespaces);
5846 :
5847 : /* Get another reference to the key, so the first one can be released from probe_ctx */
5848 62 : if (ctx != NULL) {
5849 48 : if (ctx->drv_opts.tls_psk != NULL) {
5850 0 : nvme_ctrlr->psk = spdk_keyring_get_key(
5851 0 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5852 0 : if (nvme_ctrlr->psk == NULL) {
5853 : /* Could only happen if the key was removed in the meantime */
5854 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5855 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5856 0 : rc = -ENOKEY;
5857 0 : goto err;
5858 : }
5859 0 : }
5860 :
5861 48 : if (ctx->drv_opts.dhchap_key != NULL) {
5862 0 : nvme_ctrlr->dhchap_key = spdk_keyring_get_key(
5863 0 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5864 0 : if (nvme_ctrlr->dhchap_key == NULL) {
5865 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5866 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5867 0 : rc = -ENOKEY;
5868 0 : goto err;
5869 : }
5870 0 : }
5871 :
5872 48 : if (ctx->drv_opts.dhchap_ctrlr_key != NULL) {
5873 0 : nvme_ctrlr->dhchap_ctrlr_key =
5874 0 : spdk_keyring_get_key(
5875 0 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5876 0 : if (nvme_ctrlr->dhchap_ctrlr_key == NULL) {
5877 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5878 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5879 0 : rc = -ENOKEY;
5880 0 : goto err;
5881 : }
5882 0 : }
5883 48 : }
5884 :
5885 : /* Check if we manage to enable interrupts on the controller. */
5886 62 : if (spdk_interrupt_mode_is_enabled() && ctx != NULL && !ctx->drv_opts.enable_interrupts) {
5887 0 : SPDK_ERRLOG("Failed to enable interrupts on the controller\n");
5888 0 : rc = -ENOTSUP;
5889 0 : goto err;
5890 : }
5891 :
5892 62 : path_id = calloc(1, sizeof(*path_id));
5893 62 : if (path_id == NULL) {
5894 0 : SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
5895 0 : rc = -ENOMEM;
5896 0 : goto err;
5897 : }
5898 :
5899 62 : path_id->trid = *trid;
5900 62 : if (ctx != NULL) {
5901 48 : memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
5902 48 : memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
5903 48 : }
5904 62 : nvme_ctrlr->active_path_id = path_id;
5905 62 : TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
5906 :
5907 62 : nvme_ctrlr->thread = spdk_get_thread();
5908 62 : nvme_ctrlr->ctrlr = ctrlr;
5909 62 : nvme_ctrlr->ref = 1;
5910 :
5911 62 : if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
5912 0 : SPDK_ERRLOG("OCSSDs are not supported");
5913 0 : rc = -ENOTSUP;
5914 0 : goto err;
5915 : }
5916 :
5917 62 : if (ctx != NULL) {
5918 48 : memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts));
5919 48 : } else {
5920 14 : spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts);
5921 : }
5922 :
5923 62 : period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_adminq_poll_period_us;
5924 :
5925 62 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
5926 : period);
5927 :
5928 62 : if (spdk_interrupt_mode_is_enabled()) {
5929 0 : spdk_poller_register_interrupt(nvme_ctrlr->adminq_timer_poller, NULL, NULL);
5930 :
5931 0 : fd = spdk_nvme_ctrlr_get_admin_qp_fd(nvme_ctrlr->ctrlr, &opts);
5932 0 : if (fd < 0) {
5933 0 : rc = fd;
5934 0 : goto err;
5935 : }
5936 :
5937 0 : nvme_ctrlr->intr = SPDK_INTERRUPT_REGISTER_EXT(fd, bdev_nvme_poll_adminq,
5938 : nvme_ctrlr, &opts);
5939 0 : if (!nvme_ctrlr->intr) {
5940 0 : rc = -EINVAL;
5941 0 : goto err;
5942 : }
5943 0 : }
5944 :
5945 62 : if (g_opts.timeout_us > 0) {
5946 : /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
5947 : /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
5948 0 : uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
5949 0 : g_opts.timeout_us : g_opts.timeout_admin_us;
5950 0 : spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
5951 0 : adm_timeout_us, timeout_cb, nvme_ctrlr);
5952 0 : }
5953 :
5954 62 : spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
5955 62 : spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
5956 :
5957 62 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
5958 : SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
5959 0 : nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
5960 0 : }
5961 :
5962 62 : rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
5963 62 : if (rc != 0) {
5964 1 : goto err;
5965 : }
5966 :
5967 61 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5968 :
5969 61 : if (cdata->cmic.ana_reporting) {
5970 31 : rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
5971 31 : if (rc == 0) {
5972 31 : return 0;
5973 : }
5974 0 : } else {
5975 30 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
5976 30 : return 0;
5977 : }
5978 :
5979 : err:
5980 1 : nvme_ctrlr_delete(nvme_ctrlr);
5981 1 : return rc;
5982 62 : }
5983 :
5984 : void
5985 34 : spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts)
5986 : {
5987 34 : opts->prchk_flags = 0;
5988 34 : opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec;
5989 34 : opts->reconnect_delay_sec = g_opts.reconnect_delay_sec;
5990 34 : opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec;
5991 34 : opts->multipath = true;
5992 34 : }
5993 :
5994 : static void
5995 0 : attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
5996 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts)
5997 : {
5998 0 : char *name;
5999 :
6000 0 : name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
6001 0 : if (!name) {
6002 0 : SPDK_ERRLOG("Failed to assign name to NVMe device\n");
6003 0 : return;
6004 : }
6005 :
6006 0 : if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) {
6007 0 : SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
6008 0 : } else {
6009 0 : SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name);
6010 : }
6011 :
6012 0 : free(name);
6013 0 : }
6014 :
/*
 * Second half of controller deletion: depopulate the controller's namespaces
 * and then drop one reference on the nvme_ctrlr. Takes a void pointer so it
 * has the shape of a message/callback function; ctx is the nvme_ctrlr.
 */
static void
_nvme_ctrlr_destruct(void *ctx)
{
	struct nvme_ctrlr *doomed = ctx;

	nvme_ctrlr_depopulate_namespaces(doomed);
	nvme_ctrlr_put_ref(doomed);
}
6023 :
6024 : static int
6025 58 : bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
6026 : {
6027 58 : struct nvme_probe_skip_entry *entry;
6028 :
6029 : /* The controller's destruction was already started */
6030 58 : if (nvme_ctrlr->destruct) {
6031 0 : return -EALREADY;
6032 : }
6033 :
6034 116 : if (!hotplug &&
6035 58 : nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
6036 0 : entry = calloc(1, sizeof(*entry));
6037 0 : if (!entry) {
6038 0 : return -ENOMEM;
6039 : }
6040 0 : entry->trid = nvme_ctrlr->active_path_id->trid;
6041 0 : TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
6042 0 : }
6043 :
6044 58 : nvme_ctrlr->destruct = true;
6045 58 : return 0;
6046 58 : }
6047 :
6048 : static int
6049 2 : bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
6050 : {
6051 2 : int rc;
6052 :
6053 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6054 2 : rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug);
6055 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6056 :
6057 2 : if (rc == 0) {
6058 2 : _nvme_ctrlr_destruct(nvme_ctrlr);
6059 2 : } else if (rc == -EALREADY) {
6060 0 : rc = 0;
6061 0 : }
6062 :
6063 4 : return rc;
6064 2 : }
6065 :
/*
 * Removal callback registered through spdk_nvme_ctrlr_set_remove_cb().
 * cb_ctx is the nvme_ctrlr that was registered with it; the controller is
 * deleted with hotplug set to true. The result of the deletion is ignored.
 */
static void
remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *removed = cb_ctx;

	(void)bdev_nvme_delete_ctrlr(removed, true);
}
6073 :
6074 : static int
6075 0 : bdev_nvme_hotplug_probe(void *arg)
6076 : {
6077 0 : if (g_hotplug_probe_ctx == NULL) {
6078 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
6079 0 : return SPDK_POLLER_IDLE;
6080 : }
6081 :
6082 0 : if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
6083 0 : g_hotplug_probe_ctx = NULL;
6084 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
6085 0 : }
6086 :
6087 0 : return SPDK_POLLER_BUSY;
6088 0 : }
6089 :
6090 : static int
6091 0 : bdev_nvme_hotplug(void *arg)
6092 : {
6093 0 : struct spdk_nvme_transport_id trid_pcie;
6094 :
6095 0 : if (g_hotplug_probe_ctx) {
6096 0 : return SPDK_POLLER_BUSY;
6097 : }
6098 :
6099 0 : memset(&trid_pcie, 0, sizeof(trid_pcie));
6100 0 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
6101 :
6102 0 : g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
6103 : hotplug_probe_cb, attach_cb, NULL);
6104 :
6105 0 : if (g_hotplug_probe_ctx) {
6106 0 : assert(g_hotplug_probe_poller == NULL);
6107 0 : g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
6108 0 : }
6109 :
6110 0 : return SPDK_POLLER_BUSY;
6111 0 : }
6112 :
6113 : void
6114 0 : spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts, size_t opts_size)
6115 : {
6116 0 : if (!opts) {
6117 0 : SPDK_ERRLOG("opts should not be NULL\n");
6118 0 : return;
6119 : }
6120 :
6121 0 : if (!opts_size) {
6122 0 : SPDK_ERRLOG("opts_size should not be zero value\n");
6123 0 : return;
6124 : }
6125 :
6126 0 : opts->opts_size = opts_size;
6127 :
6128 : #define SET_FIELD(field, defval) \
6129 : opts->field = SPDK_GET_FIELD(&g_opts, field, defval, opts_size); \
6130 :
6131 0 : SET_FIELD(action_on_timeout, 0);
6132 0 : SET_FIELD(keep_alive_timeout_ms, 0);
6133 0 : SET_FIELD(timeout_us, 0);
6134 0 : SET_FIELD(timeout_admin_us, 0);
6135 0 : SET_FIELD(transport_retry_count, 0);
6136 0 : SET_FIELD(arbitration_burst, 0);
6137 0 : SET_FIELD(low_priority_weight, 0);
6138 0 : SET_FIELD(medium_priority_weight, 0);
6139 0 : SET_FIELD(high_priority_weight, 0);
6140 0 : SET_FIELD(io_queue_requests, 0);
6141 0 : SET_FIELD(nvme_adminq_poll_period_us, 0);
6142 0 : SET_FIELD(nvme_ioq_poll_period_us, 0);
6143 0 : SET_FIELD(delay_cmd_submit, 0);
6144 0 : SET_FIELD(bdev_retry_count, 0);
6145 0 : SET_FIELD(ctrlr_loss_timeout_sec, 0);
6146 0 : SET_FIELD(reconnect_delay_sec, 0);
6147 0 : SET_FIELD(fast_io_fail_timeout_sec, 0);
6148 0 : SET_FIELD(transport_ack_timeout, 0);
6149 0 : SET_FIELD(disable_auto_failback, false);
6150 0 : SET_FIELD(generate_uuids, false);
6151 0 : SET_FIELD(transport_tos, 0);
6152 0 : SET_FIELD(nvme_error_stat, false);
6153 0 : SET_FIELD(io_path_stat, false);
6154 0 : SET_FIELD(allow_accel_sequence, false);
6155 0 : SET_FIELD(rdma_srq_size, 0);
6156 0 : SET_FIELD(rdma_max_cq_size, 0);
6157 0 : SET_FIELD(rdma_cm_event_timeout_ms, 0);
6158 0 : SET_FIELD(dhchap_digests, 0);
6159 0 : SET_FIELD(dhchap_dhgroups, 0);
6160 :
6161 : #undef SET_FIELD
6162 :
6163 : /* Do not remove this statement, you should always update this statement when you adding a new field,
6164 : * and do not forget to add the SET_FIELD statement for your added field. */
6165 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 120, "Incorrect size");
6166 0 : }
6167 :
6168 : static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
6169 : uint32_t reconnect_delay_sec,
6170 : uint32_t fast_io_fail_timeout_sec);
6171 :
6172 : static int
6173 0 : bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
6174 : {
6175 0 : if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
6176 : /* Can't set timeout_admin_us without also setting timeout_us */
6177 0 : SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
6178 0 : return -EINVAL;
6179 : }
6180 :
6181 0 : if (opts->bdev_retry_count < -1) {
6182 0 : SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
6183 0 : return -EINVAL;
6184 : }
6185 :
6186 0 : if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec,
6187 0 : opts->reconnect_delay_sec,
6188 0 : opts->fast_io_fail_timeout_sec)) {
6189 0 : return -EINVAL;
6190 : }
6191 :
6192 0 : return 0;
6193 0 : }
6194 :
6195 : int
6196 0 : spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
6197 : {
6198 0 : if (!opts) {
6199 0 : SPDK_ERRLOG("opts cannot be NULL\n");
6200 0 : return -1;
6201 : }
6202 :
6203 0 : if (!opts->opts_size) {
6204 0 : SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
6205 0 : return -1;
6206 : }
6207 :
6208 0 : int ret;
6209 :
6210 0 : ret = bdev_nvme_validate_opts(opts);
6211 0 : if (ret) {
6212 0 : SPDK_WARNLOG("Failed to set nvme opts.\n");
6213 0 : return ret;
6214 : }
6215 :
6216 0 : if (g_bdev_nvme_init_thread != NULL) {
6217 0 : if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
6218 0 : return -EPERM;
6219 : }
6220 0 : }
6221 :
6222 0 : if (opts->rdma_srq_size != 0 ||
6223 0 : opts->rdma_max_cq_size != 0 ||
6224 0 : opts->rdma_cm_event_timeout_ms != 0) {
6225 0 : struct spdk_nvme_transport_opts drv_opts;
6226 :
6227 0 : spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts));
6228 0 : if (opts->rdma_srq_size != 0) {
6229 0 : drv_opts.rdma_srq_size = opts->rdma_srq_size;
6230 0 : }
6231 0 : if (opts->rdma_max_cq_size != 0) {
6232 0 : drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size;
6233 0 : }
6234 0 : if (opts->rdma_cm_event_timeout_ms != 0) {
6235 0 : drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms;
6236 0 : }
6237 :
6238 0 : ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts));
6239 0 : if (ret) {
6240 0 : SPDK_ERRLOG("Failed to set NVMe transport opts.\n");
6241 0 : return ret;
6242 : }
6243 0 : }
6244 :
6245 : #define SET_FIELD(field, defval) \
6246 : g_opts.field = SPDK_GET_FIELD(opts, field, defval, opts->opts_size); \
6247 :
6248 0 : SET_FIELD(action_on_timeout, 0);
6249 0 : SET_FIELD(keep_alive_timeout_ms, 0);
6250 0 : SET_FIELD(timeout_us, 0);
6251 0 : SET_FIELD(timeout_admin_us, 0);
6252 0 : SET_FIELD(transport_retry_count, 0);
6253 0 : SET_FIELD(arbitration_burst, 0);
6254 0 : SET_FIELD(low_priority_weight, 0);
6255 0 : SET_FIELD(medium_priority_weight, 0);
6256 0 : SET_FIELD(high_priority_weight, 0);
6257 0 : SET_FIELD(io_queue_requests, 0);
6258 0 : SET_FIELD(nvme_adminq_poll_period_us, 0);
6259 0 : SET_FIELD(nvme_ioq_poll_period_us, 0);
6260 0 : SET_FIELD(delay_cmd_submit, 0);
6261 0 : SET_FIELD(bdev_retry_count, 0);
6262 0 : SET_FIELD(ctrlr_loss_timeout_sec, 0);
6263 0 : SET_FIELD(reconnect_delay_sec, 0);
6264 0 : SET_FIELD(fast_io_fail_timeout_sec, 0);
6265 0 : SET_FIELD(transport_ack_timeout, 0);
6266 0 : SET_FIELD(disable_auto_failback, false);
6267 0 : SET_FIELD(generate_uuids, false);
6268 0 : SET_FIELD(transport_tos, 0);
6269 0 : SET_FIELD(nvme_error_stat, false);
6270 0 : SET_FIELD(io_path_stat, false);
6271 0 : SET_FIELD(allow_accel_sequence, false);
6272 0 : SET_FIELD(rdma_srq_size, 0);
6273 0 : SET_FIELD(rdma_max_cq_size, 0);
6274 0 : SET_FIELD(rdma_cm_event_timeout_ms, 0);
6275 0 : SET_FIELD(dhchap_digests, 0);
6276 0 : SET_FIELD(dhchap_dhgroups, 0);
6277 :
6278 0 : g_opts.opts_size = opts->opts_size;
6279 :
6280 : #undef SET_FIELD
6281 :
6282 0 : return 0;
6283 0 : }
6284 :
/* Arguments that bdev_nvme_set_hotplug() hands to set_nvme_hotplug_period_cb(), which frees the structure. */
6285 : struct set_nvme_hotplug_ctx {
6286 : uint64_t period_us; /* Period, in microseconds, used when registering the bdev_nvme_hotplug poller. */
6287 : bool enabled; /* true: register bdev_nvme_hotplug; false: register bdev_nvme_remove_poller. */
6288 : spdk_msg_fn fn; /* Optional callback invoked after the poller has been switched; may be NULL. */
6289 : void *fn_ctx; /* Argument passed to fn. */
6290 : };
6291 :
6292 : static void
6293 0 : set_nvme_hotplug_period_cb(void *_ctx)
6294 : {
6295 0 : struct set_nvme_hotplug_ctx *ctx = _ctx;
6296 :
6297 0 : spdk_poller_unregister(&g_hotplug_poller);
6298 0 : if (ctx->enabled) {
6299 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
6300 0 : } else {
6301 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
6302 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
6303 : }
6304 :
6305 0 : g_nvme_hotplug_poll_period_us = ctx->period_us;
6306 0 : g_nvme_hotplug_enabled = ctx->enabled;
6307 0 : if (ctx->fn) {
6308 0 : ctx->fn(ctx->fn_ctx);
6309 0 : }
6310 :
6311 0 : free(ctx);
6312 0 : }
6313 :
6314 : int
6315 0 : bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
6316 : {
6317 0 : struct set_nvme_hotplug_ctx *ctx;
6318 :
6319 0 : if (enabled == true && !spdk_process_is_primary()) {
6320 0 : return -EPERM;
6321 : }
6322 :
6323 0 : ctx = calloc(1, sizeof(*ctx));
6324 0 : if (ctx == NULL) {
6325 0 : return -ENOMEM;
6326 : }
6327 :
6328 0 : period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
6329 0 : ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
6330 0 : ctx->enabled = enabled;
6331 0 : ctx->fn = cb;
6332 0 : ctx->fn_ctx = cb_ctx;
6333 :
6334 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
6335 0 : return 0;
6336 0 : }
6337 :
6338 : static void
6339 47 : nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
6340 : struct nvme_async_probe_ctx *ctx)
6341 : {
6342 47 : struct nvme_ns *nvme_ns;
6343 47 : struct nvme_bdev *nvme_bdev;
6344 47 : size_t j;
6345 :
6346 47 : assert(nvme_ctrlr != NULL);
6347 :
6348 47 : if (ctx->names == NULL) {
6349 0 : ctx->reported_bdevs = 0;
6350 0 : populate_namespaces_cb(ctx, 0);
6351 0 : return;
6352 : }
6353 :
6354 : /*
6355 : * Report the new bdevs that were created in this call.
6356 : * There can be more than one bdev per NVMe controller.
6357 : */
6358 47 : j = 0;
6359 47 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6360 96 : while (nvme_ns != NULL) {
6361 49 : nvme_bdev = nvme_ns->bdev;
6362 49 : if (j < ctx->max_bdevs) {
6363 49 : ctx->names[j] = nvme_bdev->disk.name;
6364 49 : j++;
6365 49 : } else {
6366 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
6367 : "Maximum number of namespaces supported per NVMe controller is %du. "
6368 : "Unable to return all names of created bdevs\n",
6369 : ctx->max_bdevs);
6370 0 : ctx->reported_bdevs = 0;
6371 0 : populate_namespaces_cb(ctx, -ERANGE);
6372 0 : return;
6373 : }
6374 :
6375 49 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6376 : }
6377 :
6378 47 : ctx->reported_bdevs = j;
6379 47 : populate_namespaces_cb(ctx, 0);
6380 47 : }
6381 :
6382 : static int
6383 9 : bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6384 : struct spdk_nvme_ctrlr *new_ctrlr,
6385 : struct spdk_nvme_transport_id *trid)
6386 : {
6387 9 : struct nvme_path_id *tmp_trid;
6388 :
6389 9 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6390 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "PCIe failover is not supported.\n");
6391 0 : return -ENOTSUP;
6392 : }
6393 :
6394 : /* Currently we only support failover to the same transport type. */
6395 9 : if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
6396 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr,
6397 : "Failover from trtype: %s to a different trtype: %s is not supported currently\n",
6398 : spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
6399 : spdk_nvme_transport_id_trtype_str(trid->trtype));
6400 0 : return -EINVAL;
6401 : }
6402 :
6403 :
6404 : /* Currently we only support failover to the same NQN. */
6405 9 : if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
6406 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr,
6407 : "Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
6408 : nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
6409 0 : return -EINVAL;
6410 : }
6411 :
6412 : /* Skip all the other checks if we've already registered this path. */
6413 21 : TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
6414 12 : if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
6415 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "This path (traddr: %s subnqn: %s) is already registered\n",
6416 : trid->traddr, trid->subnqn);
6417 0 : return -EALREADY;
6418 : }
6419 12 : }
6420 :
6421 9 : return 0;
6422 9 : }
6423 :
6424 : static int
6425 9 : bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
6426 : struct spdk_nvme_ctrlr *new_ctrlr)
6427 : {
6428 9 : struct nvme_ns *nvme_ns;
6429 9 : struct spdk_nvme_ns *new_ns;
6430 :
6431 9 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6432 9 : while (nvme_ns != NULL) {
6433 0 : new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
6434 0 : assert(new_ns != NULL);
6435 :
6436 0 : if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
6437 0 : return -EINVAL;
6438 : }
6439 :
6440 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6441 : }
6442 :
6443 9 : return 0;
6444 9 : }
6445 :
6446 : static int
6447 9 : _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6448 : struct spdk_nvme_transport_id *trid)
6449 : {
6450 9 : struct nvme_path_id *active_id, *new_trid, *tmp_trid;
6451 :
6452 9 : new_trid = calloc(1, sizeof(*new_trid));
6453 9 : if (new_trid == NULL) {
6454 0 : return -ENOMEM;
6455 : }
6456 9 : new_trid->trid = *trid;
6457 :
6458 9 : active_id = nvme_ctrlr->active_path_id;
6459 9 : assert(active_id != NULL);
6460 9 : assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));
6461 :
6462 : /* Skip the active trid not to replace it until it is failed. */
6463 9 : tmp_trid = TAILQ_NEXT(active_id, link);
6464 9 : if (tmp_trid == NULL) {
6465 6 : goto add_tail;
6466 : }
6467 :
6468 : /* It means the trid is faled if its last failed time is non-zero.
6469 : * Insert the new alternate trid before any failed trid.
6470 : */
6471 5 : TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
6472 3 : if (tmp_trid->last_failed_tsc != 0) {
6473 1 : TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
6474 1 : return 0;
6475 : }
6476 4 : }
6477 :
6478 : add_tail:
6479 8 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
6480 8 : return 0;
6481 9 : }
6482 :
6483 : /* This is the case that a secondary path is added to an existing
6484 : * nvme_ctrlr for failover. After checking if it can access the same
6485 : * namespaces as the primary path, it is disconnected until failover occurs.
6486 : */
6487 : static int
6488 9 : bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6489 : struct spdk_nvme_ctrlr *new_ctrlr,
6490 : struct spdk_nvme_transport_id *trid)
6491 : {
6492 9 : int rc;
6493 :
6494 9 : assert(nvme_ctrlr != NULL);
6495 :
6496 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6497 :
6498 9 : rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid);
6499 9 : if (rc != 0) {
6500 0 : goto exit;
6501 : }
6502 :
6503 9 : rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr);
6504 9 : if (rc != 0) {
6505 0 : goto exit;
6506 : }
6507 :
6508 9 : rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
6509 :
6510 : exit:
6511 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6512 :
6513 9 : spdk_nvme_detach(new_ctrlr);
6514 :
6515 18 : return rc;
6516 9 : }
6517 :
6518 : static void
6519 48 : connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6520 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
6521 : {
6522 48 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6523 48 : struct nvme_async_probe_ctx *ctx;
6524 48 : int rc;
6525 :
6526 48 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6527 48 : ctx->ctrlr_attached = true;
6528 :
6529 48 : rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx);
6530 48 : if (rc != 0) {
6531 1 : ctx->reported_bdevs = 0;
6532 1 : populate_namespaces_cb(ctx, rc);
6533 1 : }
6534 48 : }
6535 :
6536 :
6537 : static void
6538 4 : connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6539 : struct spdk_nvme_ctrlr *ctrlr,
6540 : const struct spdk_nvme_ctrlr_opts *opts)
6541 : {
6542 4 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6543 4 : struct nvme_ctrlr *nvme_ctrlr;
6544 4 : struct nvme_async_probe_ctx *ctx;
6545 4 : int rc;
6546 :
6547 4 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6548 4 : ctx->ctrlr_attached = true;
6549 :
6550 4 : nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
6551 4 : if (nvme_ctrlr) {
6552 4 : rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
6553 4 : } else {
6554 0 : rc = -ENODEV;
6555 : }
6556 :
6557 4 : ctx->reported_bdevs = 0;
6558 4 : populate_namespaces_cb(ctx, rc);
6559 4 : }
6560 :
6561 : static int
6562 53 : bdev_nvme_async_poll(void *arg)
6563 : {
6564 53 : struct nvme_async_probe_ctx *ctx = arg;
6565 53 : int rc;
6566 :
6567 53 : rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
6568 53 : if (spdk_unlikely(rc != -EAGAIN)) {
6569 53 : ctx->probe_done = true;
6570 53 : spdk_poller_unregister(&ctx->poller);
6571 53 : if (!ctx->ctrlr_attached) {
6572 : /* The probe is done, but no controller was attached.
6573 : * That means we had a failure, so report -EIO back to
6574 : * the caller (usually the RPC). populate_namespaces_cb()
6575 : * will take care of freeing the nvme_async_probe_ctx.
6576 : */
6577 1 : ctx->reported_bdevs = 0;
6578 1 : populate_namespaces_cb(ctx, -EIO);
6579 53 : } else if (ctx->namespaces_populated) {
6580 : /* The namespaces for the attached controller were all
6581 : * populated and the response was already sent to the
6582 : * caller (usually the RPC). So free the context here.
6583 : */
6584 21 : free_nvme_async_probe_ctx(ctx);
6585 21 : }
6586 53 : }
6587 :
6588 53 : return SPDK_POLLER_BUSY;
6589 53 : }
6590 :
/*
 * Validate the combination of the three I/O error resiliency timeouts.
 *
 * Rules enforced (each violation is logged):
 *  - ctrlr_loss_timeout_sec must be >= -1.
 *  - If ctrlr_loss_timeout_sec is 0, the other two values must be 0 as well.
 *  - Otherwise reconnect_delay_sec must be non-zero, and a non-zero
 *    fast_io_fail_timeout_sec must be >= reconnect_delay_sec.
 *  - If ctrlr_loss_timeout_sec is positive, neither reconnect_delay_sec nor
 *    fast_io_fail_timeout_sec may exceed it. With -1 no such upper bound is
 *    applied.
 *
 * \return true if the combination is valid, false otherwise.
 */
static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	/* Only a positive ctrlr_loss_timeout_sec acts as an upper bound. */
	const bool bounded = ctrlr_loss_timeout_sec > 0;

	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	}

	if (ctrlr_loss_timeout_sec == 0) {
		if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
			SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
			return false;
		}
		return true;
	}

	/* From here on ctrlr_loss_timeout_sec is either -1 or positive. */
	if (reconnect_delay_sec == 0) {
		SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
		return false;
	}

	if (bounded && reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
		SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
		return false;
	}

	if (fast_io_fail_timeout_sec != 0) {
		if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
			/* Same wording for the -1 and the positive case; the -1 case
			 * used to misspell the option as "fast_io-fail_timeout_sec".
			 */
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}

		if (bounded && fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		}
	}

	return true;
}
6631 :
6632 : int
6633 53 : spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
6634 : const char *base_name,
6635 : const char **names,
6636 : uint32_t count,
6637 : spdk_bdev_nvme_create_cb cb_fn,
6638 : void *cb_ctx,
6639 : struct spdk_nvme_ctrlr_opts *drv_opts,
6640 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts)
6641 : {
6642 53 : struct nvme_probe_skip_entry *entry, *tmp;
6643 53 : struct nvme_async_probe_ctx *ctx;
6644 53 : spdk_nvme_attach_cb attach_cb;
6645 53 : struct nvme_ctrlr *nvme_ctrlr;
6646 53 : int len;
6647 :
6648 : /* TODO expand this check to include both the host and target TRIDs.
6649 : * Only if both are the same should we fail.
6650 : */
6651 53 : if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) {
6652 0 : SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) "
6653 : "already exists.\n", trid->traddr, drv_opts->hostnqn);
6654 0 : return -EEXIST;
6655 : }
6656 :
6657 53 : len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX);
6658 :
6659 53 : if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) {
6660 0 : SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1);
6661 0 : return -EINVAL;
6662 : }
6663 :
6664 106 : if (bdev_opts != NULL &&
6665 106 : !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec,
6666 53 : bdev_opts->reconnect_delay_sec,
6667 53 : bdev_opts->fast_io_fail_timeout_sec)) {
6668 0 : return -EINVAL;
6669 : }
6670 :
6671 53 : ctx = calloc(1, sizeof(*ctx));
6672 53 : if (!ctx) {
6673 0 : return -ENOMEM;
6674 : }
6675 53 : ctx->base_name = strdup(base_name);
6676 53 : if (!ctx->base_name) {
6677 0 : free(ctx);
6678 0 : return -ENOMEM;
6679 : }
6680 53 : ctx->names = names;
6681 53 : ctx->max_bdevs = count;
6682 53 : ctx->cb_fn = cb_fn;
6683 53 : ctx->cb_ctx = cb_ctx;
6684 53 : ctx->trid = *trid;
6685 :
6686 53 : if (bdev_opts) {
6687 53 : memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
6688 53 : } else {
6689 0 : spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts);
6690 : }
6691 :
6692 53 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6693 0 : TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
6694 0 : if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
6695 0 : TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
6696 0 : free(entry);
6697 0 : break;
6698 : }
6699 0 : }
6700 0 : }
6701 :
6702 53 : memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
6703 53 : ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count;
6704 53 : ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout;
6705 53 : ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
6706 53 : ctx->drv_opts.disable_read_ana_log_page = true;
6707 53 : ctx->drv_opts.transport_tos = g_opts.transport_tos;
6708 :
6709 53 : if (spdk_interrupt_mode_is_enabled()) {
6710 0 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6711 0 : ctx->drv_opts.enable_interrupts = true;
6712 0 : } else {
6713 0 : SPDK_ERRLOG("Interrupt mode is only supported with PCIe transport\n");
6714 0 : free_nvme_async_probe_ctx(ctx);
6715 0 : return -ENOTSUP;
6716 : }
6717 0 : }
6718 :
6719 53 : if (ctx->bdev_opts.psk != NULL) {
6720 0 : ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk);
6721 0 : if (ctx->drv_opts.tls_psk == NULL) {
6722 0 : SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk);
6723 0 : free_nvme_async_probe_ctx(ctx);
6724 0 : return -ENOKEY;
6725 : }
6726 0 : }
6727 :
6728 53 : if (ctx->bdev_opts.dhchap_key != NULL) {
6729 0 : ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key);
6730 0 : if (ctx->drv_opts.dhchap_key == NULL) {
6731 0 : SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n",
6732 : ctx->bdev_opts.dhchap_key);
6733 0 : free_nvme_async_probe_ctx(ctx);
6734 0 : return -ENOKEY;
6735 : }
6736 :
6737 0 : ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests;
6738 0 : ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups;
6739 0 : }
6740 53 : if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) {
6741 0 : ctx->drv_opts.dhchap_ctrlr_key =
6742 0 : spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key);
6743 0 : if (ctx->drv_opts.dhchap_ctrlr_key == NULL) {
6744 0 : SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n",
6745 : ctx->bdev_opts.dhchap_ctrlr_key);
6746 0 : free_nvme_async_probe_ctx(ctx);
6747 0 : return -ENOKEY;
6748 : }
6749 0 : }
6750 :
6751 53 : if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) {
6752 49 : attach_cb = connect_attach_cb;
6753 49 : } else {
6754 4 : attach_cb = connect_set_failover_cb;
6755 : }
6756 :
6757 53 : nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
6758 53 : if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) {
6759 : /* All controllers with the same name must be configured the same
6760 : * way, either for multipath or failover. If the configuration doesn't
6761 : * match - report error.
6762 : */
6763 0 : free_nvme_async_probe_ctx(ctx);
6764 0 : return -EINVAL;
6765 : }
6766 :
6767 53 : ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb);
6768 53 : if (ctx->probe_ctx == NULL) {
6769 0 : SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
6770 0 : free_nvme_async_probe_ctx(ctx);
6771 0 : return -ENODEV;
6772 : }
6773 53 : ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
6774 :
6775 53 : return 0;
6776 53 : }
6777 :
/* State kept while waiting for a bdev_nvme_delete() request to take effect.
 * Allocated by bdev_nvme_delete() and released by free_bdev_nvme_delete_ctx().
 */
struct bdev_nvme_delete_ctx {
	/* Heap copy of the controller name being deleted. */
	char *name;
	/* Copy of the path filter that was requested for deletion. */
	struct nvme_path_id path_id;
	/* Completion callback, invoked with delete_done_ctx and a status code. */
	bdev_nvme_delete_done_fn delete_done;
	void *delete_done_ctx;
	/* Tick count after which waiting is abandoned with -ETIMEDOUT. */
	uint64_t timeout_ticks;
	/* Poller running bdev_nvme_delete_complete_poll(). */
	struct spdk_poller *poller;
};
6786 :
6787 : static void
6788 2 : free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx)
6789 : {
6790 2 : if (ctx != NULL) {
6791 1 : free(ctx->name);
6792 1 : free(ctx);
6793 1 : }
6794 2 : }
6795 :
6796 : static bool
6797 76 : nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id)
6798 : {
6799 76 : if (path_id->trid.trtype != 0) {
6800 21 : if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
6801 0 : if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
6802 0 : return false;
6803 : }
6804 0 : } else {
6805 21 : if (path_id->trid.trtype != p->trid.trtype) {
6806 0 : return false;
6807 : }
6808 : }
6809 21 : }
6810 :
6811 76 : if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
6812 21 : if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
6813 11 : return false;
6814 : }
6815 10 : }
6816 :
6817 65 : if (path_id->trid.adrfam != 0) {
6818 0 : if (path_id->trid.adrfam != p->trid.adrfam) {
6819 0 : return false;
6820 : }
6821 0 : }
6822 :
6823 65 : if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
6824 10 : if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
6825 0 : return false;
6826 : }
6827 10 : }
6828 :
6829 65 : if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
6830 10 : if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
6831 0 : return false;
6832 : }
6833 10 : }
6834 :
6835 65 : if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
6836 0 : if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
6837 0 : return false;
6838 : }
6839 0 : }
6840 :
6841 65 : if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
6842 0 : if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
6843 0 : return false;
6844 : }
6845 0 : }
6846 :
6847 65 : return true;
6848 76 : }
6849 :
6850 : static bool
6851 2 : nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id)
6852 : {
6853 2 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
6854 2 : struct nvme_ctrlr *ctrlr;
6855 2 : struct nvme_path_id *p;
6856 :
6857 2 : pthread_mutex_lock(&g_bdev_nvme_mutex);
6858 2 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
6859 2 : if (!nbdev_ctrlr) {
6860 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6861 1 : return false;
6862 : }
6863 :
6864 1 : TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
6865 1 : pthread_mutex_lock(&ctrlr->mutex);
6866 1 : TAILQ_FOREACH(p, &ctrlr->trids, link) {
6867 1 : if (nvme_path_id_compare(p, path_id)) {
6868 1 : pthread_mutex_unlock(&ctrlr->mutex);
6869 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6870 1 : return true;
6871 : }
6872 0 : }
6873 0 : pthread_mutex_unlock(&ctrlr->mutex);
6874 0 : }
6875 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6876 :
6877 0 : return false;
6878 2 : }
6879 :
6880 : static int
6881 2 : bdev_nvme_delete_complete_poll(void *arg)
6882 : {
6883 2 : struct bdev_nvme_delete_ctx *ctx = arg;
6884 2 : int rc = 0;
6885 :
6886 2 : if (nvme_path_id_exists(ctx->name, &ctx->path_id)) {
6887 1 : if (ctx->timeout_ticks > spdk_get_ticks()) {
6888 1 : return SPDK_POLLER_BUSY;
6889 : }
6890 :
6891 0 : SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name);
6892 0 : rc = -ETIMEDOUT;
6893 0 : }
6894 :
6895 1 : spdk_poller_unregister(&ctx->poller);
6896 :
6897 1 : ctx->delete_done(ctx->delete_done_ctx, rc);
6898 1 : free_bdev_nvme_delete_ctx(ctx);
6899 :
6900 1 : return SPDK_POLLER_BUSY;
6901 2 : }
6902 :
/* Remove every path of nvme_ctrlr that matches path_id.
 *
 * Matching paths other than the first entry of the path list are simply
 * unlinked and freed. If the first entry (the active path) matches, the
 * controller is either deleted (no other path left) or failed over to the next
 * path, and the follow-up work is sent to the controller's thread.
 *
 * Returns 0 if at least one path was removed or a delete/failover was started
 * (or was already in progress), -ENXIO if nothing matched, or the error
 * returned by the delete/failover helper.
 */
static int
_bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id)
{
	struct nvme_path_id *p, *t;
	spdk_msg_fn msg_fn;
	int rc = -ENXIO;

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	/* Walk from the tail towards the head and stop on the head entry, so that
	 * after the loop p points at the first path (or is NULL for an empty list).
	 */
	TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
		if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) {
			break;
		}

		if (!nvme_path_id_compare(p, path_id)) {
			continue;
		}

		/* We are not using the specified path. */
		TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
		free(p);
		rc = 0;
	}

	/* The first path does not match: report what the loop above achieved. */
	if (p == NULL || !nvme_path_id_compare(p, path_id)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return rc;
	}

	/* If we made it here, then this path is a match! Now we need to remove it. */

	/* This is the active path in use right now. The active path is always the first in the list. */
	assert(p == nvme_ctrlr->active_path_id);

	if (!TAILQ_NEXT(p, link)) {
		/* The current path is the only path. */
		msg_fn = _nvme_ctrlr_destruct;
		rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false);
	} else {
		/* There is an alternative path. */
		msg_fn = _bdev_nvme_reset_ctrlr;
		rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true);
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (rc == 0) {
		/* The follow-up runs on the controller's own thread. */
		spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
	} else if (rc == -EALREADY) {
		/* The helper reported -EALREADY; treat it as success. */
		rc = 0;
	}

	return rc;
}
6957 :
6958 : int
6959 50 : bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
6960 : bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx)
6961 : {
6962 50 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
6963 50 : struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr;
6964 50 : struct bdev_nvme_delete_ctx *ctx = NULL;
6965 50 : int rc = -ENXIO, _rc;
6966 :
6967 50 : if (name == NULL || path_id == NULL) {
6968 0 : rc = -EINVAL;
6969 0 : goto exit;
6970 : }
6971 :
6972 50 : pthread_mutex_lock(&g_bdev_nvme_mutex);
6973 :
6974 50 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
6975 50 : if (nbdev_ctrlr == NULL) {
6976 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6977 :
6978 0 : SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
6979 0 : rc = -ENODEV;
6980 0 : goto exit;
6981 : }
6982 :
6983 115 : TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
6984 65 : _rc = _bdev_nvme_delete(nvme_ctrlr, path_id);
6985 65 : if (_rc < 0 && _rc != -ENXIO) {
6986 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6987 0 : rc = _rc;
6988 0 : goto exit;
6989 65 : } else if (_rc == 0) {
6990 : /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr
6991 : * was deleted successfully. To remember the successful deletion,
6992 : * overwrite rc only if _rc is zero.
6993 : */
6994 59 : rc = 0;
6995 59 : }
6996 65 : }
6997 :
6998 50 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6999 :
7000 50 : if (rc != 0 || delete_done == NULL) {
7001 49 : goto exit;
7002 : }
7003 :
7004 1 : ctx = calloc(1, sizeof(*ctx));
7005 1 : if (ctx == NULL) {
7006 0 : SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n");
7007 0 : rc = -ENOMEM;
7008 0 : goto exit;
7009 : }
7010 :
7011 1 : ctx->name = strdup(name);
7012 1 : if (ctx->name == NULL) {
7013 0 : SPDK_ERRLOG("Failed to copy controller name for deletion\n");
7014 0 : rc = -ENOMEM;
7015 0 : goto exit;
7016 : }
7017 :
7018 1 : ctx->delete_done = delete_done;
7019 1 : ctx->delete_done_ctx = delete_done_ctx;
7020 1 : ctx->path_id = *path_id;
7021 1 : ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz();
7022 1 : ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000);
7023 1 : if (ctx->poller == NULL) {
7024 0 : SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n");
7025 0 : rc = -ENOMEM;
7026 0 : goto exit;
7027 : }
7028 :
7029 : exit:
7030 50 : if (rc != 0) {
7031 1 : free_bdev_nvme_delete_ctx(ctx);
7032 1 : }
7033 :
7034 100 : return rc;
7035 50 : }
7036 :
/* Logging helpers that prefix each message with the traddr:trsvcid of the
 * discovery context's trid. ctx is any expression yielding a pointer with a
 * trid member; it is parenthesized so non-trivial expressions expand safely.
 * The trailing semicolon is kept to match the other log macros in this file.
 */
#define DISCOVERY_INFOLOG(ctx, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, (ctx)->trid.traddr, (ctx)->trid.trsvcid, ##__VA_ARGS__);

#define DISCOVERY_ERRLOG(ctx, format, ...) \
	SPDK_ERRLOG("Discovery[%s:%s] " format, (ctx)->trid.traddr, (ctx)->trid.trsvcid, ##__VA_ARGS__);
7042 :
/* One entry learned from (or used to reach) a discovery service. Entries live
 * on either the nvm_entry_ctxs or the discovery_entry_ctxs list of the owning
 * discovery_ctx.
 */
struct discovery_entry_ctx {
	/* Controller name used when attaching/deleting the NVM subsystem. */
	char name[128];
	/* Transport ID for this entry. */
	struct spdk_nvme_transport_id trid;
	/* Driver options used when connecting; hostnqn is taken from the owning ctx. */
	struct spdk_nvme_ctrlr_opts drv_opts;
	/* Raw copy of the discovery log page entry (set for NVM subsystem entries). */
	struct spdk_nvmf_discovery_log_page_entry entry;
	TAILQ_ENTRY(discovery_entry_ctx) tailq;
	/* Owning discovery context. */
	struct discovery_ctx *ctx;
};
7051 :
/* State of one running discovery service. Instances are linked on
 * g_discovery_ctxs and driven by discovery_poller().
 */
struct discovery_ctx {
	/* Base name; discovered subsystems are named "<name><index>". */
	char *name;
	/* Called once with cb_ctx when the start completes, then cleared. */
	spdk_bdev_nvme_start_discovery_fn start_cb_fn;
	/* Called with cb_ctx by the poller once the service has been stopped. */
	spdk_bdev_nvme_stop_discovery_fn stop_cb_fn;
	void *cb_ctx;
	/* Non-NULL while a connect to the discovery controller is in flight. */
	struct spdk_nvme_probe_ctx *probe_ctx;
	/* Non-NULL while the discovery controller is being detached. */
	struct spdk_nvme_detach_ctx *detach_ctx;
	/* Attached discovery controller, NULL while not connected. */
	struct spdk_nvme_ctrlr *ctrlr;
	/* Transport ID printed by the DISCOVERY_*LOG macros. */
	struct spdk_nvme_transport_id trid;
	/* Discovery entry currently used for the connection, if any. */
	struct discovery_entry_ctx *entry_ctx_in_use;
	struct spdk_poller *poller;
	/* Driver options passed to spdk_nvme_connect_async() for the discovery ctrlr. */
	struct spdk_nvme_ctrlr_opts drv_opts;
	/* bdev options passed to spdk_bdev_nvme_create() for discovered subsystems. */
	struct spdk_bdev_nvme_ctrlr_opts bdev_opts;
	/* Most recently received log page; freed by discovery_remove_controllers(). */
	struct spdk_nvmf_discovery_log_page *log_page;
	TAILQ_ENTRY(discovery_ctx) tailq;
	/* NVM subsystem entries for which an attach was started. */
	TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs;
	/* Discovery service endpoints available for (re)connecting. */
	TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs;
	/* Status reported to start_cb_fn. */
	int rc;
	bool wait_for_attach;
	/* Deadline for the start sequence in ticks; 0 disables the timeout check. */
	uint64_t timeout_ticks;
	/* Denotes that the discovery service is being started. We're waiting
	 * for the initial connection to the discovery controller to be
	 * established and attach discovered NVM ctrlrs.
	 */
	bool initializing;
	/* Denotes if a discovery is currently in progress for this context.
	 * That includes connecting to newly discovered subsystems. Used to
	 * ensure we do not start a new discovery until an existing one is
	 * complete.
	 */
	bool in_progress;

	/* Denotes if another discovery is needed after the one in progress
	 * completes. Set when we receive an AER completion while a discovery
	 * is already in progress.
	 */
	bool pending;

	/* Signal to the discovery context poller that it should stop the
	 * discovery service, including detaching from the current discovery
	 * controller.
	 */
	bool stop;

	struct spdk_thread *calling_thread;
	/* Next numeric suffix for a newly discovered subsystem's name. */
	uint32_t index;
	/* Number of spdk_bdev_nvme_create() attaches that have not completed yet. */
	uint32_t attach_in_progress;
	/* Host NQN copied into the driver options of each new entry. */
	char *hostnqn;

	/* Denotes if the discovery service was started by the mdns discovery.
	 */
	bool from_mdns_discovery_service;
};
7105 :
/* List of all discovery services known to this module. */
TAILQ_HEAD(discovery_ctxs, discovery_ctx);
static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs);

/* Forward declaration: used by discovery_complete() before its definition. */
static void get_discovery_log_page(struct discovery_ctx *ctx);
7110 :
7111 : static void
7112 0 : free_discovery_ctx(struct discovery_ctx *ctx)
7113 : {
7114 0 : free(ctx->log_page);
7115 0 : free(ctx->hostnqn);
7116 0 : free(ctx->name);
7117 0 : free(ctx);
7118 0 : }
7119 :
7120 : static void
7121 0 : discovery_complete(struct discovery_ctx *ctx)
7122 : {
7123 0 : ctx->initializing = false;
7124 0 : ctx->in_progress = false;
7125 0 : if (ctx->pending) {
7126 0 : ctx->pending = false;
7127 0 : get_discovery_log_page(ctx);
7128 0 : }
7129 0 : }
7130 :
7131 : static void
7132 0 : build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid,
7133 : struct spdk_nvmf_discovery_log_page_entry *entry)
7134 : {
7135 0 : char *space;
7136 :
7137 0 : trid->trtype = entry->trtype;
7138 0 : trid->adrfam = entry->adrfam;
7139 0 : memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr));
7140 0 : memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid));
7141 : /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and
7142 : * before call to this function trid->subnqn is zeroed out, we need
7143 : * to copy sizeof(trid->subnqn) minus one byte to make sure the last character
7144 : * remains 0. Then we can shorten the string (replace ' ' with 0) if required
7145 : */
7146 0 : memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1);
7147 :
7148 : /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated.
7149 : * But the log page entries typically pad them with spaces, not zeroes.
7150 : * So add a NULL terminator to each of these fields at the appropriate
7151 : * location.
7152 : */
7153 0 : space = strchr(trid->traddr, ' ');
7154 0 : if (space) {
7155 0 : *space = 0;
7156 0 : }
7157 0 : space = strchr(trid->trsvcid, ' ');
7158 0 : if (space) {
7159 0 : *space = 0;
7160 0 : }
7161 0 : space = strchr(trid->subnqn, ' ');
7162 0 : if (space) {
7163 0 : *space = 0;
7164 0 : }
7165 0 : }
7166 :
7167 : static void
7168 0 : _stop_discovery(void *_ctx)
7169 : {
7170 0 : struct discovery_ctx *ctx = _ctx;
7171 :
7172 0 : if (ctx->attach_in_progress > 0) {
7173 0 : spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx);
7174 0 : return;
7175 : }
7176 :
7177 0 : ctx->stop = true;
7178 :
7179 0 : while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) {
7180 0 : struct discovery_entry_ctx *entry_ctx;
7181 0 : struct nvme_path_id path = {};
7182 :
7183 0 : entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs);
7184 0 : path.trid = entry_ctx->trid;
7185 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
7186 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
7187 0 : free(entry_ctx);
7188 0 : }
7189 :
7190 0 : while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) {
7191 0 : struct discovery_entry_ctx *entry_ctx;
7192 :
7193 0 : entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
7194 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
7195 0 : free(entry_ctx);
7196 0 : }
7197 :
7198 0 : free(ctx->entry_ctx_in_use);
7199 0 : ctx->entry_ctx_in_use = NULL;
7200 0 : }
7201 :
7202 : static void
7203 0 : stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
7204 : {
7205 0 : ctx->stop_cb_fn = cb_fn;
7206 0 : ctx->cb_ctx = cb_ctx;
7207 :
7208 0 : if (ctx->attach_in_progress > 0) {
7209 0 : DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n",
7210 : ctx->attach_in_progress);
7211 0 : }
7212 :
7213 0 : _stop_discovery(ctx);
7214 0 : }
7215 :
7216 : static void
7217 2 : remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr)
7218 : {
7219 2 : struct discovery_ctx *d_ctx;
7220 2 : struct nvme_path_id *path_id;
7221 2 : struct spdk_nvme_transport_id trid = {};
7222 2 : struct discovery_entry_ctx *entry_ctx, *tmp;
7223 :
7224 2 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
7225 :
7226 2 : TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
7227 0 : TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) {
7228 0 : build_trid_from_log_page_entry(&trid, &entry_ctx->entry);
7229 0 : if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) {
7230 0 : continue;
7231 : }
7232 :
7233 0 : TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq);
7234 0 : free(entry_ctx);
7235 0 : DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n",
7236 : trid.subnqn, trid.traddr, trid.trsvcid);
7237 :
7238 : /* Fail discovery ctrlr to force reattach attempt */
7239 0 : spdk_nvme_ctrlr_fail(d_ctx->ctrlr);
7240 0 : }
7241 0 : }
7242 2 : }
7243 :
7244 : static void
7245 0 : discovery_remove_controllers(struct discovery_ctx *ctx)
7246 : {
7247 0 : struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page;
7248 0 : struct discovery_entry_ctx *entry_ctx, *tmp;
7249 0 : struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
7250 0 : struct spdk_nvme_transport_id old_trid = {};
7251 0 : uint64_t numrec, i;
7252 0 : bool found;
7253 :
7254 0 : numrec = from_le64(&log_page->numrec);
7255 0 : TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) {
7256 0 : found = false;
7257 0 : old_entry = &entry_ctx->entry;
7258 0 : build_trid_from_log_page_entry(&old_trid, old_entry);
7259 0 : for (i = 0; i < numrec; i++) {
7260 0 : new_entry = &log_page->entries[i];
7261 0 : if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) {
7262 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n",
7263 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
7264 0 : found = true;
7265 0 : break;
7266 : }
7267 0 : }
7268 0 : if (!found) {
7269 0 : struct nvme_path_id path = {};
7270 :
7271 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n",
7272 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
7273 :
7274 0 : path.trid = entry_ctx->trid;
7275 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
7276 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
7277 0 : free(entry_ctx);
7278 0 : }
7279 0 : }
7280 0 : free(log_page);
7281 0 : ctx->log_page = NULL;
7282 0 : discovery_complete(ctx);
7283 0 : }
7284 :
7285 : static void
7286 0 : complete_discovery_start(struct discovery_ctx *ctx, int status)
7287 : {
7288 0 : ctx->timeout_ticks = 0;
7289 0 : ctx->rc = status;
7290 0 : if (ctx->start_cb_fn) {
7291 0 : ctx->start_cb_fn(ctx->cb_ctx, status);
7292 0 : ctx->start_cb_fn = NULL;
7293 0 : ctx->cb_ctx = NULL;
7294 0 : }
7295 0 : }
7296 :
7297 : static void
7298 0 : discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc)
7299 : {
7300 0 : struct discovery_entry_ctx *entry_ctx = cb_ctx;
7301 0 : struct discovery_ctx *ctx = entry_ctx->ctx;
7302 :
7303 0 : DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name);
7304 0 : ctx->attach_in_progress--;
7305 0 : if (ctx->attach_in_progress == 0) {
7306 0 : complete_discovery_start(ctx, ctx->rc);
7307 0 : if (ctx->initializing && ctx->rc != 0) {
7308 0 : DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc);
7309 0 : stop_discovery(ctx, NULL, ctx->cb_ctx);
7310 0 : } else {
7311 0 : discovery_remove_controllers(ctx);
7312 : }
7313 0 : }
7314 0 : }
7315 :
7316 : static struct discovery_entry_ctx *
7317 0 : create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid)
7318 : {
7319 0 : struct discovery_entry_ctx *new_ctx;
7320 :
7321 0 : new_ctx = calloc(1, sizeof(*new_ctx));
7322 0 : if (new_ctx == NULL) {
7323 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7324 0 : return NULL;
7325 : }
7326 :
7327 0 : new_ctx->ctx = ctx;
7328 0 : memcpy(&new_ctx->trid, trid, sizeof(*trid));
7329 0 : spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
7330 0 : snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
7331 0 : return new_ctx;
7332 0 : }
7333 :
7334 : static void
7335 0 : discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl,
7336 : struct spdk_nvmf_discovery_log_page *log_page)
7337 : {
7338 0 : struct discovery_ctx *ctx = cb_arg;
7339 0 : struct discovery_entry_ctx *entry_ctx, *tmp;
7340 0 : struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
7341 0 : uint64_t numrec, i;
7342 0 : bool found;
7343 :
7344 0 : if (rc || spdk_nvme_cpl_is_error(cpl)) {
7345 0 : DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
7346 0 : return;
7347 : }
7348 :
7349 0 : ctx->log_page = log_page;
7350 0 : assert(ctx->attach_in_progress == 0);
7351 0 : numrec = from_le64(&log_page->numrec);
7352 0 : TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) {
7353 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
7354 0 : free(entry_ctx);
7355 0 : }
7356 0 : for (i = 0; i < numrec; i++) {
7357 0 : found = false;
7358 0 : new_entry = &log_page->entries[i];
7359 0 : if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT ||
7360 0 : new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
7361 0 : struct discovery_entry_ctx *new_ctx;
7362 0 : struct spdk_nvme_transport_id trid = {};
7363 :
7364 0 : build_trid_from_log_page_entry(&trid, new_entry);
7365 0 : new_ctx = create_discovery_entry_ctx(ctx, &trid);
7366 0 : if (new_ctx == NULL) {
7367 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7368 0 : break;
7369 : }
7370 :
7371 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq);
7372 0 : continue;
7373 0 : }
7374 0 : TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) {
7375 0 : old_entry = &entry_ctx->entry;
7376 0 : if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) {
7377 0 : found = true;
7378 0 : break;
7379 : }
7380 0 : }
7381 0 : if (!found) {
7382 0 : struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx;
7383 0 : struct discovery_ctx *d_ctx;
7384 :
7385 0 : TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
7386 0 : TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) {
7387 0 : if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn,
7388 : sizeof(new_entry->subnqn))) {
7389 0 : break;
7390 : }
7391 0 : }
7392 0 : if (subnqn_ctx) {
7393 0 : break;
7394 : }
7395 0 : }
7396 :
7397 0 : new_ctx = calloc(1, sizeof(*new_ctx));
7398 0 : if (new_ctx == NULL) {
7399 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7400 0 : break;
7401 : }
7402 :
7403 0 : new_ctx->ctx = ctx;
7404 0 : memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry));
7405 0 : build_trid_from_log_page_entry(&new_ctx->trid, new_entry);
7406 0 : if (subnqn_ctx) {
7407 0 : snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name);
7408 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n",
7409 : new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
7410 : new_ctx->name);
7411 0 : } else {
7412 0 : snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++);
7413 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n",
7414 : new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
7415 : new_ctx->name);
7416 : }
7417 0 : spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
7418 0 : snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
7419 0 : rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0,
7420 0 : discovery_attach_controller_done, new_ctx,
7421 0 : &new_ctx->drv_opts, &ctx->bdev_opts);
7422 0 : if (rc == 0) {
7423 0 : TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq);
7424 0 : ctx->attach_in_progress++;
7425 0 : } else {
7426 0 : DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", spdk_strerror(-rc));
7427 : }
7428 0 : }
7429 0 : }
7430 :
7431 0 : if (ctx->attach_in_progress == 0) {
7432 0 : discovery_remove_controllers(ctx);
7433 0 : }
7434 0 : }
7435 :
7436 : static void
7437 0 : get_discovery_log_page(struct discovery_ctx *ctx)
7438 : {
7439 0 : int rc;
7440 :
7441 0 : assert(ctx->in_progress == false);
7442 0 : ctx->in_progress = true;
7443 0 : rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx);
7444 0 : if (rc != 0) {
7445 0 : DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
7446 0 : }
7447 0 : DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n");
7448 0 : }
7449 :
7450 : static void
7451 0 : discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
7452 : {
7453 0 : struct discovery_ctx *ctx = arg;
7454 0 : uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
7455 :
7456 0 : if (spdk_nvme_cpl_is_error(cpl)) {
7457 0 : DISCOVERY_ERRLOG(ctx, "aer failed\n");
7458 0 : return;
7459 : }
7460 :
7461 0 : if (log_page_id != SPDK_NVME_LOG_DISCOVERY) {
7462 0 : DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id);
7463 0 : return;
7464 : }
7465 :
7466 0 : DISCOVERY_INFOLOG(ctx, "got aer\n");
7467 0 : if (ctx->in_progress) {
7468 0 : ctx->pending = true;
7469 0 : return;
7470 : }
7471 :
7472 0 : get_discovery_log_page(ctx);
7473 0 : }
7474 :
7475 : static void
7476 0 : discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
7477 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
7478 : {
7479 0 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
7480 0 : struct discovery_ctx *ctx;
7481 :
7482 0 : ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts);
7483 :
7484 0 : DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n");
7485 0 : ctx->probe_ctx = NULL;
7486 0 : ctx->ctrlr = ctrlr;
7487 :
7488 0 : if (ctx->rc != 0) {
7489 0 : DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n",
7490 : ctx->rc);
7491 0 : return;
7492 : }
7493 :
7494 0 : spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx);
7495 0 : }
7496 :
/* Poller that drives one discovery service; arg is its discovery_ctx.
 *
 * On each invocation exactly one of the following states is handled, checked
 * in this order:
 *  1. a detach of the discovery controller is in flight - poll it;
 *  2. a stop was requested - detach the controller if attached, then
 *     unregister, notify stop_cb_fn and free the context;
 *  3. not connected and no connect in flight - start connecting to the next
 *     known discovery endpoint (or give up on start timeout);
 *  4. a connect is in flight - poll it and request the log page when done;
 *  5. connected - process admin completions and, on failure, detach so that a
 *     later invocation reconnects.
 * Always returns SPDK_POLLER_BUSY.
 */
static int
discovery_poller(void *arg)
{
	struct discovery_ctx *ctx = arg;
	struct spdk_nvme_transport_id *trid;
	int rc;

	if (ctx->detach_ctx) {
		rc = spdk_nvme_detach_poll_async(ctx->detach_ctx);
		if (rc != -EAGAIN) {
			/* Detach has finished; the controller is gone. */
			ctx->detach_ctx = NULL;
			ctx->ctrlr = NULL;
		}
	} else if (ctx->stop) {
		if (ctx->ctrlr != NULL) {
			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc == 0) {
				/* The detach is polled by the first branch on later calls. */
				return SPDK_POLLER_BUSY;
			}
			DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
		}
		spdk_poller_unregister(&ctx->poller);
		TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
		assert(ctx->start_cb_fn == NULL);
		if (ctx->stop_cb_fn != NULL) {
			ctx->stop_cb_fn(ctx->cb_ctx);
		}
		free_discovery_ctx(ctx);
	} else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			assert(ctx->initializing);
			spdk_poller_unregister(&ctx->poller);
			TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
			complete_discovery_start(ctx, -ETIMEDOUT);
			stop_discovery(ctx, NULL, NULL);
			free_discovery_ctx(ctx);
			return SPDK_POLLER_BUSY;
		}

		/* Take the next discovery endpoint off the list and try to connect to it. */
		assert(ctx->entry_ctx_in_use == NULL);
		ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
		TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
		trid = &ctx->entry_ctx_in_use->trid;

		/* All controllers must be configured explicitly either for multipath or failover.
		 * While discovery use multipath mode, we need to set this in bdev options as well.
		 */
		ctx->bdev_opts.multipath = true;

		ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb);
		if (ctx->probe_ctx) {
			/* Re-register with a 1000 us period while the connect is in flight. */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000);
		} else {
			DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n");
			/* Put the endpoint back at the tail so it can be retried later. */
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;
		}
	} else if (ctx->probe_ctx) {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			complete_discovery_start(ctx, -ETIMEDOUT);
			return SPDK_POLLER_BUSY;
		}

		rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
		if (rc != -EAGAIN) {
			if (ctx->rc != 0) {
				assert(ctx->initializing);
				stop_discovery(ctx, NULL, ctx->cb_ctx);
			} else {
				assert(rc == 0);
				DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n");
				ctx->rc = rc;
				get_discovery_log_page(ctx);
			}
		}
	} else {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n");
			complete_discovery_start(ctx, -ETIMEDOUT);
			/* We need to wait until all NVM ctrlrs are attached before we stop the
			 * discovery service to make sure we don't detach a ctrlr that is still
			 * being attached.
			 */
			if (ctx->attach_in_progress == 0) {
				stop_discovery(ctx, NULL, ctx->cb_ctx);
				return SPDK_POLLER_BUSY;
			}
		}

		rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr);
		if (rc < 0) {
			/* Processing failed: switch to the 1000 * 1000 us period, return
			 * the endpoint to the list and detach the controller.
			 */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;

			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc != 0) {
				DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
				ctx->ctrlr = NULL;
			}
		}
	}

	return SPDK_POLLER_BUSY;
}
7606 :
7607 : static void
7608 0 : start_discovery_poller(void *arg)
7609 : {
7610 0 : struct discovery_ctx *ctx = arg;
7611 :
7612 0 : TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq);
7613 0 : ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
7614 0 : }
7615 :
7616 : int
7617 0 : bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid,
7618 : const char *base_name,
7619 : struct spdk_nvme_ctrlr_opts *drv_opts,
7620 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts,
7621 : uint64_t attach_timeout,
7622 : bool from_mdns,
7623 : spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx)
7624 : {
7625 0 : struct discovery_ctx *ctx;
7626 0 : struct discovery_entry_ctx *discovery_entry_ctx;
7627 :
7628 0 : snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
7629 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7630 0 : if (strcmp(ctx->name, base_name) == 0) {
7631 0 : return -EEXIST;
7632 : }
7633 :
7634 0 : if (ctx->entry_ctx_in_use != NULL) {
7635 0 : if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) {
7636 0 : return -EEXIST;
7637 : }
7638 0 : }
7639 :
7640 0 : TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
7641 0 : if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) {
7642 0 : return -EEXIST;
7643 : }
7644 0 : }
7645 0 : }
7646 :
7647 0 : ctx = calloc(1, sizeof(*ctx));
7648 0 : if (ctx == NULL) {
7649 0 : return -ENOMEM;
7650 : }
7651 :
7652 0 : ctx->name = strdup(base_name);
7653 0 : if (ctx->name == NULL) {
7654 0 : free_discovery_ctx(ctx);
7655 0 : return -ENOMEM;
7656 : }
7657 0 : memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
7658 0 : memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
7659 0 : ctx->from_mdns_discovery_service = from_mdns;
7660 0 : ctx->bdev_opts.from_discovery_service = true;
7661 0 : ctx->calling_thread = spdk_get_thread();
7662 0 : ctx->start_cb_fn = cb_fn;
7663 0 : ctx->cb_ctx = cb_ctx;
7664 0 : ctx->initializing = true;
7665 0 : if (ctx->start_cb_fn) {
7666 : /* We can use this when dumping json to denote if this RPC parameter
7667 : * was specified or not.
7668 : */
7669 0 : ctx->wait_for_attach = true;
7670 0 : }
7671 0 : if (attach_timeout != 0) {
7672 0 : ctx->timeout_ticks = spdk_get_ticks() + attach_timeout *
7673 0 : spdk_get_ticks_hz() / 1000ull;
7674 0 : }
7675 0 : TAILQ_INIT(&ctx->nvm_entry_ctxs);
7676 0 : TAILQ_INIT(&ctx->discovery_entry_ctxs);
7677 0 : memcpy(&ctx->trid, trid, sizeof(*trid));
7678 : /* Even if user did not specify hostnqn, we can still strdup("\0"); */
7679 0 : ctx->hostnqn = strdup(ctx->drv_opts.hostnqn);
7680 0 : if (ctx->hostnqn == NULL) {
7681 0 : free_discovery_ctx(ctx);
7682 0 : return -ENOMEM;
7683 : }
7684 0 : discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid);
7685 0 : if (discovery_entry_ctx == NULL) {
7686 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7687 0 : free_discovery_ctx(ctx);
7688 0 : return -ENOMEM;
7689 : }
7690 :
7691 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq);
7692 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx);
7693 0 : return 0;
7694 0 : }
7695 :
7696 : int
7697 0 : bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
7698 : {
7699 0 : struct discovery_ctx *ctx;
7700 :
7701 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7702 0 : if (strcmp(name, ctx->name) == 0) {
7703 0 : if (ctx->stop) {
7704 0 : return -EALREADY;
7705 : }
7706 : /* If we're still starting the discovery service and ->rc is non-zero, we're
7707 : * going to stop it as soon as we can
7708 : */
7709 0 : if (ctx->initializing && ctx->rc != 0) {
7710 0 : return -EALREADY;
7711 : }
7712 0 : stop_discovery(ctx, cb_fn, cb_ctx);
7713 0 : return 0;
7714 : }
7715 0 : }
7716 :
7717 0 : return -ENOENT;
7718 0 : }
7719 :
7720 : static int
7721 1 : bdev_nvme_library_init(void)
7722 : {
7723 1 : g_bdev_nvme_init_thread = spdk_get_thread();
7724 :
7725 1 : spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
7726 : bdev_nvme_destroy_poll_group_cb,
7727 : sizeof(struct nvme_poll_group), "nvme_poll_groups");
7728 :
7729 1 : return 0;
7730 : }
7731 :
7732 : static void
7733 1 : bdev_nvme_fini_destruct_ctrlrs(void)
7734 : {
7735 1 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
7736 1 : struct nvme_ctrlr *nvme_ctrlr;
7737 :
7738 1 : pthread_mutex_lock(&g_bdev_nvme_mutex);
7739 1 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
7740 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
7741 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
7742 0 : if (nvme_ctrlr->destruct) {
7743 : /* This controller's destruction was already started
7744 : * before the application started shutting down
7745 : */
7746 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7747 0 : continue;
7748 : }
7749 0 : nvme_ctrlr->destruct = true;
7750 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7751 :
7752 0 : spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
7753 0 : nvme_ctrlr);
7754 0 : }
7755 0 : }
7756 :
7757 1 : g_bdev_nvme_module_finish = true;
7758 1 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
7759 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7760 1 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
7761 1 : spdk_bdev_module_fini_done();
7762 1 : return;
7763 : }
7764 :
7765 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7766 1 : }
7767 :
7768 : static void
7769 0 : check_discovery_fini(void *arg)
7770 : {
7771 0 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7772 0 : bdev_nvme_fini_destruct_ctrlrs();
7773 0 : }
7774 0 : }
7775 :
7776 : static void
7777 1 : bdev_nvme_library_fini(void)
7778 : {
7779 1 : struct nvme_probe_skip_entry *entry, *entry_tmp;
7780 1 : struct discovery_ctx *ctx;
7781 :
7782 1 : spdk_poller_unregister(&g_hotplug_poller);
7783 1 : free(g_hotplug_probe_ctx);
7784 1 : g_hotplug_probe_ctx = NULL;
7785 :
7786 1 : TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
7787 0 : TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
7788 0 : free(entry);
7789 0 : }
7790 :
7791 1 : assert(spdk_get_thread() == g_bdev_nvme_init_thread);
7792 1 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7793 1 : bdev_nvme_fini_destruct_ctrlrs();
7794 1 : } else {
7795 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7796 0 : stop_discovery(ctx, check_discovery_fini, NULL);
7797 0 : }
7798 : }
7799 1 : }
7800 :
7801 : static void
7802 0 : bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
7803 : {
7804 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7805 0 : struct spdk_bdev *bdev = bdev_io->bdev;
7806 0 : struct spdk_dif_ctx dif_ctx;
7807 0 : struct spdk_dif_error err_blk = {};
7808 0 : int rc;
7809 0 : struct spdk_dif_ctx_init_ext_opts dif_opts;
7810 :
7811 0 : dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
7812 0 : dif_opts.dif_pi_format = bdev->dif_pi_format;
7813 0 : rc = spdk_dif_ctx_init(&dif_ctx,
7814 0 : bdev->blocklen, bdev->md_len, bdev->md_interleave,
7815 0 : bdev->dif_is_head_of_md, bdev->dif_type,
7816 0 : bdev_io->u.bdev.dif_check_flags,
7817 0 : bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts);
7818 0 : if (rc != 0) {
7819 0 : SPDK_ERRLOG("Initialization of DIF context failed\n");
7820 0 : return;
7821 : }
7822 :
7823 0 : if (bdev->md_interleave) {
7824 0 : rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7825 0 : bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7826 0 : } else {
7827 0 : struct iovec md_iov = {
7828 0 : .iov_base = bdev_io->u.bdev.md_buf,
7829 0 : .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
7830 : };
7831 :
7832 0 : rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7833 0 : &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7834 0 : }
7835 :
7836 0 : if (rc != 0) {
7837 0 : SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
7838 : err_blk.err_type, err_blk.err_offset);
7839 0 : } else {
7840 0 : SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
7841 : }
7842 0 : }
7843 :
7844 : static void
7845 0 : bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7846 : {
7847 0 : struct nvme_bdev_io *bio = ref;
7848 :
7849 0 : if (spdk_nvme_cpl_is_success(cpl)) {
7850 : /* Run PI verification for read data buffer. */
7851 0 : bdev_nvme_verify_pi_error(bio);
7852 0 : }
7853 :
7854 : /* Return original completion status */
7855 0 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7856 0 : }
7857 :
7858 : static void
7859 3 : bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7860 : {
7861 3 : struct nvme_bdev_io *bio = ref;
7862 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7863 3 : int ret;
7864 :
7865 3 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7866 0 : SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
7867 : cpl->status.sct, cpl->status.sc);
7868 :
7869 : /* Save completion status to use after verifying PI error. */
7870 0 : bio->cpl = *cpl;
7871 :
7872 0 : if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
7873 : /* Read without PI checking to verify PI error. */
7874 0 : ret = bdev_nvme_no_pi_readv(bio,
7875 0 : bdev_io->u.bdev.iovs,
7876 0 : bdev_io->u.bdev.iovcnt,
7877 0 : bdev_io->u.bdev.md_buf,
7878 0 : bdev_io->u.bdev.num_blocks,
7879 0 : bdev_io->u.bdev.offset_blocks);
7880 0 : if (ret == 0) {
7881 0 : return;
7882 : }
7883 0 : }
7884 0 : }
7885 :
7886 3 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7887 3 : }
7888 :
7889 : static void
7890 25 : bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7891 : {
7892 25 : struct nvme_bdev_io *bio = ref;
7893 :
7894 25 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7895 0 : SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
7896 : cpl->status.sct, cpl->status.sc);
7897 : /* Run PI verification for write data buffer if PI error is detected. */
7898 0 : bdev_nvme_verify_pi_error(bio);
7899 0 : }
7900 :
7901 25 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7902 25 : }
7903 :
7904 : static void
7905 0 : bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7906 : {
7907 0 : struct nvme_bdev_io *bio = ref;
7908 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7909 :
7910 : /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
7911 : * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
7912 : */
7913 0 : bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
7914 :
7915 0 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7916 0 : SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
7917 : cpl->status.sct, cpl->status.sc);
7918 : /* Run PI verification for zone append data buffer if PI error is detected. */
7919 0 : bdev_nvme_verify_pi_error(bio);
7920 0 : }
7921 :
7922 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7923 0 : }
7924 :
7925 : static void
7926 1 : bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7927 : {
7928 1 : struct nvme_bdev_io *bio = ref;
7929 :
7930 1 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7931 0 : SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
7932 : cpl->status.sct, cpl->status.sc);
7933 : /* Run PI verification for compare data buffer if PI error is detected. */
7934 0 : bdev_nvme_verify_pi_error(bio);
7935 0 : }
7936 :
7937 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7938 1 : }
7939 :
7940 : static void
7941 4 : bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7942 : {
7943 4 : struct nvme_bdev_io *bio = ref;
7944 :
7945 : /* Compare operation completion */
7946 4 : if (!bio->first_fused_completed) {
7947 : /* Save compare result for write callback */
7948 2 : bio->cpl = *cpl;
7949 2 : bio->first_fused_completed = true;
7950 2 : return;
7951 : }
7952 :
7953 : /* Write operation completion */
7954 2 : if (spdk_nvme_cpl_is_error(&bio->cpl)) {
7955 : /* If bio->cpl is already an error, it means the compare operation failed. In that case,
7956 : * complete the IO with the compare operation's status.
7957 : */
7958 1 : if (!spdk_nvme_cpl_is_error(cpl)) {
7959 1 : SPDK_ERRLOG("Unexpected write success after compare failure.\n");
7960 1 : }
7961 :
7962 1 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7963 1 : } else {
7964 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7965 : }
7966 4 : }
7967 :
/* Generic completion callback: completes the I/O with the NVMe status as is.
 *
 * \param ref The struct nvme_bdev_io that issued the command.
 * \param cpl NVMe completion of the command.
 */
static void
bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_io *io = ref;

	bdev_nvme_io_complete_nvme_status(io, cpl);
}
7975 :
7976 : static int
7977 0 : fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
7978 : {
7979 0 : switch (desc->zt) {
7980 : case SPDK_NVME_ZONE_TYPE_SEQWR:
7981 0 : info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
7982 0 : break;
7983 : default:
7984 0 : SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt);
7985 0 : return -EIO;
7986 : }
7987 :
7988 0 : switch (desc->zs) {
7989 : case SPDK_NVME_ZONE_STATE_EMPTY:
7990 0 : info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
7991 0 : break;
7992 : case SPDK_NVME_ZONE_STATE_IOPEN:
7993 0 : info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
7994 0 : break;
7995 : case SPDK_NVME_ZONE_STATE_EOPEN:
7996 0 : info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
7997 0 : break;
7998 : case SPDK_NVME_ZONE_STATE_CLOSED:
7999 0 : info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
8000 0 : break;
8001 : case SPDK_NVME_ZONE_STATE_RONLY:
8002 0 : info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
8003 0 : break;
8004 : case SPDK_NVME_ZONE_STATE_FULL:
8005 0 : info->state = SPDK_BDEV_ZONE_STATE_FULL;
8006 0 : break;
8007 : case SPDK_NVME_ZONE_STATE_OFFLINE:
8008 0 : info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
8009 0 : break;
8010 : default:
8011 0 : SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
8012 0 : return -EIO;
8013 : }
8014 :
8015 0 : info->zone_id = desc->zslba;
8016 0 : info->write_pointer = desc->wp;
8017 0 : info->capacity = desc->zcap;
8018 :
8019 0 : return 0;
8020 0 : }
8021 :
8022 : static void
8023 0 : bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
8024 : {
8025 0 : struct nvme_bdev_io *bio = ref;
8026 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8027 0 : uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
8028 0 : uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
8029 0 : struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
8030 0 : uint64_t max_zones_per_buf, i;
8031 0 : uint32_t zone_report_bufsize;
8032 0 : struct spdk_nvme_ns *ns;
8033 0 : struct spdk_nvme_qpair *qpair;
8034 0 : int ret;
8035 :
8036 0 : if (spdk_nvme_cpl_is_error(cpl)) {
8037 0 : goto out_complete_io_nvme_cpl;
8038 : }
8039 :
8040 0 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
8041 0 : ret = -ENXIO;
8042 0 : goto out_complete_io_ret;
8043 : }
8044 :
8045 0 : ns = bio->io_path->nvme_ns->ns;
8046 0 : qpair = bio->io_path->qpair->qpair;
8047 :
8048 0 : zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
8049 0 : max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
8050 : sizeof(bio->zone_report_buf->descs[0]);
8051 :
8052 0 : if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
8053 0 : ret = -EINVAL;
8054 0 : goto out_complete_io_ret;
8055 : }
8056 :
8057 0 : if (!bio->zone_report_buf->nr_zones) {
8058 0 : ret = -EINVAL;
8059 0 : goto out_complete_io_ret;
8060 : }
8061 :
8062 0 : for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
8063 0 : ret = fill_zone_from_report(&info[bio->handled_zones],
8064 0 : &bio->zone_report_buf->descs[i]);
8065 0 : if (ret) {
8066 0 : goto out_complete_io_ret;
8067 : }
8068 0 : bio->handled_zones++;
8069 0 : }
8070 :
8071 0 : if (bio->handled_zones < zones_to_copy) {
8072 0 : uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
8073 0 : uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
8074 :
8075 0 : memset(bio->zone_report_buf, 0, zone_report_bufsize);
8076 0 : ret = spdk_nvme_zns_report_zones(ns, qpair,
8077 0 : bio->zone_report_buf, zone_report_bufsize,
8078 0 : slba, SPDK_NVME_ZRA_LIST_ALL, true,
8079 0 : bdev_nvme_get_zone_info_done, bio);
8080 0 : if (!ret) {
8081 0 : return;
8082 : } else {
8083 0 : goto out_complete_io_ret;
8084 : }
8085 0 : }
8086 :
8087 : out_complete_io_nvme_cpl:
8088 0 : free(bio->zone_report_buf);
8089 0 : bio->zone_report_buf = NULL;
8090 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
8091 0 : return;
8092 :
8093 : out_complete_io_ret:
8094 0 : free(bio->zone_report_buf);
8095 0 : bio->zone_report_buf = NULL;
8096 0 : bdev_nvme_io_complete(bio, ret);
8097 0 : }
8098 :
/* Completion callback for zone management commands: completes the I/O with the
 * NVMe status as is.
 *
 * \param ref The struct nvme_bdev_io that issued the command.
 * \param cpl NVMe completion of the command.
 */
static void
bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_io *io = ref;

	bdev_nvme_io_complete_nvme_status(io, cpl);
}
8106 :
8107 : static void
8108 4 : bdev_nvme_admin_passthru_complete_nvme_status(void *ctx)
8109 : {
8110 4 : struct nvme_bdev_io *bio = ctx;
8111 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8112 4 : const struct spdk_nvme_cpl *cpl = &bio->cpl;
8113 :
8114 4 : assert(bdev_nvme_io_type_is_admin(bdev_io->type));
8115 :
8116 4 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
8117 4 : }
8118 :
8119 : static void
8120 3 : bdev_nvme_abort_complete(void *ctx)
8121 : {
8122 3 : struct nvme_bdev_io *bio = ctx;
8123 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8124 :
8125 3 : if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
8126 3 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL);
8127 3 : } else {
8128 0 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL);
8129 : }
8130 3 : }
8131 :
8132 : static void
8133 3 : bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
8134 : {
8135 3 : struct nvme_bdev_io *bio = ref;
8136 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8137 :
8138 3 : bio->cpl = *cpl;
8139 3 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio);
8140 3 : }
8141 :
8142 : static void
8143 4 : bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
8144 : {
8145 4 : struct nvme_bdev_io *bio = ref;
8146 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8147 :
8148 4 : bio->cpl = *cpl;
8149 8 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
8150 4 : bdev_nvme_admin_passthru_complete_nvme_status, bio);
8151 4 : }
8152 :
8153 : static void
8154 0 : bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
8155 : {
8156 0 : struct nvme_bdev_io *bio = ref;
8157 0 : struct iovec *iov;
8158 :
8159 0 : bio->iov_offset = sgl_offset;
8160 0 : for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
8161 0 : iov = &bio->iovs[bio->iovpos];
8162 0 : if (bio->iov_offset < iov->iov_len) {
8163 0 : break;
8164 : }
8165 :
8166 0 : bio->iov_offset -= iov->iov_len;
8167 0 : }
8168 0 : }
8169 :
8170 : static int
8171 0 : bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
8172 : {
8173 0 : struct nvme_bdev_io *bio = ref;
8174 0 : struct iovec *iov;
8175 :
8176 0 : assert(bio->iovpos < bio->iovcnt);
8177 :
8178 0 : iov = &bio->iovs[bio->iovpos];
8179 :
8180 0 : *address = iov->iov_base;
8181 0 : *length = iov->iov_len;
8182 :
8183 0 : if (bio->iov_offset) {
8184 0 : assert(bio->iov_offset <= iov->iov_len);
8185 0 : *address += bio->iov_offset;
8186 0 : *length -= bio->iov_offset;
8187 0 : }
8188 :
8189 0 : bio->iov_offset += *length;
8190 0 : if (bio->iov_offset == iov->iov_len) {
8191 0 : bio->iovpos++;
8192 0 : bio->iov_offset = 0;
8193 0 : }
8194 :
8195 0 : return 0;
8196 0 : }
8197 :
8198 : static void
8199 0 : bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
8200 : {
8201 0 : struct nvme_bdev_io *bio = ref;
8202 0 : struct iovec *iov;
8203 :
8204 0 : bio->fused_iov_offset = sgl_offset;
8205 0 : for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
8206 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
8207 0 : if (bio->fused_iov_offset < iov->iov_len) {
8208 0 : break;
8209 : }
8210 :
8211 0 : bio->fused_iov_offset -= iov->iov_len;
8212 0 : }
8213 0 : }
8214 :
8215 : static int
8216 0 : bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
8217 : {
8218 0 : struct nvme_bdev_io *bio = ref;
8219 0 : struct iovec *iov;
8220 :
8221 0 : assert(bio->fused_iovpos < bio->fused_iovcnt);
8222 :
8223 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
8224 :
8225 0 : *address = iov->iov_base;
8226 0 : *length = iov->iov_len;
8227 :
8228 0 : if (bio->fused_iov_offset) {
8229 0 : assert(bio->fused_iov_offset <= iov->iov_len);
8230 0 : *address += bio->fused_iov_offset;
8231 0 : *length -= bio->fused_iov_offset;
8232 0 : }
8233 :
8234 0 : bio->fused_iov_offset += *length;
8235 0 : if (bio->fused_iov_offset == iov->iov_len) {
8236 0 : bio->fused_iovpos++;
8237 0 : bio->fused_iov_offset = 0;
8238 0 : }
8239 :
8240 0 : return 0;
8241 0 : }
8242 :
8243 : static int
8244 0 : bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8245 : void *md, uint64_t lba_count, uint64_t lba)
8246 : {
8247 0 : int rc;
8248 :
8249 0 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
8250 : lba_count, lba);
8251 :
8252 0 : bio->iovs = iov;
8253 0 : bio->iovcnt = iovcnt;
8254 0 : bio->iovpos = 0;
8255 0 : bio->iov_offset = 0;
8256 :
8257 0 : rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
8258 0 : bio->io_path->qpair->qpair,
8259 0 : lba, lba_count,
8260 0 : bdev_nvme_no_pi_readv_done, bio, 0,
8261 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8262 0 : md, 0, 0);
8263 :
8264 0 : if (rc != 0 && rc != -ENOMEM) {
8265 0 : SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
8266 0 : }
8267 0 : return rc;
8268 0 : }
8269 :
8270 : static int
8271 3 : bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8272 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
8273 : struct spdk_memory_domain *domain, void *domain_ctx,
8274 : struct spdk_accel_sequence *seq)
8275 : {
8276 3 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8277 3 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8278 3 : int rc;
8279 :
8280 3 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8281 : lba_count, lba);
8282 :
8283 3 : bio->iovs = iov;
8284 3 : bio->iovcnt = iovcnt;
8285 3 : bio->iovpos = 0;
8286 3 : bio->iov_offset = 0;
8287 :
8288 3 : if (domain != NULL || seq != NULL) {
8289 1 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
8290 1 : bio->ext_opts.memory_domain = domain;
8291 1 : bio->ext_opts.memory_domain_ctx = domain_ctx;
8292 1 : bio->ext_opts.io_flags = flags;
8293 1 : bio->ext_opts.metadata = md;
8294 1 : bio->ext_opts.accel_sequence = seq;
8295 :
8296 1 : if (iovcnt == 1) {
8297 1 : rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done,
8298 1 : bio, &bio->ext_opts);
8299 1 : } else {
8300 0 : rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
8301 0 : bdev_nvme_readv_done, bio,
8302 : bdev_nvme_queued_reset_sgl,
8303 : bdev_nvme_queued_next_sge,
8304 0 : &bio->ext_opts);
8305 : }
8306 3 : } else if (iovcnt == 1) {
8307 2 : rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base,
8308 2 : md, lba, lba_count, bdev_nvme_readv_done,
8309 2 : bio, flags, 0, 0);
8310 2 : } else {
8311 0 : rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
8312 0 : bdev_nvme_readv_done, bio, flags,
8313 : bdev_nvme_queued_reset_sgl,
8314 0 : bdev_nvme_queued_next_sge, md, 0, 0);
8315 : }
8316 :
8317 3 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
8318 0 : SPDK_ERRLOG("readv failed: rc = %d\n", rc);
8319 0 : }
8320 6 : return rc;
8321 3 : }
8322 :
8323 : static int
8324 25 : bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8325 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
8326 : struct spdk_memory_domain *domain, void *domain_ctx,
8327 : struct spdk_accel_sequence *seq,
8328 : union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13)
8329 : {
8330 25 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8331 25 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8332 25 : int rc;
8333 :
8334 25 : SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8335 : lba_count, lba);
8336 :
8337 25 : bio->iovs = iov;
8338 25 : bio->iovcnt = iovcnt;
8339 25 : bio->iovpos = 0;
8340 25 : bio->iov_offset = 0;
8341 :
8342 25 : if (domain != NULL || seq != NULL) {
8343 0 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
8344 0 : bio->ext_opts.memory_domain = domain;
8345 0 : bio->ext_opts.memory_domain_ctx = domain_ctx;
8346 0 : bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype);
8347 0 : bio->ext_opts.cdw13 = cdw13.raw;
8348 0 : bio->ext_opts.metadata = md;
8349 0 : bio->ext_opts.accel_sequence = seq;
8350 :
8351 0 : if (iovcnt == 1) {
8352 0 : rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done,
8353 0 : bio, &bio->ext_opts);
8354 0 : } else {
8355 0 : rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
8356 0 : bdev_nvme_writev_done, bio,
8357 : bdev_nvme_queued_reset_sgl,
8358 : bdev_nvme_queued_next_sge,
8359 0 : &bio->ext_opts);
8360 : }
8361 25 : } else if (iovcnt == 1) {
8362 25 : rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base,
8363 25 : md, lba, lba_count, bdev_nvme_writev_done,
8364 25 : bio, flags, 0, 0);
8365 25 : } else {
8366 0 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8367 0 : bdev_nvme_writev_done, bio, flags,
8368 : bdev_nvme_queued_reset_sgl,
8369 0 : bdev_nvme_queued_next_sge, md, 0, 0);
8370 : }
8371 :
8372 25 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
8373 0 : SPDK_ERRLOG("writev failed: rc = %d\n", rc);
8374 0 : }
8375 50 : return rc;
8376 25 : }
8377 :
8378 : static int
8379 0 : bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8380 : void *md, uint64_t lba_count, uint64_t zslba,
8381 : uint32_t flags)
8382 : {
8383 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8384 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8385 0 : int rc;
8386 :
8387 0 : SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
8388 : lba_count, zslba);
8389 :
8390 0 : bio->iovs = iov;
8391 0 : bio->iovcnt = iovcnt;
8392 0 : bio->iovpos = 0;
8393 0 : bio->iov_offset = 0;
8394 :
8395 0 : if (iovcnt == 1) {
8396 0 : rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
8397 0 : lba_count,
8398 0 : bdev_nvme_zone_appendv_done, bio,
8399 0 : flags,
8400 : 0, 0);
8401 0 : } else {
8402 0 : rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
8403 0 : bdev_nvme_zone_appendv_done, bio, flags,
8404 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8405 0 : md, 0, 0);
8406 : }
8407 :
8408 0 : if (rc != 0 && rc != -ENOMEM) {
8409 0 : SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
8410 0 : }
8411 0 : return rc;
8412 0 : }
8413 :
8414 : static int
8415 1 : bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8416 : void *md, uint64_t lba_count, uint64_t lba,
8417 : uint32_t flags)
8418 : {
8419 1 : int rc;
8420 :
8421 1 : SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8422 : lba_count, lba);
8423 :
8424 1 : bio->iovs = iov;
8425 1 : bio->iovcnt = iovcnt;
8426 1 : bio->iovpos = 0;
8427 1 : bio->iov_offset = 0;
8428 :
8429 2 : rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
8430 1 : bio->io_path->qpair->qpair,
8431 1 : lba, lba_count,
8432 1 : bdev_nvme_comparev_done, bio, flags,
8433 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8434 1 : md, 0, 0);
8435 :
8436 1 : if (rc != 0 && rc != -ENOMEM) {
8437 0 : SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
8438 0 : }
8439 2 : return rc;
8440 1 : }
8441 :
8442 : static int
8443 2 : bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
8444 : struct iovec *write_iov, int write_iovcnt,
8445 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
8446 : {
8447 2 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8448 2 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8449 2 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8450 2 : int rc;
8451 :
8452 2 : SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8453 : lba_count, lba);
8454 :
8455 2 : bio->iovs = cmp_iov;
8456 2 : bio->iovcnt = cmp_iovcnt;
8457 2 : bio->iovpos = 0;
8458 2 : bio->iov_offset = 0;
8459 2 : bio->fused_iovs = write_iov;
8460 2 : bio->fused_iovcnt = write_iovcnt;
8461 2 : bio->fused_iovpos = 0;
8462 2 : bio->fused_iov_offset = 0;
8463 :
8464 2 : if (bdev_io->num_retries == 0) {
8465 2 : bio->first_fused_submitted = false;
8466 2 : bio->first_fused_completed = false;
8467 2 : }
8468 :
8469 2 : if (!bio->first_fused_submitted) {
8470 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8471 2 : memset(&bio->cpl, 0, sizeof(bio->cpl));
8472 :
8473 4 : rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
8474 2 : bdev_nvme_comparev_and_writev_done, bio, flags,
8475 2 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
8476 2 : if (rc == 0) {
8477 2 : bio->first_fused_submitted = true;
8478 2 : flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8479 2 : } else {
8480 0 : if (rc != -ENOMEM) {
8481 0 : SPDK_ERRLOG("compare failed: rc = %d\n", rc);
8482 0 : }
8483 0 : return rc;
8484 : }
8485 2 : }
8486 :
8487 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
8488 :
8489 4 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8490 2 : bdev_nvme_comparev_and_writev_done, bio, flags,
8491 2 : bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
8492 2 : if (rc != 0 && rc != -ENOMEM) {
8493 0 : SPDK_ERRLOG("write failed: rc = %d\n", rc);
8494 0 : rc = 0;
8495 0 : }
8496 :
8497 2 : return rc;
8498 2 : }
8499 :
8500 : static int
8501 1 : bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8502 : {
8503 1 : struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
8504 1 : struct spdk_nvme_dsm_range *range;
8505 1 : uint64_t offset, remaining;
8506 1 : uint64_t num_ranges_u64;
8507 1 : uint16_t num_ranges;
8508 1 : int rc;
8509 :
8510 1 : num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
8511 : SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8512 1 : if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
8513 0 : SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
8514 0 : return -EINVAL;
8515 : }
8516 1 : num_ranges = (uint16_t)num_ranges_u64;
8517 :
8518 1 : offset = offset_blocks;
8519 1 : remaining = num_blocks;
8520 1 : range = &dsm_ranges[0];
8521 :
8522 : /* Fill max-size ranges until the remaining blocks fit into one range */
8523 1 : while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
8524 0 : range->attributes.raw = 0;
8525 0 : range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8526 0 : range->starting_lba = offset;
8527 :
8528 0 : offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8529 0 : remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8530 0 : range++;
8531 : }
8532 :
8533 : /* Final range describes the remaining blocks */
8534 1 : range->attributes.raw = 0;
8535 1 : range->length = remaining;
8536 1 : range->starting_lba = offset;
8537 :
8538 2 : rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
8539 1 : bio->io_path->qpair->qpair,
8540 : SPDK_NVME_DSM_ATTR_DEALLOCATE,
8541 1 : dsm_ranges, num_ranges,
8542 1 : bdev_nvme_queued_done, bio);
8543 :
8544 1 : return rc;
8545 1 : }
8546 :
8547 : static int
8548 0 : bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8549 : {
8550 0 : if (num_blocks > UINT16_MAX + 1) {
8551 0 : SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
8552 0 : return -EINVAL;
8553 : }
8554 :
8555 0 : return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
8556 0 : bio->io_path->qpair->qpair,
8557 0 : offset_blocks, num_blocks,
8558 0 : bdev_nvme_queued_done, bio,
8559 : 0);
8560 0 : }
8561 :
8562 : static int
8563 0 : bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
8564 : struct spdk_bdev_zone_info *info)
8565 : {
8566 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8567 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8568 0 : uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
8569 0 : uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
8570 0 : uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
8571 :
8572 0 : if (zone_id % zone_size != 0) {
8573 0 : return -EINVAL;
8574 : }
8575 :
8576 0 : if (num_zones > total_zones || !num_zones) {
8577 0 : return -EINVAL;
8578 : }
8579 :
8580 0 : assert(!bio->zone_report_buf);
8581 0 : bio->zone_report_buf = calloc(1, zone_report_bufsize);
8582 0 : if (!bio->zone_report_buf) {
8583 0 : return -ENOMEM;
8584 : }
8585 :
8586 0 : bio->handled_zones = 0;
8587 :
8588 0 : return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
8589 0 : zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
8590 0 : bdev_nvme_get_zone_info_done, bio);
8591 0 : }
8592 :
8593 : static int
8594 0 : bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
8595 : enum spdk_bdev_zone_action action)
8596 : {
8597 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8598 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8599 :
8600 0 : switch (action) {
8601 : case SPDK_BDEV_ZONE_CLOSE:
8602 0 : return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
8603 0 : bdev_nvme_zone_management_done, bio);
8604 : case SPDK_BDEV_ZONE_FINISH:
8605 0 : return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
8606 0 : bdev_nvme_zone_management_done, bio);
8607 : case SPDK_BDEV_ZONE_OPEN:
8608 0 : return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
8609 0 : bdev_nvme_zone_management_done, bio);
8610 : case SPDK_BDEV_ZONE_RESET:
8611 0 : return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
8612 0 : bdev_nvme_zone_management_done, bio);
8613 : case SPDK_BDEV_ZONE_OFFLINE:
8614 0 : return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
8615 0 : bdev_nvme_zone_management_done, bio);
8616 : default:
8617 0 : return -EINVAL;
8618 : }
8619 0 : }
8620 :
8621 : static void
8622 5 : bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8623 : struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
8624 : {
8625 5 : struct nvme_io_path *io_path;
8626 5 : struct nvme_ctrlr *nvme_ctrlr;
8627 5 : uint32_t max_xfer_size;
8628 5 : int rc = -ENXIO;
8629 :
8630 : /* Choose the first ctrlr which is not failed. */
8631 8 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8632 7 : nvme_ctrlr = io_path->qpair->ctrlr;
8633 :
8634 : /* We should skip any unavailable nvme_ctrlr rather than checking
8635 : * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
8636 : */
8637 7 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
8638 3 : continue;
8639 : }
8640 :
8641 4 : max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
8642 :
8643 4 : if (nbytes > max_xfer_size) {
8644 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8645 0 : rc = -EINVAL;
8646 0 : goto err;
8647 : }
8648 :
8649 4 : rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
8650 4 : bdev_nvme_admin_passthru_done, bio);
8651 4 : if (rc == 0) {
8652 4 : return;
8653 : }
8654 1 : }
8655 :
8656 : err:
8657 1 : bdev_nvme_admin_complete(bio, rc);
8658 5 : }
8659 :
8660 : static int
8661 0 : bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8662 : void *buf, size_t nbytes)
8663 : {
8664 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8665 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8666 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8667 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8668 :
8669 0 : if (nbytes > max_xfer_size) {
8670 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8671 0 : return -EINVAL;
8672 : }
8673 :
8674 : /*
8675 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8676 : * so fill it out automatically.
8677 : */
8678 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8679 :
8680 0 : return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
8681 0 : (uint32_t)nbytes, bdev_nvme_queued_done, bio);
8682 0 : }
8683 :
8684 : static int
8685 0 : bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8686 : void *buf, size_t nbytes, void *md_buf, size_t md_len)
8687 : {
8688 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8689 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8690 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8691 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8692 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8693 :
8694 0 : if (nbytes > max_xfer_size) {
8695 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8696 0 : return -EINVAL;
8697 : }
8698 :
8699 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8700 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8701 0 : return -EINVAL;
8702 : }
8703 :
8704 : /*
8705 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8706 : * so fill it out automatically.
8707 : */
8708 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8709 :
8710 0 : return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
8711 0 : (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
8712 0 : }
8713 :
8714 : static int
8715 0 : bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio,
8716 : struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt,
8717 : size_t nbytes, void *md_buf, size_t md_len)
8718 : {
8719 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8720 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8721 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8722 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8723 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8724 :
8725 0 : bio->iovs = iov;
8726 0 : bio->iovcnt = iovcnt;
8727 0 : bio->iovpos = 0;
8728 0 : bio->iov_offset = 0;
8729 :
8730 0 : if (nbytes > max_xfer_size) {
8731 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8732 0 : return -EINVAL;
8733 : }
8734 :
8735 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8736 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8737 0 : return -EINVAL;
8738 : }
8739 :
8740 : /*
8741 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands
8742 : * require a nsid, so fill it out automatically.
8743 : */
8744 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8745 :
8746 0 : return spdk_nvme_ctrlr_cmd_iov_raw_with_md(
8747 0 : ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio,
8748 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
8749 0 : }
8750 :
8751 : static void
8752 6 : bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8753 : struct nvme_bdev_io *bio_to_abort)
8754 : {
8755 6 : struct nvme_io_path *io_path;
8756 6 : int rc = 0;
8757 :
8758 6 : rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
8759 6 : if (rc == 0) {
8760 1 : bdev_nvme_admin_complete(bio, 0);
8761 1 : return;
8762 : }
8763 :
8764 5 : io_path = bio_to_abort->io_path;
8765 5 : if (io_path != NULL) {
8766 6 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8767 3 : io_path->qpair->qpair,
8768 3 : bio_to_abort,
8769 3 : bdev_nvme_abort_done, bio);
8770 3 : } else {
8771 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8772 2 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8773 : NULL,
8774 2 : bio_to_abort,
8775 2 : bdev_nvme_abort_done, bio);
8776 :
8777 2 : if (rc != -ENOENT) {
8778 1 : break;
8779 : }
8780 1 : }
8781 : }
8782 :
8783 5 : if (rc != 0) {
8784 : /* If no command was found or there was any error, complete the abort
8785 : * request with failure.
8786 : */
8787 2 : bdev_nvme_admin_complete(bio, rc);
8788 2 : }
8789 6 : }
8790 :
8791 : static int
8792 0 : bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
8793 : uint64_t num_blocks)
8794 : {
8795 0 : struct spdk_nvme_scc_source_range range = {
8796 0 : .slba = src_offset_blocks,
8797 0 : .nlb = num_blocks - 1
8798 : };
8799 :
8800 0 : return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
8801 0 : bio->io_path->qpair->qpair,
8802 0 : &range, 1, dst_offset_blocks,
8803 0 : bdev_nvme_queued_done, bio);
8804 0 : }
8805 :
8806 : static void
8807 0 : bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
8808 : {
8809 0 : const char *action;
8810 0 : uint32_t i;
8811 :
8812 0 : if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
8813 0 : action = "reset";
8814 0 : } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
8815 0 : action = "abort";
8816 0 : } else {
8817 0 : action = "none";
8818 : }
8819 :
8820 0 : spdk_json_write_object_begin(w);
8821 :
8822 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
8823 :
8824 0 : spdk_json_write_named_object_begin(w, "params");
8825 0 : spdk_json_write_named_string(w, "action_on_timeout", action);
8826 0 : spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
8827 0 : spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
8828 0 : spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
8829 0 : spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
8830 0 : spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
8831 0 : spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
8832 0 : spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
8833 0 : spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
8834 0 : spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
8835 0 : spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
8836 0 : spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
8837 0 : spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
8838 0 : spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
8839 0 : spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
8840 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
8841 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
8842 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
8843 0 : spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback);
8844 0 : spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
8845 0 : spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
8846 0 : spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat);
8847 0 : spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size);
8848 0 : spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
8849 0 : spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
8850 0 : spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size);
8851 0 : spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms);
8852 0 : spdk_json_write_named_array_begin(w, "dhchap_digests");
8853 0 : for (i = 0; i < 32; ++i) {
8854 0 : if (g_opts.dhchap_digests & SPDK_BIT(i)) {
8855 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i));
8856 0 : }
8857 0 : }
8858 0 : spdk_json_write_array_end(w);
8859 0 : spdk_json_write_named_array_begin(w, "dhchap_dhgroups");
8860 0 : for (i = 0; i < 32; ++i) {
8861 0 : if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) {
8862 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i));
8863 0 : }
8864 0 : }
8865 :
8866 0 : spdk_json_write_array_end(w);
8867 0 : spdk_json_write_object_end(w);
8868 :
8869 0 : spdk_json_write_object_end(w);
8870 0 : }
8871 :
8872 : static void
8873 0 : bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
8874 : {
8875 0 : struct spdk_nvme_transport_id trid;
8876 :
8877 0 : spdk_json_write_object_begin(w);
8878 :
8879 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");
8880 :
8881 0 : spdk_json_write_named_object_begin(w, "params");
8882 0 : spdk_json_write_named_string(w, "name", ctx->name);
8883 0 : spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);
8884 :
8885 0 : trid = ctx->trid;
8886 0 : memset(trid.subnqn, 0, sizeof(trid.subnqn));
8887 0 : nvme_bdev_dump_trid_json(&trid, w);
8888 :
8889 0 : spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
8890 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
8891 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
8892 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8893 0 : ctx->bdev_opts.fast_io_fail_timeout_sec);
8894 0 : spdk_json_write_object_end(w);
8895 :
8896 0 : spdk_json_write_object_end(w);
8897 0 : }
8898 :
8899 : #ifdef SPDK_CONFIG_NVME_CUSE
8900 : static void
8901 0 : nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
8902 : struct nvme_ctrlr *nvme_ctrlr)
8903 : {
8904 0 : size_t cuse_name_size = 128;
8905 0 : char cuse_name[cuse_name_size];
8906 :
8907 0 : if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
8908 0 : cuse_name, &cuse_name_size) != 0) {
8909 0 : return;
8910 : }
8911 :
8912 0 : spdk_json_write_object_begin(w);
8913 :
8914 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");
8915 :
8916 0 : spdk_json_write_named_object_begin(w, "params");
8917 0 : spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
8918 0 : spdk_json_write_object_end(w);
8919 :
8920 0 : spdk_json_write_object_end(w);
8921 0 : }
8922 : #endif
8923 :
8924 : static void
8925 0 : nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
8926 : struct nvme_ctrlr *nvme_ctrlr,
8927 : struct nvme_path_id *path_id)
8928 : {
8929 0 : struct spdk_nvme_transport_id *trid;
8930 0 : const struct spdk_nvme_ctrlr_opts *opts;
8931 :
8932 0 : if (nvme_ctrlr->opts.from_discovery_service) {
8933 : /* Do not emit an RPC for this - it will be implicitly
8934 : * covered by a separate bdev_nvme_start_discovery or
8935 : * bdev_nvme_start_mdns_discovery RPC.
8936 : */
8937 0 : return;
8938 : }
8939 :
8940 0 : trid = &path_id->trid;
8941 :
8942 0 : spdk_json_write_object_begin(w);
8943 :
8944 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
8945 :
8946 0 : spdk_json_write_named_object_begin(w, "params");
8947 0 : spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
8948 0 : nvme_bdev_dump_trid_json(trid, w);
8949 0 : spdk_json_write_named_bool(w, "prchk_reftag",
8950 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
8951 0 : spdk_json_write_named_bool(w, "prchk_guard",
8952 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
8953 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
8954 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
8955 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8956 0 : nvme_ctrlr->opts.fast_io_fail_timeout_sec);
8957 0 : if (nvme_ctrlr->psk != NULL) {
8958 0 : spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk));
8959 0 : }
8960 0 : if (nvme_ctrlr->dhchap_key != NULL) {
8961 0 : spdk_json_write_named_string(w, "dhchap_key",
8962 0 : spdk_key_get_name(nvme_ctrlr->dhchap_key));
8963 0 : }
8964 0 : if (nvme_ctrlr->dhchap_ctrlr_key != NULL) {
8965 0 : spdk_json_write_named_string(w, "dhchap_ctrlr_key",
8966 0 : spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key));
8967 0 : }
8968 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
8969 0 : spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
8970 0 : spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
8971 0 : spdk_json_write_named_bool(w, "ddgst", opts->data_digest);
8972 0 : if (opts->src_addr[0] != '\0') {
8973 0 : spdk_json_write_named_string(w, "hostaddr", opts->src_addr);
8974 0 : }
8975 0 : if (opts->src_svcid[0] != '\0') {
8976 0 : spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid);
8977 0 : }
8978 :
8979 0 : if (nvme_ctrlr->opts.multipath) {
8980 0 : spdk_json_write_named_string(w, "multipath", "multipath");
8981 0 : }
8982 0 : spdk_json_write_object_end(w);
8983 :
8984 0 : spdk_json_write_object_end(w);
8985 0 : }
8986 :
8987 : static void
8988 0 : bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
8989 : {
8990 0 : spdk_json_write_object_begin(w);
8991 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
8992 :
8993 0 : spdk_json_write_named_object_begin(w, "params");
8994 0 : spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
8995 0 : spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
8996 0 : spdk_json_write_object_end(w);
8997 :
8998 0 : spdk_json_write_object_end(w);
8999 0 : }
9000 :
9001 : static int
9002 0 : bdev_nvme_config_json(struct spdk_json_write_ctx *w)
9003 : {
9004 0 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
9005 0 : struct nvme_ctrlr *nvme_ctrlr;
9006 0 : struct discovery_ctx *ctx;
9007 0 : struct nvme_path_id *path_id;
9008 :
9009 0 : bdev_nvme_opts_config_json(w);
9010 :
9011 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9012 :
9013 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
9014 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
9015 0 : path_id = nvme_ctrlr->active_path_id;
9016 0 : assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids));
9017 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
9018 :
9019 0 : path_id = TAILQ_NEXT(path_id, link);
9020 0 : while (path_id != NULL) {
9021 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
9022 0 : path_id = TAILQ_NEXT(path_id, link);
9023 : }
9024 :
9025 : #ifdef SPDK_CONFIG_NVME_CUSE
9026 0 : nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
9027 : #endif
9028 0 : }
9029 0 : }
9030 :
9031 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
9032 0 : if (!ctx->from_mdns_discovery_service) {
9033 0 : bdev_nvme_discovery_config_json(w, ctx);
9034 0 : }
9035 0 : }
9036 :
9037 0 : bdev_nvme_mdns_discovery_config_json(w);
9038 :
9039 : /* Dump as last parameter to give all NVMe bdevs chance to be constructed
9040 : * before enabling hotplug poller.
9041 : */
9042 0 : bdev_nvme_hotplug_config_json(w);
9043 :
9044 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9045 0 : return 0;
9046 0 : }
9047 :
9048 : struct spdk_nvme_ctrlr *
9049 1 : bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
9050 : {
9051 1 : struct nvme_bdev *nbdev;
9052 1 : struct nvme_ns *nvme_ns;
9053 :
9054 1 : if (!bdev || bdev->module != &nvme_if) {
9055 0 : return NULL;
9056 : }
9057 :
9058 1 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
9059 1 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
9060 1 : assert(nvme_ns != NULL);
9061 :
9062 1 : return nvme_ns->ctrlr->ctrlr;
9063 1 : }
9064 :
9065 : static bool
9066 12 : nvme_io_path_is_current(struct nvme_io_path *io_path)
9067 : {
9068 12 : const struct nvme_bdev_channel *nbdev_ch;
9069 12 : bool current;
9070 :
9071 12 : if (!nvme_io_path_is_available(io_path)) {
9072 4 : return false;
9073 : }
9074 :
9075 8 : nbdev_ch = io_path->nbdev_ch;
9076 8 : if (nbdev_ch == NULL) {
9077 1 : current = false;
9078 8 : } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
9079 3 : struct nvme_io_path *optimized_io_path = NULL;
9080 :
9081 6 : STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) {
9082 5 : if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
9083 2 : break;
9084 : }
9085 3 : }
9086 :
9087 : /* A non-optimized path is only current if there are no optimized paths. */
9088 3 : current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) ||
9089 2 : (optimized_io_path == NULL);
9090 3 : } else {
9091 4 : if (nbdev_ch->current_io_path) {
9092 1 : current = (io_path == nbdev_ch->current_io_path);
9093 1 : } else {
9094 3 : struct nvme_io_path *first_path;
9095 :
9096 : /* We arrived here as there are no optimized paths for active-passive
9097 : * mode. Check if this io_path is the first one available on the list.
9098 : */
9099 3 : current = false;
9100 3 : STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) {
9101 3 : if (nvme_io_path_is_available(first_path)) {
9102 3 : current = (io_path == first_path);
9103 3 : break;
9104 : }
9105 0 : }
9106 3 : }
9107 : }
9108 :
9109 8 : return current;
9110 12 : }
9111 :
9112 : static struct nvme_ctrlr *
9113 0 : bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev)
9114 : {
9115 0 : struct nvme_ctrlr *next;
9116 :
9117 : /* Must be called under g_bdev_nvme_mutex */
9118 0 : next = prev != NULL ? TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
9119 0 : while (next != NULL) {
9120 : /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */
9121 0 : pthread_mutex_lock(&next->mutex);
9122 0 : if (next->ref > 0) {
9123 0 : next->ref++;
9124 0 : pthread_mutex_unlock(&next->mutex);
9125 0 : return next;
9126 : }
9127 :
9128 0 : pthread_mutex_unlock(&next->mutex);
9129 0 : next = TAILQ_NEXT(next, tailq);
9130 : }
9131 :
9132 0 : return NULL;
9133 0 : }
9134 :
9135 : struct bdev_nvme_set_keys_ctx {
9136 : struct nvme_ctrlr *nctrlr;
9137 : struct spdk_key *dhchap_key;
9138 : struct spdk_key *dhchap_ctrlr_key;
9139 : struct spdk_thread *thread;
9140 : bdev_nvme_set_keys_cb cb_fn;
9141 : void *cb_ctx;
9142 : int status;
9143 : };
9144 :
9145 : static void
9146 0 : bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx)
9147 : {
9148 0 : if (ctx == NULL) {
9149 0 : return;
9150 : }
9151 :
9152 0 : spdk_keyring_put_key(ctx->dhchap_key);
9153 0 : spdk_keyring_put_key(ctx->dhchap_ctrlr_key);
9154 0 : free(ctx);
9155 0 : }
9156 :
9157 : static void
9158 0 : _bdev_nvme_set_keys_done(void *_ctx)
9159 : {
9160 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
9161 :
9162 0 : ctx->cb_fn(ctx->cb_ctx, ctx->status);
9163 :
9164 0 : if (ctx->nctrlr != NULL) {
9165 0 : nvme_ctrlr_put_ref(ctx->nctrlr);
9166 0 : }
9167 0 : bdev_nvme_free_set_keys_ctx(ctx);
9168 0 : }
9169 :
9170 : static void
9171 0 : bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status)
9172 : {
9173 0 : ctx->status = status;
9174 0 : spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx);
9175 0 : }
9176 :
9177 : static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx);
9178 :
9179 : static void
9180 0 : bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx)
9181 : {
9182 0 : struct nvme_ctrlr *next;
9183 :
9184 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9185 0 : next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr);
9186 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9187 :
9188 0 : nvme_ctrlr_put_ref(ctx->nctrlr);
9189 0 : ctx->nctrlr = next;
9190 :
9191 0 : if (next == NULL) {
9192 0 : bdev_nvme_set_keys_done(ctx, 0);
9193 0 : } else {
9194 0 : bdev_nvme_authenticate_ctrlr(ctx);
9195 : }
9196 0 : }
9197 :
/*
 * Completion of the per-channel qpair authentication pass for one controller.
 * A non-zero status ends the whole set-keys operation with that status;
 * otherwise processing moves on to the next controller.
 */
static void
bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_keys_ctx *keys_ctx = spdk_io_channel_iter_get_ctx(i);

	if (status == 0) {
		bdev_nvme_authenticate_ctrlr_continue(keys_ctx);
	} else {
		bdev_nvme_set_keys_done(keys_ctx, status);
	}
}
9209 :
/*
 * Completion callback of spdk_nvme_qpair_authenticate(): ctx is the channel
 * iterator handed over at submission, which is resumed with status.
 */
static void
bdev_nvme_authenticate_qpair_done(void *ctx, int status)
{
	struct spdk_io_channel_iter *iter = ctx;

	spdk_for_each_channel_continue(iter, status);
}
9215 :
9216 : static void
9217 0 : bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i)
9218 : {
9219 0 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
9220 0 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
9221 0 : struct nvme_qpair *qpair = ctrlr_ch->qpair;
9222 0 : int rc;
9223 :
9224 0 : if (!nvme_qpair_is_connected(qpair)) {
9225 0 : spdk_for_each_channel_continue(i, 0);
9226 0 : return;
9227 : }
9228 :
9229 0 : rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i);
9230 0 : if (rc != 0) {
9231 0 : spdk_for_each_channel_continue(i, rc);
9232 0 : }
9233 0 : }
9234 :
9235 : static void
9236 0 : bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status)
9237 : {
9238 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
9239 :
9240 0 : if (status != 0) {
9241 0 : bdev_nvme_set_keys_done(ctx, status);
9242 0 : return;
9243 : }
9244 :
9245 0 : spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx,
9246 : bdev_nvme_authenticate_qpairs_done);
9247 0 : }
9248 :
9249 : static void
9250 0 : bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx)
9251 : {
9252 0 : struct spdk_nvme_ctrlr_key_opts opts = {};
9253 0 : struct nvme_ctrlr *nctrlr = ctx->nctrlr;
9254 0 : int rc;
9255 :
9256 0 : opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key);
9257 0 : opts.dhchap_key = ctx->dhchap_key;
9258 0 : opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key;
9259 0 : rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts);
9260 0 : if (rc != 0) {
9261 0 : bdev_nvme_set_keys_done(ctx, rc);
9262 0 : return;
9263 : }
9264 :
9265 0 : if (ctx->dhchap_key != NULL) {
9266 0 : rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr,
9267 0 : bdev_nvme_authenticate_ctrlr_done, ctx);
9268 0 : if (rc != 0) {
9269 0 : bdev_nvme_set_keys_done(ctx, rc);
9270 0 : }
9271 0 : } else {
9272 0 : bdev_nvme_authenticate_ctrlr_continue(ctx);
9273 : }
9274 0 : }
9275 :
9276 : int
9277 0 : bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key,
9278 : bdev_nvme_set_keys_cb cb_fn, void *cb_ctx)
9279 : {
9280 0 : struct bdev_nvme_set_keys_ctx *ctx;
9281 0 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
9282 0 : struct nvme_ctrlr *nctrlr;
9283 :
9284 0 : ctx = calloc(1, sizeof(*ctx));
9285 0 : if (ctx == NULL) {
9286 0 : return -ENOMEM;
9287 : }
9288 :
9289 0 : if (dhchap_key != NULL) {
9290 0 : ctx->dhchap_key = spdk_keyring_get_key(dhchap_key);
9291 0 : if (ctx->dhchap_key == NULL) {
9292 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name);
9293 0 : bdev_nvme_free_set_keys_ctx(ctx);
9294 0 : return -ENOKEY;
9295 : }
9296 0 : }
9297 0 : if (dhchap_ctrlr_key != NULL) {
9298 0 : ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key);
9299 0 : if (ctx->dhchap_ctrlr_key == NULL) {
9300 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name);
9301 0 : bdev_nvme_free_set_keys_ctx(ctx);
9302 0 : return -ENOKEY;
9303 : }
9304 0 : }
9305 :
9306 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9307 0 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
9308 0 : if (nbdev_ctrlr == NULL) {
9309 0 : SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name);
9310 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9311 0 : bdev_nvme_free_set_keys_ctx(ctx);
9312 0 : return -ENODEV;
9313 : }
9314 0 : nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL);
9315 0 : if (nctrlr == NULL) {
9316 0 : SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name);
9317 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9318 0 : bdev_nvme_free_set_keys_ctx(ctx);
9319 0 : return -ENODEV;
9320 : }
9321 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9322 :
9323 0 : ctx->nctrlr = nctrlr;
9324 0 : ctx->cb_fn = cb_fn;
9325 0 : ctx->cb_ctx = cb_ctx;
9326 0 : ctx->thread = spdk_get_thread();
9327 :
9328 0 : bdev_nvme_authenticate_ctrlr(ctx);
9329 :
9330 0 : return 0;
9331 0 : }
9332 :
9333 : void
9334 0 : nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
9335 : {
9336 0 : struct nvme_ns *nvme_ns = io_path->nvme_ns;
9337 0 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
9338 0 : const struct spdk_nvme_ctrlr_data *cdata;
9339 0 : const struct spdk_nvme_transport_id *trid;
9340 0 : const char *adrfam_str;
9341 :
9342 0 : spdk_json_write_object_begin(w);
9343 :
9344 0 : spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);
9345 :
9346 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
9347 0 : trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);
9348 :
9349 0 : spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
9350 0 : spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path));
9351 0 : spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
9352 0 : spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));
9353 :
9354 0 : spdk_json_write_named_object_begin(w, "transport");
9355 0 : spdk_json_write_named_string(w, "trtype", trid->trstring);
9356 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
9357 0 : if (trid->trsvcid[0] != '\0') {
9358 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
9359 0 : }
9360 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
9361 0 : if (adrfam_str) {
9362 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
9363 0 : }
9364 0 : spdk_json_write_object_end(w);
9365 :
9366 0 : spdk_json_write_object_end(w);
9367 0 : }
9368 :
9369 : void
9370 0 : bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
9371 : {
9372 0 : struct discovery_ctx *ctx;
9373 0 : struct discovery_entry_ctx *entry_ctx;
9374 :
9375 0 : spdk_json_write_array_begin(w);
9376 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
9377 0 : spdk_json_write_object_begin(w);
9378 0 : spdk_json_write_named_string(w, "name", ctx->name);
9379 :
9380 0 : spdk_json_write_named_object_begin(w, "trid");
9381 0 : nvme_bdev_dump_trid_json(&ctx->trid, w);
9382 0 : spdk_json_write_object_end(w);
9383 :
9384 0 : spdk_json_write_named_array_begin(w, "referrals");
9385 0 : TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
9386 0 : spdk_json_write_object_begin(w);
9387 0 : spdk_json_write_named_object_begin(w, "trid");
9388 0 : nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
9389 0 : spdk_json_write_object_end(w);
9390 0 : spdk_json_write_object_end(w);
9391 0 : }
9392 0 : spdk_json_write_array_end(w);
9393 :
9394 0 : spdk_json_write_object_end(w);
9395 0 : }
9396 0 : spdk_json_write_array_end(w);
9397 0 : }
9398 :
9399 1 : SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
9400 :
9401 : static void
9402 0 : bdev_nvme_trace(void)
9403 : {
9404 0 : struct spdk_trace_tpoint_opts opts[] = {
9405 : {
9406 : "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
9407 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1,
9408 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9409 : },
9410 : {
9411 : "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
9412 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0,
9413 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9414 : }
9415 : };
9416 :
9417 :
9418 0 : spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
9419 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
9420 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9421 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9422 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9423 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9424 0 : }
9425 1 : SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
|