Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2016 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
6 : */
7 :
8 : #include "spdk/stdinc.h"
9 :
10 : #include "bdev_nvme.h"
11 :
12 : #include "spdk/accel.h"
13 : #include "spdk/config.h"
14 : #include "spdk/endian.h"
15 : #include "spdk/bdev.h"
16 : #include "spdk/json.h"
17 : #include "spdk/keyring.h"
18 : #include "spdk/likely.h"
19 : #include "spdk/nvme.h"
20 : #include "spdk/nvme_ocssd.h"
21 : #include "spdk/nvme_zns.h"
22 : #include "spdk/opal.h"
23 : #include "spdk/thread.h"
24 : #include "spdk/trace.h"
25 : #include "spdk/string.h"
26 : #include "spdk/util.h"
27 : #include "spdk/uuid.h"
28 :
29 : #include "spdk/bdev_module.h"
30 : #include "spdk/log.h"
31 :
32 : #include "spdk_internal/usdt.h"
33 : #include "spdk_internal/trace_defs.h"
34 :
/*
 * Logging helpers that prefix every message with the identity of the object it concerns.
 *
 * The object argument is parenthesized wherever it is dereferenced so that an expression
 * argument (for example "cond ? a : b") binds as a unit before "->" is applied.
 *
 * NOTE: the *LOG expansions keep their historical trailing semicolon so that existing
 * call sites are unaffected.
 */

/* Identifying string of a controller: the subsystem NQN when the active path uses a
 * fabrics transport, otherwise the transport address of the active path.
 */
#define CTRLR_STRING(nvme_ctrlr) \
	(spdk_nvme_trtype_is_fabrics((nvme_ctrlr)->active_path_id->trid.trtype) ? \
	(nvme_ctrlr)->active_path_id->trid.subnqn : (nvme_ctrlr)->active_path_id->trid.traddr)

/* Value returned by spdk_nvme_ctrlr_get_id() for the underlying controller. */
#define CTRLR_ID(nvme_ctrlr)	(spdk_nvme_ctrlr_get_id((nvme_ctrlr)->ctrlr))

#define NVME_CTRLR_ERRLOG(ctrlr, format, ...) \
	SPDK_ERRLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_WARNLOG(ctrlr, format, ...) \
	SPDK_WARNLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_NOTICELOG(ctrlr, format, ...) \
	SPDK_NOTICELOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_INFOLOG(ctrlr, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

/* Debug logging expands to an empty statement unless DEBUG is defined. */
#ifdef DEBUG
#define NVME_CTRLR_DEBUGLOG(ctrlr, format, ...) \
	SPDK_DEBUGLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);
#else
#define NVME_CTRLR_DEBUGLOG(ctrlr, ...) do { } while (0)
#endif

/* Identifying string of an nvme_bdev: the name of its embedded bdev ("disk"). */
#define BDEV_STRING(nbdev) ((nbdev)->disk.name)

#define NVME_BDEV_ERRLOG(nbdev, format, ...) \
	SPDK_ERRLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_WARNLOG(nbdev, format, ...) \
	SPDK_WARNLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_NOTICELOG(nbdev, format, ...) \
	SPDK_NOTICELOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_INFOLOG(nbdev, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);
73 :
/* Initial value of g_opts.delay_cmd_submit. */
#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
/* Initial value of g_opts.keep_alive_timeout_ms (milliseconds, per the field name). */
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000)

/* NOTE(review): presumably the buffer length for a namespace ID rendered as a string;
 * its use is not visible in this section - confirm.
 */
#define NSID_STR_LEN 10

/* NOTE(review): presumably the maximum controller name length; its use is not visible
 * in this section - confirm.
 */
#define SPDK_CONTROLLER_NAME_MAX 512
80 :
81 : static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
82 :
/* Per-I/O driver context of this module; bdev_nvme_get_ctx_size() reports its size. */
struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/* The fused_* fields mirror the four fields above for a second iovec array.
	 * NOTE(review): presumably the payload of the second command of a fused
	 * compare-and-write - confirm against bdev_nvme_comparev_and_writev().
	 */

	/** Offset in current iovec of the fused_iovs array. */
	uint32_t fused_iov_offset;

	/** Second array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position in the fused_iovs array. */
	int fused_iovpos;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 *  being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;

	/* Used to put nvme_bdev_io into the list (a channel's retry_io_list links through it). */
	TAILQ_ENTRY(nvme_bdev_io) retry_link;
};
143 :
/* One entry of the skip list below: the transport ID of a controller to be skipped. */
struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id		trid;
	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);
151 :
/* Bitmask of DH-CHAP hash identifiers; initial value of g_opts.dhchap_digests. */
#define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))

/* Bitmask of DH-CHAP DH group identifiers; initial value of g_opts.dhchap_dhgroups. */
#define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))
162 :
/* File-scope bdev_nvme options and their initial values.
 * Within this section only io_path_stat is read (by nvme_io_path_alloc()).
 */
static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.io_queue_requests = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.transport_ack_timeout = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
	.rdma_umr_per_io = false,
};
192 :
/* Upper bound and default for the hotplug poll period (microseconds, judging by the
 * "_us" suffix of g_nvme_hotplug_poll_period_us).
 */
#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL

/* Hotplug state. NOTE(review): the code using these variables is outside this section;
 * confirm their exact roles there.
 */
static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
/* Not static: visible to other translation units. */
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
203 :
204 : static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
205 : struct nvme_async_probe_ctx *ctx);
206 : static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
207 : struct nvme_async_probe_ctx *ctx);
208 : static int bdev_nvme_library_init(void);
209 : static void bdev_nvme_library_fini(void);
210 : static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
211 : struct spdk_bdev_io *bdev_io);
212 : static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
213 : struct spdk_bdev_io *bdev_io);
214 : static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
215 : void *md, uint64_t lba_count, uint64_t lba,
216 : uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
217 : struct spdk_accel_sequence *seq);
218 : static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
219 : void *md, uint64_t lba_count, uint64_t lba);
220 : static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
221 : void *md, uint64_t lba_count, uint64_t lba,
222 : uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
223 : struct spdk_accel_sequence *seq,
224 : union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
225 : static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
226 : void *md, uint64_t lba_count,
227 : uint64_t zslba, uint32_t flags);
228 : static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
229 : void *md, uint64_t lba_count, uint64_t lba,
230 : uint32_t flags);
231 : static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
232 : struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
233 : int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
234 : uint32_t flags);
235 : static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
236 : uint32_t num_zones, struct spdk_bdev_zone_info *info);
237 : static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
238 : enum spdk_bdev_zone_action action);
239 : static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
240 : struct nvme_bdev_io *bio,
241 : struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
242 : static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
243 : void *buf, size_t nbytes);
244 : static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
245 : void *buf, size_t nbytes, void *md_buf, size_t md_len);
246 : static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
247 : struct iovec *iov, int iovcnt, size_t nbytes,
248 : void *md_buf, size_t md_len);
249 : static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
250 : struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
251 : static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio);
252 : static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
253 : static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
254 : static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
255 : static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);
256 :
257 : static struct nvme_ns *nvme_ns_alloc(void);
258 : static void nvme_ns_free(struct nvme_ns *ns);
259 :
260 : static int
261 176 : nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
262 : {
263 176 : return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
264 : }
265 :
266 1089 : RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
267 :
268 : struct spdk_nvme_qpair *
269 1 : bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
270 : {
271 : struct nvme_ctrlr_channel *ctrlr_ch;
272 :
273 1 : assert(ctrlr_io_ch != NULL);
274 :
275 1 : ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
276 :
277 1 : return ctrlr_ch->qpair->qpair;
278 : }
279 :
280 : static int
281 0 : bdev_nvme_get_ctx_size(void)
282 : {
283 0 : return sizeof(struct nvme_bdev_io);
284 : }
285 :
/* Module descriptor of the "nvme" bdev module, registered with the bdev layer by
 * SPDK_BDEV_MODULE_REGISTER() below. async_fini is set; _nvme_ctrlr_delete() is the
 * place in this section that calls spdk_bdev_module_fini_done().
 */
static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
296 :
/* List of all nvme_bdev_ctrlr objects, linked through their "tailq" member. */
struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
/* Mutex held by the lookup and removal helpers in this file while they touch the list above. */
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
/* When true and the list above becomes empty, _nvme_ctrlr_delete() completes module fini. */
bool g_bdev_nvme_module_finish;
300 :
301 : struct nvme_bdev_ctrlr *
302 333 : nvme_bdev_ctrlr_get_by_name(const char *name)
303 : {
304 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
305 :
306 333 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
307 171 : if (strcmp(name, nbdev_ctrlr->name) == 0) {
308 171 : break;
309 : }
310 0 : }
311 :
312 333 : return nbdev_ctrlr;
313 : }
314 :
315 : static struct nvme_ctrlr *
316 59 : nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
317 : const struct spdk_nvme_transport_id *trid, const char *hostnqn)
318 : {
319 : const struct spdk_nvme_ctrlr_opts *opts;
320 : struct nvme_ctrlr *nvme_ctrlr;
321 :
322 100 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
323 75 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
324 75 : if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
325 34 : strcmp(hostnqn, opts->hostnqn) == 0) {
326 34 : break;
327 : }
328 41 : }
329 :
330 59 : return nvme_ctrlr;
331 : }
332 :
333 : struct nvme_ctrlr *
334 0 : nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
335 : uint16_t cntlid)
336 : {
337 : struct nvme_ctrlr *nvme_ctrlr;
338 : const struct spdk_nvme_ctrlr_data *cdata;
339 :
340 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
341 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
342 0 : if (cdata->cntlid == cntlid) {
343 0 : break;
344 : }
345 0 : }
346 :
347 0 : return nvme_ctrlr;
348 : }
349 :
350 : static struct nvme_bdev *
351 75 : nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
352 : {
353 : struct nvme_bdev *nbdev;
354 :
355 75 : pthread_mutex_lock(&g_bdev_nvme_mutex);
356 109 : TAILQ_FOREACH(nbdev, &nbdev_ctrlr->bdevs, tailq) {
357 69 : if (nbdev->nsid == nsid) {
358 35 : break;
359 : }
360 34 : }
361 75 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
362 :
363 75 : return nbdev;
364 : }
365 :
366 : struct nvme_ns *
367 145 : nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
368 : {
369 : struct nvme_ns ns;
370 :
371 145 : assert(nsid > 0);
372 :
373 145 : ns.id = nsid;
374 145 : return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
375 : }
376 :
377 : struct nvme_ns *
378 165 : nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
379 : {
380 165 : return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
381 : }
382 :
383 : struct nvme_ns *
384 74 : nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
385 : {
386 74 : if (ns == NULL) {
387 0 : return NULL;
388 : }
389 :
390 74 : return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
391 74 : }
392 :
393 : static struct nvme_ctrlr *
394 53 : nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
395 : {
396 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
397 53 : struct nvme_ctrlr *nvme_ctrlr = NULL;
398 :
399 53 : pthread_mutex_lock(&g_bdev_nvme_mutex);
400 72 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
401 19 : nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
402 19 : if (nvme_ctrlr != NULL) {
403 0 : break;
404 : }
405 19 : }
406 53 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
407 :
408 53 : return nvme_ctrlr;
409 : }
410 :
411 : struct nvme_ctrlr *
412 126 : nvme_ctrlr_get_by_name(const char *name)
413 : {
414 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
415 126 : struct nvme_ctrlr *nvme_ctrlr = NULL;
416 :
417 126 : if (name == NULL) {
418 0 : return NULL;
419 : }
420 :
421 126 : pthread_mutex_lock(&g_bdev_nvme_mutex);
422 126 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
423 126 : if (nbdev_ctrlr != NULL) {
424 60 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
425 60 : }
426 126 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
427 :
428 126 : return nvme_ctrlr;
429 126 : }
430 :
431 : void
432 0 : nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
433 : {
434 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
435 :
436 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
437 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
438 0 : fn(nbdev_ctrlr, ctx);
439 0 : }
440 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
441 0 : }
442 :
/* State of one nvme_ctrlr_for_each_channel() traversal. Allocated by
 * nvme_ctrlr_for_each_channel() and freed by nvme_ctrlr_each_channel_cpl().
 */
struct nvme_ctrlr_channel_iter {
	/* Called once per channel by nvme_ctrlr_each_channel_msg(). */
	nvme_ctrlr_for_each_channel_msg fn;
	/* Called by nvme_ctrlr_each_channel_cpl() when the traversal has finished. */
	nvme_ctrlr_for_each_channel_done cpl;
	/* Underlying iterator; refreshed before fn/cpl run and used by
	 * nvme_ctrlr_for_each_channel_continue().
	 */
	struct spdk_io_channel_iter *i;
	/* Caller context handed to both fn and cpl. */
	void *ctx;
};
449 :
450 : void
451 166 : nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status)
452 : {
453 166 : spdk_for_each_channel_continue(iter->i, status);
454 166 : }
455 :
456 : static void
457 166 : nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i)
458 : {
459 166 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
460 166 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
461 166 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
462 166 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
463 :
464 166 : iter->i = i;
465 166 : iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx);
466 166 : }
467 :
468 : static void
469 97 : nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
470 : {
471 97 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
472 97 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
473 :
474 97 : iter->i = i;
475 97 : iter->cpl(nvme_ctrlr, iter->ctx, status);
476 :
477 97 : free(iter);
478 97 : }
479 :
480 : void
481 97 : nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr,
482 : nvme_ctrlr_for_each_channel_msg fn, void *ctx,
483 : nvme_ctrlr_for_each_channel_done cpl)
484 : {
485 : struct nvme_ctrlr_channel_iter *iter;
486 :
487 97 : assert(nvme_ctrlr != NULL && fn != NULL);
488 :
489 97 : iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter));
490 97 : if (iter == NULL) {
491 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
492 0 : assert(false);
493 : return;
494 : }
495 :
496 97 : iter->fn = fn;
497 97 : iter->cpl = cpl;
498 97 : iter->ctx = ctx;
499 :
500 194 : spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg,
501 97 : iter, nvme_ctrlr_each_channel_cpl);
502 97 : }
503 :
/* State of one nvme_bdev_for_each_channel() traversal. Allocated by
 * nvme_bdev_for_each_channel() and freed by nvme_bdev_each_channel_cpl().
 */
struct nvme_bdev_channel_iter {
	/* Called once per channel by nvme_bdev_each_channel_msg(). */
	nvme_bdev_for_each_channel_msg fn;
	/* Called by nvme_bdev_each_channel_cpl() when the traversal has finished. */
	nvme_bdev_for_each_channel_done cpl;
	/* Underlying iterator; refreshed before fn/cpl run and used by
	 * nvme_bdev_for_each_channel_continue().
	 */
	struct spdk_io_channel_iter *i;
	/* Caller context handed to both fn and cpl. */
	void *ctx;
};
510 :
511 : void
512 69 : nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status)
513 : {
514 69 : spdk_for_each_channel_continue(iter->i, status);
515 69 : }
516 :
517 : static void
518 69 : nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i)
519 : {
520 69 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
521 69 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
522 69 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
523 69 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
524 :
525 69 : iter->i = i;
526 69 : iter->fn(iter, nbdev, nbdev_ch, iter->ctx);
527 69 : }
528 :
529 : static void
530 60 : nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
531 : {
532 60 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
533 60 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
534 :
535 60 : iter->i = i;
536 60 : iter->cpl(nbdev, iter->ctx, status);
537 :
538 60 : free(iter);
539 60 : }
540 :
541 : void
542 60 : nvme_bdev_for_each_channel(struct nvme_bdev *nbdev,
543 : nvme_bdev_for_each_channel_msg fn, void *ctx,
544 : nvme_bdev_for_each_channel_done cpl)
545 : {
546 : struct nvme_bdev_channel_iter *iter;
547 :
548 60 : assert(nbdev != NULL && fn != NULL);
549 :
550 60 : iter = calloc(1, sizeof(struct nvme_bdev_channel_iter));
551 60 : if (iter == NULL) {
552 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
553 0 : assert(false);
554 : return;
555 : }
556 :
557 60 : iter->fn = fn;
558 60 : iter->cpl = cpl;
559 60 : iter->ctx = ctx;
560 :
561 60 : spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter,
562 : nvme_bdev_each_channel_cpl);
563 60 : }
564 :
565 : void
566 0 : nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
567 : {
568 : const char *trtype_str;
569 : const char *adrfam_str;
570 :
571 0 : trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
572 0 : if (trtype_str) {
573 0 : spdk_json_write_named_string(w, "trtype", trtype_str);
574 0 : }
575 :
576 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
577 0 : if (adrfam_str) {
578 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
579 0 : }
580 :
581 0 : if (trid->traddr[0] != '\0') {
582 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
583 0 : }
584 :
585 0 : if (trid->trsvcid[0] != '\0') {
586 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
587 0 : }
588 :
589 0 : if (trid->subnqn[0] != '\0') {
590 0 : spdk_json_write_named_string(w, "subnqn", trid->subnqn);
591 0 : }
592 0 : }
593 :
594 : static void
595 61 : nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
596 : struct nvme_ctrlr *nvme_ctrlr)
597 : {
598 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
599 61 : pthread_mutex_lock(&g_bdev_nvme_mutex);
600 :
601 61 : TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
602 61 : if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
603 15 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
604 :
605 15 : return;
606 : }
607 46 : TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
608 :
609 46 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
610 :
611 46 : assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
612 :
613 46 : free(nbdev_ctrlr->name);
614 46 : free(nbdev_ctrlr);
615 61 : }
616 :
617 : static void
618 62 : _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
619 : {
620 : struct nvme_path_id *path_id, *tmp_path;
621 : struct nvme_ns *ns, *tmp_ns;
622 :
623 62 : free(nvme_ctrlr->copied_ana_desc);
624 62 : spdk_free(nvme_ctrlr->ana_log_page);
625 :
626 62 : if (nvme_ctrlr->opal_dev) {
627 0 : spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
628 0 : nvme_ctrlr->opal_dev = NULL;
629 0 : }
630 :
631 62 : if (nvme_ctrlr->nbdev_ctrlr) {
632 61 : nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
633 61 : }
634 :
635 62 : RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
636 0 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
637 0 : nvme_ns_free(ns);
638 0 : }
639 :
640 124 : TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
641 62 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
642 62 : free(path_id);
643 62 : }
644 :
645 62 : pthread_mutex_destroy(&nvme_ctrlr->mutex);
646 62 : spdk_keyring_put_key(nvme_ctrlr->psk);
647 62 : spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
648 62 : spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
649 62 : free(nvme_ctrlr);
650 :
651 62 : pthread_mutex_lock(&g_bdev_nvme_mutex);
652 62 : if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
653 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
654 0 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
655 0 : spdk_bdev_module_fini_done();
656 0 : return;
657 : }
658 62 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
659 62 : }
660 :
661 : static int
662 62 : nvme_detach_poller(void *arg)
663 : {
664 62 : struct nvme_ctrlr *nvme_ctrlr = arg;
665 : int rc;
666 :
667 62 : rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
668 62 : if (rc != -EAGAIN) {
669 62 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
670 62 : _nvme_ctrlr_delete(nvme_ctrlr);
671 62 : }
672 :
673 62 : return SPDK_POLLER_BUSY;
674 : }
675 :
676 : static void
677 62 : nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
678 : {
679 : int rc;
680 :
681 62 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
682 :
683 62 : if (spdk_interrupt_mode_is_enabled()) {
684 0 : spdk_interrupt_unregister(&nvme_ctrlr->intr);
685 0 : }
686 :
687 : /* First, unregister the adminq poller, as the driver will poll adminq if necessary */
688 62 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
689 :
690 : /* If we got here, the reset/detach poller cannot be active */
691 62 : assert(nvme_ctrlr->reset_detach_poller == NULL);
692 62 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
693 : nvme_ctrlr, 1000);
694 62 : if (nvme_ctrlr->reset_detach_poller == NULL) {
695 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to register detach poller\n");
696 0 : goto error;
697 : }
698 :
699 62 : rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
700 62 : if (rc != 0) {
701 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to detach the NVMe controller\n");
702 0 : goto error;
703 : }
704 :
705 62 : return;
706 : error:
707 : /* We don't have a good way to handle errors here, so just do what we can and delete the
708 : * controller without detaching the underlying NVMe device.
709 : */
710 0 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
711 0 : _nvme_ctrlr_delete(nvme_ctrlr);
712 62 : }
713 :
/* Callback passed to spdk_io_device_unregister(): 'io_device' is the nvme_ctrlr, which
 * is now deleted.
 */
static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	nvme_ctrlr_delete((struct nvme_ctrlr *)io_device);
}
721 :
722 : static void
723 61 : nvme_ctrlr_unregister(void *ctx)
724 : {
725 61 : struct nvme_ctrlr *nvme_ctrlr = ctx;
726 :
727 61 : spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
728 61 : }
729 :
730 : static bool
731 249 : nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
732 : {
733 249 : if (!nvme_ctrlr->destruct) {
734 131 : return false;
735 : }
736 :
737 118 : if (nvme_ctrlr->ref > 0) {
738 57 : return false;
739 : }
740 :
741 61 : if (nvme_ctrlr->resetting) {
742 0 : return false;
743 : }
744 :
745 61 : if (nvme_ctrlr->ana_log_page_updating) {
746 0 : return false;
747 : }
748 :
749 61 : if (nvme_ctrlr->io_path_cache_clearing) {
750 0 : return false;
751 : }
752 :
753 61 : return true;
754 249 : }
755 :
756 : static void
757 172 : nvme_ctrlr_put_ref(struct nvme_ctrlr *nvme_ctrlr)
758 : {
759 172 : pthread_mutex_lock(&nvme_ctrlr->mutex);
760 : SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);
761 :
762 172 : assert(nvme_ctrlr->ref > 0);
763 172 : nvme_ctrlr->ref--;
764 :
765 172 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
766 111 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
767 111 : return;
768 : }
769 :
770 61 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
771 :
772 61 : spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
773 172 : }
774 :
775 : static void
776 111 : nvme_ctrlr_get_ref(struct nvme_ctrlr *nvme_ctrlr)
777 : {
778 111 : pthread_mutex_lock(&nvme_ctrlr->mutex);
779 111 : nvme_ctrlr->ref++;
780 111 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
781 111 : }
782 :
783 : static void
784 259 : bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
785 : {
786 259 : nbdev_ch->current_io_path = NULL;
787 259 : nbdev_ch->rr_counter = 0;
788 259 : }
789 :
790 : static struct nvme_io_path *
791 8 : _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
792 : {
793 : struct nvme_io_path *io_path;
794 :
795 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
796 15 : if (io_path->nvme_ns == nvme_ns) {
797 7 : break;
798 : }
799 8 : }
800 :
801 8 : return io_path;
802 : }
803 :
804 : static struct nvme_io_path *
805 39 : nvme_io_path_alloc(void)
806 : {
807 : struct nvme_io_path *io_path;
808 :
809 39 : io_path = calloc(1, sizeof(*io_path));
810 39 : if (io_path == NULL) {
811 0 : SPDK_ERRLOG("Failed to alloc io_path.\n");
812 0 : return NULL;
813 : }
814 :
815 39 : if (g_opts.io_path_stat) {
816 0 : io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
817 0 : if (io_path->stat == NULL) {
818 0 : free(io_path);
819 0 : SPDK_ERRLOG("Failed to alloc io_path stat.\n");
820 0 : return NULL;
821 : }
822 0 : spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
823 0 : }
824 :
825 39 : return io_path;
826 39 : }
827 :
828 : static void
829 39 : nvme_io_path_free(struct nvme_io_path *io_path)
830 : {
831 39 : free(io_path->stat);
832 39 : free(io_path);
833 39 : }
834 :
835 : static int
836 39 : _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
837 : {
838 : struct nvme_io_path *io_path;
839 : struct spdk_io_channel *ch;
840 : struct nvme_ctrlr_channel *ctrlr_ch;
841 : struct nvme_qpair *nvme_qpair;
842 :
843 39 : io_path = nvme_io_path_alloc();
844 39 : if (io_path == NULL) {
845 0 : return -ENOMEM;
846 : }
847 :
848 39 : io_path->nvme_ns = nvme_ns;
849 :
850 39 : ch = spdk_get_io_channel(nvme_ns->ctrlr);
851 39 : if (ch == NULL) {
852 0 : nvme_io_path_free(io_path);
853 0 : SPDK_ERRLOG("Failed to alloc io_channel.\n");
854 0 : return -ENOMEM;
855 : }
856 :
857 39 : ctrlr_ch = spdk_io_channel_get_ctx(ch);
858 :
859 39 : nvme_qpair = ctrlr_ch->qpair;
860 39 : assert(nvme_qpair != NULL);
861 :
862 39 : io_path->qpair = nvme_qpair;
863 39 : TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);
864 :
865 39 : io_path->nbdev_ch = nbdev_ch;
866 39 : STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
867 :
868 39 : bdev_nvme_clear_current_io_path(nbdev_ch);
869 :
870 39 : return 0;
871 39 : }
872 :
873 : static void
874 39 : bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
875 : struct nvme_io_path *io_path)
876 : {
877 : struct nvme_bdev_io *bio;
878 :
879 40 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
880 1 : if (bio->io_path == io_path) {
881 1 : bio->io_path = NULL;
882 1 : }
883 1 : }
884 39 : }
885 :
886 : static void
887 39 : _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
888 : {
889 : struct spdk_io_channel *ch;
890 : struct nvme_qpair *nvme_qpair;
891 : struct nvme_ctrlr_channel *ctrlr_ch;
892 : struct nvme_bdev *nbdev;
893 :
894 39 : nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));
895 :
896 : /* Add the statistics to nvme_ns before this path is destroyed. */
897 39 : pthread_mutex_lock(&nbdev->mutex);
898 39 : if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
899 0 : spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
900 0 : }
901 39 : pthread_mutex_unlock(&nbdev->mutex);
902 :
903 39 : bdev_nvme_clear_current_io_path(nbdev_ch);
904 39 : bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);
905 :
906 41 : STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
907 39 : io_path->nbdev_ch = NULL;
908 :
909 39 : nvme_qpair = io_path->qpair;
910 39 : assert(nvme_qpair != NULL);
911 :
912 39 : ctrlr_ch = nvme_qpair->ctrlr_ch;
913 39 : assert(ctrlr_ch != NULL);
914 :
915 39 : ch = spdk_io_channel_from_ctx(ctrlr_ch);
916 39 : spdk_put_io_channel(ch);
917 :
918 : /* After an io_path is removed, I/Os submitted to it may complete and update statistics
919 : * of the io_path. To avoid heap-use-after-free error from this case, do not free the
920 : * io_path here but free the io_path when the associated qpair is freed. It is ensured
921 : * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
922 : */
923 39 : }
924 :
925 : static void
926 26 : _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
927 : {
928 : struct nvme_io_path *io_path, *tmp_io_path;
929 :
930 63 : STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
931 37 : _bdev_nvme_delete_io_path(nbdev_ch, io_path);
932 37 : }
933 26 : }
934 :
935 : static int
936 26 : bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
937 : {
938 26 : struct nvme_bdev_channel *nbdev_ch = ctx_buf;
939 26 : struct nvme_bdev *nbdev = io_device;
940 : struct nvme_ns *nvme_ns;
941 : int rc;
942 :
943 26 : STAILQ_INIT(&nbdev_ch->io_path_list);
944 26 : TAILQ_INIT(&nbdev_ch->retry_io_list);
945 :
946 26 : pthread_mutex_lock(&nbdev->mutex);
947 :
948 26 : nbdev_ch->mp_policy = nbdev->mp_policy;
949 26 : nbdev_ch->mp_selector = nbdev->mp_selector;
950 26 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
951 :
952 63 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
953 37 : rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
954 37 : if (rc != 0) {
955 0 : pthread_mutex_unlock(&nbdev->mutex);
956 :
957 0 : _bdev_nvme_delete_io_paths(nbdev_ch);
958 0 : return rc;
959 : }
960 37 : }
961 26 : pthread_mutex_unlock(&nbdev->mutex);
962 :
963 26 : return 0;
964 26 : }
965 :
966 : /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
967 : * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
968 : */
969 : static inline void
970 58 : __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
971 : const struct spdk_nvme_cpl *cpl)
972 : {
973 58 : spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
974 : (uintptr_t)bdev_io);
975 58 : if (cpl) {
976 29 : spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
977 29 : } else {
978 29 : spdk_bdev_io_complete(bdev_io, status);
979 : }
980 58 : }
981 :
982 : static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);
983 :
/* Channel-destroy callback for an nvme_bdev io_device.
 *
 * Queued retry I/Os are aborted first, then all io_paths of the channel are deleted.
 * 'io_device' is not used.
 */
static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *channel = ctx_buf;

	bdev_nvme_abort_retry_ios(channel);
	_bdev_nvme_delete_io_paths(channel);
}
992 :
993 : static inline bool
994 62 : bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
995 : {
996 62 : switch (io_type) {
997 : case SPDK_BDEV_IO_TYPE_RESET:
998 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
999 : case SPDK_BDEV_IO_TYPE_ABORT:
1000 5 : return true;
1001 : default:
1002 57 : break;
1003 : }
1004 :
1005 57 : return false;
1006 62 : }
1007 :
1008 : static inline bool
1009 98 : nvme_ns_is_active(struct nvme_ns *nvme_ns)
1010 : {
1011 98 : if (spdk_unlikely(nvme_ns->ana_state_updating)) {
1012 1 : return false;
1013 : }
1014 :
1015 97 : if (spdk_unlikely(nvme_ns->ns == NULL)) {
1016 0 : return false;
1017 : }
1018 :
1019 97 : return true;
1020 98 : }
1021 :
1022 : static inline bool
1023 86 : nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
1024 : {
1025 86 : if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
1026 1 : return false;
1027 : }
1028 :
1029 85 : switch (nvme_ns->ana_state) {
1030 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1031 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1032 76 : return true;
1033 : default:
1034 9 : break;
1035 : }
1036 :
1037 9 : return false;
1038 86 : }
1039 :
1040 : static inline bool
1041 128 : nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
1042 : {
1043 128 : if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
1044 23 : return false;
1045 : }
1046 :
1047 105 : if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
1048 : SPDK_NVME_QPAIR_FAILURE_NONE)) {
1049 2 : return false;
1050 : }
1051 :
1052 103 : if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
1053 0 : return false;
1054 : }
1055 :
1056 103 : return true;
1057 128 : }
1058 :
1059 : static inline bool
1060 102 : nvme_io_path_is_available(struct nvme_io_path *io_path)
1061 : {
1062 102 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1063 16 : return false;
1064 : }
1065 :
1066 86 : if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
1067 10 : return false;
1068 : }
1069 :
1070 76 : return true;
1071 102 : }
1072 :
1073 : static inline bool
1074 9 : nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
1075 : {
1076 9 : if (nvme_ctrlr->destruct) {
1077 0 : return true;
1078 : }
1079 :
1080 9 : if (nvme_ctrlr->fast_io_fail_timedout) {
1081 2 : return true;
1082 : }
1083 :
1084 7 : if (nvme_ctrlr->resetting) {
1085 5 : if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
1086 5 : return false;
1087 : } else {
1088 0 : return true;
1089 : }
1090 : }
1091 :
1092 2 : if (nvme_ctrlr->reconnect_is_delayed) {
1093 2 : return false;
1094 : }
1095 :
1096 0 : if (nvme_ctrlr->disabled) {
1097 0 : return true;
1098 : }
1099 :
1100 0 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1101 0 : return true;
1102 : } else {
1103 0 : return false;
1104 : }
1105 9 : }
1106 :
1107 : static bool
1108 20 : nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
1109 : {
1110 20 : if (nvme_ctrlr->destruct) {
1111 0 : return false;
1112 : }
1113 :
1114 20 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1115 3 : return false;
1116 : }
1117 :
1118 17 : if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
1119 1 : return false;
1120 : }
1121 :
1122 16 : if (nvme_ctrlr->disabled) {
1123 0 : return false;
1124 : }
1125 :
1126 16 : return true;
1127 20 : }
1128 :
1129 : /* Simulate circular linked list. */
1130 : static inline struct nvme_io_path *
1131 99 : nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
1132 : {
1133 : struct nvme_io_path *next_path;
1134 :
1135 99 : if (prev_path != NULL) {
1136 39 : next_path = STAILQ_NEXT(prev_path, stailq);
1137 39 : if (next_path != NULL) {
1138 14 : return next_path;
1139 : }
1140 25 : }
1141 :
1142 85 : return STAILQ_FIRST(&nbdev_ch->io_path_list);
1143 99 : }
1144 :
1145 : static struct nvme_io_path *
1146 67 : _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1147 : {
1148 67 : struct nvme_io_path *io_path, *start, *non_optimized = NULL;
1149 :
1150 67 : start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);
1151 :
1152 67 : io_path = start;
1153 67 : do {
1154 79 : if (spdk_likely(nvme_io_path_is_available(io_path))) {
1155 57 : switch (io_path->nvme_ns->ana_state) {
1156 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1157 47 : nbdev_ch->current_io_path = io_path;
1158 47 : return io_path;
1159 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1160 10 : if (non_optimized == NULL) {
1161 7 : non_optimized = io_path;
1162 7 : }
1163 10 : break;
1164 : default:
1165 0 : assert(false);
1166 : break;
1167 : }
1168 10 : }
1169 32 : io_path = nvme_io_path_get_next(nbdev_ch, io_path);
1170 32 : } while (io_path != start);
1171 :
1172 20 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
1173 : /* We come here only if there is no optimized path. Cache even non_optimized
1174 : * path for load balance across multiple non_optimized paths.
1175 : */
1176 1 : nbdev_ch->current_io_path = non_optimized;
1177 1 : }
1178 :
1179 20 : return non_optimized;
1180 67 : }
1181 :
1182 : static struct nvme_io_path *
1183 4 : _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
1184 : {
1185 : struct nvme_io_path *io_path;
1186 4 : struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
1187 4 : uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
1188 : uint32_t num_outstanding_reqs;
1189 :
1190 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1191 12 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1192 : /* The device is currently resetting. */
1193 0 : continue;
1194 : }
1195 :
1196 12 : if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
1197 0 : continue;
1198 : }
1199 :
1200 12 : num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
1201 12 : switch (io_path->nvme_ns->ana_state) {
1202 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1203 6 : if (num_outstanding_reqs < opt_min_qd) {
1204 5 : opt_min_qd = num_outstanding_reqs;
1205 5 : optimized = io_path;
1206 5 : }
1207 6 : break;
1208 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1209 3 : if (num_outstanding_reqs < non_opt_min_qd) {
1210 3 : non_opt_min_qd = num_outstanding_reqs;
1211 3 : non_optimized = io_path;
1212 3 : }
1213 3 : break;
1214 : default:
1215 3 : break;
1216 : }
1217 12 : }
1218 :
1219 : /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
1220 4 : if (optimized != NULL) {
1221 3 : return optimized;
1222 : }
1223 :
1224 1 : return non_optimized;
1225 4 : }
1226 :
1227 : static inline struct nvme_io_path *
1228 105 : bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1229 : {
1230 105 : if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
1231 41 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
1232 31 : return nbdev_ch->current_io_path;
1233 10 : } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1234 10 : if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
1235 3 : return nbdev_ch->current_io_path;
1236 : }
1237 7 : nbdev_ch->rr_counter = 0;
1238 7 : }
1239 7 : }
1240 :
1241 71 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
1242 14 : nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1243 67 : return _bdev_nvme_find_io_path(nbdev_ch);
1244 : } else {
1245 4 : return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
1246 : }
1247 105 : }
1248 :
1249 : /* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
1250 : * or false otherwise.
1251 : *
1252 : * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
1253 : * is likely to be non-accessible now but may become accessible.
1254 : *
1255 : * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
1256 : * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
1257 : * when starting to reset it but it is set to failed when the reset failed. Hence, if
1258 : * a ctrlr is unfailed, it is likely that it works fine or is resetting.
1259 : */
1260 : static bool
1261 15 : any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
1262 : {
1263 : struct nvme_io_path *io_path;
1264 :
1265 15 : if (nbdev_ch->resetting) {
1266 1 : return false;
1267 : }
1268 :
1269 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1270 14 : if (io_path->nvme_ns->ana_transition_timedout) {
1271 0 : continue;
1272 : }
1273 :
1274 14 : if (nvme_qpair_is_connected(io_path->qpair) ||
1275 9 : !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
1276 12 : return true;
1277 : }
1278 2 : }
1279 :
1280 2 : return false;
1281 15 : }
1282 :
1283 : static void
1284 14 : bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
1285 : {
1286 14 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1287 : struct spdk_io_channel *ch;
1288 :
1289 14 : if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
1290 3 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
1291 3 : } else {
1292 11 : ch = spdk_io_channel_from_ctx(nbdev_ch);
1293 11 : bdev_nvme_submit_request(ch, bdev_io);
1294 : }
1295 14 : }
1296 :
1297 : static int
1298 14 : bdev_nvme_retry_ios(void *arg)
1299 : {
1300 14 : struct nvme_bdev_channel *nbdev_ch = arg;
1301 : struct nvme_bdev_io *bio, *tmp_bio;
1302 : uint64_t now, delay_us;
1303 :
1304 14 : now = spdk_get_ticks();
1305 :
1306 28 : TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
1307 15 : if (bio->retry_ticks > now) {
1308 1 : break;
1309 : }
1310 :
1311 14 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1312 :
1313 14 : bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
1314 14 : }
1315 :
1316 14 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1317 :
1318 14 : bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
1319 14 : if (bio != NULL) {
1320 4 : delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
1321 :
1322 4 : nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
1323 : delay_us);
1324 4 : }
1325 :
1326 14 : return SPDK_POLLER_BUSY;
1327 : }
1328 :
1329 : static void
1330 16 : bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
1331 : struct nvme_bdev_io *bio, uint64_t delay_ms)
1332 : {
1333 : struct nvme_bdev_io *tmp_bio;
1334 :
1335 16 : bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
1336 :
1337 16 : TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
1338 1 : if (tmp_bio->retry_ticks <= bio->retry_ticks) {
1339 1 : TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
1340 : retry_link);
1341 1 : return;
1342 : }
1343 0 : }
1344 :
1345 : /* No earlier I/Os were found. This I/O must be the new head. */
1346 15 : TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);
1347 :
1348 15 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1349 :
1350 15 : nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
1351 : delay_ms * 1000ULL);
1352 16 : }
1353 :
1354 : static void
1355 58 : bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
1356 : {
1357 : struct nvme_bdev_io *bio, *tmp_bio;
1358 :
1359 59 : TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
1360 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1361 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1362 1 : }
1363 :
1364 58 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1365 58 : }
1366 :
1367 : static int
1368 6 : bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
1369 : struct nvme_bdev_io *bio_to_abort)
1370 : {
1371 : struct nvme_bdev_io *bio;
1372 :
1373 6 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
1374 1 : if (bio == bio_to_abort) {
1375 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1376 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1377 1 : return 0;
1378 : }
1379 0 : }
1380 :
1381 5 : return -ENOENT;
1382 6 : }
1383 :
1384 : static void
1385 12 : bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
1386 : {
1387 : struct nvme_bdev *nbdev;
1388 : uint16_t sct, sc;
1389 :
1390 12 : assert(spdk_nvme_cpl_is_error(cpl));
1391 :
1392 12 : nbdev = bdev_io->bdev->ctxt;
1393 :
1394 12 : if (nbdev->err_stat == NULL) {
1395 12 : return;
1396 : }
1397 :
1398 0 : sct = cpl->status.sct;
1399 0 : sc = cpl->status.sc;
1400 :
1401 0 : pthread_mutex_lock(&nbdev->mutex);
1402 :
1403 0 : nbdev->err_stat->status_type[sct]++;
1404 0 : switch (sct) {
1405 : case SPDK_NVME_SCT_GENERIC:
1406 : case SPDK_NVME_SCT_COMMAND_SPECIFIC:
1407 : case SPDK_NVME_SCT_MEDIA_ERROR:
1408 : case SPDK_NVME_SCT_PATH:
1409 0 : nbdev->err_stat->status[sct][sc]++;
1410 0 : break;
1411 : default:
1412 0 : break;
1413 : }
1414 :
1415 0 : pthread_mutex_unlock(&nbdev->mutex);
1416 12 : }
1417 :
1418 : static inline void
1419 20 : bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
1420 : {
1421 20 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1422 20 : uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
1423 20 : uint32_t blocklen = bdev_io->bdev->blocklen;
1424 : struct spdk_bdev_io_stat *stat;
1425 : uint64_t tsc_diff;
1426 :
1427 20 : if (bio->io_path->stat == NULL) {
1428 20 : return;
1429 : }
1430 :
1431 0 : tsc_diff = spdk_get_ticks() - bio->submit_tsc;
1432 0 : stat = bio->io_path->stat;
1433 :
1434 0 : switch (bdev_io->type) {
1435 : case SPDK_BDEV_IO_TYPE_READ:
1436 0 : stat->bytes_read += num_blocks * blocklen;
1437 0 : stat->num_read_ops++;
1438 0 : stat->read_latency_ticks += tsc_diff;
1439 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1440 0 : stat->max_read_latency_ticks = tsc_diff;
1441 0 : }
1442 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1443 0 : stat->min_read_latency_ticks = tsc_diff;
1444 0 : }
1445 0 : break;
1446 : case SPDK_BDEV_IO_TYPE_WRITE:
1447 0 : stat->bytes_written += num_blocks * blocklen;
1448 0 : stat->num_write_ops++;
1449 0 : stat->write_latency_ticks += tsc_diff;
1450 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1451 0 : stat->max_write_latency_ticks = tsc_diff;
1452 0 : }
1453 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1454 0 : stat->min_write_latency_ticks = tsc_diff;
1455 0 : }
1456 0 : break;
1457 : case SPDK_BDEV_IO_TYPE_UNMAP:
1458 0 : stat->bytes_unmapped += num_blocks * blocklen;
1459 0 : stat->num_unmap_ops++;
1460 0 : stat->unmap_latency_ticks += tsc_diff;
1461 0 : if (stat->max_unmap_latency_ticks < tsc_diff) {
1462 0 : stat->max_unmap_latency_ticks = tsc_diff;
1463 0 : }
1464 0 : if (stat->min_unmap_latency_ticks > tsc_diff) {
1465 0 : stat->min_unmap_latency_ticks = tsc_diff;
1466 0 : }
1467 0 : break;
1468 : case SPDK_BDEV_IO_TYPE_ZCOPY:
1469 : /* Track the data in the start phase only */
1470 0 : if (!bdev_io->u.bdev.zcopy.start) {
1471 0 : break;
1472 : }
1473 0 : if (bdev_io->u.bdev.zcopy.populate) {
1474 0 : stat->bytes_read += num_blocks * blocklen;
1475 0 : stat->num_read_ops++;
1476 0 : stat->read_latency_ticks += tsc_diff;
1477 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1478 0 : stat->max_read_latency_ticks = tsc_diff;
1479 0 : }
1480 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1481 0 : stat->min_read_latency_ticks = tsc_diff;
1482 0 : }
1483 0 : } else {
1484 0 : stat->bytes_written += num_blocks * blocklen;
1485 0 : stat->num_write_ops++;
1486 0 : stat->write_latency_ticks += tsc_diff;
1487 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1488 0 : stat->max_write_latency_ticks = tsc_diff;
1489 0 : }
1490 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1491 0 : stat->min_write_latency_ticks = tsc_diff;
1492 0 : }
1493 : }
1494 0 : break;
1495 : case SPDK_BDEV_IO_TYPE_COPY:
1496 0 : stat->bytes_copied += num_blocks * blocklen;
1497 0 : stat->num_copy_ops++;
1498 0 : stat->copy_latency_ticks += tsc_diff;
1499 0 : if (stat->max_copy_latency_ticks < tsc_diff) {
1500 0 : stat->max_copy_latency_ticks = tsc_diff;
1501 0 : }
1502 0 : if (stat->min_copy_latency_ticks > tsc_diff) {
1503 0 : stat->min_copy_latency_ticks = tsc_diff;
1504 0 : }
1505 0 : break;
1506 : default:
1507 0 : break;
1508 : }
1509 20 : }
1510 :
1511 : static bool
1512 11 : bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
1513 : const struct spdk_nvme_cpl *cpl,
1514 : struct nvme_bdev_channel *nbdev_ch,
1515 : uint64_t *_delay_ms)
1516 : {
1517 11 : struct nvme_io_path *io_path = bio->io_path;
1518 11 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
1519 : const struct spdk_nvme_ctrlr_data *cdata;
1520 :
1521 15 : if (spdk_nvme_cpl_is_path_error(cpl) ||
1522 5 : spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
1523 0 : !nvme_io_path_is_available(io_path) ||
1524 4 : !nvme_ctrlr_is_available(nvme_ctrlr)) {
1525 15 : bdev_nvme_clear_current_io_path(nbdev_ch);
1526 15 : bio->io_path = NULL;
1527 15 : if (spdk_nvme_cpl_is_ana_error(cpl)) {
1528 1 : if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
1529 1 : io_path->nvme_ns->ana_state_updating = true;
1530 1 : }
1531 1 : }
1532 3 : if (!any_io_path_may_become_available(nbdev_ch)) {
1533 0 : return false;
1534 : }
1535 3 : *_delay_ms = 0;
1536 3 : } else {
1537 4 : bio->retry_count++;
1538 :
1539 4 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
1540 :
1541 4 : if (cpl->status.crd != 0) {
1542 1 : *_delay_ms = cdata->crdt[cpl->status.crd] * 100;
1543 1 : } else {
1544 3 : *_delay_ms = 0;
1545 : }
1546 : }
1547 :
1548 7 : return true;
1549 7 : }
1550 :
1551 : static inline void
1552 40 : bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
1553 : const struct spdk_nvme_cpl *cpl)
1554 : {
1555 40 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1556 : struct nvme_bdev_channel *nbdev_ch;
1557 : uint64_t delay_ms;
1558 :
1559 40 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1560 :
1561 40 : if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
1562 20 : bdev_nvme_update_io_path_stat(bio);
1563 20 : goto complete;
1564 : }
1565 :
1566 : /* Update error counts before deciding if retry is needed.
1567 : * Hence, error counts may be more than the number of I/O errors.
1568 : */
1569 20 : bdev_nvme_update_nvme_error_stat(bdev_io, cpl);
1570 :
1571 27 : if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
1572 2 : (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
1573 23 : goto complete;
1574 : }
1575 :
1576 : /* At this point we don't know whether the sequence was successfully executed or not, so we
1577 : * cannot retry the IO */
1578 7 : if (bdev_io->u.bdev.accel_sequence != NULL) {
1579 0 : goto complete;
1580 : }
1581 :
1582 7 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1583 :
1584 7 : if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
1585 7 : bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
1586 7 : return;
1587 : }
1588 :
1589 : complete:
1590 25 : bio->retry_count = 0;
1591 25 : bio->submit_tsc = 0;
1592 25 : bdev_io->u.bdev.accel_sequence = NULL;
1593 25 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
1594 32 : }
1595 :
1596 : static inline void
1597 13 : bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
1598 : {
1599 13 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1600 : struct nvme_bdev_channel *nbdev_ch;
1601 : enum spdk_bdev_io_status io_status;
1602 :
1603 13 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1604 :
1605 13 : switch (rc) {
1606 : case 0:
1607 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1608 1 : break;
1609 : case -ENOMEM:
1610 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1611 0 : break;
1612 : case -ENXIO:
1613 15 : if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
1614 12 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1615 :
1616 12 : bdev_nvme_clear_current_io_path(nbdev_ch);
1617 12 : bio->io_path = NULL;
1618 :
1619 12 : if (any_io_path_may_become_available(nbdev_ch)) {
1620 9 : bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1621 9 : return;
1622 : }
1623 3 : }
1624 :
1625 : /* fallthrough */
1626 : default:
1627 3 : spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
1628 3 : bdev_io->u.bdev.accel_sequence = NULL;
1629 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1630 3 : break;
1631 : }
1632 :
1633 4 : bio->retry_count = 0;
1634 4 : bio->submit_tsc = 0;
1635 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1636 13 : }
1637 :
1638 : static inline void
1639 4 : bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
1640 : {
1641 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1642 : enum spdk_bdev_io_status io_status;
1643 :
1644 4 : switch (rc) {
1645 : case 0:
1646 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1647 1 : break;
1648 : case -ENOMEM:
1649 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1650 0 : break;
1651 1 : case -ENXIO:
1652 : /* fallthrough */
1653 : default:
1654 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1655 3 : break;
1656 : }
1657 :
1658 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1659 4 : }
1660 :
1661 : static void
1662 3 : bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr,
1663 : void *ctx, int status)
1664 : {
1665 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1666 :
1667 3 : assert(nvme_ctrlr->io_path_cache_clearing == true);
1668 3 : nvme_ctrlr->io_path_cache_clearing = false;
1669 :
1670 3 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
1671 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1672 3 : return;
1673 : }
1674 :
1675 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1676 :
1677 0 : nvme_ctrlr_unregister(nvme_ctrlr);
1678 3 : }
1679 :
1680 : static void
1681 416 : _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
1682 : {
1683 : struct nvme_io_path *io_path;
1684 :
1685 651 : TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
1686 235 : if (io_path->nbdev_ch == NULL) {
1687 72 : continue;
1688 : }
1689 163 : bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
1690 163 : }
1691 416 : }
1692 :
1693 : static void
1694 1 : bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i,
1695 : struct nvme_ctrlr *nvme_ctrlr,
1696 : struct nvme_ctrlr_channel *ctrlr_ch,
1697 : void *ctx)
1698 : {
1699 1 : assert(ctrlr_ch->qpair != NULL);
1700 :
1701 1 : _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);
1702 :
1703 1 : nvme_ctrlr_for_each_channel_continue(i, 0);
1704 1 : }
1705 :
1706 : static void
1707 3 : bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
1708 : {
1709 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1710 3 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
1711 3 : nvme_ctrlr->io_path_cache_clearing) {
1712 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1713 0 : return;
1714 : }
1715 :
1716 3 : nvme_ctrlr->io_path_cache_clearing = true;
1717 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1718 :
1719 3 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
1720 : bdev_nvme_clear_io_path_cache,
1721 : NULL,
1722 : bdev_nvme_clear_io_path_caches_done);
1723 3 : }
1724 :
1725 : static struct nvme_qpair *
1726 121 : nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
1727 : {
1728 : struct nvme_qpair *nvme_qpair;
1729 :
1730 138 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1731 138 : if (nvme_qpair->qpair == qpair) {
1732 121 : break;
1733 : }
1734 17 : }
1735 :
1736 121 : return nvme_qpair;
1737 : }
1738 :
1739 : static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);
1740 :
1741 : static void
1742 121 : bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
1743 : {
1744 121 : struct nvme_poll_group *group = poll_group_ctx;
1745 : struct nvme_qpair *nvme_qpair;
1746 : struct nvme_ctrlr *nvme_ctrlr;
1747 : struct nvme_ctrlr_channel *ctrlr_ch;
1748 : int status;
1749 :
1750 121 : nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
1751 121 : if (nvme_qpair == NULL) {
1752 0 : return;
1753 : }
1754 :
1755 121 : if (nvme_qpair->qpair != NULL) {
1756 121 : spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
1757 121 : nvme_qpair->qpair = NULL;
1758 121 : }
1759 :
1760 121 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1761 :
1762 121 : nvme_ctrlr = nvme_qpair->ctrlr;
1763 121 : ctrlr_ch = nvme_qpair->ctrlr_ch;
1764 :
1765 121 : if (ctrlr_ch != NULL) {
1766 74 : if (ctrlr_ch->reset_iter != NULL) {
1767 : /* We are in a full reset sequence. */
1768 69 : if (ctrlr_ch->connect_poller != NULL) {
1769 : /* qpair was failed to connect. Abort the reset sequence. */
1770 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr,
1771 : "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
1772 : qpair);
1773 0 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
1774 0 : status = -1;
1775 0 : } else {
1776 : /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */
1777 69 : NVME_CTRLR_INFOLOG(nvme_ctrlr,
1778 : "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
1779 : qpair);
1780 69 : status = 0;
1781 : }
1782 69 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status);
1783 69 : ctrlr_ch->reset_iter = NULL;
1784 69 : } else {
1785 : /* qpair was disconnected unexpectedly. Reset controller for recovery. */
1786 5 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. reset controller.\n",
1787 : qpair);
1788 5 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
1789 : }
1790 74 : } else {
1791 : /* In this case, ctrlr_channel is already deleted. */
1792 47 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. delete nvme_qpair.\n",
1793 : qpair);
1794 47 : nvme_qpair_delete(nvme_qpair);
1795 : }
1796 121 : }
1797 :
1798 : static void
1799 0 : bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
1800 : {
1801 : struct nvme_qpair *nvme_qpair;
1802 :
1803 0 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1804 0 : if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
1805 0 : continue;
1806 : }
1807 :
1808 0 : if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
1809 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1810 0 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1811 0 : }
1812 0 : }
1813 0 : }
1814 :
1815 : static int
1816 1209 : bdev_nvme_poll(void *arg)
1817 : {
1818 1209 : struct nvme_poll_group *group = arg;
1819 : int64_t num_completions;
1820 :
1821 1209 : if (group->collect_spin_stat && group->start_ticks == 0) {
1822 0 : group->start_ticks = spdk_get_ticks();
1823 0 : }
1824 :
1825 1209 : num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1826 : bdev_nvme_disconnected_qpair_cb);
1827 1209 : if (group->collect_spin_stat) {
1828 0 : if (num_completions > 0) {
1829 0 : if (group->end_ticks != 0) {
1830 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
1831 0 : group->end_ticks = 0;
1832 0 : }
1833 0 : group->start_ticks = 0;
1834 0 : } else {
1835 0 : group->end_ticks = spdk_get_ticks();
1836 : }
1837 0 : }
1838 :
1839 1209 : if (spdk_unlikely(num_completions < 0)) {
1840 0 : bdev_nvme_check_io_qpairs(group);
1841 0 : }
1842 :
1843 1209 : return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1844 : }
1845 :
1846 : static int bdev_nvme_poll_adminq(void *arg);
1847 :
1848 : static void
1849 142 : bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
1850 : {
1851 142 : if (spdk_interrupt_mode_is_enabled()) {
1852 0 : return;
1853 : }
1854 :
1855 142 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
1856 :
1857 142 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
1858 : nvme_ctrlr, new_period_us);
1859 142 : }
1860 :
1861 : static int
1862 191 : bdev_nvme_poll_adminq(void *arg)
1863 : {
1864 : int32_t rc;
1865 191 : struct nvme_ctrlr *nvme_ctrlr = arg;
1866 : nvme_ctrlr_disconnected_cb disconnected_cb;
1867 :
1868 191 : assert(nvme_ctrlr != NULL);
1869 :
1870 191 : rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1871 191 : if (rc < 0) {
1872 86 : disconnected_cb = nvme_ctrlr->disconnected_cb;
1873 86 : nvme_ctrlr->disconnected_cb = NULL;
1874 :
1875 86 : if (disconnected_cb != NULL) {
1876 142 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
1877 71 : g_opts.nvme_adminq_poll_period_us);
1878 71 : disconnected_cb(nvme_ctrlr);
1879 71 : } else {
1880 15 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
1881 : }
1882 191 : } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
1883 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1884 0 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
1885 0 : }
1886 :
1887 191 : return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1888 : }
1889 :
1890 : static void
1891 39 : nvme_bdev_free(void *io_device)
1892 : {
1893 39 : struct nvme_bdev *nbdev = io_device;
1894 :
1895 39 : pthread_mutex_destroy(&nbdev->mutex);
1896 39 : free(nbdev->disk.name);
1897 39 : free(nbdev->err_stat);
1898 39 : free(nbdev);
1899 39 : }
1900 :
1901 : static int
1902 38 : bdev_nvme_destruct(void *ctx)
1903 : {
1904 38 : struct nvme_bdev *nbdev = ctx;
1905 : struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1906 :
1907 : SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nbdev->nbdev_ctrlr->name, nbdev->nsid);
1908 :
1909 38 : pthread_mutex_lock(&nbdev->mutex);
1910 :
1911 77 : TAILQ_FOREACH_SAFE(nvme_ns, &nbdev->nvme_ns_list, tailq, tmp_nvme_ns) {
1912 39 : pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1913 :
1914 39 : nvme_ns->bdev = NULL;
1915 :
1916 39 : assert(nvme_ns->id > 0);
1917 :
1918 39 : if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1919 0 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1920 :
1921 0 : nvme_ctrlr_put_ref(nvme_ns->ctrlr);
1922 0 : nvme_ns_free(nvme_ns);
1923 0 : } else {
1924 39 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1925 : }
1926 39 : }
1927 :
1928 38 : pthread_mutex_unlock(&nbdev->mutex);
1929 :
1930 38 : pthread_mutex_lock(&g_bdev_nvme_mutex);
1931 38 : TAILQ_REMOVE(&nbdev->nbdev_ctrlr->bdevs, nbdev, tailq);
1932 38 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
1933 :
1934 38 : spdk_io_device_unregister(nbdev, nvme_bdev_free);
1935 :
1936 38 : return 0;
1937 : }
1938 :
1939 : static int
1940 122 : bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
1941 : {
1942 : struct nvme_ctrlr *nvme_ctrlr;
1943 : struct spdk_nvme_io_qpair_opts opts;
1944 : struct spdk_nvme_qpair *qpair;
1945 : int rc;
1946 :
1947 122 : nvme_ctrlr = nvme_qpair->ctrlr;
1948 :
1949 122 : spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1950 122 : opts.create_only = true;
1951 : /* In interrupt mode qpairs must be created in sync mode, else it will never be connected.
1952 : * delay_cmd_submit must be false as in interrupt mode requests cannot be submitted in
1953 : * completion context.
1954 : */
1955 122 : if (!spdk_interrupt_mode_is_enabled()) {
1956 122 : opts.async_mode = true;
1957 122 : opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1958 122 : }
1959 122 : opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1960 122 : g_opts.io_queue_requests = opts.io_queue_requests;
1961 :
1962 122 : qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1963 122 : if (qpair == NULL) {
1964 0 : return -1;
1965 : }
1966 :
1967 : SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
1968 : spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));
1969 :
1970 122 : assert(nvme_qpair->group != NULL);
1971 :
1972 122 : rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
1973 122 : if (rc != 0) {
1974 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to begin polling on NVMe Channel.\n");
1975 0 : goto err;
1976 : }
1977 :
1978 122 : rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1979 122 : if (rc != 0) {
1980 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to connect I/O qpair.\n");
1981 0 : goto err;
1982 : }
1983 :
1984 122 : nvme_qpair->qpair = qpair;
1985 :
1986 122 : if (!g_opts.disable_auto_failback) {
1987 85 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1988 85 : }
1989 :
1990 122 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Connecting qpair %p:%u started.\n",
1991 : qpair, spdk_nvme_qpair_get_id(qpair));
1992 :
1993 122 : return 0;
1994 :
1995 : err:
1996 0 : spdk_nvme_ctrlr_free_io_qpair(qpair);
1997 :
1998 0 : return rc;
1999 122 : }
2000 :
2001 : static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);
2002 :
2003 : static void
2004 71 : bdev_nvme_complete_pending_resets(struct nvme_ctrlr *nvme_ctrlr, bool success)
2005 : {
2006 71 : int rc = 0;
2007 : struct nvme_bdev_io *bio;
2008 :
2009 71 : if (!success) {
2010 33 : rc = -1;
2011 33 : }
2012 :
2013 83 : while (!TAILQ_EMPTY(&nvme_ctrlr->pending_resets)) {
2014 12 : bio = TAILQ_FIRST(&nvme_ctrlr->pending_resets);
2015 12 : TAILQ_REMOVE(&nvme_ctrlr->pending_resets, bio, retry_link);
2016 :
2017 12 : bdev_nvme_reset_io_continue(bio, rc);
2018 : }
2019 71 : }
2020 :
2021 : /* This function marks the current trid as failed by storing the current ticks
2022 : * and then sets the next trid to the active trid within a controller if exists.
2023 : *
2024 : * The purpose of the boolean return value is to request the caller to disconnect
2025 : * the current trid now to try connecting the next trid.
2026 : */
2027 : static bool
2028 62 : bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
2029 : {
2030 : struct nvme_path_id *path_id, *next_path;
2031 : int rc __attribute__((unused));
2032 :
2033 62 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
2034 62 : assert(path_id);
2035 62 : assert(path_id == nvme_ctrlr->active_path_id);
2036 62 : next_path = TAILQ_NEXT(path_id, link);
2037 :
2038 : /* Update the last failed time. It means the trid is failed if its last
2039 : * failed time is non-zero.
2040 : */
2041 62 : path_id->last_failed_tsc = spdk_get_ticks();
2042 :
2043 62 : if (next_path == NULL) {
2044 : /* There is no alternate trid within a controller. */
2045 51 : return false;
2046 : }
2047 :
2048 11 : if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
2049 : /* Connect is not retried in a controller reset sequence. Connecting
2050 : * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
2051 : */
2052 3 : return false;
2053 : }
2054 :
2055 8 : assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
2056 :
2057 8 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Start failover from %s:%s to %s:%s\n",
2058 : path_id->trid.traddr, path_id->trid.trsvcid,
2059 : next_path->trid.traddr, next_path->trid.trsvcid);
2060 :
2061 8 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
2062 8 : nvme_ctrlr->active_path_id = next_path;
2063 8 : rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
2064 8 : assert(rc == 0);
2065 8 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
2066 8 : if (!remove) {
2067 : /** Shuffle the old trid to the end of the list and use the new one.
2068 : * Allows for round robin through multiple connections.
2069 : */
2070 6 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
2071 6 : } else {
2072 2 : free(path_id);
2073 : }
2074 :
2075 8 : if (start || next_path->last_failed_tsc == 0) {
2076 : /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed
2077 : * or used yet. Try the next trid now.
2078 : */
2079 7 : return true;
2080 : }
2081 :
2082 2 : if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
2083 1 : nvme_ctrlr->opts.reconnect_delay_sec) {
2084 : /* Enough backoff passed since the next trid failed. Try the next trid now. */
2085 0 : return true;
2086 : }
2087 :
2088 : /* The next trid will be tried after reconnect_delay_sec seconds. */
2089 1 : return false;
2090 62 : }
2091 :
2092 : static bool
2093 89 : bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
2094 : {
2095 : int32_t elapsed;
2096 :
2097 89 : if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
2098 37 : nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
2099 63 : return false;
2100 : }
2101 :
2102 26 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2103 26 : if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
2104 6 : return true;
2105 : } else {
2106 20 : return false;
2107 : }
2108 89 : }
2109 :
2110 : static bool
2111 12 : bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
2112 : {
2113 : uint32_t elapsed;
2114 :
2115 12 : if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
2116 8 : return false;
2117 : }
2118 :
2119 4 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2120 4 : if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
2121 2 : return true;
2122 : } else {
2123 2 : return false;
2124 : }
2125 12 : }
2126 :
2127 : static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);
2128 :
2129 : static void
2130 72 : nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
2131 : {
2132 : int rc;
2133 :
2134 72 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting ctrlr.\n");
2135 :
2136 72 : rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
2137 72 : if (rc != 0) {
2138 1 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "disconnecting ctrlr failed.\n");
2139 :
2140 : /* Disconnect fails if ctrlr is already resetting or removed. In this case,
2141 : * fail the reset sequence immediately.
2142 : */
2143 1 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2144 1 : return;
2145 : }
2146 :
2147 : /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
2148 : * Set callback here to execute the specified operation after ctrlr is really disconnected.
2149 : */
2150 71 : assert(nvme_ctrlr->disconnected_cb == NULL);
2151 71 : nvme_ctrlr->disconnected_cb = cb_fn;
2152 :
2153 : /* During disconnection, reduce the period to poll adminq more often. */
2154 71 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
2155 72 : }
2156 :
/* Follow-up action selected by bdev_nvme_check_op_after_reset() once a reset or
 * disable sequence has finished.
 */
typedef enum bdev_nvme_op_after_reset {
	/* Nothing further to do. */
	OP_NONE,
	/* The controller can be unregistered: complete the pending destruct. */
	OP_COMPLETE_PENDING_DESTRUCT,
	/* The controller loss timeout expired: delete the controller. */
	OP_DESTRUCT,
	/* Disconnect, then retry the connection after the reconnect delay. */
	OP_DELAYED_RECONNECT,
	/* A failover was requested while resetting: start it now. */
	OP_FAILOVER,
} _bdev_nvme_op_after_reset;
2166 :
2167 : static _bdev_nvme_op_after_reset
2168 71 : bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
2169 : {
2170 71 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
2171 : /* Complete pending destruct after reset completes. */
2172 0 : return OP_COMPLETE_PENDING_DESTRUCT;
2173 71 : } else if (nvme_ctrlr->pending_failover) {
2174 3 : nvme_ctrlr->pending_failover = false;
2175 3 : nvme_ctrlr->reset_start_tsc = 0;
2176 3 : return OP_FAILOVER;
2177 68 : } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
2178 54 : nvme_ctrlr->reset_start_tsc = 0;
2179 54 : return OP_NONE;
2180 14 : } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2181 2 : return OP_DESTRUCT;
2182 : } else {
2183 12 : if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
2184 2 : nvme_ctrlr->fast_io_fail_timedout = true;
2185 2 : }
2186 12 : return OP_DELAYED_RECONNECT;
2187 : }
2188 71 : }
2189 :
2190 : static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
2191 : static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
2192 :
2193 : static int
2194 9 : bdev_nvme_reconnect_delay_timer_expired(void *ctx)
2195 : {
2196 9 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2197 :
2198 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name);
2199 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2200 :
2201 9 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2202 :
2203 9 : if (!nvme_ctrlr->reconnect_is_delayed) {
2204 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2205 0 : return SPDK_POLLER_BUSY;
2206 : }
2207 :
2208 9 : nvme_ctrlr->reconnect_is_delayed = false;
2209 :
2210 9 : if (nvme_ctrlr->destruct) {
2211 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2212 0 : return SPDK_POLLER_BUSY;
2213 : }
2214 :
2215 9 : assert(nvme_ctrlr->resetting == false);
2216 9 : nvme_ctrlr->resetting = true;
2217 :
2218 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2219 :
2220 9 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2221 :
2222 9 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2223 9 : return SPDK_POLLER_BUSY;
2224 9 : }
2225 :
2226 : static void
2227 12 : bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
2228 : {
2229 12 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2230 :
2231 12 : assert(nvme_ctrlr->reconnect_is_delayed == false);
2232 12 : nvme_ctrlr->reconnect_is_delayed = true;
2233 :
2234 12 : assert(nvme_ctrlr->reconnect_delay_timer == NULL);
2235 12 : nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
2236 : nvme_ctrlr,
2237 : nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC);
2238 12 : }
2239 :
2240 : static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr);
2241 :
2242 : static void
2243 71 : bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
2244 : {
2245 71 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2246 71 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2247 : enum bdev_nvme_op_after_reset op_after_reset;
2248 :
2249 71 : assert(nvme_ctrlr->thread == spdk_get_thread());
2250 :
2251 71 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2252 71 : if (!success) {
2253 : /* Connecting the active trid failed. Set the next alternate trid to the
2254 : * active trid if it exists.
2255 : */
2256 35 : if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) {
2257 : /* The next alternate trid exists and is ready to try. Try it now. */
2258 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2259 :
2260 2 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Try the next alternate trid %s:%s now.\n",
2261 : nvme_ctrlr->active_path_id->trid.traddr,
2262 : nvme_ctrlr->active_path_id->trid.trsvcid);
2263 :
2264 2 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2265 2 : return;
2266 : }
2267 :
2268 : /* We came here if there is no alternate trid or if the next trid exists but
2269 : * is not ready to try. We will try the active trid after reconnect_delay_sec
2270 : * seconds if it is non-zero or at the next reset call otherwise.
2271 : */
2272 33 : } else {
2273 : /* Connecting the active trid succeeded. Clear the last failed time because it
2274 : * means the trid is failed if its last failed time is non-zero.
2275 : */
2276 36 : nvme_ctrlr->active_path_id->last_failed_tsc = 0;
2277 : }
2278 :
2279 69 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Clear pending resets.\n");
2280 :
2281 : /* Make sure we clear any pending resets before returning. */
2282 69 : bdev_nvme_complete_pending_resets(nvme_ctrlr, success);
2283 :
2284 69 : if (!success) {
2285 33 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Resetting controller failed.\n");
2286 33 : } else {
2287 36 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Resetting controller successful.\n");
2288 : }
2289 :
2290 69 : nvme_ctrlr->resetting = false;
2291 69 : nvme_ctrlr->dont_retry = false;
2292 69 : nvme_ctrlr->in_failover = false;
2293 :
2294 69 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2295 69 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2296 :
2297 69 : op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);
2298 69 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2299 :
2300 : /* Delay callbacks when the next operation is a failover. */
2301 69 : if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) {
2302 17 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1);
2303 17 : }
2304 :
2305 69 : switch (op_after_reset) {
2306 : case OP_COMPLETE_PENDING_DESTRUCT:
2307 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2308 0 : break;
2309 : case OP_DESTRUCT:
2310 2 : bdev_nvme_delete_ctrlr(nvme_ctrlr, false);
2311 2 : remove_discovery_entry(nvme_ctrlr);
2312 2 : break;
2313 : case OP_DELAYED_RECONNECT:
2314 12 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
2315 12 : break;
2316 : case OP_FAILOVER:
2317 3 : nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn;
2318 3 : nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg;
2319 3 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
2320 3 : break;
2321 : default:
2322 52 : break;
2323 : }
2324 71 : }
2325 :
/* Channel-iteration completion used after qpair creation failed and the qpairs
 * were torn down again: report the reset as failed. ctx and status are unused.
 */
static void
bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
{
	bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
}
2331 :
2332 : static void
2333 104 : bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i,
2334 : struct nvme_ctrlr *nvme_ctrlr,
2335 : struct nvme_ctrlr_channel *ctrlr_ch, void *ctx)
2336 : {
2337 : struct nvme_qpair *nvme_qpair;
2338 : struct spdk_nvme_qpair *qpair;
2339 :
2340 104 : nvme_qpair = ctrlr_ch->qpair;
2341 104 : assert(nvme_qpair != NULL);
2342 :
2343 104 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
2344 :
2345 104 : qpair = nvme_qpair->qpair;
2346 104 : if (qpair != NULL) {
2347 69 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting qpair %p:%u.\n",
2348 : qpair, spdk_nvme_qpair_get_id(qpair));
2349 :
2350 69 : if (nvme_qpair->ctrlr->dont_retry) {
2351 53 : spdk_nvme_qpair_set_abort_dnr(qpair, true);
2352 53 : }
2353 69 : spdk_nvme_ctrlr_disconnect_io_qpair(qpair);
2354 :
2355 : /* The current full reset sequence will move to the next
2356 : * ctrlr_channel after the qpair is actually disconnected.
2357 : */
2358 69 : assert(ctrlr_ch->reset_iter == NULL);
2359 69 : ctrlr_ch->reset_iter = i;
2360 69 : } else {
2361 35 : nvme_ctrlr_for_each_channel_continue(i, 0);
2362 : }
2363 104 : }
2364 :
2365 : static void
2366 36 : bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2367 : {
2368 36 : if (status == 0) {
2369 36 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were created after ctrlr reset.\n");
2370 :
2371 36 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true);
2372 36 : } else {
2373 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were failed to create after ctrlr reset.\n");
2374 :
2375 : /* Delete the added qpairs and quiesce ctrlr to make the states clean. */
2376 0 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2377 : bdev_nvme_reset_destroy_qpair,
2378 : NULL,
2379 : bdev_nvme_reset_create_qpairs_failed);
2380 : }
2381 36 : }
2382 :
2383 : static int
2384 61 : bdev_nvme_reset_check_qpair_connected(void *ctx)
2385 : {
2386 61 : struct nvme_ctrlr_channel *ctrlr_ch = ctx;
2387 61 : struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
2388 : struct spdk_nvme_qpair *qpair;
2389 :
2390 61 : if (ctrlr_ch->reset_iter == NULL) {
2391 : /* qpair was already failed to connect and the reset sequence is being aborted. */
2392 0 : assert(ctrlr_ch->connect_poller == NULL);
2393 0 : assert(nvme_qpair->qpair == NULL);
2394 :
2395 0 : NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr,
2396 : "qpair was already failed to connect. reset is being aborted.\n");
2397 0 : return SPDK_POLLER_BUSY;
2398 : }
2399 :
2400 61 : qpair = nvme_qpair->qpair;
2401 61 : assert(qpair != NULL);
2402 :
2403 61 : if (!spdk_nvme_qpair_is_connected(qpair)) {
2404 0 : return SPDK_POLLER_BUSY;
2405 : }
2406 :
2407 61 : NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, "qpair %p:%u was connected.\n",
2408 : qpair, spdk_nvme_qpair_get_id(qpair));
2409 :
2410 61 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
2411 :
2412 : /* qpair was completed to connect. Move to the next ctrlr_channel */
2413 61 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
2414 61 : ctrlr_ch->reset_iter = NULL;
2415 :
2416 61 : if (!g_opts.disable_auto_failback) {
2417 44 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
2418 44 : }
2419 :
2420 61 : return SPDK_POLLER_BUSY;
2421 61 : }
2422 :
2423 : static void
2424 61 : bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i,
2425 : struct nvme_ctrlr *nvme_ctrlr,
2426 : struct nvme_ctrlr_channel *ctrlr_ch,
2427 : void *ctx)
2428 : {
2429 61 : struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
2430 : struct spdk_nvme_qpair *qpair;
2431 61 : int rc = 0;
2432 :
2433 61 : if (nvme_qpair->qpair == NULL) {
2434 61 : rc = bdev_nvme_create_qpair(nvme_qpair);
2435 61 : }
2436 61 : if (rc == 0) {
2437 61 : ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected,
2438 : ctrlr_ch, 0);
2439 :
2440 61 : qpair = nvme_qpair->qpair;
2441 :
2442 61 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start checking qpair %p:%u to be connected.\n",
2443 : qpair, spdk_nvme_qpair_get_id(qpair));
2444 :
2445 : /* The current full reset sequence will move to the next
2446 : * ctrlr_channel after the qpair is actually connected.
2447 : */
2448 61 : assert(ctrlr_ch->reset_iter == NULL);
2449 61 : ctrlr_ch->reset_iter = i;
2450 61 : } else {
2451 0 : nvme_ctrlr_for_each_channel_continue(i, rc);
2452 : }
2453 61 : }
2454 :
2455 : static void
2456 36 : nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr)
2457 : {
2458 36 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2459 : struct nvme_ns *nvme_ns;
2460 :
2461 57 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2462 57 : nvme_ns != NULL;
2463 21 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
2464 21 : if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2465 1 : SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id);
2466 : /* NS can be added again. Just nullify nvme_ns->ns. */
2467 1 : nvme_ns->ns = NULL;
2468 1 : }
2469 21 : }
2470 36 : }
2471 :
2472 :
2473 : static int
2474 70 : bdev_nvme_reconnect_ctrlr_poll(void *arg)
2475 : {
2476 70 : struct nvme_ctrlr *nvme_ctrlr = arg;
2477 : struct spdk_nvme_transport_id *trid;
2478 70 : int rc = -ETIMEDOUT;
2479 :
2480 70 : if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2481 : /* Mark the ctrlr as failed. The next call to
2482 : * spdk_nvme_ctrlr_reconnect_poll_async() will then
2483 : * do the necessary cleanup and return failure.
2484 : */
2485 2 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
2486 2 : }
2487 :
2488 70 : rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
2489 70 : if (rc == -EAGAIN) {
2490 0 : return SPDK_POLLER_BUSY;
2491 : }
2492 :
2493 70 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
2494 70 : if (rc == 0) {
2495 36 : trid = &nvme_ctrlr->active_path_id->trid;
2496 :
2497 36 : if (spdk_nvme_trtype_is_fabrics(trid->trtype)) {
2498 36 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected to %s:%s. Create qpairs.\n",
2499 : trid->traddr, trid->trsvcid);
2500 36 : } else {
2501 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected. Create qpairs.\n");
2502 : }
2503 :
2504 36 : nvme_ctrlr_check_namespaces(nvme_ctrlr);
2505 :
2506 : /* Recreate all of the I/O queue pairs */
2507 36 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2508 : bdev_nvme_reset_create_qpair,
2509 : NULL,
2510 : bdev_nvme_reset_create_qpairs_done);
2511 36 : } else {
2512 34 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr could not be connected.\n");
2513 :
2514 34 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2515 : }
2516 70 : return SPDK_POLLER_BUSY;
2517 70 : }
2518 :
2519 : static void
2520 70 : bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2521 : {
2522 70 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start reconnecting ctrlr.\n");
2523 :
2524 70 : spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
2525 :
2526 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name);
2527 70 : assert(nvme_ctrlr->reset_detach_poller == NULL);
2528 70 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
2529 : nvme_ctrlr, 0);
2530 70 : }
2531 :
2532 : static void
2533 57 : bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2534 : {
2535 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name);
2536 57 : assert(status == 0);
2537 :
2538 57 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were deleted.\n");
2539 :
2540 57 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2541 0 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2542 0 : } else {
2543 57 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2544 : }
2545 57 : }
2546 :
2547 : static void
2548 57 : bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2549 : {
2550 57 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Delete qpairs for reset.\n");
2551 :
2552 57 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2553 : bdev_nvme_reset_destroy_qpair,
2554 : NULL,
2555 : bdev_nvme_reset_destroy_qpair_done);
2556 57 : }
2557 :
2558 : static void
2559 3 : bdev_nvme_reconnect_ctrlr_now(void *ctx)
2560 : {
2561 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2562 :
2563 3 : assert(nvme_ctrlr->resetting == true);
2564 3 : assert(nvme_ctrlr->thread == spdk_get_thread());
2565 :
2566 3 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2567 :
2568 3 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2569 :
2570 3 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2571 3 : }
2572 :
2573 : static void
2574 57 : _bdev_nvme_reset_ctrlr(void *ctx)
2575 : {
2576 57 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2577 :
2578 57 : assert(nvme_ctrlr->resetting == true);
2579 57 : assert(nvme_ctrlr->thread == spdk_get_thread());
2580 :
2581 57 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2582 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs);
2583 0 : } else {
2584 57 : bdev_nvme_reset_destroy_qpairs(nvme_ctrlr);
2585 : }
2586 57 : }
2587 :
2588 : static int
2589 50 : bdev_nvme_reset_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, spdk_msg_fn *msg_fn)
2590 : {
2591 50 : if (nvme_ctrlr->destruct) {
2592 3 : return -ENXIO;
2593 : }
2594 :
2595 47 : if (nvme_ctrlr->resetting) {
2596 14 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset, already in progress.\n");
2597 14 : return -EBUSY;
2598 : }
2599 :
2600 33 : if (nvme_ctrlr->disabled) {
2601 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset. Controller is disabled.\n");
2602 1 : return -EALREADY;
2603 : }
2604 :
2605 32 : nvme_ctrlr->resetting = true;
2606 32 : nvme_ctrlr->dont_retry = true;
2607 :
2608 32 : if (nvme_ctrlr->reconnect_is_delayed) {
2609 1 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Reconnect is already scheduled.\n");
2610 1 : *msg_fn = bdev_nvme_reconnect_ctrlr_now;
2611 1 : nvme_ctrlr->reconnect_is_delayed = false;
2612 1 : } else {
2613 31 : *msg_fn = _bdev_nvme_reset_ctrlr;
2614 31 : assert(nvme_ctrlr->reset_start_tsc == 0);
2615 : }
2616 :
2617 32 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2618 :
2619 32 : return 0;
2620 50 : }
2621 :
2622 : static int
2623 24 : bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2624 : {
2625 : spdk_msg_fn msg_fn;
2626 : int rc;
2627 :
2628 24 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2629 24 : rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn);
2630 24 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2631 :
2632 24 : if (rc == 0) {
2633 19 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2634 19 : }
2635 :
2636 24 : return rc;
2637 : }
2638 :
2639 : static int
2640 3 : bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2641 : {
2642 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2643 3 : if (nvme_ctrlr->destruct) {
2644 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2645 0 : return -ENXIO;
2646 : }
2647 :
2648 3 : if (nvme_ctrlr->resetting) {
2649 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2650 0 : return -EBUSY;
2651 : }
2652 :
2653 3 : if (!nvme_ctrlr->disabled) {
2654 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2655 1 : return -EALREADY;
2656 : }
2657 :
2658 2 : nvme_ctrlr->disabled = false;
2659 2 : nvme_ctrlr->resetting = true;
2660 :
2661 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2662 :
2663 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2664 :
2665 2 : spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr);
2666 2 : return 0;
2667 3 : }
2668 :
2669 : static void
2670 2 : bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr)
2671 : {
2672 2 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2673 2 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2674 : enum bdev_nvme_op_after_reset op_after_disable;
2675 :
2676 2 : assert(nvme_ctrlr->thread == spdk_get_thread());
2677 :
2678 2 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2679 2 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2680 :
2681 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2682 :
2683 2 : nvme_ctrlr->resetting = false;
2684 2 : nvme_ctrlr->dont_retry = false;
2685 :
2686 2 : op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true);
2687 :
2688 2 : nvme_ctrlr->disabled = true;
2689 2 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2690 :
2691 : /* Make sure we clear any pending resets before returning. */
2692 2 : bdev_nvme_complete_pending_resets(nvme_ctrlr, true);
2693 :
2694 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2695 :
2696 2 : if (ctrlr_op_cb_fn) {
2697 0 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0);
2698 0 : }
2699 :
2700 2 : switch (op_after_disable) {
2701 : case OP_COMPLETE_PENDING_DESTRUCT:
2702 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2703 0 : break;
2704 : default:
2705 2 : break;
2706 : }
2707 2 : }
2708 :
2709 : static void
2710 1 : bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2711 : {
2712 1 : assert(status == 0);
2713 :
2714 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2715 0 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2716 0 : } else {
2717 1 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete);
2718 : }
2719 1 : }
2720 :
2721 : static void
2722 1 : bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2723 : {
2724 1 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2725 : bdev_nvme_reset_destroy_qpair,
2726 : NULL,
2727 : bdev_nvme_disable_destroy_qpairs_done);
2728 1 : }
2729 :
2730 : static void
2731 1 : _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx)
2732 : {
2733 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2734 :
2735 1 : assert(nvme_ctrlr->resetting == true);
2736 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2737 :
2738 1 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2739 :
2740 1 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2741 1 : }
2742 :
2743 : static void
2744 1 : _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx)
2745 : {
2746 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2747 :
2748 1 : assert(nvme_ctrlr->resetting == true);
2749 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2750 :
2751 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2752 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs);
2753 0 : } else {
2754 1 : bdev_nvme_disable_destroy_qpairs(nvme_ctrlr);
2755 : }
2756 1 : }
2757 :
2758 : static int
2759 5 : bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2760 : {
2761 : spdk_msg_fn msg_fn;
2762 :
2763 5 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2764 5 : if (nvme_ctrlr->destruct) {
2765 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2766 1 : return -ENXIO;
2767 : }
2768 :
2769 4 : if (nvme_ctrlr->resetting) {
2770 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2771 1 : return -EBUSY;
2772 : }
2773 :
2774 3 : if (nvme_ctrlr->disabled) {
2775 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2776 1 : return -EALREADY;
2777 : }
2778 :
2779 2 : nvme_ctrlr->resetting = true;
2780 2 : nvme_ctrlr->dont_retry = true;
2781 :
2782 2 : if (nvme_ctrlr->reconnect_is_delayed) {
2783 1 : msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr;
2784 1 : nvme_ctrlr->reconnect_is_delayed = false;
2785 1 : } else {
2786 1 : msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr;
2787 : }
2788 :
2789 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2790 :
2791 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2792 :
2793 2 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2794 2 : return 0;
2795 5 : }
2796 :
2797 : static int
2798 6 : nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2799 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2800 : {
2801 : int rc;
2802 :
2803 6 : switch (op) {
2804 : case NVME_CTRLR_OP_RESET:
2805 5 : rc = bdev_nvme_reset_ctrlr(nvme_ctrlr);
2806 5 : break;
2807 : case NVME_CTRLR_OP_ENABLE:
2808 0 : rc = bdev_nvme_enable_ctrlr(nvme_ctrlr);
2809 0 : break;
2810 : case NVME_CTRLR_OP_DISABLE:
2811 0 : rc = bdev_nvme_disable_ctrlr(nvme_ctrlr);
2812 0 : break;
2813 : default:
2814 1 : rc = -EINVAL;
2815 1 : break;
2816 : }
2817 :
2818 6 : if (rc == 0) {
2819 3 : assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL);
2820 3 : assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL);
2821 3 : nvme_ctrlr->ctrlr_op_cb_fn = cb_fn;
2822 3 : nvme_ctrlr->ctrlr_op_cb_arg = cb_arg;
2823 3 : }
2824 6 : return rc;
2825 : }
2826 :
2827 : struct nvme_ctrlr_op_rpc_ctx {
2828 : struct nvme_ctrlr *nvme_ctrlr;
2829 : struct spdk_thread *orig_thread;
2830 : enum nvme_ctrlr_op op;
2831 : int rc;
2832 : bdev_nvme_ctrlr_op_cb cb_fn;
2833 : void *cb_arg;
2834 : };
2835 :
2836 : static void
2837 4 : _nvme_ctrlr_op_rpc_complete(void *_ctx)
2838 : {
2839 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2840 :
2841 4 : assert(ctx != NULL);
2842 4 : assert(ctx->cb_fn != NULL);
2843 :
2844 4 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2845 :
2846 4 : free(ctx);
2847 4 : }
2848 :
2849 : static void
2850 4 : nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc)
2851 : {
2852 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2853 :
2854 4 : ctx->rc = rc;
2855 :
2856 4 : spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx);
2857 4 : }
2858 :
/* Run operation `op` on a single nvme_ctrlr and report the result through cb_fn.
 *
 * cb_fn must not be NULL. If the context cannot be allocated, cb_fn is called
 * synchronously with -ENOMEM. Otherwise cb_fn is called from a message sent to the
 * calling thread, both when the operation was started and when nvme_ctrlr_op()
 * returned an error. -EALREADY from nvme_ctrlr_op() is reported as success (0).
 */
2859 : void
2860 4 : nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2861 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2862 : {
2863 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2864 : int rc;
2865 :
2866 4 : assert(cb_fn != NULL);
2867 :
2868 4 : ctx = calloc(1, sizeof(*ctx));
2869 4 : if (ctx == NULL) {
2870 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2871 0 : cb_fn(cb_arg, -ENOMEM);
2872 0 : return;
2873 : }
2874 :
2875 4 : ctx->orig_thread = spdk_get_thread();
2876 4 : ctx->cb_fn = cb_fn;
2877 4 : ctx->cb_arg = cb_arg;
2878 :
2879 4 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx);
2880 4 : if (rc == 0) {
/* Operation started; nvme_ctrlr_op_rpc_complete() was installed as its callback. */
2881 1 : return;
2882 3 : } else if (rc == -EALREADY) {
2883 0 : rc = 0;
2884 0 : }
2885 :
/* Nothing was started: complete the request ourselves through the same path. */
2886 3 : nvme_ctrlr_op_rpc_complete(ctx, rc);
2887 4 : }
2888 :
2889 : static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc);
2890 :
2891 : static void
2892 2 : _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx)
2893 : {
2894 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2895 : struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr;
2896 : int rc;
2897 :
2898 2 : prev_nvme_ctrlr = ctx->nvme_ctrlr;
2899 2 : ctx->nvme_ctrlr = NULL;
2900 :
2901 2 : if (ctx->rc != 0) {
2902 0 : goto complete;
2903 : }
2904 :
2905 2 : next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq);
2906 2 : if (next_nvme_ctrlr == NULL) {
2907 1 : goto complete;
2908 : }
2909 :
2910 1 : rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2911 1 : if (rc == 0) {
2912 1 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2913 1 : return;
2914 0 : } else if (rc == -EALREADY) {
2915 0 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2916 0 : rc = 0;
2917 0 : }
2918 :
2919 0 : ctx->rc = rc;
2920 :
2921 : complete:
2922 1 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2923 1 : free(ctx);
2924 2 : }
2925 :
/* Completion callback for one controller of a bdev-controller-wide operation:
 * store the result and resume the walk on the thread that issued the request.
 */
2926 : static void
2927 2 : nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc)
2928 : {
2929 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2930 :
2931 2 : ctx->rc = rc;
2932 :
2933 2 : spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx);
2934 2 : }
2935 :
/* Run operation `op` on the nvme_ctrlrs of nbdev_ctrlr one after another, starting
 * with the first entry of nbdev_ctrlr->ctrlrs, and report the result through cb_fn.
 *
 * cb_fn must not be NULL and the controller list is asserted to be non-empty.
 * If the context cannot be allocated, cb_fn is called synchronously with -ENOMEM.
 * -EALREADY from the first controller is treated as success and the walk continues.
 */
2936 : void
2937 1 : nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op,
2938 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2939 : {
2940 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2941 : struct nvme_ctrlr *nvme_ctrlr;
2942 : int rc;
2943 :
2944 1 : assert(cb_fn != NULL);
2945 :
2946 1 : ctx = calloc(1, sizeof(*ctx));
2947 1 : if (ctx == NULL) {
2948 0 : SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2949 0 : cb_fn(cb_arg, -ENOMEM);
2950 0 : return;
2951 : }
2952 :
2953 1 : ctx->orig_thread = spdk_get_thread();
2954 1 : ctx->op = op;
2955 1 : ctx->cb_fn = cb_fn;
2956 1 : ctx->cb_arg = cb_arg;
2957 :
2958 1 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
2959 1 : assert(nvme_ctrlr != NULL);
2960 :
2961 1 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2962 1 : if (rc == 0) {
2963 1 : ctx->nvme_ctrlr = nvme_ctrlr;
2964 1 : return;
2965 0 : } else if (rc == -EALREADY) {
/* Remember the position so the continuation can find the next controller. */
2966 0 : ctx->nvme_ctrlr = nvme_ctrlr;
2967 0 : rc = 0;
2968 0 : }
2969 :
/* Nothing was started on this controller: drive the continuation ourselves. */
2970 0 : nvme_bdev_ctrlr_op_rpc_continue(ctx, rc);
2971 1 : }
2972 :
2973 : static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
2974 :
/* Final step of a reset I/O, called after every channel has been unfrozen.
 * bio->cpl.cdw0 carries the aggregated reset result: 0 completes the bdev_io as
 * SUCCESS, anything else as FAILED.
 */
2975 : static void
2976 16 : bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
2977 : {
2978 16 : struct nvme_bdev_io *bio = ctx;
2979 : enum spdk_bdev_io_status io_status;
2980 :
2981 16 : if (bio->cpl.cdw0 == 0) {
2982 12 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
2983 12 : } else {
2984 4 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
2985 : }
2986 :
2987 16 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p completed, status:%d\n", bio, io_status);
2988 :
2989 16 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL);
2990 16 : }
2991 :
/* Per-channel step run when a reset I/O finishes: abort the channel's queued retry
 * I/Os and clear its `resetting` flag, then move on to the next channel.
 */
2992 : static void
2993 32 : bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i,
2994 : struct nvme_bdev *nbdev,
2995 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
2996 : {
2997 32 : bdev_nvme_abort_retry_ios(nbdev_ch);
2998 32 : nbdev_ch->resetting = false;
2999 :
3000 32 : nvme_bdev_for_each_channel_continue(i, 0);
3001 32 : }
3002 :
/* Finish a reset I/O: unfreeze every channel of the bdev, then complete the bdev_io
 * in bdev_nvme_unfreeze_bdev_channel_done().
 */
3003 : static void
3004 16 : bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
3005 : {
3006 16 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3007 16 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3008 :
3009 : /* Abort all queued I/Os for retry. */
3010 32 : nvme_bdev_for_each_channel(nbdev,
3011 : bdev_nvme_unfreeze_bdev_channel,
3012 16 : bio,
3013 : bdev_nvme_unfreeze_bdev_channel_done);
3014 16 : }
3015 :
/* Continue a reset I/O with the io_path that follows bio->io_path in its list.
 * Sent as a message to the bdev_io's thread by bdev_nvme_reset_io_continue().
 * When there is no next io_path, or the reset could not be started or queued on it,
 * the reset I/O is completed.
 *
 * NOTE(review): a non-zero rc from _bdev_nvme_reset_io() ends the walk without
 * visiting the remaining io_paths, whereas bdev_nvme_freeze_bdev_channel_done()
 * moves on to the next path when the first one fails. Confirm this is intended.
 */
3016 : static void
3017 26 : _bdev_nvme_reset_io_continue(void *ctx)
3018 : {
3019 26 : struct nvme_bdev_io *bio = ctx;
3020 : struct nvme_io_path *prev_io_path, *next_io_path;
3021 : int rc;
3022 :
/* _bdev_nvme_reset_io() asserts bio->io_path == NULL, so clear it before reuse. */
3023 26 : prev_io_path = bio->io_path;
3024 26 : bio->io_path = NULL;
3025 :
3026 26 : next_io_path = STAILQ_NEXT(prev_io_path, stailq);
3027 26 : if (next_io_path == NULL) {
3028 16 : goto complete;
3029 : }
3030 :
3031 10 : rc = _bdev_nvme_reset_io(next_io_path, bio);
3032 10 : if (rc == 0) {
3033 10 : return;
3034 : }
3035 :
3036 : complete:
3037 16 : bdev_nvme_reset_io_complete(bio);
3038 26 : }
3039 :
/* Callback invoked with the result of resetting one nvme_ctrlr as part of a reset I/O.
 * Records a success in bio->cpl.cdw0 and schedules the next step on the bdev_io's thread.
 */
3040 : static void
3041 26 : bdev_nvme_reset_io_continue(void *cb_arg, int rc)
3042 : {
3043 26 : struct nvme_bdev_io *bio = cb_arg;
3044 26 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3045 26 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3046 :
3047 26 : NVME_BDEV_INFOLOG(nbdev, "continue reset_io %p, rc:%d\n", bio, rc);
3048 :
3049 : /* Reset status is initialized as "failed". Set to "success" once we have at least one
3050 : * successfully reset nvme_ctrlr.
3051 : */
3052 26 : if (rc == 0) {
3053 16 : bio->cpl.cdw0 = 0;
3054 16 : }
3055 :
3056 26 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio);
3057 26 : }
3058 :
/* Try to reset the nvme_ctrlr behind io_path on behalf of reset I/O `bio`.
 *
 * bio->io_path must be NULL on entry and is set to io_path.
 *
 * Returns 0 when the reset was started (bdev_nvme_reset_io_continue() is installed as
 * the controller's operation callback) or when the controller was busy and bio was
 * appended to its pending_resets list. Any other value returned by
 * bdev_nvme_reset_ctrlr_unsafe() is passed through to the caller.
 */
3059 : static int
3060 26 : _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
3061 : {
3062 26 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3063 26 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3064 26 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
3065 : spdk_msg_fn msg_fn;
3066 : int rc;
3067 :
3068 26 : assert(bio->io_path == NULL);
3069 26 : bio->io_path = io_path;
3070 :
3071 26 : pthread_mutex_lock(&nvme_ctrlr->mutex);
3072 26 : rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn);
3073 26 : if (rc == -EBUSY) {
3074 : /*
3075 : * Reset call is queued only if it is from the app framework. This is on purpose so that
3076 : * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
3077 : * upper level. If they are in the middle of a reset, we won't try to schedule another one.
3078 : */
3079 12 : TAILQ_INSERT_TAIL(&nvme_ctrlr->pending_resets, bio, retry_link);
3080 12 : }
3081 26 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
3082 :
3083 26 : if (rc == 0) {
3084 13 : assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL);
3085 13 : assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL);
3086 13 : nvme_ctrlr->ctrlr_op_cb_fn = bdev_nvme_reset_io_continue;
3087 13 : nvme_ctrlr->ctrlr_op_cb_arg = bio;
3088 :
/* Run the function selected by bdev_nvme_reset_ctrlr_unsafe() on the controller's thread. */
3089 13 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
3090 :
3091 13 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p started resetting ctrlr [%s, %u].\n",
3092 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr));
3093 26 : } else if (rc == -EBUSY) {
/* Queued above; report success so the caller waits for the queued reset. */
3094 12 : rc = 0;
3095 :
3096 12 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p was queued to ctrlr [%s, %u].\n",
3097 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr));
3098 12 : } else {
3099 1 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p could not reset ctrlr [%s, %u], rc:%d\n",
3100 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr), rc);
3101 : }
3102 :
3103 26 : return rc;
3104 : }
3105 :
/* Called once every channel of the bdev has been marked as resetting. Initializes the
 * reset status to "failed" and starts the reset on the first io_path of the channel
 * that submitted the reset I/O.
 */
3106 : static void
3107 16 : bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
3108 : {
3109 16 : struct nvme_bdev_io *bio = ctx;
3110 16 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3111 : struct nvme_bdev_channel *nbdev_ch;
3112 : struct nvme_io_path *io_path;
3113 : int rc;
3114 :
3115 16 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
3116 :
3117 : /* Initialize with failed status. With multipath it is enough to have at least one successful
3118 : * nvme_ctrlr reset. If there is none, reset status will remain failed.
3119 : */
3120 16 : bio->cpl.cdw0 = 1;
3121 :
3122 : /* Reset all nvme_ctrlrs of a bdev controller sequentially. */
3123 16 : io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
3124 16 : assert(io_path != NULL);
3125 :
3126 16 : rc = _bdev_nvme_reset_io(io_path, bio);
3127 16 : if (rc != 0) {
3128 : /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */
3129 1 : rc = (rc == -EALREADY) ? 0 : rc;
3130 :
3131 1 : bdev_nvme_reset_io_continue(bio, rc);
3132 1 : }
3133 16 : }
3134 :
/* Per-channel step run at the start of a reset I/O: mark the channel as resetting,
 * then move on to the next channel.
 */
3135 : static void
3136 30 : bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i,
3137 : struct nvme_bdev *nbdev,
3138 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
3139 : {
3140 30 : nbdev_ch->resetting = true;
3141 :
3142 30 : nvme_bdev_for_each_channel_continue(i, 0);
3143 30 : }
3144 :
/* Entry point for a reset I/O: freeze every channel of the bdev, then continue in
 * bdev_nvme_freeze_bdev_channel_done().
 */
3145 : static void
3146 15 : bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio)
3147 : {
3148 15 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p started.\n", bio);
3149 :
3150 30 : nvme_bdev_for_each_channel(nbdev,
3151 : bdev_nvme_freeze_bdev_channel,
3152 15 : bio,
3153 : bdev_nvme_freeze_bdev_channel_done);
3154 15 : }
3155 :
/* Prepare a failover of nvme_ctrlr. bdev_nvme_failover_ctrlr() calls this with
 * nvme_ctrlr->mutex held.
 *
 * Returns:
 *   0            - controller is now marked resetting and in_failover; the caller
 *                  has to kick off the reset.
 *   -ENXIO       - controller is being destructed.
 *   -EINPROGRESS - a reset is already running; failover was deferred via pending_failover.
 *   -EBUSY       - a failover is already running.
 *   -EALREADY    - the path was switched, but a reconnect is already scheduled or
 *                  the controller is disabled, so no reset is started now.
 */
3156 : static int
3157 32 : bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove)
3158 : {
3159 32 : if (nvme_ctrlr->destruct) {
3160 : /* Don't bother resetting if the controller is in the process of being destructed. */
3161 2 : return -ENXIO;
3162 : }
3163 :
3164 30 : if (nvme_ctrlr->resetting) {
3165 3 : if (!nvme_ctrlr->in_failover) {
3166 3 : NVME_CTRLR_NOTICELOG(nvme_ctrlr,
3167 : "Reset is already in progress. Defer failover until reset completes.\n");
3168 :
3169 : /* Defer failover until reset completes. */
3170 3 : nvme_ctrlr->pending_failover = true;
3171 3 : return -EINPROGRESS;
3172 : } else {
3173 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform failover, already in progress.\n");
3174 0 : return -EBUSY;
3175 : }
3176 : }
3177 :
3178 27 : bdev_nvme_failover_trid(nvme_ctrlr, remove, true);
3179 :
3180 27 : if (nvme_ctrlr->reconnect_is_delayed) {
3181 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Reconnect is already scheduled.\n");
3182 :
3183 : /* We rely on the next reconnect for the failover. */
3184 1 : return -EALREADY;
3185 : }
3186 :
3187 26 : if (nvme_ctrlr->disabled) {
3188 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Controller is disabled.\n");
3189 :
3190 : /* We rely on the enablement for the failover. */
3191 0 : return -EALREADY;
3192 : }
3193 :
3194 26 : nvme_ctrlr->resetting = true;
3195 26 : nvme_ctrlr->in_failover = true;
3196 :
/* Record when this reset began; the timestamp is expected to be clear at this point. */
3197 26 : assert(nvme_ctrlr->reset_start_tsc == 0);
3198 26 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
3199 :
3200 26 : return 0;
3201 32 : }
3202 :
/* Fail over nvme_ctrlr without removing the current path (remove == false).
 * Takes the controller mutex around the state change and, on success, schedules
 * _bdev_nvme_reset_ctrlr() on the controller's thread. -EALREADY is reported as 0;
 * other errors from bdev_nvme_failover_ctrlr_unsafe() are returned unchanged.
 */
3203 : static int
3204 30 : bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
3205 : {
3206 : int rc;
3207 :
3208 30 : pthread_mutex_lock(&nvme_ctrlr->mutex);
3209 30 : rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false);
3210 30 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
3211 :
3212 30 : if (rc == 0) {
3213 25 : spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr);
3214 30 : } else if (rc == -EALREADY) {
3215 0 : rc = 0;
3216 0 : }
3217 :
3218 30 : return rc;
3219 : }
3220 :
3221 : static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3222 : uint64_t num_blocks);
3223 :
3224 : static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3225 : uint64_t num_blocks);
3226 :
3227 : static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks,
3228 : uint64_t src_offset_blocks,
3229 : uint64_t num_blocks);
3230 :
/* Callback passed to spdk_bdev_io_get_buf() for reads submitted without a data buffer.
 * On success, and if the selected io_path is still available, the read is submitted.
 * Buffer allocation failure completes the I/O with -EINVAL, an unavailable io_path
 * with -ENXIO, and a submission error with the code returned by bdev_nvme_readv().
 */
3231 : static void
3232 1 : bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3233 : bool success)
3234 : {
3235 1 : struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3236 : int ret;
3237 :
3238 1 : if (!success) {
3239 0 : ret = -EINVAL;
3240 0 : goto exit;
3241 : }
3242 :
3243 1 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
3244 0 : ret = -ENXIO;
3245 0 : goto exit;
3246 : }
3247 :
3248 2 : ret = bdev_nvme_readv(bio,
3249 1 : bdev_io->u.bdev.iovs,
3250 1 : bdev_io->u.bdev.iovcnt,
3251 1 : bdev_io->u.bdev.md_buf,
3252 1 : bdev_io->u.bdev.num_blocks,
3253 1 : bdev_io->u.bdev.offset_blocks,
3254 1 : bdev_io->u.bdev.dif_check_flags,
3255 1 : bdev_io->u.bdev.memory_domain,
3256 1 : bdev_io->u.bdev.memory_domain_ctx,
3257 1 : bdev_io->u.bdev.accel_sequence);
3258 :
3259 : exit:
3260 1 : if (spdk_unlikely(ret != 0)) {
3261 0 : bdev_nvme_io_complete(bio, ret);
3262 0 : }
3263 1 : }
3264 :
/* Dispatch a bdev_io to the handler matching its I/O type.
 *
 * Handlers that return an error code have that code passed to bdev_nvme_io_complete()
 * at the bottom of the function. RESET, FLUSH, NVME_ADMIN and ABORT return directly
 * from their case instead. Unknown I/O types are completed with -EINVAL.
 */
3265 : static inline void
3266 59 : _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
3267 : {
3268 59 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3269 59 : struct spdk_bdev *bdev = bdev_io->bdev;
3270 : struct nvme_bdev_io *nbdev_io_to_abort;
3271 59 : int rc = 0;
3272 :
3273 59 : switch (bdev_io->type) {
3274 : case SPDK_BDEV_IO_TYPE_READ:
/* Submit directly when a data buffer is present; otherwise request one first. */
3275 3 : if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
3276 :
3277 4 : rc = bdev_nvme_readv(nbdev_io,
3278 2 : bdev_io->u.bdev.iovs,
3279 2 : bdev_io->u.bdev.iovcnt,
3280 2 : bdev_io->u.bdev.md_buf,
3281 2 : bdev_io->u.bdev.num_blocks,
3282 2 : bdev_io->u.bdev.offset_blocks,
3283 2 : bdev_io->u.bdev.dif_check_flags,
3284 2 : bdev_io->u.bdev.memory_domain,
3285 2 : bdev_io->u.bdev.memory_domain_ctx,
3286 2 : bdev_io->u.bdev.accel_sequence);
3287 2 : } else {
3288 2 : spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
3289 1 : bdev_io->u.bdev.num_blocks * bdev->blocklen);
3290 1 : rc = 0;
3291 : }
3292 3 : break;
3293 : case SPDK_BDEV_IO_TYPE_WRITE:
3294 50 : rc = bdev_nvme_writev(nbdev_io,
3295 25 : bdev_io->u.bdev.iovs,
3296 25 : bdev_io->u.bdev.iovcnt,
3297 25 : bdev_io->u.bdev.md_buf,
3298 25 : bdev_io->u.bdev.num_blocks,
3299 25 : bdev_io->u.bdev.offset_blocks,
3300 25 : bdev_io->u.bdev.dif_check_flags,
3301 25 : bdev_io->u.bdev.memory_domain,
3302 25 : bdev_io->u.bdev.memory_domain_ctx,
3303 25 : bdev_io->u.bdev.accel_sequence,
3304 25 : bdev_io->u.bdev.nvme_cdw12,
3305 25 : bdev_io->u.bdev.nvme_cdw13);
3306 25 : break;
3307 : case SPDK_BDEV_IO_TYPE_COMPARE:
3308 2 : rc = bdev_nvme_comparev(nbdev_io,
3309 1 : bdev_io->u.bdev.iovs,
3310 1 : bdev_io->u.bdev.iovcnt,
3311 1 : bdev_io->u.bdev.md_buf,
3312 1 : bdev_io->u.bdev.num_blocks,
3313 1 : bdev_io->u.bdev.offset_blocks,
3314 1 : bdev_io->u.bdev.dif_check_flags);
3315 1 : break;
3316 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3317 4 : rc = bdev_nvme_comparev_and_writev(nbdev_io,
3318 2 : bdev_io->u.bdev.iovs,
3319 2 : bdev_io->u.bdev.iovcnt,
3320 2 : bdev_io->u.bdev.fused_iovs,
3321 2 : bdev_io->u.bdev.fused_iovcnt,
3322 2 : bdev_io->u.bdev.md_buf,
3323 2 : bdev_io->u.bdev.num_blocks,
3324 2 : bdev_io->u.bdev.offset_blocks,
3325 2 : bdev_io->u.bdev.dif_check_flags);
3326 2 : break;
3327 : case SPDK_BDEV_IO_TYPE_UNMAP:
3328 2 : rc = bdev_nvme_unmap(nbdev_io,
3329 1 : bdev_io->u.bdev.offset_blocks,
3330 1 : bdev_io->u.bdev.num_blocks);
3331 1 : break;
3332 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3333 0 : rc = bdev_nvme_write_zeroes(nbdev_io,
3334 0 : bdev_io->u.bdev.offset_blocks,
3335 0 : bdev_io->u.bdev.num_blocks);
3336 0 : break;
3337 : case SPDK_BDEV_IO_TYPE_RESET:
/* _bdev_nvme_reset_io() asserts that io_path is NULL before it picks a path. */
3338 15 : nbdev_io->io_path = NULL;
3339 15 : bdev_nvme_reset_io(bdev->ctxt, nbdev_io);
3340 15 : return;
3341 :
3342 : case SPDK_BDEV_IO_TYPE_FLUSH:
/* No command is issued for a flush; it is completed successfully right away. */
3343 1 : bdev_nvme_io_complete(nbdev_io, 0);
3344 1 : return;
3345 :
3346 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
3347 0 : rc = bdev_nvme_zone_appendv(nbdev_io,
3348 0 : bdev_io->u.bdev.iovs,
3349 0 : bdev_io->u.bdev.iovcnt,
3350 0 : bdev_io->u.bdev.md_buf,
3351 0 : bdev_io->u.bdev.num_blocks,
3352 0 : bdev_io->u.bdev.offset_blocks,
3353 0 : bdev_io->u.bdev.dif_check_flags);
3354 0 : break;
3355 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3356 0 : rc = bdev_nvme_get_zone_info(nbdev_io,
3357 0 : bdev_io->u.zone_mgmt.zone_id,
3358 0 : bdev_io->u.zone_mgmt.num_zones,
3359 0 : bdev_io->u.zone_mgmt.buf);
3360 0 : break;
3361 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3362 0 : rc = bdev_nvme_zone_management(nbdev_io,
3363 0 : bdev_io->u.zone_mgmt.zone_id,
3364 0 : bdev_io->u.zone_mgmt.zone_action);
3365 0 : break;
3366 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3367 5 : nbdev_io->io_path = NULL;
3368 10 : bdev_nvme_admin_passthru(nbdev_ch,
3369 5 : nbdev_io,
3370 5 : &bdev_io->u.nvme_passthru.cmd,
3371 5 : bdev_io->u.nvme_passthru.buf,
3372 5 : bdev_io->u.nvme_passthru.nbytes);
3373 5 : return;
3374 :
3375 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3376 0 : rc = bdev_nvme_io_passthru(nbdev_io,
3377 0 : &bdev_io->u.nvme_passthru.cmd,
3378 0 : bdev_io->u.nvme_passthru.buf,
3379 0 : bdev_io->u.nvme_passthru.nbytes);
3380 0 : break;
3381 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3382 0 : rc = bdev_nvme_io_passthru_md(nbdev_io,
3383 0 : &bdev_io->u.nvme_passthru.cmd,
3384 0 : bdev_io->u.nvme_passthru.buf,
3385 0 : bdev_io->u.nvme_passthru.nbytes,
3386 0 : bdev_io->u.nvme_passthru.md_buf,
3387 0 : bdev_io->u.nvme_passthru.md_len);
3388 0 : break;
3389 : case SPDK_BDEV_IO_TYPE_NVME_IOV_MD:
3390 0 : rc = bdev_nvme_iov_passthru_md(nbdev_io,
3391 0 : &bdev_io->u.nvme_passthru.cmd,
3392 0 : bdev_io->u.nvme_passthru.iovs,
3393 0 : bdev_io->u.nvme_passthru.iovcnt,
3394 0 : bdev_io->u.nvme_passthru.nbytes,
3395 0 : bdev_io->u.nvme_passthru.md_buf,
3396 0 : bdev_io->u.nvme_passthru.md_len);
3397 0 : break;
3398 : case SPDK_BDEV_IO_TYPE_ABORT:
3399 6 : nbdev_io->io_path = NULL;
3400 6 : nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
3401 12 : bdev_nvme_abort(nbdev_ch,
3402 6 : nbdev_io,
3403 6 : nbdev_io_to_abort);
3404 6 : return;
3405 :
3406 : case SPDK_BDEV_IO_TYPE_COPY:
3407 0 : rc = bdev_nvme_copy(nbdev_io,
3408 0 : bdev_io->u.bdev.offset_blocks,
3409 0 : bdev_io->u.bdev.copy.src_offset_blocks,
3410 0 : bdev_io->u.bdev.num_blocks);
3411 0 : break;
3412 : default:
3413 0 : rc = -EINVAL;
3414 0 : break;
3415 : }
3416 :
/* A handler that failed to submit will not complete the I/O itself; do it here. */
3417 32 : if (spdk_unlikely(rc != 0)) {
3418 0 : bdev_nvme_io_complete(nbdev_io, rc);
3419 0 : }
3420 59 : }
3421 :
/* Submit a bdev_io on channel `ch`: stamp the submit time, record a trace point,
 * select an io_path and dispatch the request. When no io_path is found, the I/O is
 * completed with -ENXIO unless bdev_nvme_io_type_is_admin() accepts its type.
 */
3422 : static void
3423 68 : bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
3424 : {
3425 68 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3426 68 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3427 :
3428 68 : if (spdk_likely(nbdev_io->submit_tsc == 0)) {
3429 68 : nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
3430 68 : } else {
3431 : /* There are cases where submit_tsc != 0, i.e. retry I/O.
3432 : * We need to update submit_tsc here.
3433 : */
3434 0 : nbdev_io->submit_tsc = spdk_get_ticks();
3435 : }
3436 :
3437 68 : spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
3438 68 : nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
3439 68 : if (spdk_unlikely(!nbdev_io->io_path)) {
3440 13 : if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
3441 12 : bdev_nvme_io_complete(nbdev_io, -ENXIO);
3442 12 : return;
3443 : }
3444 :
3445 : /* Admin commands do not use the optimal I/O path.
3446 : * Simply fall through even if it is not found.
3447 : */
3448 1 : }
3449 :
3450 56 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
3451 68 : }
3452 :
/* Return true for the command set identifiers accepted here: NVM and ZNS. */
3453 : static bool
3454 0 : bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi)
3455 : {
3456 0 : switch (csi) {
3457 : case SPDK_NVME_CSI_NVM:
3458 0 : return true;
3459 : case SPDK_NVME_CSI_ZNS:
3460 0 : return true;
3461 : default:
3462 0 : return false;
3463 : }
3464 0 : }
3465 :
/* Report whether the bdev supports `io_type`. The answer is derived from the first
 * namespace in nbdev->nvme_ns_list and the controller that namespace belongs to.
 *
 * Returns false when that namespace has no underlying spdk_nvme_ns. For namespaces
 * whose command set is not accepted by bdev_nvme_is_supported_csi(), only the
 * passthrough types handled in the first switch can be reported as supported.
 */
3466 : static bool
3467 0 : bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
3468 : {
3469 0 : struct nvme_bdev *nbdev = ctx;
3470 : struct nvme_ns *nvme_ns;
3471 : struct spdk_nvme_ns *ns;
3472 : struct spdk_nvme_ctrlr *ctrlr;
3473 : const struct spdk_nvme_ctrlr_data *cdata;
3474 :
3475 0 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
3476 0 : assert(nvme_ns != NULL);
3477 0 : ns = nvme_ns->ns;
3478 0 : if (ns == NULL) {
3479 0 : return false;
3480 : }
3481 :
3482 0 : if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) {
3483 0 : switch (io_type) {
3484 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3485 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3486 0 : return true;
3487 :
3488 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3489 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3490 :
3491 : default:
3492 0 : return false;
3493 : }
3494 : }
3495 :
3496 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3497 :
3498 0 : switch (io_type) {
3499 : case SPDK_BDEV_IO_TYPE_READ:
3500 : case SPDK_BDEV_IO_TYPE_WRITE:
3501 : case SPDK_BDEV_IO_TYPE_RESET:
3502 : case SPDK_BDEV_IO_TYPE_FLUSH:
3503 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3504 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3505 : case SPDK_BDEV_IO_TYPE_ABORT:
3506 0 : return true;
3507 :
3508 : case SPDK_BDEV_IO_TYPE_COMPARE:
3509 0 : return spdk_nvme_ns_supports_compare(ns);
3510 :
3511 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3512 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3513 :
3514 : case SPDK_BDEV_IO_TYPE_UNMAP:
/* Supported when the controller data's oncs.dsm bit is set. */
3515 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3516 0 : return cdata->oncs.dsm;
3517 :
3518 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3519 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3520 0 : return cdata->oncs.write_zeroes;
3521 :
3522 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3523 0 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3524 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
3525 0 : return true;
3526 : }
3527 0 : return false;
3528 :
3529 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3530 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3531 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
3532 :
3533 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
/* Requires both a ZNS namespace and the controller's zone-append flag. */
3534 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
3535 0 : spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
3536 :
3537 : case SPDK_BDEV_IO_TYPE_COPY:
3538 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3539 0 : return cdata->oncs.copy;
3540 :
3541 : default:
3542 0 : return false;
3543 : }
3544 0 : }
3545 :
/* Allocate an nvme_qpair for ctrlr_ch, attach it to the calling thread's poll group
 * and, unless the controller is disabled, create the underlying I/O qpair.
 *
 * On success the qpair is linked into the poll group's qpair_list, stored in
 * ctrlr_ch->qpair, and a reference on nvme_ctrlr is taken. Returns 0 on success,
 * -1 on allocation or poll-group channel failure, or the error from
 * bdev_nvme_create_qpair() when that failure is not tolerated (see below).
 */
3546 : static int
3547 61 : nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch)
3548 : {
3549 : struct nvme_qpair *nvme_qpair;
3550 : struct spdk_io_channel *pg_ch;
3551 : int rc;
3552 :
3553 61 : nvme_qpair = calloc(1, sizeof(*nvme_qpair));
3554 61 : if (!nvme_qpair) {
3555 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to alloc nvme_qpair.\n");
3556 0 : return -1;
3557 : }
3558 :
3559 61 : TAILQ_INIT(&nvme_qpair->io_path_list);
3560 :
3561 61 : nvme_qpair->ctrlr = nvme_ctrlr;
3562 61 : nvme_qpair->ctrlr_ch = ctrlr_ch;
3563 :
/* The poll group is the per-thread channel context of the g_nvme_bdev_ctrlrs io_device. */
3564 61 : pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
3565 61 : if (!pg_ch) {
3566 0 : free(nvme_qpair);
3567 0 : return -1;
3568 : }
3569 :
3570 61 : nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch);
3571 :
3572 : #ifdef SPDK_CONFIG_VTUNE
3573 : nvme_qpair->group->collect_spin_stat = true;
3574 : #else
3575 61 : nvme_qpair->group->collect_spin_stat = false;
3576 : #endif
3577 :
3578 61 : if (!nvme_ctrlr->disabled) {
3579 : /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will
3580 : * be created when it's enabled.
3581 : */
3582 61 : rc = bdev_nvme_create_qpair(nvme_qpair);
3583 61 : if (rc != 0) {
3584 : /* nvme_ctrlr can't create IO qpair if connection is down.
3585 : * If reconnect_delay_sec is non-zero, creating IO qpair is retried
3586 : * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero,
3587 : * submitted IO will be queued until IO qpair is successfully created.
3588 : *
3589 : * Hence, if both are satisfied, ignore the failure.
3590 : */
3591 0 : if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) {
3592 0 : spdk_put_io_channel(pg_ch);
3593 0 : free(nvme_qpair);
3594 0 : return rc;
3595 : }
3596 0 : }
3597 61 : }
3598 :
3599 61 : TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3600 :
3601 61 : ctrlr_ch->qpair = nvme_qpair;
3602 :
/* Dropped again in nvme_qpair_delete(). */
3603 61 : nvme_ctrlr_get_ref(nvme_ctrlr);
3604 :
3605 61 : return 0;
3606 61 : }
3607 :
/* I/O channel create callback for an nvme_ctrlr io_device: io_device is the
 * nvme_ctrlr and ctx_buf is the nvme_ctrlr_channel to populate with a qpair.
 */
3608 : static int
3609 61 : bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
3610 : {
3611 61 : struct nvme_ctrlr *nvme_ctrlr = io_device;
3612 61 : struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
3613 :
3614 61 : return nvme_qpair_create(nvme_ctrlr, ctrlr_ch);
3615 : }
3616 :
/* Tear down an nvme_qpair: free every io_path still linked to it, unlink it from its
 * poll group, release the poll group channel and the controller reference taken in
 * nvme_qpair_create(), then free the structure itself.
 */
3617 : static void
3618 61 : nvme_qpair_delete(struct nvme_qpair *nvme_qpair)
3619 : {
3620 : struct nvme_io_path *io_path, *next;
3621 :
3622 61 : assert(nvme_qpair->group != NULL);
3623 :
/* The _SAFE variant is needed because entries are removed while iterating. */
3624 100 : TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) {
3625 39 : TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);
3626 39 : nvme_io_path_free(io_path);
3627 39 : }
3628 :
3629 61 : TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3630 :
3631 61 : spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group));
3632 :
3633 61 : nvme_ctrlr_put_ref(nvme_qpair->ctrlr);
3634 :
3635 61 : free(nvme_qpair);
3636 61 : }
3637 :
/* I/O channel destroy callback for an nvme_ctrlr io_device.
 * If the channel's nvme_qpair still has an I/O qpair, the qpair is disconnected and
 * only detached from the channel; otherwise the nvme_qpair is deleted immediately.
 */
3638 : static void
3639 61 : bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
3640 : {
3641 61 : struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
3642 : struct nvme_qpair *nvme_qpair;
3643 :
3644 61 : nvme_qpair = ctrlr_ch->qpair;
3645 61 : assert(nvme_qpair != NULL);
3646 :
3647 61 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
3648 :
3649 61 : if (nvme_qpair->qpair != NULL) {
3650 : /* Always try to disconnect the qpair, even if a reset is in progress.
3651 : * The qpair may have been created after the reset process started.
3652 : */
3653 47 : spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair);
3654 47 : if (ctrlr_ch->reset_iter) {
3655 : /* Skip current ctrlr_channel in a full reset sequence because
3656 : * it is being deleted now.
3657 : */
3658 0 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
3659 0 : }
3660 :
3661 : /* We cannot release a reference to the poll group now.
3662 : * The qpair may be disconnected asynchronously later.
3663 : * We need to poll it until it is actually disconnected.
3664 : * Just detach the qpair from the deleting ctrlr_channel.
3665 : */
3666 47 : nvme_qpair->ctrlr_ch = NULL;
3667 47 : } else {
3668 14 : assert(ctrlr_ch->reset_iter == NULL);
3669 :
3670 14 : nvme_qpair_delete(nvme_qpair);
3671 : }
3672 61 : }
3673 :
/* Return the poll group's accel I/O channel, obtaining and caching it on first use.
 * Returns NULL (after logging an error) if the channel cannot be obtained.
 */
3674 : static inline struct spdk_io_channel *
3675 0 : bdev_nvme_get_accel_channel(struct nvme_poll_group *group)
3676 : {
3677 0 : if (spdk_unlikely(!group->accel_channel)) {
3678 0 : group->accel_channel = spdk_accel_get_io_channel();
3679 0 : if (!group->accel_channel) {
3680 0 : SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
3681 : group);
3682 0 : return NULL;
3683 : }
3684 0 : }
3685 :
3686 0 : return group->accel_channel;
3687 0 : }
3688 :
/* .finish_sequence entry of g_bdev_nvme_accel_fn_table: forwards to spdk_accel_sequence_finish(). */
3689 : static void
3690 0 : bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
3691 : {
3692 0 : spdk_accel_sequence_finish(seq, cb_fn, cb_arg);
3693 0 : }
3694 :
/* .abort_sequence entry of g_bdev_nvme_accel_fn_table: forwards to spdk_accel_sequence_abort(). */
3695 : static void
3696 0 : bdev_nvme_abort_sequence(void *seq)
3697 : {
3698 0 : spdk_accel_sequence_abort(seq);
3699 0 : }
3700 :
/* .reverse_sequence entry of g_bdev_nvme_accel_fn_table: forwards to spdk_accel_sequence_reverse(). */
3701 : static void
3702 0 : bdev_nvme_reverse_sequence(void *seq)
3703 : {
3704 0 : spdk_accel_sequence_reverse(seq);
3705 0 : }
3706 :
/* .append_crc32c entry of g_bdev_nvme_accel_fn_table. `ctx` is the nvme_poll_group.
 * Appends a crc32c step to *seq on the group's accel channel and returns the result
 * of spdk_accel_append_crc32c(), or -ENOMEM when no accel channel is available.
 */
3707 : static int
3708 0 : bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt,
3709 : struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed,
3710 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3711 : {
3712 : struct spdk_io_channel *ch;
3713 0 : struct nvme_poll_group *group = ctx;
3714 :
3715 0 : ch = bdev_nvme_get_accel_channel(group);
3716 0 : if (spdk_unlikely(ch == NULL)) {
3717 0 : return -ENOMEM;
3718 : }
3719 :
3720 0 : return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt,
3721 0 : domain, domain_ctx, seed, cb_fn, cb_arg);
3722 0 : }
3723 :
/* .append_copy entry of g_bdev_nvme_accel_fn_table. `ctx` is the nvme_poll_group.
 * Appends a copy step to *seq on the group's accel channel and returns the result
 * of spdk_accel_append_copy(), or -ENOMEM when no accel channel is available.
 */
3724 : static int
3725 0 : bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt,
3726 : struct spdk_memory_domain *dst_domain, void *dst_domain_ctx,
3727 : struct iovec *src_iovs, uint32_t src_iovcnt,
3728 : struct spdk_memory_domain *src_domain, void *src_domain_ctx,
3729 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3730 : {
3731 : struct spdk_io_channel *ch;
3732 0 : struct nvme_poll_group *group = ctx;
3733 :
3734 0 : ch = bdev_nvme_get_accel_channel(group);
3735 0 : if (spdk_unlikely(ch == NULL)) {
3736 0 : return -ENOMEM;
3737 : }
3738 :
3739 0 : return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch,
3740 0 : dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx,
3741 0 : src_iovs, src_iovcnt, src_domain, src_domain_ctx,
3742 0 : cb_fn, cb_arg);
3743 0 : }
3744 :
/* Accel callbacks handed to spdk_nvme_poll_group_create() in
 * bdev_nvme_create_poll_group_cb(). Each entry is one of the bdev_nvme_* accel
 * wrapper functions defined in this file.
 */
3745 : static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
3746 : .table_size = sizeof(struct spdk_nvme_accel_fn_table),
3747 : .append_crc32c = bdev_nvme_append_crc32c,
3748 : .append_copy = bdev_nvme_append_copy,
3749 : .finish_sequence = bdev_nvme_finish_sequence,
3750 : .reverse_sequence = bdev_nvme_reverse_sequence,
3751 : .abort_sequence = bdev_nvme_abort_sequence,
3752 : };
3753 :
/* Interrupt callback registered on the NVMe poll group; `ctx` is the nvme_poll_group
 * passed at registration, and is handed to bdev_nvme_poll().
 */
3754 : static void
3755 0 : bdev_nvme_poll_group_interrupt_cb(struct spdk_nvme_poll_group *group, void *ctx)
3756 : {
3757 0 : bdev_nvme_poll(ctx);
3758 0 : }
3759 :
3760 : static int
3761 46 : bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
3762 : {
3763 46 : struct nvme_poll_group *group = ctx_buf;
3764 : struct spdk_fd_group *fgrp;
3765 : uint64_t period;
3766 : int rc;
3767 :
3768 46 : TAILQ_INIT(&group->qpair_list);
3769 :
3770 46 : group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
3771 46 : if (group->group == NULL) {
3772 0 : return -1;
3773 : }
3774 :
3775 46 : period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_ioq_poll_period_us;
3776 46 : group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, period);
3777 :
3778 46 : if (group->poller == NULL) {
3779 0 : spdk_nvme_poll_group_destroy(group->group);
3780 0 : return -1;
3781 : }
3782 :
3783 46 : if (spdk_interrupt_mode_is_enabled()) {
3784 0 : spdk_poller_register_interrupt(group->poller, NULL, NULL);
3785 :
3786 0 : fgrp = spdk_nvme_poll_group_get_fd_group(group->group);
3787 0 : if (fgrp == NULL) {
3788 0 : spdk_nvme_poll_group_destroy(group->group);
3789 0 : return -1;
3790 : }
3791 :
3792 0 : rc = spdk_nvme_poll_group_set_interrupt_callback(group->group,
3793 0 : bdev_nvme_poll_group_interrupt_cb, group);
3794 0 : if (rc != 0) {
3795 0 : spdk_nvme_poll_group_destroy(group->group);
3796 0 : return -1;
3797 : }
3798 :
3799 0 : group->intr = spdk_interrupt_register_fd_group(fgrp, "bdev_nvme_interrupt");
3800 0 : if (!group->intr) {
3801 0 : spdk_nvme_poll_group_destroy(group->group);
3802 0 : return -1;
3803 : }
3804 0 : }
3805 :
3806 46 : return 0;
3807 46 : }
3808 :
/* I/O channel destroy callback for the poll-group io_device. The group's qpair_list
 * must already be empty. Releases the cached accel channel (if any), the interrupt
 * (in interrupt mode), the poller, and finally the NVMe poll group.
 */
3809 : static void
3810 46 : bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
3811 : {
3812 46 : struct nvme_poll_group *group = ctx_buf;
3813 :
3814 46 : assert(TAILQ_EMPTY(&group->qpair_list));
3815 :
3816 46 : if (group->accel_channel) {
3817 0 : spdk_put_io_channel(group->accel_channel);
3818 0 : }
3819 :
3820 46 : if (spdk_interrupt_mode_is_enabled()) {
3821 0 : spdk_interrupt_unregister(&group->intr);
3822 0 : }
3823 :
3824 46 : spdk_poller_unregister(&group->poller);
3825 46 : if (spdk_nvme_poll_group_destroy(group->group)) {
3826 0 : SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
3827 0 : assert(false);
3828 : }
3829 46 : }
3830 :
/* Return an I/O channel for the bdev; `ctx` is the nvme_bdev, which is itself used
 * as the io_device handle.
 */
3831 : static struct spdk_io_channel *
3832 0 : bdev_nvme_get_io_channel(void *ctx)
3833 : {
3834 0 : struct nvme_bdev *nbdev = ctx;
3835 :
3836 0 : return spdk_get_io_channel(nbdev);
3837 : }
3838 :
/* Return the spdk_nvme_ns of the first namespace attached to the bdev.
 * Returns NULL when ctx is NULL, when the bdev does not belong to this module
 * (disk.module != &nvme_if), or when the bdev has no namespace in its list.
 */
3839 : static void *
3840 0 : bdev_nvme_get_module_ctx(void *ctx)
3841 : {
3842 0 : struct nvme_bdev *nbdev = ctx;
3843 : struct nvme_ns *nvme_ns;
3844 :
3845 0 : if (!nbdev || nbdev->disk.module != &nvme_if) {
3846 0 : return NULL;
3847 : }
3848 :
3849 0 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
3850 0 : if (!nvme_ns) {
3851 0 : return NULL;
3852 : }
3853 :
3854 0 : return nvme_ns->ns;
3855 0 : }
3856 :
3857 : static const char *
3858 0 : _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
3859 : {
3860 0 : switch (ana_state) {
3861 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
3862 0 : return "optimized";
3863 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
3864 0 : return "non_optimized";
3865 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
3866 0 : return "inaccessible";
3867 : case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
3868 0 : return "persistent_loss";
3869 : case SPDK_NVME_ANA_CHANGE_STATE:
3870 0 : return "change";
3871 : default:
3872 0 : return NULL;
3873 : }
3874 0 : }
3875 :
3876 : static int
3877 8 : bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
3878 : {
3879 8 : struct spdk_memory_domain **_domains = NULL;
3880 8 : struct nvme_bdev *nbdev = ctx;
3881 : struct nvme_ns *nvme_ns;
3882 8 : int i = 0, _array_size = array_size;
3883 8 : int rc = 0;
3884 :
3885 22 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
3886 14 : if (domains && array_size >= i) {
3887 11 : _domains = &domains[i];
3888 11 : } else {
3889 3 : _domains = NULL;
3890 : }
3891 14 : rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size);
3892 14 : if (rc > 0) {
3893 13 : i += rc;
3894 13 : if (_array_size >= rc) {
3895 9 : _array_size -= rc;
3896 9 : } else {
3897 4 : _array_size = 0;
3898 : }
3899 14 : } else if (rc < 0) {
3900 0 : return rc;
3901 : }
3902 14 : }
3903 :
3904 8 : return i;
3905 8 : }
3906 :
3907 : static const char *
3908 0 : nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr)
3909 : {
3910 0 : if (nvme_ctrlr->destruct) {
3911 0 : return "deleting";
3912 0 : } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
3913 0 : return "failed";
3914 0 : } else if (nvme_ctrlr->resetting) {
3915 0 : return "resetting";
3916 0 : } else if (nvme_ctrlr->reconnect_is_delayed > 0) {
3917 0 : return "reconnect_is_delayed";
3918 0 : } else if (nvme_ctrlr->disabled) {
3919 0 : return "disabled";
3920 : } else {
3921 0 : return "enabled";
3922 : }
3923 0 : }
3924 :
3925 : void
3926 0 : nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr)
3927 : {
3928 : struct spdk_nvme_transport_id *trid;
3929 : const struct spdk_nvme_ctrlr_opts *opts;
3930 : const struct spdk_nvme_ctrlr_data *cdata;
3931 : struct nvme_path_id *path_id;
3932 : int32_t numa_id;
3933 :
3934 0 : spdk_json_write_object_begin(w);
3935 :
3936 0 : spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr));
3937 :
3938 : #ifdef SPDK_CONFIG_NVME_CUSE
3939 : size_t cuse_name_size = 128;
3940 : char cuse_name[cuse_name_size];
3941 :
3942 : int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size);
3943 : if (rc == 0) {
3944 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
3945 : }
3946 : #endif
3947 0 : trid = &nvme_ctrlr->active_path_id->trid;
3948 0 : spdk_json_write_named_object_begin(w, "trid");
3949 0 : nvme_bdev_dump_trid_json(trid, w);
3950 0 : spdk_json_write_object_end(w);
3951 :
3952 0 : path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link);
3953 0 : if (path_id != NULL) {
3954 0 : spdk_json_write_named_array_begin(w, "alternate_trids");
3955 0 : do {
3956 0 : trid = &path_id->trid;
3957 0 : spdk_json_write_object_begin(w);
3958 0 : nvme_bdev_dump_trid_json(trid, w);
3959 0 : spdk_json_write_object_end(w);
3960 :
3961 0 : path_id = TAILQ_NEXT(path_id, link);
3962 0 : } while (path_id != NULL);
3963 0 : spdk_json_write_array_end(w);
3964 0 : }
3965 :
3966 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
3967 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
3968 :
3969 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
3970 0 : spdk_json_write_named_object_begin(w, "host");
3971 0 : spdk_json_write_named_string(w, "nqn", opts->hostnqn);
3972 0 : spdk_json_write_named_string(w, "addr", opts->src_addr);
3973 0 : spdk_json_write_named_string(w, "svcid", opts->src_svcid);
3974 0 : spdk_json_write_object_end(w);
3975 :
3976 0 : numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr);
3977 0 : if (numa_id != SPDK_ENV_NUMA_ID_ANY) {
3978 0 : spdk_json_write_named_uint32(w, "numa_id", numa_id);
3979 0 : }
3980 0 : spdk_json_write_object_end(w);
3981 0 : }
3982 :
3983 : static void
3984 0 : nvme_namespace_info_json(struct spdk_json_write_ctx *w,
3985 : struct nvme_ns *nvme_ns)
3986 : {
3987 : struct spdk_nvme_ns *ns;
3988 : struct spdk_nvme_ctrlr *ctrlr;
3989 : const struct spdk_nvme_ctrlr_data *cdata;
3990 : const struct spdk_nvme_transport_id *trid;
3991 : union spdk_nvme_vs_register vs;
3992 : const struct spdk_nvme_ns_data *nsdata;
3993 : char buf[128];
3994 :
3995 0 : ns = nvme_ns->ns;
3996 0 : if (ns == NULL) {
3997 0 : return;
3998 : }
3999 :
4000 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
4001 :
4002 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
4003 0 : trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
4004 0 : vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
4005 :
4006 0 : spdk_json_write_object_begin(w);
4007 :
4008 0 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
4009 0 : spdk_json_write_named_string(w, "pci_address", trid->traddr);
4010 0 : }
4011 :
4012 0 : spdk_json_write_named_object_begin(w, "trid");
4013 :
4014 0 : nvme_bdev_dump_trid_json(trid, w);
4015 :
4016 0 : spdk_json_write_object_end(w);
4017 :
4018 : #ifdef SPDK_CONFIG_NVME_CUSE
4019 : size_t cuse_name_size = 128;
4020 : char cuse_name[cuse_name_size];
4021 :
4022 : int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
4023 : cuse_name, &cuse_name_size);
4024 : if (rc == 0) {
4025 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
4026 : }
4027 : #endif
4028 :
4029 0 : spdk_json_write_named_object_begin(w, "ctrlr_data");
4030 :
4031 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
4032 :
4033 0 : spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
4034 :
4035 0 : snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
4036 0 : spdk_str_trim(buf);
4037 0 : spdk_json_write_named_string(w, "model_number", buf);
4038 :
4039 0 : snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
4040 0 : spdk_str_trim(buf);
4041 0 : spdk_json_write_named_string(w, "serial_number", buf);
4042 :
4043 0 : snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
4044 0 : spdk_str_trim(buf);
4045 0 : spdk_json_write_named_string(w, "firmware_revision", buf);
4046 :
4047 0 : if (cdata->subnqn[0] != '\0') {
4048 0 : spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
4049 0 : }
4050 :
4051 0 : spdk_json_write_named_object_begin(w, "oacs");
4052 :
4053 0 : spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
4054 0 : spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
4055 0 : spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
4056 0 : spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
4057 :
4058 0 : spdk_json_write_object_end(w);
4059 :
4060 0 : spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr);
4061 0 : spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting);
4062 :
4063 0 : spdk_json_write_object_end(w);
4064 :
4065 0 : spdk_json_write_named_object_begin(w, "vs");
4066 :
4067 0 : spdk_json_write_name(w, "nvme_version");
4068 0 : if (vs.bits.ter) {
4069 0 : spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
4070 0 : } else {
4071 0 : spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
4072 : }
4073 :
4074 0 : spdk_json_write_object_end(w);
4075 :
4076 0 : nsdata = spdk_nvme_ns_get_data(ns);
4077 :
4078 0 : spdk_json_write_named_object_begin(w, "ns_data");
4079 :
4080 0 : spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
4081 :
4082 0 : if (cdata->cmic.ana_reporting) {
4083 0 : spdk_json_write_named_string(w, "ana_state",
4084 0 : _nvme_ana_state_str(nvme_ns->ana_state));
4085 0 : }
4086 :
4087 0 : spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share);
4088 :
4089 0 : spdk_json_write_object_end(w);
4090 :
4091 0 : if (cdata->oacs.security) {
4092 0 : spdk_json_write_named_object_begin(w, "security");
4093 :
4094 0 : spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
4095 :
4096 0 : spdk_json_write_object_end(w);
4097 0 : }
4098 :
4099 0 : spdk_json_write_object_end(w);
4100 0 : }
4101 :
4102 : static const char *
4103 0 : nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev)
4104 : {
4105 0 : switch (nbdev->mp_policy) {
4106 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
4107 0 : return "active_passive";
4108 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
4109 0 : return "active_active";
4110 : default:
4111 0 : assert(false);
4112 : return "invalid";
4113 : }
4114 0 : }
4115 :
4116 : static const char *
4117 0 : nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev)
4118 : {
4119 0 : switch (nbdev->mp_selector) {
4120 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
4121 0 : return "round_robin";
4122 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
4123 0 : return "queue_depth";
4124 : default:
4125 0 : assert(false);
4126 : return "invalid";
4127 : }
4128 0 : }
4129 :
4130 : static int
4131 0 : bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
4132 : {
4133 0 : struct nvme_bdev *nbdev = ctx;
4134 : struct nvme_ns *nvme_ns;
4135 :
4136 0 : pthread_mutex_lock(&nbdev->mutex);
4137 0 : spdk_json_write_named_array_begin(w, "nvme");
4138 0 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
4139 0 : nvme_namespace_info_json(w, nvme_ns);
4140 0 : }
4141 0 : spdk_json_write_array_end(w);
4142 0 : spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nbdev));
4143 0 : if (nbdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
4144 0 : spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nbdev));
4145 0 : if (nbdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
4146 0 : spdk_json_write_named_uint32(w, "rr_min_io", nbdev->rr_min_io);
4147 0 : }
4148 0 : }
4149 0 : pthread_mutex_unlock(&nbdev->mutex);
4150 :
4151 0 : return 0;
4152 : }
4153 :
/*
 * bdev fn_table hook: intentionally empty, as no per-bdev configuration is
 * emitted for NVMe bdevs.
 */
static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
	(void)bdev;
	(void)w;
}
4159 :
4160 : static uint64_t
4161 0 : bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
4162 : {
4163 0 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
4164 : struct nvme_io_path *io_path;
4165 : struct nvme_poll_group *group;
4166 0 : uint64_t spin_time = 0;
4167 :
4168 0 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
4169 0 : group = io_path->qpair->group;
4170 :
4171 0 : if (!group || !group->collect_spin_stat) {
4172 0 : continue;
4173 : }
4174 :
4175 0 : if (group->end_ticks != 0) {
4176 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
4177 0 : group->end_ticks = 0;
4178 0 : }
4179 :
4180 0 : spin_time += group->spin_ticks;
4181 0 : group->start_ticks = 0;
4182 0 : group->spin_ticks = 0;
4183 0 : }
4184 :
4185 0 : return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
4186 : }
4187 :
4188 : static void
4189 0 : bdev_nvme_reset_device_stat(void *ctx)
4190 : {
4191 0 : struct nvme_bdev *nbdev = ctx;
4192 :
4193 0 : if (nbdev->err_stat != NULL) {
4194 0 : memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat));
4195 0 : }
4196 0 : }
4197 :
/*
 * Convert an NVMe status string into a JSON-friendly key: lowercase and
 * underscore-delimited. " - ", "-" and " " are each replaced by "_".
 *
 * dst is written with a size limit of 256 bytes, so callers must supply a
 * buffer at least that large.
 */
static void
bdev_nvme_format_nvme_status(char *dst, const char *src)
{
	char scratch[256];

	/* Bounce between dst and the scratch buffer, one replacement per pass. */
	spdk_strcpy_replace(dst, 256, src, " - ", "_");
	spdk_strcpy_replace(scratch, sizeof(scratch), dst, "-", "_");
	spdk_strcpy_replace(dst, 256, scratch, " ", "_");

	spdk_strlwr(dst);
}
4209 :
4210 : static void
4211 0 : bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w)
4212 : {
4213 0 : struct nvme_bdev *nbdev = ctx;
4214 0 : struct spdk_nvme_status status = {};
4215 : uint16_t sct, sc;
4216 : char status_json[256];
4217 : const char *status_str;
4218 :
4219 0 : if (nbdev->err_stat == NULL) {
4220 0 : return;
4221 : }
4222 :
4223 0 : spdk_json_write_named_object_begin(w, "nvme_error");
4224 :
4225 0 : spdk_json_write_named_object_begin(w, "status_type");
4226 0 : for (sct = 0; sct < 8; sct++) {
4227 0 : if (nbdev->err_stat->status_type[sct] == 0) {
4228 0 : continue;
4229 : }
4230 0 : status.sct = sct;
4231 :
4232 0 : status_str = spdk_nvme_cpl_get_status_type_string(&status);
4233 0 : assert(status_str != NULL);
4234 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4235 :
4236 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]);
4237 0 : }
4238 0 : spdk_json_write_object_end(w);
4239 :
4240 0 : spdk_json_write_named_object_begin(w, "status_code");
4241 0 : for (sct = 0; sct < 4; sct++) {
4242 0 : status.sct = sct;
4243 0 : for (sc = 0; sc < 256; sc++) {
4244 0 : if (nbdev->err_stat->status[sct][sc] == 0) {
4245 0 : continue;
4246 : }
4247 0 : status.sc = sc;
4248 :
4249 0 : status_str = spdk_nvme_cpl_get_status_string(&status);
4250 0 : assert(status_str != NULL);
4251 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4252 :
4253 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]);
4254 0 : }
4255 0 : }
4256 0 : spdk_json_write_object_end(w);
4257 :
4258 0 : spdk_json_write_object_end(w);
4259 0 : }
4260 :
4261 : static bool
4262 0 : bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
4263 : {
4264 0 : struct nvme_bdev *nbdev = ctx;
4265 : struct nvme_ns *nvme_ns;
4266 : struct spdk_nvme_ctrlr *ctrlr;
4267 :
4268 0 : if (!g_opts.allow_accel_sequence) {
4269 0 : return false;
4270 : }
4271 :
4272 0 : switch (type) {
4273 : case SPDK_BDEV_IO_TYPE_WRITE:
4274 : case SPDK_BDEV_IO_TYPE_READ:
4275 0 : break;
4276 : default:
4277 0 : return false;
4278 : }
4279 :
4280 0 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
4281 0 : assert(nvme_ns != NULL);
4282 :
4283 0 : ctrlr = nvme_ns->ctrlr->ctrlr;
4284 0 : assert(ctrlr != NULL);
4285 :
4286 0 : return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED;
4287 0 : }
4288 :
4289 : static const struct spdk_bdev_fn_table nvmelib_fn_table = {
4290 : .destruct = bdev_nvme_destruct,
4291 : .submit_request = bdev_nvme_submit_request,
4292 : .io_type_supported = bdev_nvme_io_type_supported,
4293 : .get_io_channel = bdev_nvme_get_io_channel,
4294 : .dump_info_json = bdev_nvme_dump_info_json,
4295 : .write_config_json = bdev_nvme_write_config_json,
4296 : .get_spin_time = bdev_nvme_get_spin_time,
4297 : .get_module_ctx = bdev_nvme_get_module_ctx,
4298 : .get_memory_domains = bdev_nvme_get_memory_domains,
4299 : .accel_sequence_supported = bdev_nvme_accel_sequence_supported,
4300 : .reset_device_stat = bdev_nvme_reset_device_stat,
4301 : .dump_device_stat_json = bdev_nvme_dump_device_stat_json,
4302 : };
4303 :
4304 : typedef int (*bdev_nvme_parse_ana_log_page_cb)(
4305 : const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
4306 :
4307 : static int
4308 42 : bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
4309 : bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
4310 : {
4311 : struct spdk_nvme_ana_group_descriptor *copied_desc;
4312 : uint8_t *orig_desc;
4313 : uint32_t i, desc_size, copy_len;
4314 42 : int rc = 0;
4315 :
4316 42 : if (nvme_ctrlr->ana_log_page == NULL) {
4317 0 : return -EINVAL;
4318 : }
4319 :
4320 42 : copied_desc = nvme_ctrlr->copied_ana_desc;
4321 :
4322 42 : orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
4323 42 : copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
4324 :
4325 72 : for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
4326 67 : memcpy(copied_desc, orig_desc, copy_len);
4327 :
4328 67 : rc = cb_fn(copied_desc, cb_arg);
4329 67 : if (rc != 0) {
4330 37 : break;
4331 : }
4332 :
4333 30 : desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
4334 30 : copied_desc->num_of_nsid * sizeof(uint32_t);
4335 30 : orig_desc += desc_size;
4336 30 : copy_len -= desc_size;
4337 30 : }
4338 :
4339 42 : return rc;
4340 42 : }
4341 :
4342 : static int
4343 5 : nvme_ns_ana_transition_timedout(void *ctx)
4344 : {
4345 5 : struct nvme_ns *nvme_ns = ctx;
4346 :
4347 5 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4348 5 : nvme_ns->ana_transition_timedout = true;
4349 :
4350 5 : return SPDK_POLLER_BUSY;
4351 : }
4352 :
4353 : static void
4354 46 : _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns,
4355 : const struct spdk_nvme_ana_group_descriptor *desc)
4356 : {
4357 : const struct spdk_nvme_ctrlr_data *cdata;
4358 :
4359 46 : nvme_ns->ana_group_id = desc->ana_group_id;
4360 46 : nvme_ns->ana_state = desc->ana_state;
4361 46 : nvme_ns->ana_state_updating = false;
4362 :
4363 46 : switch (nvme_ns->ana_state) {
4364 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
4365 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
4366 39 : nvme_ns->ana_transition_timedout = false;
4367 39 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4368 39 : break;
4369 :
4370 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
4371 : case SPDK_NVME_ANA_CHANGE_STATE:
4372 6 : if (nvme_ns->anatt_timer != NULL) {
4373 1 : break;
4374 : }
4375 :
4376 5 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
4377 5 : nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout,
4378 : nvme_ns,
4379 : cdata->anatt * SPDK_SEC_TO_USEC);
4380 5 : break;
4381 : default:
4382 1 : break;
4383 : }
4384 46 : }
4385 :
4386 : static int
4387 60 : nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
4388 : {
4389 60 : struct nvme_ns *nvme_ns = cb_arg;
4390 : uint32_t i;
4391 :
4392 60 : assert(nvme_ns->ns != NULL);
4393 :
4394 82 : for (i = 0; i < desc->num_of_nsid; i++) {
4395 59 : if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
4396 22 : continue;
4397 : }
4398 :
4399 37 : _nvme_ns_set_ana_state(nvme_ns, desc);
4400 37 : return 1;
4401 : }
4402 :
4403 23 : return 0;
4404 60 : }
4405 :
4406 : static int
4407 5 : nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid)
4408 : {
4409 5 : int rc = 0;
4410 : struct spdk_uuid new_uuid, namespace_uuid;
4411 5 : char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'};
4412 : /* This namespace UUID was generated using uuid_generate() method. */
4413 5 : const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"};
4414 : int size;
4415 :
4416 5 : assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN);
4417 :
4418 5 : spdk_uuid_set_null(&new_uuid);
4419 5 : spdk_uuid_set_null(&namespace_uuid);
4420 :
4421 5 : size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid);
4422 5 : if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) {
4423 0 : return -EINVAL;
4424 : }
4425 :
4426 5 : spdk_uuid_parse(&namespace_uuid, namespace_str);
4427 :
4428 5 : rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size);
4429 5 : if (rc == 0) {
4430 5 : memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid));
4431 5 : }
4432 :
4433 5 : return rc;
4434 5 : }
4435 :
4436 : static int
4437 39 : nbdev_create(struct spdk_bdev *disk, const char *base_name,
4438 : struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
4439 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx)
4440 : {
4441 : const struct spdk_uuid *uuid;
4442 : const uint8_t *nguid;
4443 : const struct spdk_nvme_ctrlr_data *cdata;
4444 : const struct spdk_nvme_ns_data *nsdata;
4445 : const struct spdk_nvme_ctrlr_opts *opts;
4446 : enum spdk_nvme_csi csi;
4447 : uint32_t atomic_bs, phys_bs, bs;
4448 39 : char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'};
4449 : int rc;
4450 :
4451 39 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
4452 39 : csi = spdk_nvme_ns_get_csi(ns);
4453 39 : opts = spdk_nvme_ctrlr_get_opts(ctrlr);
4454 :
4455 39 : switch (csi) {
4456 : case SPDK_NVME_CSI_NVM:
4457 39 : disk->product_name = "NVMe disk";
4458 39 : break;
4459 : case SPDK_NVME_CSI_ZNS:
4460 0 : disk->product_name = "NVMe ZNS disk";
4461 0 : disk->zoned = true;
4462 0 : disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4463 0 : disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
4464 0 : spdk_nvme_ns_get_extended_sector_size(ns);
4465 0 : disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
4466 0 : disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
4467 0 : break;
4468 : default:
4469 0 : if (bdev_opts->allow_unrecognized_csi) {
4470 0 : disk->product_name = "NVMe Passthrough disk";
4471 0 : break;
4472 : }
4473 0 : SPDK_ERRLOG("unsupported CSI: %u\n", csi);
4474 0 : return -ENOTSUP;
4475 : }
4476 :
4477 39 : nguid = spdk_nvme_ns_get_nguid(ns);
4478 39 : if (!nguid) {
4479 39 : uuid = spdk_nvme_ns_get_uuid(ns);
4480 39 : if (uuid) {
4481 12 : disk->uuid = *uuid;
4482 39 : } else if (g_opts.generate_uuids) {
4483 0 : spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0');
4484 0 : rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid);
4485 0 : if (rc < 0) {
4486 0 : SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc));
4487 0 : return rc;
4488 : }
4489 0 : }
4490 39 : } else {
4491 0 : memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
4492 : }
4493 :
4494 39 : disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
4495 39 : if (!disk->name) {
4496 0 : return -ENOMEM;
4497 : }
4498 :
4499 39 : disk->write_cache = 0;
4500 39 : if (cdata->vwc.present) {
4501 : /* Enable if the Volatile Write Cache exists */
4502 0 : disk->write_cache = 1;
4503 0 : }
4504 39 : if (cdata->oncs.write_zeroes) {
4505 0 : disk->max_write_zeroes = UINT16_MAX + 1;
4506 0 : }
4507 39 : disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
4508 39 : disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
4509 39 : disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr);
4510 39 : disk->ctratt.raw = cdata->ctratt.raw;
4511 39 : disk->nsid = spdk_nvme_ns_get_id(ns);
4512 : /* NVMe driver will split one request into multiple requests
4513 : * based on MDTS and stripe boundary, the bdev layer will use
4514 : * max_segment_size and max_num_segments to split one big IO
4515 : * into multiple requests, then small request can't run out
4516 : * of NVMe internal requests data structure.
4517 : */
4518 39 : if (opts && opts->io_queue_requests) {
4519 0 : disk->max_num_segments = opts->io_queue_requests / 2;
4520 0 : }
4521 39 : if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
4522 : /* The nvme driver will try to split I/O that have too many
4523 : * SGEs, but it doesn't work if that last SGE doesn't end on
4524 : * an aggregate total that is block aligned. The bdev layer has
4525 : * a more robust splitting framework, so use that instead for
4526 : * this case. (See issue #3269.)
4527 : */
4528 0 : uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr);
4529 :
4530 0 : if (disk->max_num_segments == 0) {
4531 0 : disk->max_num_segments = max_sges;
4532 0 : } else {
4533 0 : disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges);
4534 : }
4535 0 : }
4536 39 : disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
4537 :
4538 39 : nsdata = spdk_nvme_ns_get_data(ns);
4539 39 : bs = spdk_nvme_ns_get_sector_size(ns);
4540 39 : atomic_bs = bs;
4541 39 : phys_bs = bs;
4542 39 : if (nsdata->nabo == 0) {
4543 39 : if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
4544 0 : atomic_bs = bs * (1 + nsdata->nawupf);
4545 0 : } else {
4546 39 : atomic_bs = bs * (1 + cdata->awupf);
4547 : }
4548 39 : }
4549 39 : if (nsdata->nsfeat.optperf) {
4550 0 : phys_bs = bs * (1 + nsdata->npwg);
4551 0 : }
4552 39 : disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
4553 :
4554 39 : disk->md_len = spdk_nvme_ns_get_md_size(ns);
4555 39 : if (disk->md_len != 0) {
4556 0 : disk->md_interleave = nsdata->flbas.extended;
4557 0 : disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
4558 0 : if (disk->dif_type != SPDK_DIF_DISABLE) {
4559 0 : disk->dif_is_head_of_md = nsdata->dps.md_start;
4560 0 : disk->dif_check_flags = bdev_opts->prchk_flags;
4561 0 : disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns);
4562 0 : }
4563 0 : }
4564 :
4565 39 : if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
4566 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
4567 39 : disk->acwu = 0;
4568 39 : } else if (nsdata->nsfeat.ns_atomic_write_unit) {
4569 0 : disk->acwu = nsdata->nacwu + 1; /* 0-based */
4570 0 : } else {
4571 0 : disk->acwu = cdata->acwu + 1; /* 0-based */
4572 : }
4573 :
4574 39 : if (cdata->oncs.copy) {
4575 : /* For now bdev interface allows only single segment copy */
4576 0 : disk->max_copy = nsdata->mssrl;
4577 0 : }
4578 :
4579 39 : disk->ctxt = ctx;
4580 39 : disk->fn_table = &nvmelib_fn_table;
4581 39 : disk->module = &nvme_if;
4582 :
4583 39 : disk->numa.id_valid = 1;
4584 39 : disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
4585 :
4586 39 : return 0;
4587 39 : }
4588 :
4589 : static struct nvme_bdev *
4590 39 : nvme_bdev_alloc(void)
4591 : {
4592 : struct nvme_bdev *nbdev;
4593 : int rc;
4594 :
4595 39 : nbdev = calloc(1, sizeof(*nbdev));
4596 39 : if (!nbdev) {
4597 0 : SPDK_ERRLOG("nbdev calloc() failed\n");
4598 0 : return NULL;
4599 : }
4600 :
4601 39 : if (g_opts.nvme_error_stat) {
4602 0 : nbdev->err_stat = calloc(1, sizeof(struct nvme_error_stat));
4603 0 : if (!nbdev->err_stat) {
4604 0 : SPDK_ERRLOG("err_stat calloc() failed\n");
4605 0 : free(nbdev);
4606 0 : return NULL;
4607 : }
4608 0 : }
4609 :
4610 39 : rc = pthread_mutex_init(&nbdev->mutex, NULL);
4611 39 : if (rc != 0) {
4612 0 : free(nbdev->err_stat);
4613 0 : free(nbdev);
4614 0 : return NULL;
4615 : }
4616 :
4617 39 : nbdev->ref = 1;
4618 39 : nbdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
4619 39 : nbdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
4620 39 : nbdev->rr_min_io = UINT32_MAX;
4621 39 : TAILQ_INIT(&nbdev->nvme_ns_list);
4622 :
4623 39 : return nbdev;
4624 39 : }
4625 :
4626 : static int
4627 39 : nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4628 : {
4629 : struct nvme_bdev *nbdev;
4630 39 : struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
4631 : int rc;
4632 :
4633 39 : nbdev = nvme_bdev_alloc();
4634 39 : if (nbdev == NULL) {
4635 0 : SPDK_ERRLOG("Failed to allocate NVMe bdev\n");
4636 0 : return -ENOMEM;
4637 : }
4638 :
4639 39 : nbdev->opal = nvme_ctrlr->opal_dev != NULL;
4640 :
4641 78 : rc = nbdev_create(&nbdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
4642 39 : nvme_ns->ns, &nvme_ctrlr->opts, nbdev);
4643 39 : if (rc != 0) {
4644 0 : SPDK_ERRLOG("Failed to create NVMe disk\n");
4645 0 : nvme_bdev_free(nbdev);
4646 0 : return rc;
4647 : }
4648 :
4649 78 : spdk_io_device_register(nbdev,
4650 : bdev_nvme_create_bdev_channel_cb,
4651 : bdev_nvme_destroy_bdev_channel_cb,
4652 : sizeof(struct nvme_bdev_channel),
4653 39 : nbdev->disk.name);
4654 :
4655 39 : nvme_ns->bdev = nbdev;
4656 39 : nbdev->nsid = nvme_ns->id;
4657 39 : TAILQ_INSERT_TAIL(&nbdev->nvme_ns_list, nvme_ns, tailq);
4658 :
4659 39 : pthread_mutex_lock(&g_bdev_nvme_mutex);
4660 :
4661 39 : nbdev->nbdev_ctrlr = nbdev_ctrlr;
4662 39 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, nbdev, tailq);
4663 :
4664 39 : rc = spdk_bdev_register(&nbdev->disk);
4665 39 : if (rc != 0) {
4666 1 : SPDK_ERRLOG("spdk_bdev_register() failed\n");
4667 1 : spdk_io_device_unregister(nbdev, NULL);
4668 1 : nvme_ns->bdev = NULL;
4669 :
4670 1 : TAILQ_REMOVE(&nbdev_ctrlr->bdevs, nbdev, tailq);
4671 :
4672 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
4673 :
4674 1 : nvme_bdev_free(nbdev);
4675 1 : return rc;
4676 : }
4677 :
4678 38 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
4679 :
4680 38 : return 0;
4681 39 : }
4682 :
4683 : static bool
4684 23 : bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
4685 : {
4686 : const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
4687 : const struct spdk_uuid *uuid1, *uuid2;
4688 :
4689 23 : nsdata1 = spdk_nvme_ns_get_data(ns1);
4690 23 : nsdata2 = spdk_nvme_ns_get_data(ns2);
4691 23 : uuid1 = spdk_nvme_ns_get_uuid(ns1);
4692 23 : uuid2 = spdk_nvme_ns_get_uuid(ns2);
4693 :
4694 71 : return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
4695 22 : nsdata1->eui64 == nsdata2->eui64 &&
4696 21 : ((uuid1 == NULL && uuid2 == NULL) ||
4697 29 : (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) &&
4698 18 : spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2);
4699 : }
4700 :
4701 : static bool
4702 0 : hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
4703 : struct spdk_nvme_ctrlr_opts *opts)
4704 : {
4705 : struct nvme_probe_skip_entry *entry;
4706 :
4707 0 : TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
4708 0 : if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
4709 0 : return false;
4710 : }
4711 0 : }
4712 :
4713 0 : opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
4714 0 : opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
4715 0 : opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
4716 0 : opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
4717 0 : opts->disable_read_ana_log_page = true;
4718 :
4719 0 : SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
4720 :
4721 0 : return true;
4722 0 : }
4723 :
4724 : static void
4725 0 : nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
4726 : {
4727 0 : struct nvme_ctrlr *nvme_ctrlr = ctx;
4728 :
4729 0 : if (spdk_nvme_cpl_is_error(cpl)) {
4730 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Abort failed. Resetting controller. sc is %u, sct is %u.\n",
4731 : cpl->status.sc, cpl->status.sct);
4732 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4733 0 : } else if (cpl->cdw0 & 0x1) {
4734 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Specified command could not be aborted.\n");
4735 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4736 0 : }
4737 0 : }
4738 :
4739 : static void
4740 0 : timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
4741 : struct spdk_nvme_qpair *qpair, uint16_t cid)
4742 : {
4743 0 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
4744 : union spdk_nvme_csts_register csts;
4745 : int rc;
4746 :
4747 0 : assert(nvme_ctrlr->ctrlr == ctrlr);
4748 :
4749 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n",
4750 : ctrlr, qpair, cid);
4751 :
4752 : /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
4753 : * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we
4754 : * would submit another fabrics cmd on the admin queue to read CSTS and check for its
4755 : * completion recursively.
4756 : */
4757 0 : if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
4758 0 : csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
4759 0 : if (csts.bits.cfs) {
4760 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Controller Fatal Status, reset required\n");
4761 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4762 0 : return;
4763 : }
4764 0 : }
4765 :
4766 0 : switch (g_opts.action_on_timeout) {
4767 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
4768 0 : if (qpair) {
4769 : /* Don't send abort to ctrlr when ctrlr is not available. */
4770 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4771 0 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
4772 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4773 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Quit abort. Ctrlr is not available.\n");
4774 0 : return;
4775 : }
4776 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4777 :
4778 0 : rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
4779 0 : nvme_abort_cpl, nvme_ctrlr);
4780 0 : if (rc == 0) {
4781 0 : return;
4782 : }
4783 :
4784 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to send abort. Resetting, rc is %d.\n", rc);
4785 0 : }
4786 :
4787 : /* FALLTHROUGH */
4788 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
4789 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4790 0 : break;
4791 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
4792 0 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "No action for nvme controller timeout.\n");
4793 0 : break;
4794 : default:
4795 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "An invalid timeout action value is found.\n");
4796 0 : break;
4797 : }
4798 0 : }
4799 :
4800 : static struct nvme_ns *
4801 52 : nvme_ns_alloc(void)
4802 : {
4803 : struct nvme_ns *nvme_ns;
4804 :
4805 52 : nvme_ns = calloc(1, sizeof(struct nvme_ns));
4806 52 : if (nvme_ns == NULL) {
4807 0 : return NULL;
4808 : }
4809 :
4810 52 : if (g_opts.io_path_stat) {
4811 0 : nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
4812 0 : if (nvme_ns->stat == NULL) {
4813 0 : free(nvme_ns);
4814 0 : return NULL;
4815 : }
4816 0 : spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
4817 0 : }
4818 :
4819 52 : return nvme_ns;
4820 52 : }
4821 :
4822 : static void
4823 52 : nvme_ns_free(struct nvme_ns *nvme_ns)
4824 : {
4825 52 : free(nvme_ns->stat);
4826 52 : free(nvme_ns);
4827 52 : }
4828 :
4829 : static void
4830 52 : nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
4831 : {
4832 52 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4833 52 : struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
4834 :
4835 52 : if (rc == 0) {
4836 50 : nvme_ns->probe_ctx = NULL;
4837 50 : nvme_ctrlr_get_ref(nvme_ctrlr);
4838 50 : } else {
4839 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4840 2 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4841 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4842 :
4843 2 : nvme_ns_free(nvme_ns);
4844 : }
4845 :
4846 52 : if (ctx) {
4847 51 : ctx->populates_in_progress--;
4848 51 : if (ctx->populates_in_progress == 0) {
4849 12 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
4850 12 : }
4851 51 : }
4852 52 : }
4853 :
/* Per-channel step of nvme_bdev_for_each_channel(): add an I/O path for the
 * namespace passed as ctx to nbdev_ch, then continue the iteration with the
 * result of that attempt.
 */
static void
bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i,
		      struct nvme_bdev *nbdev,
		      struct nvme_bdev_channel *nbdev_ch, void *ctx)
{
	struct nvme_ns *new_ns = ctx;
	int status = _bdev_nvme_add_io_path(nbdev_ch, new_ns);

	if (status != 0) {
		SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
	}

	nvme_bdev_for_each_channel_continue(i, status);
}
4869 :
/* Per-channel step of nvme_bdev_for_each_channel(): if nbdev_ch has an I/O
 * path for the namespace passed as ctx, delete it. The iteration always
 * continues with status 0.
 */
static void
bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i,
			 struct nvme_bdev *nbdev,
			 struct nvme_bdev_channel *nbdev_ch, void *ctx)
{
	struct nvme_ns *gone_ns = ctx;
	struct nvme_io_path *victim = _bdev_nvme_get_io_path(nbdev_ch, gone_ns);

	if (victim != NULL) {
		_bdev_nvme_delete_io_path(nbdev_ch, victim);
	}

	nvme_bdev_for_each_channel_continue(i, 0);
}
4885 :
/* Completion of the rollback started by bdev_nvme_add_io_path_done(): report
 * the namespace passed as ctx as failed to populate.
 */
static void
bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status)
{
	nvme_ctrlr_populate_namespace_done(ctx, -1);
}
4893 :
4894 : static void
4895 12 : bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
4896 : {
4897 12 : struct nvme_ns *nvme_ns = ctx;
4898 :
4899 12 : if (status == 0) {
4900 12 : nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
4901 12 : } else {
4902 : /* Delete the added io_paths and fail populating the namespace. */
4903 0 : nvme_bdev_for_each_channel(nbdev,
4904 : bdev_nvme_delete_io_path,
4905 0 : nvme_ns,
4906 : bdev_nvme_add_io_path_failed);
4907 : }
4908 12 : }
4909 :
4910 : static int
4911 13 : nvme_bdev_add_ns(struct nvme_bdev *nbdev, struct nvme_ns *nvme_ns)
4912 : {
4913 : struct nvme_ns *tmp_ns;
4914 : const struct spdk_nvme_ns_data *nsdata;
4915 :
4916 13 : nsdata = spdk_nvme_ns_get_data(nvme_ns->ns);
4917 13 : if (!nsdata->nmic.can_share) {
4918 0 : SPDK_ERRLOG("Namespace cannot be shared.\n");
4919 0 : return -EINVAL;
4920 : }
4921 :
4922 13 : pthread_mutex_lock(&nbdev->mutex);
4923 :
4924 13 : tmp_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
4925 13 : assert(tmp_ns != NULL);
4926 :
4927 13 : if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
4928 1 : pthread_mutex_unlock(&nbdev->mutex);
4929 1 : SPDK_ERRLOG("Namespaces are not identical.\n");
4930 1 : return -EINVAL;
4931 : }
4932 :
4933 12 : nbdev->ref++;
4934 12 : TAILQ_INSERT_TAIL(&nbdev->nvme_ns_list, nvme_ns, tailq);
4935 12 : nvme_ns->bdev = nbdev;
4936 :
4937 12 : pthread_mutex_unlock(&nbdev->mutex);
4938 :
4939 : /* Add nvme_io_path to nvme_bdev_channels dynamically. */
4940 24 : nvme_bdev_for_each_channel(nbdev,
4941 : bdev_nvme_add_io_path,
4942 12 : nvme_ns,
4943 : bdev_nvme_add_io_path_done);
4944 :
4945 12 : return 0;
4946 13 : }
4947 :
4948 : static void
4949 52 : nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4950 : {
4951 : struct spdk_nvme_ns *ns;
4952 : struct nvme_bdev *bdev;
4953 52 : int rc = 0;
4954 :
4955 52 : ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
4956 52 : if (!ns) {
4957 0 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "Invalid NS %d\n", nvme_ns->id);
4958 0 : rc = -EINVAL;
4959 0 : goto done;
4960 : }
4961 :
4962 52 : nvme_ns->ns = ns;
4963 52 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
4964 :
4965 52 : if (nvme_ctrlr->ana_log_page != NULL) {
4966 38 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
4967 38 : }
4968 :
4969 52 : bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
4970 92 : if (bdev == NULL) {
4971 39 : rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
4972 39 : } else {
4973 13 : rc = nvme_bdev_add_ns(bdev, nvme_ns);
4974 13 : if (rc == 0) {
4975 12 : return;
4976 : }
4977 : }
4978 : done:
4979 40 : nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
4980 52 : }
4981 :
4982 : static void
4983 50 : nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
4984 : {
4985 50 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4986 :
4987 50 : assert(nvme_ctrlr != NULL);
4988 :
4989 50 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4990 :
4991 50 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4992 :
4993 50 : if (nvme_ns->bdev != NULL) {
4994 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4995 0 : return;
4996 : }
4997 :
4998 50 : nvme_ns_free(nvme_ns);
4999 50 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5000 :
5001 50 : nvme_ctrlr_put_ref(nvme_ctrlr);
5002 50 : }
5003 :
/* Completion of deleting the I/O paths on every channel: finish depopulating
 * the namespace passed as ctx.
 */
static void
bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
{
	nvme_ctrlr_depopulate_namespace_done(ctx);
}
5011 :
5012 : static void
5013 50 : nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
5014 : {
5015 : struct nvme_bdev *nbdev;
5016 :
5017 50 : if (nvme_ns->depopulating) {
5018 : /* Maybe we received 2 AENs in a row */
5019 0 : return;
5020 : }
5021 50 : nvme_ns->depopulating = true;
5022 :
5023 50 : spdk_poller_unregister(&nvme_ns->anatt_timer);
5024 :
5025 50 : nbdev = nvme_ns->bdev;
5026 50 : if (nbdev != NULL) {
5027 46 : pthread_mutex_lock(&nbdev->mutex);
5028 :
5029 46 : assert(nbdev->ref > 0);
5030 46 : nbdev->ref--;
5031 46 : if (nbdev->ref == 0) {
5032 35 : pthread_mutex_unlock(&nbdev->mutex);
5033 :
5034 35 : spdk_bdev_unregister(&nbdev->disk, NULL, NULL);
5035 35 : } else {
5036 : /* spdk_bdev_unregister() is not called until the last nvme_ns is
5037 : * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
5038 : * and clear nvme_ns->bdev here.
5039 : */
5040 11 : TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq);
5041 :
5042 11 : pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
5043 11 : nvme_ns->bdev = NULL;
5044 11 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
5045 :
5046 11 : pthread_mutex_unlock(&nbdev->mutex);
5047 :
5048 : /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
5049 : * we call depopulate_namespace_done() to avoid use-after-free.
5050 : */
5051 22 : nvme_bdev_for_each_channel(nbdev,
5052 : bdev_nvme_delete_io_path,
5053 11 : nvme_ns,
5054 : bdev_nvme_delete_io_path_done);
5055 11 : return;
5056 : }
5057 35 : }
5058 :
5059 39 : nvme_ctrlr_depopulate_namespace_done(nvme_ns);
5060 50 : }
5061 :
5062 : static void
5063 63 : nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
5064 : struct nvme_async_probe_ctx *ctx)
5065 : {
5066 63 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5067 : struct nvme_ns *nvme_ns, *next;
5068 : struct spdk_nvme_ns *ns;
5069 : struct nvme_bdev *nbdev;
5070 : uint32_t nsid;
5071 : int rc;
5072 : uint64_t num_sectors;
5073 :
5074 63 : if (ctx) {
5075 : /* Initialize this count to 1 to handle the populate functions
5076 : * calling nvme_ctrlr_populate_namespace_done() immediately.
5077 : */
5078 47 : ctx->populates_in_progress = 1;
5079 47 : }
5080 :
5081 : /* First loop over our existing namespaces and see if they have been
5082 : * removed. */
5083 63 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
5084 67 : while (nvme_ns != NULL) {
5085 4 : next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
5086 :
5087 4 : if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
5088 : /* NS is still there or added again. Its attributes may have changed. */
5089 3 : ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
5090 3 : if (nvme_ns->ns != ns) {
5091 1 : assert(nvme_ns->ns == NULL);
5092 1 : nvme_ns->ns = ns;
5093 1 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "NSID %u was added\n", nvme_ns->id);
5094 1 : }
5095 :
5096 3 : num_sectors = spdk_nvme_ns_get_num_sectors(ns);
5097 3 : nbdev = nvme_ns->bdev;
5098 3 : assert(nbdev != NULL);
5099 3 : if (nbdev->disk.blockcnt != num_sectors) {
5100 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr,
5101 : "NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
5102 : nvme_ns->id,
5103 : nbdev->disk.name,
5104 : nbdev->disk.blockcnt,
5105 : num_sectors);
5106 1 : rc = spdk_bdev_notify_blockcnt_change(&nbdev->disk, num_sectors);
5107 1 : if (rc != 0) {
5108 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
5109 : "Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
5110 : nbdev->disk.name, rc);
5111 0 : }
5112 1 : }
5113 3 : } else {
5114 : /* Namespace was removed */
5115 1 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
5116 : }
5117 :
5118 4 : nvme_ns = next;
5119 : }
5120 :
5121 : /* Loop through all of the namespaces at the nvme level and see if any of them are new */
5122 63 : nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
5123 118 : while (nsid != 0) {
5124 55 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
5125 :
5126 55 : if (nvme_ns == NULL) {
5127 : /* Found a new one */
5128 52 : nvme_ns = nvme_ns_alloc();
5129 52 : if (nvme_ns == NULL) {
5130 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate namespace\n");
5131 : /* This just fails to attach the namespace. It may work on a future attempt. */
5132 0 : continue;
5133 : }
5134 :
5135 52 : nvme_ns->id = nsid;
5136 52 : nvme_ns->ctrlr = nvme_ctrlr;
5137 :
5138 52 : nvme_ns->bdev = NULL;
5139 :
5140 52 : if (ctx) {
5141 51 : ctx->populates_in_progress++;
5142 51 : }
5143 52 : nvme_ns->probe_ctx = ctx;
5144 :
5145 52 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5146 52 : RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
5147 52 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5148 :
5149 52 : nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
5150 52 : }
5151 :
5152 55 : nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
5153 : }
5154 :
5155 63 : if (ctx) {
5156 : /* Decrement this count now that the loop is over to account
5157 : * for the one we started with. If the count is then 0, we
5158 : * know any populate_namespace functions completed immediately,
5159 : * so we'll kick the callback here.
5160 : */
5161 47 : ctx->populates_in_progress--;
5162 47 : if (ctx->populates_in_progress == 0) {
5163 35 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
5164 35 : }
5165 47 : }
5166 :
5167 63 : }
5168 :
5169 : static void
5170 62 : nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
5171 : {
5172 : struct nvme_ns *nvme_ns, *tmp;
5173 :
5174 111 : RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
5175 49 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
5176 49 : }
5177 62 : }
5178 :
5179 : static uint32_t
5180 37 : nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr)
5181 : {
5182 37 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5183 : const struct spdk_nvme_ctrlr_data *cdata;
5184 37 : uint32_t nsid, ns_count = 0;
5185 :
5186 37 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5187 :
5188 82 : for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
5189 82 : nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
5190 45 : ns_count++;
5191 45 : }
5192 :
5193 74 : return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
5194 37 : sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count *
5195 : sizeof(uint32_t);
5196 : }
5197 :
5198 : static int
5199 7 : nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
5200 : void *cb_arg)
5201 : {
5202 7 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
5203 : struct nvme_ns *nvme_ns;
5204 : uint32_t i, nsid;
5205 :
5206 13 : for (i = 0; i < desc->num_of_nsid; i++) {
5207 6 : nsid = desc->nsid[i];
5208 6 : if (nsid == 0) {
5209 0 : continue;
5210 : }
5211 :
5212 6 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
5213 :
5214 6 : if (nvme_ns == NULL) {
5215 : /* Target told us that an inactive namespace had an ANA change */
5216 1 : continue;
5217 : }
5218 :
5219 5 : _nvme_ns_set_ana_state(nvme_ns, desc);
5220 5 : }
5221 :
5222 7 : return 0;
5223 : }
5224 :
5225 : static void
5226 0 : bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5227 : {
5228 : struct nvme_ns *nvme_ns;
5229 :
5230 0 : spdk_free(nvme_ctrlr->ana_log_page);
5231 0 : nvme_ctrlr->ana_log_page = NULL;
5232 :
5233 0 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
5234 0 : nvme_ns != NULL;
5235 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
5236 0 : nvme_ns->ana_state_updating = false;
5237 0 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
5238 0 : }
5239 0 : }
5240 :
5241 : static void
5242 3 : nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
5243 : {
5244 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
5245 :
5246 3 : if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
5247 6 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
5248 3 : nvme_ctrlr);
5249 3 : } else {
5250 0 : bdev_nvme_disable_read_ana_log_page(nvme_ctrlr);
5251 : }
5252 :
5253 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5254 :
5255 3 : assert(nvme_ctrlr->ana_log_page_updating == true);
5256 3 : nvme_ctrlr->ana_log_page_updating = false;
5257 :
5258 3 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
5259 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5260 :
5261 0 : nvme_ctrlr_unregister(nvme_ctrlr);
5262 0 : } else {
5263 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5264 :
5265 3 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
5266 : }
5267 3 : }
5268 :
5269 : static int
5270 6 : nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5271 : {
5272 : uint32_t ana_log_page_size;
5273 : int rc;
5274 :
5275 6 : if (nvme_ctrlr->ana_log_page == NULL) {
5276 0 : return -EINVAL;
5277 : }
5278 :
5279 6 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5280 :
5281 6 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5282 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
5283 : "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5284 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5285 0 : return -EINVAL;
5286 : }
5287 :
5288 6 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5289 6 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
5290 5 : nvme_ctrlr->ana_log_page_updating) {
5291 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5292 3 : return -EBUSY;
5293 : }
5294 :
5295 3 : nvme_ctrlr->ana_log_page_updating = true;
5296 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5297 :
5298 6 : rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
5299 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5300 : SPDK_NVME_GLOBAL_NS_TAG,
5301 3 : nvme_ctrlr->ana_log_page,
5302 3 : ana_log_page_size, 0,
5303 : nvme_ctrlr_read_ana_log_page_done,
5304 3 : nvme_ctrlr);
5305 3 : if (rc != 0) {
5306 0 : nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
5307 0 : }
5308 :
5309 3 : return rc;
5310 6 : }
5311 :
/* Event callback that intentionally ignores every bdev event. It is passed to
 * spdk_bdev_open_ext() by callers in this file that open a bdev only briefly.
 */
static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}
5316 :
/* State carried from bdev_nvme_set_preferred_path() to its completion
 * callback bdev_nvme_set_preferred_path_done().
 */
struct bdev_nvme_set_preferred_path_ctx {
	/* Descriptor opened by bdev_nvme_set_preferred_path(); closed on completion. */
	struct spdk_bdev_desc *desc;
	/* Namespace returned by bdev_nvme_set_preferred_ns(); matched against each channel's I/O paths. */
	struct nvme_ns *nvme_ns;
	/* Caller's completion callback and its argument. */
	bdev_nvme_set_preferred_path_cb cb_fn;
	void *cb_arg;
};
5323 :
5324 : static void
5325 3 : bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5326 : {
5327 3 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5328 :
5329 3 : assert(ctx != NULL);
5330 3 : assert(ctx->desc != NULL);
5331 3 : assert(ctx->cb_fn != NULL);
5332 :
5333 3 : spdk_bdev_close(ctx->desc);
5334 :
5335 3 : ctx->cb_fn(ctx->cb_arg, status);
5336 :
5337 3 : free(ctx);
5338 3 : }
5339 :
5340 : static void
5341 2 : _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i,
5342 : struct nvme_bdev *nbdev,
5343 : struct nvme_bdev_channel *nbdev_ch, void *_ctx)
5344 : {
5345 2 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5346 : struct nvme_io_path *io_path, *prev;
5347 :
5348 2 : prev = NULL;
5349 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5350 3 : if (io_path->nvme_ns == ctx->nvme_ns) {
5351 2 : break;
5352 : }
5353 1 : prev = io_path;
5354 1 : }
5355 :
5356 2 : if (io_path != NULL) {
5357 2 : if (prev != NULL) {
5358 1 : STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq);
5359 1 : STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq);
5360 1 : }
5361 :
5362 : /* We can set io_path to nbdev_ch->current_io_path directly here.
5363 : * However, it needs to be conditional. To simplify the code,
5364 : * just clear nbdev_ch->current_io_path and let find_io_path()
5365 : * fill it.
5366 : *
5367 : * Automatic failback may be disabled. Hence even if the io_path is
5368 : * already at the head, clear nbdev_ch->current_io_path.
5369 : */
5370 2 : bdev_nvme_clear_current_io_path(nbdev_ch);
5371 2 : }
5372 :
5373 2 : nvme_bdev_for_each_channel_continue(i, 0);
5374 2 : }
5375 :
5376 : static struct nvme_ns *
5377 3 : bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid)
5378 : {
5379 : struct nvme_ns *nvme_ns, *prev;
5380 : const struct spdk_nvme_ctrlr_data *cdata;
5381 :
5382 3 : prev = NULL;
5383 6 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
5384 6 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
5385 :
5386 6 : if (cdata->cntlid == cntlid) {
5387 3 : break;
5388 : }
5389 3 : prev = nvme_ns;
5390 3 : }
5391 :
5392 3 : if (nvme_ns != NULL && prev != NULL) {
5393 2 : TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq);
5394 2 : TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq);
5395 2 : }
5396 :
5397 3 : return nvme_ns;
5398 : }
5399 :
5400 : /* This function supports only multipath mode. There is only a single I/O path
5401 : * for each NVMe-oF controller. Hence, just move the matched I/O path to the
5402 : * head of the I/O path list for each NVMe bdev channel.
5403 : *
5404 : * NVMe bdev channel may be acquired after completing this function. move the
5405 : * matched namespace to the head of the namespace list for the NVMe bdev too.
5406 : */
5407 : void
5408 3 : bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
5409 : bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg)
5410 : {
5411 : struct bdev_nvme_set_preferred_path_ctx *ctx;
5412 : struct spdk_bdev *bdev;
5413 : struct nvme_bdev *nbdev;
5414 3 : int rc = 0;
5415 :
5416 3 : assert(cb_fn != NULL);
5417 :
5418 3 : ctx = calloc(1, sizeof(*ctx));
5419 3 : if (ctx == NULL) {
5420 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5421 0 : rc = -ENOMEM;
5422 0 : goto err_alloc;
5423 : }
5424 :
5425 3 : ctx->cb_fn = cb_fn;
5426 3 : ctx->cb_arg = cb_arg;
5427 :
5428 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5429 3 : if (rc != 0) {
5430 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5431 0 : goto err_open;
5432 : }
5433 :
5434 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5435 :
5436 3 : if (bdev->module != &nvme_if) {
5437 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5438 0 : rc = -ENODEV;
5439 0 : goto err_bdev;
5440 : }
5441 :
5442 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5443 :
5444 3 : pthread_mutex_lock(&nbdev->mutex);
5445 :
5446 3 : ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid);
5447 3 : if (ctx->nvme_ns == NULL) {
5448 0 : pthread_mutex_unlock(&nbdev->mutex);
5449 :
5450 0 : SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid);
5451 0 : rc = -ENODEV;
5452 0 : goto err_bdev;
5453 : }
5454 :
5455 3 : pthread_mutex_unlock(&nbdev->mutex);
5456 :
5457 6 : nvme_bdev_for_each_channel(nbdev,
5458 : _bdev_nvme_set_preferred_path,
5459 3 : ctx,
5460 : bdev_nvme_set_preferred_path_done);
5461 3 : return;
5462 :
5463 : err_bdev:
5464 0 : spdk_bdev_close(ctx->desc);
5465 : err_open:
5466 0 : free(ctx);
5467 : err_alloc:
5468 0 : cb_fn(cb_arg, rc);
5469 3 : }
5470 :
/* State carried from spdk_bdev_nvme_set_multipath_policy() to its completion
 * callback bdev_nvme_set_multipath_policy_done().
 */
struct bdev_nvme_set_multipath_policy_ctx {
	/* Descriptor opened by spdk_bdev_nvme_set_multipath_policy(); closed on completion. */
	struct spdk_bdev_desc *desc;
	/* Caller's completion callback and its argument. */
	spdk_bdev_nvme_set_multipath_policy_cb cb_fn;
	void *cb_arg;
};
5476 :
5477 : static void
5478 3 : bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5479 : {
5480 3 : struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx;
5481 :
5482 3 : assert(ctx != NULL);
5483 3 : assert(ctx->desc != NULL);
5484 3 : assert(ctx->cb_fn != NULL);
5485 :
5486 3 : spdk_bdev_close(ctx->desc);
5487 :
5488 3 : ctx->cb_fn(ctx->cb_arg, status);
5489 :
5490 3 : free(ctx);
5491 3 : }
5492 :
5493 : static void
5494 1 : _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i,
5495 : struct nvme_bdev *nbdev,
5496 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
5497 : {
5498 1 : nbdev_ch->mp_policy = nbdev->mp_policy;
5499 1 : nbdev_ch->mp_selector = nbdev->mp_selector;
5500 1 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
5501 1 : bdev_nvme_clear_current_io_path(nbdev_ch);
5502 :
5503 1 : nvme_bdev_for_each_channel_continue(i, 0);
5504 1 : }
5505 :
5506 : void
5507 3 : spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy,
5508 : enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io,
5509 : spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
5510 : {
5511 : struct bdev_nvme_set_multipath_policy_ctx *ctx;
5512 : struct spdk_bdev *bdev;
5513 : struct nvme_bdev *nbdev;
5514 : int rc;
5515 :
5516 3 : assert(cb_fn != NULL);
5517 :
5518 3 : switch (policy) {
5519 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
5520 1 : break;
5521 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
5522 2 : switch (selector) {
5523 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
5524 1 : if (rr_min_io == UINT32_MAX) {
5525 0 : rr_min_io = 1;
5526 1 : } else if (rr_min_io == 0) {
5527 0 : rc = -EINVAL;
5528 0 : goto exit;
5529 : }
5530 1 : break;
5531 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
5532 1 : break;
5533 : default:
5534 0 : rc = -EINVAL;
5535 0 : goto exit;
5536 : }
5537 2 : break;
5538 : default:
5539 0 : rc = -EINVAL;
5540 0 : goto exit;
5541 : }
5542 :
5543 3 : ctx = calloc(1, sizeof(*ctx));
5544 3 : if (ctx == NULL) {
5545 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5546 0 : rc = -ENOMEM;
5547 0 : goto exit;
5548 : }
5549 :
5550 3 : ctx->cb_fn = cb_fn;
5551 3 : ctx->cb_arg = cb_arg;
5552 :
5553 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5554 3 : if (rc != 0) {
5555 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5556 0 : rc = -ENODEV;
5557 0 : goto err_open;
5558 : }
5559 :
5560 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5561 3 : if (bdev->module != &nvme_if) {
5562 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5563 0 : rc = -ENODEV;
5564 0 : goto err_module;
5565 : }
5566 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5567 :
5568 3 : pthread_mutex_lock(&nbdev->mutex);
5569 3 : nbdev->mp_policy = policy;
5570 3 : nbdev->mp_selector = selector;
5571 3 : nbdev->rr_min_io = rr_min_io;
5572 3 : pthread_mutex_unlock(&nbdev->mutex);
5573 :
5574 6 : nvme_bdev_for_each_channel(nbdev,
5575 : _bdev_nvme_set_multipath_policy,
5576 3 : ctx,
5577 : bdev_nvme_set_multipath_policy_done);
5578 3 : return;
5579 :
5580 : err_module:
5581 0 : spdk_bdev_close(ctx->desc);
5582 : err_open:
5583 0 : free(ctx);
5584 : exit:
5585 0 : cb_fn(cb_arg, rc);
5586 3 : }
5587 :
5588 : static void
5589 3 : aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
5590 : {
5591 3 : struct nvme_ctrlr *nvme_ctrlr = arg;
5592 : union spdk_nvme_async_event_completion event;
5593 :
5594 3 : if (spdk_nvme_cpl_is_error(cpl)) {
5595 0 : SPDK_WARNLOG("AER request execute failed\n");
5596 0 : return;
5597 : }
5598 :
5599 3 : event.raw = cpl->cdw0;
5600 3 : if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5601 3 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
5602 2 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
5603 3 : } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5604 1 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
5605 1 : nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
5606 1 : }
5607 3 : }
5608 :
5609 : static void
5610 53 : free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx)
5611 : {
5612 53 : spdk_keyring_put_key(ctx->drv_opts.tls_psk);
5613 53 : spdk_keyring_put_key(ctx->drv_opts.dhchap_key);
5614 53 : spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key);
5615 53 : free(ctx->base_name);
5616 53 : free(ctx);
5617 53 : }
5618 :
5619 : static void
5620 53 : populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc)
5621 : {
5622 53 : if (ctx->cb_fn) {
5623 53 : ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc);
5624 53 : }
5625 :
5626 53 : ctx->namespaces_populated = true;
5627 53 : if (ctx->probe_done) {
5628 : /* The probe was already completed, so we need to free the context
5629 : * here. This can happen for cases like OCSSD, where we need to
5630 : * send additional commands to the SSD after attach.
5631 : */
5632 32 : free_nvme_async_probe_ctx(ctx);
5633 32 : }
5634 53 : }
5635 :
5636 : static int
5637 20 : bdev_nvme_remove_poller(void *ctx)
5638 : {
5639 : struct spdk_nvme_transport_id trid_pcie;
5640 :
5641 20 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
5642 1 : spdk_poller_unregister(&g_hotplug_poller);
5643 1 : return SPDK_POLLER_IDLE;
5644 : }
5645 :
5646 19 : memset(&trid_pcie, 0, sizeof(trid_pcie));
5647 19 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
5648 :
5649 19 : if (spdk_nvme_scan_attached(&trid_pcie)) {
5650 0 : SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n");
5651 0 : }
5652 :
5653 19 : return SPDK_POLLER_BUSY;
5654 20 : }
5655 :
5656 : static void
5657 61 : nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
5658 : struct nvme_async_probe_ctx *ctx)
5659 : {
5660 61 : struct spdk_nvme_transport_id *trid = &nvme_ctrlr->active_path_id->trid;
5661 :
5662 61 : if (spdk_nvme_trtype_is_fabrics(trid->trtype)) {
5663 61 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created to %s:%s\n",
5664 : trid->traddr, trid->trsvcid);
5665 61 : } else {
5666 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created\n");
5667 : }
5668 :
5669 122 : spdk_io_device_register(nvme_ctrlr,
5670 : bdev_nvme_create_ctrlr_channel_cb,
5671 : bdev_nvme_destroy_ctrlr_channel_cb,
5672 : sizeof(struct nvme_ctrlr_channel),
5673 61 : nvme_ctrlr->nbdev_ctrlr->name);
5674 :
5675 61 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
5676 :
5677 61 : if (g_hotplug_poller == NULL) {
5678 2 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
5679 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
5680 2 : }
5681 61 : }
5682 :
5683 : static void
5684 31 : nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
5685 : {
5686 31 : struct nvme_ctrlr *nvme_ctrlr = _ctx;
5687 31 : struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
5688 :
5689 31 : nvme_ctrlr->probe_ctx = NULL;
5690 :
5691 31 : if (spdk_nvme_cpl_is_error(cpl)) {
5692 0 : nvme_ctrlr_delete(nvme_ctrlr);
5693 :
5694 0 : if (ctx != NULL) {
5695 0 : ctx->reported_bdevs = 0;
5696 0 : populate_namespaces_cb(ctx, -1);
5697 0 : }
5698 0 : return;
5699 : }
5700 :
5701 31 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
5702 31 : }
5703 :
5704 : static int
5705 31 : nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
5706 : struct nvme_async_probe_ctx *ctx)
5707 : {
5708 31 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5709 : const struct spdk_nvme_ctrlr_data *cdata;
5710 : uint32_t ana_log_page_size;
5711 :
5712 31 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5713 :
5714 : /* Set buffer size enough to include maximum number of allowed namespaces. */
5715 62 : ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
5716 31 : sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan *
5717 : sizeof(uint32_t);
5718 :
5719 31 : nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
5720 : SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
5721 31 : if (nvme_ctrlr->ana_log_page == NULL) {
5722 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate ANA log page buffer\n");
5723 0 : return -ENXIO;
5724 : }
5725 :
5726 : /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned.
5727 : * Hence copy each descriptor to a temporary area when parsing it.
5728 : *
5729 : * Allocate a buffer whose size is as large as ANA log page buffer because
5730 : * we do not know the size of a descriptor until actually reading it.
5731 : */
5732 31 : nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
5733 31 : if (nvme_ctrlr->copied_ana_desc == NULL) {
5734 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate a buffer to parse ANA descriptor\n");
5735 0 : return -ENOMEM;
5736 : }
5737 :
5738 31 : nvme_ctrlr->max_ana_log_page_size = ana_log_page_size;
5739 :
5740 31 : nvme_ctrlr->probe_ctx = ctx;
5741 :
5742 : /* Then, set the read size only to include the current active namespaces. */
5743 31 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5744 :
5745 31 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5746 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5747 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5748 0 : return -EINVAL;
5749 : }
5750 :
5751 62 : return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
5752 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5753 : SPDK_NVME_GLOBAL_NS_TAG,
5754 31 : nvme_ctrlr->ana_log_page,
5755 31 : ana_log_page_size, 0,
5756 : nvme_ctrlr_init_ana_log_page_done,
5757 31 : nvme_ctrlr);
5758 31 : }
5759 :
5760 : /* hostnqn and subnqn were already verified before attaching a controller.
5761 : * Hence check only the multipath capability and cntlid here.
5762 : */
5763 : static bool
5764 16 : bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
5765 : {
5766 : struct nvme_ctrlr *tmp;
5767 : const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
5768 :
5769 16 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5770 :
5771 16 : if (!cdata->cmic.multi_ctrlr) {
5772 0 : SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
5773 0 : return false;
5774 : }
5775 :
5776 33 : TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
5777 18 : tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
5778 :
5779 18 : if (!tmp_cdata->cmic.multi_ctrlr) {
5780 0 : NVME_CTRLR_ERRLOG(tmp, "Ctrlr%u does not support multipath.\n", cdata->cntlid);
5781 0 : return false;
5782 : }
5783 18 : if (cdata->cntlid == tmp_cdata->cntlid) {
5784 1 : NVME_CTRLR_ERRLOG(tmp, "cntlid %u are duplicated.\n", tmp_cdata->cntlid);
5785 1 : return false;
5786 : }
5787 17 : }
5788 :
5789 15 : return true;
5790 16 : }
5791 :
5792 :
5793 : static int
5794 62 : nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
5795 : {
5796 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
5797 62 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5798 : struct nvme_ctrlr *nctrlr;
5799 62 : int rc = 0;
5800 :
5801 62 : pthread_mutex_lock(&g_bdev_nvme_mutex);
5802 :
5803 62 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
5804 62 : if (nbdev_ctrlr != NULL) {
5805 16 : if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
5806 1 : rc = -EINVAL;
5807 1 : goto exit;
5808 : }
5809 32 : TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
5810 17 : if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) {
5811 : /* All controllers with the same name must be configured the same
5812 : * way, either for multipath or failover. If the configuration doesn't
5813 : * match - report error.
5814 : */
5815 0 : rc = -EINVAL;
5816 0 : goto exit;
5817 : }
5818 17 : }
5819 15 : } else {
5820 46 : nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
5821 46 : if (nbdev_ctrlr == NULL) {
5822 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_bdev_ctrlr.\n");
5823 0 : rc = -ENOMEM;
5824 0 : goto exit;
5825 : }
5826 46 : nbdev_ctrlr->name = strdup(name);
5827 46 : if (nbdev_ctrlr->name == NULL) {
5828 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate name of nvme_bdev_ctrlr.\n");
5829 0 : free(nbdev_ctrlr);
5830 0 : goto exit;
5831 : }
5832 46 : TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
5833 46 : TAILQ_INIT(&nbdev_ctrlr->bdevs);
5834 46 : TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
5835 : }
5836 61 : nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
5837 61 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
5838 : exit:
5839 62 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
5840 62 : return rc;
5841 : }
5842 :
5843 : static int
5844 62 : nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
5845 : const char *name,
5846 : const struct spdk_nvme_transport_id *trid,
5847 : struct nvme_async_probe_ctx *ctx)
5848 : {
5849 : struct nvme_ctrlr *nvme_ctrlr;
5850 : struct nvme_path_id *path_id;
5851 : const struct spdk_nvme_ctrlr_data *cdata;
5852 62 : struct spdk_event_handler_opts opts = {
5853 : .opts_size = SPDK_SIZEOF(&opts, fd_type),
5854 : };
5855 : uint64_t period;
5856 : int fd, rc;
5857 :
5858 62 : nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
5859 62 : if (nvme_ctrlr == NULL) {
5860 0 : SPDK_ERRLOG("Failed to allocate device struct\n");
5861 0 : return -ENOMEM;
5862 : }
5863 :
5864 62 : rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
5865 62 : if (rc != 0) {
5866 0 : free(nvme_ctrlr);
5867 0 : return rc;
5868 : }
5869 :
5870 62 : TAILQ_INIT(&nvme_ctrlr->trids);
5871 62 : TAILQ_INIT(&nvme_ctrlr->pending_resets);
5872 62 : RB_INIT(&nvme_ctrlr->namespaces);
5873 :
5874 : /* Get another reference to the key, so the first one can be released from probe_ctx */
5875 62 : if (ctx != NULL) {
5876 48 : if (ctx->drv_opts.tls_psk != NULL) {
5877 0 : nvme_ctrlr->psk = spdk_keyring_get_key(
5878 0 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5879 0 : if (nvme_ctrlr->psk == NULL) {
5880 : /* Could only happen if the key was removed in the meantime */
5881 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5882 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5883 0 : rc = -ENOKEY;
5884 0 : goto err;
5885 : }
5886 0 : }
5887 :
5888 48 : if (ctx->drv_opts.dhchap_key != NULL) {
5889 0 : nvme_ctrlr->dhchap_key = spdk_keyring_get_key(
5890 0 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5891 0 : if (nvme_ctrlr->dhchap_key == NULL) {
5892 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5893 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5894 0 : rc = -ENOKEY;
5895 0 : goto err;
5896 : }
5897 0 : }
5898 :
5899 48 : if (ctx->drv_opts.dhchap_ctrlr_key != NULL) {
5900 0 : nvme_ctrlr->dhchap_ctrlr_key =
5901 0 : spdk_keyring_get_key(
5902 0 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5903 0 : if (nvme_ctrlr->dhchap_ctrlr_key == NULL) {
5904 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5905 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5906 0 : rc = -ENOKEY;
5907 0 : goto err;
5908 : }
5909 0 : }
5910 48 : }
5911 :
5912 : /* Check if we manage to enable interrupts on the controller. */
5913 62 : if (spdk_interrupt_mode_is_enabled() && ctx != NULL && !ctx->drv_opts.enable_interrupts) {
5914 0 : SPDK_ERRLOG("Failed to enable interrupts on the controller\n");
5915 0 : rc = -ENOTSUP;
5916 0 : goto err;
5917 : }
5918 :
5919 62 : path_id = calloc(1, sizeof(*path_id));
5920 62 : if (path_id == NULL) {
5921 0 : SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
5922 0 : rc = -ENOMEM;
5923 0 : goto err;
5924 : }
5925 :
5926 62 : path_id->trid = *trid;
5927 62 : if (ctx != NULL) {
5928 48 : memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
5929 48 : memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
5930 48 : }
5931 62 : nvme_ctrlr->active_path_id = path_id;
5932 62 : TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
5933 :
5934 62 : nvme_ctrlr->thread = spdk_get_thread();
5935 62 : nvme_ctrlr->ctrlr = ctrlr;
5936 62 : nvme_ctrlr->ref = 1;
5937 :
5938 62 : if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
5939 0 : SPDK_ERRLOG("OCSSDs are not supported");
5940 0 : rc = -ENOTSUP;
5941 0 : goto err;
5942 : }
5943 :
5944 62 : if (ctx != NULL) {
5945 48 : memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts));
5946 48 : } else {
5947 14 : spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts);
5948 : }
5949 :
5950 62 : period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_adminq_poll_period_us;
5951 :
5952 62 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
5953 : period);
5954 :
5955 62 : if (spdk_interrupt_mode_is_enabled()) {
5956 0 : spdk_poller_register_interrupt(nvme_ctrlr->adminq_timer_poller, NULL, NULL);
5957 :
5958 0 : fd = spdk_nvme_ctrlr_get_admin_qp_fd(nvme_ctrlr->ctrlr, &opts);
5959 0 : if (fd < 0) {
5960 0 : rc = fd;
5961 0 : goto err;
5962 : }
5963 :
5964 0 : nvme_ctrlr->intr = SPDK_INTERRUPT_REGISTER_EXT(fd, bdev_nvme_poll_adminq,
5965 : nvme_ctrlr, &opts);
5966 0 : if (!nvme_ctrlr->intr) {
5967 0 : rc = -EINVAL;
5968 0 : goto err;
5969 : }
5970 0 : }
5971 :
5972 62 : if (g_opts.timeout_us > 0) {
5973 : /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
5974 : /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
5975 0 : uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
5976 0 : g_opts.timeout_us : g_opts.timeout_admin_us;
5977 0 : spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
5978 0 : adm_timeout_us, timeout_cb, nvme_ctrlr);
5979 0 : }
5980 :
5981 62 : spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
5982 62 : spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
5983 :
5984 62 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
5985 : SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
5986 0 : nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
5987 0 : }
5988 :
5989 62 : rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
5990 62 : if (rc != 0) {
5991 1 : goto err;
5992 : }
5993 :
5994 61 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5995 :
5996 61 : if (cdata->cmic.ana_reporting) {
5997 31 : rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
5998 31 : if (rc == 0) {
5999 31 : return 0;
6000 : }
6001 0 : } else {
6002 30 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
6003 30 : return 0;
6004 : }
6005 :
6006 : err:
6007 1 : nvme_ctrlr_delete(nvme_ctrlr);
6008 1 : return rc;
6009 62 : }
6010 :
6011 : void
6012 34 : spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts)
6013 : {
6014 34 : opts->prchk_flags = 0;
6015 34 : opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec;
6016 34 : opts->reconnect_delay_sec = g_opts.reconnect_delay_sec;
6017 34 : opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec;
6018 34 : opts->multipath = true;
6019 34 : }
6020 :
6021 : static void
6022 0 : attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6023 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts)
6024 : {
6025 : char *name;
6026 :
6027 0 : name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
6028 0 : if (!name) {
6029 0 : SPDK_ERRLOG("Failed to assign name to NVMe device\n");
6030 0 : return;
6031 : }
6032 :
6033 0 : if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) {
6034 0 : SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
6035 0 : } else {
6036 0 : SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name);
6037 : }
6038 :
6039 0 : free(name);
6040 0 : }
6041 :
/*
 * Destruct step for an nvme_ctrlr: depopulate its namespaces, then drop one
 * reference. `ctx` is the struct nvme_ctrlr to destruct.
 */
static void
_nvme_ctrlr_destruct(void *ctx)
{
	struct nvme_ctrlr *doomed = ctx;

	nvme_ctrlr_depopulate_namespaces(doomed);

	nvme_ctrlr_put_ref(doomed);
}
6050 :
6051 : static int
6052 58 : bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
6053 : {
6054 : struct nvme_probe_skip_entry *entry;
6055 :
6056 : /* The controller's destruction was already started */
6057 58 : if (nvme_ctrlr->destruct) {
6058 0 : return -EALREADY;
6059 : }
6060 :
6061 58 : if (!hotplug &&
6062 58 : nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
6063 0 : entry = calloc(1, sizeof(*entry));
6064 0 : if (!entry) {
6065 0 : return -ENOMEM;
6066 : }
6067 0 : entry->trid = nvme_ctrlr->active_path_id->trid;
6068 0 : TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
6069 0 : }
6070 :
6071 58 : nvme_ctrlr->destruct = true;
6072 58 : return 0;
6073 58 : }
6074 :
6075 : static int
6076 2 : bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
6077 : {
6078 : int rc;
6079 :
6080 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6081 2 : rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug);
6082 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6083 :
6084 2 : if (rc == 0) {
6085 2 : _nvme_ctrlr_destruct(nvme_ctrlr);
6086 2 : } else if (rc == -EALREADY) {
6087 0 : rc = 0;
6088 0 : }
6089 :
6090 2 : return rc;
6091 : }
6092 :
/*
 * Removal callback registered through spdk_nvme_ctrlr_set_remove_cb().
 * `cb_ctx` is the struct nvme_ctrlr; it is deleted with hotplug set to true.
 * The result of the deletion is not checked.
 */
static void
remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *removed = cb_ctx;

	bdev_nvme_delete_ctrlr(removed, true);
}
6100 :
6101 : static int
6102 0 : bdev_nvme_hotplug_probe(void *arg)
6103 : {
6104 0 : if (g_hotplug_probe_ctx == NULL) {
6105 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
6106 0 : return SPDK_POLLER_IDLE;
6107 : }
6108 :
6109 0 : if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
6110 0 : g_hotplug_probe_ctx = NULL;
6111 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
6112 0 : }
6113 :
6114 0 : return SPDK_POLLER_BUSY;
6115 0 : }
6116 :
6117 : static int
6118 0 : bdev_nvme_hotplug(void *arg)
6119 : {
6120 : struct spdk_nvme_transport_id trid_pcie;
6121 :
6122 0 : if (g_hotplug_probe_ctx) {
6123 0 : return SPDK_POLLER_BUSY;
6124 : }
6125 :
6126 0 : memset(&trid_pcie, 0, sizeof(trid_pcie));
6127 0 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
6128 :
6129 0 : g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
6130 : hotplug_probe_cb, attach_cb, NULL);
6131 :
6132 0 : if (g_hotplug_probe_ctx) {
6133 0 : assert(g_hotplug_probe_poller == NULL);
6134 0 : g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
6135 0 : }
6136 :
6137 0 : return SPDK_POLLER_BUSY;
6138 0 : }
6139 :
6140 : void
6141 0 : spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts, size_t opts_size)
6142 : {
6143 0 : if (!opts) {
6144 0 : SPDK_ERRLOG("opts should not be NULL\n");
6145 0 : return;
6146 : }
6147 :
6148 0 : if (!opts_size) {
6149 0 : SPDK_ERRLOG("opts_size should not be zero value\n");
6150 0 : return;
6151 : }
6152 :
6153 0 : opts->opts_size = opts_size;
6154 :
6155 : #define SET_FIELD(field, defval) \
6156 : opts->field = SPDK_GET_FIELD(&g_opts, field, defval, opts_size); \
6157 :
6158 0 : SET_FIELD(action_on_timeout, 0);
6159 0 : SET_FIELD(keep_alive_timeout_ms, 0);
6160 0 : SET_FIELD(timeout_us, 0);
6161 0 : SET_FIELD(timeout_admin_us, 0);
6162 0 : SET_FIELD(transport_retry_count, 0);
6163 0 : SET_FIELD(arbitration_burst, 0);
6164 0 : SET_FIELD(low_priority_weight, 0);
6165 0 : SET_FIELD(medium_priority_weight, 0);
6166 0 : SET_FIELD(high_priority_weight, 0);
6167 0 : SET_FIELD(io_queue_requests, 0);
6168 0 : SET_FIELD(nvme_adminq_poll_period_us, 0);
6169 0 : SET_FIELD(nvme_ioq_poll_period_us, 0);
6170 0 : SET_FIELD(delay_cmd_submit, 0);
6171 0 : SET_FIELD(bdev_retry_count, 0);
6172 0 : SET_FIELD(ctrlr_loss_timeout_sec, 0);
6173 0 : SET_FIELD(reconnect_delay_sec, 0);
6174 0 : SET_FIELD(fast_io_fail_timeout_sec, 0);
6175 0 : SET_FIELD(transport_ack_timeout, 0);
6176 0 : SET_FIELD(disable_auto_failback, false);
6177 0 : SET_FIELD(generate_uuids, false);
6178 0 : SET_FIELD(transport_tos, 0);
6179 0 : SET_FIELD(nvme_error_stat, false);
6180 0 : SET_FIELD(io_path_stat, false);
6181 0 : SET_FIELD(allow_accel_sequence, false);
6182 0 : SET_FIELD(rdma_srq_size, 0);
6183 0 : SET_FIELD(rdma_max_cq_size, 0);
6184 0 : SET_FIELD(rdma_cm_event_timeout_ms, 0);
6185 0 : SET_FIELD(dhchap_digests, 0);
6186 0 : SET_FIELD(dhchap_dhgroups, 0);
6187 0 : SET_FIELD(rdma_umr_per_io, false);
6188 :
6189 : #undef SET_FIELD
6190 :
6191 : /* Do not remove this statement, you should always update this statement when you adding a new field,
6192 : * and do not forget to add the SET_FIELD statement for your added field. */
6193 : SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 128, "Incorrect size");
6194 0 : }
6195 :
6196 : static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
6197 : uint32_t reconnect_delay_sec,
6198 : uint32_t fast_io_fail_timeout_sec);
6199 :
6200 : static int
6201 0 : bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
6202 : {
6203 0 : if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
6204 : /* Can't set timeout_admin_us without also setting timeout_us */
6205 0 : SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
6206 0 : return -EINVAL;
6207 : }
6208 :
6209 0 : if (opts->bdev_retry_count < -1) {
6210 0 : SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
6211 0 : return -EINVAL;
6212 : }
6213 :
6214 0 : if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec,
6215 0 : opts->reconnect_delay_sec,
6216 0 : opts->fast_io_fail_timeout_sec)) {
6217 0 : return -EINVAL;
6218 : }
6219 :
6220 0 : return 0;
6221 0 : }
6222 :
6223 : int
6224 0 : spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
6225 : {
6226 : struct spdk_nvme_transport_opts drv_opts;
6227 : int ret;
6228 :
6229 0 : if (!opts) {
6230 0 : SPDK_ERRLOG("opts cannot be NULL\n");
6231 0 : return -1;
6232 : }
6233 :
6234 0 : if (!opts->opts_size) {
6235 0 : SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
6236 0 : return -1;
6237 : }
6238 :
6239 0 : ret = bdev_nvme_validate_opts(opts);
6240 0 : if (ret) {
6241 0 : SPDK_WARNLOG("Failed to set nvme opts.\n");
6242 0 : return ret;
6243 : }
6244 :
6245 0 : if (g_bdev_nvme_init_thread != NULL) {
6246 0 : if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
6247 0 : return -EPERM;
6248 : }
6249 0 : }
6250 :
6251 0 : spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts));
6252 0 : if (opts->rdma_srq_size != 0) {
6253 0 : drv_opts.rdma_srq_size = opts->rdma_srq_size;
6254 0 : }
6255 0 : if (opts->rdma_max_cq_size != 0) {
6256 0 : drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size;
6257 0 : }
6258 0 : if (opts->rdma_cm_event_timeout_ms != 0) {
6259 0 : drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms;
6260 0 : }
6261 0 : if (drv_opts.rdma_umr_per_io != opts->rdma_umr_per_io) {
6262 0 : drv_opts.rdma_umr_per_io = opts->rdma_umr_per_io;
6263 0 : }
6264 0 : ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts));
6265 0 : if (ret) {
6266 0 : SPDK_ERRLOG("Failed to set NVMe transport opts.\n");
6267 0 : return ret;
6268 : }
6269 :
6270 : #define SET_FIELD(field, defval) \
6271 : g_opts.field = SPDK_GET_FIELD(opts, field, defval, opts->opts_size); \
6272 :
6273 0 : SET_FIELD(action_on_timeout, 0);
6274 0 : SET_FIELD(keep_alive_timeout_ms, 0);
6275 0 : SET_FIELD(timeout_us, 0);
6276 0 : SET_FIELD(timeout_admin_us, 0);
6277 0 : SET_FIELD(transport_retry_count, 0);
6278 0 : SET_FIELD(arbitration_burst, 0);
6279 0 : SET_FIELD(low_priority_weight, 0);
6280 0 : SET_FIELD(medium_priority_weight, 0);
6281 0 : SET_FIELD(high_priority_weight, 0);
6282 0 : SET_FIELD(io_queue_requests, 0);
6283 0 : SET_FIELD(nvme_adminq_poll_period_us, 0);
6284 0 : SET_FIELD(nvme_ioq_poll_period_us, 0);
6285 0 : SET_FIELD(delay_cmd_submit, 0);
6286 0 : SET_FIELD(bdev_retry_count, 0);
6287 0 : SET_FIELD(ctrlr_loss_timeout_sec, 0);
6288 0 : SET_FIELD(reconnect_delay_sec, 0);
6289 0 : SET_FIELD(fast_io_fail_timeout_sec, 0);
6290 0 : SET_FIELD(transport_ack_timeout, 0);
6291 0 : SET_FIELD(disable_auto_failback, false);
6292 0 : SET_FIELD(generate_uuids, false);
6293 0 : SET_FIELD(transport_tos, 0);
6294 0 : SET_FIELD(nvme_error_stat, false);
6295 0 : SET_FIELD(io_path_stat, false);
6296 0 : SET_FIELD(allow_accel_sequence, false);
6297 0 : SET_FIELD(rdma_srq_size, 0);
6298 0 : SET_FIELD(rdma_max_cq_size, 0);
6299 0 : SET_FIELD(rdma_cm_event_timeout_ms, 0);
6300 0 : SET_FIELD(dhchap_digests, 0);
6301 0 : SET_FIELD(dhchap_dhgroups, 0);
6302 :
6303 0 : g_opts.opts_size = opts->opts_size;
6304 :
6305 : #undef SET_FIELD
6306 :
6307 0 : return 0;
6308 0 : }
6309 :
6310 : struct set_nvme_hotplug_ctx {
6311 : uint64_t period_us;
6312 : bool enabled;
6313 : spdk_msg_fn fn;
6314 : void *fn_ctx;
6315 : };
6316 :
6317 : static void
6318 0 : set_nvme_hotplug_period_cb(void *_ctx)
6319 : {
6320 0 : struct set_nvme_hotplug_ctx *ctx = _ctx;
6321 :
6322 0 : spdk_poller_unregister(&g_hotplug_poller);
6323 0 : if (ctx->enabled) {
6324 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
6325 0 : } else {
6326 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
6327 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
6328 : }
6329 :
6330 0 : g_nvme_hotplug_poll_period_us = ctx->period_us;
6331 0 : g_nvme_hotplug_enabled = ctx->enabled;
6332 0 : if (ctx->fn) {
6333 0 : ctx->fn(ctx->fn_ctx);
6334 0 : }
6335 :
6336 0 : free(ctx);
6337 0 : }
6338 :
6339 : int
6340 0 : bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
6341 : {
6342 : struct set_nvme_hotplug_ctx *ctx;
6343 :
6344 0 : if (enabled == true && !spdk_process_is_primary()) {
6345 0 : return -EPERM;
6346 : }
6347 :
6348 0 : ctx = calloc(1, sizeof(*ctx));
6349 0 : if (ctx == NULL) {
6350 0 : return -ENOMEM;
6351 : }
6352 :
6353 0 : period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
6354 0 : ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
6355 0 : ctx->enabled = enabled;
6356 0 : ctx->fn = cb;
6357 0 : ctx->fn_ctx = cb_ctx;
6358 :
6359 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
6360 0 : return 0;
6361 0 : }
6362 :
6363 : static void
6364 47 : nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
6365 : struct nvme_async_probe_ctx *ctx)
6366 : {
6367 : struct nvme_ns *nvme_ns;
6368 : struct nvme_bdev *nvme_bdev;
6369 : size_t j;
6370 :
6371 47 : assert(nvme_ctrlr != NULL);
6372 :
6373 47 : if (ctx->names == NULL) {
6374 0 : ctx->reported_bdevs = 0;
6375 0 : populate_namespaces_cb(ctx, 0);
6376 0 : return;
6377 : }
6378 :
6379 : /*
6380 : * Report the new bdevs that were created in this call.
6381 : * There can be more than one bdev per NVMe controller.
6382 : */
6383 47 : j = 0;
6384 :
6385 47 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6386 :
6387 47 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6388 96 : while (nvme_ns != NULL) {
6389 49 : nvme_bdev = nvme_ns->bdev;
6390 49 : if (j < ctx->max_bdevs) {
6391 49 : ctx->names[j] = nvme_bdev->disk.name;
6392 49 : j++;
6393 49 : } else {
6394 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6395 :
6396 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
6397 : "Maximum number of namespaces supported per NVMe controller is %du. "
6398 : "Unable to return all names of created bdevs\n",
6399 : ctx->max_bdevs);
6400 0 : ctx->reported_bdevs = 0;
6401 0 : populate_namespaces_cb(ctx, -ERANGE);
6402 0 : return;
6403 : }
6404 :
6405 49 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6406 : }
6407 :
6408 47 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6409 :
6410 47 : ctx->reported_bdevs = j;
6411 47 : populate_namespaces_cb(ctx, 0);
6412 47 : }
6413 :
6414 : static int
6415 9 : bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6416 : struct spdk_nvme_ctrlr *new_ctrlr,
6417 : struct spdk_nvme_transport_id *trid)
6418 : {
6419 : struct nvme_path_id *tmp_trid;
6420 :
6421 9 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6422 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "PCIe failover is not supported.\n");
6423 0 : return -ENOTSUP;
6424 : }
6425 :
6426 : /* Currently we only support failover to the same transport type. */
6427 9 : if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
6428 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr,
6429 : "Failover from trtype: %s to a different trtype: %s is not supported currently\n",
6430 : spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
6431 : spdk_nvme_transport_id_trtype_str(trid->trtype));
6432 0 : return -EINVAL;
6433 : }
6434 :
6435 :
6436 : /* Currently we only support failover to the same NQN. */
6437 9 : if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
6438 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr,
6439 : "Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
6440 : nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
6441 0 : return -EINVAL;
6442 : }
6443 :
6444 : /* Skip all the other checks if we've already registered this path. */
6445 21 : TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
6446 12 : if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
6447 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "This path (traddr: %s subnqn: %s) is already registered\n",
6448 : trid->traddr, trid->subnqn);
6449 0 : return -EALREADY;
6450 : }
6451 12 : }
6452 :
6453 9 : return 0;
6454 9 : }
6455 :
6456 : static int
6457 9 : bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
6458 : struct spdk_nvme_ctrlr *new_ctrlr)
6459 : {
6460 : struct nvme_ns *nvme_ns;
6461 : struct spdk_nvme_ns *new_ns;
6462 :
6463 9 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6464 9 : while (nvme_ns != NULL) {
6465 0 : new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
6466 0 : assert(new_ns != NULL);
6467 :
6468 0 : if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
6469 0 : return -EINVAL;
6470 : }
6471 :
6472 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6473 : }
6474 :
6475 9 : return 0;
6476 9 : }
6477 :
6478 : static int
6479 9 : _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6480 : struct spdk_nvme_transport_id *trid)
6481 : {
6482 : struct nvme_path_id *active_id, *new_trid, *tmp_trid;
6483 :
6484 9 : new_trid = calloc(1, sizeof(*new_trid));
6485 9 : if (new_trid == NULL) {
6486 0 : return -ENOMEM;
6487 : }
6488 9 : new_trid->trid = *trid;
6489 :
6490 9 : active_id = nvme_ctrlr->active_path_id;
6491 9 : assert(active_id != NULL);
6492 9 : assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));
6493 :
6494 : /* Skip the active trid not to replace it until it is failed. */
6495 9 : tmp_trid = TAILQ_NEXT(active_id, link);
6496 9 : if (tmp_trid == NULL) {
6497 6 : goto add_tail;
6498 : }
6499 :
6500 : /* It means the trid is faled if its last failed time is non-zero.
6501 : * Insert the new alternate trid before any failed trid.
6502 : */
6503 5 : TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
6504 3 : if (tmp_trid->last_failed_tsc != 0) {
6505 1 : TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
6506 1 : return 0;
6507 : }
6508 4 : }
6509 :
6510 : add_tail:
6511 8 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
6512 8 : return 0;
6513 9 : }
6514 :
6515 : /* This is the case that a secondary path is added to an existing
6516 : * nvme_ctrlr for failover. After checking if it can access the same
6517 : * namespaces as the primary path, it is disconnected until failover occurs.
6518 : */
6519 : static int
6520 9 : bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6521 : struct spdk_nvme_ctrlr *new_ctrlr,
6522 : struct spdk_nvme_transport_id *trid)
6523 : {
6524 : int rc;
6525 :
6526 9 : assert(nvme_ctrlr != NULL);
6527 :
6528 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6529 :
6530 9 : rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid);
6531 9 : if (rc != 0) {
6532 0 : goto exit;
6533 : }
6534 :
6535 9 : rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr);
6536 9 : if (rc != 0) {
6537 0 : goto exit;
6538 : }
6539 :
6540 9 : rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
6541 :
6542 : exit:
6543 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6544 :
6545 9 : spdk_nvme_detach(new_ctrlr);
6546 :
6547 9 : return rc;
6548 : }
6549 :
6550 : static void
6551 48 : connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6552 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
6553 : {
6554 48 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6555 : struct nvme_async_probe_ctx *ctx;
6556 : int rc;
6557 :
6558 48 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6559 48 : ctx->ctrlr_attached = true;
6560 :
6561 48 : rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx);
6562 48 : if (rc != 0) {
6563 1 : ctx->reported_bdevs = 0;
6564 1 : populate_namespaces_cb(ctx, rc);
6565 1 : }
6566 48 : }
6567 :
6568 :
6569 : static void
6570 4 : connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6571 : struct spdk_nvme_ctrlr *ctrlr,
6572 : const struct spdk_nvme_ctrlr_opts *opts)
6573 : {
6574 4 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6575 : struct nvme_ctrlr *nvme_ctrlr;
6576 : struct nvme_async_probe_ctx *ctx;
6577 : int rc;
6578 :
6579 4 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6580 4 : ctx->ctrlr_attached = true;
6581 :
6582 4 : nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
6583 4 : if (nvme_ctrlr) {
6584 4 : rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
6585 4 : } else {
6586 0 : rc = -ENODEV;
6587 : }
6588 :
6589 4 : ctx->reported_bdevs = 0;
6590 4 : populate_namespaces_cb(ctx, rc);
6591 4 : }
6592 :
6593 : static int
6594 53 : bdev_nvme_async_poll(void *arg)
6595 : {
6596 53 : struct nvme_async_probe_ctx *ctx = arg;
6597 : int rc;
6598 :
6599 53 : rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
6600 53 : if (spdk_unlikely(rc != -EAGAIN)) {
6601 53 : ctx->probe_done = true;
6602 53 : spdk_poller_unregister(&ctx->poller);
6603 53 : if (!ctx->ctrlr_attached) {
6604 : /* The probe is done, but no controller was attached.
6605 : * That means we had a failure, so report -EIO back to
6606 : * the caller (usually the RPC). populate_namespaces_cb()
6607 : * will take care of freeing the nvme_async_probe_ctx.
6608 : */
6609 1 : ctx->reported_bdevs = 0;
6610 1 : populate_namespaces_cb(ctx, -EIO);
6611 53 : } else if (ctx->namespaces_populated) {
6612 : /* The namespaces for the attached controller were all
6613 : * populated and the response was already sent to the
6614 : * caller (usually the RPC). So free the context here.
6615 : */
6616 21 : free_nvme_async_probe_ctx(ctx);
6617 21 : }
6618 53 : }
6619 :
6620 53 : return SPDK_POLLER_BUSY;
6621 : }
6622 :
/*
 * Validate the combination of the three I/O error resiliency parameters.
 *
 * Rules enforced:
 *  - ctrlr_loss_timeout_sec must be >= -1.
 *  - If ctrlr_loss_timeout_sec is 0, both reconnect_delay_sec and
 *    fast_io_fail_timeout_sec must be 0.
 *  - Otherwise reconnect_delay_sec must be non-zero, and
 *    fast_io_fail_timeout_sec, when non-zero, must not be smaller than
 *    reconnect_delay_sec.
 *  - If ctrlr_loss_timeout_sec is positive, neither reconnect_delay_sec nor a
 *    non-zero fast_io_fail_timeout_sec may exceed it.
 *
 * Returns true if the combination is valid; otherwise logs the reason and
 * returns false.
 */
static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	}

	if (ctrlr_loss_timeout_sec == 0) {
		if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
			SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
			return false;
		}
		return true;
	}

	/* From here on ctrlr_loss_timeout_sec is either -1 or positive. */
	if (reconnect_delay_sec == 0) {
		SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
		return false;
	}

	/* The upper bound only applies to a positive ctrlr_loss_timeout_sec. */
	if (ctrlr_loss_timeout_sec > 0 &&
	    reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
		SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
		return false;
	}

	if (fast_io_fail_timeout_sec != 0) {
		if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
		if (ctrlr_loss_timeout_sec > 0 &&
		    fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		}
	}

	return true;
}
6663 :
/*
 * Start an asynchronous connect to the NVMe controller identified by trid.
 *
 * The request is validated, an nvme_async_probe_ctx is allocated and filled in,
 * any keyring keys named in the bdev options are looked up, and finally
 * spdk_nvme_connect_async() is started with a poller registered to drive it.
 *
 * trid      - transport ID to connect to; copied into the probe context.
 * base_name - controller name; must be 1 to SPDK_CONTROLLER_NAME_MAX - 1 characters.
 *             A private copy is made with strdup().
 * names     - stored in the probe context as-is (the pointer, not a copy).
 * count     - stored in the probe context as max_bdevs.
 * cb_fn     - stored in the probe context together with cb_ctx.
 * cb_ctx    - see cb_fn.
 * drv_opts  - driver options; copied, then several fields are overridden from g_opts.
 * bdev_opts - bdev-level options; may be NULL, in which case defaults are used.
 *
 * Returns 0 once the connect has been started, or a negative errno:
 *   -EEXIST  a controller with this trid/hostnqn already exists
 *   -EINVAL  bad name length, bad resiliency parameters, or multipath setting
 *            differs from the existing controller of the same name
 *   -ENOMEM  allocation failure
 *   -ENOTSUP interrupt mode requested for a non-PCIe transport
 *   -ENOKEY  a named key could not be loaded from the keyring
 *   -ENODEV  spdk_nvme_connect_async() did not return a probe context
 */
int
spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		      const char *base_name,
		      const char **names,
		      uint32_t count,
		      spdk_bdev_nvme_create_cb cb_fn,
		      void *cb_ctx,
		      struct spdk_nvme_ctrlr_opts *drv_opts,
		      struct spdk_bdev_nvme_ctrlr_opts *bdev_opts)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	struct nvme_ctrlr *nvme_ctrlr;
	int len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
	 */
	if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) {
		SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) "
			    "already exists.\n", trid->traddr, drv_opts->hostnqn);
		return -EEXIST;
	}

	/* strnlen() returns the bound itself when no terminator is found within it,
	 * so len == SPDK_CONTROLLER_NAME_MAX means the name is too long.
	 */
	len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX);

	if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) {
		SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1);
		return -EINVAL;
	}

	/* The checker logs the specific reason itself. */
	if (bdev_opts != NULL &&
	    !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec,
			    bdev_opts->reconnect_delay_sec,
			    bdev_opts->fast_io_fail_timeout_sec)) {
		return -EINVAL;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		return -ENOMEM;
	}
	ctx->base_name = strdup(base_name);
	if (!ctx->base_name) {
		free(ctx);
		return -ENOMEM;
	}
	ctx->names = names;
	ctx->max_bdevs = count;
	ctx->cb_fn = cb_fn;
	ctx->cb_ctx = cb_ctx;
	ctx->trid = *trid;

	if (bdev_opts) {
		memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
	} else {
		spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts);
	}

	/* For PCIe, drop the first entry in g_skipped_nvme_ctrlrs whose trid matches.
	 * NOTE(review): presumably this lets a previously skipped device be attached
	 * again - confirm against where the list is populated.
	 */
	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
				free(entry);
				break;
			}
		}
	}

	/* Start from the caller's driver options, then force the module-wide settings. */
	memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
	ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count;
	ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout;
	ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
	ctx->drv_opts.disable_read_ana_log_page = true;
	ctx->drv_opts.transport_tos = g_opts.transport_tos;

	if (spdk_interrupt_mode_is_enabled()) {
		if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
			ctx->drv_opts.enable_interrupts = true;
		} else {
			SPDK_ERRLOG("Interrupt mode is only supported with PCIe transport\n");
			free_nvme_async_probe_ctx(ctx);
			return -ENOTSUP;
		}
	}

	/* The bdev options carry key names; resolve each one through the keyring. */
	if (ctx->bdev_opts.psk != NULL) {
		ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk);
		if (ctx->drv_opts.tls_psk == NULL) {
			SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk);
			free_nvme_async_probe_ctx(ctx);
			return -ENOKEY;
		}
	}

	if (ctx->bdev_opts.dhchap_key != NULL) {
		ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key);
		if (ctx->drv_opts.dhchap_key == NULL) {
			SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n",
				    ctx->bdev_opts.dhchap_key);
			free_nvme_async_probe_ctx(ctx);
			return -ENOKEY;
		}

		ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests;
		ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups;
	}
	if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) {
		ctx->drv_opts.dhchap_ctrlr_key =
			spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key);
		if (ctx->drv_opts.dhchap_ctrlr_key == NULL) {
			SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n",
				    ctx->bdev_opts.dhchap_ctrlr_key);
			free_nvme_async_probe_ctx(ctx);
			return -ENOKEY;
		}
	}

	/* A name that is not yet known, or an explicit multipath request, uses the
	 * regular attach callback; otherwise the new trid is handled by the
	 * failover callback.
	 */
	if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) {
		attach_cb = connect_attach_cb;
	} else {
		attach_cb = connect_set_failover_cb;
	}

	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
	if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) {
		/* All controllers with the same name must be configured the same
		 * way, either for multipath or failover. If the configuration doesn't
		 * match - report error.
		 */
		free_nvme_async_probe_ctx(ctx);
		return -EINVAL;
	}

	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb);
	if (ctx->probe_ctx == NULL) {
		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
		free_nvme_async_probe_ctx(ctx);
		return -ENODEV;
	}
	/* NOTE(review): the poller handle is not checked for NULL here. */
	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);

	return 0;
}
6809 :
/* State kept by bdev_nvme_delete() while it waits for a deleted path to disappear. */
struct bdev_nvme_delete_ctx {
	/* Heap copy of the controller name; released by free_bdev_nvme_delete_ctx(). */
	char *name;
	/* Copy of the path filter that was passed to bdev_nvme_delete(). */
	struct nvme_path_id path_id;
	/* Completion callback and its argument, invoked once by the poller. */
	bdev_nvme_delete_done_fn delete_done;
	void *delete_done_ctx;
	/* Tick value after which the poller gives up and reports -ETIMEDOUT. */
	uint64_t timeout_ticks;
	/* Poller running bdev_nvme_delete_complete_poll(). */
	struct spdk_poller *poller;
};
6818 :
6819 : static void
6820 2 : free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx)
6821 : {
6822 2 : if (ctx != NULL) {
6823 1 : free(ctx->name);
6824 1 : free(ctx);
6825 1 : }
6826 2 : }
6827 :
6828 : static bool
6829 76 : nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id)
6830 : {
6831 76 : if (path_id->trid.trtype != 0) {
6832 21 : if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
6833 0 : if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
6834 0 : return false;
6835 : }
6836 0 : } else {
6837 21 : if (path_id->trid.trtype != p->trid.trtype) {
6838 0 : return false;
6839 : }
6840 : }
6841 21 : }
6842 :
6843 76 : if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
6844 21 : if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
6845 11 : return false;
6846 : }
6847 10 : }
6848 :
6849 65 : if (path_id->trid.adrfam != 0) {
6850 0 : if (path_id->trid.adrfam != p->trid.adrfam) {
6851 0 : return false;
6852 : }
6853 0 : }
6854 :
6855 65 : if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
6856 10 : if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
6857 0 : return false;
6858 : }
6859 10 : }
6860 :
6861 65 : if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
6862 10 : if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
6863 0 : return false;
6864 : }
6865 10 : }
6866 :
6867 65 : if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
6868 0 : if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
6869 0 : return false;
6870 : }
6871 0 : }
6872 :
6873 65 : if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
6874 0 : if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
6875 0 : return false;
6876 : }
6877 0 : }
6878 :
6879 65 : return true;
6880 76 : }
6881 :
6882 : static bool
6883 2 : nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id)
6884 : {
6885 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
6886 : struct nvme_ctrlr *ctrlr;
6887 : struct nvme_path_id *p;
6888 :
6889 2 : pthread_mutex_lock(&g_bdev_nvme_mutex);
6890 2 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
6891 2 : if (!nbdev_ctrlr) {
6892 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6893 1 : return false;
6894 : }
6895 :
6896 1 : TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
6897 1 : pthread_mutex_lock(&ctrlr->mutex);
6898 1 : TAILQ_FOREACH(p, &ctrlr->trids, link) {
6899 1 : if (nvme_path_id_compare(p, path_id)) {
6900 1 : pthread_mutex_unlock(&ctrlr->mutex);
6901 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6902 1 : return true;
6903 : }
6904 0 : }
6905 0 : pthread_mutex_unlock(&ctrlr->mutex);
6906 0 : }
6907 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6908 :
6909 0 : return false;
6910 2 : }
6911 :
6912 : static int
6913 2 : bdev_nvme_delete_complete_poll(void *arg)
6914 : {
6915 2 : struct bdev_nvme_delete_ctx *ctx = arg;
6916 2 : int rc = 0;
6917 :
6918 2 : if (nvme_path_id_exists(ctx->name, &ctx->path_id)) {
6919 1 : if (ctx->timeout_ticks > spdk_get_ticks()) {
6920 1 : return SPDK_POLLER_BUSY;
6921 : }
6922 :
6923 0 : SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name);
6924 0 : rc = -ETIMEDOUT;
6925 0 : }
6926 :
6927 1 : spdk_poller_unregister(&ctx->poller);
6928 :
6929 1 : ctx->delete_done(ctx->delete_done_ctx, rc);
6930 1 : free_bdev_nvme_delete_ctx(ctx);
6931 :
6932 1 : return SPDK_POLLER_BUSY;
6933 2 : }
6934 :
/*
 * Remove every path of nvme_ctrlr that matches path_id.
 *
 * Alternate (non-first) paths that match are simply unlinked and freed. If the
 * first path in the list - the active one - also matches, the controller is
 * either deleted (no other path left) or failed over to the next path.
 *
 * Returns 0 if something was removed or removal was started, -ENXIO if no path
 * matched, or the error from the delete/failover helper. -EALREADY from the
 * helper is reported as 0.
 */
static int
_bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id)
{
	struct nvme_path_id *p, *t;
	spdk_msg_fn msg_fn;
	int rc = -ENXIO;

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	/* Walk from the tail towards the head and stop on the head entry, so that
	 * only alternate paths are removed here. When the loop ends, p is either
	 * the head entry or NULL (empty list).
	 */
	TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
		if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) {
			break;
		}

		if (!nvme_path_id_compare(p, path_id)) {
			continue;
		}

		/* We are not using the specified path. */
		TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
		free(p);
		rc = 0;
	}

	/* Head entry does not match (or there is none): report what the loop did. */
	if (p == NULL || !nvme_path_id_compare(p, path_id)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return rc;
	}

	/* If we made it here, then this path is a match! Now we need to remove it. */

	/* This is the active path in use right now.  The active path is always the first in the list. */
	assert(p == nvme_ctrlr->active_path_id);

	if (!TAILQ_NEXT(p, link)) {
		/* The current path is the only path. */
		msg_fn = _nvme_ctrlr_destruct;
		rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false);
	} else {
		/* There is an alternative path. */
		msg_fn = _bdev_nvme_reset_ctrlr;
		rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true);
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	/* The follow-up work is sent to the controller's thread after the mutex is dropped. */
	if (rc == 0) {
		spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
	} else if (rc == -EALREADY) {
		rc = 0;
	}

	return rc;
}
6989 :
/*
 * Delete the paths matching path_id from every controller registered under name.
 *
 * name            - name of the NVMe bdev controller; must not be NULL.
 * path_id         - path filter (see nvme_path_id_compare()); must not be NULL.
 * delete_done     - optional. When set and the deletion was started, a poller
 *                   is registered that calls it once the path is gone or a
 *                   10 second deadline expires.
 * delete_done_ctx - argument passed to delete_done.
 *
 * Returns 0 on success, -EINVAL for NULL arguments, -ENODEV if the name is
 * unknown, -ENXIO if no path matched on any controller, -ENOMEM if the
 * completion context or poller could not be set up, or the first other error
 * reported by _bdev_nvme_delete().
 */
int
bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
		 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr;
	struct bdev_nvme_delete_ctx *ctx = NULL;
	int rc = -ENXIO, _rc;

	if (name == NULL || path_id == NULL) {
		rc = -EINVAL;
		goto exit;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr == NULL) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
		rc = -ENODEV;
		goto exit;
	}

	TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
		_rc = _bdev_nvme_delete(nvme_ctrlr, path_id);
		/* -ENXIO only means "no match on this controller"; anything else aborts. */
		if (_rc < 0 && _rc != -ENXIO) {
			pthread_mutex_unlock(&g_bdev_nvme_mutex);
			rc = _rc;
			goto exit;
		} else if (_rc == 0) {
			/* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr
			 * was deleted successfully. To remember the successful deletion,
			 * overwrite rc only if _rc is zero.
			 */
			rc = 0;
		}
	}

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	/* Nothing to wait for on failure, or when the caller did not ask for a callback. */
	if (rc != 0 || delete_done == NULL) {
		goto exit;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n");
		rc = -ENOMEM;
		goto exit;
	}

	ctx->name = strdup(name);
	if (ctx->name == NULL) {
		SPDK_ERRLOG("Failed to copy controller name for deletion\n");
		rc = -ENOMEM;
		goto exit;
	}

	ctx->delete_done = delete_done;
	ctx->delete_done_ctx = delete_done_ctx;
	ctx->path_id = *path_id;
	/* Deadline: 10 seconds worth of ticks from now. */
	ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz();
	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000);
	if (ctx->poller == NULL) {
		SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n");
		rc = -ENOMEM;
		goto exit;
	}

exit:
	/* On success the poller owns ctx; on any failure it is released here
	 * (ctx may still be NULL, which the helper accepts).
	 */
	if (rc != 0) {
		free_bdev_nvme_delete_ctx(ctx);
	}

	return rc;
}
7068 :
/* Logging helpers that prefix each message with the traddr and trsvcid of the
 * discovery context. ctx must be an expression of pointer type with a `trid`
 * member; it is expanded without parentheses. Both expansions already end
 * with a semicolon.
 */
#define DISCOVERY_INFOLOG(ctx, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__);

#define DISCOVERY_ERRLOG(ctx, format, ...) \
	SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__);
7074 :
/* One entry tracked by a discovery context: either an attached NVM subsystem
 * path (kept on nvm_entry_ctxs) or a discovery controller that can be
 * connected to (kept on discovery_entry_ctxs).
 */
struct discovery_entry_ctx {
	/* Controller name passed to spdk_bdev_nvme_create() / bdev_nvme_delete(). */
	char						name[128];
	/* Transport ID used to connect to this entry. */
	struct spdk_nvme_transport_id			trid;
	/* Driver options; initialized to defaults with the hostnqn of the owning context. */
	struct spdk_nvme_ctrlr_opts			drv_opts;
	/* Raw log page entry; compared byte-for-byte against later log pages. */
	struct spdk_nvmf_discovery_log_page_entry	entry;
	TAILQ_ENTRY(discovery_entry_ctx)		tailq;
	/* Owning discovery context. */
	struct discovery_ctx				*ctx;
};
7083 :
/* State of one discovery service instance, driven by discovery_poller(). */
struct discovery_ctx {
	/* Service name; also the prefix of names generated for discovered subsystems. */
	char					*name;
	/* Called once by complete_discovery_start(), then cleared. */
	spdk_bdev_nvme_start_discovery_fn	start_cb_fn;
	/* Called by the poller right before the context is freed on stop. */
	spdk_bdev_nvme_stop_discovery_fn	stop_cb_fn;
	/* Argument shared by start_cb_fn and stop_cb_fn. */
	void					*cb_ctx;
	/* Non-NULL while a connect to a discovery controller is being polled. */
	struct spdk_nvme_probe_ctx		*probe_ctx;
	/* Non-NULL while the discovery controller is being detached. */
	struct spdk_nvme_detach_ctx		*detach_ctx;
	/* Currently attached discovery controller, or NULL. */
	struct spdk_nvme_ctrlr			*ctrlr;
	struct spdk_nvme_transport_id		trid;
	/* Entry whose trid is being used for the current discovery controller. */
	struct discovery_entry_ctx		*entry_ctx_in_use;
	struct spdk_poller			*poller;
	struct spdk_nvme_ctrlr_opts		drv_opts;
	struct spdk_bdev_nvme_ctrlr_opts	bdev_opts;
	/* Most recently received log page; freed by discovery_remove_controllers(). */
	struct spdk_nvmf_discovery_log_page	*log_page;
	TAILQ_ENTRY(discovery_ctx)		tailq;
	/* NVM subsystem paths that were attached on behalf of this service. */
	TAILQ_HEAD(, discovery_entry_ctx)	nvm_entry_ctxs;
	/* Discovery controllers that can be tried for (re)connect. */
	TAILQ_HEAD(, discovery_entry_ctx)	discovery_entry_ctxs;
	/* Status recorded by complete_discovery_start(). */
	int					rc;
	bool					wait_for_attach;
	/* Attach deadline in ticks; 0 means no deadline is armed. */
	uint64_t				timeout_ticks;
	/* Denotes that the discovery service is being started. We're waiting
	 * for the initial connection to the discovery controller to be
	 * established and attach discovered NVM ctrlrs.
	 */
	bool					initializing;
	/* Denotes if a discovery is currently in progress for this context.
	 * That includes connecting to newly discovered subsystems.  Used to
	 * ensure we do not start a new discovery until an existing one is
	 * complete.
	 */
	bool					in_progress;

	/* Denotes if another discovery is needed after the one in progress
	 * completes.  Set when we receive an AER completion while a discovery
	 * is already in progress.
	 */
	bool					pending;

	/* Signal to the discovery context poller that it should stop the
	 * discovery service, including detaching from the current discovery
	 * controller.
	 */
	bool					stop;

	struct spdk_thread			*calling_thread;
	/* Counter appended to `name` when naming a newly discovered subsystem. */
	uint32_t				index;
	/* Number of spdk_bdev_nvme_create() calls whose completion is still outstanding. */
	uint32_t				attach_in_progress;
	/* Host NQN copied into the driver options of every entry. */
	char					*hostnqn;

	/* Denotes if the discovery service was started by the mdns discovery.
	 */
	bool					from_mdns_discovery_service;
};
7137 :
/* List of all discovery contexts; entries are added by start_discovery_poller()
 * and removed by discovery_poller().
 */
TAILQ_HEAD(discovery_ctxs, discovery_ctx);
static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs);

/* Forward declaration: discovery_complete() needs it before its definition. */
static void get_discovery_log_page(struct discovery_ctx *ctx);
7142 :
7143 : static void
7144 0 : free_discovery_ctx(struct discovery_ctx *ctx)
7145 : {
7146 0 : free(ctx->log_page);
7147 0 : free(ctx->hostnqn);
7148 0 : free(ctx->name);
7149 0 : free(ctx);
7150 0 : }
7151 :
7152 : static void
7153 0 : discovery_complete(struct discovery_ctx *ctx)
7154 : {
7155 0 : ctx->initializing = false;
7156 0 : ctx->in_progress = false;
7157 0 : if (ctx->pending) {
7158 0 : ctx->pending = false;
7159 0 : get_discovery_log_page(ctx);
7160 0 : }
7161 0 : }
7162 :
7163 : static void
7164 0 : build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid,
7165 : struct spdk_nvmf_discovery_log_page_entry *entry)
7166 : {
7167 : char *space;
7168 :
7169 0 : trid->trtype = entry->trtype;
7170 0 : trid->adrfam = entry->adrfam;
7171 0 : memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr));
7172 0 : memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid));
7173 : /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and
7174 : * before call to this function trid->subnqn is zeroed out, we need
7175 : * to copy sizeof(trid->subnqn) minus one byte to make sure the last character
7176 : * remains 0. Then we can shorten the string (replace ' ' with 0) if required
7177 : */
7178 0 : memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1);
7179 :
7180 : /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated.
7181 : * But the log page entries typically pad them with spaces, not zeroes.
7182 : * So add a NULL terminator to each of these fields at the appropriate
7183 : * location.
7184 : */
7185 0 : space = strchr(trid->traddr, ' ');
7186 0 : if (space) {
7187 0 : *space = 0;
7188 0 : }
7189 0 : space = strchr(trid->trsvcid, ' ');
7190 0 : if (space) {
7191 0 : *space = 0;
7192 0 : }
7193 0 : space = strchr(trid->subnqn, ' ');
7194 0 : if (space) {
7195 0 : *space = 0;
7196 0 : }
7197 0 : }
7198 :
7199 : static void
7200 0 : _stop_discovery(void *_ctx)
7201 : {
7202 0 : struct discovery_ctx *ctx = _ctx;
7203 :
7204 0 : if (ctx->attach_in_progress > 0) {
7205 0 : spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx);
7206 0 : return;
7207 : }
7208 :
7209 0 : ctx->stop = true;
7210 :
7211 0 : while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) {
7212 : struct discovery_entry_ctx *entry_ctx;
7213 0 : struct nvme_path_id path = {};
7214 :
7215 0 : entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs);
7216 0 : path.trid = entry_ctx->trid;
7217 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
7218 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
7219 0 : free(entry_ctx);
7220 : }
7221 :
7222 0 : while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) {
7223 : struct discovery_entry_ctx *entry_ctx;
7224 :
7225 0 : entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
7226 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
7227 0 : free(entry_ctx);
7228 : }
7229 :
7230 0 : free(ctx->entry_ctx_in_use);
7231 0 : ctx->entry_ctx_in_use = NULL;
7232 0 : }
7233 :
7234 : static void
7235 0 : stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
7236 : {
7237 0 : ctx->stop_cb_fn = cb_fn;
7238 0 : ctx->cb_ctx = cb_ctx;
7239 :
7240 0 : if (ctx->attach_in_progress > 0) {
7241 0 : DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n",
7242 : ctx->attach_in_progress);
7243 0 : }
7244 :
7245 0 : _stop_discovery(ctx);
7246 0 : }
7247 :
7248 : static void
7249 2 : remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr)
7250 : {
7251 : struct discovery_ctx *d_ctx;
7252 : struct nvme_path_id *path_id;
7253 2 : struct spdk_nvme_transport_id trid = {};
7254 : struct discovery_entry_ctx *entry_ctx, *tmp;
7255 :
7256 2 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
7257 :
7258 2 : TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
7259 0 : TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) {
7260 0 : build_trid_from_log_page_entry(&trid, &entry_ctx->entry);
7261 0 : if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) {
7262 0 : continue;
7263 : }
7264 :
7265 0 : TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq);
7266 0 : free(entry_ctx);
7267 0 : DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n",
7268 : trid.subnqn, trid.traddr, trid.trsvcid);
7269 :
7270 : /* Fail discovery ctrlr to force reattach attempt */
7271 0 : spdk_nvme_ctrlr_fail(d_ctx->ctrlr);
7272 0 : }
7273 0 : }
7274 2 : }
7275 :
7276 : static void
7277 0 : discovery_remove_controllers(struct discovery_ctx *ctx)
7278 : {
7279 0 : struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page;
7280 : struct discovery_entry_ctx *entry_ctx, *tmp;
7281 : struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
7282 0 : struct spdk_nvme_transport_id old_trid = {};
7283 : uint64_t numrec, i;
7284 : bool found;
7285 :
7286 0 : numrec = from_le64(&log_page->numrec);
7287 0 : TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) {
7288 0 : found = false;
7289 0 : old_entry = &entry_ctx->entry;
7290 0 : build_trid_from_log_page_entry(&old_trid, old_entry);
7291 0 : for (i = 0; i < numrec; i++) {
7292 0 : new_entry = &log_page->entries[i];
7293 0 : if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) {
7294 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n",
7295 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
7296 0 : found = true;
7297 0 : break;
7298 : }
7299 0 : }
7300 0 : if (!found) {
7301 0 : struct nvme_path_id path = {};
7302 :
7303 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n",
7304 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
7305 :
7306 0 : path.trid = entry_ctx->trid;
7307 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
7308 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
7309 0 : free(entry_ctx);
7310 0 : }
7311 0 : }
7312 0 : free(log_page);
7313 0 : ctx->log_page = NULL;
7314 0 : discovery_complete(ctx);
7315 0 : }
7316 :
7317 : static void
7318 0 : complete_discovery_start(struct discovery_ctx *ctx, int status)
7319 : {
7320 0 : ctx->timeout_ticks = 0;
7321 0 : ctx->rc = status;
7322 0 : if (ctx->start_cb_fn) {
7323 0 : ctx->start_cb_fn(ctx->cb_ctx, status);
7324 0 : ctx->start_cb_fn = NULL;
7325 0 : ctx->cb_ctx = NULL;
7326 0 : }
7327 0 : }
7328 :
7329 : static void
7330 0 : discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc)
7331 : {
7332 0 : struct discovery_entry_ctx *entry_ctx = cb_ctx;
7333 0 : struct discovery_ctx *ctx = entry_ctx->ctx;
7334 :
7335 0 : DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name);
7336 0 : ctx->attach_in_progress--;
7337 0 : if (ctx->attach_in_progress == 0) {
7338 0 : complete_discovery_start(ctx, ctx->rc);
7339 0 : if (ctx->initializing && ctx->rc != 0) {
7340 0 : DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc);
7341 0 : stop_discovery(ctx, NULL, ctx->cb_ctx);
7342 0 : } else {
7343 0 : discovery_remove_controllers(ctx);
7344 : }
7345 0 : }
7346 0 : }
7347 :
7348 : static struct discovery_entry_ctx *
7349 0 : create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid)
7350 : {
7351 : struct discovery_entry_ctx *new_ctx;
7352 :
7353 0 : new_ctx = calloc(1, sizeof(*new_ctx));
7354 0 : if (new_ctx == NULL) {
7355 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7356 0 : return NULL;
7357 : }
7358 :
7359 0 : new_ctx->ctx = ctx;
7360 0 : memcpy(&new_ctx->trid, trid, sizeof(*trid));
7361 0 : spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
7362 0 : snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
7363 0 : return new_ctx;
7364 0 : }
7365 :
7366 : static void
7367 0 : discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl,
7368 : struct spdk_nvmf_discovery_log_page *log_page)
7369 : {
7370 0 : struct discovery_ctx *ctx = cb_arg;
7371 : struct discovery_entry_ctx *entry_ctx, *tmp;
7372 : struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
7373 : uint64_t numrec, i;
7374 : bool found;
7375 :
7376 0 : if (rc || spdk_nvme_cpl_is_error(cpl)) {
7377 0 : DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
7378 0 : return;
7379 : }
7380 :
7381 0 : ctx->log_page = log_page;
7382 0 : assert(ctx->attach_in_progress == 0);
7383 0 : numrec = from_le64(&log_page->numrec);
7384 0 : TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) {
7385 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
7386 0 : free(entry_ctx);
7387 0 : }
7388 0 : for (i = 0; i < numrec; i++) {
7389 0 : found = false;
7390 0 : new_entry = &log_page->entries[i];
7391 0 : if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT ||
7392 0 : new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
7393 : struct discovery_entry_ctx *new_ctx;
7394 0 : struct spdk_nvme_transport_id trid = {};
7395 :
7396 0 : build_trid_from_log_page_entry(&trid, new_entry);
7397 0 : new_ctx = create_discovery_entry_ctx(ctx, &trid);
7398 0 : if (new_ctx == NULL) {
7399 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7400 0 : break;
7401 : }
7402 :
7403 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq);
7404 0 : continue;
7405 : }
7406 0 : TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) {
7407 0 : old_entry = &entry_ctx->entry;
7408 0 : if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) {
7409 0 : found = true;
7410 0 : break;
7411 : }
7412 0 : }
7413 0 : if (!found) {
7414 0 : struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx;
7415 : struct discovery_ctx *d_ctx;
7416 :
7417 0 : TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
7418 0 : TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) {
7419 0 : if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn,
7420 : sizeof(new_entry->subnqn))) {
7421 0 : break;
7422 : }
7423 0 : }
7424 0 : if (subnqn_ctx) {
7425 0 : break;
7426 : }
7427 0 : }
7428 :
7429 0 : new_ctx = calloc(1, sizeof(*new_ctx));
7430 0 : if (new_ctx == NULL) {
7431 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7432 0 : break;
7433 : }
7434 :
7435 0 : new_ctx->ctx = ctx;
7436 0 : memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry));
7437 0 : build_trid_from_log_page_entry(&new_ctx->trid, new_entry);
7438 0 : if (subnqn_ctx) {
7439 0 : snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name);
7440 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n",
7441 : new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
7442 : new_ctx->name);
7443 0 : } else {
7444 0 : snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++);
7445 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n",
7446 : new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
7447 : new_ctx->name);
7448 : }
7449 0 : spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
7450 0 : snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
7451 0 : rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0,
7452 0 : discovery_attach_controller_done, new_ctx,
7453 0 : &new_ctx->drv_opts, &ctx->bdev_opts);
7454 0 : if (rc == 0) {
7455 0 : TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq);
7456 0 : ctx->attach_in_progress++;
7457 0 : } else {
7458 0 : DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", spdk_strerror(-rc));
7459 : }
7460 0 : }
7461 0 : }
7462 :
7463 0 : if (ctx->attach_in_progress == 0) {
7464 0 : discovery_remove_controllers(ctx);
7465 0 : }
7466 0 : }
7467 :
7468 : static void
7469 0 : get_discovery_log_page(struct discovery_ctx *ctx)
7470 : {
7471 : int rc;
7472 :
7473 0 : assert(ctx->in_progress == false);
7474 0 : ctx->in_progress = true;
7475 0 : rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx);
7476 0 : if (rc != 0) {
7477 0 : DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
7478 0 : }
7479 0 : DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n");
7480 0 : }
7481 :
7482 : static void
7483 0 : discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
7484 : {
7485 0 : struct discovery_ctx *ctx = arg;
7486 0 : uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
7487 :
7488 0 : if (spdk_nvme_cpl_is_error(cpl)) {
7489 0 : DISCOVERY_ERRLOG(ctx, "aer failed\n");
7490 0 : return;
7491 : }
7492 :
7493 0 : if (log_page_id != SPDK_NVME_LOG_DISCOVERY) {
7494 0 : DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id);
7495 0 : return;
7496 : }
7497 :
7498 0 : DISCOVERY_INFOLOG(ctx, "got aer\n");
7499 0 : if (ctx->in_progress) {
7500 0 : ctx->pending = true;
7501 0 : return;
7502 : }
7503 :
7504 0 : get_discovery_log_page(ctx);
7505 0 : }
7506 :
7507 : static void
7508 0 : discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
7509 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
7510 : {
7511 0 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
7512 : struct discovery_ctx *ctx;
7513 :
7514 0 : ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts);
7515 :
7516 0 : DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n");
7517 0 : ctx->probe_ctx = NULL;
7518 0 : ctx->ctrlr = ctrlr;
7519 :
7520 0 : if (ctx->rc != 0) {
7521 0 : DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n",
7522 : ctx->rc);
7523 0 : return;
7524 : }
7525 :
7526 0 : spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx);
7527 0 : }
7528 :
/*
 * Poller that drives one discovery context. On every invocation exactly one
 * of the following states is handled, checked in this order:
 *
 *  1. detach_ctx set            - keep polling the detach of the discovery
 *                                 controller; forget the controller when done.
 *  2. stop requested            - detach the controller if there is one,
 *                                 otherwise unregister, unlink, notify the stop
 *                                 callback and free the context.
 *  3. no probe, no controller   - pick the next known discovery controller and
 *                                 start connecting to it (or time out).
 *  4. probe in flight           - poll the connect; on completion either stop
 *                                 (error recorded) or request the log page.
 *  5. controller attached       - process admin completions; on failure fall
 *                                 back to the slow poll period and detach.
 *
 * Always returns SPDK_POLLER_BUSY.
 */
static int
discovery_poller(void *arg)
{
	struct discovery_ctx *ctx = arg;
	struct spdk_nvme_transport_id *trid;
	int rc;

	if (ctx->detach_ctx) {
		rc = spdk_nvme_detach_poll_async(ctx->detach_ctx);
		if (rc != -EAGAIN) {
			ctx->detach_ctx = NULL;
			ctx->ctrlr = NULL;
		}
	} else if (ctx->stop) {
		if (ctx->ctrlr != NULL) {
			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc == 0) {
				/* The detach is finished by state 1 on later invocations. */
				return SPDK_POLLER_BUSY;
			}
			DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
		}
		spdk_poller_unregister(&ctx->poller);
		TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
		assert(ctx->start_cb_fn == NULL);
		if (ctx->stop_cb_fn != NULL) {
			ctx->stop_cb_fn(ctx->cb_ctx);
		}
		free_discovery_ctx(ctx);
	} else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			assert(ctx->initializing);
			spdk_poller_unregister(&ctx->poller);
			TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
			complete_discovery_start(ctx, -ETIMEDOUT);
			stop_discovery(ctx, NULL, NULL);
			/* NOTE(review): the context is freed right after stop_discovery();
			 * this relies on no attach being in progress here, since
			 * _stop_discovery() would otherwise re-queue itself with ctx - confirm.
			 */
			free_discovery_ctx(ctx);
			return SPDK_POLLER_BUSY;
		}

		/* NOTE(review): assumes discovery_entry_ctxs is not empty at this point - confirm. */
		assert(ctx->entry_ctx_in_use == NULL);
		ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
		TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
		trid = &ctx->entry_ctx_in_use->trid;

		/* All controllers must be configured explicitly either for multipath or failover.
		 * While discovery use multipath mode, we need to set this in bdev options as well.
		 */
		ctx->bdev_opts.multipath = true;

		ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb);
		if (ctx->probe_ctx) {
			/* Switch to the fast (1000 us) period while connected or connecting. */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000);
		} else {
			DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n");
			/* Put the entry back at the tail so that another one is tried next. */
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;
		}
	} else if (ctx->probe_ctx) {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			/* This records -ETIMEDOUT in ctx->rc and clears the deadline; the
			 * connect keeps being polled and the stop happens once it finishes.
			 */
			complete_discovery_start(ctx, -ETIMEDOUT);
			return SPDK_POLLER_BUSY;
		}

		rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
		if (rc != -EAGAIN) {
			if (ctx->rc != 0) {
				assert(ctx->initializing);
				stop_discovery(ctx, NULL, ctx->cb_ctx);
			} else {
				assert(rc == 0);
				DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n");
				ctx->rc = rc;
				get_discovery_log_page(ctx);
			}
		}
	} else {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n");
			complete_discovery_start(ctx, -ETIMEDOUT);
			/* We need to wait until all NVM ctrlrs are attached before we stop the
			 * discovery service to make sure we don't detach a ctrlr that is still
			 * being attached.
			 */
			if (ctx->attach_in_progress == 0) {
				stop_discovery(ctx, NULL, ctx->cb_ctx);
				return SPDK_POLLER_BUSY;
			}
		}

		rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr);
		if (rc < 0) {
			/* Drop back to the slow (1 s) period, return the entry in use to the
			 * list and detach the discovery controller.
			 */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;

			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc != 0) {
				DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
				ctx->ctrlr = NULL;
			}
		}
	}

	return SPDK_POLLER_BUSY;
}
7638 :
7639 : static void
7640 0 : start_discovery_poller(void *arg)
7641 : {
7642 0 : struct discovery_ctx *ctx = arg;
7643 :
7644 0 : TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq);
7645 0 : ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
7646 0 : }
7647 :
7648 : int
7649 0 : bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid,
7650 : const char *base_name,
7651 : struct spdk_nvme_ctrlr_opts *drv_opts,
7652 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts,
7653 : uint64_t attach_timeout,
7654 : bool from_mdns,
7655 : spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx)
7656 : {
7657 : struct discovery_ctx *ctx;
7658 : struct discovery_entry_ctx *discovery_entry_ctx;
7659 :
7660 0 : snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
7661 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7662 0 : if (strcmp(ctx->name, base_name) == 0) {
7663 0 : return -EEXIST;
7664 : }
7665 :
7666 0 : if (ctx->entry_ctx_in_use != NULL) {
7667 0 : if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) {
7668 0 : return -EEXIST;
7669 : }
7670 0 : }
7671 :
7672 0 : TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
7673 0 : if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) {
7674 0 : return -EEXIST;
7675 : }
7676 0 : }
7677 0 : }
7678 :
7679 0 : ctx = calloc(1, sizeof(*ctx));
7680 0 : if (ctx == NULL) {
7681 0 : return -ENOMEM;
7682 : }
7683 :
7684 0 : ctx->name = strdup(base_name);
7685 0 : if (ctx->name == NULL) {
7686 0 : free_discovery_ctx(ctx);
7687 0 : return -ENOMEM;
7688 : }
7689 0 : memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
7690 0 : memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
7691 0 : ctx->from_mdns_discovery_service = from_mdns;
7692 0 : ctx->bdev_opts.from_discovery_service = true;
7693 0 : ctx->calling_thread = spdk_get_thread();
7694 0 : ctx->start_cb_fn = cb_fn;
7695 0 : ctx->cb_ctx = cb_ctx;
7696 0 : ctx->initializing = true;
7697 0 : if (ctx->start_cb_fn) {
7698 : /* We can use this when dumping json to denote if this RPC parameter
7699 : * was specified or not.
7700 : */
7701 0 : ctx->wait_for_attach = true;
7702 0 : }
7703 0 : if (attach_timeout != 0) {
7704 0 : ctx->timeout_ticks = spdk_get_ticks() + attach_timeout *
7705 0 : spdk_get_ticks_hz() / 1000ull;
7706 0 : }
7707 0 : TAILQ_INIT(&ctx->nvm_entry_ctxs);
7708 0 : TAILQ_INIT(&ctx->discovery_entry_ctxs);
7709 0 : memcpy(&ctx->trid, trid, sizeof(*trid));
7710 : /* Even if user did not specify hostnqn, we can still strdup("\0"); */
7711 0 : ctx->hostnqn = strdup(ctx->drv_opts.hostnqn);
7712 0 : if (ctx->hostnqn == NULL) {
7713 0 : free_discovery_ctx(ctx);
7714 0 : return -ENOMEM;
7715 : }
7716 0 : discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid);
7717 0 : if (discovery_entry_ctx == NULL) {
7718 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7719 0 : free_discovery_ctx(ctx);
7720 0 : return -ENOMEM;
7721 : }
7722 :
7723 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq);
7724 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx);
7725 0 : return 0;
7726 0 : }
7727 :
7728 : int
7729 0 : bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
7730 : {
7731 : struct discovery_ctx *ctx;
7732 :
7733 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7734 0 : if (strcmp(name, ctx->name) == 0) {
7735 0 : if (ctx->stop) {
7736 0 : return -EALREADY;
7737 : }
7738 : /* If we're still starting the discovery service and ->rc is non-zero, we're
7739 : * going to stop it as soon as we can
7740 : */
7741 0 : if (ctx->initializing && ctx->rc != 0) {
7742 0 : return -EALREADY;
7743 : }
7744 0 : stop_discovery(ctx, cb_fn, cb_ctx);
7745 0 : return 0;
7746 : }
7747 0 : }
7748 :
7749 0 : return -ENOENT;
7750 0 : }
7751 :
7752 : static int
7753 1 : bdev_nvme_library_init(void)
7754 : {
7755 1 : g_bdev_nvme_init_thread = spdk_get_thread();
7756 :
7757 1 : spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
7758 : bdev_nvme_destroy_poll_group_cb,
7759 : sizeof(struct nvme_poll_group), "nvme_poll_groups");
7760 :
7761 1 : return 0;
7762 : }
7763 :
7764 : static void
7765 1 : bdev_nvme_fini_destruct_ctrlrs(void)
7766 : {
7767 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
7768 : struct nvme_ctrlr *nvme_ctrlr;
7769 :
7770 1 : pthread_mutex_lock(&g_bdev_nvme_mutex);
7771 1 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
7772 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
7773 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
7774 0 : if (nvme_ctrlr->destruct) {
7775 : /* This controller's destruction was already started
7776 : * before the application started shutting down
7777 : */
7778 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7779 0 : continue;
7780 : }
7781 0 : nvme_ctrlr->destruct = true;
7782 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7783 :
7784 0 : spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
7785 0 : nvme_ctrlr);
7786 0 : }
7787 0 : }
7788 :
7789 1 : g_bdev_nvme_module_finish = true;
7790 1 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
7791 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7792 1 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
7793 1 : spdk_bdev_module_fini_done();
7794 1 : return;
7795 : }
7796 :
7797 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7798 1 : }
7799 :
7800 : static void
7801 0 : check_discovery_fini(void *arg)
7802 : {
7803 0 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7804 0 : bdev_nvme_fini_destruct_ctrlrs();
7805 0 : }
7806 0 : }
7807 :
7808 : static void
7809 1 : bdev_nvme_library_fini(void)
7810 : {
7811 : struct nvme_probe_skip_entry *entry, *entry_tmp;
7812 : struct discovery_ctx *ctx;
7813 :
7814 1 : spdk_poller_unregister(&g_hotplug_poller);
7815 1 : free(g_hotplug_probe_ctx);
7816 1 : g_hotplug_probe_ctx = NULL;
7817 :
7818 1 : TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
7819 0 : TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
7820 0 : free(entry);
7821 0 : }
7822 :
7823 1 : assert(spdk_get_thread() == g_bdev_nvme_init_thread);
7824 1 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7825 1 : bdev_nvme_fini_destruct_ctrlrs();
7826 1 : } else {
7827 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7828 0 : stop_discovery(ctx, check_discovery_fini, NULL);
7829 0 : }
7830 : }
7831 1 : }
7832 :
7833 : static void
7834 0 : bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
7835 : {
7836 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7837 0 : struct spdk_bdev *bdev = bdev_io->bdev;
7838 : struct spdk_dif_ctx dif_ctx;
7839 0 : struct spdk_dif_error err_blk = {};
7840 : int rc;
7841 : struct spdk_dif_ctx_init_ext_opts dif_opts;
7842 :
7843 0 : dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
7844 0 : dif_opts.dif_pi_format = bdev->dif_pi_format;
7845 0 : rc = spdk_dif_ctx_init(&dif_ctx,
7846 0 : bdev->blocklen, bdev->md_len, bdev->md_interleave,
7847 0 : bdev->dif_is_head_of_md, bdev->dif_type,
7848 0 : bdev_io->u.bdev.dif_check_flags,
7849 0 : bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts);
7850 0 : if (rc != 0) {
7851 0 : SPDK_ERRLOG("Initialization of DIF context failed\n");
7852 0 : return;
7853 : }
7854 :
7855 0 : if (bdev->md_interleave) {
7856 0 : rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7857 0 : bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7858 0 : } else {
7859 0 : struct iovec md_iov = {
7860 0 : .iov_base = bdev_io->u.bdev.md_buf,
7861 0 : .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
7862 : };
7863 :
7864 0 : rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7865 0 : &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7866 : }
7867 :
7868 0 : if (rc != 0) {
7869 0 : SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
7870 : err_blk.err_type, err_blk.err_offset);
7871 0 : } else {
7872 0 : SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
7873 : }
7874 0 : }
7875 :
7876 : static void
7877 0 : bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7878 : {
7879 0 : struct nvme_bdev_io *bio = ref;
7880 :
7881 0 : if (spdk_nvme_cpl_is_success(cpl)) {
7882 : /* Run PI verification for read data buffer. */
7883 0 : bdev_nvme_verify_pi_error(bio);
7884 0 : }
7885 :
7886 : /* Return original completion status */
7887 0 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7888 0 : }
7889 :
7890 : static void
7891 3 : bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7892 : {
7893 3 : struct nvme_bdev_io *bio = ref;
7894 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7895 : int ret;
7896 :
7897 3 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7898 0 : SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
7899 : cpl->status.sct, cpl->status.sc);
7900 :
7901 : /* Save completion status to use after verifying PI error. */
7902 0 : bio->cpl = *cpl;
7903 :
7904 0 : if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
7905 : /* Read without PI checking to verify PI error. */
7906 0 : ret = bdev_nvme_no_pi_readv(bio,
7907 0 : bdev_io->u.bdev.iovs,
7908 0 : bdev_io->u.bdev.iovcnt,
7909 0 : bdev_io->u.bdev.md_buf,
7910 0 : bdev_io->u.bdev.num_blocks,
7911 0 : bdev_io->u.bdev.offset_blocks);
7912 0 : if (ret == 0) {
7913 0 : return;
7914 : }
7915 0 : }
7916 0 : }
7917 :
7918 3 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7919 3 : }
7920 :
7921 : static void
7922 25 : bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7923 : {
7924 25 : struct nvme_bdev_io *bio = ref;
7925 :
7926 25 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7927 0 : SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
7928 : cpl->status.sct, cpl->status.sc);
7929 : /* Run PI verification for write data buffer if PI error is detected. */
7930 0 : bdev_nvme_verify_pi_error(bio);
7931 0 : }
7932 :
7933 25 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7934 25 : }
7935 :
7936 : static void
7937 0 : bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7938 : {
7939 0 : struct nvme_bdev_io *bio = ref;
7940 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7941 :
7942 : /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
7943 : * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
7944 : */
7945 0 : bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
7946 :
7947 0 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7948 0 : SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
7949 : cpl->status.sct, cpl->status.sc);
7950 : /* Run PI verification for zone append data buffer if PI error is detected. */
7951 0 : bdev_nvme_verify_pi_error(bio);
7952 0 : }
7953 :
7954 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7955 0 : }
7956 :
7957 : static void
7958 1 : bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7959 : {
7960 1 : struct nvme_bdev_io *bio = ref;
7961 :
7962 1 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7963 0 : SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
7964 : cpl->status.sct, cpl->status.sc);
7965 : /* Run PI verification for compare data buffer if PI error is detected. */
7966 0 : bdev_nvme_verify_pi_error(bio);
7967 0 : }
7968 :
7969 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7970 1 : }
7971 :
7972 : static void
7973 4 : bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7974 : {
7975 4 : struct nvme_bdev_io *bio = ref;
7976 :
7977 : /* Compare operation completion */
7978 4 : if (!bio->first_fused_completed) {
7979 : /* Save compare result for write callback */
7980 2 : bio->cpl = *cpl;
7981 2 : bio->first_fused_completed = true;
7982 2 : return;
7983 : }
7984 :
7985 : /* Write operation completion */
7986 2 : if (spdk_nvme_cpl_is_error(&bio->cpl)) {
7987 : /* If bio->cpl is already an error, it means the compare operation failed. In that case,
7988 : * complete the IO with the compare operation's status.
7989 : */
7990 1 : if (!spdk_nvme_cpl_is_error(cpl)) {
7991 1 : SPDK_ERRLOG("Unexpected write success after compare failure.\n");
7992 1 : }
7993 :
7994 1 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7995 1 : } else {
7996 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7997 : }
7998 4 : }
7999 :
/*
 * Generic completion callback: complete the I/O identified by ref with the
 * NVMe status carried in cpl.
 */
static void
bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_io *nbdev_io = ref;

	bdev_nvme_io_complete_nvme_status(nbdev_io, cpl);
}
8007 :
8008 : static int
8009 0 : fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
8010 : {
8011 0 : switch (desc->zt) {
8012 : case SPDK_NVME_ZONE_TYPE_SEQWR:
8013 0 : info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
8014 0 : break;
8015 : default:
8016 0 : SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt);
8017 0 : return -EIO;
8018 : }
8019 :
8020 0 : switch (desc->zs) {
8021 : case SPDK_NVME_ZONE_STATE_EMPTY:
8022 0 : info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
8023 0 : break;
8024 : case SPDK_NVME_ZONE_STATE_IOPEN:
8025 0 : info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
8026 0 : break;
8027 : case SPDK_NVME_ZONE_STATE_EOPEN:
8028 0 : info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
8029 0 : break;
8030 : case SPDK_NVME_ZONE_STATE_CLOSED:
8031 0 : info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
8032 0 : break;
8033 : case SPDK_NVME_ZONE_STATE_RONLY:
8034 0 : info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
8035 0 : break;
8036 : case SPDK_NVME_ZONE_STATE_FULL:
8037 0 : info->state = SPDK_BDEV_ZONE_STATE_FULL;
8038 0 : break;
8039 : case SPDK_NVME_ZONE_STATE_OFFLINE:
8040 0 : info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
8041 0 : break;
8042 : default:
8043 0 : SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
8044 0 : return -EIO;
8045 : }
8046 :
8047 0 : info->zone_id = desc->zslba;
8048 0 : info->write_pointer = desc->wp;
8049 0 : info->capacity = desc->zcap;
8050 :
8051 0 : return 0;
8052 0 : }
8053 :
8054 : static void
8055 0 : bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
8056 : {
8057 0 : struct nvme_bdev_io *bio = ref;
8058 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8059 0 : uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
8060 0 : uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
8061 0 : struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
8062 : uint64_t max_zones_per_buf, i;
8063 : uint32_t zone_report_bufsize;
8064 : struct spdk_nvme_ns *ns;
8065 : struct spdk_nvme_qpair *qpair;
8066 : int ret;
8067 :
8068 0 : if (spdk_nvme_cpl_is_error(cpl)) {
8069 0 : goto out_complete_io_nvme_cpl;
8070 : }
8071 :
8072 0 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
8073 0 : ret = -ENXIO;
8074 0 : goto out_complete_io_ret;
8075 : }
8076 :
8077 0 : ns = bio->io_path->nvme_ns->ns;
8078 0 : qpair = bio->io_path->qpair->qpair;
8079 :
8080 0 : zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
8081 0 : max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
8082 : sizeof(bio->zone_report_buf->descs[0]);
8083 :
8084 0 : if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
8085 0 : ret = -EINVAL;
8086 0 : goto out_complete_io_ret;
8087 : }
8088 :
8089 0 : if (!bio->zone_report_buf->nr_zones) {
8090 0 : ret = -EINVAL;
8091 0 : goto out_complete_io_ret;
8092 : }
8093 :
8094 0 : for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
8095 0 : ret = fill_zone_from_report(&info[bio->handled_zones],
8096 0 : &bio->zone_report_buf->descs[i]);
8097 0 : if (ret) {
8098 0 : goto out_complete_io_ret;
8099 : }
8100 0 : bio->handled_zones++;
8101 0 : }
8102 :
8103 0 : if (bio->handled_zones < zones_to_copy) {
8104 0 : uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
8105 0 : uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
8106 :
8107 0 : memset(bio->zone_report_buf, 0, zone_report_bufsize);
8108 0 : ret = spdk_nvme_zns_report_zones(ns, qpair,
8109 0 : bio->zone_report_buf, zone_report_bufsize,
8110 0 : slba, SPDK_NVME_ZRA_LIST_ALL, true,
8111 0 : bdev_nvme_get_zone_info_done, bio);
8112 0 : if (!ret) {
8113 0 : return;
8114 : } else {
8115 0 : goto out_complete_io_ret;
8116 : }
8117 : }
8118 :
8119 : out_complete_io_nvme_cpl:
8120 0 : free(bio->zone_report_buf);
8121 0 : bio->zone_report_buf = NULL;
8122 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
8123 0 : return;
8124 :
8125 : out_complete_io_ret:
8126 0 : free(bio->zone_report_buf);
8127 0 : bio->zone_report_buf = NULL;
8128 0 : bdev_nvme_io_complete(bio, ret);
8129 0 : }
8130 :
/*
 * Completion callback for zone management commands: complete the I/O
 * identified by ref with the NVMe status carried in cpl.
 */
static void
bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_io *nbdev_io = ref;

	bdev_nvme_io_complete_nvme_status(nbdev_io, cpl);
}
8138 :
8139 : static void
8140 4 : bdev_nvme_admin_passthru_complete_nvme_status(void *ctx)
8141 : {
8142 4 : struct nvme_bdev_io *bio = ctx;
8143 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8144 4 : const struct spdk_nvme_cpl *cpl = &bio->cpl;
8145 :
8146 4 : assert(bdev_nvme_io_type_is_admin(bdev_io->type));
8147 :
8148 4 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
8149 4 : }
8150 :
8151 : static void
8152 3 : bdev_nvme_abort_complete(void *ctx)
8153 : {
8154 3 : struct nvme_bdev_io *bio = ctx;
8155 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8156 :
8157 3 : if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
8158 3 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL);
8159 3 : } else {
8160 0 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL);
8161 : }
8162 3 : }
8163 :
8164 : static void
8165 3 : bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
8166 : {
8167 3 : struct nvme_bdev_io *bio = ref;
8168 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8169 :
8170 3 : bio->cpl = *cpl;
8171 3 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio);
8172 3 : }
8173 :
8174 : static void
8175 4 : bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
8176 : {
8177 4 : struct nvme_bdev_io *bio = ref;
8178 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8179 :
8180 4 : bio->cpl = *cpl;
8181 8 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
8182 4 : bdev_nvme_admin_passthru_complete_nvme_status, bio);
8183 4 : }
8184 :
8185 : static void
8186 0 : bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
8187 : {
8188 0 : struct nvme_bdev_io *bio = ref;
8189 : struct iovec *iov;
8190 :
8191 0 : bio->iov_offset = sgl_offset;
8192 0 : for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
8193 0 : iov = &bio->iovs[bio->iovpos];
8194 0 : if (bio->iov_offset < iov->iov_len) {
8195 0 : break;
8196 : }
8197 :
8198 0 : bio->iov_offset -= iov->iov_len;
8199 0 : }
8200 0 : }
8201 :
8202 : static int
8203 0 : bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
8204 : {
8205 0 : struct nvme_bdev_io *bio = ref;
8206 : struct iovec *iov;
8207 :
8208 0 : assert(bio->iovpos < bio->iovcnt);
8209 :
8210 0 : iov = &bio->iovs[bio->iovpos];
8211 :
8212 0 : *address = iov->iov_base;
8213 0 : *length = iov->iov_len;
8214 :
8215 0 : if (bio->iov_offset) {
8216 0 : assert(bio->iov_offset <= iov->iov_len);
8217 0 : *address += bio->iov_offset;
8218 0 : *length -= bio->iov_offset;
8219 0 : }
8220 :
8221 0 : bio->iov_offset += *length;
8222 0 : if (bio->iov_offset == iov->iov_len) {
8223 0 : bio->iovpos++;
8224 0 : bio->iov_offset = 0;
8225 0 : }
8226 :
8227 0 : return 0;
8228 : }
8229 :
8230 : static void
8231 0 : bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
8232 : {
8233 0 : struct nvme_bdev_io *bio = ref;
8234 : struct iovec *iov;
8235 :
8236 0 : bio->fused_iov_offset = sgl_offset;
8237 0 : for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
8238 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
8239 0 : if (bio->fused_iov_offset < iov->iov_len) {
8240 0 : break;
8241 : }
8242 :
8243 0 : bio->fused_iov_offset -= iov->iov_len;
8244 0 : }
8245 0 : }
8246 :
8247 : static int
8248 0 : bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
8249 : {
8250 0 : struct nvme_bdev_io *bio = ref;
8251 : struct iovec *iov;
8252 :
8253 0 : assert(bio->fused_iovpos < bio->fused_iovcnt);
8254 :
8255 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
8256 :
8257 0 : *address = iov->iov_base;
8258 0 : *length = iov->iov_len;
8259 :
8260 0 : if (bio->fused_iov_offset) {
8261 0 : assert(bio->fused_iov_offset <= iov->iov_len);
8262 0 : *address += bio->fused_iov_offset;
8263 0 : *length -= bio->fused_iov_offset;
8264 0 : }
8265 :
8266 0 : bio->fused_iov_offset += *length;
8267 0 : if (bio->fused_iov_offset == iov->iov_len) {
8268 0 : bio->fused_iovpos++;
8269 0 : bio->fused_iov_offset = 0;
8270 0 : }
8271 :
8272 0 : return 0;
8273 : }
8274 :
8275 : static int
8276 0 : bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8277 : void *md, uint64_t lba_count, uint64_t lba)
8278 : {
8279 : int rc;
8280 :
8281 0 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
8282 : lba_count, lba);
8283 :
8284 0 : bio->iovs = iov;
8285 0 : bio->iovcnt = iovcnt;
8286 0 : bio->iovpos = 0;
8287 0 : bio->iov_offset = 0;
8288 :
8289 0 : rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
8290 0 : bio->io_path->qpair->qpair,
8291 0 : lba, lba_count,
8292 0 : bdev_nvme_no_pi_readv_done, bio, 0,
8293 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8294 0 : md, 0, 0);
8295 :
8296 0 : if (rc != 0 && rc != -ENOMEM) {
8297 0 : SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
8298 0 : }
8299 0 : return rc;
8300 : }
8301 :
8302 : static int
8303 3 : bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8304 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
8305 : struct spdk_memory_domain *domain, void *domain_ctx,
8306 : struct spdk_accel_sequence *seq)
8307 : {
8308 3 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8309 3 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8310 : int rc;
8311 :
8312 3 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8313 : lba_count, lba);
8314 :
8315 3 : bio->iovs = iov;
8316 3 : bio->iovcnt = iovcnt;
8317 3 : bio->iovpos = 0;
8318 3 : bio->iov_offset = 0;
8319 :
8320 3 : if (domain != NULL || seq != NULL) {
8321 1 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
8322 1 : bio->ext_opts.memory_domain = domain;
8323 1 : bio->ext_opts.memory_domain_ctx = domain_ctx;
8324 1 : bio->ext_opts.io_flags = flags;
8325 1 : bio->ext_opts.metadata = md;
8326 1 : bio->ext_opts.accel_sequence = seq;
8327 :
8328 1 : if (iovcnt == 1) {
8329 2 : rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done,
8330 1 : bio, &bio->ext_opts);
8331 1 : } else {
8332 0 : rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
8333 0 : bdev_nvme_readv_done, bio,
8334 : bdev_nvme_queued_reset_sgl,
8335 : bdev_nvme_queued_next_sge,
8336 0 : &bio->ext_opts);
8337 : }
8338 3 : } else if (iovcnt == 1) {
8339 4 : rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base,
8340 2 : md, lba, lba_count, bdev_nvme_readv_done,
8341 2 : bio, flags, 0, 0);
8342 2 : } else {
8343 0 : rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
8344 0 : bdev_nvme_readv_done, bio, flags,
8345 : bdev_nvme_queued_reset_sgl,
8346 0 : bdev_nvme_queued_next_sge, md, 0, 0);
8347 : }
8348 :
8349 3 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
8350 0 : SPDK_ERRLOG("readv failed: rc = %d\n", rc);
8351 0 : }
8352 3 : return rc;
8353 : }
8354 :
8355 : static int
8356 25 : bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8357 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
8358 : struct spdk_memory_domain *domain, void *domain_ctx,
8359 : struct spdk_accel_sequence *seq,
8360 : union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13)
8361 : {
8362 25 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8363 25 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8364 : int rc;
8365 :
8366 25 : SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8367 : lba_count, lba);
8368 :
8369 25 : bio->iovs = iov;
8370 25 : bio->iovcnt = iovcnt;
8371 25 : bio->iovpos = 0;
8372 25 : bio->iov_offset = 0;
8373 :
8374 25 : if (domain != NULL || seq != NULL) {
8375 0 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
8376 0 : bio->ext_opts.memory_domain = domain;
8377 0 : bio->ext_opts.memory_domain_ctx = domain_ctx;
8378 0 : bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype);
8379 0 : bio->ext_opts.cdw13 = cdw13.raw;
8380 0 : bio->ext_opts.metadata = md;
8381 0 : bio->ext_opts.accel_sequence = seq;
8382 :
8383 0 : if (iovcnt == 1) {
8384 0 : rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done,
8385 0 : bio, &bio->ext_opts);
8386 0 : } else {
8387 0 : rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
8388 0 : bdev_nvme_writev_done, bio,
8389 : bdev_nvme_queued_reset_sgl,
8390 : bdev_nvme_queued_next_sge,
8391 0 : &bio->ext_opts);
8392 : }
8393 25 : } else if (iovcnt == 1) {
8394 50 : rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base,
8395 25 : md, lba, lba_count, bdev_nvme_writev_done,
8396 25 : bio, flags, 0, 0);
8397 25 : } else {
8398 0 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8399 0 : bdev_nvme_writev_done, bio, flags,
8400 : bdev_nvme_queued_reset_sgl,
8401 0 : bdev_nvme_queued_next_sge, md, 0, 0);
8402 : }
8403 :
8404 25 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
8405 0 : SPDK_ERRLOG("writev failed: rc = %d\n", rc);
8406 0 : }
8407 25 : return rc;
8408 : }
8409 :
8410 : static int
8411 0 : bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8412 : void *md, uint64_t lba_count, uint64_t zslba,
8413 : uint32_t flags)
8414 : {
8415 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8416 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8417 : int rc;
8418 :
8419 0 : SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
8420 : lba_count, zslba);
8421 :
8422 0 : bio->iovs = iov;
8423 0 : bio->iovcnt = iovcnt;
8424 0 : bio->iovpos = 0;
8425 0 : bio->iov_offset = 0;
8426 :
8427 0 : if (iovcnt == 1) {
8428 0 : rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
8429 0 : lba_count,
8430 0 : bdev_nvme_zone_appendv_done, bio,
8431 0 : flags,
8432 : 0, 0);
8433 0 : } else {
8434 0 : rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
8435 0 : bdev_nvme_zone_appendv_done, bio, flags,
8436 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8437 0 : md, 0, 0);
8438 : }
8439 :
8440 0 : if (rc != 0 && rc != -ENOMEM) {
8441 0 : SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
8442 0 : }
8443 0 : return rc;
8444 : }
8445 :
8446 : static int
8447 1 : bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8448 : void *md, uint64_t lba_count, uint64_t lba,
8449 : uint32_t flags)
8450 : {
8451 : int rc;
8452 :
8453 1 : SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8454 : lba_count, lba);
8455 :
8456 1 : bio->iovs = iov;
8457 1 : bio->iovcnt = iovcnt;
8458 1 : bio->iovpos = 0;
8459 1 : bio->iov_offset = 0;
8460 :
8461 2 : rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
8462 1 : bio->io_path->qpair->qpair,
8463 1 : lba, lba_count,
8464 1 : bdev_nvme_comparev_done, bio, flags,
8465 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8466 1 : md, 0, 0);
8467 :
8468 1 : if (rc != 0 && rc != -ENOMEM) {
8469 0 : SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
8470 0 : }
8471 1 : return rc;
8472 : }
8473 :
8474 : static int
8475 2 : bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
8476 : struct iovec *write_iov, int write_iovcnt,
8477 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
8478 : {
8479 2 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8480 2 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8481 2 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8482 : int rc;
8483 :
8484 2 : SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8485 : lba_count, lba);
8486 :
8487 2 : bio->iovs = cmp_iov;
8488 2 : bio->iovcnt = cmp_iovcnt;
8489 2 : bio->iovpos = 0;
8490 2 : bio->iov_offset = 0;
8491 2 : bio->fused_iovs = write_iov;
8492 2 : bio->fused_iovcnt = write_iovcnt;
8493 2 : bio->fused_iovpos = 0;
8494 2 : bio->fused_iov_offset = 0;
8495 :
8496 2 : if (bdev_io->num_retries == 0) {
8497 2 : bio->first_fused_submitted = false;
8498 2 : bio->first_fused_completed = false;
8499 2 : }
8500 :
8501 2 : if (!bio->first_fused_submitted) {
8502 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8503 2 : memset(&bio->cpl, 0, sizeof(bio->cpl));
8504 :
8505 4 : rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
8506 2 : bdev_nvme_comparev_and_writev_done, bio, flags,
8507 2 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
8508 2 : if (rc == 0) {
8509 2 : bio->first_fused_submitted = true;
8510 2 : flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8511 2 : } else {
8512 0 : if (rc != -ENOMEM) {
8513 0 : SPDK_ERRLOG("compare failed: rc = %d\n", rc);
8514 0 : }
8515 0 : return rc;
8516 : }
8517 2 : }
8518 :
8519 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
8520 :
8521 4 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8522 2 : bdev_nvme_comparev_and_writev_done, bio, flags,
8523 2 : bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
8524 2 : if (rc != 0 && rc != -ENOMEM) {
8525 0 : SPDK_ERRLOG("write failed: rc = %d\n", rc);
8526 0 : rc = 0;
8527 0 : }
8528 :
8529 2 : return rc;
8530 2 : }
8531 :
8532 : static int
8533 1 : bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8534 : {
8535 : struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
8536 : struct spdk_nvme_dsm_range *range;
8537 : uint64_t offset, remaining;
8538 : uint64_t num_ranges_u64;
8539 : uint16_t num_ranges;
8540 : int rc;
8541 :
8542 1 : num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
8543 : SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8544 1 : if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
8545 0 : SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
8546 0 : return -EINVAL;
8547 : }
8548 1 : num_ranges = (uint16_t)num_ranges_u64;
8549 :
8550 1 : offset = offset_blocks;
8551 1 : remaining = num_blocks;
8552 1 : range = &dsm_ranges[0];
8553 :
8554 : /* Fill max-size ranges until the remaining blocks fit into one range */
8555 1 : while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
8556 0 : range->attributes.raw = 0;
8557 0 : range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8558 0 : range->starting_lba = offset;
8559 :
8560 0 : offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8561 0 : remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8562 0 : range++;
8563 : }
8564 :
8565 : /* Final range describes the remaining blocks */
8566 1 : range->attributes.raw = 0;
8567 1 : range->length = remaining;
8568 1 : range->starting_lba = offset;
8569 :
8570 2 : rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
8571 1 : bio->io_path->qpair->qpair,
8572 : SPDK_NVME_DSM_ATTR_DEALLOCATE,
8573 1 : dsm_ranges, num_ranges,
8574 1 : bdev_nvme_queued_done, bio);
8575 :
8576 1 : return rc;
8577 1 : }
8578 :
8579 : static int
8580 0 : bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8581 : {
8582 0 : if (num_blocks > UINT16_MAX + 1) {
8583 0 : SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
8584 0 : return -EINVAL;
8585 : }
8586 :
8587 0 : return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
8588 0 : bio->io_path->qpair->qpair,
8589 0 : offset_blocks, num_blocks,
8590 0 : bdev_nvme_queued_done, bio,
8591 : 0);
8592 0 : }
8593 :
8594 : static int
8595 0 : bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
8596 : struct spdk_bdev_zone_info *info)
8597 : {
8598 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8599 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8600 0 : uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
8601 0 : uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
8602 0 : uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
8603 :
8604 0 : if (zone_id % zone_size != 0) {
8605 0 : return -EINVAL;
8606 : }
8607 :
8608 0 : if (num_zones > total_zones || !num_zones) {
8609 0 : return -EINVAL;
8610 : }
8611 :
8612 0 : assert(!bio->zone_report_buf);
8613 0 : bio->zone_report_buf = calloc(1, zone_report_bufsize);
8614 0 : if (!bio->zone_report_buf) {
8615 0 : return -ENOMEM;
8616 : }
8617 :
8618 0 : bio->handled_zones = 0;
8619 :
8620 0 : return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
8621 0 : zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
8622 0 : bdev_nvme_get_zone_info_done, bio);
8623 0 : }
8624 :
8625 : static int
8626 0 : bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
8627 : enum spdk_bdev_zone_action action)
8628 : {
8629 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8630 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8631 :
8632 0 : switch (action) {
8633 : case SPDK_BDEV_ZONE_CLOSE:
8634 0 : return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
8635 0 : bdev_nvme_zone_management_done, bio);
8636 : case SPDK_BDEV_ZONE_FINISH:
8637 0 : return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
8638 0 : bdev_nvme_zone_management_done, bio);
8639 : case SPDK_BDEV_ZONE_OPEN:
8640 0 : return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
8641 0 : bdev_nvme_zone_management_done, bio);
8642 : case SPDK_BDEV_ZONE_RESET:
8643 0 : return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
8644 0 : bdev_nvme_zone_management_done, bio);
8645 : case SPDK_BDEV_ZONE_OFFLINE:
8646 0 : return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
8647 0 : bdev_nvme_zone_management_done, bio);
8648 : default:
8649 0 : return -EINVAL;
8650 : }
8651 0 : }
8652 :
8653 : static void
8654 5 : bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8655 : struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
8656 : {
8657 : struct nvme_io_path *io_path;
8658 : struct nvme_ctrlr *nvme_ctrlr;
8659 : uint32_t max_xfer_size;
8660 5 : int rc = -ENXIO;
8661 :
8662 : /* Choose the first ctrlr which is not failed. */
8663 8 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8664 7 : nvme_ctrlr = io_path->qpair->ctrlr;
8665 :
8666 : /* We should skip any unavailable nvme_ctrlr rather than checking
8667 : * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
8668 : */
8669 7 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
8670 3 : continue;
8671 : }
8672 :
8673 4 : max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
8674 :
8675 4 : if (nbytes > max_xfer_size) {
8676 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8677 0 : rc = -EINVAL;
8678 0 : goto err;
8679 : }
8680 :
8681 8 : rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
8682 4 : bdev_nvme_admin_passthru_done, bio);
8683 4 : if (rc == 0) {
8684 4 : return;
8685 : }
8686 1 : }
8687 :
8688 : err:
8689 1 : bdev_nvme_admin_complete(bio, rc);
8690 5 : }
8691 :
8692 : static int
8693 0 : bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8694 : void *buf, size_t nbytes)
8695 : {
8696 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8697 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8698 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8699 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8700 :
8701 0 : if (nbytes > max_xfer_size) {
8702 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8703 0 : return -EINVAL;
8704 : }
8705 :
8706 : /*
8707 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8708 : * so fill it out automatically.
8709 : */
8710 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8711 :
8712 0 : return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
8713 0 : (uint32_t)nbytes, bdev_nvme_queued_done, bio);
8714 0 : }
8715 :
8716 : static int
8717 0 : bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8718 : void *buf, size_t nbytes, void *md_buf, size_t md_len)
8719 : {
8720 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8721 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8722 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8723 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8724 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8725 :
8726 0 : if (nbytes > max_xfer_size) {
8727 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8728 0 : return -EINVAL;
8729 : }
8730 :
8731 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8732 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8733 0 : return -EINVAL;
8734 : }
8735 :
8736 : /*
8737 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8738 : * so fill it out automatically.
8739 : */
8740 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8741 :
8742 0 : return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
8743 0 : (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
8744 0 : }
8745 :
8746 : static int
8747 0 : bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio,
8748 : struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt,
8749 : size_t nbytes, void *md_buf, size_t md_len)
8750 : {
8751 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8752 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8753 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8754 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8755 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8756 :
8757 0 : bio->iovs = iov;
8758 0 : bio->iovcnt = iovcnt;
8759 0 : bio->iovpos = 0;
8760 0 : bio->iov_offset = 0;
8761 :
8762 0 : if (nbytes > max_xfer_size) {
8763 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8764 0 : return -EINVAL;
8765 : }
8766 :
8767 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8768 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8769 0 : return -EINVAL;
8770 : }
8771 :
8772 : /*
8773 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands
8774 : * require a nsid, so fill it out automatically.
8775 : */
8776 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8777 :
8778 0 : return spdk_nvme_ctrlr_cmd_iov_raw_with_md(
8779 0 : ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio,
8780 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
8781 0 : }
8782 :
8783 : static void
8784 6 : bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8785 : struct nvme_bdev_io *bio_to_abort)
8786 : {
8787 : struct nvme_io_path *io_path;
8788 6 : int rc = 0;
8789 :
8790 6 : rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
8791 6 : if (rc == 0) {
8792 1 : bdev_nvme_admin_complete(bio, 0);
8793 1 : return;
8794 : }
8795 :
8796 5 : io_path = bio_to_abort->io_path;
8797 5 : if (io_path != NULL) {
8798 6 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8799 3 : io_path->qpair->qpair,
8800 3 : bio_to_abort,
8801 3 : bdev_nvme_abort_done, bio);
8802 3 : } else {
8803 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8804 4 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8805 : NULL,
8806 2 : bio_to_abort,
8807 2 : bdev_nvme_abort_done, bio);
8808 :
8809 2 : if (rc != -ENOENT) {
8810 1 : break;
8811 : }
8812 1 : }
8813 : }
8814 :
8815 5 : if (rc != 0) {
8816 : /* If no command was found or there was any error, complete the abort
8817 : * request with failure.
8818 : */
8819 2 : bdev_nvme_admin_complete(bio, rc);
8820 2 : }
8821 6 : }
8822 :
8823 : static int
8824 0 : bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
8825 : uint64_t num_blocks)
8826 : {
8827 0 : struct spdk_nvme_scc_source_range range = {
8828 0 : .slba = src_offset_blocks,
8829 0 : .nlb = num_blocks - 1
8830 : };
8831 :
8832 0 : return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
8833 0 : bio->io_path->qpair->qpair,
8834 0 : &range, 1, dst_offset_blocks,
8835 0 : bdev_nvme_queued_done, bio);
8836 : }
8837 :
8838 : static void
8839 0 : bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
8840 : {
8841 : const char *action;
8842 : uint32_t i;
8843 :
8844 0 : if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
8845 0 : action = "reset";
8846 0 : } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
8847 0 : action = "abort";
8848 0 : } else {
8849 0 : action = "none";
8850 : }
8851 :
8852 0 : spdk_json_write_object_begin(w);
8853 :
8854 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
8855 :
8856 0 : spdk_json_write_named_object_begin(w, "params");
8857 0 : spdk_json_write_named_string(w, "action_on_timeout", action);
8858 0 : spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
8859 0 : spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
8860 0 : spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
8861 0 : spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
8862 0 : spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
8863 0 : spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
8864 0 : spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
8865 0 : spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
8866 0 : spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
8867 0 : spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
8868 0 : spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
8869 0 : spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
8870 0 : spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
8871 0 : spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
8872 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
8873 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
8874 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
8875 0 : spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback);
8876 0 : spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
8877 0 : spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
8878 0 : spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat);
8879 0 : spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size);
8880 0 : spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
8881 0 : spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
8882 0 : spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size);
8883 0 : spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms);
8884 0 : spdk_json_write_named_array_begin(w, "dhchap_digests");
8885 0 : for (i = 0; i < 32; ++i) {
8886 0 : if (g_opts.dhchap_digests & SPDK_BIT(i)) {
8887 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i));
8888 0 : }
8889 0 : }
8890 0 : spdk_json_write_array_end(w);
8891 0 : spdk_json_write_named_array_begin(w, "dhchap_dhgroups");
8892 0 : for (i = 0; i < 32; ++i) {
8893 0 : if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) {
8894 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i));
8895 0 : }
8896 0 : }
8897 :
8898 0 : spdk_json_write_array_end(w);
8899 0 : spdk_json_write_named_bool(w, "rdma_umr_per_io", g_opts.rdma_umr_per_io);
8900 0 : spdk_json_write_object_end(w);
8901 :
8902 0 : spdk_json_write_object_end(w);
8903 0 : }
8904 :
8905 : static void
8906 0 : bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
8907 : {
8908 : struct spdk_nvme_transport_id trid;
8909 :
8910 0 : spdk_json_write_object_begin(w);
8911 :
8912 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");
8913 :
8914 0 : spdk_json_write_named_object_begin(w, "params");
8915 0 : spdk_json_write_named_string(w, "name", ctx->name);
8916 0 : spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);
8917 :
8918 0 : trid = ctx->trid;
8919 0 : memset(trid.subnqn, 0, sizeof(trid.subnqn));
8920 0 : nvme_bdev_dump_trid_json(&trid, w);
8921 :
8922 0 : spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
8923 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
8924 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
8925 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8926 0 : ctx->bdev_opts.fast_io_fail_timeout_sec);
8927 0 : spdk_json_write_object_end(w);
8928 :
8929 0 : spdk_json_write_object_end(w);
8930 0 : }
8931 :
#ifdef SPDK_CONFIG_NVME_CUSE
/*
 * Write a "bdev_nvme_cuse_register" RPC object to w for nvme_ctrlr.
 *
 * Nothing is written when spdk_nvme_cuse_get_ctrlr_name() fails for this
 * controller.
 */
static void
nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
			    struct nvme_ctrlr *nvme_ctrlr)
{
	char cuse_name[128];
	size_t cuse_name_size = sizeof(cuse_name);
	int rc;

	/* The name itself is not emitted; the lookup only serves as a check. */
	rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size);
	if (rc != 0) {
		return;
	}

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	spdk_json_write_object_end(w);	/* params */

	spdk_json_write_object_end(w);
}
#endif
8956 :
8957 : static void
8958 0 : nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
8959 : struct nvme_ctrlr *nvme_ctrlr,
8960 : struct nvme_path_id *path_id)
8961 : {
8962 : struct spdk_nvme_transport_id *trid;
8963 : const struct spdk_nvme_ctrlr_opts *opts;
8964 :
8965 0 : if (nvme_ctrlr->opts.from_discovery_service) {
8966 : /* Do not emit an RPC for this - it will be implicitly
8967 : * covered by a separate bdev_nvme_start_discovery or
8968 : * bdev_nvme_start_mdns_discovery RPC.
8969 : */
8970 0 : return;
8971 : }
8972 :
8973 0 : trid = &path_id->trid;
8974 :
8975 0 : spdk_json_write_object_begin(w);
8976 :
8977 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
8978 :
8979 0 : spdk_json_write_named_object_begin(w, "params");
8980 0 : spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
8981 0 : nvme_bdev_dump_trid_json(trid, w);
8982 0 : spdk_json_write_named_bool(w, "prchk_reftag",
8983 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
8984 0 : spdk_json_write_named_bool(w, "prchk_guard",
8985 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
8986 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
8987 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
8988 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8989 0 : nvme_ctrlr->opts.fast_io_fail_timeout_sec);
8990 0 : if (nvme_ctrlr->psk != NULL) {
8991 0 : spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk));
8992 0 : }
8993 0 : if (nvme_ctrlr->dhchap_key != NULL) {
8994 0 : spdk_json_write_named_string(w, "dhchap_key",
8995 0 : spdk_key_get_name(nvme_ctrlr->dhchap_key));
8996 0 : }
8997 0 : if (nvme_ctrlr->dhchap_ctrlr_key != NULL) {
8998 0 : spdk_json_write_named_string(w, "dhchap_ctrlr_key",
8999 0 : spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key));
9000 0 : }
9001 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
9002 0 : spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
9003 0 : spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
9004 0 : spdk_json_write_named_bool(w, "ddgst", opts->data_digest);
9005 0 : if (opts->src_addr[0] != '\0') {
9006 0 : spdk_json_write_named_string(w, "hostaddr", opts->src_addr);
9007 0 : }
9008 0 : if (opts->src_svcid[0] != '\0') {
9009 0 : spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid);
9010 0 : }
9011 :
9012 0 : if (nvme_ctrlr->opts.multipath) {
9013 0 : spdk_json_write_named_string(w, "multipath", "multipath");
9014 0 : }
9015 0 : spdk_json_write_object_end(w);
9016 :
9017 0 : spdk_json_write_object_end(w);
9018 0 : }
9019 :
9020 : static void
9021 0 : bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
9022 : {
9023 0 : spdk_json_write_object_begin(w);
9024 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
9025 :
9026 0 : spdk_json_write_named_object_begin(w, "params");
9027 0 : spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
9028 0 : spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
9029 0 : spdk_json_write_object_end(w);
9030 :
9031 0 : spdk_json_write_object_end(w);
9032 0 : }
9033 :
9034 : static int
9035 0 : bdev_nvme_config_json(struct spdk_json_write_ctx *w)
9036 : {
9037 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
9038 : struct nvme_ctrlr *nvme_ctrlr;
9039 : struct discovery_ctx *ctx;
9040 : struct nvme_path_id *path_id;
9041 :
9042 0 : bdev_nvme_opts_config_json(w);
9043 :
9044 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9045 :
9046 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
9047 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
9048 0 : path_id = nvme_ctrlr->active_path_id;
9049 0 : assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids));
9050 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
9051 :
9052 0 : path_id = TAILQ_NEXT(path_id, link);
9053 0 : while (path_id != NULL) {
9054 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
9055 0 : path_id = TAILQ_NEXT(path_id, link);
9056 : }
9057 :
9058 : #ifdef SPDK_CONFIG_NVME_CUSE
9059 : nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
9060 : #endif
9061 0 : }
9062 0 : }
9063 :
9064 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
9065 0 : if (!ctx->from_mdns_discovery_service) {
9066 0 : bdev_nvme_discovery_config_json(w, ctx);
9067 0 : }
9068 0 : }
9069 :
9070 0 : bdev_nvme_mdns_discovery_config_json(w);
9071 :
9072 : /* Dump as last parameter to give all NVMe bdevs chance to be constructed
9073 : * before enabling hotplug poller.
9074 : */
9075 0 : bdev_nvme_hotplug_config_json(w);
9076 :
9077 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9078 0 : return 0;
9079 : }
9080 :
9081 : struct spdk_nvme_ctrlr *
9082 1 : bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
9083 : {
9084 : struct nvme_bdev *nbdev;
9085 : struct nvme_ns *nvme_ns;
9086 :
9087 1 : if (!bdev || bdev->module != &nvme_if) {
9088 0 : return NULL;
9089 : }
9090 :
9091 1 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
9092 1 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
9093 1 : assert(nvme_ns != NULL);
9094 :
9095 1 : return nvme_ns->ctrlr->ctrlr;
9096 1 : }
9097 :
9098 : static bool
9099 12 : nvme_io_path_is_current(struct nvme_io_path *io_path)
9100 : {
9101 : const struct nvme_bdev_channel *nbdev_ch;
9102 : bool current;
9103 :
9104 12 : if (!nvme_io_path_is_available(io_path)) {
9105 4 : return false;
9106 : }
9107 :
9108 8 : nbdev_ch = io_path->nbdev_ch;
9109 8 : if (nbdev_ch == NULL) {
9110 1 : current = false;
9111 8 : } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
9112 3 : struct nvme_io_path *optimized_io_path = NULL;
9113 :
9114 6 : STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) {
9115 5 : if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
9116 2 : break;
9117 : }
9118 3 : }
9119 :
9120 : /* A non-optimized path is only current if there are no optimized paths. */
9121 3 : current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) ||
9122 2 : (optimized_io_path == NULL);
9123 3 : } else {
9124 4 : if (nbdev_ch->current_io_path) {
9125 1 : current = (io_path == nbdev_ch->current_io_path);
9126 1 : } else {
9127 : struct nvme_io_path *first_path;
9128 :
9129 : /* We arrived here as there are no optimized paths for active-passive
9130 : * mode. Check if this io_path is the first one available on the list.
9131 : */
9132 3 : current = false;
9133 3 : STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) {
9134 3 : if (nvme_io_path_is_available(first_path)) {
9135 3 : current = (io_path == first_path);
9136 3 : break;
9137 : }
9138 0 : }
9139 : }
9140 : }
9141 :
9142 8 : return current;
9143 12 : }
9144 :
9145 : static struct nvme_ctrlr *
9146 0 : bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev)
9147 : {
9148 : struct nvme_ctrlr *next;
9149 :
9150 : /* Must be called under g_bdev_nvme_mutex */
9151 0 : next = prev != NULL ? TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
9152 0 : while (next != NULL) {
9153 : /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */
9154 0 : pthread_mutex_lock(&next->mutex);
9155 0 : if (next->ref > 0) {
9156 0 : next->ref++;
9157 0 : pthread_mutex_unlock(&next->mutex);
9158 0 : return next;
9159 : }
9160 :
9161 0 : pthread_mutex_unlock(&next->mutex);
9162 0 : next = TAILQ_NEXT(next, tailq);
9163 : }
9164 :
9165 0 : return NULL;
9166 0 : }
9167 :
9168 : struct bdev_nvme_set_keys_ctx {
9169 : struct nvme_ctrlr *nctrlr;
9170 : struct spdk_key *dhchap_key;
9171 : struct spdk_key *dhchap_ctrlr_key;
9172 : struct spdk_thread *thread;
9173 : bdev_nvme_set_keys_cb cb_fn;
9174 : void *cb_ctx;
9175 : int status;
9176 : };
9177 :
9178 : static void
9179 0 : bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx)
9180 : {
9181 0 : if (ctx == NULL) {
9182 0 : return;
9183 : }
9184 :
9185 0 : spdk_keyring_put_key(ctx->dhchap_key);
9186 0 : spdk_keyring_put_key(ctx->dhchap_ctrlr_key);
9187 0 : free(ctx);
9188 0 : }
9189 :
9190 : static void
9191 0 : _bdev_nvme_set_keys_done(void *_ctx)
9192 : {
9193 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
9194 :
9195 0 : ctx->cb_fn(ctx->cb_ctx, ctx->status);
9196 :
9197 0 : if (ctx->nctrlr != NULL) {
9198 0 : nvme_ctrlr_put_ref(ctx->nctrlr);
9199 0 : }
9200 0 : bdev_nvme_free_set_keys_ctx(ctx);
9201 0 : }
9202 :
9203 : static void
9204 0 : bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status)
9205 : {
9206 0 : ctx->status = status;
9207 0 : spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx);
9208 0 : }
9209 :
9210 : static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx);
9211 :
9212 : static void
9213 0 : bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx)
9214 : {
9215 : struct nvme_ctrlr *next;
9216 :
9217 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9218 0 : next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr);
9219 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9220 :
9221 0 : nvme_ctrlr_put_ref(ctx->nctrlr);
9222 0 : ctx->nctrlr = next;
9223 :
9224 0 : if (next == NULL) {
9225 0 : bdev_nvme_set_keys_done(ctx, 0);
9226 0 : } else {
9227 0 : bdev_nvme_authenticate_ctrlr(ctx);
9228 : }
9229 0 : }
9230 :
/*
 * Completion of the per-channel qpair authentication pass for one controller.
 * A non-zero status ends the whole set-keys operation with that status;
 * otherwise processing moves on to the next controller.
 */
static void
bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_keys_ctx *keys_ctx = spdk_io_channel_iter_get_ctx(i);

	if (status == 0) {
		bdev_nvme_authenticate_ctrlr_continue(keys_ctx);
	} else {
		bdev_nvme_set_keys_done(keys_ctx, status);
	}
}
9242 :
/*
 * Completion callback of spdk_nvme_qpair_authenticate(). ctx is the channel
 * iterator that was passed as callback argument; the iteration is resumed
 * with the authentication status.
 */
static void
bdev_nvme_authenticate_qpair_done(void *ctx, int status)
{
	struct spdk_io_channel_iter *iter = ctx;

	spdk_for_each_channel_continue(iter, status);
}
9248 :
9249 : static void
9250 0 : bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i)
9251 : {
9252 0 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
9253 0 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
9254 0 : struct nvme_qpair *qpair = ctrlr_ch->qpair;
9255 : int rc;
9256 :
9257 0 : if (!nvme_qpair_is_connected(qpair)) {
9258 0 : spdk_for_each_channel_continue(i, 0);
9259 0 : return;
9260 : }
9261 :
9262 0 : rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i);
9263 0 : if (rc != 0) {
9264 0 : spdk_for_each_channel_continue(i, rc);
9265 0 : }
9266 0 : }
9267 :
9268 : static void
9269 0 : bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status)
9270 : {
9271 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
9272 :
9273 0 : if (status != 0) {
9274 0 : bdev_nvme_set_keys_done(ctx, status);
9275 0 : return;
9276 : }
9277 :
9278 0 : spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx,
9279 : bdev_nvme_authenticate_qpairs_done);
9280 0 : }
9281 :
9282 : static void
9283 0 : bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx)
9284 : {
9285 0 : struct spdk_nvme_ctrlr_key_opts opts = {};
9286 0 : struct nvme_ctrlr *nctrlr = ctx->nctrlr;
9287 : int rc;
9288 :
9289 0 : opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key);
9290 0 : opts.dhchap_key = ctx->dhchap_key;
9291 0 : opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key;
9292 0 : rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts);
9293 0 : if (rc != 0) {
9294 0 : bdev_nvme_set_keys_done(ctx, rc);
9295 0 : return;
9296 : }
9297 :
9298 0 : if (ctx->dhchap_key != NULL) {
9299 0 : rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr,
9300 0 : bdev_nvme_authenticate_ctrlr_done, ctx);
9301 0 : if (rc != 0) {
9302 0 : bdev_nvme_set_keys_done(ctx, rc);
9303 0 : }
9304 0 : } else {
9305 0 : bdev_nvme_authenticate_ctrlr_continue(ctx);
9306 : }
9307 0 : }
9308 :
9309 : int
9310 0 : bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key,
9311 : bdev_nvme_set_keys_cb cb_fn, void *cb_ctx)
9312 : {
9313 : struct bdev_nvme_set_keys_ctx *ctx;
9314 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
9315 : struct nvme_ctrlr *nctrlr;
9316 :
9317 0 : ctx = calloc(1, sizeof(*ctx));
9318 0 : if (ctx == NULL) {
9319 0 : return -ENOMEM;
9320 : }
9321 :
9322 0 : if (dhchap_key != NULL) {
9323 0 : ctx->dhchap_key = spdk_keyring_get_key(dhchap_key);
9324 0 : if (ctx->dhchap_key == NULL) {
9325 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name);
9326 0 : bdev_nvme_free_set_keys_ctx(ctx);
9327 0 : return -ENOKEY;
9328 : }
9329 0 : }
9330 0 : if (dhchap_ctrlr_key != NULL) {
9331 0 : ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key);
9332 0 : if (ctx->dhchap_ctrlr_key == NULL) {
9333 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name);
9334 0 : bdev_nvme_free_set_keys_ctx(ctx);
9335 0 : return -ENOKEY;
9336 : }
9337 0 : }
9338 :
9339 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9340 0 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
9341 0 : if (nbdev_ctrlr == NULL) {
9342 0 : SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name);
9343 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9344 0 : bdev_nvme_free_set_keys_ctx(ctx);
9345 0 : return -ENODEV;
9346 : }
9347 0 : nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL);
9348 0 : if (nctrlr == NULL) {
9349 0 : SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name);
9350 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9351 0 : bdev_nvme_free_set_keys_ctx(ctx);
9352 0 : return -ENODEV;
9353 : }
9354 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9355 :
9356 0 : ctx->nctrlr = nctrlr;
9357 0 : ctx->cb_fn = cb_fn;
9358 0 : ctx->cb_ctx = cb_ctx;
9359 0 : ctx->thread = spdk_get_thread();
9360 :
9361 0 : bdev_nvme_authenticate_ctrlr(ctx);
9362 :
9363 0 : return 0;
9364 0 : }
9365 :
9366 : void
9367 0 : nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
9368 : {
9369 0 : struct nvme_ns *nvme_ns = io_path->nvme_ns;
9370 0 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
9371 : const struct spdk_nvme_ctrlr_data *cdata;
9372 : const struct spdk_nvme_transport_id *trid;
9373 : const char *adrfam_str;
9374 :
9375 0 : spdk_json_write_object_begin(w);
9376 :
9377 0 : spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);
9378 :
9379 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
9380 0 : trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);
9381 :
9382 0 : spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
9383 0 : spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path));
9384 0 : spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
9385 0 : spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));
9386 :
9387 0 : spdk_json_write_named_object_begin(w, "transport");
9388 0 : spdk_json_write_named_string(w, "trtype", trid->trstring);
9389 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
9390 0 : if (trid->trsvcid[0] != '\0') {
9391 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
9392 0 : }
9393 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
9394 0 : if (adrfam_str) {
9395 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
9396 0 : }
9397 0 : spdk_json_write_object_end(w);
9398 :
9399 0 : spdk_json_write_object_end(w);
9400 0 : }
9401 :
9402 : void
9403 0 : bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
9404 : {
9405 : struct discovery_ctx *ctx;
9406 : struct discovery_entry_ctx *entry_ctx;
9407 :
9408 0 : spdk_json_write_array_begin(w);
9409 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
9410 0 : spdk_json_write_object_begin(w);
9411 0 : spdk_json_write_named_string(w, "name", ctx->name);
9412 :
9413 0 : spdk_json_write_named_object_begin(w, "trid");
9414 0 : nvme_bdev_dump_trid_json(&ctx->trid, w);
9415 0 : spdk_json_write_object_end(w);
9416 :
9417 0 : spdk_json_write_named_array_begin(w, "referrals");
9418 0 : TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
9419 0 : spdk_json_write_object_begin(w);
9420 0 : spdk_json_write_named_object_begin(w, "trid");
9421 0 : nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
9422 0 : spdk_json_write_object_end(w);
9423 0 : spdk_json_write_object_end(w);
9424 0 : }
9425 0 : spdk_json_write_array_end(w);
9426 :
9427 0 : spdk_json_write_object_end(w);
9428 0 : }
9429 0 : spdk_json_write_array_end(w);
9430 0 : }
9431 :
9432 1 : SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
9433 :
9434 : static void
9435 0 : bdev_nvme_trace(void)
9436 : {
9437 0 : struct spdk_trace_tpoint_opts opts[] = {
9438 : {
9439 : "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
9440 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1,
9441 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9442 : },
9443 : {
9444 : "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
9445 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0,
9446 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9447 : }
9448 : };
9449 :
9450 :
9451 0 : spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
9452 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
9453 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9454 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9455 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9456 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9457 0 : }
9458 1 : SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
|