Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2021 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2021 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : */
6 :
7 : /*
8 : * NVMe over PCIe common library
9 : */
10 :
11 : #include "spdk/stdinc.h"
12 : #include "spdk/likely.h"
13 : #include "spdk/string.h"
14 : #include "nvme_internal.h"
15 : #include "nvme_pcie_internal.h"
16 : #include "spdk/trace.h"
17 :
18 : #include "spdk_internal/trace_defs.h"
19 :
20 : __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
21 :
22 : static struct spdk_nvme_pcie_stat g_dummy_stat = {};
23 :
24 : static void nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair,
25 : struct nvme_tracker *tr);
26 :
27 : static inline uint64_t
28 2093 : nvme_pcie_vtophys(struct spdk_nvme_ctrlr *ctrlr, const void *buf, uint64_t *size)
29 : {
30 2093 : if (spdk_likely(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) {
31 2086 : return spdk_vtophys(buf, size);
32 : } else {
33 : /* vfio-user address translation with IOVA=VA mode */
34 7 : return (uint64_t)(uintptr_t)buf;
35 : }
36 : }
37 :
38 : int
39 6 : nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
40 : {
41 6 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
42 : uint32_t i;
43 :
44 : /* all head/tail vals are set to 0 */
45 6 : pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;
46 :
47 : /*
48 : * First time through the completion queue, HW will set phase
49 : * bit on completions to 1. So set this to 1 here, indicating
50 : * we're looking for a 1 to know which entries have completed.
51 : * We'll toggle the bit each time the completion queue
52 : * rolls over.
53 : */
54 6 : pqpair->flags.phase = 1;
55 46 : for (i = 0; i < pqpair->num_entries; i++) {
56 40 : pqpair->cpl[i].status.p = 0;
57 : }
58 :
59 6 : return 0;
60 : }
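
As a minimal illustration of the phase-bit convention established by the reset above (an editorial sketch, not SPDK code; the driver's real polling loop is in nvme_pcie_qpair_process_completions() below): the consumer treats a completion entry as new only while its phase bit matches the expected phase, and flips the expected phase each time cq_head wraps.

/* Sketch only: field names mirror nvme_pcie_qpair, but this is a simplified model. */
#include <stdint.h>

struct cq_model {
	uint8_t  *entry_phase;   /* phase bit of each completion entry */
	uint16_t  num_entries;
	uint16_t  cq_head;
	uint8_t   phase;         /* starts at 1, as set in nvme_pcie_qpair_reset() */
};

static int
cq_model_entry_is_new(const struct cq_model *cq)
{
	return cq->entry_phase[cq->cq_head] == cq->phase;
}

static void
cq_model_advance(struct cq_model *cq)
{
	if (++cq->cq_head == cq->num_entries) {
		cq->cq_head = 0;
		cq->phase = !cq->phase;   /* toggle on every wraparound */
	}
}
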
61 :
62 : static void
63 27 : nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
64 : {
65 27 : tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
66 27 : tr->cid = cid;
67 27 : tr->req = NULL;
68 27 : }
69 :
70 : static void *
71 4 : nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment,
72 : uint64_t *phys_addr)
73 : {
74 4 : struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
75 : uintptr_t addr;
76 :
77 4 : if (pctrlr->cmb.mem_register_addr != NULL) {
78 : /* BAR is mapped for data */
79 1 : return NULL;
80 : }
81 :
82 3 : addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset;
83 3 : addr = (addr + (alignment - 1)) & ~(alignment - 1);
84 :
85 : /* CMB may only consume part of the BAR, calculate accordingly */
86 3 : if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) {
87 1 : SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
88 1 : return NULL;
89 : }
90 2 : *phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va;
91 :
92 2 : pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va;
93 :
94 2 : return (void *)addr;
95 : }
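
The round-up of addr above is the standard power-of-two alignment idiom. A small, self-contained illustration (assuming, as the callers here do, that the alignment is a power of two):

#include <assert.h>
#include <stdint.h>

/* Round x up to the next multiple of a power-of-two alignment. */
static inline uintptr_t
align_up(uintptr_t x, uintptr_t alignment)
{
	assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
	return (x + (alignment - 1)) & ~(alignment - 1);
}

/* Examples: align_up(0x1001, 0x1000) == 0x2000; align_up(0x2000, 0x1000) == 0x2000. */
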
96 :
97 : int
98 4 : nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
99 : const struct spdk_nvme_io_qpair_opts *opts)
100 : {
101 4 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
102 4 : struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
103 4 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
104 : struct nvme_tracker *tr;
105 : uint16_t i;
106 : uint16_t num_trackers;
107 4 : size_t page_align = sysconf(_SC_PAGESIZE);
108 : size_t queue_align, queue_len;
109 4 : uint32_t flags = SPDK_MALLOC_DMA;
110 : int32_t numa_id;
111 4 : uint64_t sq_paddr = 0;
112 4 : uint64_t cq_paddr = 0;
113 :
114 4 : if (opts) {
115 2 : pqpair->sq_vaddr = opts->sq.vaddr;
116 2 : pqpair->cq_vaddr = opts->cq.vaddr;
117 2 : pqpair->flags.disable_pcie_sgl_merge = opts->disable_pcie_sgl_merge;
118 2 : sq_paddr = opts->sq.paddr;
119 2 : cq_paddr = opts->cq.paddr;
120 : }
121 :
122 4 : pqpair->retry_count = ctrlr->opts.transport_retry_count;
123 :
124 : /*
125 : * Limit the maximum number of completions to return per call to prevent wraparound,
126 : * and calculate how many trackers can be submitted at once without overflowing the
127 : * completion queue.
128 : */
129 4 : pqpair->max_completions_cap = pqpair->num_entries / 4;
130 4 : pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
131 4 : pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
132 4 : num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
133 :
134 4 : SPDK_INFOLOG(nvme, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
135 : pqpair->max_completions_cap, num_trackers);
136 :
137 4 : assert(num_trackers != 0);
138 :
139 4 : pqpair->sq_in_cmb = false;
140 :
141 4 : if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
142 1 : flags |= SPDK_MALLOC_SHARE;
143 : }
144 :
145 : /* cmd and cpl rings must be aligned on page size boundaries. */
146 4 : if (ctrlr->opts.use_cmb_sqs) {
147 1 : pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
148 : page_align, &pqpair->cmd_bus_addr);
149 1 : if (pqpair->cmd != NULL) {
150 1 : pqpair->sq_in_cmb = true;
151 : }
152 : }
153 :
154 4 : if (pqpair->sq_in_cmb == false) {
155 3 : if (pqpair->sq_vaddr) {
156 1 : pqpair->cmd = pqpair->sq_vaddr;
157 : } else {
158 : /* To ensure physical address contiguity we make each ring occupy
159 : * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
160 : */
161 2 : queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd);
162 2 : queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
163 2 : pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_NUMA_ID_ANY, flags);
164 2 : if (pqpair->cmd == NULL) {
165 0 : SPDK_ERRLOG("alloc qpair_cmd failed\n");
166 0 : return -ENOMEM;
167 : }
168 : }
169 3 : if (sq_paddr) {
170 1 : assert(pqpair->sq_vaddr != NULL);
171 1 : pqpair->cmd_bus_addr = sq_paddr;
172 : } else {
173 2 : pqpair->cmd_bus_addr = nvme_pcie_vtophys(ctrlr, pqpair->cmd, NULL);
174 2 : if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
175 0 : SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
176 0 : return -EFAULT;
177 : }
178 : }
179 : }
180 :
181 4 : if (pqpair->cq_vaddr) {
182 2 : pqpair->cpl = pqpair->cq_vaddr;
183 : } else {
184 2 : queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl);
185 2 : queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
186 2 : numa_id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
187 2 : pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, numa_id, flags);
188 2 : if (pqpair->cpl == NULL) {
189 0 : SPDK_ERRLOG("alloc qpair_cpl failed\n");
190 0 : return -ENOMEM;
191 : }
192 : }
193 4 : if (cq_paddr) {
194 2 : assert(pqpair->cq_vaddr != NULL);
195 2 : pqpair->cpl_bus_addr = cq_paddr;
196 : } else {
197 2 : pqpair->cpl_bus_addr = nvme_pcie_vtophys(ctrlr, pqpair->cpl, NULL);
198 2 : if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
199 0 : SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
200 0 : return -EFAULT;
201 : }
202 : }
203 :
204 4 : pqpair->sq_tdbl = pctrlr->doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
205 4 : pqpair->cq_hdbl = pctrlr->doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
206 :
207 : /*
208 : * Reserve space for all of the trackers in a single allocation.
209 : * struct nvme_tracker must be padded so that its size is already a power of 2.
210 : * This ensures the PRP list embedded in the nvme_tracker object will not span a
211 : * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
212 : */
213 4 : pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
214 : SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
215 4 : if (pqpair->tr == NULL) {
216 0 : SPDK_ERRLOG("nvme_tr failed\n");
217 0 : return -ENOMEM;
218 : }
219 :
220 4 : TAILQ_INIT(&pqpair->free_tr);
221 4 : TAILQ_INIT(&pqpair->outstanding_tr);
222 4 : pqpair->qpair.queue_depth = 0;
223 :
224 31 : for (i = 0; i < num_trackers; i++) {
225 27 : tr = &pqpair->tr[i];
226 27 : nvme_qpair_construct_tracker(tr, i, nvme_pcie_vtophys(ctrlr, tr, NULL));
227 27 : TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
228 : }
229 :
230 4 : nvme_pcie_qpair_reset(qpair);
231 :
232 4 : return 0;
233 : }
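
Two details of the construction above are worth making concrete. First, the sizing split: with num_entries = 256, max_completions_cap starts at 256 / 4 = 64 (before clamping to the NVME_MIN_COMPLETIONS/NVME_MAX_COMPLETIONS bounds, whose values are defined elsewhere), leaving num_trackers = 192 submission slots. Second, the doorbell layout: the SQ tail and CQ head doorbells of queue N sit at register indices 2N and 2N + 1, scaled by the controller's doorbell stride, which is exactly what the sq_tdbl/cq_hdbl assignments compute. A hypothetical sketch of that arithmetic:

/* Hypothetical helpers mirroring the doorbell math above; doorbell_base points
 * at the start of the controller's doorbell registers and stride_u32 is the
 * doorbell stride expressed in 32-bit words.
 */
#include <stdint.h>

static inline volatile uint32_t *
nvme_sq_tail_doorbell(volatile uint32_t *doorbell_base, uint16_t qid, uint32_t stride_u32)
{
	return doorbell_base + (2 * qid + 0) * stride_u32;
}

static inline volatile uint32_t *
nvme_cq_head_doorbell(volatile uint32_t *doorbell_base, uint16_t qid, uint32_t stride_u32)
{
	return doorbell_base + (2 * qid + 1) * stride_u32;
}
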
234 :
235 : int
236 1 : nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries)
237 : {
238 : struct nvme_pcie_qpair *pqpair;
239 : int rc;
240 :
241 1 : pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
242 1 : if (pqpair == NULL) {
243 0 : return -ENOMEM;
244 : }
245 :
246 1 : pqpair->num_entries = num_entries;
247 1 : pqpair->flags.delay_cmd_submit = 0;
248 1 : pqpair->pcie_state = NVME_PCIE_QPAIR_READY;
249 :
250 1 : ctrlr->adminq = &pqpair->qpair;
251 :
252 1 : rc = nvme_qpair_init(ctrlr->adminq,
253 : 0, /* qpair ID */
254 : ctrlr,
255 : SPDK_NVME_QPRIO_URGENT,
256 : num_entries,
257 : false);
258 1 : if (rc != 0) {
259 0 : return rc;
260 : }
261 :
262 1 : pqpair->stat = spdk_zmalloc(sizeof(*pqpair->stat), 64, NULL, SPDK_ENV_NUMA_ID_ANY,
263 : SPDK_MALLOC_SHARE);
264 1 : if (!pqpair->stat) {
265 0 : SPDK_ERRLOG("Failed to allocate admin qpair statistics\n");
266 0 : return -ENOMEM;
267 : }
268 :
269 1 : return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
270 : }
271 :
272 : /**
273 : * Note: the ctrlr_lock must be held when calling this function.
274 : */
275 : void
276 0 : nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
277 : struct nvme_request *req, struct spdk_nvme_cpl *cpl)
278 : {
279 0 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
280 0 : struct nvme_request *active_req = req;
281 : struct spdk_nvme_ctrlr_process *active_proc;
282 :
283 : /*
284 : * The admin request is from another process. Move to the per
285 : * process list for that process to handle it later.
286 : */
287 0 : assert(nvme_qpair_is_admin_queue(qpair));
288 0 : assert(active_req->pid != getpid());
289 :
290 0 : active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid);
291 0 : if (active_proc) {
292 : /* Save the original completion information */
293 0 : memcpy(&active_req->cpl, cpl, sizeof(*cpl));
294 0 : STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
295 : } else {
296 0 : SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
297 : active_req->pid);
298 0 : nvme_cleanup_user_req(active_req);
299 0 : nvme_free_request(active_req);
300 : }
301 0 : }
302 :
303 : /**
304 : * Note: the ctrlr_lock must be held when calling this function.
305 : */
306 : void
307 0 : nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
308 : {
309 0 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
310 : struct nvme_request *req, *tmp_req;
311 0 : pid_t pid = getpid();
312 : struct spdk_nvme_ctrlr_process *proc;
313 :
314 : /*
315 : * Check whether there is any pending admin request from
316 : * other active processes.
317 : */
318 0 : assert(nvme_qpair_is_admin_queue(qpair));
319 :
320 0 : proc = nvme_ctrlr_get_current_process(ctrlr);
321 0 : if (!proc) {
322 0 : SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
323 0 : assert(proc);
324 0 : return;
325 : }
326 :
327 0 : STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
328 0 : STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
329 :
330 0 : assert(req->pid == pid);
331 :
332 0 : nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
333 : }
334 : }
335 :
336 : int
337 7 : nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
338 : struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
339 : void *cb_arg)
340 : {
341 7 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
342 : struct nvme_request *req;
343 : struct spdk_nvme_cmd *cmd;
344 :
345 7 : req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
346 7 : if (req == NULL) {
347 2 : return -ENOMEM;
348 : }
349 :
350 5 : cmd = &req->cmd;
351 5 : cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
352 :
353 5 : cmd->cdw10_bits.create_io_q.qid = io_que->id;
354 5 : cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
355 :
356 5 : cmd->cdw11_bits.create_io_cq.pc = 1;
357 5 : cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
358 :
359 5 : return nvme_ctrlr_submit_admin_request(ctrlr, req);
360 : }
361 :
362 : int
363 5 : nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
364 : struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
365 : {
366 5 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
367 : struct nvme_request *req;
368 : struct spdk_nvme_cmd *cmd;
369 :
370 5 : req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
371 5 : if (req == NULL) {
372 1 : return -ENOMEM;
373 : }
374 :
375 4 : cmd = &req->cmd;
376 4 : cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
377 :
378 4 : cmd->cdw10_bits.create_io_q.qid = io_que->id;
379 4 : cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
380 4 : cmd->cdw11_bits.create_io_sq.pc = 1;
381 4 : cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio;
382 4 : cmd->cdw11_bits.create_io_sq.cqid = io_que->id;
383 4 : cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
384 :
385 4 : return nvme_ctrlr_submit_admin_request(ctrlr, req);
386 : }
387 :
388 : int
389 3 : nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
390 : spdk_nvme_cmd_cb cb_fn, void *cb_arg)
391 : {
392 : struct nvme_request *req;
393 : struct spdk_nvme_cmd *cmd;
394 :
395 3 : req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
396 3 : if (req == NULL) {
397 1 : return -ENOMEM;
398 : }
399 :
400 2 : cmd = &req->cmd;
401 2 : cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
402 2 : cmd->cdw10_bits.delete_io_q.qid = qpair->id;
403 :
404 2 : return nvme_ctrlr_submit_admin_request(ctrlr, req);
405 : }
406 :
407 : int
408 2 : nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
409 : spdk_nvme_cmd_cb cb_fn, void *cb_arg)
410 : {
411 : struct nvme_request *req;
412 : struct spdk_nvme_cmd *cmd;
413 :
414 2 : req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
415 2 : if (req == NULL) {
416 1 : return -ENOMEM;
417 : }
418 :
419 1 : cmd = &req->cmd;
420 1 : cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
421 1 : cmd->cdw10_bits.delete_io_q.qid = qpair->id;
422 :
423 1 : return nvme_ctrlr_submit_admin_request(ctrlr, req);
424 : }
425 :
426 : static void
427 1 : nvme_completion_sq_error_delete_cq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
428 : {
429 1 : struct spdk_nvme_qpair *qpair = arg;
430 1 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
431 :
432 1 : if (spdk_nvme_cpl_is_error(cpl)) {
433 0 : SPDK_ERRLOG("delete_io_cq failed!\n");
434 : }
435 :
436 1 : pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
437 1 : }
438 :
439 : static void
440 3 : nvme_completion_create_sq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
441 : {
442 3 : struct spdk_nvme_qpair *qpair = arg;
443 3 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
444 3 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
445 3 : struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
446 : int rc;
447 :
448 3 : if (pqpair->flags.defer_destruction) {
449 : /* This qpair was deleted by the application while the
450 : * connection was still in progress. We had to wait
451 : * to free the qpair resources until this outstanding
452 : * command was completed. Now that we have the completion,
453 : * free it.
454 : */
455 0 : nvme_pcie_qpair_destroy(qpair);
456 0 : return;
457 : }
458 :
459 3 : if (spdk_nvme_cpl_is_error(cpl)) {
460 1 : SPDK_ERRLOG("nvme_create_io_sq failed, deleting cq!\n");
461 1 : rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_sq_error_delete_cq_cb,
462 : qpair);
463 1 : if (rc != 0) {
464 0 : SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
465 0 : pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
466 : }
467 1 : return;
468 : }
469 2 : pqpair->pcie_state = NVME_PCIE_QPAIR_READY;
470 2 : if (ctrlr->shadow_doorbell) {
471 1 : pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
472 1 : pctrlr->doorbell_stride_u32;
473 1 : pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
474 1 : pctrlr->doorbell_stride_u32;
475 1 : pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
476 1 : pctrlr->doorbell_stride_u32;
477 1 : pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
478 1 : pctrlr->doorbell_stride_u32;
479 1 : pqpair->flags.has_shadow_doorbell = 1;
480 : } else {
481 1 : pqpair->flags.has_shadow_doorbell = 0;
482 : }
483 2 : nvme_pcie_qpair_reset(qpair);
484 :
485 : }
486 :
487 : static void
488 4 : nvme_completion_create_cq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
489 : {
490 4 : struct spdk_nvme_qpair *qpair = arg;
491 4 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
492 : int rc;
493 :
494 4 : if (pqpair->flags.defer_destruction) {
495 : /* This qpair was deleted by the application while the
496 : * connection was still in progress. We had to wait
497 : * to free the qpair resources until this outstanding
498 : * command was completed. Now that we have the completion,
499 : * free it.
500 : */
501 0 : nvme_pcie_qpair_destroy(qpair);
502 0 : return;
503 : }
504 :
505 4 : if (spdk_nvme_cpl_is_error(cpl)) {
506 1 : pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
507 1 : SPDK_ERRLOG("nvme_create_io_cq failed!\n");
508 1 : return;
509 : }
510 :
511 3 : rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_create_sq_cb, qpair);
512 :
513 3 : if (rc != 0) {
514 0 : SPDK_ERRLOG("Failed to send request to create_io_sq, deleting cq!\n");
515 0 : rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_sq_error_delete_cq_cb,
516 : qpair);
517 0 : if (rc != 0) {
518 0 : SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
519 0 : pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
520 : }
521 0 : return;
522 : }
523 3 : pqpair->pcie_state = NVME_PCIE_QPAIR_WAIT_FOR_SQ;
524 : }
525 :
526 : static int
527 5 : _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
528 : uint16_t qid)
529 : {
530 5 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
531 : int rc;
532 :
533 : /* Statistics may already be allocated in the case of controller reset */
534 5 : if (qpair->poll_group) {
535 5 : struct nvme_pcie_poll_group *group = SPDK_CONTAINEROF(qpair->poll_group,
536 : struct nvme_pcie_poll_group, group);
537 :
538 5 : pqpair->stat = &group->stats;
539 5 : pqpair->shared_stats = true;
540 : } else {
541 0 : if (pqpair->stat == NULL) {
542 0 : pqpair->stat = calloc(1, sizeof(*pqpair->stat));
543 0 : if (!pqpair->stat) {
544 0 : SPDK_ERRLOG("Failed to allocate qpair statistics\n");
545 0 : nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
546 0 : return -ENOMEM;
547 : }
548 : }
549 : }
550 :
551 5 : rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_create_cq_cb, qpair);
552 :
553 5 : if (rc != 0) {
554 1 : SPDK_ERRLOG("Failed to send request to create_io_cq\n");
555 1 : nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
556 1 : return rc;
557 : }
558 4 : pqpair->pcie_state = NVME_PCIE_QPAIR_WAIT_FOR_CQ;
559 4 : return 0;
560 : }
561 :
562 : int
563 5 : nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
564 : {
565 5 : int rc = 0;
566 :
567 5 : if (!nvme_qpair_is_admin_queue(qpair)) {
568 5 : rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
569 : } else {
570 0 : nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
571 : }
572 :
573 5 : return rc;
574 : }
575 :
576 : void
577 0 : nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
578 : {
579 0 : if (!nvme_qpair_is_admin_queue(qpair) || !ctrlr->is_disconnecting) {
580 0 : nvme_transport_ctrlr_disconnect_qpair_done(qpair);
581 : } else {
582 : /* If this function is called for the admin qpair via spdk_nvme_ctrlr_reset()
583 : * or spdk_nvme_ctrlr_disconnect(), initiate a Controller Level Reset.
584 : * Then we can abort trackers safely because the Controller Level Reset deletes
585 : * all I/O SQ/CQs.
586 : */
587 0 : nvme_ctrlr_disable(ctrlr);
588 : }
589 0 : }
590 :
591 : /* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must
592 : * not use wide instructions because QEMU will not emulate such instructions to MMIO space.
593 : * So this function ensures we only copy 8 bytes at a time.
594 : */
595 : static inline void
596 0 : nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
597 : {
598 0 : uint64_t *dst64 = (uint64_t *)dst;
599 0 : const uint64_t *src64 = (const uint64_t *)src;
600 : uint32_t i;
601 :
602 0 : for (i = 0; i < sizeof(*dst) / 8; i++) {
603 0 : dst64[i] = src64[i];
604 : }
605 0 : }
606 :
607 : static inline void
608 0 : nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
609 : {
610 : /* dst and src are known to be non-overlapping and 64-byte aligned. */
611 : #if defined(__SSE2__)
612 0 : __m128i *d128 = (__m128i *)dst;
613 0 : const __m128i *s128 = (const __m128i *)src;
614 :
615 0 : _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
616 0 : _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
617 0 : _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
618 0 : _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
619 : #else
620 : *dst = *src;
621 : #endif
622 0 : }
623 :
624 : void
625 0 : nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
626 : {
627 : struct nvme_request *req;
628 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
629 0 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
630 :
631 0 : req = tr->req;
632 0 : assert(req != NULL);
633 :
634 0 : spdk_trace_record(TRACE_NVME_PCIE_SUBMIT, qpair->id, 0, (uintptr_t)req, req->cb_arg,
635 : (uint32_t)req->cmd.cid, (uint32_t)req->cmd.opc,
636 : req->cmd.cdw10, req->cmd.cdw11, req->cmd.cdw12,
637 : pqpair->qpair.queue_depth);
638 :
639 0 : if (req->cmd.fuse) {
640 : /*
641 : * Keep track of the fuse operation sequence so that we ring the doorbell only
642 : * after the second fuse is submitted.
643 : */
644 0 : qpair->last_fuse = req->cmd.fuse;
645 : }
646 :
647 : /* Don't use wide instructions to copy the NVMe command; QEMU's virtual NVMe
648 : * controller limits the maximum access width to 8 bytes per access.
649 : */
650 0 : if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) {
651 0 : nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
652 : } else {
653 : /* Copy the command from the tracker to the submission queue. */
654 0 : nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
655 : }
656 :
657 0 : if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
658 0 : pqpair->sq_tail = 0;
659 : }
660 :
661 0 : if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
662 0 : SPDK_ERRLOG("sq_tail is passing sq_head!\n");
663 : }
664 :
665 0 : if (!pqpair->flags.delay_cmd_submit) {
666 0 : nvme_pcie_qpair_ring_sq_doorbell(qpair);
667 : }
668 0 : }
669 :
670 : void
671 0 : nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
672 : struct spdk_nvme_cpl *cpl, bool print_on_error)
673 : {
674 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
675 : struct nvme_request *req;
676 : bool retry, error;
677 : bool print_error;
678 :
679 0 : req = tr->req;
680 :
681 0 : spdk_trace_record(TRACE_NVME_PCIE_COMPLETE, qpair->id, 0, (uintptr_t)req, req->cb_arg,
682 : (uint32_t)req->cmd.cid, (uint32_t)cpl->status_raw, pqpair->qpair.queue_depth);
683 :
684 0 : assert(req != NULL);
685 :
686 0 : error = spdk_nvme_cpl_is_error(cpl);
687 0 : retry = error && nvme_completion_is_retry(cpl) &&
688 0 : req->retries < pqpair->retry_count;
689 0 : print_error = error && print_on_error && !qpair->ctrlr->opts.disable_error_logging;
690 :
691 0 : if (print_error) {
692 0 : spdk_nvme_qpair_print_command(qpair, &req->cmd);
693 : }
694 :
695 0 : if (print_error || SPDK_DEBUGLOG_FLAG_ENABLED("nvme")) {
696 0 : spdk_nvme_qpair_print_completion(qpair, cpl);
697 : }
698 :
699 0 : assert(cpl->cid == req->cmd.cid);
700 :
701 0 : if (retry) {
702 0 : req->retries++;
703 0 : nvme_pcie_qpair_submit_tracker(qpair, tr);
704 : } else {
705 0 : TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
706 0 : pqpair->qpair.queue_depth--;
707 :
708 : /* Only check admin requests from different processes. */
709 0 : if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
710 0 : nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
711 : } else {
712 0 : nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
713 : }
714 :
715 0 : tr->req = NULL;
716 :
717 0 : TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
718 : }
719 0 : }
720 :
721 : void
722 0 : nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
723 : struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
724 : bool print_on_error)
725 : {
726 0 : struct spdk_nvme_cpl cpl;
727 :
728 0 : memset(&cpl, 0, sizeof(cpl));
729 0 : cpl.sqid = qpair->id;
730 0 : cpl.cid = tr->cid;
731 0 : cpl.status.sct = sct;
732 0 : cpl.status.sc = sc;
733 0 : cpl.status.dnr = dnr;
734 0 : nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
735 0 : }
736 :
737 : void
738 0 : nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
739 : {
740 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
741 : struct nvme_tracker *tr, *temp, *last;
742 :
743 0 : last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);
744 :
745 : /* Abort previously submitted (outstanding) trs */
746 0 : TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
747 0 : if (!qpair->ctrlr->opts.disable_error_logging) {
748 0 : SPDK_ERRLOG("aborting outstanding command\n");
749 : }
750 0 : nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
751 : SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
752 :
753 0 : if (tr == last) {
754 0 : break;
755 : }
756 : }
757 0 : }
758 :
759 : void
760 1 : nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
761 : {
762 1 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
763 : struct nvme_tracker *tr;
764 :
765 1 : tr = TAILQ_FIRST(&pqpair->outstanding_tr);
766 1 : while (tr != NULL) {
767 0 : assert(tr->req != NULL);
768 0 : if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
769 0 : nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
770 : SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
771 : false);
772 0 : tr = TAILQ_FIRST(&pqpair->outstanding_tr);
773 : } else {
774 0 : tr = TAILQ_NEXT(tr, tq_list);
775 : }
776 : }
777 1 : }
778 :
779 : void
780 1 : nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
781 : {
782 1 : nvme_pcie_admin_qpair_abort_aers(qpair);
783 1 : }
784 :
785 : void
786 0 : nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
787 : {
788 0 : nvme_pcie_qpair_abort_trackers(qpair, dnr);
789 0 : }
790 :
791 : static void
792 0 : nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
793 : {
794 : uint64_t t02;
795 : struct nvme_tracker *tr, *tmp;
796 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
797 0 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
798 : struct spdk_nvme_ctrlr_process *active_proc;
799 :
800 : /* Don't check timeouts during controller initialization. */
801 0 : if (ctrlr->state != NVME_CTRLR_STATE_READY) {
802 0 : return;
803 : }
804 :
805 0 : if (nvme_qpair_is_admin_queue(qpair)) {
806 0 : active_proc = nvme_ctrlr_get_current_process(ctrlr);
807 : } else {
808 0 : active_proc = qpair->active_proc;
809 : }
810 :
811 : /* Only check timeouts if the current process has a timeout callback. */
812 0 : if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
813 0 : return;
814 : }
815 :
816 0 : t02 = spdk_get_ticks();
817 0 : TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
818 0 : assert(tr->req != NULL);
819 :
820 0 : if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
821 : /*
822 : * The requests are in order, so as soon as one has not timed out,
823 : * stop iterating.
824 : */
825 0 : break;
826 : }
827 : }
828 : }
829 :
830 : int32_t
831 0 : nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
832 : {
833 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
834 : struct nvme_tracker *tr;
835 : struct spdk_nvme_cpl *cpl, *next_cpl;
836 0 : uint32_t num_completions = 0;
837 0 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
838 : uint16_t next_cq_head;
839 : uint8_t next_phase;
840 0 : bool next_is_valid = false;
841 : int rc;
842 :
843 0 : if (spdk_unlikely(pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED)) {
844 0 : return -ENXIO;
845 : }
846 :
847 0 : if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) {
848 0 : if (pqpair->pcie_state == NVME_PCIE_QPAIR_READY) {
849 : /* It is possible that another thread set the pcie_state to
850 : * QPAIR_READY, if it polled the adminq and processed the SQ
851 : * completion for this qpair. So check for that condition
852 : * here and then update the qpair's state to CONNECTED, since
853 : * we can only set the qpair state from the qpair's thread.
854 : * (Note: this fixed issue #2157.)
855 : */
856 0 : nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
857 0 : } else if (pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED) {
858 0 : nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
859 0 : return -ENXIO;
860 : } else {
861 0 : rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
862 0 : if (rc < 0) {
863 0 : return rc;
864 0 : } else if (pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED) {
865 0 : nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
866 0 : return -ENXIO;
867 : }
868 : }
869 0 : return 0;
870 : }
871 :
872 0 : if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
873 0 : nvme_ctrlr_lock(ctrlr);
874 : }
875 :
876 0 : if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
877 : /*
878 : * max_completions == 0 means unlimited, but complete at most one
879 : * max_completions_cap batch of I/O at a time so that the completion
880 : * queue doorbells don't wrap around.
881 : */
882 0 : max_completions = pqpair->max_completions_cap;
883 : }
884 :
885 0 : pqpair->stat->polls++;
886 :
887 : while (1) {
888 0 : cpl = &pqpair->cpl[pqpair->cq_head];
889 :
890 0 : if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
891 0 : break;
892 : }
893 :
894 0 : if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
895 0 : next_cq_head = pqpair->cq_head + 1;
896 0 : next_phase = pqpair->flags.phase;
897 : } else {
898 0 : next_cq_head = 0;
899 0 : next_phase = !pqpair->flags.phase;
900 : }
901 0 : next_cpl = &pqpair->cpl[next_cq_head];
902 0 : next_is_valid = (next_cpl->status.p == next_phase);
903 0 : if (next_is_valid) {
904 0 : __builtin_prefetch(&pqpair->tr[next_cpl->cid]);
905 : }
906 :
907 : #if defined(__PPC64__) || defined(__riscv) || defined(__loongarch__)
908 : /*
909 : * This memory barrier prevents reordering of:
910 : * - load after store from/to tr
911 : * - load after load cpl phase and cpl cid
912 : */
913 : spdk_mb();
914 : #elif defined(__aarch64__)
915 : __asm volatile("dmb oshld" ::: "memory");
916 : #endif
917 :
918 0 : if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
919 0 : pqpair->cq_head = 0;
920 0 : pqpair->flags.phase = !pqpair->flags.phase;
921 : }
922 :
923 0 : tr = &pqpair->tr[cpl->cid];
924 0 : pqpair->sq_head = cpl->sqhd;
925 :
926 0 : if (tr->req) {
927 : /* Prefetch the req's STAILQ_ENTRY since we'll need to access it
928 : * as part of putting the req back on the qpair's free list.
929 : */
930 0 : __builtin_prefetch(&tr->req->stailq);
931 0 : nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
932 : } else {
933 0 : SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
934 0 : spdk_nvme_qpair_print_completion(qpair, cpl);
935 0 : assert(0);
936 : }
937 :
938 0 : if (++num_completions == max_completions) {
939 0 : break;
940 : }
941 : }
942 :
943 0 : if (num_completions > 0) {
944 0 : pqpair->stat->completions += num_completions;
945 0 : nvme_pcie_qpair_ring_cq_doorbell(qpair);
946 : } else {
947 0 : pqpair->stat->idle_polls++;
948 : }
949 :
950 0 : if (pqpair->flags.delay_cmd_submit) {
951 0 : if (pqpair->last_sq_tail != pqpair->sq_tail) {
952 0 : nvme_pcie_qpair_ring_sq_doorbell(qpair);
953 0 : pqpair->last_sq_tail = pqpair->sq_tail;
954 : }
955 : }
956 :
957 0 : if (spdk_unlikely(ctrlr->timeout_enabled)) {
958 : /*
959 : * User registered for timeout callback
960 : */
961 0 : nvme_pcie_qpair_check_timeout(qpair);
962 : }
963 :
964 : /* Before returning, complete any pending admin request or
965 : * process the admin qpair disconnection.
966 : */
967 0 : if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
968 0 : nvme_pcie_qpair_complete_pending_admin_request(qpair);
969 :
970 0 : if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING) {
971 0 : rc = nvme_ctrlr_disable_poll(qpair->ctrlr);
972 0 : if (rc != -EAGAIN) {
973 0 : nvme_transport_ctrlr_disconnect_qpair_done(qpair);
974 : }
975 : }
976 :
977 0 : nvme_ctrlr_unlock(ctrlr);
978 : }
979 :
980 0 : if (spdk_unlikely(pqpair->flags.has_pending_vtophys_failures)) {
981 : struct nvme_tracker *tr, *tmp;
982 :
983 0 : TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
984 0 : if (tr->bad_vtophys) {
985 0 : tr->bad_vtophys = 0;
986 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
987 : }
988 : }
989 0 : pqpair->flags.has_pending_vtophys_failures = 0;
990 : }
991 :
992 0 : return num_completions;
993 : }
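
From an application's point of view, the poller above is driven through the public spdk_nvme_qpair_process_completions() API. A minimal usage sketch (error handling reduced to a log message; the call is assumed to come from the thread that owns the qpair):

#include <stdio.h>
#include "spdk/nvme.h"

/* Poll one I/O qpair; passing 0 lets the transport complete up to its
 * max_completions_cap batch, as described in the code above.
 */
static void
poll_io_qpair(struct spdk_nvme_qpair *qpair)
{
	int32_t rc = spdk_nvme_qpair_process_completions(qpair, 0);

	if (rc < 0) {
		/* e.g. -ENXIO once the qpair has failed or been disconnected */
		fprintf(stderr, "qpair polling failed: %d\n", (int)rc);
	}
}
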
994 :
995 : int
996 4 : nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
997 : {
998 4 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
999 :
1000 4 : if (nvme_qpair_is_admin_queue(qpair)) {
1001 1 : nvme_pcie_admin_qpair_destroy(qpair);
1002 : }
1003 : /*
1004 : * We check sq_vaddr and cq_vaddr to see if the user specified the memory
1005 : * buffers when creating the I/O queue.
1006 : * If the user specified them, we cannot free that memory.
1007 : * Nor do we free it if it's in the CMB.
1008 : */
1009 4 : if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
1010 2 : spdk_free(pqpair->cmd);
1011 : }
1012 4 : if (!pqpair->cq_vaddr && pqpair->cpl) {
1013 2 : spdk_free(pqpair->cpl);
1014 : }
1015 4 : if (pqpair->tr) {
1016 4 : spdk_free(pqpair->tr);
1017 : }
1018 :
1019 4 : nvme_qpair_deinit(qpair);
1020 :
1021 4 : if (!pqpair->shared_stats && (!qpair->active_proc ||
1022 0 : qpair->active_proc == nvme_ctrlr_get_current_process(qpair->ctrlr))) {
1023 4 : if (qpair->id) {
1024 3 : free(pqpair->stat);
1025 : } else {
1026 : /* Statistics of the admin qpair are allocated from huge pages because
1027 : * the admin qpair is shared across processes. */
1028 1 : spdk_free(pqpair->stat);
1029 : }
1030 :
1031 : }
1032 :
1033 4 : spdk_free(pqpair);
1034 :
1035 4 : return 0;
1036 : }
1037 :
1038 : struct spdk_nvme_qpair *
1039 0 : nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1040 : const struct spdk_nvme_io_qpair_opts *opts)
1041 : {
1042 : struct nvme_pcie_qpair *pqpair;
1043 : struct spdk_nvme_qpair *qpair;
1044 : int rc;
1045 :
1046 0 : assert(ctrlr != NULL);
1047 :
1048 0 : pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
1049 : SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
1050 0 : if (pqpair == NULL) {
1051 0 : return NULL;
1052 : }
1053 :
1054 0 : pqpair->num_entries = opts->io_queue_size;
1055 0 : pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit;
1056 :
1057 0 : qpair = &pqpair->qpair;
1058 :
1059 0 : rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests, opts->async_mode);
1060 0 : if (rc != 0) {
1061 0 : nvme_pcie_qpair_destroy(qpair);
1062 0 : return NULL;
1063 : }
1064 :
1065 0 : rc = nvme_pcie_qpair_construct(qpair, opts);
1066 :
1067 0 : if (rc != 0) {
1068 0 : nvme_pcie_qpair_destroy(qpair);
1069 0 : return NULL;
1070 : }
1071 :
1072 0 : return qpair;
1073 : }
1074 :
1075 : int
1076 0 : nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1077 : {
1078 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1079 : struct nvme_completion_poll_status *status;
1080 : int rc;
1081 :
1082 0 : assert(ctrlr != NULL);
1083 :
1084 0 : if (ctrlr->is_removed) {
1085 0 : goto free;
1086 : }
1087 :
1088 0 : if (ctrlr->prepare_for_reset) {
1089 0 : if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING) {
1090 0 : pqpair->flags.defer_destruction = true;
1091 : }
1092 0 : goto clear_shadow_doorbells;
1093 : }
1094 :
1095 : /* If attempting to delete a qpair that's still being connected, we have to wait until it's
1096 : * finished, so that we don't free it while it's waiting for the create cq/sq callbacks.
1097 : */
1098 0 : while (pqpair->pcie_state == NVME_PCIE_QPAIR_WAIT_FOR_CQ ||
1099 0 : pqpair->pcie_state == NVME_PCIE_QPAIR_WAIT_FOR_SQ) {
1100 0 : rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1101 0 : if (rc < 0) {
1102 0 : break;
1103 : }
1104 : }
1105 :
1106 0 : status = calloc(1, sizeof(*status));
1107 0 : if (!status) {
1108 0 : SPDK_ERRLOG("Failed to allocate status tracker\n");
1109 0 : goto free;
1110 : }
1111 :
1112 : /* Delete the I/O submission queue */
1113 0 : rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status);
1114 0 : if (rc != 0) {
1115 0 : SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
1116 0 : free(status);
1117 0 : goto free;
1118 : }
1119 0 : if (nvme_wait_for_completion(ctrlr->adminq, status)) {
1120 0 : if (!status->timed_out) {
1121 0 : free(status);
1122 : }
1123 0 : goto free;
1124 : }
1125 :
1126 : /* Now that the submission queue is deleted, the device is supposed to have
1127 : * completed any outstanding I/O. Try to complete them. If they don't complete,
1128 : * they'll be marked as aborted and completed below. */
1129 0 : if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) {
1130 0 : nvme_pcie_qpair_process_completions(qpair, 0);
1131 : }
1132 :
1133 0 : memset(status, 0, sizeof(*status));
1134 : /* Delete the completion queue */
1135 0 : rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
1136 0 : if (rc != 0) {
1137 0 : SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
1138 0 : free(status);
1139 0 : goto free;
1140 : }
1141 0 : if (nvme_wait_for_completion(ctrlr->adminq, status)) {
1142 0 : if (!status->timed_out) {
1143 0 : free(status);
1144 : }
1145 0 : goto free;
1146 : }
1147 0 : free(status);
1148 :
1149 0 : clear_shadow_doorbells:
1150 0 : if (pqpair->flags.has_shadow_doorbell && ctrlr->shadow_doorbell) {
1151 0 : *pqpair->shadow_doorbell.sq_tdbl = 0;
1152 0 : *pqpair->shadow_doorbell.cq_hdbl = 0;
1153 0 : *pqpair->shadow_doorbell.sq_eventidx = 0;
1154 0 : *pqpair->shadow_doorbell.cq_eventidx = 0;
1155 : }
1156 0 : free:
1157 0 : if (qpair->no_deletion_notification_needed == 0) {
1158 : /* Abort the rest of the I/O */
1159 0 : nvme_pcie_qpair_abort_trackers(qpair, 1);
1160 : }
1161 :
1162 0 : if (!pqpair->flags.defer_destruction) {
1163 0 : nvme_pcie_qpair_destroy(qpair);
1164 : }
1165 0 : return 0;
1166 : }
1167 :
1168 : static void
1169 3 : nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1170 : {
1171 3 : if (!qpair->in_completion_context) {
1172 3 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1173 :
1174 3 : tr->bad_vtophys = 1;
1175 3 : pqpair->flags.has_pending_vtophys_failures = 1;
1176 3 : return;
1177 : }
1178 :
1179 : /*
1180 : * Bad vtophys translation, so abort this request and return
1181 : * immediately.
1182 : */
1183 0 : SPDK_ERRLOG("vtophys or other payload buffer related error\n");
1184 0 : nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1185 : SPDK_NVME_SC_INVALID_FIELD,
1186 : 1 /* do not retry */, true);
1187 : }
1188 :
1189 : /*
1190 : * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
1191 : *
1192 : * *prp_index will be updated to account for the number of PRP entries used.
1193 : */
1194 : static inline int
1195 25 : nvme_pcie_prp_list_append(struct spdk_nvme_ctrlr *ctrlr, struct nvme_tracker *tr,
1196 : uint32_t *prp_index, void *virt_addr, size_t len,
1197 : uint32_t page_size)
1198 : {
1199 25 : struct spdk_nvme_cmd *cmd = &tr->req->cmd;
1200 25 : uintptr_t page_mask = page_size - 1;
1201 : uint64_t phys_addr;
1202 : uint32_t i;
1203 :
1204 25 : SPDK_DEBUGLOG(nvme, "prp_index:%u virt_addr:%p len:%u\n",
1205 : *prp_index, virt_addr, (uint32_t)len);
1206 :
1207 25 : if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
1208 2 : SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1209 2 : return -EFAULT;
1210 : }
1211 :
1212 23 : i = *prp_index;
1213 2070 : while (len) {
1214 : uint32_t seg_len;
1215 :
1216 : /*
1217 : * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
1218 : * so prp_index == count is valid.
1219 : */
1220 2051 : if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
1221 2 : SPDK_ERRLOG("out of PRP entries\n");
1222 2 : return -EFAULT;
1223 : }
1224 :
1225 2049 : phys_addr = nvme_pcie_vtophys(ctrlr, virt_addr, NULL);
1226 2049 : if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
1227 1 : SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
1228 1 : return -EFAULT;
1229 : }
1230 :
1231 2048 : if (i == 0) {
1232 19 : SPDK_DEBUGLOG(nvme, "prp1 = %p\n", (void *)phys_addr);
1233 19 : cmd->dptr.prp.prp1 = phys_addr;
1234 19 : seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
1235 : } else {
1236 2029 : if ((phys_addr & page_mask) != 0) {
1237 1 : SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
1238 1 : return -EFAULT;
1239 : }
1240 :
1241 2028 : SPDK_DEBUGLOG(nvme, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
1242 2028 : tr->u.prp[i - 1] = phys_addr;
1243 2028 : seg_len = page_size;
1244 : }
1245 :
1246 2047 : seg_len = spdk_min(seg_len, len);
1247 2047 : virt_addr = (uint8_t *)virt_addr + seg_len;
1248 2047 : len -= seg_len;
1249 2047 : i++;
1250 : }
1251 :
1252 19 : cmd->psdt = SPDK_NVME_PSDT_PRP;
1253 19 : if (i <= 1) {
1254 6 : cmd->dptr.prp.prp2 = 0;
1255 13 : } else if (i == 2) {
1256 6 : cmd->dptr.prp.prp2 = tr->u.prp[0];
1257 6 : SPDK_DEBUGLOG(nvme, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
1258 : } else {
1259 7 : cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
1260 7 : SPDK_DEBUGLOG(nvme, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
1261 : }
1262 :
1263 19 : *prp_index = i;
1264 19 : return 0;
1265 : }
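
To make the PRP page-splitting rule above concrete: PRP1 may point anywhere inside a page and covers only the remainder of that page, while every later entry must be page aligned and covers a whole page. A hypothetical helper (not part of the driver) that computes the resulting entry count:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper: how many PRP entries a virtually contiguous buffer
 * needs. The first entry covers the tail of its page; each following entry
 * covers one whole, page-aligned page.
 */
static uint32_t
prp_entries_needed(uintptr_t virt_addr, size_t len, uint32_t page_size)
{
	size_t first = page_size - (virt_addr & (page_size - 1));

	if (len <= first) {
		return 1;
	}
	return 1 + (uint32_t)((len - first + page_size - 1) / page_size);
}

/* Example: a 12 KiB buffer starting 512 bytes into a 4 KiB page needs
 * 1 + ceil((12288 - 3584) / 4096) = 4 entries, so prp2 points at a PRP list
 * holding the last three of them, exactly as in the i > 2 case above.
 */
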
1266 :
1267 : static int
1268 0 : nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair,
1269 : struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned)
1270 : {
1271 0 : assert(0);
1272 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1273 : return -EINVAL;
1274 : }
1275 :
1276 : /**
1277 : * Build PRP list describing physically contiguous payload buffer.
1278 : */
1279 : static int
1280 4 : nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1281 : struct nvme_tracker *tr, bool dword_aligned)
1282 : {
1283 4 : uint32_t prp_index = 0;
1284 : int rc;
1285 :
1286 4 : rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index,
1287 4 : (uint8_t *)req->payload.contig_or_cb_arg + req->payload_offset,
1288 4 : req->payload_size, qpair->ctrlr->page_size);
1289 4 : if (rc) {
1290 1 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1291 : } else {
1292 3 : SPDK_DEBUGLOG(nvme, "Number of PRP entries: %" PRIu32 "\n", prp_index);
1293 : }
1294 :
1295 4 : return rc;
1296 : }
1297 :
1298 : /**
1299 : * Build an SGL describing a physically contiguous payload buffer.
1300 : *
1301 : * This is more efficient than using PRP because large buffers can be
1302 : * described this way.
1303 : */
1304 : static int
1305 3 : nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1306 : struct nvme_tracker *tr, bool dword_aligned)
1307 : {
1308 : uint8_t *virt_addr;
1309 3 : uint64_t phys_addr, mapping_length;
1310 : uint32_t length;
1311 : struct spdk_nvme_sgl_descriptor *sgl;
1312 3 : uint32_t nseg = 0;
1313 :
1314 3 : assert(req->payload_size != 0);
1315 3 : assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
1316 :
1317 3 : sgl = tr->u.sgl;
1318 3 : req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1319 3 : req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1320 :
1321 3 : length = req->payload_size;
1322 : /* ubsan complains about applying zero offset to null pointer if contig_or_cb_arg is NULL,
1323 : * so just double cast it to make it go away */
1324 3 : virt_addr = (uint8_t *)((uintptr_t)req->payload.contig_or_cb_arg + req->payload_offset);
1325 :
1326 7 : while (length > 0) {
1327 4 : if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1328 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1329 0 : return -EFAULT;
1330 : }
1331 :
1332 4 : if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
1333 0 : SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1334 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1335 0 : return -EFAULT;
1336 : }
1337 :
1338 4 : mapping_length = length;
1339 4 : phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
1340 4 : if (phys_addr == SPDK_VTOPHYS_ERROR) {
1341 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1342 0 : return -EFAULT;
1343 : }
1344 :
1345 4 : mapping_length = spdk_min(length, mapping_length);
1346 :
1347 4 : length -= mapping_length;
1348 4 : virt_addr += mapping_length;
1349 :
1350 4 : sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1351 4 : sgl->unkeyed.length = mapping_length;
1352 4 : sgl->address = phys_addr;
1353 4 : sgl->unkeyed.subtype = 0;
1354 :
1355 4 : sgl++;
1356 4 : nseg++;
1357 : }
1358 :
1359 3 : if (nseg == 1) {
1360 : /*
1361 : * The whole transfer can be described by a single SGL descriptor.
1362 : * Use the special case described by the spec where SGL1's type is Data Block.
1363 : * This means the SGL in the tracker is not used at all, so copy the first (and only)
1364 : * SGL element into SGL1.
1365 : */
1366 2 : req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1367 2 : req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1368 2 : req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1369 : } else {
1370 : /* The SPDK NVMe driver supports only 1 SGL segment for now; this is enough because
1371 : * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
1372 : */
1373 1 : req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1374 1 : req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1375 1 : req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1376 : }
1377 :
1378 3 : SPDK_DEBUGLOG(nvme, "Number of SGL descriptors: %" PRIu32 "\n", nseg);
1379 3 : return 0;
1380 : }
1381 :
1382 : /**
1383 : * Build SGL list describing scattered payload buffer.
1384 : */
1385 : static int
1386 2 : nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1387 : struct nvme_tracker *tr, bool dword_aligned)
1388 : {
1389 : int rc;
1390 2 : void *virt_addr;
1391 2 : uint64_t phys_addr, mapping_length;
1392 2 : uint32_t remaining_transfer_len, remaining_user_sge_len, length;
1393 : struct spdk_nvme_sgl_descriptor *sgl;
1394 2 : uint32_t nseg = 0;
1395 2 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1396 :
1397 : /*
1398 : * Build scattered payloads.
1399 : */
1400 2 : assert(req->payload_size != 0);
1401 2 : assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1402 2 : assert(req->payload.reset_sgl_fn != NULL);
1403 2 : assert(req->payload.next_sge_fn != NULL);
1404 2 : req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1405 :
1406 2 : sgl = tr->u.sgl;
1407 2 : req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1408 2 : req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1409 :
1410 2 : remaining_transfer_len = req->payload_size;
1411 :
1412 6 : while (remaining_transfer_len > 0) {
1413 4 : rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
1414 : &virt_addr, &remaining_user_sge_len);
1415 4 : if (rc) {
1416 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1417 0 : return -EFAULT;
1418 : }
1419 :
1420 : /* Bit Bucket SGL descriptor */
1421 4 : if ((uint64_t)virt_addr == UINT64_MAX) {
1422 : /* TODO: enable WRITE and COMPARE when necessary */
1423 0 : if (req->cmd.opc != SPDK_NVME_OPC_READ) {
1424 0 : SPDK_ERRLOG("Only READ command can be supported\n");
1425 0 : goto exit;
1426 : }
1427 0 : if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1428 0 : SPDK_ERRLOG("Too many SGL entries\n");
1429 0 : goto exit;
1430 : }
1431 :
1432 0 : sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
1433 : /* If the SGL describes a destination data buffer, the length of the data
1434 : * buffer shall be discarded by the controller, and the length is included
1435 : * in the Number of Logical Blocks (NLB) parameter. Otherwise, the length
1436 : * is not included in the NLB parameter.
1437 : */
1438 0 : remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1439 0 : remaining_transfer_len -= remaining_user_sge_len;
1440 :
1441 0 : sgl->unkeyed.length = remaining_user_sge_len;
1442 0 : sgl->address = 0;
1443 0 : sgl->unkeyed.subtype = 0;
1444 :
1445 0 : sgl++;
1446 0 : nseg++;
1447 :
1448 0 : continue;
1449 : }
1450 :
1451 4 : remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1452 4 : remaining_transfer_len -= remaining_user_sge_len;
1453 8 : while (remaining_user_sge_len > 0) {
1454 4 : if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1455 0 : SPDK_ERRLOG("Too many SGL entries\n");
1456 0 : goto exit;
1457 : }
1458 :
1459 4 : if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
1460 0 : SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1461 0 : goto exit;
1462 : }
1463 :
1464 4 : mapping_length = remaining_user_sge_len;
1465 4 : phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
1466 4 : if (phys_addr == SPDK_VTOPHYS_ERROR) {
1467 0 : goto exit;
1468 : }
1469 :
1470 4 : length = spdk_min(remaining_user_sge_len, mapping_length);
1471 4 : remaining_user_sge_len -= length;
1472 4 : virt_addr = (uint8_t *)virt_addr + length;
1473 :
1474 4 : if (!pqpair->flags.disable_pcie_sgl_merge && nseg > 0 &&
1475 2 : phys_addr == (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
1476 : /* extend previous entry */
1477 0 : (*(sgl - 1)).unkeyed.length += length;
1478 0 : continue;
1479 : }
1480 :
1481 4 : sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1482 4 : sgl->unkeyed.length = length;
1483 4 : sgl->address = phys_addr;
1484 4 : sgl->unkeyed.subtype = 0;
1485 :
1486 4 : sgl++;
1487 4 : nseg++;
1488 : }
1489 : }
1490 :
1491 2 : if (nseg == 1) {
1492 : /*
1493 : * The whole transfer can be described by a single SGL descriptor.
1494 : * Use the special case described by the spec where SGL1's type is Data Block.
1495 : * This means the SGL in the tracker is not used at all, so copy the first (and only)
1496 : * SGL element into SGL1.
1497 : */
1498 1 : req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1499 1 : req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1500 1 : req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1501 : } else {
1502 : /* The SPDK NVMe driver supports only 1 SGL segment for now; this is enough because
1503 : * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
1504 : */
1505 1 : req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1506 1 : req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1507 1 : req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1508 : }
1509 :
1510 2 : SPDK_DEBUGLOG(nvme, "Number of SGL descriptors: %" PRIu32 "\n", nseg);
1511 2 : return 0;
1512 :
1513 0 : exit:
1514 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1515 0 : return -EFAULT;
1516 : }
1517 :
1518 : /**
1519 : * Build PRP list describing scattered payload buffer.
1520 : */
1521 : static int
1522 1 : nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1523 : struct nvme_tracker *tr, bool dword_aligned)
1524 : {
1525 : int rc;
1526 1 : void *virt_addr;
1527 1 : uint32_t remaining_transfer_len, length;
1528 1 : uint32_t prp_index = 0;
1529 1 : uint32_t page_size = qpair->ctrlr->page_size;
1530 :
1531 : /*
1532 : * Build scattered payloads.
1533 : */
1534 1 : assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1535 1 : assert(req->payload.reset_sgl_fn != NULL);
1536 1 : req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1537 :
1538 1 : remaining_transfer_len = req->payload_size;
1539 2 : while (remaining_transfer_len > 0) {
1540 1 : assert(req->payload.next_sge_fn != NULL);
1541 1 : rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
1542 1 : if (rc) {
1543 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1544 0 : return -EFAULT;
1545 : }
1546 :
1547 1 : length = spdk_min(remaining_transfer_len, length);
1548 :
1549 : /*
1550 : * Any incompatible SGEs should have been handled in the splitting routine,
1551 : * but assert here as an additional check.
1552 : *
1553 : * All SGEs except the last must end on a page boundary.
1554 : */
1555 1 : assert((length == remaining_transfer_len) ||
1556 : _is_page_aligned((uintptr_t)virt_addr + length, page_size));
1557 :
1558 1 : rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index, virt_addr, length, page_size);
1559 1 : if (rc) {
1560 0 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1561 0 : return rc;
1562 : }
1563 :
1564 1 : remaining_transfer_len -= length;
1565 : }
1566 :
1567 1 : SPDK_DEBUGLOG(nvme, "Number of PRP entries: %" PRIu32 "\n", prp_index);
1568 1 : return 0;
1569 : }
1570 :
1571 : typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
1572 : bool);
1573 :
1574 : static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
1575 : [NVME_PAYLOAD_TYPE_INVALID] = {
1576 : nvme_pcie_qpair_build_request_invalid, /* PRP */
1577 : nvme_pcie_qpair_build_request_invalid /* SGL */
1578 : },
1579 : [NVME_PAYLOAD_TYPE_CONTIG] = {
1580 : nvme_pcie_qpair_build_contig_request, /* PRP */
1581 : nvme_pcie_qpair_build_contig_hw_sgl_request /* SGL */
1582 : },
1583 : [NVME_PAYLOAD_TYPE_SGL] = {
1584 : nvme_pcie_qpair_build_prps_sgl_request, /* PRP */
1585 : nvme_pcie_qpair_build_hw_sgl_request /* SGL */
1586 : }
1587 : };
1588 :
1589 : static int
1590 5 : nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1591 : bool sgl_supported, bool mptr_sgl_supported, bool dword_aligned)
1592 : {
1593 : void *md_payload;
1594 5 : struct nvme_request *req = tr->req;
1595 5 : uint64_t mapping_length;
1596 :
1597 5 : if (req->payload.md) {
1598 5 : md_payload = (uint8_t *)req->payload.md + req->md_offset;
1599 5 : if (dword_aligned && ((uintptr_t)md_payload & 3)) {
1600 0 : SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload);
1601 0 : goto exit;
1602 : }
1603 :
1604 5 : mapping_length = req->md_size;
1605 5 : if (sgl_supported && mptr_sgl_supported && dword_aligned) {
1606 2 : assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
1607 2 : req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
1608 :
1609 2 : tr->meta_sgl.address = nvme_pcie_vtophys(qpair->ctrlr, md_payload, &mapping_length);
1610 2 : if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR || mapping_length != req->md_size) {
1611 1 : goto exit;
1612 : }
1613 1 : tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1614 1 : tr->meta_sgl.unkeyed.length = req->md_size;
1615 1 : tr->meta_sgl.unkeyed.subtype = 0;
1616 1 : req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
1617 : } else {
1618 3 : req->cmd.mptr = nvme_pcie_vtophys(qpair->ctrlr, md_payload, &mapping_length);
1619 3 : if (req->cmd.mptr == SPDK_VTOPHYS_ERROR || mapping_length != req->md_size) {
1620 1 : goto exit;
1621 : }
1622 : }
1623 : }
1624 :
1625 3 : return 0;
1626 :
1627 2 : exit:
1628 2 : nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1629 2 : return -EINVAL;
1630 : }
1631 :
1632 : int
1633 0 : nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
1634 : {
1635 : struct nvme_tracker *tr;
1636 0 : int rc = 0;
1637 0 : struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1638 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1639 : enum nvme_payload_type payload_type;
1640 : bool sgl_supported;
1641 : bool mptr_sgl_supported;
1642 0 : bool dword_aligned = true;
1643 :
1644 0 : if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1645 0 : nvme_ctrlr_lock(ctrlr);
1646 : }
1647 :
1648 0 : tr = TAILQ_FIRST(&pqpair->free_tr);
1649 :
1650 0 : if (tr == NULL) {
1651 0 : pqpair->stat->queued_requests++;
1652 : /* Inform the upper layer to try again later. */
1653 0 : rc = -EAGAIN;
1654 0 : goto exit;
1655 : }
1656 :
1657 0 : pqpair->stat->submitted_requests++;
1658 0 : TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
1659 0 : TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
1660 0 : pqpair->qpair.queue_depth++;
1661 0 : tr->req = req;
1662 0 : tr->cb_fn = req->cb_fn;
1663 0 : tr->cb_arg = req->cb_arg;
1664 0 : req->cmd.cid = tr->cid;
1665 : /* Use PRP by default. This field will be overridden below if needed. */
1666 0 : req->cmd.psdt = SPDK_NVME_PSDT_PRP;
1667 :
1668 0 : if (req->payload_size != 0) {
1669 0 : payload_type = nvme_payload_type(&req->payload);
1670 : /* According to the specification, PRPs shall be used for all
1671 : * Admin commands for NVMe over PCIe implementations.
1672 : */
1673 0 : sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
1674 0 : !nvme_qpair_is_admin_queue(qpair);
1675 0 : mptr_sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_MPTR_SGL_SUPPORTED) != 0 &&
1676 0 : !nvme_qpair_is_admin_queue(qpair);
1677 :
1678 0 : if (sgl_supported) {
1679 : /* Don't use SGL for DSM command */
1680 0 : if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_NO_SGL_FOR_DSM) &&
1681 : (req->cmd.opc == SPDK_NVME_OPC_DATASET_MANAGEMENT))) {
1682 0 : sgl_supported = false;
1683 : }
1684 : }
1685 :
1686 0 : if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
1687 0 : dword_aligned = false;
1688 : }
1689 :
1690 : /* If building the request or its metadata fails, do not return -EFAULT back up
1691 : * the stack. This ensures that these requests always fail via a completion
1692 : * callback, never in the context of the submission call.
1693 : */
1694 0 : rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
1695 0 : if (rc < 0) {
1696 0 : assert(rc == -EFAULT);
1697 0 : rc = 0;
1698 0 : goto exit;
1699 : }
1700 :
1701 0 : rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, mptr_sgl_supported, dword_aligned);
1702 0 : if (rc < 0) {
1703 0 : assert(rc == -EFAULT);
1704 0 : rc = 0;
1705 0 : goto exit;
1706 : }
1707 : }
1708 :
1709 0 : nvme_pcie_qpair_submit_tracker(qpair, tr);
1710 :
1711 0 : exit:
1712 0 : if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1713 0 : nvme_ctrlr_unlock(ctrlr);
1714 : }
1715 :
1716 0 : return rc;
1717 : }
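/*
 * Illustrative sketch (not part of the original source): on a PCIe controller
 * this submission path is reached through the public spdk_nvme_ns_cmd_*()
 * calls.  A minimal submit-and-poll example, assuming 'ns', a connected
 * 'qpair' and a DMA-able buffer 'buf' of at least one block already exist
 * (the example_* names are hypothetical):
 */
static void
example_io_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	bool *done = cb_arg;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_ERRLOG("I/O failed: %s\n",
			    spdk_nvme_cpl_get_status_string(&cpl->status));
	}
	*done = true;
}

static int
example_read_lba0(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buf)
{
	bool done = false;
	int rc;

	rc = spdk_nvme_ns_cmd_read(ns, qpair, buf, 0 /* starting LBA */, 1 /* LBA count */,
				   example_io_complete, &done, 0 /* io_flags */);
	if (rc != 0) {
		/* e.g. -ENOMEM: no free request; poll completions and retry later. */
		return rc;
	}

	while (!done) {
		spdk_nvme_qpair_process_completions(qpair, 0 /* no completion limit */);
	}

	return 0;
}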
1718 :
1719 : struct spdk_nvme_transport_poll_group *
1720 1 : nvme_pcie_poll_group_create(void)
1721 : {
1722 1 : struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group));
1723 :
1724 1 : if (group == NULL) {
1725 0 : SPDK_ERRLOG("Unable to allocate poll group.\n");
1726 0 : return NULL;
1727 : }
1728 :
1729 1 : return &group->group;
1730 : }
1731 :
1732 : int
1733 0 : nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
1734 : {
1735 0 : return 0;
1736 : }
1737 :
1738 : int
1739 0 : nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
1740 : {
1741 0 : return 0;
1742 : }
1743 :
1744 : int
1745 0 : nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
1746 : struct spdk_nvme_qpair *qpair)
1747 : {
1748 0 : return 0;
1749 : }
1750 :
1751 : int
1752 0 : nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
1753 : struct spdk_nvme_qpair *qpair)
1754 : {
1755 0 : struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1756 :
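	/* Detach the qpair from any group-owned stats by pointing it at a shared
	 * dummy, so the submission/completion paths can keep updating counters
	 * unconditionally.
	 */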
1757 0 : pqpair->stat = &g_dummy_stat;
1758 0 : return 0;
1759 : }
1760 :
1761 : int64_t
1762 0 : nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
1763 : uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
1764 : {
1765 : struct spdk_nvme_qpair *qpair, *tmp_qpair;
1766 0 : int32_t local_completions = 0;
1767 0 : int64_t total_completions = 0;
1768 :
1769 0 : STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
1770 0 : disconnected_qpair_cb(qpair, tgroup->group->ctx);
1771 : }
1772 :
1773 0 : STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
1774 0 : local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair);
1775 0 : if (spdk_unlikely(local_completions < 0)) {
1776 0 : disconnected_qpair_cb(qpair, tgroup->group->ctx);
1777 0 : total_completions = -ENXIO;
1778 0 : } else if (spdk_likely(total_completions >= 0)) {
1779 0 : total_completions += local_completions;
1780 : }
1781 : }
1782 :
1783 0 : return total_completions;
1784 : }
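/*
 * Illustrative sketch (not part of the original source): this transport-level
 * handler is driven through the generic poll group API.  A minimal polling
 * loop, assuming 'qpair' was created with opts.create_only = true so that it
 * can be added to the group before being connected, and assuming the
 * two-argument form of spdk_nvme_poll_group_create() (ctx, accel fn table).
 * The example_* names are hypothetical.
 */
static void
example_disconnected_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	/* The qpair was found disconnected; tear it down or reconnect it here. */
}

static int
example_poll_group_loop(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
			volatile bool *stop)
{
	struct spdk_nvme_poll_group *pg;
	int rc;

	pg = spdk_nvme_poll_group_create(NULL, NULL);
	if (pg == NULL) {
		return -ENOMEM;
	}

	rc = spdk_nvme_poll_group_add(pg, qpair);
	if (rc == 0) {
		rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	}

	while (rc == 0 && !*stop) {
		/* 0 = no per-qpair completion limit. */
		spdk_nvme_poll_group_process_completions(pg, 0, example_disconnected_cb);
	}

	spdk_nvme_poll_group_remove(pg, qpair);
	spdk_nvme_poll_group_destroy(pg);
	return rc;
}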
1785 :
1786 : int
1787 1 : nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
1788 : {
1789 1 : if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
1790 0 : return -EBUSY;
1791 : }
1792 :
1793 1 : free(tgroup);
1794 :
1795 1 : return 0;
1796 : }
1797 :
1798 : int
1799 3 : nvme_pcie_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup,
1800 : struct spdk_nvme_transport_poll_group_stat **_stats)
1801 : {
1802 : struct nvme_pcie_poll_group *group;
1803 : struct spdk_nvme_transport_poll_group_stat *stats;
1804 :
1805 3 : if (tgroup == NULL || _stats == NULL) {
1806 2 : SPDK_ERRLOG("Invalid stats or group pointer\n");
1807 2 : return -EINVAL;
1808 : }
1809 :
1810 1 : stats = calloc(1, sizeof(*stats));
1811 1 : if (!stats) {
1812 0 : SPDK_ERRLOG("Can't allocate memory for stats\n");
1813 0 : return -ENOMEM;
1814 : }
1815 1 : stats->trtype = SPDK_NVME_TRANSPORT_PCIE;
1816 1 : group = SPDK_CONTAINEROF(tgroup, struct nvme_pcie_poll_group, group);
1817 1 : memcpy(&stats->pcie, &group->stats, sizeof(group->stats));
1818 :
1819 1 : *_stats = stats;
1820 :
1821 1 : return 0;
1822 : }
1823 :
1824 : void
1825 1 : nvme_pcie_poll_group_free_stats(struct spdk_nvme_transport_poll_group *tgroup,
1826 : struct spdk_nvme_transport_poll_group_stat *stats)
1827 : {
1828 1 : free(stats);
1829 1 : }
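/*
 * Illustrative sketch (not part of the original source): applications normally
 * read these counters through the generic spdk_nvme_poll_group_get_stats() /
 * spdk_nvme_poll_group_free_stats() API, which aggregates the per-transport
 * stats returned above.  The num_transports/transport_stat field names are
 * assumed from spdk/nvme.h; the example_* name is hypothetical.
 */
static void
example_dump_pcie_stats(struct spdk_nvme_poll_group *pg)
{
	struct spdk_nvme_poll_group_stat *stat = NULL;
	uint32_t i;

	if (spdk_nvme_poll_group_get_stats(pg, &stat) != 0) {
		return;
	}

	for (i = 0; i < stat->num_transports; i++) {
		if (stat->transport_stat[i]->trtype == SPDK_NVME_TRANSPORT_PCIE) {
			printf("pcie: submitted=%" PRIu64 " queued=%" PRIu64 "\n",
			       stat->transport_stat[i]->pcie.submitted_requests,
			       stat->transport_stat[i]->pcie.queued_requests);
		}
	}

	spdk_nvme_poll_group_free_stats(pg, stat);
}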
1830 :
1831 : static void
1832 0 : nvme_pcie_trace(void)
1833 : {
1834 0 : struct spdk_trace_tpoint_opts opts[] = {
1835 : {
1836 : "NVME_PCIE_SUBMIT", TRACE_NVME_PCIE_SUBMIT,
1837 : OWNER_TYPE_NVME_PCIE_QP, OBJECT_NVME_PCIE_REQ, 1,
1838 : { { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
1839 : { "cid", SPDK_TRACE_ARG_TYPE_INT, 4 },
1840 : { "opc", SPDK_TRACE_ARG_TYPE_INT, 4 },
1841 : { "dw10", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1842 : { "dw11", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1843 : { "dw12", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1844 : { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
1845 : }
1846 : },
1847 : {
1848 : "NVME_PCIE_COMPLETE", TRACE_NVME_PCIE_COMPLETE,
1849 : OWNER_TYPE_NVME_PCIE_QP, OBJECT_NVME_PCIE_REQ, 0,
1850 : { { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
1851 : { "cid", SPDK_TRACE_ARG_TYPE_INT, 4 },
1852 : { "cpl", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1853 : { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
1854 : }
1855 : },
1856 : };
1857 :
1858 0 : spdk_trace_register_object(OBJECT_NVME_PCIE_REQ, 'p');
1859 0 : spdk_trace_register_owner_type(OWNER_TYPE_NVME_PCIE_QP, 'q');
1860 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
1861 0 : }
1862 2 : SPDK_TRACE_REGISTER_FN(nvme_pcie_trace, "nvme_pcie", TRACE_GROUP_NVME_PCIE)
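/*
 * Note: the NVME_PCIE_SUBMIT/NVME_PCIE_COMPLETE tracepoints registered above
 * are consumed through SPDK's tracing framework; they are typically enabled
 * when starting an SPDK application (e.g. via its --tpoint-group option with
 * the "nvme_pcie" group) and then inspected with the spdk_trace tool.  Exact
 * option names may vary between applications and SPDK versions.
 */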