LCOV - code coverage report
Current view: top level - lib/nvmf - rdma.c (source / functions)
Test:         ut_cov_unit.info
Date:         2024-12-16 07:09:28
Coverage:     Lines: 808 / 2891 (27.9 %)    Functions: 31 / 117 (26.5 %)

          Line data    Source code
       1             : /*   SPDX-License-Identifier: BSD-3-Clause
       2             :  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
       3             :  *   Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved.
       4             :  *   Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
       5             :  */
       6             : 
       7             : #include "spdk/stdinc.h"
       8             : 
       9             : #include "spdk/config.h"
      10             : #include "spdk/thread.h"
      11             : #include "spdk/likely.h"
      12             : #include "spdk/nvmf_transport.h"
      13             : #include "spdk/string.h"
      14             : #include "spdk/trace.h"
      15             : #include "spdk/tree.h"
      16             : #include "spdk/util.h"
      17             : 
      18             : #include "spdk_internal/assert.h"
      19             : #include "spdk/log.h"
      20             : #include "spdk_internal/rdma_provider.h"
      21             : #include "spdk_internal/rdma_utils.h"
      22             : 
      23             : #include "nvmf_internal.h"
      24             : #include "transport.h"
      25             : 
      26             : #include "spdk_internal/trace_defs.h"
      27             : 
      28             : struct spdk_nvme_rdma_hooks g_nvmf_hooks = {};
      29             : const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma;
      30             : 
       31             : /*
       32             :  * RDMA Connection Resource Defaults
       33             :  */
      34             : #define NVMF_DEFAULT_MSDBD              16
      35             : #define NVMF_DEFAULT_TX_SGE             SPDK_NVMF_MAX_SGL_ENTRIES
      36             : #define NVMF_DEFAULT_RSP_SGE            1
      37             : #define NVMF_DEFAULT_RX_SGE             2
      38             : 
      39             : #define NVMF_RDMA_MAX_EVENTS_PER_POLL   32
      40             : 
      41             : SPDK_STATIC_ASSERT(NVMF_DEFAULT_MSDBD <= SPDK_NVMF_MAX_SGL_ENTRIES,
      42             :                    "MSDBD must not exceed SPDK_NVMF_MAX_SGL_ENTRIES");
      43             : 
      44             : /* The RDMA completion queue size */
      45             : #define DEFAULT_NVMF_RDMA_CQ_SIZE       4096
      46             : #define MAX_WR_PER_QP(queue_depth)      (queue_depth * 3 + 2)
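                      : /* A rough reading of this budget (the exact per-request WR accounting is not
                      :  * spelled out here): with a queue depth of 128, MAX_WR_PER_QP(128) =
                      :  * 128 * 3 + 2 = 386 work requests, i.e. roughly three send-queue WRs budgeted
                      :  * per outstanding request plus a small fixed margin.
                      :  */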
      47             : 
      48             : enum spdk_nvmf_rdma_request_state {
      49             :         /* The request is not currently in use */
      50             :         RDMA_REQUEST_STATE_FREE = 0,
      51             : 
      52             :         /* Initial state when request first received */
      53             :         RDMA_REQUEST_STATE_NEW,
      54             : 
      55             :         /* The request is queued until a data buffer is available. */
      56             :         RDMA_REQUEST_STATE_NEED_BUFFER,
      57             : 
      58             :         /* The request is waiting on RDMA queue depth availability
      59             :          * to transfer data from the host to the controller.
      60             :          */
      61             :         RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
      62             : 
      63             :         /* The request is currently transferring data from the host to the controller. */
      64             :         RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
      65             : 
      66             :         /* The request is ready to execute at the block device */
      67             :         RDMA_REQUEST_STATE_READY_TO_EXECUTE,
      68             : 
      69             :         /* The request is currently executing at the block device */
      70             :         RDMA_REQUEST_STATE_EXECUTING,
      71             : 
      72             :         /* The request finished executing at the block device */
      73             :         RDMA_REQUEST_STATE_EXECUTED,
      74             : 
      75             :         /* The request is waiting on RDMA queue depth availability
      76             :          * to transfer data from the controller to the host.
      77             :          */
      78             :         RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
      79             : 
      80             :         /* The request is waiting on RDMA queue depth availability
      81             :          * to send response to the host.
      82             :          */
      83             :         RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING,
      84             : 
      85             :         /* The request is ready to send a completion */
      86             :         RDMA_REQUEST_STATE_READY_TO_COMPLETE,
      87             : 
      88             :         /* The request is currently transferring data from the controller to the host. */
      89             :         RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
      90             : 
      91             :         /* The request currently has an outstanding completion without an
      92             :          * associated data transfer.
      93             :          */
      94             :         RDMA_REQUEST_STATE_COMPLETING,
      95             : 
      96             :         /* The request completed and can be marked free. */
      97             :         RDMA_REQUEST_STATE_COMPLETED,
      98             : 
      99             :         /* Terminator */
     100             :         RDMA_REQUEST_NUM_STATES,
     101             : };
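                      : /* Rough sketch of a request's life cycle, inferred from the state comments
                      :  * above; exact transitions depend on the command's data direction and on
                      :  * queue-depth availability:
                      :  *
                      :  *   FREE -> NEW -> NEED_BUFFER
                      :  *        -> [DATA_TRANSFER_TO_CONTROLLER_PENDING -> TRANSFERRING_HOST_TO_CONTROLLER]
                      :  *        -> READY_TO_EXECUTE -> EXECUTING -> EXECUTED
                      :  *        -> [DATA_TRANSFER_TO_HOST_PENDING -> TRANSFERRING_CONTROLLER_TO_HOST]
                      :  *           or [READY_TO_COMPLETE_PENDING -> READY_TO_COMPLETE -> COMPLETING]
                      :  *        -> COMPLETED -> FREE
                      :  */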
     102             : 
     103             : static void
     104           0 : nvmf_trace(void)
     105             : {
     106           0 :         spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
     107             : 
     108           0 :         struct spdk_trace_tpoint_opts opts[] = {
     109             :                 {
     110             :                         "RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW,
     111             :                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 1,
     112             :                         {
     113             :                                 { "qpair", SPDK_TRACE_ARG_TYPE_PTR, 8 },
     114             :                                 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
     115             :                         }
     116             :                 },
     117             :                 {
     118             :                         "RDMA_REQ_COMPLETED", TRACE_RDMA_REQUEST_STATE_COMPLETED,
     119             :                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     120             :                         {
     121             :                                 { "qpair", SPDK_TRACE_ARG_TYPE_PTR, 8 },
     122             :                                 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
     123             :                         }
     124             :                 },
     125             :         };
     126             : 
     127           0 :         spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
     128           0 :         spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
     129             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     130             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     131           0 :         spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H",
     132             :                                         TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
     133             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     134             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     135           0 :         spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C",
     136             :                                         TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
     137             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     138             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     139           0 :         spdk_trace_register_description("RDMA_REQ_TX_H2C",
     140             :                                         TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
     141             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     142             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     143           0 :         spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE",
     144             :                                         TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
     145             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     146             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     147           0 :         spdk_trace_register_description("RDMA_REQ_EXECUTING",
     148             :                                         TRACE_RDMA_REQUEST_STATE_EXECUTING,
     149             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     150             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     151           0 :         spdk_trace_register_description("RDMA_REQ_EXECUTED",
     152             :                                         TRACE_RDMA_REQUEST_STATE_EXECUTED,
     153             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     154             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     155           0 :         spdk_trace_register_description("RDMA_REQ_RDY2COMPL_PEND",
     156             :                                         TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING,
     157             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     158             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     159           0 :         spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL",
     160             :                                         TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
     161             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     162             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     163           0 :         spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H",
     164             :                                         TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
     165             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     166             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     167           0 :         spdk_trace_register_description("RDMA_REQ_COMPLETING",
     168             :                                         TRACE_RDMA_REQUEST_STATE_COMPLETING,
     169             :                                         OWNER_TYPE_NONE, OBJECT_NVMF_RDMA_IO, 0,
     170             :                                         SPDK_TRACE_ARG_TYPE_PTR, "qpair");
     171             : 
     172           0 :         spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE,
     173             :                                         OWNER_TYPE_NONE, OBJECT_NONE, 0,
     174             :                                         SPDK_TRACE_ARG_TYPE_INT, "");
     175           0 :         spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT,
     176             :                                         OWNER_TYPE_NONE, OBJECT_NONE, 0,
     177             :                                         SPDK_TRACE_ARG_TYPE_INT, "type");
     178           0 :         spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT,
     179             :                                         OWNER_TYPE_NONE, OBJECT_NONE, 0,
     180             :                                         SPDK_TRACE_ARG_TYPE_INT, "type");
     181           0 :         spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT,
     182             :                                         OWNER_TYPE_NONE, OBJECT_NONE, 0,
     183             :                                         SPDK_TRACE_ARG_TYPE_INT, "");
     184           0 :         spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY,
     185             :                                         OWNER_TYPE_NONE, OBJECT_NONE, 0,
     186             :                                         SPDK_TRACE_ARG_TYPE_INT, "");
     187             : 
     188           0 :         spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_START, OBJECT_NVMF_RDMA_IO, 1);
     189           0 :         spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_DONE, OBJECT_NVMF_RDMA_IO, 0);
     190           0 : }
     191           2 : SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA)
     192             : 
     193             : enum spdk_nvmf_rdma_wr_type {
     194             :         RDMA_WR_TYPE_RECV,
     195             :         RDMA_WR_TYPE_SEND,
     196             :         RDMA_WR_TYPE_DATA,
     197             : };
     198             : 
     199             : struct spdk_nvmf_rdma_wr {
     200             :         /* Uses enum spdk_nvmf_rdma_wr_type */
     201             :         uint8_t type;
     202             : };
     203             : 
     204             : /* This structure holds commands as they are received off the wire.
     205             :  * It must be dynamically paired with a full request object
     206             :  * (spdk_nvmf_rdma_request) to service a request. It is separate
     207             :  * from the request because RDMA does not appear to order
     208             :  * completions, so occasionally we'll get a new incoming
     209             :  * command when there aren't any free request objects.
     210             :  */
     211             : struct spdk_nvmf_rdma_recv {
     212             :         struct ibv_recv_wr                      wr;
     213             :         struct ibv_sge                          sgl[NVMF_DEFAULT_RX_SGE];
     214             : 
     215             :         struct spdk_nvmf_rdma_qpair             *qpair;
     216             : 
     217             :         /* In-capsule data buffer */
     218             :         uint8_t                                 *buf;
     219             : 
     220             :         struct spdk_nvmf_rdma_wr                rdma_wr;
     221             :         uint64_t                                receive_tsc;
     222             : 
     223             :         STAILQ_ENTRY(spdk_nvmf_rdma_recv)       link;
     224             : };
     225             : 
     226             : struct spdk_nvmf_rdma_request_data {
     227             :         struct ibv_send_wr              wr;
     228             :         struct ibv_sge                  sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
     229             : };
     230             : 
     231             : struct spdk_nvmf_rdma_request {
     232             :         struct spdk_nvmf_request                req;
     233             : 
     234             :         bool                                    fused_failed;
     235             : 
     236             :         struct spdk_nvmf_rdma_wr                data_wr;
     237             :         struct spdk_nvmf_rdma_wr                rsp_wr;
     238             : 
     239             :         /* Uses enum spdk_nvmf_rdma_request_state */
     240             :         uint8_t                                 state;
     241             : 
     242             :         /* Data offset in req.iov */
     243             :         uint32_t                                offset;
     244             : 
     245             :         struct spdk_nvmf_rdma_recv              *recv;
     246             : 
     247             :         struct {
     248             :                 struct  ibv_send_wr             wr;
     249             :                 struct  ibv_sge                 sgl[NVMF_DEFAULT_RSP_SGE];
     250             :         } rsp;
     251             : 
     252             :         uint16_t                                iovpos;
     253             :         uint16_t                                num_outstanding_data_wr;
     254             :         /* Used to split Write IO with multi SGL payload */
     255             :         uint16_t                                num_remaining_data_wr;
     256             :         uint64_t                                receive_tsc;
     257             :         struct spdk_nvmf_rdma_request           *fused_pair;
     258             :         STAILQ_ENTRY(spdk_nvmf_rdma_request)    state_link;
     259             :         struct ibv_send_wr                      *remaining_tranfer_in_wrs;
     260             :         struct ibv_send_wr                      *transfer_wr;
     261             :         struct spdk_nvmf_rdma_request_data      data;
     262             : };
     263             : 
     264             : struct spdk_nvmf_rdma_resource_opts {
     265             :         struct spdk_nvmf_rdma_qpair     *qpair;
      266             :         /* qp points either to a spdk_rdma_provider_qp or a spdk_rdma_provider_srq object, depending on the value of shared. */
     267             :         void                            *qp;
     268             :         struct spdk_rdma_utils_mem_map  *map;
     269             :         uint32_t                        max_queue_depth;
     270             :         uint32_t                        in_capsule_data_size;
     271             :         bool                            shared;
     272             : };
     273             : 
     274             : struct spdk_nvmf_rdma_resources {
     275             :         /* Array of size "max_queue_depth" containing RDMA requests. */
     276             :         struct spdk_nvmf_rdma_request           *reqs;
     277             : 
     278             :         /* Array of size "max_queue_depth" containing RDMA recvs. */
     279             :         struct spdk_nvmf_rdma_recv              *recvs;
     280             : 
     281             :         /* Array of size "max_queue_depth" containing 64 byte capsules
     282             :          * used for receive.
     283             :          */
     284             :         union nvmf_h2c_msg                      *cmds;
     285             : 
     286             :         /* Array of size "max_queue_depth" containing 16 byte completions
     287             :          * to be sent back to the user.
     288             :          */
     289             :         union nvmf_c2h_msg                      *cpls;
     290             : 
     291             :         /* Array of size "max_queue_depth * InCapsuleDataSize" containing
     292             :          * buffers to be used for in capsule data.
     293             :          */
     294             :         void                                    *bufs;
     295             : 
     296             :         /* Receives that are waiting for a request object */
     297             :         STAILQ_HEAD(, spdk_nvmf_rdma_recv)      incoming_queue;
     298             : 
     299             :         /* Queue to track free requests */
     300             :         STAILQ_HEAD(, spdk_nvmf_rdma_request)   free_queue;
     301             : };
     302             : 
     303             : typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair);
     304             : 
     305             : typedef void (*spdk_poller_destroy_cb)(void *ctx);
     306             : 
     307             : struct spdk_nvmf_rdma_ibv_event_ctx {
     308             :         struct spdk_nvmf_rdma_qpair                     *rqpair;
     309             : };
     310             : 
     311             : struct spdk_nvmf_rdma_qpair {
     312             :         struct spdk_nvmf_qpair                  qpair;
     313             : 
     314             :         struct spdk_nvmf_rdma_device            *device;
     315             :         struct spdk_nvmf_rdma_poller            *poller;
     316             : 
     317             :         struct spdk_rdma_provider_qp            *rdma_qp;
     318             :         struct rdma_cm_id                       *cm_id;
     319             :         struct spdk_rdma_provider_srq           *srq;
     320             :         struct rdma_cm_id                       *listen_id;
     321             : 
      322             :         /* Cache the QP number to speed up QP lookups in the RB tree. */
     323             :         uint32_t                                qp_num;
     324             : 
     325             :         /* The maximum number of I/O outstanding on this connection at one time */
     326             :         uint16_t                                max_queue_depth;
     327             : 
     328             :         /* The maximum number of active RDMA READ and ATOMIC operations at one time */
     329             :         uint16_t                                max_read_depth;
     330             : 
     331             :         /* The maximum number of RDMA SEND operations at one time */
     332             :         uint32_t                                max_send_depth;
     333             : 
     334             :         /* The current number of outstanding WRs from this qpair's
     335             :          * recv queue. Should not exceed device->attr.max_queue_depth.
     336             :          */
     337             :         uint16_t                                current_recv_depth;
     338             : 
     339             :         /* The current number of active RDMA READ operations */
     340             :         uint16_t                                current_read_depth;
     341             : 
     342             :         /* The current number of posted WRs from this qpair's
     343             :          * send queue. Should not exceed max_send_depth.
     344             :          */
     345             :         uint32_t                                current_send_depth;
     346             : 
     347             :         /* The maximum number of SGEs per WR on the send queue */
     348             :         uint32_t                                max_send_sge;
     349             : 
     350             :         /* The maximum number of SGEs per WR on the recv queue */
     351             :         uint32_t                                max_recv_sge;
     352             : 
     353             :         struct spdk_nvmf_rdma_resources         *resources;
     354             : 
     355             :         STAILQ_HEAD(, spdk_nvmf_rdma_request)   pending_rdma_read_queue;
     356             : 
     357             :         STAILQ_HEAD(, spdk_nvmf_rdma_request)   pending_rdma_write_queue;
     358             : 
     359             :         STAILQ_HEAD(, spdk_nvmf_rdma_request)   pending_rdma_send_queue;
     360             : 
     361             :         /* Number of requests not in the free state */
     362             :         uint32_t                                qd;
     363             : 
     364             :         bool                                    ibv_in_error_state;
     365             : 
     366             :         RB_ENTRY(spdk_nvmf_rdma_qpair)          node;
     367             : 
     368             :         STAILQ_ENTRY(spdk_nvmf_rdma_qpair)      recv_link;
     369             : 
     370             :         STAILQ_ENTRY(spdk_nvmf_rdma_qpair)      send_link;
     371             : 
      372             :         /* Points to the request that has fuse bits set to
     373             :          * SPDK_NVME_CMD_FUSE_FIRST, when the qpair is waiting
     374             :          * for the request that has SPDK_NVME_CMD_FUSE_SECOND.
     375             :          */
     376             :         struct spdk_nvmf_rdma_request           *fused_first;
     377             : 
     378             :         /*
     379             :          * io_channel which is used to destroy qpair when it is removed from poll group
     380             :          */
     381             :         struct spdk_io_channel          *destruct_channel;
     382             : 
     383             :         /* ctx for async processing of last_wqe_reached event */
     384             :         struct spdk_nvmf_rdma_ibv_event_ctx     *last_wqe_reached_ctx;
     385             : 
     386             :         /* Lets us know that we have received the last_wqe event. */
     387             :         bool                                    last_wqe_reached;
     388             : 
     389             :         /* Indicate that nvmf_rdma_close_qpair is called */
     390             :         bool                                    to_close;
     391             : };
     392             : 
     393             : struct spdk_nvmf_rdma_poller_stat {
     394             :         uint64_t                                completions;
     395             :         uint64_t                                polls;
     396             :         uint64_t                                idle_polls;
     397             :         uint64_t                                requests;
     398             :         uint64_t                                request_latency;
     399             :         uint64_t                                pending_free_request;
     400             :         uint64_t                                pending_rdma_read;
     401             :         uint64_t                                pending_rdma_write;
     402             :         uint64_t                                pending_rdma_send;
     403             :         struct spdk_rdma_provider_qp_stats      qp_stats;
     404             : };
     405             : 
     406             : struct spdk_nvmf_rdma_poller {
     407             :         struct spdk_nvmf_rdma_device            *device;
     408             :         struct spdk_nvmf_rdma_poll_group        *group;
     409             : 
     410             :         int                                     num_cqe;
     411             :         int                                     required_num_wr;
     412             :         struct ibv_cq                           *cq;
     413             : 
     414             :         /* The maximum number of I/O outstanding on the shared receive queue at one time */
     415             :         uint16_t                                max_srq_depth;
     416             :         bool                                    need_destroy;
     417             : 
     418             :         /* Shared receive queue */
     419             :         struct spdk_rdma_provider_srq           *srq;
     420             : 
     421             :         struct spdk_nvmf_rdma_resources         *resources;
     422             :         struct spdk_nvmf_rdma_poller_stat       stat;
     423             : 
     424             :         spdk_poller_destroy_cb                  destroy_cb;
     425             :         void                                    *destroy_cb_ctx;
     426             : 
     427             :         RB_HEAD(qpairs_tree, spdk_nvmf_rdma_qpair) qpairs;
     428             : 
     429             :         STAILQ_HEAD(, spdk_nvmf_rdma_qpair)     qpairs_pending_recv;
     430             : 
     431             :         STAILQ_HEAD(, spdk_nvmf_rdma_qpair)     qpairs_pending_send;
     432             : 
     433             :         TAILQ_ENTRY(spdk_nvmf_rdma_poller)      link;
     434             : };
     435             : 
     436             : struct spdk_nvmf_rdma_poll_group_stat {
     437             :         uint64_t                                pending_data_buffer;
     438             : };
     439             : 
     440             : struct spdk_nvmf_rdma_poll_group {
     441             :         struct spdk_nvmf_transport_poll_group           group;
     442             :         struct spdk_nvmf_rdma_poll_group_stat           stat;
     443             :         TAILQ_HEAD(, spdk_nvmf_rdma_poller)             pollers;
     444             :         TAILQ_ENTRY(spdk_nvmf_rdma_poll_group)          link;
     445             : };
     446             : 
     447             : struct spdk_nvmf_rdma_conn_sched {
     448             :         struct spdk_nvmf_rdma_poll_group *next_admin_pg;
     449             :         struct spdk_nvmf_rdma_poll_group *next_io_pg;
     450             : };
     451             : 
     452             : /* Assuming rdma_cm uses just one protection domain per ibv_context. */
     453             : struct spdk_nvmf_rdma_device {
     454             :         struct ibv_device_attr                  attr;
     455             :         struct ibv_context                      *context;
     456             : 
     457             :         struct spdk_rdma_utils_mem_map          *map;
     458             :         struct ibv_pd                           *pd;
     459             : 
     460             :         int                                     num_srq;
     461             :         bool                                    need_destroy;
     462             :         bool                                    ready_to_destroy;
     463             :         bool                                    is_ready;
     464             : 
     465             :         TAILQ_ENTRY(spdk_nvmf_rdma_device)      link;
     466             : };
     467             : 
     468             : struct spdk_nvmf_rdma_port {
     469             :         const struct spdk_nvme_transport_id     *trid;
     470             :         struct rdma_cm_id                       *id;
     471             :         struct spdk_nvmf_rdma_device            *device;
     472             :         TAILQ_ENTRY(spdk_nvmf_rdma_port)        link;
     473             : };
     474             : 
     475             : struct rdma_transport_opts {
     476             :         int             num_cqe;
     477             :         uint32_t        max_srq_depth;
     478             :         bool            no_srq;
     479             :         bool            no_wr_batching;
     480             :         int             acceptor_backlog;
     481             : };
     482             : 
     483             : struct spdk_nvmf_rdma_transport {
     484             :         struct spdk_nvmf_transport      transport;
     485             :         struct rdma_transport_opts      rdma_opts;
     486             : 
     487             :         struct spdk_nvmf_rdma_conn_sched conn_sched;
     488             : 
     489             :         struct rdma_event_channel       *event_channel;
     490             : 
     491             :         struct spdk_mempool             *data_wr_pool;
     492             : 
     493             :         struct spdk_poller              *accept_poller;
     494             : 
     495             :         /* fields used to poll RDMA/IB events */
     496             :         nfds_t                  npoll_fds;
     497             :         struct pollfd           *poll_fds;
     498             : 
     499             :         TAILQ_HEAD(, spdk_nvmf_rdma_device)     devices;
     500             :         TAILQ_HEAD(, spdk_nvmf_rdma_port)       ports;
     501             :         TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups;
     502             : 
      503             :         /* ports that were removed unexpectedly and whose listeners need to be retried */
     504             :         TAILQ_HEAD(, spdk_nvmf_rdma_port)               retry_ports;
     505             : };
     506             : 
     507             : struct poller_manage_ctx {
     508             :         struct spdk_nvmf_rdma_transport         *rtransport;
     509             :         struct spdk_nvmf_rdma_poll_group        *rgroup;
     510             :         struct spdk_nvmf_rdma_poller            *rpoller;
     511             :         struct spdk_nvmf_rdma_device            *device;
     512             : 
     513             :         struct spdk_thread                      *thread;
     514             :         volatile int                            *inflight_op_counter;
     515             : };
     516             : 
     517             : static const struct spdk_json_object_decoder rdma_transport_opts_decoder[] = {
     518             :         {
     519             :                 "num_cqe", offsetof(struct rdma_transport_opts, num_cqe),
     520             :                 spdk_json_decode_int32, true
     521             :         },
     522             :         {
     523             :                 "max_srq_depth", offsetof(struct rdma_transport_opts, max_srq_depth),
     524             :                 spdk_json_decode_uint32, true
     525             :         },
     526             :         {
     527             :                 "no_srq", offsetof(struct rdma_transport_opts, no_srq),
     528             :                 spdk_json_decode_bool, true
     529             :         },
     530             :         {
     531             :                 "no_wr_batching", offsetof(struct rdma_transport_opts, no_wr_batching),
     532             :                 spdk_json_decode_bool, true
     533             :         },
     534             :         {
     535             :                 "acceptor_backlog", offsetof(struct rdma_transport_opts, acceptor_backlog),
     536             :                 spdk_json_decode_int32, true
     537             :         },
     538             : };
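                      : /* A hypothetical JSON fragment these decoders would accept; every entry above
                      :  * is marked optional (the trailing 'true'), and the values shown are
                      :  * illustrative only:
                      :  *
                      :  *   {
                      :  *     "num_cqe": 4096,
                      :  *     "max_srq_depth": 4096,
                      :  *     "no_srq": false,
                      :  *     "no_wr_batching": false,
                      :  *     "acceptor_backlog": 100
                      :  *   }
                      :  */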
     539             : 
     540             : static int
     541           2 : nvmf_rdma_qpair_compare(struct spdk_nvmf_rdma_qpair *rqpair1, struct spdk_nvmf_rdma_qpair *rqpair2)
     542             : {
     543           2 :         return rqpair1->qp_num < rqpair2->qp_num ? -1 : rqpair1->qp_num > rqpair2->qp_num;
     544             : }
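                      : /* The comparator returns -1, 0, or 1: the '>' comparison evaluates to 1 when
                      :  * rqpair1 is greater and to 0 when the keys are equal, which is the three-way
                      :  * ordering RB_GENERATE_STATIC below expects for keeping qpairs sorted by qp_num.
                      :  */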
     545             : 
     546           0 : RB_GENERATE_STATIC(qpairs_tree, spdk_nvmf_rdma_qpair, node, nvmf_rdma_qpair_compare);
     547             : 
     548             : static bool nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
     549             :                                       struct spdk_nvmf_rdma_request *rdma_req);
     550             : 
     551             : static void _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
     552             :                                  struct spdk_nvmf_rdma_poller *rpoller);
     553             : 
     554             : static void _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
     555             :                                  struct spdk_nvmf_rdma_poller *rpoller);
     556             : 
     557             : static void _nvmf_rdma_remove_destroyed_device(void *c);
     558             : 
     559             : static inline enum spdk_nvme_media_error_status_code
     560           0 : nvmf_rdma_dif_error_to_compl_status(uint8_t err_type) {
     561           0 :         enum spdk_nvme_media_error_status_code result;
     562           0 :         switch (err_type)
     563             :         {
     564             :         case SPDK_DIF_REFTAG_ERROR:
     565           0 :                 result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR;
     566           0 :                 break;
     567             :         case SPDK_DIF_APPTAG_ERROR:
     568           0 :                 result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR;
     569           0 :                 break;
     570             :         case SPDK_DIF_GUARD_ERROR:
     571           0 :                 result = SPDK_NVME_SC_GUARD_CHECK_ERROR;
     572           0 :                 break;
     573             :         default:
     574           0 :                 SPDK_UNREACHABLE();
     575             :         }
     576             : 
     577           0 :         return result;
     578           0 : }
     579             : 
     580             : /*
     581             :  * Return data_wrs to pool starting from \b data_wr
     582             :  * Request's own response and data WR are excluded
     583             :  */
     584             : static void
     585           7 : _nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
     586             :                              struct ibv_send_wr *data_wr,
     587             :                              struct spdk_mempool *pool)
     588             : {
     589           7 :         struct spdk_nvmf_rdma_request_data      *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
     590           7 :         struct spdk_nvmf_rdma_request_data      *nvmf_data;
     591           7 :         struct ibv_send_wr                      *next_send_wr;
     592           7 :         uint64_t                                req_wrid = (uint64_t)&rdma_req->data_wr;
     593           7 :         uint32_t                                num_wrs = 0;
     594             : 
     595          15 :         while (data_wr && data_wr->wr_id == req_wrid) {
     596           8 :                 nvmf_data = SPDK_CONTAINEROF(data_wr, struct spdk_nvmf_rdma_request_data, wr);
     597           8 :                 memset(nvmf_data->sgl, 0, sizeof(data_wr->sg_list[0]) * data_wr->num_sge);
     598           8 :                 data_wr->num_sge = 0;
     599           8 :                 next_send_wr = data_wr->next;
     600           8 :                 if (data_wr != &rdma_req->data.wr) {
     601           1 :                         data_wr->next = NULL;
     602           1 :                         assert(num_wrs < SPDK_NVMF_MAX_SGL_ENTRIES);
     603           1 :                         work_requests[num_wrs] = nvmf_data;
     604           1 :                         num_wrs++;
     605           1 :                 }
     606           8 :                 data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL : next_send_wr;
     607             :         }
     608             : 
     609           7 :         if (num_wrs) {
     610           1 :                 spdk_mempool_put_bulk(pool, (void **) work_requests, num_wrs);
     611           1 :         }
     612           7 : }
     613             : 
     614             : static void
     615           7 : nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
     616             :                             struct spdk_nvmf_rdma_transport *rtransport)
     617             : {
     618           7 :         rdma_req->num_outstanding_data_wr = 0;
     619             : 
     620           7 :         _nvmf_rdma_request_free_data(rdma_req, rdma_req->transfer_wr, rtransport->data_wr_pool);
     621             : 
     622           7 :         if (rdma_req->remaining_tranfer_in_wrs) {
     623           0 :                 _nvmf_rdma_request_free_data(rdma_req, rdma_req->remaining_tranfer_in_wrs,
     624           0 :                                              rtransport->data_wr_pool);
     625           0 :                 rdma_req->remaining_tranfer_in_wrs = NULL;
     626           0 :         }
     627             : 
     628           7 :         rdma_req->data.wr.next = NULL;
     629           7 :         rdma_req->rsp.wr.next = NULL;
     630           7 : }
     631             : 
     632             : static void
     633           0 : nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req)
     634             : {
     635           0 :         SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool);
     636           0 :         if (req->req.cmd) {
     637           0 :                 SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode);
     638           0 :         }
     639           0 :         if (req->recv) {
      640           0 :                 SPDK_ERRLOG("\t\tRequest recv wr_id %lu\n", req->recv->wr.wr_id);
     641           0 :         }
     642           0 : }
     643             : 
     644             : static void
     645           0 : nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair)
     646             : {
     647           0 :         int i;
     648             : 
     649           0 :         SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid);
     650           0 :         for (i = 0; i < rqpair->max_queue_depth; i++) {
     651           0 :                 if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) {
     652           0 :                         nvmf_rdma_dump_request(&rqpair->resources->reqs[i]);
     653           0 :                 }
     654           0 :         }
     655           0 : }
     656             : 
     657             : static void
     658           1 : nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources)
     659             : {
     660           1 :         spdk_free(resources->cmds);
     661           1 :         spdk_free(resources->cpls);
     662           1 :         spdk_free(resources->bufs);
     663           1 :         spdk_free(resources->reqs);
     664           1 :         spdk_free(resources->recvs);
     665           1 :         free(resources);
     666           1 : }
     667             : 
     668             : 
     669             : static struct spdk_nvmf_rdma_resources *
     670           1 : nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts)
     671             : {
     672           1 :         struct spdk_nvmf_rdma_resources         *resources;
     673           1 :         struct spdk_nvmf_rdma_request           *rdma_req;
     674           1 :         struct spdk_nvmf_rdma_recv              *rdma_recv;
     675           1 :         struct spdk_rdma_provider_qp            *qp = NULL;
     676           1 :         struct spdk_rdma_provider_srq           *srq = NULL;
     677           1 :         struct ibv_recv_wr                      *bad_wr = NULL;
     678           1 :         struct spdk_rdma_utils_memory_translation translation;
     679           1 :         uint32_t                                i;
     680           1 :         int                                     rc = 0;
     681             : 
     682           1 :         resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources));
     683           1 :         if (!resources) {
     684           0 :                 SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
     685           0 :                 return NULL;
     686             :         }
     687             : 
     688           1 :         resources->reqs = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->reqs),
     689             :                                        0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
     690           1 :         resources->recvs = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->recvs),
     691             :                                         0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
     692           1 :         resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds),
     693             :                                        0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
     694           1 :         resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls),
     695             :                                        0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
     696             : 
     697           1 :         if (opts->in_capsule_data_size > 0) {
     698           1 :                 resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size,
     699             :                                                0x1000, NULL, SPDK_ENV_LCORE_ID_ANY,
     700             :                                                SPDK_MALLOC_DMA);
     701           1 :         }
     702             : 
     703           2 :         if (!resources->reqs || !resources->recvs || !resources->cmds ||
     704           1 :             !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) {
     705           0 :                 SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
     706           0 :                 goto cleanup;
     707             :         }
     708             : 
     709           1 :         SPDK_DEBUGLOG(rdma, "Command Array: %p Length: %lx\n",
     710             :                       resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds));
     711           1 :         SPDK_DEBUGLOG(rdma, "Completion Array: %p Length: %lx\n",
     712             :                       resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls));
     713           1 :         if (resources->bufs) {
     714           1 :                 SPDK_DEBUGLOG(rdma, "In Capsule Data Array: %p Length: %x\n",
     715             :                               resources->bufs, opts->max_queue_depth *
     716             :                               opts->in_capsule_data_size);
     717           1 :         }
     718             : 
     719             :         /* Initialize queues */
     720           1 :         STAILQ_INIT(&resources->incoming_queue);
     721           1 :         STAILQ_INIT(&resources->free_queue);
     722             : 
     723           1 :         if (opts->shared) {
     724           1 :                 srq = (struct spdk_rdma_provider_srq *)opts->qp;
     725           1 :         } else {
     726           0 :                 qp = (struct spdk_rdma_provider_qp *)opts->qp;
     727             :         }
     728             : 
     729         129 :         for (i = 0; i < opts->max_queue_depth; i++) {
     730         128 :                 rdma_recv = &resources->recvs[i];
     731         128 :                 rdma_recv->qpair = opts->qpair;
     732             : 
     733             :                 /* Set up memory to receive commands */
     734         128 :                 if (resources->bufs) {
     735         256 :                         rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i *
     736         128 :                                                   opts->in_capsule_data_size));
     737         128 :                 }
     738             : 
     739         128 :                 rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV;
     740             : 
     741         128 :                 rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i];
     742         128 :                 rdma_recv->sgl[0].length = sizeof(resources->cmds[i]);
     743         128 :                 rc = spdk_rdma_utils_get_translation(opts->map, &resources->cmds[i], sizeof(resources->cmds[i]),
     744             :                                                      &translation);
     745         128 :                 if (rc) {
     746           0 :                         goto cleanup;
     747             :                 }
     748         128 :                 rdma_recv->sgl[0].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
     749         128 :                 rdma_recv->wr.num_sge = 1;
     750             : 
     751         128 :                 if (rdma_recv->buf) {
     752         128 :                         rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
     753         128 :                         rdma_recv->sgl[1].length = opts->in_capsule_data_size;
     754         128 :                         rc = spdk_rdma_utils_get_translation(opts->map, rdma_recv->buf, opts->in_capsule_data_size,
     755             :                                                              &translation);
     756         128 :                         if (rc) {
     757           0 :                                 goto cleanup;
     758             :                         }
     759         128 :                         rdma_recv->sgl[1].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
     760         128 :                         rdma_recv->wr.num_sge++;
     761         128 :                 }
     762             : 
     763         128 :                 rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr;
     764         128 :                 rdma_recv->wr.sg_list = rdma_recv->sgl;
     765         128 :                 if (srq) {
     766           0 :                         spdk_rdma_provider_srq_queue_recv_wrs(srq, &rdma_recv->wr);
     767           0 :                 } else {
     768         128 :                         spdk_rdma_provider_qp_queue_recv_wrs(qp, &rdma_recv->wr);
     769             :                 }
     770         128 :         }
     771             : 
     772         129 :         for (i = 0; i < opts->max_queue_depth; i++) {
     773         128 :                 rdma_req = &resources->reqs[i];
     774             : 
     775         128 :                 if (opts->qpair != NULL) {
     776         128 :                         rdma_req->req.qpair = &opts->qpair->qpair;
     777         128 :                 } else {
     778           0 :                         rdma_req->req.qpair = NULL;
     779             :                 }
     780         128 :                 rdma_req->req.cmd = NULL;
     781         128 :                 rdma_req->req.iovcnt = 0;
     782         128 :                 rdma_req->req.stripped_data = NULL;
     783             : 
     784             :                 /* Set up memory to send responses */
     785         128 :                 rdma_req->req.rsp = &resources->cpls[i];
     786             : 
     787         128 :                 rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i];
     788         128 :                 rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]);
     789         128 :                 rc = spdk_rdma_utils_get_translation(opts->map, &resources->cpls[i], sizeof(resources->cpls[i]),
     790             :                                                      &translation);
     791         128 :                 if (rc) {
     792           0 :                         goto cleanup;
     793             :                 }
     794         128 :                 rdma_req->rsp.sgl[0].lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
     795             : 
     796         128 :                 rdma_req->rsp_wr.type = RDMA_WR_TYPE_SEND;
     797         128 :                 rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp_wr;
     798         128 :                 rdma_req->rsp.wr.next = NULL;
     799         128 :                 rdma_req->rsp.wr.opcode = IBV_WR_SEND;
     800         128 :                 rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
     801         128 :                 rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
     802         128 :                 rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);
     803             : 
     804             :                 /* Set up memory for data buffers */
     805         128 :                 rdma_req->data_wr.type = RDMA_WR_TYPE_DATA;
     806         128 :                 rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data_wr;
     807         128 :                 rdma_req->data.wr.next = NULL;
     808         128 :                 rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
     809         128 :                 rdma_req->data.wr.sg_list = rdma_req->data.sgl;
     810         128 :                 rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);
     811             : 
     812             :                 /* Initialize request state to FREE */
     813         128 :                 rdma_req->state = RDMA_REQUEST_STATE_FREE;
     814         128 :                 STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link);
     815         128 :         }
     816             : 
     817           1 :         if (srq) {
     818           0 :                 rc = spdk_rdma_provider_srq_flush_recv_wrs(srq, &bad_wr);
     819           0 :         } else {
     820           1 :                 rc = spdk_rdma_provider_qp_flush_recv_wrs(qp, &bad_wr);
     821             :         }
     822             : 
     823           1 :         if (rc) {
     824           0 :                 goto cleanup;
     825             :         }
     826             : 
     827           1 :         return resources;
     828             : 
     829             : cleanup:
     830           0 :         nvmf_rdma_resources_destroy(resources);
     831           0 :         return NULL;
     832           1 : }
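                      : /* A minimal usage sketch; the field values are illustrative assumptions, not
                      :  * taken from this file:
                      :  *
                      :  *   struct spdk_nvmf_rdma_resource_opts opts = {
                      :  *           .qpair                = rqpair,
                      :  *           .qp                   = rqpair->rdma_qp,  // or a poller's SRQ with .shared = true
                      :  *           .map                  = device->map,
                      :  *           .max_queue_depth      = 128,
                      :  *           .in_capsule_data_size = 4096,
                      :  *           .shared               = false,
                      :  *   };
                      :  *   struct spdk_nvmf_rdma_resources *resources = nvmf_rdma_resources_create(&opts);
                      :  *   ...
                      :  *   nvmf_rdma_resources_destroy(resources);
                      :  */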
     833             : 
     834             : static void
     835           0 : nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair)
     836             : {
     837           0 :         struct spdk_nvmf_rdma_ibv_event_ctx *ctx;
     838             : 
     839           0 :         ctx = rqpair->last_wqe_reached_ctx;
     840           0 :         if (ctx) {
     841           0 :                 ctx->rqpair = NULL;
     842             :                 /* Memory allocated for ctx is freed in nvmf_rdma_qpair_process_last_wqe_event */
     843           0 :                 rqpair->last_wqe_reached_ctx = NULL;
     844           0 :         }
     845           0 : }
     846             : 
     847             : static void nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller);
     848             : 
     849             : static void
     850           0 : nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
     851             : {
     852           0 :         struct spdk_nvmf_rdma_recv      *rdma_recv, *recv_tmp;
     853           0 :         struct ibv_recv_wr              *bad_recv_wr = NULL;
     854           0 :         int                             rc;
     855             : 
     856           0 :         spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair);
     857             : 
     858           0 :         if (rqpair->qd != 0) {
     859           0 :                 struct spdk_nvmf_qpair *qpair = &rqpair->qpair;
     860           0 :                 struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(qpair->transport,
     861             :                                 struct spdk_nvmf_rdma_transport, transport);
     862           0 :                 struct spdk_nvmf_rdma_request *req;
     863           0 :                 uint32_t i, max_req_count = 0;
     864             : 
     865           0 :                 SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd);
     866             : 
     867           0 :                 if (rqpair->srq == NULL) {
     868           0 :                         nvmf_rdma_dump_qpair_contents(rqpair);
     869           0 :                         max_req_count = rqpair->max_queue_depth;
     870           0 :                 } else if (rqpair->poller && rqpair->resources) {
     871           0 :                         max_req_count = rqpair->poller->max_srq_depth;
     872           0 :                 }
     873             : 
     874           0 :                 SPDK_DEBUGLOG(rdma, "Release incomplete requests\n");
     875           0 :                 for (i = 0; i < max_req_count; i++) {
     876           0 :                         req = &rqpair->resources->reqs[i];
     877           0 :                         if (req->req.qpair == qpair && req->state != RDMA_REQUEST_STATE_FREE) {
      878             :                                 /* nvmf_rdma_request_process checks the qpair's ibv and internal
      879             :                                  * state and completes the request */
     880           0 :                                 nvmf_rdma_request_process(rtransport, req);
     881           0 :                         }
     882           0 :                 }
     883           0 :                 assert(rqpair->qd == 0);
     884           0 :         }
     885             : 
     886           0 :         if (rqpair->poller) {
     887           0 :                 RB_REMOVE(qpairs_tree, &rqpair->poller->qpairs, rqpair);
     888             : 
     889           0 :                 if (rqpair->srq != NULL && rqpair->resources != NULL) {
     890             :                         /* Drop all received but unprocessed commands for this queue and return them to SRQ */
     891           0 :                         STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) {
     892           0 :                                 if (rqpair == rdma_recv->qpair) {
     893           0 :                                         STAILQ_REMOVE(&rqpair->resources->incoming_queue, rdma_recv, spdk_nvmf_rdma_recv, link);
     894           0 :                                         spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, &rdma_recv->wr);
     895           0 :                                         rc = spdk_rdma_provider_srq_flush_recv_wrs(rqpair->srq, &bad_recv_wr);
     896           0 :                                         if (rc) {
     897           0 :                                                 SPDK_ERRLOG("Unable to re-post rx descriptor\n");
     898           0 :                                         }
     899           0 :                                 }
     900           0 :                         }
     901           0 :                 }
     902           0 :         }
     903             : 
     904           0 :         if (rqpair->cm_id) {
     905           0 :                 if (rqpair->rdma_qp != NULL) {
     906           0 :                         spdk_rdma_provider_qp_destroy(rqpair->rdma_qp);
     907           0 :                         rqpair->rdma_qp = NULL;
     908           0 :                 }
     909             : 
     910           0 :                 if (rqpair->poller != NULL && rqpair->srq == NULL) {
     911           0 :                         rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth);
     912           0 :                 }
     913           0 :         }
     914             : 
     915           0 :         if (rqpair->srq == NULL && rqpair->resources != NULL) {
     916           0 :                 nvmf_rdma_resources_destroy(rqpair->resources);
     917           0 :         }
     918             : 
     919           0 :         nvmf_rdma_qpair_clean_ibv_events(rqpair);
     920             : 
     921           0 :         if (rqpair->destruct_channel) {
     922           0 :                 spdk_put_io_channel(rqpair->destruct_channel);
     923           0 :                 rqpair->destruct_channel = NULL;
     924           0 :         }
     925             : 
     926           0 :         if (rqpair->poller && rqpair->poller->need_destroy && RB_EMPTY(&rqpair->poller->qpairs)) {
     927           0 :                 nvmf_rdma_poller_destroy(rqpair->poller);
     928           0 :         }
     929             : 
     930             :         /* destroy cm_id last so cma device will not be freed before we destroy the cq. */
     931           0 :         if (rqpair->cm_id) {
     932           0 :                 rdma_destroy_id(rqpair->cm_id);
     933           0 :         }
     934             : 
     935           0 :         free(rqpair);
     936           0 : }
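/*
 * Teardown order used above, summarized: drain any requests still counted in
 * rqpair->qd, hand SRQ-owned recv buffers back to the shared receive queue,
 * remove the qpair from the poller's tree, destroy the provider QP and give
 * back its WR budget, release per-qpair resources (only when no SRQ is
 * shared), detach the last-WQE event context, drop the destruct io_channel,
 * optionally destroy a poller that was only waiting for its last qpair, and
 * destroy the cm_id last so the CM-owned device is not freed before the CQ.
 */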
     937             : 
     938             : static int
     939           5 : nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device)
     940             : {
     941           5 :         struct spdk_nvmf_rdma_poller    *rpoller;
     942           5 :         int                             rc, num_cqe, required_num_wr;
     943             : 
     944             :         /* Enlarge CQ size dynamically */
     945           5 :         rpoller = rqpair->poller;
     946           5 :         required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth);
     947           5 :         num_cqe = rpoller->num_cqe;
     948           5 :         if (num_cqe < required_num_wr) {
     949           4 :                 num_cqe = spdk_max(num_cqe * 2, required_num_wr);
     950           4 :                 num_cqe = spdk_min(num_cqe, device->attr.max_cqe);
     951           4 :         }
     952             : 
     953           5 :         if (rpoller->num_cqe != num_cqe) {
     954           4 :                 if (device->context->device->transport_type == IBV_TRANSPORT_IWARP) {
     955           1 :                         SPDK_ERRLOG("iWARP doesn't support CQ resize. Current capacity %u, required %u\n"
     956             :                                     "Using CQ of insufficient size may lead to CQ overrun\n", rpoller->num_cqe, num_cqe);
     957           1 :                         return -1;
     958             :                 }
     959           3 :                 if (required_num_wr > device->attr.max_cqe) {
     960           1 :                         SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n",
     961             :                                     required_num_wr, device->attr.max_cqe);
     962           1 :                         return -1;
     963             :                 }
     964             : 
     965           2 :                 SPDK_DEBUGLOG(rdma, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe);
     966           2 :                 rc = ibv_resize_cq(rpoller->cq, num_cqe);
     967           2 :                 if (rc) {
     968           1 :                         SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
     969           1 :                         return -1;
     970             :                 }
     971             : 
     972           1 :                 rpoller->num_cqe = num_cqe;
     973           1 :         }
     974             : 
     975           2 :         rpoller->required_num_wr = required_num_wr;
     976           2 :         return 0;
     977           5 : }
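/*
 * Illustrative sizing example (hypothetical numbers): suppose the poller's CQ
 * currently has num_cqe == 4096 and admitting this qpair raises
 * required_num_wr to 5000. The CQ is grown to max(2 * 4096, 5000) == 8192,
 * clamped to device->attr.max_cqe. If the requirement itself exceeds max_cqe,
 * or the device is iWARP (which cannot resize a CQ), -1 is returned and qpair
 * initialization fails instead of risking a CQ overrun.
 */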
     978             : 
     979             : static int
     980           0 : nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
     981             : {
     982           0 :         struct spdk_nvmf_rdma_qpair             *rqpair;
     983           0 :         struct spdk_nvmf_rdma_transport         *rtransport;
     984           0 :         struct spdk_nvmf_transport              *transport;
     985           0 :         struct spdk_nvmf_rdma_resource_opts     opts;
     986           0 :         struct spdk_nvmf_rdma_device            *device;
     987           0 :         struct spdk_rdma_provider_qp_init_attr  qp_init_attr = {};
     988             : 
     989           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
     990           0 :         device = rqpair->device;
     991             : 
     992           0 :         qp_init_attr.qp_context = rqpair;
     993           0 :         qp_init_attr.pd         = device->pd;
     994           0 :         qp_init_attr.send_cq    = rqpair->poller->cq;
     995           0 :         qp_init_attr.recv_cq    = rqpair->poller->cq;
     996             : 
     997           0 :         if (rqpair->srq) {
     998           0 :                 qp_init_attr.srq                = rqpair->srq->srq;
     999           0 :         } else {
    1000           0 :                 qp_init_attr.cap.max_recv_wr    = rqpair->max_queue_depth;
    1001             :         }
    1002             : 
    1003             :         /* SEND, READ, and WRITE operations */
    1004           0 :         qp_init_attr.cap.max_send_wr    = (uint32_t)rqpair->max_queue_depth * 2;
    1005           0 :         qp_init_attr.cap.max_send_sge   = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_TX_SGE);
    1006           0 :         qp_init_attr.cap.max_recv_sge   = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
    1007           0 :         qp_init_attr.stats              = &rqpair->poller->stat.qp_stats;
    1008             : 
    1009           0 :         if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) {
    1010           0 :                 SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n");
    1011           0 :                 goto error;
    1012             :         }
    1013             : 
    1014           0 :         rqpair->rdma_qp = spdk_rdma_provider_qp_create(rqpair->cm_id, &qp_init_attr);
    1015           0 :         if (!rqpair->rdma_qp) {
    1016           0 :                 goto error;
    1017             :         }
    1018             : 
    1019           0 :         rqpair->qp_num = rqpair->rdma_qp->qp->qp_num;
    1020             : 
    1021           0 :         rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2),
    1022             :                                           qp_init_attr.cap.max_send_wr);
    1023           0 :         rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, qp_init_attr.cap.max_send_sge);
    1024           0 :         rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, qp_init_attr.cap.max_recv_sge);
    1025           0 :         spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair);
    1026           0 :         SPDK_DEBUGLOG(rdma, "New RDMA Connection: %p\n", qpair);
    1027             : 
    1028           0 :         if (rqpair->poller->srq == NULL) {
    1029           0 :                 rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
    1030           0 :                 transport = &rtransport->transport;
    1031             : 
    1032           0 :                 opts.qp = rqpair->rdma_qp;
    1033           0 :                 opts.map = device->map;
    1034           0 :                 opts.qpair = rqpair;
    1035           0 :                 opts.shared = false;
    1036           0 :                 opts.max_queue_depth = rqpair->max_queue_depth;
    1037           0 :                 opts.in_capsule_data_size = transport->opts.in_capsule_data_size;
    1038             : 
    1039           0 :                 rqpair->resources = nvmf_rdma_resources_create(&opts);
    1040             : 
    1041           0 :                 if (!rqpair->resources) {
    1042           0 :                         SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
    1043           0 :                         rdma_destroy_qp(rqpair->cm_id);
    1044           0 :                         goto error;
    1045             :                 }
    1046           0 :         } else {
    1047           0 :                 rqpair->resources = rqpair->poller->resources;
    1048             :         }
    1049             : 
    1050           0 :         rqpair->current_recv_depth = 0;
    1051           0 :         STAILQ_INIT(&rqpair->pending_rdma_read_queue);
    1052           0 :         STAILQ_INIT(&rqpair->pending_rdma_write_queue);
    1053           0 :         STAILQ_INIT(&rqpair->pending_rdma_send_queue);
    1054           0 :         rqpair->qpair.queue_depth = 0;
    1055             : 
    1056           0 :         return 0;
    1057             : 
    1058             : error:
    1059           0 :         rdma_destroy_id(rqpair->cm_id);
    1060           0 :         rqpair->cm_id = NULL;
    1061           0 :         return -1;
    1062           0 : }
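/*
 * Capacity sketch for the QP created above (hypothetical depth of 128, no
 * SRQ): max_recv_wr is 128 (one recv per outstanding command), max_send_wr is
 * 256 (budgeting a response SEND plus a data WR chain per command), and the
 * SGE limits request the smaller of the device's max_sge and the
 * NVMF_DEFAULT_TX_SGE/RX_SGE defaults. max_send_depth is then clamped to the
 * max_send_wr reported back in qp_init_attr.cap, which the provider may have
 * adjusted downward.
 */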
    1063             : 
     1064             : /* Append the given recv wr structure to the resource struct's outstanding recvs list. */
    1065             : /* This function accepts either a single wr or the first wr in a linked list. */
    1066             : static void
    1067           6 : nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
    1068             : {
    1069           6 :         struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
    1070             :                         struct spdk_nvmf_rdma_transport, transport);
    1071             : 
    1072           6 :         if (rqpair->srq != NULL) {
    1073           0 :                 spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, first);
    1074           0 :         } else {
    1075           6 :                 if (spdk_rdma_provider_qp_queue_recv_wrs(rqpair->rdma_qp, first)) {
    1076           6 :                         STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link);
    1077           6 :                 }
    1078             :         }
    1079             : 
    1080           6 :         if (rtransport->rdma_opts.no_wr_batching) {
    1081           0 :                 _poller_submit_recvs(rtransport, rqpair->poller);
    1082           0 :         }
    1083           6 : }
    1084             : 
    1085             : static inline void
    1086           4 : request_transfer_in(struct spdk_nvmf_request *req)
    1087             : {
    1088           4 :         struct spdk_nvmf_rdma_request   *rdma_req;
    1089           4 :         struct spdk_nvmf_qpair          *qpair;
    1090           4 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    1091           4 :         struct spdk_nvmf_rdma_transport *rtransport;
    1092             : 
    1093           4 :         qpair = req->qpair;
    1094           4 :         rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
    1095           4 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    1096           4 :         rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
    1097             :                                       struct spdk_nvmf_rdma_transport, transport);
    1098             : 
    1099           4 :         assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
    1100           4 :         assert(rdma_req != NULL);
    1101             : 
    1102           4 :         if (spdk_rdma_provider_qp_queue_send_wrs(rqpair->rdma_qp, rdma_req->transfer_wr)) {
    1103           4 :                 STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
    1104           4 :         }
    1105           4 :         if (rtransport->rdma_opts.no_wr_batching) {
    1106           0 :                 _poller_submit_sends(rtransport, rqpair->poller);
    1107           0 :         }
    1108             : 
    1109           4 :         assert(rqpair->current_read_depth + rdma_req->num_outstanding_data_wr <= rqpair->max_read_depth);
    1110           4 :         rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
    1111           4 :         assert(rqpair->current_send_depth + rdma_req->num_outstanding_data_wr <= rqpair->max_send_depth);
    1112           4 :         rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
    1113           4 : }
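/*
 * Depth accounting note: posting the RDMA_READ chain charges
 * num_outstanding_data_wr against both current_read_depth (bounded by the
 * read depth negotiated at connect time) and current_send_depth. The asserts
 * above encode the expectation that the caller already verified both budgets
 * before calling request_transfer_in().
 */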
    1114             : 
    1115             : static inline void
    1116           0 : nvmf_rdma_request_reset_transfer_in(struct spdk_nvmf_rdma_request *rdma_req,
    1117             :                                     struct spdk_nvmf_rdma_transport *rtransport)
    1118             : {
    1119             :         /* Put completed WRs back to pool and move transfer_wr pointer */
    1120           0 :         _nvmf_rdma_request_free_data(rdma_req, rdma_req->transfer_wr, rtransport->data_wr_pool);
    1121           0 :         rdma_req->transfer_wr = rdma_req->remaining_tranfer_in_wrs;
    1122           0 :         rdma_req->remaining_tranfer_in_wrs = NULL;
    1123           0 :         rdma_req->num_outstanding_data_wr = rdma_req->num_remaining_data_wr;
    1124           0 :         rdma_req->num_remaining_data_wr = 0;
    1125           0 : }
    1126             : 
    1127             : static inline int
    1128           0 : request_prepare_transfer_in_part(struct spdk_nvmf_request *req, uint32_t num_reads_available)
    1129             : {
    1130           0 :         struct spdk_nvmf_rdma_request   *rdma_req;
    1131           0 :         struct ibv_send_wr              *wr;
    1132           0 :         uint32_t i;
    1133             : 
    1134           0 :         rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
    1135             : 
    1136           0 :         assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
    1137           0 :         assert(rdma_req != NULL);
    1138           0 :         assert(num_reads_available > 0);
    1139           0 :         assert(rdma_req->num_outstanding_data_wr > num_reads_available);
    1140           0 :         wr = rdma_req->transfer_wr;
    1141             : 
    1142           0 :         for (i = 0; i < num_reads_available - 1; i++) {
    1143           0 :                 wr = wr->next;
    1144           0 :         }
    1145             : 
    1146           0 :         rdma_req->remaining_tranfer_in_wrs = wr->next;
    1147           0 :         rdma_req->num_remaining_data_wr = rdma_req->num_outstanding_data_wr - num_reads_available;
    1148           0 :         rdma_req->num_outstanding_data_wr = num_reads_available;
    1149             :         /* Break chain of WRs to send only part. Once this portion completes, we continue sending RDMA_READs */
    1150           0 :         wr->next = NULL;
    1151             : 
    1152           0 :         return 0;
    1153           0 : }
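/*
 * Illustrative example (hypothetical sizes): a request holding 4 RDMA_READ
 * WRs arrives when only num_reads_available == 2 read credits remain. The
 * loop above walks to the 2nd WR, saves the 3rd WR in
 * remaining_tranfer_in_wrs, records 2 WRs as remaining, reports 2 as
 * outstanding, and NULL-terminates the chain after the 2nd WR. Once those
 * complete, nvmf_rdma_request_reset_transfer_in() frees them back to the pool
 * and promotes the saved tail so the next batch of READs can be issued.
 */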
    1154             : 
    1155             : static int
    1156           6 : request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
    1157             : {
    1158           6 :         int                             num_outstanding_data_wr = 0;
    1159           6 :         struct spdk_nvmf_rdma_request   *rdma_req;
    1160           6 :         struct spdk_nvmf_qpair          *qpair;
    1161           6 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    1162           6 :         struct spdk_nvme_cpl            *rsp;
    1163           6 :         struct ibv_send_wr              *first = NULL;
    1164           6 :         struct spdk_nvmf_rdma_transport *rtransport;
    1165             : 
    1166           6 :         *data_posted = 0;
    1167           6 :         qpair = req->qpair;
    1168           6 :         rsp = &req->rsp->nvme_cpl;
    1169           6 :         rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
    1170           6 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    1171           6 :         rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
    1172             :                                       struct spdk_nvmf_rdma_transport, transport);
    1173             : 
    1174             :         /* Advance our sq_head pointer */
    1175           6 :         if (qpair->sq_head == qpair->sq_head_max) {
    1176           6 :                 qpair->sq_head = 0;
    1177           6 :         } else {
    1178           0 :                 qpair->sq_head++;
    1179             :         }
    1180           6 :         rsp->sqhd = qpair->sq_head;
    1181             : 
    1182             :         /* queue the capsule for the recv buffer */
    1183           6 :         assert(rdma_req->recv != NULL);
    1184             : 
    1185           6 :         nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr);
    1186             : 
    1187           6 :         rdma_req->recv = NULL;
    1188           6 :         assert(rqpair->current_recv_depth > 0);
    1189           6 :         rqpair->current_recv_depth--;
    1190             : 
    1191             :         /* Build the response which consists of optional
    1192             :          * RDMA WRITEs to transfer data, plus an RDMA SEND
    1193             :          * containing the response.
    1194             :          */
    1195           6 :         first = &rdma_req->rsp.wr;
    1196             : 
    1197           6 :         if (spdk_unlikely(rsp->status.sc != SPDK_NVME_SC_SUCCESS)) {
    1198             :                 /* On failure, data was not read from the controller. So clear the
    1199             :                  * number of outstanding data WRs to zero.
    1200             :                  */
    1201           1 :                 rdma_req->num_outstanding_data_wr = 0;
    1202           6 :         } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
    1203           1 :                 first = rdma_req->transfer_wr;
    1204           1 :                 *data_posted = 1;
    1205           1 :                 num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
    1206           1 :         }
    1207           6 :         if (spdk_rdma_provider_qp_queue_send_wrs(rqpair->rdma_qp, first)) {
    1208           6 :                 STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
    1209           6 :         }
    1210           6 :         if (rtransport->rdma_opts.no_wr_batching) {
    1211           0 :                 _poller_submit_sends(rtransport, rqpair->poller);
    1212           0 :         }
    1213             : 
    1214             :         /* +1 for the rsp wr */
    1215           6 :         assert(rqpair->current_send_depth + num_outstanding_data_wr + 1 <= rqpair->max_send_depth);
    1216           6 :         rqpair->current_send_depth += num_outstanding_data_wr + 1;
    1217             : 
    1218           6 :         return 0;
    1219           6 : }
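/*
 * Illustrative completion path (hypothetical request): for a successful
 * controller-to-host read carrying 2 WRITE WRs, the chain posted above is
 * transfer_wr -> ... -> rsp.wr, so current_send_depth grows by 3 (two
 * RDMA_WRITEs plus the response SEND). For an error completion, or any
 * command with no data to return, only rsp.wr is posted and the depth grows
 * by 1. In both cases the command's recv WR has already been recycled above,
 * so current_recv_depth drops by 1.
 */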
    1220             : 
    1221             : static int
    1222           0 : nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
    1223             : {
    1224           0 :         struct spdk_nvmf_rdma_accept_private_data       accept_data;
    1225           0 :         struct rdma_conn_param                          ctrlr_event_data = {};
    1226           0 :         int                                             rc;
    1227             : 
    1228           0 :         accept_data.recfmt = 0;
    1229           0 :         accept_data.crqsize = rqpair->max_queue_depth;
    1230             : 
    1231           0 :         ctrlr_event_data.private_data = &accept_data;
    1232           0 :         ctrlr_event_data.private_data_len = sizeof(accept_data);
    1233           0 :         if (id->ps == RDMA_PS_TCP) {
    1234           0 :                 ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
    1235           0 :                 ctrlr_event_data.initiator_depth = rqpair->max_read_depth;
    1236           0 :         }
    1237             : 
    1238             :         /* Configure infinite retries for the initiator side qpair.
    1239             :          * We need to pass this value to the initiator to prevent the
    1240             :          * initiator side NIC from completing SEND requests back to the
    1241             :          * initiator with status rnr_retry_count_exceeded. */
    1242           0 :         ctrlr_event_data.rnr_retry_count = 0x7;
    1243             : 
     1244             :         /* When the qpair is created without using the rdma cm API, additional
     1245             :          * information must be provided to the initiator in the connection response:
     1246             :          * whether the qpair is using an SRQ, and its qp_num.
     1247             :          * The fields below are ignored by rdma cm if the qpair was
     1248             :          * created using the rdma cm API. */
    1249           0 :         ctrlr_event_data.srq = rqpair->srq ? 1 : 0;
    1250           0 :         ctrlr_event_data.qp_num = rqpair->qp_num;
    1251             : 
    1252           0 :         rc = spdk_rdma_provider_qp_accept(rqpair->rdma_qp, &ctrlr_event_data);
    1253           0 :         if (rc) {
    1254           0 :                 SPDK_ERRLOG("Error %d on spdk_rdma_provider_qp_accept\n", errno);
    1255           0 :         } else {
    1256           0 :                 SPDK_DEBUGLOG(rdma, "Sent back the accept\n");
    1257             :         }
    1258             : 
    1259           0 :         return rc;
    1260           0 : }
    1261             : 
    1262             : static void
    1263           0 : nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
    1264             : {
    1265           0 :         struct spdk_nvmf_rdma_reject_private_data       rej_data;
    1266             : 
    1267           0 :         rej_data.recfmt = 0;
    1268           0 :         rej_data.sts = error;
    1269             : 
    1270           0 :         rdma_reject(id, &rej_data, sizeof(rej_data));
    1271           0 : }
    1272             : 
    1273             : static int
    1274           0 : nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event)
    1275             : {
    1276           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    1277           0 :         struct spdk_nvmf_rdma_qpair     *rqpair = NULL;
    1278           0 :         struct spdk_nvmf_rdma_port      *port;
    1279           0 :         struct rdma_conn_param          *rdma_param = NULL;
    1280           0 :         const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
    1281           0 :         uint16_t                        max_queue_depth;
    1282           0 :         uint16_t                        max_read_depth;
    1283             : 
    1284           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    1285             : 
    1286           0 :         assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
    1287           0 :         assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */
    1288             : 
    1289           0 :         rdma_param = &event->param.conn;
    1290           0 :         if (rdma_param->private_data == NULL ||
    1291           0 :             rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
    1292           0 :                 SPDK_ERRLOG("connect request: no private data provided\n");
    1293           0 :                 nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
    1294           0 :                 return -1;
    1295             :         }
    1296             : 
    1297           0 :         private_data = rdma_param->private_data;
    1298           0 :         if (private_data->recfmt != 0) {
    1299           0 :                 SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
    1300           0 :                 nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
    1301           0 :                 return -1;
    1302             :         }
    1303             : 
    1304           0 :         SPDK_DEBUGLOG(rdma, "Connect Recv on fabric intf name %s, dev_name %s\n",
    1305             :                       event->id->verbs->device->name, event->id->verbs->device->dev_name);
    1306             : 
    1307           0 :         port = event->listen_id->context;
    1308           0 :         SPDK_DEBUGLOG(rdma, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
    1309             :                       event->listen_id, event->listen_id->verbs, port);
    1310             : 
    1311             :         /* Figure out the supported queue depth. This is a multi-step process
    1312             :          * that takes into account hardware maximums, host provided values,
    1313             :          * and our target's internal memory limits */
    1314             : 
    1315           0 :         SPDK_DEBUGLOG(rdma, "Calculating Queue Depth\n");
    1316             : 
    1317             :         /* Start with the maximum queue depth allowed by the target */
    1318           0 :         max_queue_depth = rtransport->transport.opts.max_queue_depth;
    1319           0 :         max_read_depth = rtransport->transport.opts.max_queue_depth;
    1320           0 :         SPDK_DEBUGLOG(rdma, "Target Max Queue Depth: %d\n",
    1321             :                       rtransport->transport.opts.max_queue_depth);
    1322             : 
    1323             :         /* Next check the local NIC's hardware limitations */
    1324           0 :         SPDK_DEBUGLOG(rdma,
    1325             :                       "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
    1326             :                       port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
    1327           0 :         max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
    1328           0 :         max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom);
    1329             : 
    1330             :         /* Next check the remote NIC's hardware limitations */
    1331           0 :         SPDK_DEBUGLOG(rdma,
    1332             :                       "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
    1333             :                       rdma_param->initiator_depth, rdma_param->responder_resources);
    1334             :         /* from man3 rdma_get_cm_event
    1335             :          * responder_resources - Specifies the number of responder resources that is requested by the recipient.
    1336             :          * The responder_resources field must match the initiator depth specified by the remote node when running
    1337             :          * the rdma_connect and rdma_accept functions. */
    1338           0 :         if (rdma_param->responder_resources != 0) {
    1339           0 :                 if (private_data->qid) {
    1340           0 :                         SPDK_DEBUGLOG(rdma, "Host (Initiator) is not allowed to use RDMA operations,"
    1341             :                                       " responder_resources must be 0 but set to %u\n",
    1342             :                                       rdma_param->responder_resources);
    1343           0 :                 } else {
    1344           0 :                         SPDK_WARNLOG("Host (Initiator) is not allowed to use RDMA operations,"
    1345             :                                      " responder_resources must be 0 but set to %u\n",
    1346             :                                      rdma_param->responder_resources);
    1347             :                 }
    1348           0 :         }
    1349             :         /* from man3 rdma_get_cm_event
    1350             :          * initiator_depth - Specifies the maximum number of outstanding RDMA read operations that the recipient holds.
    1351             :          * The initiator_depth field must match the responder resources specified by the remote node when running
    1352             :          * the rdma_connect and rdma_accept functions. */
    1353           0 :         if (rdma_param->initiator_depth == 0) {
    1354           0 :                 SPDK_ERRLOG("Host (Initiator) doesn't support RDMA_READ or atomic operations\n");
    1355           0 :                 nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_IRD);
    1356           0 :                 return -1;
    1357             :         }
    1358           0 :         max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth);
    1359             : 
    1360           0 :         SPDK_DEBUGLOG(rdma, "Host Receive Queue Size: %d\n", private_data->hrqsize);
    1361           0 :         SPDK_DEBUGLOG(rdma, "Host Send Queue Size: %d\n", private_data->hsqsize);
    1362           0 :         max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
    1363           0 :         max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
    1364             : 
    1365           0 :         SPDK_DEBUGLOG(rdma, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
    1366             :                       max_queue_depth, max_read_depth);
    1367             : 
    1368           0 :         rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
    1369           0 :         if (rqpair == NULL) {
    1370           0 :                 SPDK_ERRLOG("Could not allocate new connection.\n");
    1371           0 :                 nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
    1372           0 :                 return -1;
    1373             :         }
    1374             : 
    1375           0 :         rqpair->device = port->device;
    1376           0 :         rqpair->max_queue_depth = max_queue_depth;
    1377           0 :         rqpair->max_read_depth = max_read_depth;
    1378           0 :         rqpair->cm_id = event->id;
    1379           0 :         rqpair->listen_id = event->listen_id;
    1380           0 :         rqpair->qpair.transport = transport;
     1381             :         /* Use the qid from the private data to determine the qpair type;
     1382             :            the qid will be set to the appropriate value when the controller is created. */
    1383           0 :         rqpair->qpair.qid = private_data->qid;
    1384           0 :         rqpair->qpair.numa.id_valid = 1;
    1385           0 :         rqpair->qpair.numa.id = spdk_rdma_cm_id_get_numa_id(rqpair->cm_id);
    1386             : 
    1387           0 :         event->id->context = &rqpair->qpair;
    1388             : 
    1389           0 :         spdk_nvmf_tgt_new_qpair(transport->tgt, &rqpair->qpair);
    1390             : 
    1391           0 :         return 0;
    1392           0 : }
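/*
 * Illustrative negotiation (hypothetical values): with a target limit of 128,
 * a local NIC reporting max_qp_wr 16384 and max_qp_init_rd_atom 16, and a
 * host advertising hrqsize 128, hsqsize 127 and initiator_depth 16, the
 * accepted queue depth is min(128, 16384, 128, 127 + 1) == 128 and
 * max_read_depth is min(128, 16, 16) == 16. A host that advertises
 * initiator_depth == 0 is rejected outright, since the target could never
 * issue RDMA_READs to it.
 */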
    1393             : 
    1394             : static inline void
    1395          28 : nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next,
    1396             :                    enum spdk_nvme_data_transfer xfer)
    1397             : {
    1398          28 :         if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
    1399          24 :                 wr->opcode = IBV_WR_RDMA_WRITE;
    1400          24 :                 wr->send_flags = 0;
    1401          24 :                 wr->next = next;
    1402          28 :         } else if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
    1403           4 :                 wr->opcode = IBV_WR_RDMA_READ;
    1404           4 :                 wr->send_flags = IBV_SEND_SIGNALED;
    1405           4 :                 wr->next = NULL;
    1406           4 :         } else {
    1407           0 :                 assert(0);
    1408             :         }
    1409          28 : }
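/*
 * Signaling note: RDMA_WRITE WRs are posted unsignaled and chained toward the
 * response SEND, while RDMA_READ WRs are signaled and terminate the chain.
 * One way to read this: the target must observe the READ completion before it
 * can execute the command, whereas WRITE completion is implied by the
 * completion of the subsequent SEND on the same QP.
 */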
    1410             : 
    1411             : static int
    1412           6 : nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
    1413             :                        struct spdk_nvmf_rdma_request *rdma_req,
    1414             :                        uint32_t num_sgl_descriptors)
    1415             : {
    1416           6 :         struct spdk_nvmf_rdma_request_data      *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
    1417           6 :         struct spdk_nvmf_rdma_request_data      *current_data_wr;
    1418           6 :         uint32_t                                i;
    1419             : 
    1420           6 :         if (spdk_unlikely(num_sgl_descriptors > SPDK_NVMF_MAX_SGL_ENTRIES)) {
    1421           0 :                 SPDK_ERRLOG("Requested too much entries (%u), the limit is %u\n",
    1422             :                             num_sgl_descriptors, SPDK_NVMF_MAX_SGL_ENTRIES);
    1423           0 :                 return -EINVAL;
    1424             :         }
    1425             : 
    1426           6 :         if (spdk_unlikely(spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests,
    1427             :                                                 num_sgl_descriptors))) {
    1428           0 :                 return -ENOMEM;
    1429             :         }
    1430             : 
    1431           6 :         current_data_wr = &rdma_req->data;
    1432             : 
    1433          12 :         for (i = 0; i < num_sgl_descriptors; i++) {
    1434           6 :                 nvmf_rdma_setup_wr(&current_data_wr->wr, &work_requests[i]->wr, rdma_req->req.xfer);
    1435           6 :                 current_data_wr->wr.next = &work_requests[i]->wr;
    1436           6 :                 current_data_wr = work_requests[i];
    1437           6 :                 current_data_wr->wr.sg_list = current_data_wr->sgl;
    1438           6 :                 current_data_wr->wr.wr_id = rdma_req->data.wr.wr_id;
    1439           6 :         }
    1440             : 
    1441           6 :         nvmf_rdma_setup_wr(&current_data_wr->wr, &rdma_req->rsp.wr, rdma_req->req.xfer);
    1442             : 
    1443           6 :         return 0;
    1444           6 : }
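/*
 * Chain shape sketch (hypothetical call with num_sgl_descriptors == 2): two
 * extra WR containers are taken from data_wr_pool and linked as
 * data.wr -> extra[0].wr -> extra[1].wr, each reusing data.wr's wr_id and its
 * own sgl array. The tail is then set up against rsp.wr, so a
 * controller-to-host transfer chains the response SEND after the WRITEs,
 * while a host-to-controller transfer ends the chain at the last (signaled)
 * READ.
 */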
    1445             : 
    1446             : static inline void
    1447          16 : nvmf_rdma_setup_request(struct spdk_nvmf_rdma_request *rdma_req)
    1448             : {
    1449          16 :         struct ibv_send_wr              *wr = &rdma_req->data.wr;
    1450          16 :         struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;
    1451             : 
    1452          16 :         wr->wr.rdma.rkey = sgl->keyed.key;
    1453          16 :         wr->wr.rdma.remote_addr = sgl->address;
    1454          16 :         nvmf_rdma_setup_wr(wr, &rdma_req->rsp.wr, rdma_req->req.xfer);
    1455          16 : }
    1456             : 
    1457             : static inline void
    1458           1 : nvmf_rdma_update_remote_addr(struct spdk_nvmf_rdma_request *rdma_req, uint32_t num_wrs)
    1459             : {
    1460           1 :         struct ibv_send_wr              *wr = &rdma_req->data.wr;
    1461           1 :         struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;
    1462           1 :         uint32_t                        i;
    1463           1 :         int                             j;
    1464           1 :         uint64_t                        remote_addr_offset = 0;
    1465             : 
    1466           3 :         for (i = 0; i < num_wrs; ++i) {
    1467           2 :                 wr->wr.rdma.rkey = sgl->keyed.key;
    1468           2 :                 wr->wr.rdma.remote_addr = sgl->address + remote_addr_offset;
    1469          19 :                 for (j = 0; j < wr->num_sge; ++j) {
    1470          17 :                         remote_addr_offset += wr->sg_list[j].length;
    1471          17 :                 }
    1472           2 :                 wr = wr->next;
    1473           2 :         }
    1474           1 : }
    1475             : 
    1476             : static int
    1477          15 : nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_device *device,
    1478             :                       struct spdk_nvmf_rdma_request *rdma_req,
    1479             :                       struct ibv_send_wr *wr,
    1480             :                       uint32_t total_length)
    1481             : {
    1482          15 :         struct spdk_rdma_utils_memory_translation mem_translation;
    1483          15 :         struct ibv_sge  *sg_ele;
    1484          15 :         struct iovec *iov;
    1485          15 :         uint32_t lkey, remaining;
    1486          15 :         int rc;
    1487             : 
    1488          15 :         wr->num_sge = 0;
    1489             : 
    1490          74 :         while (total_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
    1491          59 :                 iov = &rdma_req->req.iov[rdma_req->iovpos];
    1492          59 :                 rc = spdk_rdma_utils_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation);
    1493          59 :                 if (spdk_unlikely(rc)) {
    1494           0 :                         return rc;
    1495             :                 }
    1496             : 
    1497          59 :                 lkey = spdk_rdma_utils_memory_translation_get_lkey(&mem_translation);
    1498          59 :                 sg_ele = &wr->sg_list[wr->num_sge];
    1499          59 :                 remaining = spdk_min((uint32_t)iov->iov_len - rdma_req->offset, total_length);
    1500             : 
    1501          59 :                 sg_ele->lkey = lkey;
    1502          59 :                 sg_ele->addr = (uintptr_t)iov->iov_base + rdma_req->offset;
    1503          59 :                 sg_ele->length = remaining;
    1504          59 :                 SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele, sg_ele->addr,
    1505             :                               sg_ele->length);
    1506          59 :                 rdma_req->offset += sg_ele->length;
    1507          59 :                 total_length -= sg_ele->length;
    1508          59 :                 wr->num_sge++;
    1509             : 
    1510          59 :                 if (rdma_req->offset == iov->iov_len) {
    1511          57 :                         rdma_req->offset = 0;
    1512          57 :                         rdma_req->iovpos++;
    1513          57 :                 }
    1514             :         }
    1515             : 
    1516          15 :         if (spdk_unlikely(total_length)) {
    1517           0 :                 SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
    1518           0 :                 return -EINVAL;
    1519             :         }
    1520             : 
    1521          15 :         return 0;
    1522          15 : }
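/*
 * Illustrative mapping (hypothetical buffers): a 16 KiB transfer split across
 * two 8 KiB iovecs produces two SGEs, each carrying the lkey obtained by
 * translating that iovec through device->map, the iovec base plus the current
 * intra-iovec offset, and a length of whichever is smaller: what is left in
 * the iovec or what is left of the transfer. Needing more than
 * SPDK_NVMF_MAX_SGL_ENTRIES SGEs for a single WR is reported as -EINVAL.
 */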
    1523             : 
    1524             : static int
    1525          10 : nvmf_rdma_fill_wr_sgl_with_dif(struct spdk_nvmf_rdma_device *device,
    1526             :                                struct spdk_nvmf_rdma_request *rdma_req,
    1527             :                                struct ibv_send_wr *wr,
    1528             :                                uint32_t total_length,
    1529             :                                uint32_t num_extra_wrs)
    1530             : {
    1531          10 :         struct spdk_rdma_utils_memory_translation mem_translation;
    1532          10 :         struct spdk_dif_ctx *dif_ctx = &rdma_req->req.dif.dif_ctx;
    1533          10 :         struct ibv_sge *sg_ele;
    1534          10 :         struct iovec *iov;
    1535          10 :         struct iovec *rdma_iov;
    1536          10 :         uint32_t lkey, remaining;
    1537          10 :         uint32_t remaining_data_block, data_block_size, md_size;
    1538          10 :         uint32_t sge_len;
    1539          10 :         int rc;
    1540             : 
    1541          10 :         data_block_size = dif_ctx->block_size - dif_ctx->md_size;
    1542             : 
    1543          10 :         if (spdk_likely(!rdma_req->req.stripped_data)) {
    1544           5 :                 rdma_iov = rdma_req->req.iov;
    1545           5 :                 remaining_data_block = data_block_size;
    1546           5 :                 md_size = dif_ctx->md_size;
    1547           5 :         } else {
    1548           5 :                 rdma_iov = rdma_req->req.stripped_data->iov;
    1549           5 :                 total_length = total_length / dif_ctx->block_size * data_block_size;
    1550           5 :                 remaining_data_block = total_length;
    1551           5 :                 md_size = 0;
    1552             :         }
    1553             : 
    1554          10 :         wr->num_sge = 0;
    1555             : 
    1556          40 :         while (total_length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) {
    1557          15 :                 iov = rdma_iov + rdma_req->iovpos;
    1558          15 :                 rc = spdk_rdma_utils_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation);
    1559          15 :                 if (spdk_unlikely(rc)) {
    1560           0 :                         return rc;
    1561             :                 }
    1562             : 
    1563          15 :                 lkey = spdk_rdma_utils_memory_translation_get_lkey(&mem_translation);
    1564          15 :                 sg_ele = &wr->sg_list[wr->num_sge];
    1565          15 :                 remaining = spdk_min((uint32_t)iov->iov_len - rdma_req->offset, total_length);
    1566             : 
    1567          53 :                 while (remaining) {
    1568          38 :                         if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) {
    1569           1 :                                 if (num_extra_wrs > 0 && wr->next) {
    1570           1 :                                         wr = wr->next;
    1571           1 :                                         wr->num_sge = 0;
    1572           1 :                                         sg_ele = &wr->sg_list[wr->num_sge];
    1573           1 :                                         num_extra_wrs--;
    1574           1 :                                 } else {
    1575           0 :                                         break;
    1576             :                                 }
    1577           1 :                         }
    1578          38 :                         sg_ele->lkey = lkey;
    1579          38 :                         sg_ele->addr = (uintptr_t)((char *)iov->iov_base + rdma_req->offset);
    1580          38 :                         sge_len = spdk_min(remaining, remaining_data_block);
    1581          38 :                         sg_ele->length = sge_len;
    1582          38 :                         SPDK_DEBUGLOG(rdma, "sge[%d] %p addr 0x%"PRIx64", len %u\n", wr->num_sge, sg_ele,
    1583             :                                       sg_ele->addr, sg_ele->length);
    1584          38 :                         remaining -= sge_len;
    1585          38 :                         remaining_data_block -= sge_len;
    1586          38 :                         rdma_req->offset += sge_len;
    1587          38 :                         total_length -= sge_len;
    1588             : 
    1589          38 :                         sg_ele++;
    1590          38 :                         wr->num_sge++;
    1591             : 
    1592          38 :                         if (remaining_data_block == 0) {
    1593             :                                 /* skip metadata */
    1594          34 :                                 rdma_req->offset += md_size;
    1595          34 :                                 total_length -= md_size;
    1596             :                                 /* Metadata that do not fit this IO buffer will be included in the next IO buffer */
    1597          34 :                                 remaining -= spdk_min(remaining, md_size);
    1598          34 :                                 remaining_data_block = data_block_size;
    1599          34 :                         }
    1600             : 
    1601          38 :                         if (remaining == 0) {
    1602             :                                 /* By subtracting the size of the last IOV from the offset, we ensure that we skip
    1603             :                                    the remaining metadata bits at the beginning of the next buffer */
    1604          15 :                                 rdma_req->offset -= spdk_min(iov->iov_len, rdma_req->offset);
    1605          15 :                                 rdma_req->iovpos++;
    1606          15 :                         }
    1607             :                 }
    1608             :         }
    1609             : 
    1610          10 :         if (spdk_unlikely(total_length)) {
    1611           0 :                 SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
    1612           0 :                 return -EINVAL;
    1613             :         }
    1614             : 
    1615          10 :         return 0;
    1616          10 : }
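/*
 * Illustrative DIF layout (hypothetical 520-byte extended blocks with 8 bytes
 * of metadata): data_block_size is 512, so each SGE is capped at the bytes
 * left in the current data block. When a block's data has been fully
 * described, the 8 metadata bytes that follow it in the buffer are skipped by
 * advancing rdma_req->offset and shrinking total_length. With stripped_data
 * buffers, md_size is treated as 0 and total_length is rescaled to data bytes
 * only, since only the data portion is transferred in that case.
 */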
    1617             : 
    1618             : static inline uint32_t
    1619           8 : nvmf_rdma_calc_num_wrs(uint32_t length, uint32_t io_unit_size, uint32_t block_size)
    1620             : {
    1621             :         /* estimate the number of SG entries and WRs needed to process the request */
    1622           8 :         uint32_t num_sge = 0;
    1623           8 :         uint32_t i;
    1624           8 :         uint32_t num_buffers = SPDK_CEIL_DIV(length, io_unit_size);
    1625             : 
    1626          23 :         for (i = 0; i < num_buffers && length > 0; i++) {
    1627          15 :                 uint32_t buffer_len = spdk_min(length, io_unit_size);
    1628          15 :                 uint32_t num_sge_in_block = SPDK_CEIL_DIV(buffer_len, block_size);
    1629             : 
    1630          15 :                 if (num_sge_in_block * block_size > buffer_len) {
    1631          11 :                         ++num_sge_in_block;
    1632          11 :                 }
    1633          15 :                 num_sge += num_sge_in_block;
    1634          15 :                 length -= buffer_len;
    1635          15 :         }
    1636           8 :         return SPDK_CEIL_DIV(num_sge, SPDK_NVMF_MAX_SGL_ENTRIES);
    1637           8 : }
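/*
 * Worked estimate (hypothetical geometry, and assuming SPDK_NVMF_MAX_SGL_ENTRIES
 * is 16): io_unit_size 8192, extended block_size 520, length 16384 gives two
 * buffers; each needs ceil(8192 / 520) == 16 SGEs, bumped to 17 because
 * 16 * 520 overshoots the buffer, for 34 SGEs total and therefore
 * ceil(34 / 16) == 3 WRs.
 */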
    1638             : 
    1639             : static int
    1640          16 : nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
    1641             :                             struct spdk_nvmf_rdma_device *device,
    1642             :                             struct spdk_nvmf_rdma_request *rdma_req)
    1643             : {
    1644          16 :         struct spdk_nvmf_rdma_qpair             *rqpair;
    1645          16 :         struct spdk_nvmf_rdma_poll_group        *rgroup;
    1646          16 :         struct spdk_nvmf_request                *req = &rdma_req->req;
    1647          16 :         struct ibv_send_wr                      *wr = &rdma_req->data.wr;
    1648          16 :         int                                     rc;
    1649          16 :         uint32_t                                num_wrs = 1;
    1650          16 :         uint32_t                                length;
    1651             : 
    1652          16 :         rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair);
    1653          16 :         rgroup = rqpair->poller->group;
    1654             : 
    1655             :         /* rdma wr specifics */
    1656          16 :         nvmf_rdma_setup_request(rdma_req);
    1657             : 
    1658          16 :         length = req->length;
    1659          16 :         if (spdk_unlikely(req->dif_enabled)) {
    1660           8 :                 req->dif.orig_length = length;
    1661           8 :                 length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx);
    1662           8 :                 req->dif.elba_length = length;
    1663           8 :         }
    1664             : 
    1665          16 :         rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport,
    1666          16 :                                            length);
    1667          16 :         if (spdk_unlikely(rc != 0)) {
    1668           1 :                 return rc;
    1669             :         }
    1670             : 
    1671          15 :         assert(req->iovcnt <= rqpair->max_send_sge);
    1672             : 
     1673             :         /* When dif_insert_or_strip is true and the I/O data length is greater than one block,
     1674             :          * the stripped_buffers are obtained for DIF stripping. */
    1675          15 :         if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST)
    1676             :                           && (req->dif.elba_length > req->dif.dif_ctx.block_size))) {
    1677          14 :                 rc = nvmf_request_get_stripped_buffers(req, &rgroup->group,
    1678           7 :                                                        &rtransport->transport, req->dif.orig_length);
    1679           7 :                 if (rc != 0) {
    1680           4 :                         SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc);
    1681           4 :                 }
    1682           7 :         }
    1683             : 
    1684          15 :         rdma_req->iovpos = 0;
    1685             : 
    1686          15 :         if (spdk_unlikely(req->dif_enabled)) {
    1687          16 :                 num_wrs = nvmf_rdma_calc_num_wrs(length, rtransport->transport.opts.io_unit_size,
    1688           8 :                                                  req->dif.dif_ctx.block_size);
    1689           8 :                 if (num_wrs > 1) {
    1690           1 :                         rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_wrs - 1);
    1691           1 :                         if (spdk_unlikely(rc != 0)) {
    1692           0 :                                 goto err_exit;
    1693             :                         }
    1694           1 :                 }
    1695             : 
    1696           8 :                 rc = nvmf_rdma_fill_wr_sgl_with_dif(device, rdma_req, wr, length, num_wrs - 1);
    1697           8 :                 if (spdk_unlikely(rc != 0)) {
    1698           0 :                         goto err_exit;
    1699             :                 }
    1700             : 
    1701           8 :                 if (num_wrs > 1) {
    1702           1 :                         nvmf_rdma_update_remote_addr(rdma_req, num_wrs);
    1703           1 :                 }
    1704           8 :         } else {
    1705           7 :                 rc = nvmf_rdma_fill_wr_sgl(device, rdma_req, wr, length);
    1706           7 :                 if (spdk_unlikely(rc != 0)) {
    1707           0 :                         goto err_exit;
    1708             :                 }
    1709             :         }
    1710             : 
    1711             :         /* set the number of outstanding data WRs for this request. */
    1712          15 :         rdma_req->num_outstanding_data_wr = num_wrs;
    1713             : 
    1714          15 :         return rc;
    1715             : 
    1716             : err_exit:
    1717           0 :         spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport);
    1718           0 :         nvmf_rdma_request_free_data(rdma_req, rtransport);
    1719           0 :         req->iovcnt = 0;
    1720           0 :         return rc;
    1721          16 : }
    1722             : 
    1723             : static int
    1724           5 : nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport,
    1725             :                                       struct spdk_nvmf_rdma_device *device,
    1726             :                                       struct spdk_nvmf_rdma_request *rdma_req)
    1727             : {
    1728           5 :         struct spdk_nvmf_rdma_qpair             *rqpair;
    1729           5 :         struct spdk_nvmf_rdma_poll_group        *rgroup;
    1730           5 :         struct ibv_send_wr                      *current_wr;
    1731           5 :         struct spdk_nvmf_request                *req = &rdma_req->req;
    1732           5 :         struct spdk_nvme_sgl_descriptor         *inline_segment, *desc;
    1733           5 :         uint32_t                                num_sgl_descriptors;
    1734           5 :         uint32_t                                lengths[SPDK_NVMF_MAX_SGL_ENTRIES], total_length = 0;
    1735           5 :         uint32_t                                i;
    1736           5 :         int                                     rc;
    1737             : 
    1738           5 :         rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
    1739           5 :         rgroup = rqpair->poller->group;
    1740             : 
    1741           5 :         inline_segment = &req->cmd->nvme_cmd.dptr.sgl1;
    1742           5 :         assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT);
    1743           5 :         assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET);
    1744             : 
    1745           5 :         num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor);
    1746           5 :         assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES);
    1747             : 
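                     :         /* First pass over the descriptors: record each length (extended for DIF
                     :          * metadata when enabled) and accumulate the total transfer length. */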
    1748           5 :         desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
    1749          15 :         for (i = 0; i < num_sgl_descriptors; i++) {
    1750          10 :                 if (spdk_likely(!req->dif_enabled)) {
    1751           8 :                         lengths[i] = desc->keyed.length;
    1752           8 :                 } else {
    1753           2 :                         req->dif.orig_length += desc->keyed.length;
    1754           2 :                         lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx);
    1755           2 :                         req->dif.elba_length += lengths[i];
    1756             :                 }
    1757          10 :                 total_length += lengths[i];
    1758          10 :                 desc++;
    1759          10 :         }
    1760             : 
    1761           5 :         if (spdk_unlikely(total_length > rtransport->transport.opts.max_io_size)) {
    1762           0 :                 SPDK_ERRLOG("Multi SGL length 0x%x exceeds max io size 0x%x\n",
    1763             :                             total_length, rtransport->transport.opts.max_io_size);
    1764           0 :                 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
    1765           0 :                 return -EINVAL;
    1766             :         }
    1767             : 
    1768           5 :         rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1);
    1769           5 :         if (spdk_unlikely(rc != 0)) {
    1770           0 :                 return -ENOMEM;
    1771             :         }
    1772             : 
    1773           5 :         rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, total_length);
    1774           5 :         if (spdk_unlikely(rc != 0)) {
    1775           0 :                 nvmf_rdma_request_free_data(rdma_req, rtransport);
    1776           0 :                 return rc;
    1777             :         }
    1778             : 
    1779             :         /* When dif_insert_or_strip is true and the I/O data length is greater than one block,
    1780             :          * stripped buffers are obtained for DIF stripping. */
    1781           5 :         if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST)
    1782             :                           && (req->dif.elba_length > req->dif.dif_ctx.block_size))) {
    1783           2 :                 rc = nvmf_request_get_stripped_buffers(req, &rgroup->group,
    1784           1 :                                                        &rtransport->transport, req->dif.orig_length);
    1785           1 :                 if (spdk_unlikely(rc != 0)) {
    1786           0 :                         SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc);
    1787           0 :                 }
    1788           1 :         }
    1789             : 
    1790             :         /* The first WR must always be the embedded data WR. This is how we unwind them later. */
    1791           5 :         current_wr = &rdma_req->data.wr;
    1792           5 :         assert(current_wr != NULL);
    1793             : 
    1794           5 :         req->length = 0;
    1795           5 :         rdma_req->iovpos = 0;
    1796           5 :         desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
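                     :         /* Second pass: fill each data WR's SGL and take the remote address and
                     :          * rkey from the corresponding keyed descriptor. */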
    1797          15 :         for (i = 0; i < num_sgl_descriptors; i++) {
    1798             :                 /* The descriptors must be keyed data block descriptors with an address, not an offset. */
    1799          10 :                 if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK ||
    1800             :                                   desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) {
    1801           0 :                         rc = -EINVAL;
    1802           0 :                         goto err_exit;
    1803             :                 }
    1804             : 
    1805          10 :                 if (spdk_likely(!req->dif_enabled)) {
    1806           8 :                         rc = nvmf_rdma_fill_wr_sgl(device, rdma_req, current_wr, lengths[i]);
    1807           8 :                 } else {
    1808           4 :                         rc = nvmf_rdma_fill_wr_sgl_with_dif(device, rdma_req, current_wr,
    1809           2 :                                                             lengths[i], 0);
    1810             :                 }
    1811          10 :                 if (spdk_unlikely(rc != 0)) {
    1812           0 :                         rc = -ENOMEM;
    1813           0 :                         goto err_exit;
    1814             :                 }
    1815             : 
    1816          10 :                 req->length += desc->keyed.length;
    1817          10 :                 current_wr->wr.rdma.rkey = desc->keyed.key;
    1818          10 :                 current_wr->wr.rdma.remote_addr = desc->address;
    1819          10 :                 current_wr = current_wr->next;
    1820          10 :                 desc++;
    1821          10 :         }
    1822             : 
    1823             : #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
    1824             :         /* Go back to the last descriptor in the list. */
    1825           5 :         desc--;
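                     :         /* If the device supports memory management extensions and the last descriptor
                     :          * requests key invalidation, send the completion with IBV_WR_SEND_WITH_INV
                     :          * carrying the rkey to be invalidated. */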
    1826           5 :         if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
    1827           0 :                 if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
    1828           0 :                         rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
    1829           0 :                         rdma_req->rsp.wr.imm_data = desc->keyed.key;
    1830           0 :                 }
    1831           0 :         }
    1832             : #endif
    1833             : 
    1834           5 :         rdma_req->num_outstanding_data_wr = num_sgl_descriptors;
    1835             : 
    1836           5 :         return 0;
    1837             : 
    1838             : err_exit:
    1839           0 :         spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport);
    1840           0 :         nvmf_rdma_request_free_data(rdma_req, rtransport);
    1841           0 :         return rc;
    1842           5 : }
    1843             : 
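                     : /* Parse the command's SGL1 descriptor: keyed data blocks are set up for RDMA
                     :  * transfers, offset data blocks use in-capsule data, and last-segment descriptors
                     :  * are handled as multi-SGL requests. */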
    1844             : static int
    1845          25 : nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
    1846             :                             struct spdk_nvmf_rdma_device *device,
    1847             :                             struct spdk_nvmf_rdma_request *rdma_req)
    1848             : {
    1849          25 :         struct spdk_nvmf_request                *req = &rdma_req->req;
    1850          25 :         struct spdk_nvme_cpl                    *rsp;
    1851          25 :         struct spdk_nvme_sgl_descriptor         *sgl;
    1852          25 :         int                                     rc;
    1853          25 :         uint32_t                                length;
    1854             : 
    1855          25 :         rsp = &req->rsp->nvme_cpl;
    1856          25 :         sgl = &req->cmd->nvme_cmd.dptr.sgl1;
    1857             : 
    1858          25 :         if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
    1859          17 :             (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
    1860           0 :              sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
    1861             : 
    1862          17 :                 length = sgl->keyed.length;
    1863          17 :                 if (spdk_unlikely(length > rtransport->transport.opts.max_io_size)) {
    1864           1 :                         SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
    1865             :                                     length, rtransport->transport.opts.max_io_size);
    1866           1 :                         rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
    1867           1 :                         return -1;
    1868             :                 }
    1869             : #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
    1870          16 :                 if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
    1871           0 :                         if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
    1872           0 :                                 rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
    1873           0 :                                 rdma_req->rsp.wr.imm_data = sgl->keyed.key;
    1874           0 :                         }
    1875           0 :                 }
    1876             : #endif
    1877             : 
    1878             :                 /* fill request length and populate iovs */
    1879          16 :                 req->length = length;
    1880             : 
    1881          16 :                 rc = nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req);
    1882          16 :                 if (spdk_unlikely(rc < 0)) {
    1883           1 :                         if (rc == -EINVAL) {
    1884           0 :                                 SPDK_ERRLOG("SGL length exceeds the max I/O size\n");
    1885           0 :                                 rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
    1886           0 :                                 return -1;
    1887             :                         }
    1888             :                         /* No available buffers. Queue this request up. */
    1889           1 :                         SPDK_DEBUGLOG(rdma, "No available large data buffers. Queueing request %p\n", rdma_req);
    1890           1 :                         return 0;
    1891             :                 }
    1892             : 
    1893          15 :                 SPDK_DEBUGLOG(rdma, "Request %p took %d buffer/s from central pool\n", rdma_req,
    1894             :                               req->iovcnt);
    1895             : 
    1896          15 :                 return 0;
    1897          11 :         } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
    1898           3 :                    sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
    1899           3 :                 uint64_t offset = sgl->address;
    1900           3 :                 uint32_t max_len = rtransport->transport.opts.in_capsule_data_size;
    1901             : 
    1902           3 :                 SPDK_DEBUGLOG(nvmf, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
    1903             :                               offset, sgl->unkeyed.length);
    1904             : 
    1905           3 :                 if (spdk_unlikely(offset > max_len)) {
    1906           0 :                         SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
    1907             :                                     offset, max_len);
    1908           0 :                         rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
    1909           0 :                         return -1;
    1910             :                 }
    1911           3 :                 max_len -= (uint32_t)offset;
    1912             : 
    1913           3 :                 if (spdk_unlikely(sgl->unkeyed.length > max_len)) {
    1914           2 :                         SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
    1915             :                                     sgl->unkeyed.length, max_len);
    1916           2 :                         rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
    1917           2 :                         return -1;
    1918             :                 }
    1919             : 
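                     :                 /* In-capsule data already resides in the receive buffer, so no RDMA
                     :                  * data transfer is needed; point the single iovec directly at it. */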
    1920           1 :                 rdma_req->num_outstanding_data_wr = 0;
    1921           1 :                 req->data_from_pool = false;
    1922           1 :                 req->length = sgl->unkeyed.length;
    1923             : 
    1924           1 :                 req->iov[0].iov_base = rdma_req->recv->buf + offset;
    1925           1 :                 req->iov[0].iov_len = req->length;
    1926           1 :                 req->iovcnt = 1;
    1927             : 
    1928           1 :                 return 0;
    1929          13 :         } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT &&
    1930           5 :                    sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
    1931             : 
    1932           5 :                 rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req);
    1933           5 :                 if (spdk_unlikely(rc == -ENOMEM)) {
    1934           0 :                         SPDK_DEBUGLOG(rdma, "No available large data buffers. Queueing request %p\n", rdma_req);
    1935           0 :                         return 0;
    1936           5 :                 } else if (spdk_unlikely(rc == -EINVAL)) {
    1937           0 :                         SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n");
    1938           0 :                         rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
    1939           0 :                         return -1;
    1940             :                 }
    1941             : 
    1942           5 :                 SPDK_DEBUGLOG(rdma, "Request %p took %d buffer/s from central pool\n", rdma_req,
    1943             :                               req->iovcnt);
    1944             : 
    1945           5 :                 return 0;
    1946             :         }
    1947             : 
    1948           0 :         SPDK_ERRLOG("Invalid NVMf I/O Command SGL:  Type 0x%x, Subtype 0x%x\n",
    1949             :                     sgl->generic.type, sgl->generic.subtype);
    1950           0 :         rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
    1951           0 :         return -1;
    1952          25 : }
    1953             : 
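                     : /* Release the request's pool and stripped buffers and extra data WRs, reset its
                     :  * per-I/O state, fail any orphaned fused pair, and return it to the free queue. */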
    1954             : static void
    1955           6 : _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
    1956             :                         struct spdk_nvmf_rdma_transport *rtransport)
    1957             : {
    1958           6 :         struct spdk_nvmf_rdma_qpair             *rqpair;
    1959           6 :         struct spdk_nvmf_rdma_poll_group        *rgroup;
    1960             : 
    1961           6 :         rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
    1962           6 :         if (rdma_req->req.data_from_pool) {
    1963           5 :                 rgroup = rqpair->poller->group;
    1964             : 
    1965           5 :                 spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport);
    1966           5 :         }
    1967           6 :         if (rdma_req->req.stripped_data) {
    1968           0 :                 nvmf_request_free_stripped_buffers(&rdma_req->req,
    1969           0 :                                                    &rqpair->poller->group->group,
    1970           0 :                                                    &rtransport->transport);
    1971           0 :         }
    1972           6 :         nvmf_rdma_request_free_data(rdma_req, rtransport);
    1973           6 :         rdma_req->req.length = 0;
    1974           6 :         rdma_req->req.iovcnt = 0;
    1975           6 :         rdma_req->offset = 0;
    1976           6 :         rdma_req->req.dif_enabled = false;
    1977           6 :         rdma_req->fused_failed = false;
    1978           6 :         rdma_req->transfer_wr = NULL;
    1979           6 :         if (rdma_req->fused_pair) {
    1980             :                 /* This req was part of a valid fused pair, but failed before it got to
    1981             :                  * READY_TO_EXECUTE state.  This means we need to fail the other request
    1982             :                  * in the pair, because it is no longer part of a valid pair.  If the pair
    1983             :                  * already reached READY_TO_EXECUTE state, we need to kick it.
    1984             :                  */
    1985           0 :                 rdma_req->fused_pair->fused_failed = true;
    1986           0 :                 if (rdma_req->fused_pair->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
    1987           0 :                         nvmf_rdma_request_process(rtransport, rdma_req->fused_pair);
    1988           0 :                 }
    1989           0 :                 rdma_req->fused_pair = NULL;
    1990           0 :         }
    1991           6 :         memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif));
    1992           6 :         rqpair->qd--;
    1993             : 
    1994           6 :         STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link);
    1995           6 :         rqpair->qpair.queue_depth--;
    1996           6 :         rdma_req->state = RDMA_REQUEST_STATE_FREE;
    1997           6 : }
    1998             : 
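                     : /* Track fused command ordering on the queue pair: pair a FUSE_FIRST with the
                     :  * following FUSE_SECOND and mark requests that break the ordering as failed. */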
    1999             : static void
    2000           6 : nvmf_rdma_check_fused_ordering(struct spdk_nvmf_rdma_transport *rtransport,
    2001             :                                struct spdk_nvmf_rdma_qpair *rqpair,
    2002             :                                struct spdk_nvmf_rdma_request *rdma_req)
    2003             : {
    2004           6 :         enum spdk_nvme_cmd_fuse last, next;
    2005             : 
    2006           6 :         last = rqpair->fused_first ? rqpair->fused_first->req.cmd->nvme_cmd.fuse : SPDK_NVME_CMD_FUSE_NONE;
    2007           6 :         next = rdma_req->req.cmd->nvme_cmd.fuse;
    2008             : 
    2009           6 :         assert(last != SPDK_NVME_CMD_FUSE_SECOND);
    2010             : 
    2011           6 :         if (spdk_likely(last == SPDK_NVME_CMD_FUSE_NONE && next == SPDK_NVME_CMD_FUSE_NONE)) {
    2012           6 :                 return;
    2013             :         }
    2014             : 
    2015           0 :         if (last == SPDK_NVME_CMD_FUSE_FIRST) {
    2016           0 :                 if (next == SPDK_NVME_CMD_FUSE_SECOND) {
    2017             :                         /* This is a valid pair of fused commands.  Point them at each other
    2018             :                          * so they can be submitted consecutively once ready to be executed.
    2019             :                          */
    2020           0 :                         rqpair->fused_first->fused_pair = rdma_req;
    2021           0 :                         rdma_req->fused_pair = rqpair->fused_first;
    2022           0 :                         rqpair->fused_first = NULL;
    2023           0 :                         return;
    2024             :                 } else {
    2025             :                         /* Mark the last req as failed since it wasn't followed by a SECOND. */
    2026           0 :                         rqpair->fused_first->fused_failed = true;
    2027             : 
    2028             :                         /* If the last req is in READY_TO_EXECUTE state, then call
    2029             :                          * nvmf_rdma_request_process(), otherwise nothing else will kick it.
    2030             :                          */
    2031           0 :                         if (rqpair->fused_first->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
    2032           0 :                                 nvmf_rdma_request_process(rtransport, rqpair->fused_first);
    2033           0 :                         }
    2034             : 
    2035           0 :                         rqpair->fused_first = NULL;
    2036             :                 }
    2037           0 :         }
    2038             : 
    2039           0 :         if (next == SPDK_NVME_CMD_FUSE_FIRST) {
    2040             :                 /* Set rqpair->fused_first here so that we know to check that the next request
    2041             :                  * is a SECOND (and to fail this one if it isn't).
    2042             :                  */
    2043           0 :                 rqpair->fused_first = rdma_req;
    2044           0 :         } else if (next == SPDK_NVME_CMD_FUSE_SECOND) {
    2045             :                 /* Mark this req failed since it is a SECOND and the last one was not a FIRST. */
    2046           0 :                 rdma_req->fused_failed = true;
    2047           0 :         }
    2048           6 : }
    2049             : 
    2050             : static void
    2051           5 : nvmf_rdma_poll_group_insert_need_buffer_req(struct spdk_nvmf_rdma_poll_group *rgroup,
    2052             :                 struct spdk_nvmf_rdma_request *rdma_req)
    2053             : {
    2054           5 :         struct spdk_nvmf_request *r;
    2055             : 
    2056             :         /* CONNECT commands have a timeout, so we need to keep a CONNECT command
    2057             :          * from getting buried behind a long list of other non-FABRIC requests
    2058             :          * waiting for a buffer. Note that even though the CONNECT command's data is
    2059             :          * in-capsule, the request still goes to this STAILQ.
    2060             :          */
    2061           5 :         if (spdk_likely(rdma_req->req.cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC)) {
    2062             :                 /* This is the most likely case. */
    2063           5 :                 STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link);
    2064           5 :                 return;
    2065             :         } else {
    2066             :                 /* STAILQ doesn't have INSERT_BEFORE, so we need to either INSERT_HEAD
    2067             :                  * or INSERT_AFTER. Put it after any other FABRIC commands that are
    2068             :                  * already in the queue.
    2069             :                  */
    2070           0 :                 r = STAILQ_FIRST(&rgroup->group.pending_buf_queue);
    2071           0 :                 if (r == NULL || r->cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC) {
    2072           0 :                         STAILQ_INSERT_HEAD(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link);
    2073           0 :                         return;
    2074             :                 }
    2075           0 :                 while (true) {
    2076           0 :                         struct spdk_nvmf_request *next;
    2077             : 
    2078           0 :                         next = STAILQ_NEXT(r, buf_link);
    2079           0 :                         if (next == NULL || next->cmd->nvme_cmd.opc != SPDK_NVME_OPC_FABRIC) {
    2080           0 :                                 STAILQ_INSERT_AFTER(&rgroup->group.pending_buf_queue, r, &rdma_req->req, buf_link);
    2081           0 :                                 return;
    2082             :                         }
    2083           0 :                         r = next;
    2084           0 :                 }
    2085             :         }
    2086           5 : }
    2087             : 
    2088             : bool
    2089          23 : nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
    2090             :                           struct spdk_nvmf_rdma_request *rdma_req)
    2091             : {
    2092          23 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    2093          23 :         struct spdk_nvmf_rdma_device    *device;
    2094          23 :         struct spdk_nvmf_rdma_poll_group *rgroup;
    2095          23 :         struct spdk_nvme_cpl            *rsp = &rdma_req->req.rsp->nvme_cpl;
    2096          23 :         int                             rc;
    2097          23 :         struct spdk_nvmf_rdma_recv      *rdma_recv;
    2098          23 :         enum spdk_nvmf_rdma_request_state prev_state;
    2099          23 :         bool                            progress = false;
    2100          23 :         int                             data_posted;
    2101          23 :         uint32_t                        num_blocks, num_rdma_reads_available, qdepth;
    2102             : 
    2103          23 :         rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
    2104          23 :         device = rqpair->device;
    2105          23 :         rgroup = rqpair->poller->group;
    2106             : 
    2107          23 :         assert(rdma_req->state != RDMA_REQUEST_STATE_FREE);
    2108             : 
    2109             :         /* If the queue pair is in an error state, force the request to the completed state
    2110             :          * to release resources. */
    2111          23 :         if (spdk_unlikely(rqpair->ibv_in_error_state || !spdk_nvmf_qpair_is_active(&rqpair->qpair))) {
    2112           0 :                 switch (rdma_req->state) {
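                     :                 /* Unlink the request from whichever pending queue it is currently on. */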
    2113             :                 case RDMA_REQUEST_STATE_NEED_BUFFER:
    2114           0 :                         STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link);
    2115           0 :                         break;
    2116             :                 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
    2117           0 :                         STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
    2118           0 :                         break;
    2119             :                 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
    2120           0 :                         if (rdma_req->num_remaining_data_wr) {
    2121             :                                 /* Partially sent request is still in the pending_rdma_read_queue,
    2122             :                                  * remove it before completing */
    2123           0 :                                 rdma_req->num_remaining_data_wr = 0;
    2124           0 :                                 STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
    2125           0 :                         }
    2126           0 :                         break;
    2127             :                 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
    2128           0 :                         STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
    2129           0 :                         break;
    2130             :                 case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING:
    2131           0 :                         STAILQ_REMOVE(&rqpair->pending_rdma_send_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
    2132           0 :                         break;
    2133             :                 default:
    2134           0 :                         break;
    2135             :                 }
    2136           0 :                 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    2137           0 :         }
    2138             : 
    2139             :         /* The loop here is to allow for several back-to-back state changes. */
    2140          23 :         do {
    2141          66 :                 prev_state = rdma_req->state;
    2142             : 
    2143          66 :                 SPDK_DEBUGLOG(rdma, "Request %p entering state %d\n", rdma_req, prev_state);
    2144             : 
    2145          66 :                 switch (rdma_req->state) {
    2146             :                 case RDMA_REQUEST_STATE_FREE:
    2147             :                         /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW
    2148             :                          * to escape this state. */
    2149           6 :                         break;
    2150             :                 case RDMA_REQUEST_STATE_NEW:
    2151           6 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0,
    2152             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair, rqpair->qpair.queue_depth);
    2153           6 :                         rdma_recv = rdma_req->recv;
    2154             : 
    2155             :                         /* The first element of the SGL is the NVMe command */
    2156           6 :                         rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;
    2157           6 :                         memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp));
    2158           6 :                         rdma_req->transfer_wr = &rdma_req->data.wr;
    2159             : 
    2160           6 :                         if (spdk_unlikely(rqpair->ibv_in_error_state || !spdk_nvmf_qpair_is_active(&rqpair->qpair))) {
    2161           0 :                                 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    2162           0 :                                 break;
    2163             :                         }
    2164             : 
    2165           6 :                         if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->req.dif.dif_ctx))) {
    2166           0 :                                 rdma_req->req.dif_enabled = true;
    2167           0 :                         }
    2168             : 
    2169           6 :                         nvmf_rdma_check_fused_ordering(rtransport, rqpair, rdma_req);
    2170             : 
    2171             : #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
    2172           6 :                         rdma_req->rsp.wr.opcode = IBV_WR_SEND;
    2173           6 :                         rdma_req->rsp.wr.imm_data = 0;
    2174             : #endif
    2175             : 
    2176             :                         /* The next state transition depends on the data transfer needs of this request. */
    2177           6 :                         rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req);
    2178             : 
    2179           6 :                         if (spdk_unlikely(rdma_req->req.xfer == SPDK_NVME_DATA_BIDIRECTIONAL)) {
    2180           1 :                                 rsp->status.sct = SPDK_NVME_SCT_GENERIC;
    2181           1 :                                 rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE;
    2182           1 :                                 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
    2183           1 :                                 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
    2184           1 :                                 SPDK_DEBUGLOG(rdma, "Request %p: invalid xfer type (BIDIRECTIONAL)\n", rdma_req);
    2185           1 :                                 break;
    2186             :                         }
    2187             : 
    2188             :                         /* If no data to transfer, ready to execute. */
    2189           5 :                         if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) {
    2190           0 :                                 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
    2191           0 :                                 break;
    2192             :                         }
    2193             : 
    2194           5 :                         rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER;
    2195           5 :                         nvmf_rdma_poll_group_insert_need_buffer_req(rgroup, rdma_req);
    2196           5 :                         break;
    2197             :                 case RDMA_REQUEST_STATE_NEED_BUFFER:
    2198           5 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0,
    2199             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2200             : 
    2201           5 :                         assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE);
    2202             : 
    2203           5 :                         if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) {
    2204             :                                 /* This request needs to wait in line to obtain a buffer */
    2205           0 :                                 break;
    2206             :                         }
    2207             : 
    2208             :                         /* Try to get a data buffer */
    2209           5 :                         rc = nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req);
    2210           5 :                         if (spdk_unlikely(rc < 0)) {
    2211           0 :                                 STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link);
    2212           0 :                                 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
    2213           0 :                                 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
    2214           0 :                                 break;
    2215             :                         }
    2216             : 
    2217           5 :                         if (rdma_req->req.iovcnt == 0) {
    2218             :                                 /* No buffers available. */
    2219           0 :                                 rgroup->stat.pending_data_buffer++;
    2220           0 :                                 break;
    2221             :                         }
    2222             : 
    2223           5 :                         STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link);
    2224             : 
    2225             :                         /* If data is transferring from host to controller and the data didn't
    2226             :                          * arrive as in-capsule data, we need to do a transfer from the host.
    2227             :                          */
    2228           9 :                         if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER &&
    2229           4 :                             rdma_req->req.data_from_pool) {
    2230           4 :                                 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link);
    2231           4 :                                 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING;
    2232           4 :                                 break;
    2233             :                         }
    2234             : 
    2235           1 :                         rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
    2236           1 :                         break;
    2237             :                 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
    2238           4 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0,
    2239             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2240             : 
    2241           4 :                         if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) {
    2242             :                                 /* This request needs to wait in line to perform RDMA */
    2243           0 :                                 break;
    2244             :                         }
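                     :                         /* Compute how many send queue slots and RDMA READ credits are
                     :                          * currently available for this request's data WRs. */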
    2245           4 :                         assert(rqpair->max_send_depth >= rqpair->current_send_depth);
    2246           4 :                         qdepth = rqpair->max_send_depth - rqpair->current_send_depth;
    2247           4 :                         assert(rqpair->max_read_depth >= rqpair->current_read_depth);
    2248           4 :                         num_rdma_reads_available = rqpair->max_read_depth - rqpair->current_read_depth;
    2249           8 :                         if (rdma_req->num_outstanding_data_wr > qdepth ||
    2250           4 :                             rdma_req->num_outstanding_data_wr > num_rdma_reads_available) {
    2251           0 :                                 if (num_rdma_reads_available && qdepth) {
    2252             :                                         /* Send as much as we can */
    2253           0 :                                         request_prepare_transfer_in_part(&rdma_req->req, spdk_min(num_rdma_reads_available, qdepth));
    2254           0 :                                 } else {
    2255             :                                         /* We can only have so many WRs outstanding; we have to wait until some finish. */
    2256           0 :                                         rqpair->poller->stat.pending_rdma_read++;
    2257           0 :                                         break;
    2258             :                                 }
    2259           0 :                         }
    2260             : 
    2261             :                         /* We have already verified that this request is the head of the queue. */
    2262           4 :                         if (rdma_req->num_remaining_data_wr == 0) {
    2263           4 :                                 STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link);
    2264           4 :                         }
    2265             : 
    2266           4 :                         request_transfer_in(&rdma_req->req);
    2267           4 :                         rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
    2268             : 
    2269           4 :                         break;
    2270             :                 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
    2271           4 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0,
    2272             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2273             :                         /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE
    2274             :                          * to escape this state. */
    2275           4 :                         break;
    2276             :                 case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
    2277           5 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0,
    2278             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2279             : 
    2280           5 :                         if (spdk_unlikely(rdma_req->req.dif_enabled)) {
    2281           0 :                                 if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
    2282             :                                         /* generate DIF for write operation */
    2283           0 :                                         num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size);
    2284           0 :                                         assert(num_blocks > 0);
    2285             : 
    2286           0 :                                         rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt,
    2287           0 :                                                                num_blocks, &rdma_req->req.dif.dif_ctx);
    2288           0 :                                         if (rc != 0) {
    2289           0 :                                                 SPDK_ERRLOG("DIF generation failed\n");
    2290           0 :                                                 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    2291           0 :                                                 spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    2292           0 :                                                 break;
    2293             :                                         }
    2294           0 :                                 }
    2295             : 
    2296           0 :                                 assert(rdma_req->req.dif.elba_length >= rdma_req->req.length);
    2297             :                                 /* set extended length before IO operation */
    2298           0 :                                 rdma_req->req.length = rdma_req->req.dif.elba_length;
    2299           0 :                         }
    2300             : 
    2301           5 :                         if (rdma_req->req.cmd->nvme_cmd.fuse != SPDK_NVME_CMD_FUSE_NONE) {
    2302           0 :                                 if (rdma_req->fused_failed) {
    2303             :                                         /* This request failed FUSED semantics.  Fail it immediately, without
    2304             :                                          * even sending it to the target layer.
    2305             :                                          */
    2306           0 :                                         rsp->status.sct = SPDK_NVME_SCT_GENERIC;
    2307           0 :                                         rsp->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED;
    2308           0 :                                         STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
    2309           0 :                                         rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
    2310           0 :                                         break;
    2311             :                                 }
    2312             : 
    2313           0 :                                 if (rdma_req->fused_pair == NULL ||
    2314           0 :                                     rdma_req->fused_pair->state != RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
    2315             :                                 /* This request is ready to execute, but either we don't know yet if it's
    2316             :                                  * valid (i.e. this is a FIRST but we haven't received the next request yet),
    2317             :                                  * or the other request of this fused pair isn't ready to execute.  So break
    2318             :                                  * here; this request will get processed later, either when the other request
    2319             :                                  * is ready or when we find that this request isn't valid.
    2320             :                                          */
    2321           0 :                                         break;
    2322             :                                 }
    2323           0 :                         }
    2324             : 
    2325             :                         /* If we get to this point, and this request is a fused command, we know that
    2326             :                          * it is part of a valid sequence (FIRST followed by a SECOND) and that both
    2327             :                          * requests are READY_TO_EXECUTE. So call spdk_nvmf_request_exec() both on this
    2328             :                          * request, and the other request of the fused pair, in the correct order.
    2329             :                          * Also clear the ->fused_pair pointers on both requests, since after this point
    2330             :                          * we no longer need to maintain the relationship between these two requests.
    2331             :                          */
    2332           5 :                         if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_SECOND) {
    2333           0 :                                 assert(rdma_req->fused_pair != NULL);
    2334           0 :                                 assert(rdma_req->fused_pair->fused_pair != NULL);
    2335           0 :                                 rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING;
    2336           0 :                                 spdk_nvmf_request_exec(&rdma_req->fused_pair->req);
    2337           0 :                                 rdma_req->fused_pair->fused_pair = NULL;
    2338           0 :                                 rdma_req->fused_pair = NULL;
    2339           0 :                         }
    2340           5 :                         rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
    2341           5 :                         spdk_nvmf_request_exec(&rdma_req->req);
    2342           5 :                         if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_FIRST) {
    2343           0 :                                 assert(rdma_req->fused_pair != NULL);
    2344           0 :                                 assert(rdma_req->fused_pair->fused_pair != NULL);
    2345           0 :                                 rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING;
    2346           0 :                                 spdk_nvmf_request_exec(&rdma_req->fused_pair->req);
    2347           0 :                                 rdma_req->fused_pair->fused_pair = NULL;
    2348           0 :                                 rdma_req->fused_pair = NULL;
    2349           0 :                         }
    2350           5 :                         break;
    2351             :                 case RDMA_REQUEST_STATE_EXECUTING:
    2352           5 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0,
    2353             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2354             :                         /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED
    2355             :                          * to escape this state. */
    2356           5 :                         break;
    2357             :                 case RDMA_REQUEST_STATE_EXECUTED:
    2358           5 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0,
    2359             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2360          10 :                         if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
    2361           5 :                             rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
    2362           1 :                                 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link);
    2363           1 :                                 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING;
    2364           1 :                         } else {
    2365           4 :                                 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
    2366           4 :                                 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
    2367             :                         }
    2368           5 :                         if (spdk_unlikely(rdma_req->req.dif_enabled)) {
    2369             :                                 /* restore the original length */
    2370           0 :                                 rdma_req->req.length = rdma_req->req.dif.orig_length;
    2371             : 
    2372           0 :                                 if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
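                     :                                         /* Verify the DIF metadata of controller-to-host data before it is
                     :                                          * transferred back to the host. */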
    2373           0 :                                         struct spdk_dif_error error_blk;
    2374             : 
    2375           0 :                                         num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size);
    2376           0 :                                         if (!rdma_req->req.stripped_data) {
    2377           0 :                                                 rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
    2378           0 :                                                                      &rdma_req->req.dif.dif_ctx, &error_blk);
    2379           0 :                                         } else {
    2380           0 :                                                 rc = spdk_dif_verify_copy(rdma_req->req.stripped_data->iov,
    2381           0 :                                                                           rdma_req->req.stripped_data->iovcnt,
    2382           0 :                                                                           rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
    2383           0 :                                                                           &rdma_req->req.dif.dif_ctx, &error_blk);
    2384             :                                         }
    2385           0 :                                         if (rc) {
    2386           0 :                                                 struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;
    2387             : 
    2388           0 :                                                 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", error_blk.err_type,
    2389             :                                                             error_blk.err_offset);
    2390           0 :                                                 rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR;
    2391           0 :                                                 rsp->status.sc = nvmf_rdma_dif_error_to_compl_status(error_blk.err_type);
    2392           0 :                                                 STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
    2393           0 :                                                 STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req, state_link);
    2394           0 :                                                 rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
    2395           0 :                                         }
    2396           0 :                                 }
    2397           0 :                         }
    2398           5 :                         break;
    2399             :                 case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
    2400           1 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0,
    2401             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2402             : 
    2403           1 :                         if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) {
    2404             :                                 /* This request needs to wait in line to perform RDMA */
    2405           0 :                                 break;
    2406             :                         }
    2407           2 :                         if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) >
    2408           1 :                             rqpair->max_send_depth) {
    2409             :                                 /* We can only have so many WRs outstanding; we have to wait until some finish.
    2410             :                                  * The +1 accounts for each request's additional response WR. */
    2411           0 :                                 rqpair->poller->stat.pending_rdma_write++;
    2412           0 :                                 break;
    2413             :                         }
    2414             : 
    2415             :                         /* We have already verified that this request is the head of the queue. */
    2416           1 :                         STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link);
    2417             : 
    2418             :                         /* The data transfer will be kicked off from
    2419             :                          * RDMA_REQUEST_STATE_READY_TO_COMPLETE state.
    2420             :                          * We verified that the data + response WRs fit into the send queue, so we can go to the next state directly.
    2421             :                          */
    2422           1 :                         rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
    2423           1 :                         break;
    2424             :                 case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING:
    2425           7 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING, 0, 0,
    2426             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2427             : 
    2428           7 :                         if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_send_queue)) {
    2429             :                                 /* This request needs to wait in line to send the completion */
    2430           0 :                                 break;
    2431             :                         }
    2432             : 
    2433           7 :                         assert(rqpair->current_send_depth <= rqpair->max_send_depth);
    2434           7 :                         if (rqpair->current_send_depth == rqpair->max_send_depth) {
    2435             :                                 /* We can only have so many WRs outstanding; we have to wait until some finish. */
    2436           2 :                                 rqpair->poller->stat.pending_rdma_send++;
    2437           2 :                                 break;
    2438             :                         }
    2439             : 
    2440             :                         /* We have already verified that this request is the head of the queue. */
    2441           5 :                         STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_send_queue, state_link);
    2442             : 
    2443             :                         /* The response sending will be kicked off from
    2444             :                          * RDMA_REQUEST_STATE_READY_TO_COMPLETE state.
    2445             :                          */
    2446           5 :                         rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
    2447           5 :                         break;
    2448             :                 case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
    2449           6 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0,
    2450             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2451           6 :                         rc = request_transfer_out(&rdma_req->req, &data_posted);
    2452           6 :                         assert(rc == 0); /* No good way to handle this currently */
    2453           6 :                         if (spdk_unlikely(rc)) {
    2454           0 :                                 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    2455           0 :                         } else {
    2456           6 :                                 rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST :
    2457             :                                                   RDMA_REQUEST_STATE_COMPLETING;
    2458             :                         }
    2459           6 :                         break;
    2460             :                 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
    2461           1 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0,
    2462             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2463             :                         /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
    2464             :                          * to escape this state. */
    2465           1 :                         break;
    2466             :                 case RDMA_REQUEST_STATE_COMPLETING:
    2467           5 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0,
    2468             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair);
    2469             :                         /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
    2470             :                          * to escape this state. */
    2471           5 :                         break;
    2472             :                 case RDMA_REQUEST_STATE_COMPLETED:
    2473           6 :                         spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0,
    2474             :                                           (uintptr_t)rdma_req, (uintptr_t)rqpair, rqpair->qpair.queue_depth);
    2475             : 
    2476           6 :                         rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc;
    2477           6 :                         _nvmf_rdma_request_free(rdma_req, rtransport);
    2478           6 :                         break;
    2479           0 :                 case RDMA_REQUEST_NUM_STATES:
    2480             :                 default:
    2481           0 :                         assert(0);
    2482             :                         break;
    2483             :                 }
    2484             : 
    2485          66 :                 if (rdma_req->state != prev_state) {
    2486          43 :                         progress = true;
    2487          43 :                 }
    2488          66 :         } while (rdma_req->state != prev_state);
    2489             : 
    2490          23 :         return progress;
    2491          23 : }
    2492             : 
    2493             : /* Public API callbacks begin here */
    2494             : 
    2495             : #define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128
    2496             : #define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128
    2497             : #define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096
    2498             : #define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128
    2499             : #define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096
    2500             : #define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072
    2501             : #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES)
    2502             : #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095
    2503             : #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE UINT32_MAX
    2504             : #define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
    2505             : #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
    2506             : #define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100
    2507             : #define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1
    2508             : #define SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING false
    2509             : #define SPDK_NVMF_RDMA_DEFAULT_DATA_WR_POOL_SIZE 4095
    2510             : 
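/* Editor's note (illustrative arithmetic, not part of rdma.c): SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE
 * is the smallest io_unit_size that still lets a maximum-sized I/O be described with a single
 * SGL. Assuming SPDK_NVMF_MAX_SGL_ENTRIES is 16 (its usual value), that works out to
 * 131072 / 16 = 8192 bytes per data buffer.
 */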
    2511             : static void
    2512           1 : nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
    2513             : {
    2514           1 :         opts->max_queue_depth =              SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH;
    2515           1 :         opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR;
    2516           1 :         opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE;
    2517           1 :         opts->max_io_size =          SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE;
    2518           1 :         opts->io_unit_size =         SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE;
    2519           1 :         opts->max_aq_depth =         SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH;
    2520           1 :         opts->num_shared_buffers =   SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS;
    2521           1 :         opts->buf_cache_size =               SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE;
    2522           1 :         opts->dif_insert_or_strip =  SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
    2523           1 :         opts->abort_timeout_sec =    SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC;
    2524           1 :         opts->transport_specific =      NULL;
    2525           1 :         opts->data_wr_pool_size      =       SPDK_NVMF_RDMA_DEFAULT_DATA_WR_POOL_SIZE;
    2526           1 : }
    2527             : 
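/* Illustrative sketch (not part of rdma.c): how an application might pick up the defaults
 * installed by nvmf_rdma_opts_init() and override a field before creating the transport.
 * example_build_rdma_opts() is a hypothetical helper; the transport-creation call itself is
 * omitted. spdk_nvmf_transport_opts_init() dispatches to the transport's .opts_init callback.
 */
#include "spdk/nvmf.h"

static bool
example_build_rdma_opts(struct spdk_nvmf_transport_opts *opts)
{
        /* Fill *opts with the RDMA transport defaults shown above. */
        if (!spdk_nvmf_transport_opts_init("RDMA", opts, sizeof(*opts))) {
                return false;
        }

        /* Override one default, e.g. a deeper I/O queue than SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH. */
        opts->max_queue_depth = 256;
        return true;
}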
    2528             : static int nvmf_rdma_destroy(struct spdk_nvmf_transport *transport,
    2529             :                              spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg);
    2530             : 
    2531             : static inline bool
    2532           0 : nvmf_rdma_is_rxe_device(struct spdk_nvmf_rdma_device *device)
    2533             : {
    2534           0 :         return device->attr.vendor_id == SPDK_RDMA_RXE_VENDOR_ID_OLD ||
    2535           0 :                device->attr.vendor_id == SPDK_RDMA_RXE_VENDOR_ID_NEW;
    2536             : }
    2537             : 
    2538             : static int nvmf_rdma_accept(void *ctx);
    2539             : static bool nvmf_rdma_retry_listen_port(struct spdk_nvmf_rdma_transport *rtransport);
    2540             : static void destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
    2541             :                               struct spdk_nvmf_rdma_device *device);
    2542             : 
    2543             : static int
    2544           0 : create_ib_device(struct spdk_nvmf_rdma_transport *rtransport, struct ibv_context *context,
    2545             :                  struct spdk_nvmf_rdma_device **new_device)
    2546             : {
    2547           0 :         struct spdk_nvmf_rdma_device    *device;
    2548           0 :         int                             flag = 0;
    2549           0 :         int                             rc = 0;
    2550             : 
    2551           0 :         device = calloc(1, sizeof(*device));
    2552           0 :         if (!device) {
    2553           0 :                 SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
    2554           0 :                 return -ENOMEM;
    2555             :         }
    2556           0 :         device->context = context;
    2557           0 :         rc = ibv_query_device(device->context, &device->attr);
    2558           0 :         if (rc < 0) {
    2559           0 :                 SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
    2560           0 :                 free(device);
    2561           0 :                 return rc;
    2562             :         }
    2563             : 
    2564             : #ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
    2565           0 :         if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) {
    2566           0 :                 SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE, "
    2567             :                              "but the device with vendor ID %u does not.\n", device->attr.vendor_id);
    2568           0 :         }
    2569             : 
    2570             :         /**
    2571             :          * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE.
    2572             :          * The Soft-RoCE RXE driver does not currently support send with invalidate,
    2573             :          * but incorrectly reports that it does. There are changes making their way
    2574             :          * through the kernel now that will enable this feature. When they are merged,
    2575             :          * we can conditionally enable this feature.
    2576             :          *
    2577             :          * TODO: enable this for versions of the kernel rxe driver that support it.
    2578             :          */
    2579           0 :         if (nvmf_rdma_is_rxe_device(device)) {
    2580           0 :                 device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS);
    2581           0 :         }
    2582             : #endif
    2583             : 
    2584             :         /* set up device context async ev fd as NON_BLOCKING */
    2585           0 :         flag = fcntl(device->context->async_fd, F_GETFL);
    2586           0 :         rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK);
    2587           0 :         if (rc < 0) {
    2588           0 :                 SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n");
    2589           0 :                 free(device);
    2590           0 :                 return rc;
    2591             :         }
    2592             : 
    2593           0 :         TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
    2594           0 :         SPDK_DEBUGLOG(rdma, "New device %p is added to RDMA transport\n", device);
    2595             : 
    2596           0 :         if (g_nvmf_hooks.get_ibv_pd) {
    2597           0 :                 device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context);
    2598           0 :         } else {
    2599           0 :                 device->pd = ibv_alloc_pd(device->context);
    2600             :         }
    2601             : 
    2602           0 :         if (!device->pd) {
    2603           0 :                 SPDK_ERRLOG("Unable to allocate protection domain.\n");
    2604           0 :                 destroy_ib_device(rtransport, device);
    2605           0 :                 return -ENOMEM;
    2606             :         }
    2607             : 
    2608           0 :         assert(device->map == NULL);
    2609             : 
    2610           0 :         device->map = spdk_rdma_utils_create_mem_map(device->pd, &g_nvmf_hooks, IBV_ACCESS_LOCAL_WRITE);
    2611           0 :         if (!device->map) {
    2612           0 :                 SPDK_ERRLOG("Unable to allocate memory map for device\n");
    2613           0 :                 destroy_ib_device(rtransport, device);
    2614           0 :                 return -ENOMEM;
    2615             :         }
    2616             : 
    2617           0 :         assert(device->map != NULL);
    2618           0 :         assert(device->pd != NULL);
    2619             : 
    2620           0 :         if (new_device) {
    2621           0 :                 *new_device = device;
    2622           0 :         }
    2623           0 :         SPDK_NOTICELOG("Created IB device %s (%p/%p) successfully.\n", ibv_get_device_name(context->device),
    2624             :                        device, context);
    2625             : 
    2626           0 :         return 0;
    2627           0 : }
    2628             : 
    2629             : static void
    2630           0 : free_poll_fds(struct spdk_nvmf_rdma_transport *rtransport)
    2631             : {
    2632           0 :         if (rtransport->poll_fds) {
    2633           0 :                 free(rtransport->poll_fds);
    2634           0 :                 rtransport->poll_fds = NULL;
    2635           0 :         }
    2636           0 :         rtransport->npoll_fds = 0;
    2637           0 : }
    2638             : 
    2639             : static int
    2640           0 : generate_poll_fds(struct spdk_nvmf_rdma_transport *rtransport)
    2641             : {
    2642             :         /* Set up poll descriptor array to monitor events from RDMA and IB
    2643             :          * in a single poll syscall
    2644             :          */
    2645           0 :         int device_count = 0;
    2646           0 :         int i = 0;
    2647           0 :         struct spdk_nvmf_rdma_device *device, *tmp;
    2648             : 
    2649           0 :         TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
    2650           0 :                 device_count++;
    2651           0 :         }
    2652             : 
    2653           0 :         rtransport->npoll_fds = device_count + 1;
    2654             : 
    2655           0 :         rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd));
    2656           0 :         if (rtransport->poll_fds == NULL) {
    2657           0 :                 SPDK_ERRLOG("poll_fds allocation failed\n");
    2658           0 :                 return -ENOMEM;
    2659             :         }
    2660             : 
    2661           0 :         rtransport->poll_fds[i].fd = rtransport->event_channel->fd;
    2662           0 :         rtransport->poll_fds[i++].events = POLLIN;
    2663             : 
    2664           0 :         TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
    2665           0 :                 rtransport->poll_fds[i].fd = device->context->async_fd;
    2666           0 :                 rtransport->poll_fds[i++].events = POLLIN;
    2667           0 :         }
    2668             : 
    2669           0 :         return 0;
    2670           0 : }
    2671             : 
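/* Illustrative sketch (not part of rdma.c): the array built by generate_poll_fds() has the RDMA
 * CM event channel fd at index 0 followed by one async_fd per IB device, so a single poll(2)
 * call can watch all of them. example_poll_once() is a hypothetical helper; the real consumer
 * of poll_fds (nvmf_rdma_accept) is not shown in this excerpt.
 */
#include <poll.h>

static int
example_poll_once(struct spdk_nvmf_rdma_transport *rtransport)
{
        /* A timeout of 0 keeps the caller non-blocking, which suits an SPDK poller. */
        return poll(rtransport->poll_fds, rtransport->npoll_fds, 0);
}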
    2672             : static struct spdk_nvmf_transport *
    2673           0 : nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
    2674             : {
    2675           0 :         int rc;
    2676           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    2677           0 :         struct spdk_nvmf_rdma_device    *device;
    2678           0 :         struct ibv_context              **contexts;
    2679           0 :         size_t                          data_wr_pool_size;
    2680           0 :         uint32_t                        i;
    2681           0 :         int                             flag;
    2682           0 :         uint32_t                        sge_count;
    2683           0 :         uint32_t                        min_shared_buffers;
    2684           0 :         uint32_t                        min_in_capsule_data_size;
    2685           0 :         int                             max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
    2686             : 
    2687           0 :         rtransport = calloc(1, sizeof(*rtransport));
    2688           0 :         if (!rtransport) {
    2689           0 :                 return NULL;
    2690             :         }
    2691             : 
    2692           0 :         TAILQ_INIT(&rtransport->devices);
    2693           0 :         TAILQ_INIT(&rtransport->ports);
    2694           0 :         TAILQ_INIT(&rtransport->poll_groups);
    2695           0 :         TAILQ_INIT(&rtransport->retry_ports);
    2696             : 
    2697           0 :         rtransport->transport.ops = &spdk_nvmf_transport_rdma;
    2698           0 :         rtransport->rdma_opts.num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE;
    2699           0 :         rtransport->rdma_opts.max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
    2700           0 :         rtransport->rdma_opts.no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
    2701           0 :         rtransport->rdma_opts.acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG;
    2702           0 :         rtransport->rdma_opts.no_wr_batching = SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING;
    2703           0 :         if (opts->transport_specific != NULL &&
    2704           0 :             spdk_json_decode_object_relaxed(opts->transport_specific, rdma_transport_opts_decoder,
    2705             :                                             SPDK_COUNTOF(rdma_transport_opts_decoder),
    2706           0 :                                             &rtransport->rdma_opts)) {
    2707           0 :                 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n");
    2708           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2709           0 :                 return NULL;
    2710             :         }
    2711             : 
    2712           0 :         SPDK_INFOLOG(rdma, "*** RDMA Transport Init ***\n"
    2713             :                      "  Transport opts:  max_ioq_depth=%d, max_io_size=%d,\n"
    2714             :                      "  max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
    2715             :                      "  in_capsule_data_size=%d, max_aq_depth=%d,\n"
    2716             :                      "  num_shared_buffers=%d, num_cqe=%d, max_srq_depth=%d, no_srq=%d,"
    2717             :                      "  acceptor_backlog=%d, no_wr_batching=%d abort_timeout_sec=%d\n",
    2718             :                      opts->max_queue_depth,
    2719             :                      opts->max_io_size,
    2720             :                      opts->max_qpairs_per_ctrlr - 1,
    2721             :                      opts->io_unit_size,
    2722             :                      opts->in_capsule_data_size,
    2723             :                      opts->max_aq_depth,
    2724             :                      opts->num_shared_buffers,
    2725             :                      rtransport->rdma_opts.num_cqe,
    2726             :                      rtransport->rdma_opts.max_srq_depth,
    2727             :                      rtransport->rdma_opts.no_srq,
    2728             :                      rtransport->rdma_opts.acceptor_backlog,
    2729             :                      rtransport->rdma_opts.no_wr_batching,
    2730             :                      opts->abort_timeout_sec);
    2731             : 
    2732             :         /* I/O unit size cannot be larger than max I/O size */
    2733           0 :         if (opts->io_unit_size > opts->max_io_size) {
    2734           0 :                 opts->io_unit_size = opts->max_io_size;
    2735           0 :         }
    2736             : 
    2737           0 :         if (rtransport->rdma_opts.acceptor_backlog <= 0) {
    2738           0 :                 SPDK_ERRLOG("The acceptor backlog cannot be less than 1, setting to the default value of (%d).\n",
    2739             :                             SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG);
    2740           0 :                 rtransport->rdma_opts.acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG;
    2741           0 :         }
    2742             : 
    2743           0 :         if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) {
    2744           0 :                 SPDK_ERRLOG("The number of shared data buffers (%d) is less than "
    2745             :                             "the minimum number required to guarantee that forward progress can be made (%d)\n",
    2746             :                             opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2));
    2747           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2748           0 :                 return NULL;
    2749             :         }
    2750             : 
    2751             :         /* If buf_cache_size == UINT32_MAX, we will dynamically pick a cache size later that we know will fit. */
    2752           0 :         if (opts->buf_cache_size < UINT32_MAX) {
    2753           0 :                 min_shared_buffers = spdk_env_get_core_count() * opts->buf_cache_size;
    2754           0 :                 if (min_shared_buffers > opts->num_shared_buffers) {
    2755           0 :                         SPDK_ERRLOG("There are not enough buffers to satisfy "
    2756             :                                     "per-poll group caches for each thread: (%" PRIu32 ") "
    2757             :                                     "supplied, (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers);
    2758           0 :                         SPDK_ERRLOG("Please specify a larger number of shared buffers\n");
    2759           0 :                         nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2760           0 :                         return NULL;
    2761             :                 }
    2762           0 :         }
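/* Editor's note (illustrative arithmetic, not part of rdma.c): with the default buf_cache_size
 * of UINT32_MAX this branch is skipped entirely and a cache size is picked later. If an
 * application sets buf_cache_size = 32 on an 8-core reactor set, the check above requires at
 * least 32 * 8 = 256 shared buffers, comfortably below the default of 4095.
 */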
    2763             : 
    2764           0 :         sge_count = opts->max_io_size / opts->io_unit_size;
    2765           0 :         if (sge_count > NVMF_DEFAULT_TX_SGE) {
    2766           0 :                 SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size);
    2767           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2768           0 :                 return NULL;
    2769             :         }
    2770             : 
    2771           0 :         min_in_capsule_data_size = sizeof(struct spdk_nvme_sgl_descriptor) * SPDK_NVMF_MAX_SGL_ENTRIES;
    2772           0 :         if (opts->in_capsule_data_size < min_in_capsule_data_size) {
    2773           0 :                 SPDK_WARNLOG("In capsule data size is set to %u, the minimum size required to support msdbd=16\n",
    2774             :                              min_in_capsule_data_size);
    2775           0 :                 opts->in_capsule_data_size = min_in_capsule_data_size;
    2776           0 :         }
    2777             : 
    2778           0 :         rtransport->event_channel = rdma_create_event_channel();
    2779           0 :         if (rtransport->event_channel == NULL) {
    2780           0 :                 SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
    2781           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2782           0 :                 return NULL;
    2783             :         }
    2784             : 
    2785           0 :         flag = fcntl(rtransport->event_channel->fd, F_GETFL);
    2786           0 :         if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
    2787           0 :                 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
    2788             :                             rtransport->event_channel->fd, spdk_strerror(errno));
    2789           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2790           0 :                 return NULL;
    2791             :         }
    2792             : 
    2793           0 :         data_wr_pool_size = opts->data_wr_pool_size;
    2794           0 :         if (data_wr_pool_size < SPDK_NVMF_MAX_SGL_ENTRIES * 2 * spdk_env_get_core_count()) {
    2795           0 :                 data_wr_pool_size = SPDK_NVMF_MAX_SGL_ENTRIES * 2 * spdk_env_get_core_count();
    2796           0 :                 SPDK_NOTICELOG("data_wr_pool_size is changed to %zu to guarantee enough cache for handling "
    2797             :                                "at least one IO in each core\n", data_wr_pool_size);
    2798           0 :         }
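/* Editor's note (illustrative arithmetic, not part of rdma.c): assuming SPDK_NVMF_MAX_SGL_ENTRIES
 * is 16, the floor enforced above is 16 * 2 = 32 data WR structures per core, e.g. 128 on a
 * 4-core target. The default pool size of 4095 is only raised when the core count exceeds
 * roughly 127.
 */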
    2799           0 :         rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", data_wr_pool_size,
    2800             :                                    sizeof(struct spdk_nvmf_rdma_request_data), SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
    2801             :                                    SPDK_ENV_NUMA_ID_ANY);
    2802           0 :         if (!rtransport->data_wr_pool) {
    2803           0 :                 if (spdk_mempool_lookup("spdk_nvmf_rdma_wr_data") != NULL) {
    2804           0 :                         SPDK_ERRLOG("Unable to allocate work request pool for poll group: already exists\n");
    2805           0 :                         SPDK_ERRLOG("Probably running in multiprocess environment, which is "
    2806             :                                     "unsupported by the nvmf library\n");
    2807           0 :                 } else {
    2808           0 :                         SPDK_ERRLOG("Unable to allocate work request pool for poll group\n");
    2809             :                 }
    2810           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2811           0 :                 return NULL;
    2812             :         }
    2813             : 
    2814           0 :         contexts = rdma_get_devices(NULL);
    2815           0 :         if (contexts == NULL) {
    2816           0 :                 SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
    2817           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2818           0 :                 return NULL;
    2819             :         }
    2820             : 
    2821           0 :         i = 0;
    2822           0 :         rc = 0;
    2823           0 :         while (contexts[i] != NULL) {
    2824           0 :                 rc = create_ib_device(rtransport, contexts[i], &device);
    2825           0 :                 if (rc < 0) {
    2826           0 :                         break;
    2827             :                 }
    2828           0 :                 i++;
    2829           0 :                 max_device_sge = spdk_min(max_device_sge, device->attr.max_sge);
    2830           0 :                 device->is_ready = true;
    2831             :         }
    2832           0 :         rdma_free_devices(contexts);
    2833             : 
    2834           0 :         if (opts->io_unit_size * max_device_sge < opts->max_io_size) {
    2835             :                 /* divide and round up. */
    2836           0 :                 opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge;
    2837             : 
    2838             :                 /* round up to the nearest 4k. */
    2839           0 :                 opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK;
    2840             : 
    2841           0 :                 opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE);
    2842           0 :                 SPDK_NOTICELOG("Adjusting the I/O unit size so that max_io_size fits within the device's SGE limit. New I/O unit size %u\n",
    2843             :                                opts->io_unit_size);
    2844           0 :         }
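/* Editor's note (worked example, not part of rdma.c): if max_io_size is 131072 but the weakest
 * device only supports max_device_sge = 8, the round-up division above yields 131072 / 8 = 16384,
 * which is already a multiple of the 4 KiB data buffer alignment (assuming
 * NVMF_DATA_BUFFER_ALIGNMENT is 4096) and above the 8192-byte minimum, so io_unit_size becomes
 * 16384.
 */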
    2845             : 
    2846           0 :         if (rc < 0) {
    2847           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2848           0 :                 return NULL;
    2849             :         }
    2850             : 
    2851           0 :         rc = generate_poll_fds(rtransport);
    2852           0 :         if (rc < 0) {
    2853           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2854           0 :                 return NULL;
    2855             :         }
    2856             : 
    2857           0 :         rtransport->accept_poller = SPDK_POLLER_REGISTER(nvmf_rdma_accept, &rtransport->transport,
    2858             :                                     opts->acceptor_poll_rate);
    2859           0 :         if (!rtransport->accept_poller) {
    2860           0 :                 nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
    2861           0 :                 return NULL;
    2862             :         }
    2863             : 
    2864           0 :         return &rtransport->transport;
    2865           0 : }
    2866             : 
    2867             : static void
    2868           0 : destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
    2869             :                   struct spdk_nvmf_rdma_device *device)
    2870             : {
    2871           0 :         TAILQ_REMOVE(&rtransport->devices, device, link);
    2872           0 :         spdk_rdma_utils_free_mem_map(&device->map);
    2873           0 :         if (device->pd) {
    2874           0 :                 if (!g_nvmf_hooks.get_ibv_pd) {
    2875           0 :                         ibv_dealloc_pd(device->pd);
    2876           0 :                 }
    2877           0 :         }
    2878           0 :         SPDK_DEBUGLOG(rdma, "IB device [%p] is destroyed.\n", device);
    2879           0 :         free(device);
    2880           0 : }
    2881             : 
    2882             : static void
    2883           0 : nvmf_rdma_dump_opts(struct spdk_nvmf_transport *transport, struct spdk_json_write_ctx *w)
    2884             : {
    2885           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    2886           0 :         assert(w != NULL);
    2887             : 
    2888           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    2889           0 :         spdk_json_write_named_uint32(w, "max_srq_depth", rtransport->rdma_opts.max_srq_depth);
    2890           0 :         spdk_json_write_named_bool(w, "no_srq", rtransport->rdma_opts.no_srq);
    2891           0 :         if (rtransport->rdma_opts.no_srq == true) {
    2892           0 :                 spdk_json_write_named_int32(w, "num_cqe", rtransport->rdma_opts.num_cqe);
    2893           0 :         }
    2894           0 :         spdk_json_write_named_int32(w, "acceptor_backlog", rtransport->rdma_opts.acceptor_backlog);
    2895           0 :         spdk_json_write_named_bool(w, "no_wr_batching", rtransport->rdma_opts.no_wr_batching);
    2896           0 : }
    2897             : 
    2898             : static int
    2899           0 : nvmf_rdma_destroy(struct spdk_nvmf_transport *transport,
    2900             :                   spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
    2901             : {
    2902           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    2903           0 :         struct spdk_nvmf_rdma_port      *port, *port_tmp;
    2904           0 :         struct spdk_nvmf_rdma_device    *device, *device_tmp;
    2905             : 
    2906           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    2907             : 
    2908           0 :         TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, port_tmp) {
    2909           0 :                 TAILQ_REMOVE(&rtransport->retry_ports, port, link);
    2910           0 :                 free(port);
    2911           0 :         }
    2912             : 
    2913           0 :         TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
    2914           0 :                 TAILQ_REMOVE(&rtransport->ports, port, link);
    2915           0 :                 rdma_destroy_id(port->id);
    2916           0 :                 free(port);
    2917           0 :         }
    2918             : 
    2919           0 :         free_poll_fds(rtransport);
    2920             : 
    2921           0 :         if (rtransport->event_channel != NULL) {
    2922           0 :                 rdma_destroy_event_channel(rtransport->event_channel);
    2923           0 :         }
    2924             : 
    2925           0 :         TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
    2926           0 :                 destroy_ib_device(rtransport, device);
    2927           0 :         }
    2928             : 
    2929           0 :         if (rtransport->data_wr_pool != NULL) {
    2930           0 :                 if (spdk_mempool_count(rtransport->data_wr_pool) != transport->opts.data_wr_pool_size) {
    2931           0 :                         SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n",
    2932             :                                     spdk_mempool_count(rtransport->data_wr_pool),
    2933             :                                     transport->opts.data_wr_pool_size);
    2934           0 :                 }
    2935           0 :         }
    2936             : 
    2937           0 :         spdk_mempool_free(rtransport->data_wr_pool);
    2938             : 
    2939           0 :         spdk_poller_unregister(&rtransport->accept_poller);
    2940           0 :         free(rtransport);
    2941             : 
    2942           0 :         if (cb_fn) {
    2943           0 :                 cb_fn(cb_arg);
    2944           0 :         }
    2945           0 :         return 0;
    2946           0 : }
    2947             : 
    2948             : static int nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id,
    2949             :                                      struct spdk_nvme_transport_id *trid,
    2950             :                                      bool peer);
    2951             : 
    2952             : static bool nvmf_rdma_rescan_devices(struct spdk_nvmf_rdma_transport *rtransport);
    2953             : 
    2954             : static int
    2955           0 : nvmf_rdma_listen(struct spdk_nvmf_transport *transport, const struct spdk_nvme_transport_id *trid,
    2956             :                  struct spdk_nvmf_listen_opts *listen_opts)
    2957             : {
    2958           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    2959           0 :         struct spdk_nvmf_rdma_device    *device;
    2960           0 :         struct spdk_nvmf_rdma_port      *port, *tmp_port;
    2961           0 :         struct addrinfo                 *res;
    2962           0 :         struct addrinfo                 hints;
    2963           0 :         int                             family;
    2964           0 :         int                             rc;
    2965           0 :         long int                        port_val;
    2966           0 :         bool                            is_retry = false;
    2967             : 
    2968           0 :         if (!strlen(trid->trsvcid)) {
    2969           0 :                 SPDK_ERRLOG("Service id is required\n");
    2970           0 :                 return -EINVAL;
    2971             :         }
    2972             : 
    2973           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    2974           0 :         assert(rtransport->event_channel != NULL);
    2975             : 
    2976           0 :         port = calloc(1, sizeof(*port));
    2977           0 :         if (!port) {
    2978           0 :                 SPDK_ERRLOG("Port allocation failed\n");
    2979           0 :                 return -ENOMEM;
    2980             :         }
    2981             : 
    2982           0 :         port->trid = trid;
    2983             : 
    2984           0 :         switch (trid->adrfam) {
    2985             :         case SPDK_NVMF_ADRFAM_IPV4:
    2986           0 :                 family = AF_INET;
    2987           0 :                 break;
    2988             :         case SPDK_NVMF_ADRFAM_IPV6:
    2989           0 :                 family = AF_INET6;
    2990           0 :                 break;
    2991             :         default:
    2992           0 :                 SPDK_ERRLOG("Unhandled ADRFAM %d\n", trid->adrfam);
    2993           0 :                 free(port);
    2994           0 :                 return -EINVAL;
    2995             :         }
    2996             : 
    2997           0 :         memset(&hints, 0, sizeof(hints));
    2998           0 :         hints.ai_family = family;
    2999           0 :         hints.ai_flags = AI_NUMERICSERV;
    3000           0 :         hints.ai_socktype = SOCK_STREAM;
    3001           0 :         hints.ai_protocol = 0;
    3002             : 
    3003             :         /* Range check the trsvcid. Fail in 3 cases:
    3004             :          * < 0: means that spdk_strtol hit an error
    3005             :          * 0: this results in ephemeral port which we don't want
    3006             :          * > 65535: port too high
    3007             :          */
    3008           0 :         port_val = spdk_strtol(trid->trsvcid, 10);
    3009           0 :         if (port_val <= 0 || port_val > 65535) {
    3010           0 :                 SPDK_ERRLOG("invalid trsvcid %s\n", trid->trsvcid);
    3011           0 :                 free(port);
    3012           0 :                 return -EINVAL;
    3013             :         }
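/* Editor's note (illustrative, not part of rdma.c): with this check, "4420" parses to 4420 and
 * is accepted, "0" and "70000" are rejected by the range test, and a non-numeric string such as
 * "nvme" makes spdk_strtol() return a negative errno and is likewise rejected.
 */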
    3014             : 
    3015           0 :         rc = getaddrinfo(trid->traddr, trid->trsvcid, &hints, &res);
    3016           0 :         if (rc) {
    3017           0 :                 SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc);
    3018           0 :                 free(port);
    3019           0 :                 return -(abs(rc));
    3020             :         }
    3021             : 
    3022           0 :         rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP);
    3023           0 :         if (rc < 0) {
    3024           0 :                 SPDK_ERRLOG("rdma_create_id() failed\n");
    3025           0 :                 freeaddrinfo(res);
    3026           0 :                 free(port);
    3027           0 :                 return rc;
    3028             :         }
    3029             : 
    3030           0 :         rc = rdma_bind_addr(port->id, res->ai_addr);
    3031           0 :         freeaddrinfo(res);
    3032             : 
    3033           0 :         if (rc < 0) {
    3034           0 :                 TAILQ_FOREACH(tmp_port, &rtransport->retry_ports, link) {
    3035           0 :                         if (spdk_nvme_transport_id_compare(tmp_port->trid, trid) == 0) {
    3036           0 :                                 is_retry = true;
    3037           0 :                                 break;
    3038             :                         }
    3039           0 :                 }
    3040           0 :                 if (!is_retry) {
    3041           0 :                         SPDK_ERRLOG("rdma_bind_addr() failed\n");
    3042           0 :                 }
    3043           0 :                 rdma_destroy_id(port->id);
    3044           0 :                 free(port);
    3045           0 :                 return rc;
    3046             :         }
    3047             : 
    3048           0 :         if (!port->id->verbs) {
    3049           0 :                 SPDK_ERRLOG("ibv_context is null\n");
    3050           0 :                 rdma_destroy_id(port->id);
    3051           0 :                 free(port);
    3052           0 :                 return -1;
    3053             :         }
    3054             : 
    3055           0 :         rc = rdma_listen(port->id, rtransport->rdma_opts.acceptor_backlog);
    3056           0 :         if (rc < 0) {
    3057           0 :                 SPDK_ERRLOG("rdma_listen() failed\n");
    3058           0 :                 rdma_destroy_id(port->id);
    3059           0 :                 free(port);
    3060           0 :                 return rc;
    3061             :         }
    3062             : 
    3063           0 :         TAILQ_FOREACH(device, &rtransport->devices, link) {
    3064           0 :                 if (device->context == port->id->verbs && device->is_ready) {
    3065           0 :                         port->device = device;
    3066           0 :                         break;
    3067             :                 }
    3068           0 :         }
    3069           0 :         if (!port->device) {
    3070           0 :                 SPDK_ERRLOG("Listener bound to verbs %p, but unable to find a corresponding device.\n",
    3071             :                             port->id->verbs);
    3072           0 :                 rdma_destroy_id(port->id);
    3073           0 :                 free(port);
    3074           0 :                 nvmf_rdma_rescan_devices(rtransport);
    3075           0 :                 return -EINVAL;
    3076             :         }
    3077             : 
    3078           0 :         SPDK_NOTICELOG("*** NVMe/RDMA Target Listening on %s port %s ***\n",
    3079             :                        trid->traddr, trid->trsvcid);
    3080             : 
    3081           0 :         TAILQ_INSERT_TAIL(&rtransport->ports, port, link);
    3082           0 :         return 0;
    3083           0 : }
    3084             : 
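/* Illustrative sketch (not part of rdma.c): a transport ID that nvmf_rdma_listen() would accept.
 * example_fill_trid() is a hypothetical helper and the address/service values are placeholders;
 * spdk_nvme_trid_populate_transport() comes from spdk/nvme.h, assumed available here.
 */
#include "spdk/nvme.h"

static void
example_fill_trid(struct spdk_nvme_transport_id *trid)
{
        memset(trid, 0, sizeof(*trid));
        spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA);
        trid->adrfam = SPDK_NVMF_ADRFAM_IPV4;
        snprintf(trid->traddr, sizeof(trid->traddr), "192.168.1.10");
        snprintf(trid->trsvcid, sizeof(trid->trsvcid), "4420");
}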
    3085             : static void
    3086           0 : nvmf_rdma_stop_listen_ex(struct spdk_nvmf_transport *transport,
    3087             :                          const struct spdk_nvme_transport_id *trid, bool need_retry)
    3088             : {
    3089           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    3090           0 :         struct spdk_nvmf_rdma_port      *port, *tmp;
    3091             : 
    3092           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    3093             : 
    3094           0 :         if (!need_retry) {
    3095           0 :                 TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, tmp) {
    3096           0 :                         if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) {
    3097           0 :                                 TAILQ_REMOVE(&rtransport->retry_ports, port, link);
    3098           0 :                                 free(port);
    3099           0 :                         }
    3100           0 :                 }
    3101           0 :         }
    3102             : 
    3103           0 :         TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) {
    3104           0 :                 if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) {
    3105           0 :                         SPDK_DEBUGLOG(rdma, "Port %s:%s removed. need retry: %d\n",
    3106             :                                       port->trid->traddr, port->trid->trsvcid, need_retry);
    3107           0 :                         TAILQ_REMOVE(&rtransport->ports, port, link);
    3108           0 :                         rdma_destroy_id(port->id);
    3109           0 :                         port->id = NULL;
    3110           0 :                         port->device = NULL;
    3111           0 :                         if (need_retry) {
    3112           0 :                                 TAILQ_INSERT_TAIL(&rtransport->retry_ports, port, link);
    3113           0 :                         } else {
    3114           0 :                                 free(port);
    3115             :                         }
    3116           0 :                         break;
    3117             :                 }
    3118           0 :         }
    3119           0 : }
    3120             : 
    3121             : static void
    3122           0 : nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport,
    3123             :                       const struct spdk_nvme_transport_id *trid)
    3124             : {
    3125           0 :         nvmf_rdma_stop_listen_ex(transport, trid, false);
    3126           0 : }
    3127             : 
    3128             : static void _nvmf_rdma_register_poller_in_group(void *c);
    3129             : static void _nvmf_rdma_remove_poller_in_group(void *c);
    3130             : 
    3131             : static bool
    3132           0 : nvmf_rdma_all_pollers_management_done(void *c)
    3133             : {
    3134           0 :         struct poller_manage_ctx        *ctx = c;
    3135           0 :         int                             counter;
    3136             : 
    3137           0 :         counter = __atomic_sub_fetch(ctx->inflight_op_counter, 1, __ATOMIC_SEQ_CST);
    3138           0 :         SPDK_DEBUGLOG(rdma, "nvmf_rdma_all_pollers_management_done called. counter: %d, poller: %p\n",
    3139             :                       counter, ctx->rpoller);
    3140             : 
    3141           0 :         if (counter == 0) {
    3142           0 :                 free((void *)ctx->inflight_op_counter);
    3143           0 :         }
    3144           0 :         free(ctx);
    3145             : 
    3146           0 :         return counter == 0;
    3147           0 : }
    3148             : 
    3149             : static int
    3150           0 : nvmf_rdma_manage_poller(struct spdk_nvmf_rdma_transport *rtransport,
    3151             :                         struct spdk_nvmf_rdma_device *device, bool *has_inflight, bool is_add)
    3152             : {
    3153           0 :         struct spdk_nvmf_rdma_poll_group        *rgroup;
    3154           0 :         struct spdk_nvmf_rdma_poller            *rpoller;
    3155           0 :         struct spdk_nvmf_poll_group             *poll_group;
    3156           0 :         struct poller_manage_ctx                *ctx;
    3157           0 :         bool                                    found;
    3158           0 :         int                                     *inflight_counter;
    3159           0 :         spdk_msg_fn                             do_fn;
    3160             : 
    3161           0 :         *has_inflight = false;
    3162           0 :         do_fn = is_add ? _nvmf_rdma_register_poller_in_group : _nvmf_rdma_remove_poller_in_group;
    3163           0 :         inflight_counter = calloc(1, sizeof(int));
    3164           0 :         if (!inflight_counter) {
    3165           0 :                 SPDK_ERRLOG("Failed to allocate inflight counter while managing pollers\n");
    3166           0 :                 return -ENOMEM;
    3167             :         }
    3168             : 
    3169           0 :         TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
    3170           0 :                 (*inflight_counter)++;
    3171           0 :         }
    3172             : 
    3173           0 :         TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
    3174           0 :                 found = false;
    3175           0 :                 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
    3176           0 :                         if (rpoller->device == device) {
    3177           0 :                                 found = true;
    3178           0 :                                 break;
    3179             :                         }
    3180           0 :                 }
    3181           0 :                 if (found == is_add) {
    3182           0 :                         __atomic_fetch_sub(inflight_counter, 1, __ATOMIC_SEQ_CST);
    3183           0 :                         continue;
    3184             :                 }
    3185             : 
    3186           0 :                 ctx = calloc(1, sizeof(struct poller_manage_ctx));
    3187           0 :                 if (!ctx) {
    3188           0 :                         SPDK_ERRLOG("Failed to allocate poller_manage_ctx while managing pollers\n");
    3189           0 :                         if (!*has_inflight) {
    3190           0 :                                 free(inflight_counter);
    3191           0 :                         }
    3192           0 :                         return -ENOMEM;
    3193             :                 }
    3194             : 
    3195           0 :                 ctx->rtransport = rtransport;
    3196           0 :                 ctx->rgroup = rgroup;
    3197           0 :                 ctx->rpoller = rpoller;
    3198           0 :                 ctx->device = device;
    3199           0 :                 ctx->thread = spdk_get_thread();
    3200           0 :                 ctx->inflight_op_counter = inflight_counter;
    3201           0 :                 *has_inflight = true;
    3202             : 
    3203           0 :                 poll_group = rgroup->group.group;
    3204           0 :                 if (poll_group->thread != spdk_get_thread()) {
    3205           0 :                         spdk_thread_send_msg(poll_group->thread, do_fn, ctx);
    3206           0 :                 } else {
    3207           0 :                         do_fn(ctx);
    3208             :                 }
    3209           0 :         }
    3210             : 
    3211           0 :         if (!*has_inflight) {
    3212           0 :                 free(inflight_counter);
    3213           0 :         }
    3214             : 
    3215           0 :         return 0;
    3216           0 : }
    3217             : 
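/* Editor's note (illustrative sketch, not part of rdma.c): nvmf_rdma_manage_poller() and
 * nvmf_rdma_all_pollers_management_done() above implement a fan-out/fan-in pattern: one
 * heap-allocated counter is set to the number of poll groups, a message is sent to each group's
 * thread, and whichever completion drives the counter to zero frees it. A stripped-down version
 * of that pattern, with hypothetical names, looks like this:
 */
struct example_fanin_ctx {
        int *remaining;         /* counter shared by all outstanding messages */
};

static void
example_fanin_done(void *arg)
{
        struct example_fanin_ctx *ctx = arg;

        /* The last completion owns and frees the shared counter. */
        if (__atomic_sub_fetch(ctx->remaining, 1, __ATOMIC_SEQ_CST) == 0) {
                free(ctx->remaining);
        }
        free(ctx);
}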
    3218             : static void nvmf_rdma_handle_device_removal(struct spdk_nvmf_rdma_transport *rtransport,
    3219             :                 struct spdk_nvmf_rdma_device *device);
    3220             : 
    3221             : static struct spdk_nvmf_rdma_device *
    3222           0 : nvmf_rdma_find_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
    3223             :                          struct ibv_context *context)
    3224             : {
    3225           0 :         struct spdk_nvmf_rdma_device    *device, *tmp_device;
    3226             : 
    3227           0 :         TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp_device) {
    3228           0 :                 if (device->need_destroy) {
    3229           0 :                         continue;
    3230             :                 }
    3231             : 
    3232           0 :                 if (strcmp(device->context->device->dev_name, context->device->dev_name) == 0) {
    3233           0 :                         return device;
    3234             :                 }
    3235           0 :         }
    3236             : 
    3237           0 :         return NULL;
    3238           0 : }
    3239             : 
    3240             : static bool
    3241           0 : nvmf_rdma_check_devices_context(struct spdk_nvmf_rdma_transport *rtransport,
    3242             :                                 struct ibv_context *context)
    3243             : {
    3244           0 :         struct spdk_nvmf_rdma_device    *old_device, *new_device;
    3245           0 :         int                             rc = 0;
    3246           0 :         bool                            has_inflight;
    3247             : 
    3248           0 :         old_device = nvmf_rdma_find_ib_device(rtransport, context);
    3249             : 
    3250           0 :         if (old_device) {
    3251           0 :                 if (old_device->context != context && !old_device->need_destroy && old_device->is_ready) {
    3252             :                         /* The old context may not have been cleaned up by the time we rescan. Exactly
    3253             :                          * one context is valid per device, so this stale context must be invalid; remove it. */
    3254           0 :                         SPDK_WARNLOG("Device %p has an invalid context %p\n", old_device, old_device->context);
    3255           0 :                         old_device->need_destroy = true;
    3256           0 :                         nvmf_rdma_handle_device_removal(rtransport, old_device);
    3257           0 :                 }
    3258           0 :                 return false;
    3259             :         }
    3260             : 
    3261           0 :         rc = create_ib_device(rtransport, context, &new_device);
    3262             :         /* TODO: update transport opts. */
    3263           0 :         if (rc < 0) {
    3264           0 :                 SPDK_ERRLOG("Failed to create ib device for context: %s(%p)\n",
    3265             :                             ibv_get_device_name(context->device), context);
    3266           0 :                 return false;
    3267             :         }
    3268             : 
    3269           0 :         rc = nvmf_rdma_manage_poller(rtransport, new_device, &has_inflight, true);
    3270           0 :         if (rc < 0) {
    3271           0 :                 SPDK_ERRLOG("Failed to add poller for device context: %s(%p)\n",
    3272             :                             ibv_get_device_name(context->device), context);
    3273           0 :                 return false;
    3274             :         }
    3275             : 
    3276           0 :         if (has_inflight) {
    3277           0 :                 new_device->is_ready = true;
    3278           0 :         }
    3279             : 
    3280           0 :         return true;
    3281           0 : }
    3282             : 
    3283             : static bool
    3284           0 : nvmf_rdma_rescan_devices(struct spdk_nvmf_rdma_transport *rtransport)
    3285             : {
    3286           0 :         struct spdk_nvmf_rdma_device    *device;
    3287           0 :         struct ibv_device               **ibv_device_list = NULL;
    3288           0 :         struct ibv_context              **contexts = NULL;
    3289           0 :         int                             i = 0;
    3290           0 :         int                             num_dev = 0;
    3291           0 :         bool                            new_create = false, has_new_device = false;
    3292           0 :         struct ibv_context              *tmp_verbs = NULL;
    3293             : 
    3294             :         /* Do not rescan while any device is being destroyed, or its context may be freed
    3295             :          * while regenerating the poll fds.
    3296             :          */
    3297           0 :         TAILQ_FOREACH(device, &rtransport->devices, link) {
    3298           0 :                 if (device->need_destroy) {
    3299           0 :                         return false;
    3300             :                 }
    3301           0 :         }
    3302             : 
    3303           0 :         ibv_device_list = ibv_get_device_list(&num_dev);
    3304             : 
    3305             :         /* There is a bug in librdmacm: if verbs initialization fails inside rdma_get_devices(),
    3306             :          * the context is marked as dead and is never initialized again. So make sure each verbs
    3307             :          * context can be opened before calling rdma_get_devices(). */
    3308           0 :         if (num_dev >= 0) {
    3309           0 :                 for (i = 0; i < num_dev; i++) {
    3310           0 :                         tmp_verbs = ibv_open_device(ibv_device_list[i]);
    3311           0 :                         if (!tmp_verbs) {
    3312           0 :                                 SPDK_WARNLOG("Failed to init ibv device %p, errno %d. Skipping rescan.\n", ibv_device_list[i], errno);
    3313           0 :                                 break;
    3314             :                         }
    3315           0 :                         if (nvmf_rdma_find_ib_device(rtransport, tmp_verbs) == NULL) {
    3316           0 :                                 SPDK_DEBUGLOG(rdma, "Found newly initialized ibv device %p (%s).\n", ibv_device_list[i],
    3317             :                                               tmp_verbs->device->dev_name);
    3318           0 :                                 has_new_device = true;
    3319           0 :                         }
    3320           0 :                         ibv_close_device(tmp_verbs);
    3321           0 :                 }
    3322           0 :                 ibv_free_device_list(ibv_device_list);
    3323           0 :                 if (!tmp_verbs || !has_new_device) {
    3324           0 :                         return false;
    3325             :                 }
    3326           0 :         }
    3327             : 
    3328           0 :         contexts = rdma_get_devices(NULL);
    3329             : 
    3330           0 :         for (i = 0; contexts && contexts[i] != NULL; i++) {
    3331           0 :                 new_create |= nvmf_rdma_check_devices_context(rtransport, contexts[i]);
    3332           0 :         }
    3333             : 
    3334           0 :         if (new_create) {
    3335           0 :                 free_poll_fds(rtransport);
    3336           0 :                 generate_poll_fds(rtransport);
    3337           0 :         }
    3338             : 
    3339           0 :         if (contexts) {
    3340           0 :                 rdma_free_devices(contexts);
    3341           0 :         }
    3342             : 
    3343           0 :         return new_create;
    3344           0 : }
    3345             : 
    3346             : static bool
    3347           0 : nvmf_rdma_retry_listen_port(struct spdk_nvmf_rdma_transport *rtransport)
    3348             : {
    3349           0 :         struct spdk_nvmf_rdma_port      *port, *tmp_port;
    3350           0 :         int                             rc = 0;
    3351           0 :         bool                            new_create = false;
    3352             : 
    3353           0 :         if (TAILQ_EMPTY(&rtransport->retry_ports)) {
    3354           0 :                 return false;
    3355             :         }
    3356             : 
    3357           0 :         new_create = nvmf_rdma_rescan_devices(rtransport);
    3358             : 
    3359           0 :         TAILQ_FOREACH_SAFE(port, &rtransport->retry_ports, link, tmp_port) {
    3360           0 :                 rc = nvmf_rdma_listen(&rtransport->transport, port->trid, NULL);
    3361             : 
    3362           0 :                 TAILQ_REMOVE(&rtransport->retry_ports, port, link);
    3363           0 :                 if (rc) {
    3364           0 :                         if (new_create) {
    3365           0 :                                 SPDK_ERRLOG("Found new IB device but port %s:%s still failed to listen (%d).\n",
    3366             :                                             port->trid->traddr, port->trid->trsvcid, rc);
    3367           0 :                         }
    3368           0 :                         TAILQ_INSERT_TAIL(&rtransport->retry_ports, port, link);
    3369           0 :                         break;
    3370             :                 } else {
    3371           0 :                         SPDK_NOTICELOG("Port %s:%s came back\n", port->trid->traddr, port->trid->trsvcid);
    3372           0 :                         free(port);
    3373             :                 }
    3374           0 :         }
    3375             : 
    3376           0 :         return true;
    3377           0 : }
    3378             : 
    3379             : static void
    3380           0 : nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport,
    3381             :                                 struct spdk_nvmf_rdma_qpair *rqpair, bool drain)
    3382             : {
    3383           0 :         struct spdk_nvmf_request *req, *tmp;
    3384           0 :         struct spdk_nvmf_rdma_request   *rdma_req, *req_tmp;
    3385           0 :         struct spdk_nvmf_rdma_resources *resources;
    3386             : 
    3387             :         /* First, process requests that are waiting for a response to be sent. */
    3388           0 :         STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_send_queue, state_link, req_tmp) {
    3389           0 :                 if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
    3390           0 :                         break;
    3391             :                 }
    3392           0 :         }
    3393             : 
    3394             :         /* We process I/O in the data transfer pending queue at the highest priority. */
    3395           0 :         STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) {
    3396           0 :                 if (rdma_req->state != RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) {
    3397             :                         /* Requests in this queue might be in state RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
    3398             :                          * they are transmitting data over the network, but we keep them in the list to guarantee
    3399             :                          * fair processing. */
    3400           0 :                         continue;
    3401             :                 }
    3402           0 :                 if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
    3403           0 :                         break;
    3404             :                 }
    3405           0 :         }
    3406             : 
    3407             :         /* Then process RDMA writes, since reads have stronger restrictions than writes. */
    3408           0 :         STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) {
    3409           0 :                 if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
    3410           0 :                         break;
    3411             :                 }
    3412           0 :         }
    3413             : 
    3414             :         /* Then we handle requests waiting on memory buffers. */
    3415           0 :         STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) {
    3416           0 :                 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
    3417           0 :                 if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
    3418           0 :                         break;
    3419             :                 }
    3420           0 :         }
    3421             : 
    3422           0 :         resources = rqpair->resources;
    3423           0 :         while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) {
    3424           0 :                 rdma_req = STAILQ_FIRST(&resources->free_queue);
    3425           0 :                 STAILQ_REMOVE_HEAD(&resources->free_queue, state_link);
    3426           0 :                 rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue);
    3427           0 :                 STAILQ_REMOVE_HEAD(&resources->incoming_queue, link);
    3428             : 
    3429           0 :                 if (rqpair->srq != NULL) {
    3430           0 :                         rdma_req->req.qpair = &rdma_req->recv->qpair->qpair;
    3431           0 :                         rdma_req->recv->qpair->qd++;
    3432           0 :                 } else {
    3433           0 :                         rqpair->qd++;
    3434             :                 }
    3435             : 
    3436           0 :                 rdma_req->receive_tsc = rdma_req->recv->receive_tsc;
    3437           0 :                 rdma_req->state = RDMA_REQUEST_STATE_NEW;
    3438           0 :                 if (nvmf_rdma_request_process(rtransport, rdma_req) == false) {
    3439           0 :                         break;
    3440             :                 }
    3441             :         }
    3442           0 :         if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) {
    3443           0 :                 rqpair->poller->stat.pending_free_request++;
    3444           0 :         }
    3445           0 : }
    3446             : 
    3447             : static void
    3448           0 : nvmf_rdma_poller_process_pending_buf_queue(struct spdk_nvmf_rdma_transport *rtransport,
    3449             :                 struct spdk_nvmf_rdma_poller *rpoller)
    3450             : {
    3451           0 :         struct spdk_nvmf_request *req, *tmp;
    3452           0 :         struct spdk_nvmf_rdma_request *rdma_req;
    3453             : 
    3454           0 :         STAILQ_FOREACH_SAFE(req, &rpoller->group->group.pending_buf_queue, buf_link, tmp) {
    3455           0 :                 rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
    3456           0 :                 if (nvmf_rdma_request_process(rtransport, rdma_req) == false) {
    3457           0 :                         break;
    3458             :                 }
    3459           0 :         }
    3460           0 : }
    3461             : 
    3462             : static inline bool
    3463           0 : nvmf_rdma_device_supports_last_wqe_reached(struct spdk_nvmf_rdma_device *device)
    3464             : {
    3465             :         /* The iWARP transport and the SoftRoCE driver don't support the LAST_WQE_REACHED ibv async event. */
    3466           0 :         return !nvmf_rdma_is_rxe_device(device) &&
    3467           0 :                device->context->device->transport_type != IBV_TRANSPORT_IWARP;
    3468             : }
    3469             : 
    3470             : static void
    3471           0 : nvmf_rdma_destroy_drained_qpair(struct spdk_nvmf_rdma_qpair *rqpair)
    3472             : {
    3473           0 :         struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
    3474             :                         struct spdk_nvmf_rdma_transport, transport);
    3475             : 
    3476           0 :         nvmf_rdma_qpair_process_pending(rtransport, rqpair, true);
    3477             : 
    3478             :         /* nvmf_rdma_close_qpair() has not been called yet. */
    3479           0 :         if (!rqpair->to_close) {
    3480           0 :                 return;
    3481             :         }
    3482             : 
    3483             :         /* The device is already destroyed, so we should force-destroy this qpair. */
    3484           0 :         if (rqpair->poller && rqpair->poller->need_destroy) {
    3485           0 :                 nvmf_rdma_qpair_destroy(rqpair);
    3486           0 :                 return;
    3487             :         }
    3488             : 
    3489             :         /* In the non-SRQ path, we will reach rqpair->max_queue_depth. In the SRQ path, we will get the last_wqe event. */
    3490           0 :         if (rqpair->current_send_depth != 0) {
    3491           0 :                 return;
    3492             :         }
    3493             : 
    3494           0 :         if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) {
    3495           0 :                 return;
    3496             :         }
    3497             : 
    3498             :         /* For devices that support LAST_WQE_REACHED with an SRQ, we need to
    3499             :          * wait to destroy the qpair until that event has been received.
    3500             :          */
    3501           0 :         if (rqpair->srq != NULL && rqpair->last_wqe_reached == false &&
    3502           0 :             nvmf_rdma_device_supports_last_wqe_reached(rqpair->device)) {
    3503           0 :                 return;
    3504             :         }
    3505             : 
    3506           0 :         assert(rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED ||
    3507             :                rqpair->qpair.state == SPDK_NVMF_QPAIR_ERROR);
    3508             : 
    3509           0 :         nvmf_rdma_qpair_destroy(rqpair);
    3510           0 : }
    3511             : 
    3512             : static int
    3513           0 : nvmf_rdma_disconnect(struct rdma_cm_event *evt, bool *event_acked)
    3514             : {
    3515           0 :         struct spdk_nvmf_qpair          *qpair;
    3516           0 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    3517             : 
    3518           0 :         if (evt->id == NULL) {
    3519           0 :                 SPDK_ERRLOG("disconnect request: missing cm_id\n");
    3520           0 :                 return -1;
    3521             :         }
    3522             : 
    3523           0 :         qpair = evt->id->context;
    3524           0 :         if (qpair == NULL) {
    3525           0 :                 SPDK_ERRLOG("disconnect request: no active connection\n");
    3526           0 :                 return -1;
    3527             :         }
    3528             : 
    3529           0 :         rdma_ack_cm_event(evt);
    3530           0 :         *event_acked = true;
    3531             : 
    3532           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    3533             : 
    3534           0 :         spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair);
    3535             : 
    3536           0 :         spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    3537             : 
    3538           0 :         return 0;
    3539           0 : }
    3540             : 
    3541             : #ifdef DEBUG
    3542             : static const char *CM_EVENT_STR[] = {
    3543             :         "RDMA_CM_EVENT_ADDR_RESOLVED",
    3544             :         "RDMA_CM_EVENT_ADDR_ERROR",
    3545             :         "RDMA_CM_EVENT_ROUTE_RESOLVED",
    3546             :         "RDMA_CM_EVENT_ROUTE_ERROR",
    3547             :         "RDMA_CM_EVENT_CONNECT_REQUEST",
    3548             :         "RDMA_CM_EVENT_CONNECT_RESPONSE",
    3549             :         "RDMA_CM_EVENT_CONNECT_ERROR",
    3550             :         "RDMA_CM_EVENT_UNREACHABLE",
    3551             :         "RDMA_CM_EVENT_REJECTED",
    3552             :         "RDMA_CM_EVENT_ESTABLISHED",
    3553             :         "RDMA_CM_EVENT_DISCONNECTED",
    3554             :         "RDMA_CM_EVENT_DEVICE_REMOVAL",
    3555             :         "RDMA_CM_EVENT_MULTICAST_JOIN",
    3556             :         "RDMA_CM_EVENT_MULTICAST_ERROR",
    3557             :         "RDMA_CM_EVENT_ADDR_CHANGE",
    3558             :         "RDMA_CM_EVENT_TIMEWAIT_EXIT"
    3559             : };
    3560             : #endif /* DEBUG */
    3561             : 
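/* Note (illustration, not part of rdma.c): for debug logging of CM events, librdmacm also provides
 * rdma_event_str(), which maps an enum rdma_cm_event_type to its name and could be used instead of
 * a local string table, e.g.:
 *
 *     SPDK_DEBUGLOG(rdma, "Acceptor Event: %s\n", rdma_event_str(event->event));
 */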
    3562             : static void
    3563           0 : nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport,
    3564             :                                     struct spdk_nvmf_rdma_port *port)
    3565             : {
    3566           0 :         struct spdk_nvmf_rdma_poll_group        *rgroup;
    3567           0 :         struct spdk_nvmf_rdma_poller            *rpoller;
    3568           0 :         struct spdk_nvmf_rdma_qpair             *rqpair;
    3569             : 
    3570           0 :         TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
    3571           0 :                 TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
    3572           0 :                         RB_FOREACH(rqpair, qpairs_tree, &rpoller->qpairs) {
    3573           0 :                                 if (rqpair->listen_id == port->id) {
    3574           0 :                                         spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    3575           0 :                                 }
    3576           0 :                         }
    3577           0 :                 }
    3578           0 :         }
    3579           0 : }
    3580             : 
    3581             : static bool
    3582           0 : nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport,
    3583             :                                       struct rdma_cm_event *event)
    3584             : {
    3585           0 :         const struct spdk_nvme_transport_id     *trid;
    3586           0 :         struct spdk_nvmf_rdma_port              *port;
    3587           0 :         struct spdk_nvmf_rdma_transport         *rtransport;
    3588           0 :         bool                                    event_acked = false;
    3589             : 
    3590           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    3591           0 :         TAILQ_FOREACH(port, &rtransport->ports, link) {
    3592           0 :                 if (port->id == event->id) {
    3593           0 :                         SPDK_ERRLOG("ADDR_CHANGE: IP %s:%s migrated\n", port->trid->traddr, port->trid->trsvcid);
    3594           0 :                         rdma_ack_cm_event(event);
    3595           0 :                         event_acked = true;
    3596           0 :                         trid = port->trid;
    3597           0 :                         break;
    3598             :                 }
    3599           0 :         }
    3600             : 
    3601           0 :         if (event_acked) {
    3602           0 :                 nvmf_rdma_disconnect_qpairs_on_port(rtransport, port);
    3603             : 
    3604           0 :                 nvmf_rdma_stop_listen(transport, trid);
    3605           0 :                 nvmf_rdma_listen(transport, trid, NULL);
    3606           0 :         }
    3607             : 
    3608           0 :         return event_acked;
    3609           0 : }
    3610             : 
    3611             : static void
    3612           0 : nvmf_rdma_handle_device_removal(struct spdk_nvmf_rdma_transport *rtransport,
    3613             :                                 struct spdk_nvmf_rdma_device *device)
    3614             : {
    3615           0 :         struct spdk_nvmf_rdma_port      *port, *port_tmp;
    3616           0 :         int                             rc;
    3617           0 :         bool                            has_inflight;
    3618             : 
    3619           0 :         rc = nvmf_rdma_manage_poller(rtransport, device, &has_inflight, false);
    3620           0 :         if (rc) {
    3621           0 :                 SPDK_ERRLOG("Failed to handle device removal, rc %d\n", rc);
    3622           0 :                 return;
    3623             :         }
    3624             : 
    3625           0 :         if (!has_inflight) {
    3626             :                 /* no pollers, destroy the device */
    3627           0 :                 device->ready_to_destroy = true;
    3628           0 :                 spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_remove_destroyed_device, rtransport);
    3629           0 :         }
    3630             : 
    3631           0 :         TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
    3632           0 :                 if (port->device == device) {
    3633           0 :                         SPDK_NOTICELOG("Port %s:%s on device %s is being removed.\n",
    3634             :                                        port->trid->traddr,
    3635             :                                        port->trid->trsvcid,
    3636             :                                        ibv_get_device_name(port->device->context->device));
    3637             : 
    3638             :                         /* Keep the NVMF listener and only destroy the structures of the
    3639             :                          * RDMA transport. When the device comes back, we can retry listening
    3640             :                          * and the application's workflow will not be interrupted.
    3641             :                          */
    3642           0 :                         nvmf_rdma_stop_listen_ex(&rtransport->transport, port->trid, true);
    3643           0 :                 }
    3644           0 :         }
    3645           0 : }
    3646             : 
    3647             : static void
    3648           0 : nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport,
    3649             :                                        struct rdma_cm_event *event)
    3650             : {
    3651           0 :         struct spdk_nvmf_rdma_port              *port, *tmp_port;
    3652           0 :         struct spdk_nvmf_rdma_transport         *rtransport;
    3653             : 
    3654           0 :         port = event->id->context;
    3655           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    3656             : 
    3657           0 :         rdma_ack_cm_event(event);
    3658             : 
    3659             :         /* If device removal happens while a controller qpair is disconnecting, it's possible that we receive
    3660             :          * a DEVICE_REMOVAL event on the qpair while id->qp is just NULL. So we should make sure that
    3661             :          * we are handling a port event here.
    3662             :          */
    3663           0 :         TAILQ_FOREACH(tmp_port, &rtransport->ports, link) {
    3664           0 :                 if (port == tmp_port && port->device && !port->device->need_destroy) {
    3665           0 :                         port->device->need_destroy = true;
    3666           0 :                         nvmf_rdma_handle_device_removal(rtransport, port->device);
    3667           0 :                 }
    3668           0 :         }
    3669           0 : }
    3670             : 
    3671             : static void
    3672           0 : nvmf_process_cm_events(struct spdk_nvmf_transport *transport, uint32_t max_events)
    3673             : {
    3674           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    3675           0 :         struct rdma_cm_event            *event;
    3676           0 :         uint32_t                        i;
    3677           0 :         int                             rc;
    3678           0 :         bool                            event_acked;
    3679             : 
    3680           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    3681             : 
    3682           0 :         if (rtransport->event_channel == NULL) {
    3683           0 :                 return;
    3684             :         }
    3685             : 
    3686           0 :         for (i = 0; i < max_events; i++) {
    3687           0 :                 event_acked = false;
    3688           0 :                 rc = rdma_get_cm_event(rtransport->event_channel, &event);
    3689           0 :                 if (rc) {
    3690           0 :                         if (errno != EAGAIN && errno != EWOULDBLOCK) {
    3691           0 :                                 SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno));
    3692           0 :                         }
    3693           0 :                         break;
    3694             :                 }
    3695             : 
    3696           0 :                 SPDK_DEBUGLOG(rdma, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]);
    3697             : 
    3698           0 :                 spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event);
    3699             : 
    3700           0 :                 switch (event->event) {
    3701             :                 case RDMA_CM_EVENT_ADDR_RESOLVED:
    3702             :                 case RDMA_CM_EVENT_ADDR_ERROR:
    3703             :                 case RDMA_CM_EVENT_ROUTE_RESOLVED:
    3704             :                 case RDMA_CM_EVENT_ROUTE_ERROR:
    3705             :                         /* No action required. The target never attempts to resolve routes. */
    3706           0 :                         break;
    3707             :                 case RDMA_CM_EVENT_CONNECT_REQUEST:
    3708           0 :                         rc = nvmf_rdma_connect(transport, event);
    3709           0 :                         if (rc < 0) {
    3710           0 :                                 SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc);
    3711           0 :                                 break;
    3712             :                         }
    3713           0 :                         break;
    3714             :                 case RDMA_CM_EVENT_CONNECT_RESPONSE:
    3715             :                         /* The target never initiates a new connection. So this will not occur. */
    3716           0 :                         break;
    3717             :                 case RDMA_CM_EVENT_CONNECT_ERROR:
    3718             :                         /* Can this happen? The docs say it can, but not sure what causes it. */
    3719           0 :                         break;
    3720             :                 case RDMA_CM_EVENT_UNREACHABLE:
    3721             :                 case RDMA_CM_EVENT_REJECTED:
    3722             :                         /* These only occur on the client side. */
    3723           0 :                         break;
    3724             :                 case RDMA_CM_EVENT_ESTABLISHED:
    3725             :                         /* TODO: Should we be waiting for this event anywhere? */
    3726           0 :                         break;
    3727             :                 case RDMA_CM_EVENT_DISCONNECTED:
    3728           0 :                         rc = nvmf_rdma_disconnect(event, &event_acked);
    3729           0 :                         if (rc < 0) {
    3730           0 :                                 SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
    3731           0 :                                 break;
    3732             :                         }
    3733           0 :                         break;
    3734             :                 case RDMA_CM_EVENT_DEVICE_REMOVAL:
    3735             :                         /* In case of device removal, the kernel IB layer triggers IBV_EVENT_DEVICE_FATAL,
    3736             :                          * which triggers RDMA_CM_EVENT_DEVICE_REMOVAL on all cma_ids.
    3737             :                          * Once these events are sent to SPDK, we should release all IB resources and
    3738             :                          * not attempt to call any ibv_query/modify/create functions. We can only call
    3739             :                          * ibv_destroy* functions to release the user space memory allocated by IB. All kernel
    3740             :                          * resources are already cleaned up. */
    3741           0 :                         if (event->id->qp) {
    3742             :                                 /* If rdma_cm event has a valid `qp` pointer then the event refers to the
    3743             :                                  * corresponding qpair. Otherwise the event refers to a listening device. */
    3744           0 :                                 rc = nvmf_rdma_disconnect(event, &event_acked);
    3745           0 :                                 if (rc < 0) {
    3746           0 :                                         SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
    3747           0 :                                         break;
    3748             :                                 }
    3749           0 :                         } else {
    3750           0 :                                 nvmf_rdma_handle_cm_event_port_removal(transport, event);
    3751           0 :                                 event_acked = true;
    3752             :                         }
    3753           0 :                         break;
    3754             :                 case RDMA_CM_EVENT_MULTICAST_JOIN:
    3755             :                 case RDMA_CM_EVENT_MULTICAST_ERROR:
    3756             :                         /* Multicast is not used */
    3757           0 :                         break;
    3758             :                 case RDMA_CM_EVENT_ADDR_CHANGE:
    3759           0 :                         event_acked = nvmf_rdma_handle_cm_event_addr_change(transport, event);
    3760           0 :                         break;
    3761             :                 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
    3762             :                         /* For now, do nothing. The target never re-uses queue pairs. */
    3763           0 :                         break;
    3764             :                 default:
    3765           0 :                         SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
    3766           0 :                         break;
    3767             :                 }
    3768           0 :                 if (!event_acked) {
    3769           0 :                         rdma_ack_cm_event(event);
    3770           0 :                 }
    3771           0 :         }
    3772           0 : }
    3773             : 
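/* Illustration (not part of rdma.c): rdma_get_cm_event() only fails with EAGAIN/EWOULDBLOCK, as
 * handled in nvmf_process_cm_events() above, when the event channel fd has been switched to
 * non-blocking mode. A minimal sketch of that setup, assuming the standard librdmacm API; the
 * function name is hypothetical.
 */
#include <fcntl.h>
#include <rdma/rdma_cma.h>

static struct rdma_event_channel *
example_create_nonblocking_cm_channel(void)
{
	struct rdma_event_channel *channel;
	int flags;

	channel = rdma_create_event_channel();
	if (channel == NULL) {
		return NULL;
	}

	flags = fcntl(channel->fd, F_GETFL);
	if (flags < 0 || fcntl(channel->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		rdma_destroy_event_channel(channel);
		return NULL;
	}

	return channel;
}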
    3774             : static void
    3775           0 : nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair)
    3776             : {
    3777           0 :         rqpair->last_wqe_reached = true;
    3778           0 :         nvmf_rdma_destroy_drained_qpair(rqpair);
    3779           0 : }
    3780             : 
    3781             : static void
    3782           0 : nvmf_rdma_qpair_process_last_wqe_event(void *ctx)
    3783             : {
    3784           0 :         struct spdk_nvmf_rdma_ibv_event_ctx *event_ctx = ctx;
    3785           0 :         struct spdk_nvmf_rdma_qpair *rqpair;
    3786             : 
    3787           0 :         rqpair = event_ctx->rqpair;
    3788             : 
    3789           0 :         if (rqpair) {
    3790           0 :                 assert(event_ctx == rqpair->last_wqe_reached_ctx);
    3791           0 :                 rqpair->last_wqe_reached_ctx = NULL;
    3792           0 :                 nvmf_rdma_handle_last_wqe_reached(rqpair);
    3793           0 :         }
    3794           0 :         free(event_ctx);
    3795           0 : }
    3796             : 
    3797             : static int
    3798           0 : nvmf_rdma_send_qpair_last_wqe_event(struct spdk_nvmf_rdma_qpair *rqpair)
    3799             : {
    3800           0 :         struct spdk_nvmf_rdma_ibv_event_ctx *ctx;
    3801           0 :         struct spdk_thread *thr = NULL;
    3802           0 :         int rc;
    3803             : 
    3804           0 :         if (rqpair->qpair.group) {
    3805           0 :                 thr = rqpair->qpair.group->thread;
    3806           0 :         } else if (rqpair->destruct_channel) {
    3807           0 :                 thr = spdk_io_channel_get_thread(rqpair->destruct_channel);
    3808           0 :         }
    3809             : 
    3810           0 :         if (!thr) {
    3811           0 :                 SPDK_DEBUGLOG(rdma, "rqpair %p has no thread\n", rqpair);
    3812           0 :                 return -EINVAL;
    3813             :         }
    3814             : 
    3815           0 :         if (rqpair->last_wqe_reached || rqpair->last_wqe_reached_ctx != NULL) {
    3816           0 :                 SPDK_ERRLOG("LAST_WQE_REACHED already received for rqpair %p\n", rqpair);
    3817           0 :                 return -EALREADY;
    3818             :         }
    3819             : 
    3820           0 :         ctx = calloc(1, sizeof(*ctx));
    3821           0 :         if (!ctx) {
    3822           0 :                 return -ENOMEM;
    3823             :         }
    3824             : 
    3825           0 :         ctx->rqpair = rqpair;
    3826           0 :         rqpair->last_wqe_reached_ctx = ctx;
    3827             : 
    3828           0 :         rc = spdk_thread_send_msg(thr, nvmf_rdma_qpair_process_last_wqe_event, ctx);
    3829           0 :         if (rc) {
    3830           0 :                 rqpair->last_wqe_reached_ctx = NULL;
    3831           0 :                 free(ctx);
    3832           0 :         }
    3833             : 
    3834           0 :         return rc;
    3835           0 : }
    3836             : 
    3837             : static int
    3838           0 : nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device)
    3839             : {
    3840           0 :         int                             rc;
    3841           0 :         struct spdk_nvmf_rdma_qpair     *rqpair = NULL;
    3842           0 :         struct ibv_async_event          event;
    3843             : 
    3844           0 :         rc = ibv_get_async_event(device->context, &event);
    3845             : 
    3846           0 :         if (rc) {
    3847             :                 /* In non-blocking mode, -1 means there are no events available. */
    3848           0 :                 return rc;
    3849             :         }
    3850             : 
    3851           0 :         switch (event.event_type) {
    3852             :         case IBV_EVENT_QP_FATAL:
    3853             :         case IBV_EVENT_QP_LAST_WQE_REACHED:
    3854             :         case IBV_EVENT_QP_REQ_ERR:
    3855             :         case IBV_EVENT_QP_ACCESS_ERR:
    3856             :         case IBV_EVENT_COMM_EST:
    3857             :         case IBV_EVENT_PATH_MIG:
    3858             :         case IBV_EVENT_PATH_MIG_ERR:
    3859           0 :                 rqpair = event.element.qp->qp_context;
    3860           0 :                 if (!rqpair) {
    3861             :                         /* A QP event belonging to an NVMe-RDMA initiator on the same device may be returned here. */
    3862           0 :                         SPDK_NOTICELOG("Async QP event for unknown QP: %s\n",
    3863             :                                        ibv_event_type_str(event.event_type));
    3864           0 :                         break;
    3865             :                 }
    3866             : 
    3867           0 :                 switch (event.event_type) {
    3868             :                 case IBV_EVENT_QP_FATAL:
    3869           0 :                         SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair);
    3870           0 :                         spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0,
    3871             :                                           (uintptr_t)rqpair, event.event_type);
    3872           0 :                         rqpair->ibv_in_error_state = true;
    3873           0 :                         spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    3874           0 :                         break;
    3875             :                 case IBV_EVENT_QP_LAST_WQE_REACHED:
    3876             :                         /* This event only occurs for shared receive queues. */
    3877           0 :                         SPDK_DEBUGLOG(rdma, "Last WQE reached event received for rqpair %p\n", rqpair);
    3878           0 :                         rc = nvmf_rdma_send_qpair_last_wqe_event(rqpair);
    3879           0 :                         if (rc) {
    3880           0 :                                 SPDK_WARNLOG("Failed to send LAST_WQE_REACHED event. rqpair %p, err %d\n", rqpair, rc);
    3881           0 :                                 rqpair->last_wqe_reached = true;
    3882           0 :                         }
    3883           0 :                         break;
    3884             :                 case IBV_EVENT_QP_REQ_ERR:
    3885             :                 case IBV_EVENT_QP_ACCESS_ERR:
    3886             :                 case IBV_EVENT_COMM_EST:
    3887             :                 case IBV_EVENT_PATH_MIG:
    3888             :                 case IBV_EVENT_PATH_MIG_ERR:
    3889           0 :                         SPDK_NOTICELOG("Async QP event: %s\n",
    3890             :                                        ibv_event_type_str(event.event_type));
    3891           0 :                         spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0,
    3892             :                                           (uintptr_t)rqpair, event.event_type);
    3893           0 :                         rqpair->ibv_in_error_state = true;
    3894           0 :                         break;
    3895             :                 default:
    3896           0 :                         break;
    3897             :                 }
    3898           0 :                 break;
    3899             :         case IBV_EVENT_DEVICE_FATAL:
    3900           0 :                 SPDK_ERRLOG("Device Fatal event[%s] received on %s. device: %p\n",
    3901             :                             ibv_event_type_str(event.event_type), ibv_get_device_name(device->context->device), device);
    3902           0 :                 device->need_destroy = true;
    3903           0 :                 break;
    3904             :         case IBV_EVENT_CQ_ERR:
    3905             :         case IBV_EVENT_PORT_ACTIVE:
    3906             :         case IBV_EVENT_PORT_ERR:
    3907             :         case IBV_EVENT_LID_CHANGE:
    3908             :         case IBV_EVENT_PKEY_CHANGE:
    3909             :         case IBV_EVENT_SM_CHANGE:
    3910             :         case IBV_EVENT_SRQ_ERR:
    3911             :         case IBV_EVENT_SRQ_LIMIT_REACHED:
    3912             :         case IBV_EVENT_CLIENT_REREGISTER:
    3913             :         case IBV_EVENT_GID_CHANGE:
    3914           0 :         case IBV_EVENT_SQ_DRAINED:
    3915             :         default:
    3916           0 :                 SPDK_NOTICELOG("Async event: %s\n",
    3917             :                                ibv_event_type_str(event.event_type));
    3918           0 :                 spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type);
    3919           0 :                 break;
    3920             :         }
    3921           0 :         ibv_ack_async_event(&event);
    3922             : 
    3923           0 :         return 0;
    3924           0 : }
    3925             : 
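/* Illustration (not part of rdma.c): ibv_get_async_event() blocks by default. It only returns -1
 * with no event available, as assumed in nvmf_process_ib_event() above, when the device's async_fd
 * has been put into non-blocking mode. A minimal sketch of that setup; the function name is
 * hypothetical.
 */
#include <fcntl.h>
#include <infiniband/verbs.h>

static int
example_set_async_fd_nonblocking(struct ibv_context *context)
{
	int flags = fcntl(context->async_fd, F_GETFL);

	if (flags < 0) {
		return -1;
	}
	return fcntl(context->async_fd, F_SETFL, flags | O_NONBLOCK);
}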
    3926             : static void
    3927           0 : nvmf_process_ib_events(struct spdk_nvmf_rdma_device *device, uint32_t max_events)
    3928             : {
    3929           0 :         int rc = 0;
    3930           0 :         uint32_t i = 0;
    3931             : 
    3932           0 :         for (i = 0; i < max_events; i++) {
    3933           0 :                 rc = nvmf_process_ib_event(device);
    3934           0 :                 if (rc) {
    3935           0 :                         break;
    3936             :                 }
    3937           0 :         }
    3938             : 
    3939           0 :         SPDK_DEBUGLOG(rdma, "Device %s: %u events processed\n", device->context->device->name, i);
    3940           0 : }
    3941             : 
    3942             : static int
    3943           0 : nvmf_rdma_accept(void *ctx)
    3944             : {
    3945           0 :         int     nfds, i = 0;
    3946           0 :         struct spdk_nvmf_transport *transport = ctx;
    3947           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    3948           0 :         struct spdk_nvmf_rdma_device *device, *tmp;
    3949           0 :         uint32_t count;
    3950           0 :         short revents;
    3951           0 :         bool do_retry;
    3952             : 
    3953           0 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    3954           0 :         do_retry = nvmf_rdma_retry_listen_port(rtransport);
    3955             : 
    3956           0 :         count = nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0);
    3957             : 
    3958           0 :         if (nfds <= 0) {
    3959           0 :                 return do_retry ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
    3960             :         }
    3961             : 
    3962             :         /* The first poll descriptor is the RDMA CM event channel. */
    3963           0 :         if (rtransport->poll_fds[i++].revents & POLLIN) {
    3964           0 :                 nvmf_process_cm_events(transport, NVMF_RDMA_MAX_EVENTS_PER_POLL);
    3965           0 :                 nfds--;
    3966           0 :         }
    3967             : 
    3968           0 :         if (nfds == 0) {
    3969           0 :                 return SPDK_POLLER_BUSY;
    3970             :         }
    3971             : 
    3972             :         /* Second and subsequent poll descriptors are IB async events */
    3973           0 :         TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
    3974           0 :                 revents = rtransport->poll_fds[i++].revents;
    3975           0 :                 if (revents & POLLIN) {
    3976           0 :                         if (spdk_likely(!device->need_destroy)) {
    3977           0 :                                 nvmf_process_ib_events(device, NVMF_RDMA_MAX_EVENTS_PER_POLL);
    3978           0 :                                 if (spdk_unlikely(device->need_destroy)) {
    3979           0 :                                         nvmf_rdma_handle_device_removal(rtransport, device);
    3980           0 :                                 }
    3981           0 :                         }
    3982           0 :                         nfds--;
    3983           0 :                 } else if (revents & POLLNVAL || revents & POLLHUP) {
    3984           0 :                         SPDK_ERRLOG("Received unexpected revents %x on device %p\n", (int)revents, device);
    3985           0 :                         nfds--;
    3986           0 :                 }
    3987           0 :         }
    3988             :         /* Check that all flagged fds have been served. */
    3989           0 :         assert(nfds == 0);
    3990             : 
    3991           0 :         return count > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
    3992           0 : }
    3993             : 
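/* Illustration (not part of rdma.c): nvmf_rdma_accept() above expects poll_fds[0] to be the RDMA CM
 * event channel and poll_fds[1..] to be the per-device ibv async fds, in the same order as
 * rtransport->devices. A hypothetical sketch of how such an array could be laid out, assuming the
 * transport/device structs defined earlier in this file; the real generate_poll_fds() is not shown
 * in this listing and may differ.
 */
static int
example_layout_poll_fds(struct spdk_nvmf_rdma_transport *rtransport)
{
	struct spdk_nvmf_rdma_device *device;
	struct pollfd *fds;
	uint32_t n = 1;		/* slot 0: RDMA CM event channel */
	uint32_t i = 0;

	TAILQ_FOREACH(device, &rtransport->devices, link) {
		n++;
	}

	fds = calloc(n, sizeof(*fds));
	if (fds == NULL) {
		return -ENOMEM;
	}

	fds[i].fd = rtransport->event_channel->fd;
	fds[i++].events = POLLIN;

	TAILQ_FOREACH(device, &rtransport->devices, link) {
		fds[i].fd = device->context->async_fd;
		fds[i++].events = POLLIN;
	}

	rtransport->poll_fds = fds;
	rtransport->npoll_fds = n;
	return 0;
}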
    3994             : static void
    3995           0 : nvmf_rdma_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem,
    3996             :                      struct spdk_nvmf_ctrlr_data *cdata)
    3997             : {
    3998           0 :         cdata->nvmf_specific.msdbd = NVMF_DEFAULT_MSDBD;
    3999             : 
    4000             :         /* Disable in-capsule data transfer for the RDMA controller when dif_insert_or_strip is enabled,
    4001             :          * since in-capsule data only works with NVMe drives that support the SGL memory layout. */
    4002           0 :         if (transport->opts.dif_insert_or_strip) {
    4003           0 :                 cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16;
    4004           0 :         }
    4005             : 
    4006           0 :         if (cdata->nvmf_specific.ioccsz > ((sizeof(struct spdk_nvme_cmd) + 0x1000) / 16)) {
    4007           0 :                 SPDK_WARNLOG("RDMA is configured to support up to 16 SGL entries while in-capsule"
    4008             :                              " data is greater than 4KiB.\n");
    4009           0 :                 SPDK_WARNLOG("When used in conjunction with the NVMe-oF initiator from the Linux "
    4010             :                              "kernel between versions 5.4 and 5.12 data corruption may occur for "
    4011             :                              "writes that are not a multiple of 4KiB in size.\n");
    4012           0 :         }
    4013           0 : }
    4014             : 
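/* Worked example (not part of rdma.c): an NVMe command is 64 bytes, so the smallest possible IOCCSZ
 * is 64 / 16 = 4 units. The threshold checked in nvmf_rdma_cdata_init() above is
 * (64 + 0x1000) / 16 = 260, i.e. the warnings fire only when the advertised in-capsule data size
 * exceeds 4 KiB (IOCCSZ is expressed in 16-byte units).
 */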
    4015             : static void
    4016           0 : nvmf_rdma_discover(struct spdk_nvmf_transport *transport,
    4017             :                    struct spdk_nvme_transport_id *trid,
    4018             :                    struct spdk_nvmf_discovery_log_page_entry *entry)
    4019             : {
    4020           0 :         entry->trtype = SPDK_NVMF_TRTYPE_RDMA;
    4021           0 :         entry->adrfam = trid->adrfam;
    4022           0 :         entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED;
    4023             : 
    4024           0 :         spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' ');
    4025           0 :         spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' ');
    4026             : 
    4027           0 :         entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED;
    4028           0 :         entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE;
    4029           0 :         entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
    4030           0 : }
    4031             : 
    4032             : static int
    4033           0 : nvmf_rdma_poller_create(struct spdk_nvmf_rdma_transport *rtransport,
    4034             :                         struct spdk_nvmf_rdma_poll_group *rgroup, struct spdk_nvmf_rdma_device *device,
    4035             :                         struct spdk_nvmf_rdma_poller **out_poller)
    4036             : {
    4037           0 :         struct spdk_nvmf_rdma_poller            *poller;
    4038           0 :         struct spdk_rdma_provider_srq_init_attr srq_init_attr;
    4039           0 :         struct spdk_nvmf_rdma_resource_opts     opts;
    4040           0 :         int                                     num_cqe;
    4041             : 
    4042           0 :         poller = calloc(1, sizeof(*poller));
    4043           0 :         if (!poller) {
    4044           0 :                 SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n");
    4045           0 :                 return -1;
    4046             :         }
    4047             : 
    4048           0 :         poller->device = device;
    4049           0 :         poller->group = rgroup;
    4050           0 :         *out_poller = poller;
    4051             : 
    4052           0 :         RB_INIT(&poller->qpairs);
    4053           0 :         STAILQ_INIT(&poller->qpairs_pending_send);
    4054           0 :         STAILQ_INIT(&poller->qpairs_pending_recv);
    4055             : 
    4056           0 :         TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link);
    4057           0 :         SPDK_DEBUGLOG(rdma, "Create poller %p on device %p in poll group %p.\n", poller, device, rgroup);
    4058           0 :         if (rtransport->rdma_opts.no_srq == false && device->num_srq < device->attr.max_srq) {
    4059           0 :                 if ((int)rtransport->rdma_opts.max_srq_depth > device->attr.max_srq_wr) {
    4060           0 :                         SPDK_WARNLOG("Requested SRQ depth %u, max supported by dev %s is %d\n",
    4061             :                                      rtransport->rdma_opts.max_srq_depth, device->context->device->name, device->attr.max_srq_wr);
    4062           0 :                 }
    4063           0 :                 poller->max_srq_depth = spdk_min((int)rtransport->rdma_opts.max_srq_depth, device->attr.max_srq_wr);
    4064             : 
    4065           0 :                 device->num_srq++;
    4066           0 :                 memset(&srq_init_attr, 0, sizeof(srq_init_attr));
    4067           0 :                 srq_init_attr.pd = device->pd;
    4068           0 :                 srq_init_attr.stats = &poller->stat.qp_stats.recv;
    4069           0 :                 srq_init_attr.srq_init_attr.attr.max_wr = poller->max_srq_depth;
    4070           0 :                 srq_init_attr.srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
    4071           0 :                 poller->srq = spdk_rdma_provider_srq_create(&srq_init_attr);
    4072           0 :                 if (!poller->srq) {
    4073           0 :                         SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno);
    4074           0 :                         return -1;
    4075             :                 }
    4076             : 
    4077           0 :                 opts.qp = poller->srq;
    4078           0 :                 opts.map = device->map;
    4079           0 :                 opts.qpair = NULL;
    4080           0 :                 opts.shared = true;
    4081           0 :                 opts.max_queue_depth = poller->max_srq_depth;
    4082           0 :                 opts.in_capsule_data_size = rtransport->transport.opts.in_capsule_data_size;
    4083             : 
    4084           0 :                 poller->resources = nvmf_rdma_resources_create(&opts);
    4085           0 :                 if (!poller->resources) {
    4086           0 :                         SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n");
    4087           0 :                         return -1;
    4088             :                 }
    4089           0 :         }
    4090             : 
    4091             :         /*
    4092             :          * When using an SRQ, we can limit the completion queue size at startup.
    4093             :          * The following formula represents the calculation:
    4094             :          * num_cqe = num_recv + num_data_wr + num_send_wr,
    4095             :          * where num_recv = num_data_wr = num_send_wr = poller->max_srq_depth.
    4096             :          */
    4097           0 :         if (poller->srq) {
    4098           0 :                 num_cqe = poller->max_srq_depth * 3;
    4099           0 :         } else {
    4100           0 :                 num_cqe = rtransport->rdma_opts.num_cqe;
    4101             :         }
    4102             : 
    4103           0 :         poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0);
    4104           0 :         if (!poller->cq) {
    4105           0 :                 SPDK_ERRLOG("Unable to create completion queue\n");
    4106           0 :                 return -1;
    4107             :         }
    4108           0 :         poller->num_cqe = num_cqe;
    4109           0 :         return 0;
    4110           0 : }
    4111             : 
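/* Worked example (not part of rdma.c): with the SRQ path in nvmf_rdma_poller_create() above and,
 * for instance, max_srq_depth = 4096, the completion queue is sized at num_cqe = 4096 * 3 = 12288,
 * covering one receive, one data-transfer and one send work request per outstanding command, as the
 * formula comment above describes.
 */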
    4112             : static void
    4113           0 : _nvmf_rdma_register_poller_in_group(void *c)
    4114             : {
    4115           0 :         struct spdk_nvmf_rdma_poller    *poller;
    4116           0 :         struct poller_manage_ctx        *ctx = c;
    4117           0 :         struct spdk_nvmf_rdma_device    *device;
    4118           0 :         int                             rc;
    4119             : 
    4120           0 :         rc = nvmf_rdma_poller_create(ctx->rtransport, ctx->rgroup, ctx->device, &poller);
    4121           0 :         if (rc < 0 && poller) {
    4122           0 :                 nvmf_rdma_poller_destroy(poller);
    4123           0 :         }
    4124             : 
    4125           0 :         device = ctx->device;
    4126           0 :         if (nvmf_rdma_all_pollers_management_done(ctx)) {
    4127           0 :                 device->is_ready = true;
    4128           0 :         }
    4129           0 : }
    4130             : 
    4131             : static void nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group);
    4132             : 
    4133             : static struct spdk_nvmf_transport_poll_group *
    4134           5 : nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport,
    4135             :                             struct spdk_nvmf_poll_group *group)
    4136             : {
    4137           5 :         struct spdk_nvmf_rdma_transport         *rtransport;
    4138           5 :         struct spdk_nvmf_rdma_poll_group        *rgroup;
    4139           5 :         struct spdk_nvmf_rdma_poller            *poller;
    4140           5 :         struct spdk_nvmf_rdma_device            *device;
    4141           5 :         int                                     rc;
    4142             : 
    4143           5 :         if (spdk_interrupt_mode_is_enabled()) {
    4144           0 :                 SPDK_ERRLOG("RDMA transport does not support interrupt mode\n");
    4145           0 :                 return NULL;
    4146             :         }
    4147             : 
    4148           5 :         rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
    4149             : 
    4150           5 :         rgroup = calloc(1, sizeof(*rgroup));
    4151           5 :         if (!rgroup) {
    4152           0 :                 return NULL;
    4153             :         }
    4154             : 
    4155           5 :         TAILQ_INIT(&rgroup->pollers);
    4156             : 
    4157           5 :         TAILQ_FOREACH(device, &rtransport->devices, link) {
    4158           0 :                 rc = nvmf_rdma_poller_create(rtransport, rgroup, device, &poller);
    4159           0 :                 if (rc < 0) {
    4160           0 :                         nvmf_rdma_poll_group_destroy(&rgroup->group);
    4161           0 :                         return NULL;
    4162             :                 }
    4163           0 :         }
    4164             : 
    4165           5 :         TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link);
    4166           5 :         if (rtransport->conn_sched.next_admin_pg == NULL) {
    4167           1 :                 rtransport->conn_sched.next_admin_pg = rgroup;
    4168           1 :                 rtransport->conn_sched.next_io_pg = rgroup;
    4169           1 :         }
    4170             : 
    4171           5 :         return &rgroup->group;
    4172           5 : }
    4173             : 
    4174             : static uint32_t
    4175          12 : nvmf_poll_group_get_io_qpair_count(struct spdk_nvmf_poll_group *pg)
    4176             : {
    4177          12 :         uint32_t count;
    4178             : 
    4179             :         /* Just assume that unassociated qpairs will eventually be io
    4180             :          * qpairs.  This is close enough for the use cases for this
    4181             :          * function.
    4182             :          */
    4183          12 :         pthread_mutex_lock(&pg->mutex);
    4184          12 :         count = pg->stat.current_io_qpairs + pg->current_unassociated_qpairs;
    4185          12 :         pthread_mutex_unlock(&pg->mutex);
    4186             : 
    4187          24 :         return count;
    4188          12 : }
    4189             : 
    4190             : static struct spdk_nvmf_transport_poll_group *
    4191          14 : nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
    4192             : {
    4193          14 :         struct spdk_nvmf_rdma_transport *rtransport;
    4194          14 :         struct spdk_nvmf_rdma_poll_group **pg;
    4195          14 :         struct spdk_nvmf_transport_poll_group *result;
    4196          14 :         uint32_t count;
    4197             : 
    4198          14 :         rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
    4199             : 
    4200          14 :         if (TAILQ_EMPTY(&rtransport->poll_groups)) {
    4201           2 :                 return NULL;
    4202             :         }
    4203             : 
    4204          12 :         if (qpair->qid == 0) {
    4205           6 :                 pg = &rtransport->conn_sched.next_admin_pg;
    4206           6 :         } else {
    4207           6 :                 struct spdk_nvmf_rdma_poll_group *pg_min, *pg_start, *pg_current;
    4208           6 :                 uint32_t min_value;
    4209             : 
    4210           6 :                 pg = &rtransport->conn_sched.next_io_pg;
    4211           6 :                 pg_min = *pg;
    4212           6 :                 pg_start = *pg;
    4213           6 :                 pg_current = *pg;
    4214           6 :                 min_value = nvmf_poll_group_get_io_qpair_count(pg_current->group.group);
    4215             : 
    4216           6 :                 while (1) {
    4217           6 :                         count = nvmf_poll_group_get_io_qpair_count(pg_current->group.group);
    4218             : 
    4219           6 :                         if (count < min_value) {
    4220           0 :                                 min_value = count;
    4221           0 :                                 pg_min = pg_current;
    4222           0 :                         }
    4223             : 
    4224           6 :                         pg_current = TAILQ_NEXT(pg_current, link);
    4225           6 :                         if (pg_current == NULL) {
    4226           2 :                                 pg_current = TAILQ_FIRST(&rtransport->poll_groups);
    4227           2 :                         }
    4228             : 
    4229           6 :                         if (pg_current == pg_start || min_value == 0) {
    4230           6 :                                 break;
    4231             :                         }
    4232             :                 }
    4233           6 :                 *pg = pg_min;
    4234           6 :         }
    4235             : 
    4236          12 :         assert(*pg != NULL);
    4237             : 
    4238          12 :         result = &(*pg)->group;
    4239             : 
    4240          12 :         *pg = TAILQ_NEXT(*pg, link);
    4241          12 :         if (*pg == NULL) {
    4242           4 :                 *pg = TAILQ_FIRST(&rtransport->poll_groups);
    4243           4 :         }
    4244             : 
    4245          12 :         return result;
    4246          14 : }
    4247             : 
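/* Worked example (not part of rdma.c): suppose there are three poll groups with 3, 1 and 2 I/O
 * qpairs and conn_sched.next_io_pg currently points at the first one. The scan in
 * nvmf_rdma_get_optimal_poll_group() above starts there, walks the list once (wrapping at the
 * tail), selects the group with 1 qpair as pg_min, and then advances the cursor past the chosen
 * group so the next connection starts its search one group later. If any group reports 0 qpairs,
 * the scan stops early and that group is selected. Admin qpairs (qid == 0) simply rotate through
 * conn_sched.next_admin_pg.
 */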
    4248             : static void
    4249           0 : nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller)
    4250             : {
    4251           0 :         struct spdk_nvmf_rdma_qpair     *qpair, *tmp_qpair;
    4252           0 :         int                             rc;
    4253             : 
    4254           0 :         TAILQ_REMOVE(&poller->group->pollers, poller, link);
    4255           0 :         RB_FOREACH_SAFE(qpair, qpairs_tree, &poller->qpairs, tmp_qpair) {
    4256           0 :                 nvmf_rdma_qpair_destroy(qpair);
    4257           0 :         }
    4258             : 
    4259           0 :         if (poller->srq) {
    4260           0 :                 if (poller->resources) {
    4261           0 :                         nvmf_rdma_resources_destroy(poller->resources);
    4262           0 :                 }
    4263           0 :                 spdk_rdma_provider_srq_destroy(poller->srq);
    4264           0 :                 SPDK_DEBUGLOG(rdma, "Destroyed RDMA shared queue %p\n", poller->srq);
    4265           0 :         }
    4266             : 
    4267           0 :         if (poller->cq) {
    4268           0 :                 rc = ibv_destroy_cq(poller->cq);
    4269           0 :                 if (rc != 0) {
    4270           0 :                         SPDK_ERRLOG("Destroy cq failed, rc %d, error: %s\n", rc, strerror(rc));
    4271           0 :                 }
    4272           0 :         }
    4273             : 
    4274           0 :         if (poller->destroy_cb) {
    4275           0 :                 poller->destroy_cb(poller->destroy_cb_ctx);
    4276           0 :                 poller->destroy_cb = NULL;
    4277           0 :         }
    4278             : 
    4279           0 :         free(poller);
    4280           0 : }
    4281             : 
    4282             : static void
    4283           5 : nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
    4284             : {
    4285           5 :         struct spdk_nvmf_rdma_poll_group        *rgroup, *next_rgroup;
    4286           5 :         struct spdk_nvmf_rdma_poller            *poller, *tmp;
    4287           5 :         struct spdk_nvmf_rdma_transport         *rtransport;
    4288             : 
    4289           5 :         rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
    4290           5 :         if (!rgroup) {
    4291           0 :                 return;
    4292             :         }
    4293             : 
    4294           5 :         TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
    4295           0 :                 nvmf_rdma_poller_destroy(poller);
    4296           0 :         }
    4297             : 
    4298           5 :         if (rgroup->group.transport == NULL) {
    4299             :                 /* Transport can be NULL when nvmf_rdma_poll_group_create()
    4300             :                  * calls this function directly in a failure path. */
    4301           0 :                 free(rgroup);
    4302           0 :                 return;
    4303             :         }
    4304             : 
    4305           5 :         rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport);
    4306             : 
    4307           5 :         next_rgroup = TAILQ_NEXT(rgroup, link);
    4308           5 :         TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link);
    4309           5 :         if (next_rgroup == NULL) {
    4310           1 :                 next_rgroup = TAILQ_FIRST(&rtransport->poll_groups);
    4311           1 :         }
    4312           5 :         if (rtransport->conn_sched.next_admin_pg == rgroup) {
    4313           5 :                 rtransport->conn_sched.next_admin_pg = next_rgroup;
    4314           5 :         }
    4315           5 :         if (rtransport->conn_sched.next_io_pg == rgroup) {
    4316           5 :                 rtransport->conn_sched.next_io_pg = next_rgroup;
    4317           5 :         }
    4318             : 
    4319           5 :         free(rgroup);
    4320           5 : }
    4321             : 
    4322             : static void
    4323           0 : nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair)
    4324             : {
    4325           0 :         if (rqpair->cm_id != NULL) {
    4326           0 :                 nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
    4327           0 :         }
    4328           0 : }
    4329             : 
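                     : /* Add a newly accepted qpair to this poll group: find the poller that serves the qpair's
                     :  * device, initialize the RDMA qpair, insert it into the poller's qpair tree and accept the
                     :  * CM connection. The connection is rejected if the accept fails.
                     :  */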
    4330             : static int
    4331           0 : nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
    4332             :                          struct spdk_nvmf_qpair *qpair)
    4333             : {
    4334           0 :         struct spdk_nvmf_rdma_poll_group        *rgroup;
    4335           0 :         struct spdk_nvmf_rdma_qpair             *rqpair;
    4336           0 :         struct spdk_nvmf_rdma_device            *device;
    4337           0 :         struct spdk_nvmf_rdma_poller            *poller;
    4338           0 :         int                                     rc;
    4339             : 
    4340           0 :         rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
    4341           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    4342             : 
    4343           0 :         device = rqpair->device;
    4344             : 
    4345           0 :         TAILQ_FOREACH(poller, &rgroup->pollers, link) {
    4346           0 :                 if (poller->device == device) {
    4347           0 :                         break;
    4348             :                 }
    4349           0 :         }
    4350             : 
    4351           0 :         if (!poller) {
    4352           0 :                 SPDK_ERRLOG("No poller found for device.\n");
    4353           0 :                 return -1;
    4354             :         }
    4355             : 
    4356           0 :         if (poller->need_destroy) {
    4357           0 :                 SPDK_ERRLOG("Poller is destroying.\n");
    4358           0 :                 return -1;
    4359             :         }
    4360             : 
    4361           0 :         rqpair->poller = poller;
    4362           0 :         rqpair->srq = rqpair->poller->srq;
    4363             : 
    4364           0 :         rc = nvmf_rdma_qpair_initialize(qpair);
    4365           0 :         if (rc < 0) {
    4366           0 :                 SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair);
    4367           0 :                 rqpair->poller = NULL;
    4368           0 :                 rqpair->srq = NULL;
    4369           0 :                 return -1;
    4370             :         }
    4371             : 
    4372           0 :         RB_INSERT(qpairs_tree, &poller->qpairs, rqpair);
    4373             : 
    4374           0 :         rc = nvmf_rdma_event_accept(rqpair->cm_id, rqpair);
    4375           0 :         if (rc) {
    4376             :                 /* Try to reject, but we probably can't */
    4377           0 :                 nvmf_rdma_qpair_reject_connection(rqpair);
    4378           0 :                 return -1;
    4379             :         }
    4380             : 
    4381           0 :         return 0;
    4382           0 : }
    4383             : 
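                     : /* Called when a qpair is removed from the poll group. Take a reference on the target's
                     :  * io_channel and stash it in destruct_channel so that it can be released on the correct
                     :  * thread when the qpair is finally destroyed.
                     :  */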
    4384             : static int
    4385           0 : nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
    4386             :                             struct spdk_nvmf_qpair *qpair)
    4387             : {
    4388           0 :         struct spdk_nvmf_rdma_qpair             *rqpair;
    4389             : 
    4390           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    4391           0 :         assert(group->transport->tgt != NULL);
    4392             : 
    4393           0 :         rqpair->destruct_channel = spdk_get_io_channel(group->transport->tgt);
    4394             : 
    4395           0 :         if (!rqpair->destruct_channel) {
    4396           0 :                 SPDK_WARNLOG("failed to get io_channel, qpair %p\n", qpair);
    4397           0 :                 return 0;
    4398             :         }
    4399             : 
    4400             :         /* Sanity check that we got the io_channel on the correct thread */
    4401           0 :         if (qpair->group) {
    4402           0 :                 assert(qpair->group->thread == spdk_io_channel_get_thread(rqpair->destruct_channel));
    4403           0 :         }
    4404             : 
    4405           0 :         return 0;
    4406           0 : }
    4407             : 
    4408             : static int
    4409           0 : nvmf_rdma_request_free(struct spdk_nvmf_request *req)
    4410             : {
    4411           0 :         struct spdk_nvmf_rdma_request   *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
    4412           0 :         struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport,
    4413             :                         struct spdk_nvmf_rdma_transport, transport);
    4414           0 :         struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair,
    4415             :                                               struct spdk_nvmf_rdma_qpair, qpair);
    4416             : 
    4417             :         /*
    4418             :          * AER requests are freed when a qpair is destroyed. The recv corresponding to that request
    4419             :          * needs to be returned to the shared receive queue or the poll group will eventually be
    4420             :          * starved of RECV structures.
    4421             :          */
    4422           0 :         if (rqpair->srq && rdma_req->recv) {
    4423           0 :                 int rc;
    4424           0 :                 struct ibv_recv_wr *bad_recv_wr;
    4425             : 
    4426           0 :                 spdk_rdma_provider_srq_queue_recv_wrs(rqpair->srq, &rdma_req->recv->wr);
    4427           0 :                 rc = spdk_rdma_provider_srq_flush_recv_wrs(rqpair->srq, &bad_recv_wr);
    4428           0 :                 if (rc) {
    4429           0 :                         SPDK_ERRLOG("Unable to re-post rx descriptor\n");
    4430           0 :                 }
    4431           0 :         }
    4432             : 
    4433           0 :         _nvmf_rdma_request_free(rdma_req, rtransport);
    4434           0 :         return 0;
    4435           0 : }
    4436             : 
    4437             : static int
    4438           0 : nvmf_rdma_request_complete(struct spdk_nvmf_request *req)
    4439             : {
    4440           0 :         struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport,
    4441             :                         struct spdk_nvmf_rdma_transport, transport);
    4442           0 :         struct spdk_nvmf_rdma_request   *rdma_req = SPDK_CONTAINEROF(req,
    4443             :                         struct spdk_nvmf_rdma_request, req);
    4444           0 :         struct spdk_nvmf_rdma_qpair     *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair,
    4445             :                         struct spdk_nvmf_rdma_qpair, qpair);
    4446             : 
    4447           0 :         if (spdk_unlikely(rqpair->ibv_in_error_state)) {
    4448             :                 /* The connection is dead. Move the request directly to the completed state. */
    4449           0 :                 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    4450           0 :         } else {
    4451             :                 /* The connection is alive, so process the request as normal */
    4452           0 :                 rdma_req->state = RDMA_REQUEST_STATE_EXECUTED;
    4453             :         }
    4454             : 
    4455           0 :         nvmf_rdma_request_process(rtransport, rdma_req);
    4456             : 
    4457           0 :         return 0;
    4458           0 : }
    4459             : 
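                     : /* Begin tearing down a qpair: reject the connection if it was never fully established,
                     :  * disconnect the RDMA QP, and destroy the qpair once it has drained.
                     :  */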
    4460             : static void
    4461           0 : nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair,
    4462             :                       spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
    4463             : {
    4464           0 :         struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    4465             : 
    4466           0 :         rqpair->to_close = true;
    4467             : 
    4468           0 :         if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) {
    4469           0 :                 nvmf_rdma_qpair_reject_connection(rqpair);
    4470           0 :         }
    4471           0 :         if (rqpair->rdma_qp) {
    4472           0 :                 spdk_rdma_provider_qp_disconnect(rqpair->rdma_qp);
    4473           0 :         }
    4474             : 
    4475           0 :         nvmf_rdma_destroy_drained_qpair(rqpair);
    4476             : 
    4477           0 :         if (cb_fn) {
    4478           0 :                 cb_fn(cb_arg);
    4479           0 :         }
    4480           0 : }
    4481             : 
    4482             : static struct spdk_nvmf_rdma_qpair *
    4483           0 : get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc)
    4484             : {
    4485           0 :         struct spdk_nvmf_rdma_qpair find;
    4486             : 
    4487           0 :         find.qp_num = wc->qp_num;
    4488             : 
    4489           0 :         return RB_FIND(qpairs_tree, &rpoller->qpairs, &find);
    4490           0 : }
    4491             : 
    4492             : #ifdef DEBUG
    4493             : static int
    4494           0 : nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req)
    4495             : {
    4496           0 :         return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST ||
    4497           0 :                rdma_req->state == RDMA_REQUEST_STATE_COMPLETING;
    4498             : }
    4499             : #endif
    4500             : 
    4501             : static void
    4502           0 : _poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr,
    4503             :                            int rc)
    4504             : {
    4505           0 :         struct spdk_nvmf_rdma_recv      *rdma_recv;
    4506           0 :         struct spdk_nvmf_rdma_wr        *bad_rdma_wr;
    4507             : 
    4508           0 :         SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc);
    4509           0 :         while (bad_recv_wr != NULL) {
    4510           0 :                 bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id;
    4511           0 :                 rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr);
    4512             : 
    4513           0 :                 rdma_recv->qpair->current_recv_depth++;
    4514           0 :                 bad_recv_wr = bad_recv_wr->next;
    4515           0 :                 SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc);
    4516           0 :                 spdk_nvmf_qpair_disconnect(&rdma_recv->qpair->qpair);
    4517             :         }
    4518           0 : }
    4519             : 
    4520             : static void
    4521           0 : _qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc)
    4522             : {
    4523           0 :         SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc);
    4524           0 :         while (bad_recv_wr != NULL) {
    4525           0 :                 bad_recv_wr = bad_recv_wr->next;
    4526           0 :                 rqpair->current_recv_depth++;
    4527             :         }
    4528           0 :         spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    4529           0 : }
    4530             : 
    4531             : static void
    4532           0 : _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
    4533             :                      struct spdk_nvmf_rdma_poller *rpoller)
    4534             : {
    4535           0 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    4536           0 :         struct ibv_recv_wr              *bad_recv_wr;
    4537           0 :         int                             rc;
    4538             : 
    4539           0 :         if (rpoller->srq) {
    4540           0 :                 rc = spdk_rdma_provider_srq_flush_recv_wrs(rpoller->srq, &bad_recv_wr);
    4541           0 :                 if (spdk_unlikely(rc)) {
    4542           0 :                         _poller_reset_failed_recvs(rpoller, bad_recv_wr, rc);
    4543           0 :                 }
    4544           0 :         } else {
    4545           0 :                 while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) {
    4546           0 :                         rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv);
    4547           0 :                         rc = spdk_rdma_provider_qp_flush_recv_wrs(rqpair->rdma_qp, &bad_recv_wr);
    4548           0 :                         if (spdk_unlikely(rc)) {
    4549           0 :                                 _qp_reset_failed_recvs(rqpair, bad_recv_wr, rc);
    4550           0 :                         }
    4551           0 :                         STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link);
    4552             :                 }
    4553             :         }
    4554           0 : }
    4555             : 
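                     : /* Roll back accounting for send WRs that failed to post: decrement the send (and read)
                     :  * depth for each bad WR, fail the corresponding requests, and disconnect the qpair.
                     :  */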
    4556             : static void
    4557           0 : _qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport,
    4558             :                        struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc)
    4559             : {
    4560           0 :         struct spdk_nvmf_rdma_wr        *bad_rdma_wr;
    4561           0 :         struct spdk_nvmf_rdma_request   *prev_rdma_req = NULL, *cur_rdma_req = NULL;
    4562             : 
    4563           0 :         SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc);
    4564           0 :         for (; bad_wr != NULL; bad_wr = bad_wr->next) {
    4565           0 :                 bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id;
    4566           0 :                 assert(rqpair->current_send_depth > 0);
    4567           0 :                 rqpair->current_send_depth--;
    4568           0 :                 switch (bad_rdma_wr->type) {
    4569             :                 case RDMA_WR_TYPE_DATA:
    4570           0 :                         cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data_wr);
    4571           0 :                         if (bad_wr->opcode == IBV_WR_RDMA_READ) {
    4572           0 :                                 assert(rqpair->current_read_depth > 0);
    4573           0 :                                 rqpair->current_read_depth--;
    4574           0 :                         }
    4575           0 :                         break;
    4576             :                 case RDMA_WR_TYPE_SEND:
    4577           0 :                         cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp_wr);
    4578           0 :                         break;
    4579             :                 default:
    4580           0 :                         SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair);
    4581           0 :                         prev_rdma_req = cur_rdma_req;
    4582           0 :                         continue;
    4583             :                 }
    4584             : 
    4585           0 :                 if (prev_rdma_req == cur_rdma_req) {
    4586             :                         /* This request was already handled by an earlier WR, i.e. we were performing an NVMe read. */
    4587             :                         /* We only have to check against prev_rdma_req since each request's WRs are contiguous in this list. */
    4588           0 :                         continue;
    4589             :                 }
    4590             : 
    4591           0 :                 switch (cur_rdma_req->state) {
    4592             :                 case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
    4593           0 :                         cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
    4594           0 :                         STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, cur_rdma_req, state_link);
    4595           0 :                         cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
    4596           0 :                         break;
    4597             :                 case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
    4598             :                 case RDMA_REQUEST_STATE_COMPLETING:
    4599           0 :                         cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    4600           0 :                         break;
    4601             :                 default:
    4602           0 :                         SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n",
    4603             :                                     cur_rdma_req->state, rqpair);
    4604           0 :                         continue;
    4605             :                 }
    4606             : 
    4607           0 :                 nvmf_rdma_request_process(rtransport, cur_rdma_req);
    4608           0 :                 prev_rdma_req = cur_rdma_req;
    4609           0 :         }
    4610             : 
    4611           0 :         if (spdk_nvmf_qpair_is_active(&rqpair->qpair)) {
    4612             :                 /* Disconnect the connection. */
    4613           0 :                 spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    4614           0 :         }
    4615             : 
    4616           0 : }
    4617             : 
    4618             : static void
    4619           0 : _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
    4620             :                      struct spdk_nvmf_rdma_poller *rpoller)
    4621             : {
    4622           0 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    4623           0 :         struct ibv_send_wr              *bad_wr = NULL;
    4624           0 :         int                             rc;
    4625             : 
    4626           0 :         while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) {
    4627           0 :                 rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send);
    4628           0 :                 rc = spdk_rdma_provider_qp_flush_send_wrs(rqpair->rdma_qp, &bad_wr);
    4629             : 
    4630             :                 /* bad_wr always points to the first WR that failed. */
    4631           0 :                 if (spdk_unlikely(rc)) {
    4632           0 :                         _qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc);
    4633           0 :                 }
    4634           0 :                 STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link);
    4635             :         }
    4636           0 : }
    4637             : 
    4638             : static const char *
    4639           0 : nvmf_rdma_wr_type_str(enum spdk_nvmf_rdma_wr_type wr_type)
    4640             : {
    4641           0 :         switch (wr_type) {
    4642             :         case RDMA_WR_TYPE_RECV:
    4643           0 :                 return "RECV";
    4644             :         case RDMA_WR_TYPE_SEND:
    4645           0 :                 return "SEND";
    4646             :         case RDMA_WR_TYPE_DATA:
    4647           0 :                 return "DATA";
    4648             :         default:
    4649           0 :                 SPDK_ERRLOG("Unknown WR type %d\n", wr_type);
    4650           0 :                 SPDK_UNREACHABLE();
    4651             :         }
    4652           0 : }
    4653             : 
    4654             : static inline void
    4655           0 : nvmf_rdma_log_wc_status(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_wc *wc)
    4656             : {
    4657           0 :         enum spdk_nvmf_rdma_wr_type wr_type = ((struct spdk_nvmf_rdma_wr *)wc->wr_id)->type;
    4658             : 
    4659           0 :         if (wc->status == IBV_WC_WR_FLUSH_ERR) {
    4660             :                 /* If the qpair is in the ERR state, we will receive completions with IBV_WC_WR_FLUSH_ERR
    4661             :                  * status for all posted but not yet completed Work Requests. Don't log an error in that case. */
    4662           0 :                 SPDK_DEBUGLOG(rdma,
    4663             :                               "Error on CQ %p, (qp state %d, in_error %d) request 0x%lu, type %s, status: (%d): %s\n",
    4664             :                               rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_in_error_state, wc->wr_id,
    4665             :                               nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status));
    4666           0 :         } else {
    4667           0 :                 SPDK_ERRLOG("Error on CQ %p, (qp state %d, in_error %d) request 0x%lu, type %s, status: (%d): %s\n",
    4668             :                             rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_in_error_state, wc->wr_id,
    4669             :                             nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status));
    4670             :         }
    4671           0 : }
    4672             : 
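                     : /* Drain up to 32 completions from the poller's CQ, dispatch each one by WR type
                     :  * (SEND, RECV or DATA) through the request state machine, then flush any pending
                     :  * receive and send WRs. Returns the number of completed responses, or a negative
                     :  * value if an error completion was seen or the CQ poll failed.
                     :  */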
    4673             : static int
    4674           0 : nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
    4675             :                       struct spdk_nvmf_rdma_poller *rpoller)
    4676             : {
    4677           0 :         struct ibv_wc wc[32];
    4678           0 :         struct spdk_nvmf_rdma_wr        *rdma_wr;
    4679           0 :         struct spdk_nvmf_rdma_request   *rdma_req;
    4680           0 :         struct spdk_nvmf_rdma_recv      *rdma_recv;
    4681           0 :         struct spdk_nvmf_rdma_qpair     *rqpair, *tmp_rqpair;
    4682           0 :         int reaped, i;
    4683           0 :         int count = 0;
    4684           0 :         int rc;
    4685           0 :         bool error = false;
    4686           0 :         uint64_t poll_tsc = spdk_get_ticks();
    4687             : 
    4688           0 :         if (spdk_unlikely(rpoller->need_destroy)) {
    4689             :                 /* If a qpair is closed before the poller is destroyed, nvmf_rdma_destroy_drained_qpair
    4690             :                  * may not be called because we can no longer poll anything from the CQ. Call it here to
    4691             :                  * force destruction of the qpair once to_close has been set.
    4692             :                  */
    4693           0 :                 RB_FOREACH_SAFE(rqpair, qpairs_tree, &rpoller->qpairs, tmp_rqpair) {
    4694           0 :                         nvmf_rdma_destroy_drained_qpair(rqpair);
    4695           0 :                 }
    4696           0 :                 return 0;
    4697             :         }
    4698             : 
    4699             :         /* Poll for completing operations. */
    4700           0 :         reaped = ibv_poll_cq(rpoller->cq, 32, wc);
    4701           0 :         if (spdk_unlikely(reaped < 0)) {
    4702           0 :                 SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
    4703             :                             errno, spdk_strerror(errno));
    4704           0 :                 return -1;
    4705           0 :         } else if (reaped == 0) {
    4706           0 :                 rpoller->stat.idle_polls++;
    4707           0 :         }
    4708             : 
    4709           0 :         rpoller->stat.polls++;
    4710           0 :         rpoller->stat.completions += reaped;
    4711             : 
    4712           0 :         for (i = 0; i < reaped; i++) {
    4713             : 
    4714           0 :                 rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id;
    4715             : 
    4716           0 :                 switch (rdma_wr->type) {
    4717             :                 case RDMA_WR_TYPE_SEND:
    4718           0 :                         rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp_wr);
    4719           0 :                         rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
    4720             : 
    4721           0 :                         if (spdk_likely(!wc[i].status)) {
    4722           0 :                                 count++;
    4723           0 :                                 assert(wc[i].opcode == IBV_WC_SEND);
    4724           0 :                                 assert(nvmf_rdma_req_is_completing(rdma_req));
    4725           0 :                         }
    4726             : 
    4727           0 :                         rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    4728             :                         /* RDMA_WRITE operation completed. +1 since it was chained with rsp WR */
    4729           0 :                         assert(rqpair->current_send_depth >= (uint32_t)rdma_req->num_outstanding_data_wr + 1);
    4730           0 :                         rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1;
    4731           0 :                         rdma_req->num_outstanding_data_wr = 0;
    4732             : 
    4733           0 :                         nvmf_rdma_request_process(rtransport, rdma_req);
    4734           0 :                         break;
    4735             :                 case RDMA_WR_TYPE_RECV:
    4736             :                         /* rdma_recv->qpair will be invalid if using an SRQ.  In that case we have to get the qpair from the wc. */
    4737           0 :                         rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr);
    4738           0 :                         if (rpoller->srq != NULL) {
    4739           0 :                                 rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]);
    4740             :                                 /* It is possible that there are still some completions for a destroyed QP
    4741             :                                  * associated with the SRQ. We simply ignore these late completions and re-post
    4742             :                                  * the receive WRs back to the SRQ.
    4743             :                                  */
    4744           0 :                                 if (spdk_unlikely(NULL == rdma_recv->qpair)) {
    4745           0 :                                         struct ibv_recv_wr *bad_wr;
    4746             : 
    4747           0 :                                         rdma_recv->wr.next = NULL;
    4748           0 :                                         spdk_rdma_provider_srq_queue_recv_wrs(rpoller->srq, &rdma_recv->wr);
    4749           0 :                                         rc = spdk_rdma_provider_srq_flush_recv_wrs(rpoller->srq, &bad_wr);
    4750           0 :                                         if (rc) {
    4751           0 :                                                 SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc);
    4752           0 :                                         }
    4753             :                                         continue;
    4754           0 :                                 }
    4755           0 :                         }
    4756           0 :                         rqpair = rdma_recv->qpair;
    4757             : 
    4758           0 :                         assert(rqpair != NULL);
    4759           0 :                         if (spdk_likely(!wc[i].status)) {
    4760           0 :                                 assert(wc[i].opcode == IBV_WC_RECV);
    4761           0 :                                 if (rqpair->current_recv_depth >= rqpair->max_queue_depth) {
    4762           0 :                                         spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    4763           0 :                                         break;
    4764             :                                 }
    4765           0 :                         }
    4766             : 
    4767           0 :                         rdma_recv->wr.next = NULL;
    4768           0 :                         rqpair->current_recv_depth++;
    4769           0 :                         rdma_recv->receive_tsc = poll_tsc;
    4770           0 :                         rpoller->stat.requests++;
    4771           0 :                         STAILQ_INSERT_HEAD(&rqpair->resources->incoming_queue, rdma_recv, link);
    4772           0 :                         rqpair->qpair.queue_depth++;
    4773           0 :                         break;
    4774             :                 case RDMA_WR_TYPE_DATA:
    4775           0 :                         rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data_wr);
    4776           0 :                         rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
    4777             : 
    4778           0 :                         assert(rdma_req->num_outstanding_data_wr > 0);
    4779             : 
    4780           0 :                         rqpair->current_send_depth--;
    4781           0 :                         rdma_req->num_outstanding_data_wr--;
    4782           0 :                         if (spdk_likely(!wc[i].status)) {
    4783           0 :                                 assert(wc[i].opcode == IBV_WC_RDMA_READ);
    4784           0 :                                 rqpair->current_read_depth--;
    4785             :                                 /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */
    4786           0 :                                 if (rdma_req->num_outstanding_data_wr == 0) {
    4787           0 :                                         if (rdma_req->num_remaining_data_wr) {
    4788             :                                                 /* Only some of the RDMA_READ operations were submitted, process the rest */
    4789           0 :                                                 nvmf_rdma_request_reset_transfer_in(rdma_req, rtransport);
    4790           0 :                                                 rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING;
    4791           0 :                                                 nvmf_rdma_request_process(rtransport, rdma_req);
    4792           0 :                                                 break;
    4793             :                                         }
    4794           0 :                                         rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
    4795           0 :                                         nvmf_rdma_request_process(rtransport, rdma_req);
    4796           0 :                                 }
    4797           0 :                         } else {
    4798             :                                 /* If the data transfer fails, the queue is still forced into the error state.
    4799             :                                  * If we were performing an RDMA_READ, we need to force the request into a
    4800             :                                  * completed state since it wasn't linked to a SEND. However, in the RDMA_WRITE
    4801             :                                  * case, we should wait for the SEND to complete. */
    4802           0 :                                 if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) {
    4803           0 :                                         rqpair->current_read_depth--;
    4804           0 :                                         if (rdma_req->num_outstanding_data_wr == 0) {
    4805           0 :                                                 if (rdma_req->num_remaining_data_wr) {
    4806             :                                                         /* A partially sent request is still in the pending_rdma_read_queue;
    4807             :                                                          * remove it now before completing */
    4808           0 :                                                         rdma_req->num_remaining_data_wr = 0;
    4809           0 :                                                         STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
    4810           0 :                                                 }
    4811           0 :                                                 rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
    4812           0 :                                                 nvmf_rdma_request_process(rtransport, rdma_req);
    4813           0 :                                         }
    4814           0 :                                 }
    4815             :                         }
    4816           0 :                         break;
    4817             :                 default:
    4818           0 :                         SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode);
    4819           0 :                         continue;
    4820             :                 }
    4821             : 
    4822             :                 /* Handle error conditions */
    4823           0 :                 if (spdk_unlikely(wc[i].status)) {
    4824           0 :                         rqpair->ibv_in_error_state = true;
    4825           0 :                         nvmf_rdma_log_wc_status(rqpair, &wc[i]);
    4826             : 
    4827           0 :                         error = true;
    4828             : 
    4829           0 :                         if (spdk_nvmf_qpair_is_active(&rqpair->qpair)) {
    4830             :                                 /* Disconnect the connection. */
    4831           0 :                                 spdk_nvmf_qpair_disconnect(&rqpair->qpair);
    4832           0 :                         } else {
    4833           0 :                                 nvmf_rdma_destroy_drained_qpair(rqpair);
    4834             :                         }
    4835           0 :                         continue;
    4836             :                 }
    4837             : 
    4838           0 :                 nvmf_rdma_qpair_process_pending(rtransport, rqpair, false);
    4839             : 
    4840           0 :                 if (spdk_unlikely(!spdk_nvmf_qpair_is_active(&rqpair->qpair))) {
    4841           0 :                         nvmf_rdma_destroy_drained_qpair(rqpair);
    4842           0 :                 }
    4843           0 :         }
    4844             : 
    4845           0 :         if (spdk_unlikely(error == true)) {
    4846           0 :                 return -1;
    4847             :         }
    4848             : 
    4849           0 :         if (reaped == 0) {
    4850             :                 /* In some cases we may not receive any CQE, yet still have pending I/O requests waiting for
    4851             :                  * a resource (e.g. a WR from the data_wr_pool).
    4852             :                  * We need to start processing such requests when no CQE was reaped. */
    4853           0 :                 nvmf_rdma_poller_process_pending_buf_queue(rtransport, rpoller);
    4854           0 :         }
    4855             : 
    4856             :         /* submit outstanding work requests. */
    4857           0 :         _poller_submit_recvs(rtransport, rpoller);
    4858           0 :         _poller_submit_sends(rtransport, rpoller);
    4859             : 
    4860           0 :         return count;
    4861           0 : }
    4862             : 
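                     : /* Free any IB devices that have been flagged ready_to_destroy and regenerate the
                     :  * transport's poll fds without them.
                     :  */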
    4863             : static void
    4864           0 : _nvmf_rdma_remove_destroyed_device(void *c)
    4865             : {
    4866           0 :         struct spdk_nvmf_rdma_transport *rtransport = c;
    4867           0 :         struct spdk_nvmf_rdma_device    *device, *device_tmp;
    4868           0 :         int                             rc;
    4869             : 
    4870           0 :         TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
    4871           0 :                 if (device->ready_to_destroy) {
    4872           0 :                         destroy_ib_device(rtransport, device);
    4873           0 :                 }
    4874           0 :         }
    4875             : 
    4876           0 :         free_poll_fds(rtransport);
    4877           0 :         rc = generate_poll_fds(rtransport);
    4878             :         /* We cannot handle an fd allocation error here. */
    4879           0 :         if (rc != 0) {
    4880           0 :                 SPDK_ERRLOG("Failed to generate poll fds after removing an IB device.\n");
    4881           0 :         }
    4882           0 : }
    4883             : 
    4884             : static void
    4885           0 : _nvmf_rdma_remove_poller_in_group_cb(void *c)
    4886             : {
    4887           0 :         struct poller_manage_ctx        *ctx = c;
    4888           0 :         struct spdk_nvmf_rdma_transport *rtransport = ctx->rtransport;
    4889           0 :         struct spdk_nvmf_rdma_device    *device = ctx->device;
    4890           0 :         struct spdk_thread              *thread = ctx->thread;
    4891             : 
    4892           0 :         if (nvmf_rdma_all_pollers_management_done(c)) {
    4893             :                 /* destroy device when last poller is destroyed */
    4894           0 :                 device->ready_to_destroy = true;
    4895           0 :                 spdk_thread_send_msg(thread, _nvmf_rdma_remove_destroyed_device, rtransport);
    4896           0 :         }
    4897           0 : }
    4898             : 
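                     : /* Mark the poller for destruction. If it no longer has any qpairs it is destroyed
                     :  * immediately; otherwise destruction is deferred until its qpairs have been drained.
                     :  */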
    4899             : static void
    4900           0 : _nvmf_rdma_remove_poller_in_group(void *c)
    4901             : {
    4902           0 :         struct poller_manage_ctx                *ctx = c;
    4903             : 
    4904           0 :         ctx->rpoller->need_destroy = true;
    4905           0 :         ctx->rpoller->destroy_cb_ctx = ctx;
    4906           0 :         ctx->rpoller->destroy_cb = _nvmf_rdma_remove_poller_in_group_cb;
    4907             : 
    4908             :         /* qp will be disconnected after receiving a RDMA_CM_EVENT_DEVICE_REMOVAL event. */
    4909           0 :         if (RB_EMPTY(&ctx->rpoller->qpairs)) {
    4910           0 :                 nvmf_rdma_poller_destroy(ctx->rpoller);
    4911           0 :         }
    4912           0 : }
    4913             : 
    4914             : static int
    4915           0 : nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
    4916             : {
    4917           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    4918           0 :         struct spdk_nvmf_rdma_poll_group *rgroup;
    4919           0 :         struct spdk_nvmf_rdma_poller    *rpoller, *tmp;
    4920           0 :         int                             count = 0, rc, rc2 = 0;
    4921             : 
    4922           0 :         rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport);
    4923           0 :         rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
    4924             : 
    4925           0 :         TAILQ_FOREACH_SAFE(rpoller, &rgroup->pollers, link, tmp) {
    4926           0 :                 rc = nvmf_rdma_poller_poll(rtransport, rpoller);
    4927           0 :                 if (spdk_unlikely(rc < 0)) {
    4928           0 :                         if (rc2 == 0) {
    4929           0 :                                 rc2 = rc;
    4930           0 :                         }
    4931           0 :                         continue;
    4932             :                 }
    4933           0 :                 count += rc;
    4934           0 :         }
    4935             : 
    4936           0 :         return rc2 ? rc2 : count;
    4937           0 : }
    4938             : 
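                     : /* Translate the local or peer address of an rdma_cm_id into an SPDK transport ID
                     :  * (transport type, address family, address and service/port).
                     :  */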
    4939             : static int
    4940           0 : nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id,
    4941             :                           struct spdk_nvme_transport_id *trid,
    4942             :                           bool peer)
    4943             : {
    4944           0 :         struct sockaddr *saddr;
    4945           0 :         uint16_t port;
    4946             : 
    4947           0 :         spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA);
    4948             : 
    4949           0 :         if (peer) {
    4950           0 :                 saddr = rdma_get_peer_addr(id);
    4951           0 :         } else {
    4952           0 :                 saddr = rdma_get_local_addr(id);
    4953             :         }
    4954           0 :         switch (saddr->sa_family) {
    4955             :         case AF_INET: {
    4956           0 :                 struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr;
    4957             : 
    4958           0 :                 trid->adrfam = SPDK_NVMF_ADRFAM_IPV4;
    4959           0 :                 inet_ntop(AF_INET, &saddr_in->sin_addr,
    4960           0 :                           trid->traddr, sizeof(trid->traddr));
    4961           0 :                 if (peer) {
    4962           0 :                         port = ntohs(rdma_get_dst_port(id));
    4963           0 :                 } else {
    4964           0 :                         port = ntohs(rdma_get_src_port(id));
    4965             :                 }
    4966           0 :                 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port);
    4967             :                 break;
    4968           0 :         }
    4969             :         case AF_INET6: {
    4970           0 :                 struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr;
    4971           0 :                 trid->adrfam = SPDK_NVMF_ADRFAM_IPV6;
    4972           0 :                 inet_ntop(AF_INET6, &saddr_in->sin6_addr,
    4973           0 :                           trid->traddr, sizeof(trid->traddr));
    4974           0 :                 if (peer) {
    4975           0 :                         port = ntohs(rdma_get_dst_port(id));
    4976           0 :                 } else {
    4977           0 :                         port = ntohs(rdma_get_src_port(id));
    4978             :                 }
    4979           0 :                 snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port);
    4980             :                 break;
    4981           0 :         }
    4982             :         default:
    4983           0 :                 return -1;
    4984             : 
    4985             :         }
    4986             : 
    4987           0 :         return 0;
    4988           0 : }
    4989             : 
    4990             : static int
    4991           0 : nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
    4992             :                               struct spdk_nvme_transport_id *trid)
    4993             : {
    4994           0 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    4995             : 
    4996           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    4997             : 
    4998           0 :         return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true);
    4999           0 : }
    5000             : 
    5001             : static int
    5002           0 : nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
    5003             :                                struct spdk_nvme_transport_id *trid)
    5004             : {
    5005           0 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    5006             : 
    5007           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    5008             : 
    5009           0 :         return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false);
    5010           0 : }
    5011             : 
    5012             : static int
    5013           0 : nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
    5014             :                                 struct spdk_nvme_transport_id *trid)
    5015             : {
    5016           0 :         struct spdk_nvmf_rdma_qpair     *rqpair;
    5017             : 
    5018           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    5019             : 
    5020           0 :         return nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false);
    5021           0 : }
    5022             : 
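                     : /* Install caller-provided RDMA hooks. They are copied into g_nvmf_hooks and consulted
                     :  * by the transport when creating RDMA resources.
                     :  */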
    5023             : void
    5024           0 : spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks)
    5025             : {
    5026           0 :         g_nvmf_hooks = *hooks;
    5027           0 : }
    5028             : 
    5029             : static void
    5030           0 : nvmf_rdma_request_set_abort_status(struct spdk_nvmf_request *req,
    5031             :                                    struct spdk_nvmf_rdma_request *rdma_req_to_abort,
    5032             :                                    struct spdk_nvmf_rdma_qpair *rqpair)
    5033             : {
    5034           0 :         rdma_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
    5035           0 :         rdma_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
    5036             : 
    5037           0 :         STAILQ_INSERT_TAIL(&rqpair->pending_rdma_send_queue, rdma_req_to_abort, state_link);
    5038           0 :         rdma_req_to_abort->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING;
    5039             : 
    5040           0 :         req->rsp->nvme_cpl.cdw0 &= ~1U;       /* Command was successfully aborted. */
    5041           0 : }
    5042             : 
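                     : /* Poller callback that aborts a request based on its current state. While the request
                     :  * is still transferring data from the host, it re-registers itself until the transfer
                     :  * finishes or the abort timeout expires.
                     :  */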
    5043             : static int
    5044           0 : _nvmf_rdma_qpair_abort_request(void *ctx)
    5045             : {
    5046           0 :         struct spdk_nvmf_request *req = ctx;
    5047           0 :         struct spdk_nvmf_rdma_request *rdma_req_to_abort = SPDK_CONTAINEROF(
    5048             :                                 req->req_to_abort, struct spdk_nvmf_rdma_request, req);
    5049           0 :         struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair,
    5050             :                                               struct spdk_nvmf_rdma_qpair, qpair);
    5051           0 :         int rc;
    5052             : 
    5053           0 :         spdk_poller_unregister(&req->poller);
    5054             : 
    5055           0 :         switch (rdma_req_to_abort->state) {
    5056             :         case RDMA_REQUEST_STATE_EXECUTING:
    5057           0 :                 rc = nvmf_ctrlr_abort_request(req);
    5058           0 :                 if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) {
    5059           0 :                         return SPDK_POLLER_BUSY;
    5060             :                 }
    5061           0 :                 break;
    5062             : 
    5063             :         case RDMA_REQUEST_STATE_NEED_BUFFER:
    5064           0 :                 STAILQ_REMOVE(&rqpair->poller->group->group.pending_buf_queue,
    5065             :                               &rdma_req_to_abort->req, spdk_nvmf_request, buf_link);
    5066             : 
    5067           0 :                 nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
    5068           0 :                 break;
    5069             : 
    5070             :         case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
    5071           0 :                 STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req_to_abort,
    5072             :                               spdk_nvmf_rdma_request, state_link);
    5073             : 
    5074           0 :                 nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
    5075           0 :                 break;
    5076             : 
    5077             :         case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
    5078           0 :                 STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req_to_abort,
    5079             :                               spdk_nvmf_rdma_request, state_link);
    5080             : 
    5081           0 :                 nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
    5082           0 :                 break;
    5083             : 
    5084             :         case RDMA_REQUEST_STATE_READY_TO_COMPLETE_PENDING:
    5085             :                 /* Remove req from the list here to re-use common function */
    5086           0 :                 STAILQ_REMOVE(&rqpair->pending_rdma_send_queue, rdma_req_to_abort,
    5087             :                               spdk_nvmf_rdma_request, state_link);
    5088             : 
    5089           0 :                 nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort, rqpair);
    5090           0 :                 break;
    5091             : 
    5092             :         case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
    5093           0 :                 if (spdk_get_ticks() < req->timeout_tsc) {
    5094           0 :                         req->poller = SPDK_POLLER_REGISTER(_nvmf_rdma_qpair_abort_request, req, 0);
    5095           0 :                         return SPDK_POLLER_BUSY;
    5096             :                 }
    5097           0 :                 break;
    5098             : 
    5099             :         default:
    5100           0 :                 break;
    5101             :         }
    5102             : 
    5103           0 :         spdk_nvmf_request_complete(req);
    5104           0 :         return SPDK_POLLER_BUSY;
    5105           0 : }
    5106             : 
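                     : /* Handle an NVMe Abort command: scan the qpair's request pool for a request matching
                     :  * the CID given in CDW10 and, if one is found, start the state-dependent abort path
                     :  * with a timeout of abort_timeout_sec.
                     :  */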
    5107             : static void
    5108           0 : nvmf_rdma_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
    5109             :                               struct spdk_nvmf_request *req)
    5110             : {
    5111           0 :         struct spdk_nvmf_rdma_qpair *rqpair;
    5112           0 :         struct spdk_nvmf_rdma_transport *rtransport;
    5113           0 :         struct spdk_nvmf_transport *transport;
    5114           0 :         uint16_t cid;
    5115           0 :         uint32_t i, max_req_count;
    5116           0 :         struct spdk_nvmf_rdma_request *rdma_req_to_abort = NULL, *rdma_req;
    5117             : 
    5118           0 :         rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
    5119           0 :         rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
    5120           0 :         transport = &rtransport->transport;
    5121             : 
    5122           0 :         cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
    5123           0 :         max_req_count = rqpair->srq == NULL ? rqpair->max_queue_depth : rqpair->poller->max_srq_depth;
    5124             : 
    5125           0 :         for (i = 0; i < max_req_count; i++) {
    5126           0 :                 rdma_req = &rqpair->resources->reqs[i];
    5127             :                 /* When SRQ == NULL, the rqpair has its own requests and the req.qpair pointer always points to this qpair.
    5128             :                  * When SRQ != NULL, all rqpairs share a common request pool and the qpair pointer is assigned when we start
    5129             :                  * to process a request. So in both cases, all requests that are not in the FREE state have a valid qpair pointer. */
    5130           0 :                 if (rdma_req->state != RDMA_REQUEST_STATE_FREE && rdma_req->req.cmd->nvme_cmd.cid == cid &&
    5131           0 :                     rdma_req->req.qpair == qpair) {
    5132           0 :                         rdma_req_to_abort = rdma_req;
    5133           0 :                         break;
    5134             :                 }
    5135           0 :         }
    5136             : 
    5137           0 :         if (rdma_req_to_abort == NULL) {
    5138           0 :                 spdk_nvmf_request_complete(req);
    5139           0 :                 return;
    5140             :         }
    5141             : 
    5142           0 :         req->req_to_abort = &rdma_req_to_abort->req;
    5143           0 :         req->timeout_tsc = spdk_get_ticks() +
    5144           0 :                            transport->opts.abort_timeout_sec * spdk_get_ticks_hz();
    5145           0 :         req->poller = NULL;
    5146             : 
    5147           0 :         _nvmf_rdma_qpair_abort_request(req);
    5148           0 : }
    5149             : 
    5150             : static void
    5151           0 : nvmf_rdma_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
    5152             :                                struct spdk_json_write_ctx *w)
    5153             : {
    5154           0 :         struct spdk_nvmf_rdma_poll_group *rgroup;
    5155           0 :         struct spdk_nvmf_rdma_poller *rpoller;
    5156             : 
    5157           0 :         assert(w != NULL);
    5158             : 
    5159           0 :         rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
    5160             : 
    5161           0 :         spdk_json_write_named_uint64(w, "pending_data_buffer", rgroup->stat.pending_data_buffer);
    5162             : 
    5163           0 :         spdk_json_write_named_array_begin(w, "devices");
    5164             : 
    5165           0 :         TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
    5166           0 :                 spdk_json_write_object_begin(w);
    5167           0 :                 spdk_json_write_named_string(w, "name",
    5168           0 :                                              ibv_get_device_name(rpoller->device->context->device));
    5169           0 :                 spdk_json_write_named_uint64(w, "polls",
    5170           0 :                                              rpoller->stat.polls);
    5171           0 :                 spdk_json_write_named_uint64(w, "idle_polls",
    5172           0 :                                              rpoller->stat.idle_polls);
    5173           0 :                 spdk_json_write_named_uint64(w, "completions",
    5174           0 :                                              rpoller->stat.completions);
    5175           0 :                 spdk_json_write_named_uint64(w, "requests",
    5176           0 :                                              rpoller->stat.requests);
    5177           0 :                 spdk_json_write_named_uint64(w, "request_latency",
    5178           0 :                                              rpoller->stat.request_latency);
    5179           0 :                 spdk_json_write_named_uint64(w, "pending_free_request",
    5180           0 :                                              rpoller->stat.pending_free_request);
    5181           0 :                 spdk_json_write_named_uint64(w, "pending_rdma_read",
    5182           0 :                                              rpoller->stat.pending_rdma_read);
    5183           0 :                 spdk_json_write_named_uint64(w, "pending_rdma_write",
    5184           0 :                                              rpoller->stat.pending_rdma_write);
    5185           0 :                 spdk_json_write_named_uint64(w, "pending_rdma_send",
    5186           0 :                                              rpoller->stat.pending_rdma_send);
    5187           0 :                 spdk_json_write_named_uint64(w, "total_send_wrs",
    5188           0 :                                              rpoller->stat.qp_stats.send.num_submitted_wrs);
    5189           0 :                 spdk_json_write_named_uint64(w, "send_doorbell_updates",
    5190           0 :                                              rpoller->stat.qp_stats.send.doorbell_updates);
    5191           0 :                 spdk_json_write_named_uint64(w, "total_recv_wrs",
    5192           0 :                                              rpoller->stat.qp_stats.recv.num_submitted_wrs);
    5193           0 :                 spdk_json_write_named_uint64(w, "recv_doorbell_updates",
    5194           0 :                                              rpoller->stat.qp_stats.recv.doorbell_updates);
    5195           0 :                 spdk_json_write_object_end(w);
    5196           0 :         }
    5197             : 
    5198           0 :         spdk_json_write_array_end(w);
    5199           0 : }
    5200             : 
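nvmf_rdma_poll_group_dump_stat() writes only named members; the enclosing JSON object is opened and closed by the caller, and one object is emitted per RDMA poller in the "devices" array. Purely as an illustration of the resulting shape (field names come from the calls above; the device name and all numeric values are invented), the fragment might look like:

"pending_data_buffer": 0,
"devices": [
  {
    "name": "mlx5_0",
    "polls": 104857,
    "idle_polls": 98304,
    "completions": 6553,
    "requests": 6553,
    "request_latency": 1310720,
    "pending_free_request": 0,
    "pending_rdma_read": 12,
    "pending_rdma_write": 0,
    "pending_rdma_send": 0,
    "total_send_wrs": 13106,
    "send_doorbell_updates": 6553,
    "total_recv_wrs": 6553,
    "recv_doorbell_updates": 6553
  }
]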
    5201             : const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = {
    5202             :         .name = "RDMA",
    5203             :         .type = SPDK_NVME_TRANSPORT_RDMA,
    5204             :         .opts_init = nvmf_rdma_opts_init,
    5205             :         .create = nvmf_rdma_create,
    5206             :         .dump_opts = nvmf_rdma_dump_opts,
    5207             :         .destroy = nvmf_rdma_destroy,
    5208             : 
    5209             :         .listen = nvmf_rdma_listen,
    5210             :         .stop_listen = nvmf_rdma_stop_listen,
    5211             :         .cdata_init = nvmf_rdma_cdata_init,
    5212             : 
    5213             :         .listener_discover = nvmf_rdma_discover,
    5214             : 
    5215             :         .poll_group_create = nvmf_rdma_poll_group_create,
    5216             :         .get_optimal_poll_group = nvmf_rdma_get_optimal_poll_group,
    5217             :         .poll_group_destroy = nvmf_rdma_poll_group_destroy,
    5218             :         .poll_group_add = nvmf_rdma_poll_group_add,
    5219             :         .poll_group_remove = nvmf_rdma_poll_group_remove,
    5220             :         .poll_group_poll = nvmf_rdma_poll_group_poll,
    5221             : 
    5222             :         .req_free = nvmf_rdma_request_free,
    5223             :         .req_complete = nvmf_rdma_request_complete,
    5224             : 
    5225             :         .qpair_fini = nvmf_rdma_close_qpair,
    5226             :         .qpair_get_peer_trid = nvmf_rdma_qpair_get_peer_trid,
    5227             :         .qpair_get_local_trid = nvmf_rdma_qpair_get_local_trid,
    5228             :         .qpair_get_listen_trid = nvmf_rdma_qpair_get_listen_trid,
    5229             :         .qpair_abort_request = nvmf_rdma_qpair_abort_request,
    5230             : 
    5231             :         .poll_group_dump_stat = nvmf_rdma_poll_group_dump_stat,
    5232             : };
    5233             : 
    5234           2 : SPDK_NVMF_TRANSPORT_REGISTER(rdma, &spdk_nvmf_transport_rdma);
    5235           2 : SPDK_LOG_REGISTER_COMPONENT(rdma)
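The const ops table together with SPDK_NVMF_TRANSPORT_REGISTER() is the whole hook a transport needs in order to be selectable by the nvmf target; SPDK_LOG_REGISTER_COMPONENT() simply adds the "rdma" log flag. As a rough sketch of the same registration pattern for a hypothetical out-of-tree transport (every my_* name is invented, only two callbacks are stubbed out, and the callback signatures are assumed to match this SPDK revision's spdk/nvmf_transport.h):

#include "spdk/nvmf_transport.h"

/* Every my_* symbol below is an invented placeholder, not part of SPDK. */
static struct spdk_nvmf_transport *
my_transport_create(struct spdk_nvmf_transport_opts *opts)
{
        /* Allocate a structure that embeds struct spdk_nvmf_transport,
         * save the opts, and return a pointer to the embedded member. */
        (void)opts;
        return NULL;
}

static int
my_transport_destroy(struct spdk_nvmf_transport *transport,
                     spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
{
        /* Free whatever my_transport_create() allocated, then signal completion. */
        (void)transport;
        if (cb_fn) {
                cb_fn(cb_arg);
        }
        return 0;
}

static const struct spdk_nvmf_transport_ops my_transport_ops = {
        .name = "MYTRANSPORT",
        .type = SPDK_NVME_TRANSPORT_CUSTOM,
        .create = my_transport_create,
        .destroy = my_transport_destroy,
        /* The remaining callbacks (listen, poll_group_*, req_*, qpair_*, ...)
         * would be filled in just like the RDMA table above. */
};

SPDK_NVMF_TRANSPORT_REGISTER(mytransport, &my_transport_ops);

A real transport would populate the remaining callbacks exactly as the RDMA table above does before the target could route I/O through it.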

Generated by: LCOV version 1.15