LCOV - code coverage report
Current view: top level - module/sock/uring - uring.c
Test: ut_cov_unit.info        Lines:     68 / 1151   (5.9 %)
Date: 2024-12-16 09:12:13     Functions:  5 / 48     (10.4 %)

          Line data    Source code
       1             : /*   SPDX-License-Identifier: BSD-3-Clause
       2             :  *   Copyright (C) 2019 Intel Corporation.
       3             :  *   All rights reserved.
       4             :  */
       5             : 
       6             : #include "spdk/stdinc.h"
       7             : #include "spdk/config.h"
       8             : 
       9             : #include <linux/errqueue.h>
      10             : #include <sys/epoll.h>
      11             : #include <liburing.h>
      12             : 
      13             : #include "spdk/barrier.h"
      14             : #include "spdk/env.h"
      15             : #include "spdk/log.h"
      16             : #include "spdk/pipe.h"
      17             : #include "spdk/sock.h"
      18             : #include "spdk/string.h"
      19             : #include "spdk/util.h"
      20             : 
      21             : #include "spdk_internal/sock.h"
      22             : #include "spdk_internal/assert.h"
      23             : #include "../sock_kernel.h"
      24             : 
      25             : #define MAX_TMPBUF 1024
      26             : #define PORTNUMLEN 32
      27             : #define SPDK_SOCK_GROUP_QUEUE_DEPTH 4096
      28             : #define SPDK_SOCK_CMG_INFO_SIZE (sizeof(struct cmsghdr) + sizeof(struct sock_extended_err))
      29             : 
      30             : enum uring_task_type {
      31             :         URING_TASK_READ = 0,
      32             :         URING_TASK_ERRQUEUE,
      33             :         URING_TASK_WRITE,
      34             :         URING_TASK_CANCEL,
      35             : };
      36             : 
      37             : #if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY)
      38             : #define SPDK_ZEROCOPY
      39             : #endif
      40             : 
      41             : /* We don't know how big the buffers that the user posts will be, but this
      42             :  * is the maximum we'll ever allow it to receive in a single command.
      43             :  * If the user buffers are smaller, it will just receive less. */
      44             : #define URING_MAX_RECV_SIZE (128 * 1024)
      45             : 
      46             : /* We don't know how many buffers the user will post, but this is the
      47             :  * maximum number we'll take from the pool to post per group. */
      48             : #define URING_BUF_POOL_SIZE 128
      49             : 
      50             : /* We use 1 just so it's not zero and we can validate it's right. */
      51             : #define URING_BUF_GROUP_ID 1
      52             : 
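/*
 * Illustrative sketch, not part of this module's coverage run: the three
 * defines above correspond to liburing's provided-buffer ("buffer ring")
 * feature. A receive group registers a single ring under URING_BUF_GROUP_ID
 * and posts up to URING_BUF_POOL_SIZE user buffers to it; URING_MAX_RECV_SIZE
 * only caps how much a single recv may consume. With a hypothetical io_uring
 * 'ring', user buffer 'buf'/'buflen' and buffer id 'bid' (names assumed here,
 * not from this file), the liburing calls look roughly like:
 *
 *      int rc;
 *      struct io_uring_buf_ring *br;
 *
 *      br = io_uring_setup_buf_ring(ring, URING_BUF_POOL_SIZE,
 *                                   URING_BUF_GROUP_ID, 0, &rc);
 *      io_uring_buf_ring_add(br, buf, buflen, bid,
 *                            io_uring_buf_ring_mask(URING_BUF_POOL_SIZE), 0);
 *      io_uring_buf_ring_advance(br, 1);
 *
 * Receive SQEs submitted with IOSQE_BUFFER_SELECT and buf_group set to
 * URING_BUF_GROUP_ID then pick their data buffer from this pool.
 */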
      53             : enum spdk_uring_sock_task_status {
      54             :         SPDK_URING_SOCK_TASK_NOT_IN_USE = 0,
      55             :         SPDK_URING_SOCK_TASK_IN_PROCESS,
      56             : };
      57             : 
      58             : struct spdk_uring_task {
      59             :         enum spdk_uring_sock_task_status        status;
      60             :         enum uring_task_type            type;
      61             :         struct spdk_uring_sock                  *sock;
      62             :         struct msghdr                           msg;
      63             :         struct iovec                            iovs[IOV_BATCH_SIZE];
      64             :         int                                     iov_cnt;
      65             :         struct spdk_sock_request                *last_req;
      66             :         bool                                    is_zcopy;
      67             :         STAILQ_ENTRY(spdk_uring_task)           link;
      68             : };
      69             : 
      70             : struct spdk_uring_sock {
      71             :         struct spdk_sock                        base;
      72             :         int                                     fd;
      73             :         uint32_t                                sendmsg_idx;
      74             :         struct spdk_uring_sock_group_impl       *group;
      75             :         STAILQ_HEAD(, spdk_uring_buf_tracker)   recv_stream;
      76             :         size_t                                  recv_offset;
      77             :         struct spdk_uring_task                  write_task;
      78             :         struct spdk_uring_task                  errqueue_task;
      79             :         struct spdk_uring_task                  read_task;
      80             :         struct spdk_uring_task                  cancel_task;
      81             :         struct spdk_pipe                        *recv_pipe;
      82             :         void                                    *recv_buf;
      83             :         int                                     recv_buf_sz;
      84             :         bool                                    zcopy;
      85             :         bool                                    pending_recv;
      86             :         bool                                    pending_group_remove;
      87             :         int                                     zcopy_send_flags;
      88             :         int                                     connection_status;
      89             :         int                                     placement_id;
      90             :         uint8_t                                 buf[SPDK_SOCK_CMG_INFO_SIZE];
      91             :         TAILQ_ENTRY(spdk_uring_sock)            link;
      92             : };
      93             : 
      94             : TAILQ_HEAD(pending_recv_list, spdk_uring_sock);
      95             : 
      96             : struct spdk_uring_buf_tracker {
      97             :         void                                    *buf;
      98             :         size_t                                  buflen;
      99             :         size_t                                  len;
     100             :         void                                    *ctx;
     101             :         int                                     id;
     102             :         STAILQ_ENTRY(spdk_uring_buf_tracker)    link;
     103             : };
     104             : 
     105             : struct spdk_uring_sock_group_impl {
     106             :         struct spdk_sock_group_impl             base;
     107             :         struct io_uring                         uring;
     108             :         uint32_t                                io_inflight;
     109             :         uint32_t                                io_queued;
     110             :         uint32_t                                io_avail;
     111             :         struct pending_recv_list                pending_recv;
     112             : 
     113             :         struct io_uring_buf_ring                *buf_ring;
     114             :         uint32_t                                buf_ring_count;
     115             :         struct spdk_uring_buf_tracker           *trackers;
     116             :         STAILQ_HEAD(, spdk_uring_buf_tracker)   free_trackers;
     117             : };
     118             : 
     119             : static struct spdk_sock_impl_opts g_spdk_uring_sock_impl_opts = {
     120             :         .recv_buf_size = DEFAULT_SO_RCVBUF_SIZE,
     121             :         .send_buf_size = DEFAULT_SO_SNDBUF_SIZE,
     122             :         .enable_recv_pipe = true,
     123             :         .enable_quickack = false,
     124             :         .enable_placement_id = PLACEMENT_NONE,
     125             :         .enable_zerocopy_send_server = false,
     126             :         .enable_zerocopy_send_client = false,
     127             :         .zerocopy_threshold = 0,
     128             :         .tls_version = 0,
     129             :         .enable_ktls = false,
     130             :         .psk_key = NULL,
     131             :         .psk_identity = NULL
     132             : };
     133             : 
     134             : static struct spdk_sock_map g_map = {
     135             :         .entries = STAILQ_HEAD_INITIALIZER(g_map.entries),
     136             :         .mtx = PTHREAD_MUTEX_INITIALIZER
     137             : };
     138             : 
     139             : __attribute((destructor)) static void
     140           1 : uring_sock_map_cleanup(void)
     141             : {
     142           1 :         spdk_sock_map_cleanup(&g_map);
     143           1 : }
     144             : 
     145             : #define SPDK_URING_SOCK_REQUEST_IOV(req) ((struct iovec *)((uint8_t *)req + sizeof(struct spdk_sock_request)))
     146             : 
     147             : #define __uring_sock(sock) (struct spdk_uring_sock *)sock
     148             : #define __uring_group_impl(group) (struct spdk_uring_sock_group_impl *)group
     149             : 
     150             : static void
     151           0 : uring_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src,
     152             :                           size_t len)
     153             : {
     154             : #define FIELD_OK(field) \
     155             :         offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len
     156             : 
     157             : #define SET_FIELD(field) \
     158             :         if (FIELD_OK(field)) { \
     159             :                 dest->field = src->field; \
     160             :         }
     161             : 
     162           0 :         SET_FIELD(recv_buf_size);
     163           0 :         SET_FIELD(send_buf_size);
     164           0 :         SET_FIELD(enable_recv_pipe);
     165           0 :         SET_FIELD(enable_quickack);
     166           0 :         SET_FIELD(enable_placement_id);
     167           0 :         SET_FIELD(enable_zerocopy_send_server);
     168           0 :         SET_FIELD(enable_zerocopy_send_client);
     169           0 :         SET_FIELD(zerocopy_threshold);
     170           0 :         SET_FIELD(tls_version);
     171           0 :         SET_FIELD(enable_ktls);
     172           0 :         SET_FIELD(psk_key);
     173           0 :         SET_FIELD(psk_identity);
     174             : 
     175             : #undef SET_FIELD
     176             : #undef FIELD_OK
     177           0 : }
     178             : 
     179             : static int
     180           0 : uring_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len)
     181             : {
     182           0 :         if (!opts || !len) {
     183           0 :                 errno = EINVAL;
     184           0 :                 return -1;
     185             :         }
     186             : 
     187           0 :         assert(sizeof(*opts) >= *len);
     188           0 :         memset(opts, 0, *len);
     189             : 
     190           0 :         uring_sock_copy_impl_opts(opts, &g_spdk_uring_sock_impl_opts, *len);
     191           0 :         *len = spdk_min(*len, sizeof(g_spdk_uring_sock_impl_opts));
     192             : 
     193           0 :         return 0;
     194           0 : }
     195             : 
     196             : static int
     197           0 : uring_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len)
     198             : {
     199           0 :         if (!opts) {
     200           0 :                 errno = EINVAL;
     201           0 :                 return -1;
     202             :         }
     203             : 
     204           0 :         assert(sizeof(*opts) >= len);
     205           0 :         uring_sock_copy_impl_opts(&g_spdk_uring_sock_impl_opts, opts, len);
     206             : 
     207           0 :         return 0;
     208           0 : }
     209             : 
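/*
 * Illustrative sketch, not part of this module's coverage run: the
 * FIELD_OK()/SET_FIELD() copy above is what makes the get/set entry points
 * safe for callers built against an older, smaller spdk_sock_impl_opts. A
 * caller typically goes through the generic spdk_sock_impl_get_opts()/
 * spdk_sock_impl_set_opts() API with its own sizeof(), for example:
 *
 *      struct spdk_sock_impl_opts opts = {};
 *      size_t len = sizeof(opts);
 *
 *      if (spdk_sock_impl_get_opts("uring", &opts, &len) == 0) {
 *              opts.enable_recv_pipe = false;
 *              spdk_sock_impl_set_opts("uring", &opts, len);
 *      }
 *
 * Only the fields that fit within 'len' are copied in either direction.
 */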
     210             : static void
     211           0 : uring_opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest)
     212             : {
     213             :         /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */
     214           0 :         memcpy(dest, &g_spdk_uring_sock_impl_opts, sizeof(*dest));
     215             : 
     216           0 :         if (opts->impl_opts != NULL) {
     217           0 :                 assert(sizeof(*dest) >= opts->impl_opts_size);
     218           0 :                 uring_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size);
     219           0 :         }
     220           0 : }
     221             : 
     222             : static int
     223           0 : uring_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport,
     224             :                    char *caddr, int clen, uint16_t *cport)
     225             : {
     226           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
     227           0 :         struct sockaddr_storage sa;
     228           0 :         socklen_t salen;
     229           0 :         int rc;
     230             : 
     231           0 :         assert(sock != NULL);
     232             : 
     233           0 :         memset(&sa, 0, sizeof sa);
     234           0 :         salen = sizeof sa;
     235           0 :         rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
     236           0 :         if (rc != 0) {
     237           0 :                 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
     238           0 :                 return -1;
     239             :         }
     240             : 
     241           0 :         switch (sa.ss_family) {
     242             :         case AF_UNIX:
     243             :                 /* Acceptable connection types that don't have IPs */
     244           0 :                 return 0;
     245             :         case AF_INET:
     246             :         case AF_INET6:
     247             :                 /* Code below will get IP addresses */
     248           0 :                 break;
     249             :         default:
     250             :                 /* Unsupported socket family */
     251           0 :                 return -1;
     252             :         }
     253             : 
     254           0 :         rc = get_addr_str((struct sockaddr *)&sa, saddr, slen);
     255           0 :         if (rc != 0) {
     256           0 :                 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno);
     257           0 :                 return -1;
     258             :         }
     259             : 
     260           0 :         if (sport) {
     261           0 :                 if (sa.ss_family == AF_INET) {
     262           0 :                         *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
     263           0 :                 } else if (sa.ss_family == AF_INET6) {
     264           0 :                         *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
     265           0 :                 }
     266           0 :         }
     267             : 
     268           0 :         memset(&sa, 0, sizeof sa);
     269           0 :         salen = sizeof sa;
     270           0 :         rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen);
     271           0 :         if (rc != 0) {
     272           0 :                 SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno);
     273           0 :                 return -1;
     274             :         }
     275             : 
     276           0 :         rc = get_addr_str((struct sockaddr *)&sa, caddr, clen);
     277           0 :         if (rc != 0) {
     278           0 :                 SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno);
     279           0 :                 return -1;
     280             :         }
     281             : 
     282           0 :         if (cport) {
     283           0 :                 if (sa.ss_family == AF_INET) {
     284           0 :                         *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
     285           0 :                 } else if (sa.ss_family == AF_INET6) {
     286           0 :                         *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
     287           0 :                 }
     288           0 :         }
     289             : 
     290           0 :         return 0;
     291           0 : }
     292             : 
     293             : enum uring_sock_create_type {
     294             :         SPDK_SOCK_CREATE_LISTEN,
     295             :         SPDK_SOCK_CREATE_CONNECT,
     296             : };
     297             : 
     298             : static int
     299           0 : uring_sock_alloc_pipe(struct spdk_uring_sock *sock, int sz)
     300             : {
     301           0 :         uint8_t *new_buf;
     302           0 :         struct spdk_pipe *new_pipe;
     303           0 :         struct iovec siov[2];
     304           0 :         struct iovec diov[2];
     305           0 :         int sbytes;
     306           0 :         ssize_t bytes;
     307           0 :         int rc;
     308             : 
     309           0 :         if (sock->recv_buf_sz == sz) {
     310           0 :                 return 0;
     311             :         }
     312             : 
     313             :         /* If the new size is 0, just free the pipe */
     314           0 :         if (sz == 0) {
     315           0 :                 spdk_pipe_destroy(sock->recv_pipe);
     316           0 :                 free(sock->recv_buf);
     317           0 :                 sock->recv_pipe = NULL;
     318           0 :                 sock->recv_buf = NULL;
     319           0 :                 return 0;
     320           0 :         } else if (sz < MIN_SOCK_PIPE_SIZE) {
     321           0 :                 SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE);
     322           0 :                 return -1;
     323             :         }
     324             : 
      325             :         /* Allocate the buffer aligned to a 64 byte boundary */
     326           0 :         rc = posix_memalign((void **)&new_buf, 64, sz);
     327           0 :         if (rc != 0) {
     328           0 :                 SPDK_ERRLOG("socket recv buf allocation failed\n");
     329           0 :                 return -ENOMEM;
     330             :         }
     331           0 :         memset(new_buf, 0, sz);
     332             : 
     333           0 :         new_pipe = spdk_pipe_create(new_buf, sz);
     334           0 :         if (new_pipe == NULL) {
     335           0 :                 SPDK_ERRLOG("socket pipe allocation failed\n");
     336           0 :                 free(new_buf);
     337           0 :                 return -ENOMEM;
     338             :         }
     339             : 
     340           0 :         if (sock->recv_pipe != NULL) {
     341             :                 /* Pull all of the data out of the old pipe */
     342           0 :                 sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
     343           0 :                 if (sbytes > sz) {
     344             :                         /* Too much data to fit into the new pipe size */
     345           0 :                         spdk_pipe_destroy(new_pipe);
     346           0 :                         free(new_buf);
     347           0 :                         return -EINVAL;
     348             :                 }
     349             : 
     350           0 :                 sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov);
     351           0 :                 assert(sbytes == sz);
     352             : 
     353           0 :                 bytes = spdk_iovcpy(siov, 2, diov, 2);
     354           0 :                 spdk_pipe_writer_advance(new_pipe, bytes);
     355             : 
     356           0 :                 spdk_pipe_destroy(sock->recv_pipe);
     357           0 :                 free(sock->recv_buf);
     358           0 :         }
     359             : 
     360           0 :         sock->recv_buf_sz = sz;
     361           0 :         sock->recv_buf = new_buf;
     362           0 :         sock->recv_pipe = new_pipe;
     363             : 
     364           0 :         return 0;
     365           0 : }
     366             : 
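/*
 * Illustrative sketch, not part of this module's coverage run: the recv pipe
 * allocated above is a plain spdk_pipe over the 64-byte-aligned buffer, and
 * the read path further below drives it in two stages. Roughly (with 'pipe',
 * 'fd', 'sz' and 'copied' standing in for sock->recv_pipe, sock->fd,
 * sock->recv_buf_sz and the copied byte count):
 *
 *      struct iovec iov[2];
 *      ssize_t bytes;
 *
 *      // Stage 1: fill the pipe from the socket.
 *      spdk_pipe_writer_get_buffer(pipe, sz, iov);   // free space as 1-2 iovs
 *      bytes = readv(fd, iov, 2);
 *      if (bytes > 0) {
 *              spdk_pipe_writer_advance(pipe, bytes);
 *      }
 *
 *      // Stage 2: drain the pipe into the caller's buffers.
 *      spdk_pipe_reader_get_buffer(pipe, sz, iov);   // pending data as iovs
 *      // ... spdk_iovcpy() into the caller's iovecs, yielding 'copied' ...
 *      spdk_pipe_reader_advance(pipe, copied);
 */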
     367             : static int
     368           0 : uring_sock_set_recvbuf(struct spdk_sock *_sock, int sz)
     369             : {
     370           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
     371           0 :         int min_size;
     372           0 :         int rc;
     373             : 
     374           0 :         assert(sock != NULL);
     375             : 
     376           0 :         if (_sock->impl_opts.enable_recv_pipe) {
     377           0 :                 rc = uring_sock_alloc_pipe(sock, sz);
     378           0 :                 if (rc) {
     379           0 :                         SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock);
     380           0 :                         return rc;
     381             :                 }
     382           0 :         }
     383             : 
     384             :         /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and
     385             :          * g_spdk_uring_sock_impl_opts.recv_buf_size. */
     386           0 :         min_size = spdk_max(MIN_SO_RCVBUF_SIZE, g_spdk_uring_sock_impl_opts.recv_buf_size);
     387             : 
     388           0 :         if (sz < min_size) {
     389           0 :                 sz = min_size;
     390           0 :         }
     391             : 
     392           0 :         rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
     393           0 :         if (rc < 0) {
     394           0 :                 return rc;
     395             :         }
     396             : 
     397           0 :         _sock->impl_opts.recv_buf_size = sz;
     398             : 
     399           0 :         return 0;
     400           0 : }
     401             : 
     402             : static int
     403           0 : uring_sock_set_sendbuf(struct spdk_sock *_sock, int sz)
     404             : {
     405           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
     406           0 :         int min_size;
     407           0 :         int rc;
     408             : 
     409           0 :         assert(sock != NULL);
     410             : 
     411             :         /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and
      412             :          * g_spdk_uring_sock_impl_opts.send_buf_size. */
     413           0 :         min_size = spdk_max(MIN_SO_SNDBUF_SIZE, g_spdk_uring_sock_impl_opts.send_buf_size);
     414             : 
     415           0 :         if (sz < min_size) {
     416           0 :                 sz = min_size;
     417           0 :         }
     418             : 
     419           0 :         rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
     420           0 :         if (rc < 0) {
     421           0 :                 return rc;
     422             :         }
     423             : 
     424           0 :         _sock->impl_opts.send_buf_size = sz;
     425             : 
     426           0 :         return 0;
     427           0 : }
     428             : 
     429             : static struct spdk_uring_sock *
     430           0 : uring_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy)
     431             : {
     432           0 :         struct spdk_uring_sock *sock;
     433             : #if defined(__linux__)
     434           0 :         int flag;
     435           0 :         int rc;
     436             : #endif
     437             : 
     438           0 :         sock = calloc(1, sizeof(*sock));
     439           0 :         if (sock == NULL) {
     440           0 :                 SPDK_ERRLOG("sock allocation failed\n");
     441           0 :                 return NULL;
     442             :         }
     443             : 
     444           0 :         sock->fd = fd;
     445           0 :         memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts));
     446             : 
     447           0 :         STAILQ_INIT(&sock->recv_stream);
     448             : 
     449             : #if defined(__linux__)
     450           0 :         flag = 1;
     451             : 
     452           0 :         if (sock->base.impl_opts.enable_quickack) {
     453           0 :                 rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag));
     454           0 :                 if (rc != 0) {
      455           0 :                         SPDK_ERRLOG("failed to set quickack\n");
     456           0 :                 }
     457           0 :         }
     458             : 
     459           0 :         spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id,
     460           0 :                                    &sock->placement_id);
     461             : #ifdef SPDK_ZEROCOPY
     462             :         /* Try to turn on zero copy sends */
     463           0 :         flag = 1;
     464             : 
     465           0 :         if (enable_zero_copy) {
     466           0 :                 rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag));
     467           0 :                 if (rc == 0) {
     468           0 :                         sock->zcopy = true;
     469           0 :                         sock->zcopy_send_flags = MSG_ZEROCOPY;
     470           0 :                 }
     471           0 :         }
     472             : #endif
     473             : #endif
     474             : 
     475           0 :         return sock;
     476           0 : }
     477             : 
     478             : static struct spdk_sock *
     479           0 : uring_sock_create(const char *ip, int port,
     480             :                   enum uring_sock_create_type type,
     481             :                   struct spdk_sock_opts *opts)
     482             : {
     483           0 :         struct spdk_uring_sock *sock;
     484           0 :         struct spdk_sock_impl_opts impl_opts;
     485           0 :         char buf[MAX_TMPBUF];
     486           0 :         char portnum[PORTNUMLEN];
     487           0 :         char *p;
     488           0 :         struct addrinfo hints, *res, *res0;
     489           0 :         int fd, flag;
     490           0 :         int val = 1;
     491           0 :         int rc;
     492           0 :         bool enable_zcopy_impl_opts = false;
     493           0 :         bool enable_zcopy_user_opts = true;
     494             : 
     495           0 :         assert(opts != NULL);
     496           0 :         uring_opts_get_impl_opts(opts, &impl_opts);
     497             : 
     498           0 :         if (ip == NULL) {
     499           0 :                 return NULL;
     500             :         }
     501           0 :         if (ip[0] == '[') {
     502           0 :                 snprintf(buf, sizeof(buf), "%s", ip + 1);
     503           0 :                 p = strchr(buf, ']');
     504           0 :                 if (p != NULL) {
     505           0 :                         *p = '\0';
     506           0 :                 }
     507           0 :                 ip = (const char *) &buf[0];
     508           0 :         }
     509             : 
     510           0 :         snprintf(portnum, sizeof portnum, "%d", port);
     511           0 :         memset(&hints, 0, sizeof hints);
     512           0 :         hints.ai_family = PF_UNSPEC;
     513           0 :         hints.ai_socktype = SOCK_STREAM;
     514           0 :         hints.ai_flags = AI_NUMERICSERV;
     515           0 :         hints.ai_flags |= AI_PASSIVE;
     516           0 :         hints.ai_flags |= AI_NUMERICHOST;
     517           0 :         rc = getaddrinfo(ip, portnum, &hints, &res0);
     518           0 :         if (rc != 0) {
     519           0 :                 SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc);
     520           0 :                 return NULL;
     521             :         }
     522             : 
     523             :         /* try listen */
     524           0 :         fd = -1;
     525           0 :         for (res = res0; res != NULL; res = res->ai_next) {
     526             : retry:
     527           0 :                 fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
     528           0 :                 if (fd < 0) {
     529             :                         /* error */
     530           0 :                         continue;
     531             :                 }
     532             : 
     533           0 :                 val = impl_opts.recv_buf_size;
     534           0 :                 rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof val);
     535           0 :                 if (rc) {
     536             :                         /* Not fatal */
     537           0 :                 }
     538             : 
     539           0 :                 val = impl_opts.send_buf_size;
     540           0 :                 rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof val);
     541           0 :                 if (rc) {
     542             :                         /* Not fatal */
     543           0 :                 }
     544             : 
     545           0 :                 rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val);
     546           0 :                 if (rc != 0) {
     547           0 :                         close(fd);
     548           0 :                         fd = -1;
     549             :                         /* error */
     550           0 :                         continue;
     551             :                 }
     552           0 :                 rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val);
     553           0 :                 if (rc != 0) {
     554           0 :                         close(fd);
     555           0 :                         fd = -1;
     556             :                         /* error */
     557           0 :                         continue;
     558             :                 }
     559             : 
     560           0 :                 if (opts->ack_timeout) {
     561             : #if defined(__linux__)
     562           0 :                         val = opts->ack_timeout;
     563           0 :                         rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &val, sizeof val);
     564           0 :                         if (rc != 0) {
     565           0 :                                 close(fd);
     566           0 :                                 fd = -1;
     567             :                                 /* error */
     568           0 :                                 continue;
     569             :                         }
     570             : #else
     571             :                         SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n");
     572             : #endif
     573           0 :                 }
     574             : 
     575             : 
     576             : 
     577             : #if defined(SO_PRIORITY)
     578           0 :                 if (opts != NULL && opts->priority) {
     579           0 :                         rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val);
     580           0 :                         if (rc != 0) {
     581           0 :                                 close(fd);
     582           0 :                                 fd = -1;
     583             :                                 /* error */
     584           0 :                                 continue;
     585             :                         }
     586           0 :                 }
     587             : #endif
     588           0 :                 if (res->ai_family == AF_INET6) {
     589           0 :                         rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val);
     590           0 :                         if (rc != 0) {
     591           0 :                                 close(fd);
     592           0 :                                 fd = -1;
     593             :                                 /* error */
     594           0 :                                 continue;
     595             :                         }
     596           0 :                 }
     597             : 
     598           0 :                 if (type == SPDK_SOCK_CREATE_LISTEN) {
     599           0 :                         rc = bind(fd, res->ai_addr, res->ai_addrlen);
     600           0 :                         if (rc != 0) {
     601           0 :                                 SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno);
     602           0 :                                 switch (errno) {
     603             :                                 case EINTR:
     604             :                                         /* interrupted? */
     605           0 :                                         close(fd);
     606           0 :                                         goto retry;
     607             :                                 case EADDRNOTAVAIL:
     608           0 :                                         SPDK_ERRLOG("IP address %s not available. "
     609             :                                                     "Verify IP address in config file "
     610             :                                                     "and make sure setup script is "
     611             :                                                     "run before starting spdk app.\n", ip);
     612             :                                 /* FALLTHROUGH */
     613             :                                 default:
     614             :                                         /* try next family */
     615           0 :                                         close(fd);
     616           0 :                                         fd = -1;
     617           0 :                                         continue;
     618             :                                 }
     619             :                         }
     620             :                         /* bind OK */
     621           0 :                         rc = listen(fd, 512);
     622           0 :                         if (rc != 0) {
     623           0 :                                 SPDK_ERRLOG("listen() failed, errno = %d\n", errno);
     624           0 :                                 close(fd);
     625           0 :                                 fd = -1;
     626           0 :                                 break;
     627             :                         }
     628             : 
     629           0 :                         flag = fcntl(fd, F_GETFL);
     630           0 :                         if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) {
     631           0 :                                 SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
     632           0 :                                 close(fd);
     633           0 :                                 fd = -1;
     634           0 :                                 break;
     635             :                         }
     636             : 
     637           0 :                         enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server;
     638           0 :                 } else if (type == SPDK_SOCK_CREATE_CONNECT) {
     639           0 :                         rc = connect(fd, res->ai_addr, res->ai_addrlen);
     640           0 :                         if (rc != 0) {
     641           0 :                                 SPDK_ERRLOG("connect() failed, errno = %d\n", errno);
     642             :                                 /* try next family */
     643           0 :                                 close(fd);
     644           0 :                                 fd = -1;
     645           0 :                                 continue;
     646             :                         }
     647             : 
     648           0 :                         flag = fcntl(fd, F_GETFL);
     649           0 :                         if (fcntl(fd, F_SETFL, flag & ~O_NONBLOCK) < 0) {
     650           0 :                                 SPDK_ERRLOG("fcntl can't set blocking mode for socket, fd: %d (%d)\n", fd, errno);
     651           0 :                                 close(fd);
     652           0 :                                 fd = -1;
     653           0 :                                 break;
     654             :                         }
     655             : 
     656           0 :                         enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client;
     657           0 :                 }
     658           0 :                 break;
     659             :         }
     660           0 :         freeaddrinfo(res0);
     661             : 
     662           0 :         if (fd < 0) {
     663           0 :                 return NULL;
     664             :         }
     665             : 
     666           0 :         enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd);
     667           0 :         sock = uring_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts);
     668           0 :         if (sock == NULL) {
     669           0 :                 SPDK_ERRLOG("sock allocation failed\n");
     670           0 :                 close(fd);
     671           0 :                 return NULL;
     672             :         }
     673             : 
     674           0 :         return &sock->base;
     675           0 : }
     676             : 
     677             : static struct spdk_sock *
     678           0 : uring_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts)
     679             : {
     680           0 :         return uring_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts);
     681             : }
     682             : 
     683             : static struct spdk_sock *
     684           0 : uring_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts)
     685             : {
     686           0 :         return uring_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts);
     687             : }
     688             : 
     689             : static struct spdk_sock *
     690           0 : uring_sock_accept(struct spdk_sock *_sock)
     691             : {
     692           0 :         struct spdk_uring_sock          *sock = __uring_sock(_sock);
     693           0 :         struct sockaddr_storage         sa;
     694           0 :         socklen_t                       salen;
     695           0 :         int                             rc, fd;
     696           0 :         struct spdk_uring_sock          *new_sock;
     697           0 :         int                             flag;
     698             : 
     699           0 :         memset(&sa, 0, sizeof(sa));
     700           0 :         salen = sizeof(sa);
     701             : 
     702           0 :         assert(sock != NULL);
     703             : 
     704           0 :         rc = accept(sock->fd, (struct sockaddr *)&sa, &salen);
     705             : 
     706           0 :         if (rc == -1) {
     707           0 :                 return NULL;
     708             :         }
     709             : 
     710           0 :         fd = rc;
     711             : 
     712           0 :         flag = fcntl(fd, F_GETFL);
     713           0 :         if ((flag & O_NONBLOCK) && (fcntl(fd, F_SETFL, flag & ~O_NONBLOCK) < 0)) {
     714           0 :                 SPDK_ERRLOG("fcntl can't set blocking mode for socket, fd: %d (%d)\n", fd, errno);
     715           0 :                 close(fd);
     716           0 :                 return NULL;
     717             :         }
     718             : 
     719             : #if defined(SO_PRIORITY)
     720             :         /* The priority is not inherited, so call this function again */
     721           0 :         if (sock->base.opts.priority) {
     722           0 :                 rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int));
     723           0 :                 if (rc != 0) {
     724           0 :                         close(fd);
     725           0 :                         return NULL;
     726             :                 }
     727           0 :         }
     728             : #endif
     729             : 
     730           0 :         new_sock = uring_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy);
     731           0 :         if (new_sock == NULL) {
     732           0 :                 close(fd);
     733           0 :                 return NULL;
     734             :         }
     735             : 
     736           0 :         return &new_sock->base;
     737           0 : }
     738             : 
     739             : static int
     740           0 : uring_sock_close(struct spdk_sock *_sock)
     741             : {
     742           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
     743             : 
     744           0 :         assert(TAILQ_EMPTY(&_sock->pending_reqs));
     745           0 :         assert(sock->group == NULL);
     746             : 
     747             :         /* If the socket fails to close, the best choice is to
     748             :          * leak the fd but continue to free the rest of the sock
     749             :          * memory. */
     750           0 :         close(sock->fd);
     751             : 
     752           0 :         spdk_pipe_destroy(sock->recv_pipe);
     753           0 :         free(sock->recv_buf);
     754           0 :         free(sock);
     755             : 
     756           0 :         return 0;
     757           0 : }
     758             : 
     759             : static ssize_t
     760           0 : uring_sock_recv_from_pipe(struct spdk_uring_sock *sock, struct iovec *diov, int diovcnt)
     761             : {
     762           0 :         struct iovec siov[2];
     763           0 :         int sbytes;
     764           0 :         ssize_t bytes;
     765           0 :         struct spdk_uring_sock_group_impl *group;
     766             : 
     767           0 :         sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
     768           0 :         if (sbytes < 0) {
     769           0 :                 errno = EINVAL;
     770           0 :                 return -1;
     771           0 :         } else if (sbytes == 0) {
     772           0 :                 errno = EAGAIN;
     773           0 :                 return -1;
     774             :         }
     775             : 
     776           0 :         bytes = spdk_iovcpy(siov, 2, diov, diovcnt);
     777             : 
     778           0 :         if (bytes == 0) {
     779             :                 /* The only way this happens is if diov is 0 length */
     780           0 :                 errno = EINVAL;
     781           0 :                 return -1;
     782             :         }
     783             : 
     784           0 :         spdk_pipe_reader_advance(sock->recv_pipe, bytes);
     785             : 
     786             :         /* If we drained the pipe, take it off the level-triggered list */
     787           0 :         if (sock->base.group_impl && spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
     788           0 :                 group = __uring_group_impl(sock->base.group_impl);
     789           0 :                 TAILQ_REMOVE(&group->pending_recv, sock, link);
     790           0 :                 sock->pending_recv = false;
     791           0 :         }
     792             : 
     793           0 :         return bytes;
     794           0 : }
     795             : 
     796             : static inline ssize_t
     797           0 : sock_readv(int fd, struct iovec *iov, int iovcnt)
     798             : {
     799           0 :         struct msghdr msg = {
     800           0 :                 .msg_iov = iov,
     801           0 :                 .msg_iovlen = iovcnt,
     802             :         };
     803             : 
     804           0 :         return recvmsg(fd, &msg, MSG_DONTWAIT);
     805           0 : }
     806             : 
     807             : static inline ssize_t
     808           0 : uring_sock_read(struct spdk_uring_sock *sock)
     809             : {
     810           0 :         struct iovec iov[2];
     811           0 :         int bytes;
     812           0 :         struct spdk_uring_sock_group_impl *group;
     813             : 
     814           0 :         bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);
     815             : 
     816           0 :         if (bytes > 0) {
     817           0 :                 bytes = sock_readv(sock->fd, iov, 2);
     818           0 :                 if (bytes > 0) {
     819           0 :                         spdk_pipe_writer_advance(sock->recv_pipe, bytes);
     820           0 :                         if (sock->base.group_impl && !sock->pending_recv) {
     821           0 :                                 group = __uring_group_impl(sock->base.group_impl);
     822           0 :                                 TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
     823           0 :                                 sock->pending_recv = true;
     824           0 :                         }
     825           0 :                 }
     826           0 :         }
     827             : 
     828           0 :         return bytes;
     829           0 : }
     830             : 
     831             : static int
     832           0 : uring_sock_recv_next(struct spdk_sock *_sock, void **_buf, void **ctx)
     833             : {
     834           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
     835           0 :         struct spdk_uring_sock_group_impl *group;
     836           0 :         struct spdk_uring_buf_tracker *tr;
     837             : 
     838           0 :         if (sock->connection_status < 0) {
     839           0 :                 errno = -sock->connection_status;
     840           0 :                 return -1;
     841             :         }
     842             : 
     843           0 :         if (sock->recv_pipe != NULL) {
     844           0 :                 errno = ENOTSUP;
     845           0 :                 return -1;
     846             :         }
     847             : 
     848           0 :         group = __uring_group_impl(_sock->group_impl);
     849             : 
     850           0 :         tr = STAILQ_FIRST(&sock->recv_stream);
     851           0 :         if (tr == NULL) {
     852           0 :                 if (sock->group->buf_ring_count > 0) {
     853             :                         /* There are buffers posted, but data hasn't arrived. */
     854           0 :                         errno = EAGAIN;
     855           0 :                 } else {
     856             :                         /* There are no buffers posted, so this won't ever
     857             :                          * make forward progress. */
     858           0 :                         errno = ENOBUFS;
     859             :                 }
     860           0 :                 return -1;
     861             :         }
     862           0 :         assert(sock->pending_recv == true);
     863           0 :         assert(tr->buf != NULL);
     864             : 
     865           0 :         *_buf = tr->buf + sock->recv_offset;
     866           0 :         *ctx = tr->ctx;
     867             : 
     868           0 :         STAILQ_REMOVE_HEAD(&sock->recv_stream, link);
     869           0 :         STAILQ_INSERT_HEAD(&group->free_trackers, tr, link);
     870             : 
     871           0 :         if (STAILQ_EMPTY(&sock->recv_stream)) {
     872           0 :                 sock->pending_recv = false;
     873           0 :                 TAILQ_REMOVE(&group->pending_recv, sock, link);
     874           0 :         }
     875             : 
     876           0 :         return tr->len - sock->recv_offset;
     877           0 : }
     878             : 
     879             : static ssize_t
     880           0 : uring_sock_readv_no_pipe(struct spdk_sock *_sock, struct iovec *iovs, int iovcnt)
     881             : {
     882           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
     883           0 :         struct spdk_uring_buf_tracker *tr;
     884           0 :         struct iovec iov;
     885           0 :         ssize_t total, len;
     886           0 :         int i;
     887             : 
     888           0 :         if (sock->connection_status < 0) {
     889           0 :                 errno = -sock->connection_status;
     890           0 :                 return -1;
     891             :         }
     892             : 
     893           0 :         if (_sock->group_impl == NULL) {
     894             :                 /* If not in a group just read from the socket the regular way. */
     895           0 :                 return sock_readv(sock->fd, iovs, iovcnt);
     896             :         }
     897             : 
     898           0 :         if (STAILQ_EMPTY(&sock->recv_stream)) {
     899           0 :                 if (sock->group->buf_ring_count == 0) {
     900             :                         /* If the user hasn't posted any buffers, read from the socket
     901             :                          * directly. */
     902             : 
     903           0 :                         if (sock->pending_recv) {
     904           0 :                                 sock->pending_recv = false;
     905           0 :                                 TAILQ_REMOVE(&(__uring_group_impl(_sock->group_impl))->pending_recv, sock, link);
     906           0 :                         }
     907             : 
     908           0 :                         return sock_readv(sock->fd, iovs, iovcnt);
     909             :                 }
     910             : 
     911           0 :                 errno = EAGAIN;
     912           0 :                 return -1;
     913             :         }
     914             : 
     915           0 :         total = 0;
     916           0 :         for (i = 0; i < iovcnt; i++) {
     917             :                 /* Copy to stack so we can change it */
     918           0 :                 iov = iovs[i];
     919             : 
     920           0 :                 tr = STAILQ_FIRST(&sock->recv_stream);
     921           0 :                 while (tr != NULL) {
     922           0 :                         len = spdk_min(iov.iov_len, tr->len - sock->recv_offset);
     923           0 :                         memcpy(iov.iov_base, tr->buf + sock->recv_offset, len);
     924             : 
     925           0 :                         total += len;
     926           0 :                         sock->recv_offset += len;
     927           0 :                         iov.iov_base += len;
     928           0 :                         iov.iov_len -= len;
     929             : 
     930           0 :                         if (sock->recv_offset == tr->len) {
     931           0 :                                 sock->recv_offset = 0;
     932           0 :                                 STAILQ_REMOVE_HEAD(&sock->recv_stream, link);
     933           0 :                                 STAILQ_INSERT_HEAD(&sock->group->free_trackers, tr, link);
     934           0 :                                 spdk_sock_group_provide_buf(sock->group->base.group, tr->buf, tr->buflen, tr->ctx);
     935           0 :                                 tr = STAILQ_FIRST(&sock->recv_stream);
     936           0 :                         }
     937             : 
     938           0 :                         if (iov.iov_len == 0) {
     939           0 :                                 break;
     940             :                         }
     941             :                 }
     942           0 :         }
     943             : 
     944           0 :         if (STAILQ_EMPTY(&sock->recv_stream)) {
     945           0 :                 struct spdk_uring_sock_group_impl *group;
     946             : 
     947           0 :                 group = __uring_group_impl(_sock->group_impl);
     948           0 :                 sock->pending_recv = false;
     949           0 :                 TAILQ_REMOVE(&group->pending_recv, sock, link);
     950           0 :         }
     951             : 
     952           0 :         assert(total > 0);
     953           0 :         return total;
     954           0 : }
     955             : 
     956             : static ssize_t
     957           0 : uring_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
     958             : {
     959           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
     960           0 :         int rc, i;
     961           0 :         size_t len;
     962             : 
     963           0 :         if (sock->connection_status < 0) {
     964           0 :                 errno = -sock->connection_status;
     965           0 :                 return -1;
     966             :         }
     967             : 
     968           0 :         if (sock->recv_pipe == NULL) {
     969           0 :                 return uring_sock_readv_no_pipe(_sock, iov, iovcnt);
     970             :         }
     971             : 
     972           0 :         len = 0;
     973           0 :         for (i = 0; i < iovcnt; i++) {
     974           0 :                 len += iov[i].iov_len;
     975           0 :         }
     976             : 
     977           0 :         if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
     978             :                 /* If the user is receiving a sufficiently large amount of data,
     979             :                  * receive directly to their buffers. */
     980           0 :                 if (len >= MIN_SOCK_PIPE_SIZE) {
     981           0 :                         return sock_readv(sock->fd, iov, iovcnt);
     982             :                 }
     983             : 
     984             :                 /* Otherwise, do a big read into our pipe */
     985           0 :                 rc = uring_sock_read(sock);
     986           0 :                 if (rc <= 0) {
     987           0 :                         return rc;
     988             :                 }
     989           0 :         }
     990             : 
     991           0 :         return uring_sock_recv_from_pipe(sock, iov, iovcnt);
     992           0 : }
     993             : 
     994             : static ssize_t
     995           0 : uring_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
     996             : {
     997           0 :         struct iovec iov[1];
     998             : 
     999           0 :         iov[0].iov_base = buf;
    1000           0 :         iov[0].iov_len = len;
    1001             : 
    1002           0 :         return uring_sock_readv(sock, iov, 1);
    1003           0 : }
    1004             : 
    1005             : static ssize_t
    1006           0 : uring_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
    1007             : {
    1008           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1009           0 :         struct msghdr msg = {
    1010           0 :                 .msg_iov = iov,
    1011           0 :                 .msg_iovlen = iovcnt,
    1012             :         };
    1013             : 
    1014           0 :         if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
    1015           0 :                 errno = EAGAIN;
    1016           0 :                 return -1;
    1017             :         }
    1018             : 
    1019           0 :         return sendmsg(sock->fd, &msg, MSG_DONTWAIT);
    1020           0 : }
    1021             : 
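                      : /* Advance req->internal.offset by up to rc bytes. Returns the number of
                      :  * bytes of rc left over once the request has been fully consumed, or -1
                      :  * if rc ran out before the end of the request (i.e. the request was only
                      :  * partially sent). */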
    1022             : static ssize_t
    1023          13 : sock_request_advance_offset(struct spdk_sock_request *req, ssize_t rc)
    1024             : {
    1025          13 :         unsigned int offset;
    1026          13 :         size_t len;
    1027          13 :         int i;
    1028             : 
    1029          13 :         offset = req->internal.offset;
    1030          37 :         for (i = 0; i < req->iovcnt; i++) {
    1031             :                 /* Advance by the offset first */
    1032          28 :                 if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
    1033           2 :                         offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
    1034           2 :                         continue;
    1035             :                 }
    1036             : 
    1037             :                 /* Calculate the remaining length of this element */
    1038          26 :                 len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
    1039             : 
    1040          26 :                 if (len > (size_t)rc) {
    1041           4 :                         req->internal.offset += rc;
    1042           4 :                         return -1;
    1043             :                 }
    1044             : 
    1045          22 :                 offset = 0;
    1046          22 :                 req->internal.offset += len;
    1047          22 :                 rc -= len;
    1048          22 :         }
    1049             : 
    1050           9 :         return rc;
    1051          13 : }
    1052             : 
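                      : /* Consume rc bytes of queued requests. Fully written requests are moved to
                      :  * the pending list; non-zerocopy requests that reach the head of the pending
                      :  * list are completed immediately, while zerocopy requests record the sendmsg
                      :  * index so _sock_check_zcopy() can complete them once the kernel reports that
                      :  * the transmission finished. */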
    1053             : static int
    1054          11 : sock_complete_write_reqs(struct spdk_sock *_sock, ssize_t rc, bool is_zcopy)
    1055             : {
    1056          11 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1057          11 :         struct spdk_sock_request *req;
    1058          11 :         int retval;
    1059             : 
    1060          11 :         if (is_zcopy) {
     1061             :                 /* Handle the overflow case: sock->sendmsg_idx - 1 is stored in
     1062             :                  * req->internal.offset below, so sendmsg_idx must never be zero. */
    1063           0 :                 if (spdk_unlikely(sock->sendmsg_idx == UINT32_MAX)) {
    1064           0 :                         sock->sendmsg_idx = 1;
    1065           0 :                 } else {
    1066           0 :                         sock->sendmsg_idx++;
    1067             :                 }
    1068           0 :         }
    1069             : 
    1070             :         /* Consume the requests that were actually written */
    1071          11 :         req = TAILQ_FIRST(&_sock->queued_reqs);
    1072          13 :         while (req) {
    1073             :                 /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */
    1074          13 :                 req->internal.is_zcopy = is_zcopy;
    1075             : 
    1076          13 :                 rc = sock_request_advance_offset(req, rc);
    1077          13 :                 if (rc < 0) {
    1078             :                         /* This element was partially sent. */
    1079           4 :                         return 0;
    1080             :                 }
    1081             : 
    1082             :                 /* Handled a full request. */
    1083           9 :                 spdk_sock_request_pend(_sock, req);
    1084             : 
    1085           9 :                 if (!req->internal.is_zcopy && req == TAILQ_FIRST(&_sock->pending_reqs)) {
    1086           9 :                         retval = spdk_sock_request_put(_sock, req, 0);
    1087           9 :                         if (retval) {
    1088           0 :                                 return retval;
    1089             :                         }
    1090           9 :                 } else {
    1091             :                         /* Re-use the offset field to hold the sendmsg call index. The
    1092             :                          * index is 0 based, so subtract one here because we've already
    1093             :                          * incremented above. */
    1094           0 :                         req->internal.offset = sock->sendmsg_idx - 1;
    1095             :                 }
    1096             : 
    1097           9 :                 if (rc == 0) {
    1098           7 :                         break;
    1099             :                 }
    1100             : 
    1101           2 :                 req = TAILQ_FIRST(&_sock->queued_reqs);
    1102             :         }
    1103             : 
    1104           7 :         return 0;
    1105          11 : }
    1106             : 
    1107             : #ifdef SPDK_ZEROCOPY
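                      : /* Process a notification read from the socket's error queue. With
                      :  * MSG_ZEROCOPY the kernel reports a range of sendmsg indices
                      :  * [serr->ee_info, serr->ee_data] whose buffers it is done transmitting,
                      :  * so the matching pending requests can now be completed. */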
    1108             : static int
    1109           0 : _sock_check_zcopy(struct spdk_sock *_sock, int status)
    1110             : {
    1111           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1112           0 :         ssize_t rc;
    1113           0 :         struct sock_extended_err *serr;
    1114           0 :         struct cmsghdr *cm;
    1115           0 :         uint32_t idx;
    1116           0 :         struct spdk_sock_request *req, *treq;
    1117           0 :         bool found;
    1118             : 
    1119           0 :         assert(sock->zcopy == true);
     1120           0 :         if (spdk_unlikely(status < 0)) {
    1121           0 :                 if (!TAILQ_EMPTY(&_sock->pending_reqs)) {
     1122           0 :                         SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded an error, but the pending list still has orphaned entries, status = %d\n",
    1123             :                                     status);
    1124           0 :                 } else {
    1125           0 :                         SPDK_WARNLOG("Recvmsg yielded an error!\n");
    1126             :                 }
    1127           0 :                 return 0;
    1128             :         }
    1129             : 
    1130           0 :         cm = CMSG_FIRSTHDR(&sock->errqueue_task.msg);
    1131           0 :         if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
    1132           0 :               (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR))) {
    1133           0 :                 SPDK_WARNLOG("Unexpected cmsg level or type!\n");
    1134           0 :                 return 0;
    1135             :         }
    1136             : 
    1137           0 :         serr = (struct sock_extended_err *)CMSG_DATA(cm);
    1138           0 :         if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
    1139           0 :                 SPDK_WARNLOG("Unexpected extended error origin\n");
    1140           0 :                 return 0;
    1141             :         }
    1142             : 
     1143             :         /* Most of the time, the pending_reqs list is already in the
     1144             :          * exact order we need: all of the requests to complete sit, in
     1145             :          * order, at the front. It is guaranteed that all requests
     1146             :          * belonging to the same sendmsg call are sequential, so once
     1147             :          * we encounter one match we can stop looping as soon as a
     1148             :          * non-match is found.
     1149             :          */
    1150           0 :         for (idx = serr->ee_info; idx <= serr->ee_data; idx++) {
    1151           0 :                 found = false;
    1152           0 :                 TAILQ_FOREACH_SAFE(req, &_sock->pending_reqs, internal.link, treq) {
    1153           0 :                         if (!req->internal.is_zcopy) {
    1154             :                                 /* This wasn't a zcopy request. It was just waiting in line to complete */
    1155           0 :                                 rc = spdk_sock_request_put(_sock, req, 0);
    1156           0 :                                 if (rc < 0) {
    1157           0 :                                         return rc;
    1158             :                                 }
    1159           0 :                         } else if (req->internal.offset == idx) {
    1160           0 :                                 found = true;
    1161           0 :                                 rc = spdk_sock_request_put(_sock, req, 0);
    1162           0 :                                 if (rc < 0) {
    1163           0 :                                         return rc;
    1164             :                                 }
    1165           0 :                         } else if (found) {
    1166           0 :                                 break;
    1167             :                         }
    1168           0 :                 }
    1169           0 :         }
    1170             : 
    1171           0 :         return 0;
    1172           0 : }
    1173             : 
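                      : /* Queue a recvmsg(MSG_ERRQUEUE) SQE on the group's ring so zerocopy
                      :  * completion notifications are reaped asynchronously. */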
    1174             : static void
    1175           0 : _sock_prep_errqueue(struct spdk_sock *_sock)
    1176             : {
    1177           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1178           0 :         struct spdk_uring_task *task = &sock->errqueue_task;
    1179           0 :         struct io_uring_sqe *sqe;
    1180             : 
    1181           0 :         if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
    1182           0 :                 return;
    1183             :         }
    1184             : 
    1185           0 :         if (sock->pending_group_remove) {
    1186           0 :                 return;
    1187             :         }
    1188             : 
    1189           0 :         assert(sock->group != NULL);
    1190           0 :         sock->group->io_queued++;
    1191             : 
    1192           0 :         sqe = io_uring_get_sqe(&sock->group->uring);
    1193           0 :         io_uring_prep_recvmsg(sqe, sock->fd, &task->msg, MSG_ERRQUEUE);
    1194           0 :         io_uring_sqe_set_data(sqe, task);
    1195           0 :         task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
    1196           0 : }
    1197             : 
    1198             : #endif
    1199             : 
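                      : /* Gather queued requests into the write task's iovec array and submit them as
                      :  * a single sendmsg SQE on the group's ring. The zerocopy send flags are added
                      :  * when zerocopy is enabled for the socket. */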
    1200             : static void
    1201           0 : _sock_flush(struct spdk_sock *_sock)
    1202             : {
    1203           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1204           0 :         struct spdk_uring_task *task = &sock->write_task;
    1205           0 :         uint32_t iovcnt;
    1206           0 :         struct io_uring_sqe *sqe;
    1207           0 :         int flags;
    1208             : 
    1209           0 :         if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
    1210           0 :                 return;
    1211             :         }
    1212             : 
    1213             : #ifdef SPDK_ZEROCOPY
    1214           0 :         if (sock->zcopy) {
    1215           0 :                 flags = MSG_DONTWAIT | sock->zcopy_send_flags;
    1216           0 :         } else
    1217             : #endif
    1218             :         {
    1219           0 :                 flags = MSG_DONTWAIT;
    1220             :         }
    1221             : 
    1222           0 :         iovcnt = spdk_sock_prep_reqs(&sock->base, task->iovs, task->iov_cnt, &task->last_req, &flags);
    1223           0 :         if (!iovcnt) {
    1224           0 :                 return;
    1225             :         }
    1226             : 
    1227           0 :         task->iov_cnt = iovcnt;
    1228           0 :         assert(sock->group != NULL);
    1229           0 :         task->msg.msg_iov = task->iovs;
    1230           0 :         task->msg.msg_iovlen = task->iov_cnt;
    1231             : #ifdef SPDK_ZEROCOPY
    1232           0 :         task->is_zcopy = (flags & MSG_ZEROCOPY) ? true : false;
    1233             : #endif
    1234           0 :         sock->group->io_queued++;
    1235             : 
    1236           0 :         sqe = io_uring_get_sqe(&sock->group->uring);
    1237           0 :         io_uring_prep_sendmsg(sqe, sock->fd, &sock->write_task.msg, flags);
    1238           0 :         io_uring_sqe_set_data(sqe, task);
    1239           0 :         task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
    1240           0 : }
    1241             : 
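                      : /* Queue a recv SQE that uses IOSQE_BUFFER_SELECT, letting the kernel pick a
                      :  * buffer from the provided-buffer ring registered under URING_BUF_GROUP_ID.
                      :  * The id of the chosen buffer is returned in the CQE flags. */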
    1242             : static void
    1243           0 : _sock_prep_read(struct spdk_sock *_sock)
    1244             : {
    1245           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1246           0 :         struct spdk_uring_task *task = &sock->read_task;
    1247           0 :         struct io_uring_sqe *sqe;
    1248             : 
     1249             :         /* A read is already outstanding; do not prepare another one */
    1250           0 :         if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
    1251           0 :                 return;
    1252             :         }
    1253             : 
    1254           0 :         if (sock->pending_group_remove) {
    1255           0 :                 return;
    1256             :         }
    1257             : 
    1258           0 :         assert(sock->group != NULL);
    1259           0 :         sock->group->io_queued++;
    1260             : 
    1261           0 :         sqe = io_uring_get_sqe(&sock->group->uring);
    1262           0 :         io_uring_prep_recv(sqe, sock->fd, NULL, URING_MAX_RECV_SIZE, 0);
    1263           0 :         sqe->buf_group = URING_BUF_GROUP_ID;
    1264           0 :         sqe->flags |= IOSQE_BUFFER_SELECT;
    1265           0 :         io_uring_sqe_set_data(sqe, task);
    1266           0 :         task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
    1267           0 : }
    1268             : 
    1269             : static void
    1270           0 : _sock_prep_cancel_task(struct spdk_sock *_sock, void *user_data)
    1271             : {
    1272           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1273           0 :         struct spdk_uring_task *task = &sock->cancel_task;
    1274           0 :         struct io_uring_sqe *sqe;
    1275             : 
    1276           0 :         if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
    1277           0 :                 return;
    1278             :         }
    1279             : 
    1280           0 :         assert(sock->group != NULL);
    1281           0 :         sock->group->io_queued++;
    1282             : 
    1283           0 :         sqe = io_uring_get_sqe(&sock->group->uring);
    1284           0 :         io_uring_prep_cancel(sqe, user_data, 0);
    1285           0 :         io_uring_sqe_set_data(sqe, task);
    1286           0 :         task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
    1287           0 : }
    1288             : 
    1289             : static void
    1290           0 : uring_sock_fail(struct spdk_uring_sock *sock, int status)
    1291             : {
    1292           0 :         struct spdk_uring_sock_group_impl *group = sock->group;
    1293           0 :         int rc;
    1294             : 
    1295           0 :         sock->connection_status = status;
    1296           0 :         rc = spdk_sock_abort_requests(&sock->base);
    1297             : 
    1298             :         /* The user needs to be notified that this socket is dead. */
    1299           0 :         if (rc == 0 && sock->base.cb_fn != NULL &&
    1300           0 :             sock->pending_recv == false) {
    1301           0 :                 sock->pending_recv = true;
    1302           0 :                 TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
    1303           0 :         }
    1304           0 : }
    1305             : 
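                      : /* Reap up to 'max' completions from the group's ring and dispatch them by task
                      :  * type. When 'socks' is provided, also fill it with up to 'max_read_events'
                      :  * sockets that have receive data pending, then rotate the pending_recv list so
                      :  * that polling stays fair across sockets. */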
    1306             : static int
    1307           0 : sock_uring_group_reap(struct spdk_uring_sock_group_impl *group, int max, int max_read_events,
    1308             :                       struct spdk_sock **socks)
    1309             : {
    1310           0 :         int i, count, ret;
    1311           0 :         struct io_uring_cqe *cqe;
    1312           0 :         struct spdk_uring_sock *sock, *tmp;
    1313           0 :         struct spdk_uring_task *task;
    1314           0 :         int status, bid, flags;
    1315           0 :         bool is_zcopy;
    1316             : 
    1317           0 :         for (i = 0; i < max; i++) {
    1318           0 :                 ret = io_uring_peek_cqe(&group->uring, &cqe);
    1319           0 :                 if (ret != 0) {
    1320           0 :                         break;
    1321             :                 }
    1322             : 
    1323           0 :                 if (cqe == NULL) {
    1324           0 :                         break;
    1325             :                 }
    1326             : 
    1327           0 :                 task = (struct spdk_uring_task *)cqe->user_data;
    1328           0 :                 assert(task != NULL);
    1329           0 :                 sock = task->sock;
    1330           0 :                 assert(sock != NULL);
    1331           0 :                 assert(sock->group != NULL);
    1332           0 :                 assert(sock->group == group);
    1333           0 :                 sock->group->io_inflight--;
    1334           0 :                 sock->group->io_avail++;
    1335           0 :                 status = cqe->res;
    1336           0 :                 flags = cqe->flags;
    1337           0 :                 io_uring_cqe_seen(&group->uring, cqe);
    1338             : 
    1339           0 :                 task->status = SPDK_URING_SOCK_TASK_NOT_IN_USE;
    1340             : 
    1341           0 :                 switch (task->type) {
    1342             :                 case URING_TASK_READ:
    1343           0 :                         if (status == -EAGAIN || status == -EWOULDBLOCK) {
    1344             :                                 /* This likely shouldn't happen, but would indicate that the
    1345             :                                  * kernel didn't have enough resources to queue a task internally. */
    1346           0 :                                 _sock_prep_read(&sock->base);
    1347           0 :                         } else if (status == -ECANCELED) {
    1348           0 :                                 continue;
    1349           0 :                         } else if (status == -ENOBUFS) {
    1350             :                                 /* There's data in the socket but the user hasn't provided any buffers.
    1351             :                                  * We need to notify the user that the socket has data pending. */
    1352           0 :                                 if (sock->base.cb_fn != NULL &&
    1353           0 :                                     sock->pending_recv == false) {
    1354           0 :                                         sock->pending_recv = true;
    1355           0 :                                         TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
    1356           0 :                                 }
    1357             : 
    1358           0 :                                 _sock_prep_read(&sock->base);
    1359           0 :                         } else if (spdk_unlikely(status <= 0)) {
    1360           0 :                                 uring_sock_fail(sock, status < 0 ? status : -ECONNRESET);
    1361           0 :                         } else {
    1362           0 :                                 struct spdk_uring_buf_tracker *tracker;
    1363             : 
    1364           0 :                                 assert((flags & IORING_CQE_F_BUFFER) != 0);
    1365             : 
    1366           0 :                                 bid = flags >> IORING_CQE_BUFFER_SHIFT;
    1367           0 :                                 tracker = &group->trackers[bid];
    1368             : 
    1369           0 :                                 assert(tracker->buf != NULL);
    1370           0 :                                 assert(tracker->len != 0);
    1371             : 
    1372             :                                 /* Append this data to the stream */
    1373           0 :                                 tracker->len = status;
    1374           0 :                                 STAILQ_INSERT_TAIL(&sock->recv_stream, tracker, link);
    1375           0 :                                 assert(group->buf_ring_count > 0);
    1376           0 :                                 group->buf_ring_count--;
    1377             : 
    1378           0 :                                 if (sock->base.cb_fn != NULL &&
    1379           0 :                                     sock->pending_recv == false) {
    1380           0 :                                         sock->pending_recv = true;
    1381           0 :                                         TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
    1382           0 :                                 }
    1383             : 
    1384           0 :                                 _sock_prep_read(&sock->base);
    1385           0 :                         }
    1386           0 :                         break;
    1387             :                 case URING_TASK_WRITE:
    1388           0 :                         if (status == -EAGAIN || status == -EWOULDBLOCK ||
    1389           0 :                             (status == -ENOBUFS && sock->zcopy) ||
    1390           0 :                             status == -ECANCELED) {
    1391           0 :                                 continue;
     1392           0 :                 } else if (spdk_unlikely(status < 0)) {
    1393           0 :                                 uring_sock_fail(sock, status);
    1394           0 :                         } else {
    1395           0 :                                 task->last_req = NULL;
    1396           0 :                                 task->iov_cnt = 0;
    1397           0 :                                 is_zcopy = task->is_zcopy;
    1398           0 :                                 task->is_zcopy = false;
    1399           0 :                                 sock_complete_write_reqs(&sock->base, status, is_zcopy);
    1400             :                         }
    1401             : 
    1402           0 :                         break;
    1403             : #ifdef SPDK_ZEROCOPY
    1404             :                 case URING_TASK_ERRQUEUE:
    1405           0 :                         if (status == -EAGAIN || status == -EWOULDBLOCK) {
    1406           0 :                                 _sock_prep_errqueue(&sock->base);
    1407           0 :                         } else if (status == -ECANCELED) {
    1408           0 :                                 continue;
    1409           0 :                         } else if (spdk_unlikely(status < 0)) {
    1410           0 :                                 uring_sock_fail(sock, status);
    1411           0 :                         } else {
    1412           0 :                                 _sock_check_zcopy(&sock->base, status);
    1413           0 :                                 _sock_prep_errqueue(&sock->base);
    1414             :                         }
    1415           0 :                         break;
    1416             : #endif
    1417             :                 case URING_TASK_CANCEL:
    1418             :                         /* Do nothing */
    1419           0 :                         break;
    1420             :                 default:
    1421           0 :                         SPDK_UNREACHABLE();
    1422             :                 }
    1423           0 :         }
    1424             : 
    1425           0 :         if (!socks) {
    1426           0 :                 return 0;
    1427             :         }
    1428           0 :         count = 0;
    1429           0 :         TAILQ_FOREACH_SAFE(sock, &group->pending_recv, link, tmp) {
    1430           0 :                 if (count == max_read_events) {
    1431           0 :                         break;
    1432             :                 }
    1433             : 
    1434             :                 /* If the socket's cb_fn is NULL, do not add it to socks array */
    1435           0 :                 if (spdk_unlikely(sock->base.cb_fn == NULL)) {
    1436           0 :                         assert(sock->pending_recv == true);
    1437           0 :                         sock->pending_recv = false;
    1438           0 :                         TAILQ_REMOVE(&group->pending_recv, sock, link);
    1439           0 :                         continue;
    1440             :                 }
    1441             : 
    1442           0 :                 socks[count++] = &sock->base;
    1443           0 :         }
    1444             : 
    1445             : 
    1446             :         /* Cycle the pending_recv list so that each time we poll things aren't
    1447             :          * in the same order. Say we have 6 sockets in the list, named as follows:
    1448             :          * A B C D E F
     1449             :          * All 6 sockets have events pending, but max_read_events is only 3. That means
     1450             :          * sock currently points at D. We want to rearrange the list to the following:
    1451             :          * D E F A B C
    1452             :          *
    1453             :          * The variables below are named according to this example to make it easier to
    1454             :          * follow the swaps.
    1455             :          */
    1456           0 :         if (sock != NULL) {
    1457           0 :                 struct spdk_uring_sock *ua, *uc, *ud, *uf;
    1458             : 
    1459             :                 /* Capture pointers to the elements we need */
    1460           0 :                 ud = sock;
    1461             : 
    1462           0 :                 ua = TAILQ_FIRST(&group->pending_recv);
    1463           0 :                 if (ua == ud) {
    1464           0 :                         goto end;
    1465             :                 }
    1466             : 
    1467           0 :                 uf = TAILQ_LAST(&group->pending_recv, pending_recv_list);
    1468           0 :                 if (uf == ud) {
    1469           0 :                         TAILQ_REMOVE(&group->pending_recv, ud, link);
    1470           0 :                         TAILQ_INSERT_HEAD(&group->pending_recv, ud, link);
    1471           0 :                         goto end;
    1472             :                 }
    1473             : 
    1474           0 :                 uc = TAILQ_PREV(ud, pending_recv_list, link);
    1475           0 :                 assert(uc != NULL);
    1476             : 
    1477             :                 /* Break the link between C and D */
    1478           0 :                 uc->link.tqe_next = NULL;
    1479             : 
    1480             :                 /* Connect F to A */
    1481           0 :                 uf->link.tqe_next = ua;
    1482           0 :                 ua->link.tqe_prev = &uf->link.tqe_next;
    1483             : 
    1484             :                 /* Fix up the list first/last pointers */
    1485           0 :                 group->pending_recv.tqh_first = ud;
    1486           0 :                 group->pending_recv.tqh_last = &uc->link.tqe_next;
    1487             : 
    1488             :                 /* D is in front of the list, make tqe prev pointer point to the head of list */
    1489           0 :                 ud->link.tqe_prev = &group->pending_recv.tqh_first;
    1490           0 :         }
    1491             : 
    1492             : end:
    1493           0 :         return count;
    1494           0 : }
    1495             : 
    1496             : static int uring_sock_flush(struct spdk_sock *_sock);
    1497             : 
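                      : /* Asynchronous write path: the request is queued and later picked up by
                      :  * _sock_flush() from the group poller. Sockets that are not in a group are
                      :  * flushed synchronously once a full batch of iovecs has accumulated. */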
    1498             : static void
    1499           0 : uring_sock_writev_async(struct spdk_sock *_sock, struct spdk_sock_request *req)
    1500             : {
    1501           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1502           0 :         int rc;
    1503             : 
    1504           0 :         if (spdk_unlikely(sock->connection_status)) {
    1505           0 :                 req->cb_fn(req->cb_arg, sock->connection_status);
    1506           0 :                 return;
    1507             :         }
    1508             : 
    1509           0 :         spdk_sock_request_queue(_sock, req);
    1510             : 
    1511           0 :         if (!sock->group) {
    1512           0 :                 if (_sock->queued_iovcnt >= IOV_BATCH_SIZE) {
    1513           0 :                         rc = uring_sock_flush(_sock);
    1514           0 :                         if (rc < 0 && errno != EAGAIN) {
    1515           0 :                                 spdk_sock_abort_requests(_sock);
    1516           0 :                         }
    1517           0 :                 }
    1518           0 :         }
    1519           0 : }
    1520             : 
    1521             : static int
    1522           0 : uring_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes)
    1523             : {
    1524           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1525           0 :         int val;
    1526           0 :         int rc;
    1527             : 
    1528           0 :         assert(sock != NULL);
    1529             : 
    1530           0 :         val = nbytes;
    1531           0 :         rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val);
    1532           0 :         if (rc != 0) {
    1533           0 :                 return -1;
    1534             :         }
    1535           0 :         return 0;
    1536           0 : }
    1537             : 
    1538             : static bool
    1539           0 : uring_sock_is_ipv6(struct spdk_sock *_sock)
    1540             : {
    1541           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1542           0 :         struct sockaddr_storage sa;
    1543           0 :         socklen_t salen;
    1544           0 :         int rc;
    1545             : 
    1546           0 :         assert(sock != NULL);
    1547             : 
    1548           0 :         memset(&sa, 0, sizeof sa);
    1549           0 :         salen = sizeof sa;
    1550           0 :         rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
    1551           0 :         if (rc != 0) {
    1552           0 :                 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
    1553           0 :                 return false;
    1554             :         }
    1555             : 
    1556           0 :         return (sa.ss_family == AF_INET6);
    1557           0 : }
    1558             : 
    1559             : static bool
    1560           0 : uring_sock_is_ipv4(struct spdk_sock *_sock)
    1561             : {
    1562           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1563           0 :         struct sockaddr_storage sa;
    1564           0 :         socklen_t salen;
    1565           0 :         int rc;
    1566             : 
    1567           0 :         assert(sock != NULL);
    1568             : 
    1569           0 :         memset(&sa, 0, sizeof sa);
    1570           0 :         salen = sizeof sa;
    1571           0 :         rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
    1572           0 :         if (rc != 0) {
    1573           0 :                 SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
    1574           0 :                 return false;
    1575             :         }
    1576             : 
    1577           0 :         return (sa.ss_family == AF_INET);
    1578           0 : }
    1579             : 
    1580             : static bool
    1581           0 : uring_sock_is_connected(struct spdk_sock *_sock)
    1582             : {
    1583           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1584           0 :         uint8_t byte;
    1585           0 :         int rc;
    1586             : 
    1587           0 :         rc = recv(sock->fd, &byte, 1, MSG_PEEK | MSG_DONTWAIT);
    1588           0 :         if (rc == 0) {
    1589           0 :                 return false;
    1590             :         }
    1591             : 
    1592           0 :         if (rc < 0) {
    1593           0 :                 if (errno == EAGAIN || errno == EWOULDBLOCK) {
    1594           0 :                         return true;
    1595             :                 }
    1596             : 
    1597           0 :                 return false;
    1598             :         }
    1599             : 
    1600           0 :         return true;
    1601           0 : }
    1602             : 
    1603             : static struct spdk_sock_group_impl *
    1604           0 : uring_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint)
    1605             : {
    1606           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1607           0 :         struct spdk_sock_group_impl *group;
    1608             : 
    1609           0 :         if (sock->placement_id != -1) {
    1610           0 :                 spdk_sock_map_lookup(&g_map, sock->placement_id, &group, hint);
    1611           0 :                 return group;
    1612             :         }
    1613             : 
    1614           0 :         return NULL;
    1615           0 : }
    1616             : 
    1617             : static int
    1618           0 : uring_sock_group_impl_buf_pool_free(struct spdk_uring_sock_group_impl *group_impl)
    1619             : {
    1620           0 :         if (group_impl->buf_ring) {
    1621           0 :                 io_uring_unregister_buf_ring(&group_impl->uring, URING_BUF_GROUP_ID);
    1622           0 :                 free(group_impl->buf_ring);
    1623           0 :         }
    1624             : 
    1625           0 :         free(group_impl->trackers);
    1626             : 
    1627           0 :         return 0;
    1628             : }
    1629             : 
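                      : /* Allocate a page-aligned provided-buffer ring with URING_BUF_POOL_SIZE entries
                      :  * and register it with the kernel under URING_BUF_GROUP_ID, along with a tracker
                      :  * array that maps the buffer ids returned in CQEs back to the posted buffers. */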
    1630             : static int
    1631           0 : uring_sock_group_impl_buf_pool_alloc(struct spdk_uring_sock_group_impl *group_impl)
    1632             : {
    1633           0 :         struct io_uring_buf_reg buf_reg = {};
    1634           0 :         struct io_uring_buf_ring *buf_ring;
    1635           0 :         int i, rc;
    1636             : 
    1637           0 :         rc = posix_memalign((void **)&buf_ring, 0x1000, URING_BUF_POOL_SIZE * sizeof(struct io_uring_buf));
    1638           0 :         if (rc != 0) {
    1639             :                 /* posix_memalign returns positive errno values */
    1640           0 :                 return -rc;
    1641             :         }
    1642             : 
    1643           0 :         buf_reg.ring_addr = (unsigned long long)buf_ring;
    1644           0 :         buf_reg.ring_entries = URING_BUF_POOL_SIZE;
    1645           0 :         buf_reg.bgid = URING_BUF_GROUP_ID;
    1646             : 
    1647           0 :         rc = io_uring_register_buf_ring(&group_impl->uring, &buf_reg, 0);
    1648           0 :         if (rc != 0) {
    1649           0 :                 free(buf_ring);
    1650           0 :                 return rc;
    1651             :         }
    1652             : 
    1653           0 :         group_impl->buf_ring = buf_ring;
    1654           0 :         io_uring_buf_ring_init(group_impl->buf_ring);
    1655           0 :         group_impl->buf_ring_count = 0;
    1656             : 
    1657           0 :         group_impl->trackers = calloc(URING_BUF_POOL_SIZE, sizeof(struct spdk_uring_buf_tracker));
    1658           0 :         if (group_impl->trackers == NULL) {
    1659           0 :                 uring_sock_group_impl_buf_pool_free(group_impl);
    1660           0 :                 return -ENOMEM;
    1661             :         }
    1662             : 
    1663           0 :         STAILQ_INIT(&group_impl->free_trackers);
    1664             : 
    1665           0 :         for (i = 0; i < URING_BUF_POOL_SIZE; i++) {
    1666           0 :                 struct spdk_uring_buf_tracker *tracker = &group_impl->trackers[i];
    1667             : 
    1668           0 :                 tracker->buf = NULL;
    1669           0 :                 tracker->len = 0;
    1670           0 :                 tracker->ctx = NULL;
    1671           0 :                 tracker->id = i;
    1672             : 
    1673           0 :                 STAILQ_INSERT_TAIL(&group_impl->free_trackers, tracker, link);
    1674           0 :         }
    1675             : 
    1676           0 :         return 0;
    1677           0 : }
    1678             : 
    1679             : static struct spdk_sock_group_impl *
    1680           0 : uring_sock_group_impl_create(void)
    1681             : {
    1682           0 :         struct spdk_uring_sock_group_impl *group_impl;
    1683             : 
    1684           0 :         group_impl = calloc(1, sizeof(*group_impl));
    1685           0 :         if (group_impl == NULL) {
    1686           0 :                 SPDK_ERRLOG("group_impl allocation failed\n");
    1687           0 :                 return NULL;
    1688             :         }
    1689             : 
    1690           0 :         group_impl->io_avail = SPDK_SOCK_GROUP_QUEUE_DEPTH;
    1691             : 
    1692           0 :         if (io_uring_queue_init(SPDK_SOCK_GROUP_QUEUE_DEPTH, &group_impl->uring, 0) < 0) {
    1693           0 :                 SPDK_ERRLOG("uring I/O context setup failure\n");
    1694           0 :                 free(group_impl);
    1695           0 :                 return NULL;
    1696             :         }
    1697             : 
    1698           0 :         TAILQ_INIT(&group_impl->pending_recv);
    1699             : 
    1700           0 :         if (uring_sock_group_impl_buf_pool_alloc(group_impl) < 0) {
    1701           0 :                 SPDK_ERRLOG("Failed to create buffer ring. Your kernel is likely not new enough. "
    1702             :                             "Please switch to the POSIX sock implementation instead.\n");
    1703           0 :                 io_uring_queue_exit(&group_impl->uring);
    1704           0 :                 free(group_impl);
    1705           0 :                 return NULL;
    1706             :         }
    1707             : 
    1708           0 :         if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) {
    1709           0 :                 spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base);
    1710           0 :         }
    1711             : 
    1712           0 :         return &group_impl->base;
    1713           0 : }
    1714             : 
    1715             : static int
    1716           0 : uring_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group,
    1717             :                                struct spdk_sock *_sock)
    1718             : {
    1719           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1720           0 :         struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
    1721           0 :         int rc;
    1722             : 
    1723           0 :         sock->group = group;
    1724           0 :         sock->write_task.sock = sock;
    1725           0 :         sock->write_task.type = URING_TASK_WRITE;
    1726             : 
    1727           0 :         sock->read_task.sock = sock;
    1728           0 :         sock->read_task.type = URING_TASK_READ;
    1729             : 
    1730           0 :         sock->errqueue_task.sock = sock;
    1731           0 :         sock->errqueue_task.type = URING_TASK_ERRQUEUE;
    1732           0 :         sock->errqueue_task.msg.msg_control = sock->buf;
    1733           0 :         sock->errqueue_task.msg.msg_controllen = sizeof(sock->buf);
    1734             : 
    1735           0 :         sock->cancel_task.sock = sock;
    1736           0 :         sock->cancel_task.type = URING_TASK_CANCEL;
    1737             : 
    1738             :         /* switched from another polling group due to scheduling */
    1739           0 :         if (spdk_unlikely(sock->recv_pipe != NULL &&
    1740             :                           (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
    1741           0 :                 assert(sock->pending_recv == false);
    1742           0 :                 sock->pending_recv = true;
    1743           0 :                 TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
    1744           0 :         }
    1745             : 
    1746           0 :         if (sock->placement_id != -1) {
    1747           0 :                 rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base);
    1748           0 :                 if (rc != 0) {
     1749           0 :                         SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc);
    1750             :                         /* Do not treat this as an error. The system will continue running. */
    1751           0 :                 }
    1752           0 :         }
    1753             : 
    1754             :         /* We get an async read going immediately */
    1755           0 :         _sock_prep_read(&sock->base);
    1756             : #ifdef SPDK_ZEROCOPY
    1757           0 :         if (sock->zcopy) {
    1758           0 :                 _sock_prep_errqueue(_sock);
    1759           0 :         }
    1760             : #endif
    1761             : 
    1762           0 :         return 0;
    1763           0 : }
    1764             : 
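                      : /* Take trackers from the free list, attach user-provided buffers obtained from
                      :  * the sock group, and publish them to the kernel through the provided-buffer
                      :  * ring. The tracker index doubles as the buffer id reported in CQEs. */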
    1765             : static void
    1766           0 : uring_sock_group_populate_buf_ring(struct spdk_uring_sock_group_impl *group)
    1767             : {
    1768           0 :         struct spdk_uring_buf_tracker *tracker;
    1769           0 :         int count, mask;
    1770             : 
    1771           0 :         if (g_spdk_uring_sock_impl_opts.enable_recv_pipe) {
    1772             :                 /* If recv_pipe is enabled, we do not post buffers. */
    1773           0 :                 return;
    1774             :         }
    1775             : 
    1776             :         /* Try to re-populate the io_uring's buffer pool using user-provided buffers */
    1777           0 :         tracker = STAILQ_FIRST(&group->free_trackers);
    1778           0 :         count = 0;
    1779           0 :         mask = io_uring_buf_ring_mask(URING_BUF_POOL_SIZE);
    1780           0 :         while (tracker != NULL) {
    1781           0 :                 tracker->buflen = spdk_sock_group_get_buf(group->base.group, &tracker->buf, &tracker->ctx);
    1782           0 :                 if (tracker->buflen == 0) {
    1783           0 :                         break;
    1784             :                 }
    1785             : 
    1786           0 :                 assert(tracker->buf != NULL);
    1787           0 :                 STAILQ_REMOVE_HEAD(&group->free_trackers, link);
    1788           0 :                 assert(STAILQ_FIRST(&group->free_trackers) != tracker);
    1789             : 
    1790           0 :                 io_uring_buf_ring_add(group->buf_ring, tracker->buf, tracker->buflen, tracker->id, mask, count);
    1791           0 :                 count++;
    1792           0 :                 tracker = STAILQ_FIRST(&group->free_trackers);
    1793             :         }
    1794             : 
    1795           0 :         if (count > 0) {
    1796           0 :                 group->buf_ring_count += count;
    1797           0 :                 io_uring_buf_ring_advance(group->buf_ring, count);
    1798           0 :         }
    1799           0 : }
    1800             : 
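                      : /* Poll entry point for the group: flush queued writes for each healthy socket,
                      :  * re-post receive buffers, submit any queued SQEs and reap completions into the
                      :  * caller's socks array. */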
    1801             : static int
    1802           0 : uring_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
    1803             :                            struct spdk_sock **socks)
    1804             : {
    1805           0 :         struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
    1806           0 :         int count, ret;
    1807           0 :         int to_complete, to_submit;
    1808           0 :         struct spdk_sock *_sock, *tmp;
    1809           0 :         struct spdk_uring_sock *sock;
    1810             : 
    1811           0 :         if (spdk_likely(socks)) {
    1812           0 :                 TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) {
    1813           0 :                         sock = __uring_sock(_sock);
    1814           0 :                         if (spdk_unlikely(sock->connection_status)) {
    1815           0 :                                 continue;
    1816             :                         }
    1817           0 :                         _sock_flush(_sock);
    1818           0 :                 }
    1819           0 :         }
    1820             : 
    1821             :         /* Try to re-populate the io_uring's buffer pool using user-provided buffers */
    1822           0 :         uring_sock_group_populate_buf_ring(group);
    1823             : 
    1824           0 :         to_submit = group->io_queued;
    1825             : 
     1826             :         /* Network I/O cannot use O_DIRECT, so we do not need to call spdk_io_uring_enter */
    1827           0 :         if (to_submit > 0) {
    1828             :                 /* If there are I/O to submit, use io_uring_submit here.
    1829             :                  * It will automatically call io_uring_enter appropriately. */
    1830           0 :                 ret = io_uring_submit(&group->uring);
    1831           0 :                 if (ret < 0) {
    1832           0 :                         return 1;
    1833             :                 }
    1834           0 :                 group->io_queued = 0;
    1835           0 :                 group->io_inflight += to_submit;
    1836           0 :                 group->io_avail -= to_submit;
    1837           0 :         }
    1838             : 
    1839           0 :         count = 0;
    1840           0 :         to_complete = group->io_inflight;
    1841           0 :         if (to_complete > 0 || !TAILQ_EMPTY(&group->pending_recv)) {
    1842           0 :                 count = sock_uring_group_reap(group, to_complete, max_events, socks);
    1843           0 :         }
    1844             : 
    1845           0 :         return count;
    1846           0 : }
    1847             : 
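                      : /* Removing a socket requires cancelling its outstanding write, read and errqueue
                      :  * SQEs. Cancellation itself goes through the ring, so poll the group until both
                      :  * the original task and the cancel task have completed. */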
    1848             : static int
    1849           0 : uring_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group,
    1850             :                                   struct spdk_sock *_sock)
    1851             : {
    1852           0 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1853           0 :         struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
    1854             : 
    1855           0 :         sock->pending_group_remove = true;
    1856             : 
    1857           0 :         if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
    1858           0 :                 _sock_prep_cancel_task(_sock, &sock->write_task);
     1859             :                 /* spdk_sock_group_remove_sock() is not an asynchronous interface,
     1860             :                  * so it is safe to poll in a while loop here. */
    1861           0 :                 while ((sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
    1862           0 :                        (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
    1863           0 :                         uring_sock_group_impl_poll(_group, 32, NULL);
    1864             :                 }
    1865           0 :         }
    1866             : 
    1867           0 :         if (sock->read_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
    1868           0 :                 _sock_prep_cancel_task(_sock, &sock->read_task);
     1869             :                 /* spdk_sock_group_remove_sock() is not an asynchronous interface,
     1870             :                  * so it is safe to poll in a while loop here. */
    1871           0 :                 while ((sock->read_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
    1872           0 :                        (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
    1873           0 :                         uring_sock_group_impl_poll(_group, 32, NULL);
    1874             :                 }
    1875           0 :         }
    1876             : 
    1877           0 :         if (sock->errqueue_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
    1878           0 :                 _sock_prep_cancel_task(_sock, &sock->errqueue_task);
     1879             :                 /* spdk_sock_group_remove_sock() is not an asynchronous interface,
     1880             :                  * so it is safe to poll in a while loop here. */
    1881           0 :                 while ((sock->errqueue_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
    1882           0 :                        (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
    1883           0 :                         uring_sock_group_impl_poll(_group, 32, NULL);
    1884             :                 }
    1885           0 :         }
    1886             : 
     1887             :         /* Make sure that cancelling the tasks above did not cause new requests to be submitted */
    1888           0 :         assert(sock->write_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE);
    1889           0 :         assert(sock->read_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE);
    1890           0 :         assert(sock->errqueue_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE);
    1891             : 
    1892           0 :         if (sock->pending_recv) {
    1893           0 :                 TAILQ_REMOVE(&group->pending_recv, sock, link);
    1894           0 :                 sock->pending_recv = false;
    1895           0 :         }
    1896           0 :         assert(sock->pending_recv == false);
    1897             : 
    1898             :         /* We have no way to handle this case. We could let the user read this
    1899             :          * buffer, but the buffer came from a group and we have lost the association
     1900             :          * to that group, so we would have no way to release it. */
    1901           0 :         assert(STAILQ_EMPTY(&sock->recv_stream));
    1902             : 
    1903           0 :         if (sock->placement_id != -1) {
    1904           0 :                 spdk_sock_map_release(&g_map, sock->placement_id);
    1905           0 :         }
    1906             : 
    1907           0 :         sock->pending_group_remove = false;
    1908           0 :         sock->group = NULL;
    1909           0 :         return 0;
    1910           0 : }
    1911             : 
    1912             : static int
    1913           0 : uring_sock_group_impl_close(struct spdk_sock_group_impl *_group)
    1914             : {
    1915           0 :         struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
    1916             : 
    1917             :         /* try to reap all the active I/O */
    1918           0 :         while (group->io_inflight) {
    1919           0 :                 uring_sock_group_impl_poll(_group, 32, NULL);
    1920             :         }
    1921           0 :         assert(group->io_inflight == 0);
    1922           0 :         assert(group->io_avail == SPDK_SOCK_GROUP_QUEUE_DEPTH);
    1923             : 
    1924           0 :         uring_sock_group_impl_buf_pool_free(group);
    1925             : 
    1926           0 :         io_uring_queue_exit(&group->uring);
    1927             : 
    1928           0 :         if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) {
    1929           0 :                 spdk_sock_map_release(&g_map, spdk_env_get_current_core());
    1930           0 :         }
    1931             : 
    1932           0 :         free(group);
    1933           0 :         return 0;
    1934           0 : }
    1935             : 
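                      : /* Synchronous flush: send the queued requests directly with sendmsg() rather
                      :  * than through the ring. This is the path used by the public flush API and for
                      :  * sockets that are not part of a polling group. */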
    1936             : static int
    1937           6 : uring_sock_flush(struct spdk_sock *_sock)
    1938             : {
    1939           6 :         struct spdk_uring_sock *sock = __uring_sock(_sock);
    1940           6 :         struct msghdr msg = {};
    1941           6 :         struct iovec iovs[IOV_BATCH_SIZE];
    1942           6 :         int iovcnt;
    1943           6 :         ssize_t rc;
    1944           6 :         int flags = sock->zcopy_send_flags;
    1945           6 :         int retval;
    1946           6 :         bool is_zcopy = false;
    1947           6 :         struct spdk_uring_task *task = &sock->errqueue_task;
    1948             : 
    1949             :         /* Can't flush from within a callback or we end up with recursive calls */
    1950           6 :         if (_sock->cb_cnt > 0) {
    1951           0 :                 errno = EAGAIN;
    1952           0 :                 return -1;
    1953             :         }
    1954             : 
    1955             :         /* Can't flush while a write is already outstanding */
    1956           6 :         if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
    1957           0 :                 errno = EAGAIN;
    1958           0 :                 return -1;
    1959             :         }
    1960             : 
    1961             :         /* Gather an iov */
    1962           6 :         iovcnt = spdk_sock_prep_reqs(_sock, iovs, 0, NULL, &flags);
    1963           6 :         if (iovcnt == 0) {
    1964             :                 /* Nothing to send */
    1965           0 :                 return 0;
    1966             :         }
    1967             : 
    1968             :         /* Perform the vectored write */
    1969           6 :         msg.msg_iov = iovs;
    1970           6 :         msg.msg_iovlen = iovcnt;
    1971           6 :         rc = sendmsg(sock->fd, &msg, flags | MSG_DONTWAIT);
    1972           6 :         if (rc <= 0) {
    1973           0 :                 if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && sock->zcopy)) {
    1974           0 :                         errno = EAGAIN;
    1975           0 :                 }
    1976           0 :                 return -1;
    1977             :         }
    1978             : 
    1979             : #ifdef SPDK_ZEROCOPY
    1980           6 :         is_zcopy = flags & MSG_ZEROCOPY;
    1981             : #endif
    1982           6 :         retval = sock_complete_write_reqs(_sock, rc, is_zcopy);
    1983           6 :         if (retval < 0) {
    1984             :                 /* if the socket is closed, return to avoid heap-use-after-free error */
    1985           0 :                 errno = ENOTCONN;
    1986           0 :                 return -1;
    1987             :         }
    1988             : 
    1989             : #ifdef SPDK_ZEROCOPY
     1990             :         /* Check for zero copy completions at least once */
    1991           6 :         if (sock->zcopy && !TAILQ_EMPTY(&_sock->pending_reqs)) {
    1992           0 :                 retval = recvmsg(sock->fd, &task->msg, MSG_ERRQUEUE);
    1993           0 :                 if (retval < 0) {
    1994           0 :                         if (errno == EWOULDBLOCK || errno == EAGAIN) {
    1995           0 :                                 return rc;
    1996             :                         }
    1997           0 :                 }
     1998           0 :                 _sock_check_zcopy(_sock, retval);
    1999           0 :         }
    2000             : #endif
    2001             : 
    2002           6 :         return rc;
    2003           6 : }
    2004             : 
    2005             : static struct spdk_net_impl g_uring_net_impl = {
    2006             :         .name           = "uring",
    2007             :         .getaddr        = uring_sock_getaddr,
    2008             :         .connect        = uring_sock_connect,
    2009             :         .listen         = uring_sock_listen,
    2010             :         .accept         = uring_sock_accept,
    2011             :         .close          = uring_sock_close,
    2012             :         .recv           = uring_sock_recv,
    2013             :         .readv          = uring_sock_readv,
    2014             :         .writev         = uring_sock_writev,
    2015             :         .recv_next      = uring_sock_recv_next,
    2016             :         .writev_async   = uring_sock_writev_async,
    2017             :         .flush          = uring_sock_flush,
    2018             :         .set_recvlowat  = uring_sock_set_recvlowat,
    2019             :         .set_recvbuf    = uring_sock_set_recvbuf,
    2020             :         .set_sendbuf    = uring_sock_set_sendbuf,
    2021             :         .is_ipv6        = uring_sock_is_ipv6,
    2022             :         .is_ipv4        = uring_sock_is_ipv4,
    2023             :         .is_connected   = uring_sock_is_connected,
    2024             :         .group_impl_get_optimal = uring_sock_group_impl_get_optimal,
    2025             :         .group_impl_create      = uring_sock_group_impl_create,
    2026             :         .group_impl_add_sock    = uring_sock_group_impl_add_sock,
    2027             :         .group_impl_remove_sock = uring_sock_group_impl_remove_sock,
    2028             :         .group_impl_poll        = uring_sock_group_impl_poll,
    2029             :         .group_impl_close       = uring_sock_group_impl_close,
    2030             :         .get_opts               = uring_sock_impl_get_opts,
    2031             :         .set_opts               = uring_sock_impl_set_opts,
    2032             : };
    2033             : 
    2034           1 : SPDK_NET_IMPL_REGISTER(uring, &g_uring_net_impl, DEFAULT_SOCK_PRIORITY + 2);

Generated by: LCOV version 1.15