Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2018 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : */
6 :
7 : #include "spdk/stdinc.h"
8 :
9 : #if defined(__FreeBSD__)
10 : #include <sys/event.h>
11 : #define SPDK_KEVENT
12 : #else
13 : #include <sys/epoll.h>
14 : #define SPDK_EPOLL
15 : #endif
16 :
17 : #if defined(__linux__)
18 : #include <linux/errqueue.h>
19 : #endif
20 :
21 : #include "spdk/env.h"
22 : #include "spdk/log.h"
23 : #include "spdk/pipe.h"
24 : #include "spdk/sock.h"
25 : #include "spdk/util.h"
26 : #include "spdk/string.h"
27 : #include "spdk_internal/sock.h"
28 : #include "../sock_kernel.h"
29 :
30 : #include "openssl/crypto.h"
31 : #include "openssl/err.h"
32 : #include "openssl/ssl.h"
33 :
34 : #define MAX_TMPBUF 1024
35 : #define PORTNUMLEN 32
36 :
37 : #if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY)
38 : #define SPDK_ZEROCOPY
39 : #endif
40 :
41 : struct spdk_posix_sock {
42 : struct spdk_sock base;
43 : int fd;
44 :
45 : uint32_t sendmsg_idx;
46 :
47 : struct spdk_pipe *recv_pipe;
48 : void *recv_buf;
49 : int recv_buf_sz;
50 : bool pipe_has_data;
51 : bool socket_has_data;
52 : bool zcopy;
53 :
54 : int placement_id;
55 :
56 : SSL_CTX *ctx;
57 : SSL *ssl;
58 :
59 : TAILQ_ENTRY(spdk_posix_sock) link;
60 : };
61 :
62 : TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock);
63 :
64 : struct spdk_posix_sock_group_impl {
65 : struct spdk_sock_group_impl base;
66 : int fd;
67 : struct spdk_has_data_list socks_with_data;
68 : int placement_id;
69 : };
70 :
71 : static struct spdk_sock_impl_opts g_posix_impl_opts = {
72 : .recv_buf_size = DEFAULT_SO_RCVBUF_SIZE,
73 : .send_buf_size = DEFAULT_SO_SNDBUF_SIZE,
74 : .enable_recv_pipe = true,
75 : .enable_quickack = false,
76 : .enable_placement_id = PLACEMENT_NONE,
77 : .enable_zerocopy_send_server = true,
78 : .enable_zerocopy_send_client = false,
79 : .zerocopy_threshold = 0,
80 : .tls_version = 0,
81 : .enable_ktls = false,
82 : .psk_key = NULL,
83 : .psk_key_size = 0,
84 : .psk_identity = NULL,
85 : .get_key = NULL,
86 : .get_key_ctx = NULL,
87 : .tls_cipher_suites = NULL
88 : };
89 :
90 : static struct spdk_sock_impl_opts g_ssl_impl_opts = {
91 : .recv_buf_size = MIN_SO_RCVBUF_SIZE,
92 : .send_buf_size = MIN_SO_SNDBUF_SIZE,
93 : .enable_recv_pipe = true,
94 : .enable_quickack = false,
95 : .enable_placement_id = PLACEMENT_NONE,
96 : .enable_zerocopy_send_server = true,
97 : .enable_zerocopy_send_client = false,
98 : .zerocopy_threshold = 0,
99 : .tls_version = 0,
100 : .enable_ktls = false,
101 : .psk_key = NULL,
102 : .psk_identity = NULL
103 : };
104 :
105 : static struct spdk_sock_map g_map = {
106 : .entries = STAILQ_HEAD_INITIALIZER(g_map.entries),
107 : .mtx = PTHREAD_MUTEX_INITIALIZER
108 : };
109 :
110 : __attribute((destructor)) static void
111 2 : posix_sock_map_cleanup(void)
112 : {
113 2 : spdk_sock_map_cleanup(&g_map);
114 2 : }
115 :
/*
 * Downcasts from the generic sock / sock-group types to the posix ones.
 * Both the argument and the whole expansion are parenthesized so that the
 * macros compose safely, e.g. __posix_sock(s)->fd or __posix_sock(a + 1).
 */
#define __posix_sock(sock) ((struct spdk_posix_sock *)(sock))
#define __posix_group_impl(group) ((struct spdk_posix_sock_group_impl *)(group))
118 :
119 : static void
120 16 : posix_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src,
121 : size_t len)
122 : {
123 : #define FIELD_OK(field) \
124 : offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len
125 :
126 : #define SET_FIELD(field) \
127 : if (FIELD_OK(field)) { \
128 : dest->field = src->field; \
129 : }
130 :
131 16 : SET_FIELD(recv_buf_size);
132 16 : SET_FIELD(send_buf_size);
133 16 : SET_FIELD(enable_recv_pipe);
134 16 : SET_FIELD(enable_zerocopy_send);
135 16 : SET_FIELD(enable_quickack);
136 16 : SET_FIELD(enable_placement_id);
137 16 : SET_FIELD(enable_zerocopy_send_server);
138 16 : SET_FIELD(enable_zerocopy_send_client);
139 16 : SET_FIELD(zerocopy_threshold);
140 16 : SET_FIELD(tls_version);
141 16 : SET_FIELD(enable_ktls);
142 16 : SET_FIELD(psk_key);
143 16 : SET_FIELD(psk_key_size);
144 16 : SET_FIELD(psk_identity);
145 16 : SET_FIELD(get_key);
146 16 : SET_FIELD(get_key_ctx);
147 16 : SET_FIELD(tls_cipher_suites);
148 :
149 : #undef SET_FIELD
150 : #undef FIELD_OK
151 16 : }
152 :
153 : static int
154 9 : _sock_impl_get_opts(struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts,
155 : size_t *len)
156 : {
157 9 : if (!opts || !len) {
158 0 : errno = EINVAL;
159 0 : return -1;
160 : }
161 :
162 9 : assert(sizeof(*opts) >= *len);
163 9 : memset(opts, 0, *len);
164 :
165 9 : posix_sock_copy_impl_opts(opts, impl_opts, *len);
166 9 : *len = spdk_min(*len, sizeof(*impl_opts));
167 :
168 9 : return 0;
169 : }
170 :
171 : static int
172 9 : posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len)
173 : {
174 9 : return _sock_impl_get_opts(opts, &g_posix_impl_opts, len);
175 : }
176 :
177 : static int
178 0 : ssl_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len)
179 : {
180 0 : return _sock_impl_get_opts(opts, &g_ssl_impl_opts, len);
181 : }
182 :
183 : static int
184 3 : _sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, struct spdk_sock_impl_opts *impl_opts,
185 : size_t len)
186 : {
187 3 : if (!opts) {
188 0 : errno = EINVAL;
189 0 : return -1;
190 : }
191 :
192 3 : assert(sizeof(*opts) >= len);
193 3 : posix_sock_copy_impl_opts(impl_opts, opts, len);
194 :
195 3 : return 0;
196 : }
197 :
198 : static int
199 3 : posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len)
200 : {
201 3 : return _sock_impl_set_opts(opts, &g_posix_impl_opts, len);
202 : }
203 :
204 : static int
205 0 : ssl_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len)
206 : {
207 0 : return _sock_impl_set_opts(opts, &g_ssl_impl_opts, len);
208 : }
209 :
210 : static void
211 14 : _opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest,
212 : const struct spdk_sock_impl_opts *default_impl)
213 : {
214 : /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */
215 14 : memcpy(dest, default_impl, sizeof(*dest));
216 :
217 14 : if (opts->impl_opts != NULL) {
218 4 : assert(sizeof(*dest) >= opts->impl_opts_size);
219 4 : posix_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size);
220 : }
221 14 : }
222 :
223 : static int
224 0 : posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport,
225 : char *caddr, int clen, uint16_t *cport)
226 : {
227 0 : struct spdk_posix_sock *sock = __posix_sock(_sock);
228 0 : struct sockaddr_storage sa;
229 0 : socklen_t salen;
230 : int rc;
231 :
232 0 : assert(sock != NULL);
233 :
234 0 : memset(&sa, 0, sizeof sa);
235 0 : salen = sizeof sa;
236 0 : rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
237 0 : if (rc != 0) {
238 0 : SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
239 0 : return -1;
240 : }
241 :
242 0 : switch (sa.ss_family) {
243 0 : case AF_UNIX:
244 : /* Acceptable connection types that don't have IPs */
245 0 : return 0;
246 0 : case AF_INET:
247 : case AF_INET6:
248 : /* Code below will get IP addresses */
249 0 : break;
250 0 : default:
251 : /* Unsupported socket family */
252 0 : return -1;
253 : }
254 :
255 0 : rc = get_addr_str((struct sockaddr *)&sa, saddr, slen);
256 0 : if (rc != 0) {
257 0 : SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno);
258 0 : return -1;
259 : }
260 :
261 0 : if (sport) {
262 0 : if (sa.ss_family == AF_INET) {
263 0 : *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
264 0 : } else if (sa.ss_family == AF_INET6) {
265 0 : *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
266 : }
267 : }
268 :
269 0 : memset(&sa, 0, sizeof sa);
270 0 : salen = sizeof sa;
271 0 : rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen);
272 0 : if (rc != 0) {
273 0 : SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno);
274 0 : return -1;
275 : }
276 :
277 0 : rc = get_addr_str((struct sockaddr *)&sa, caddr, clen);
278 0 : if (rc != 0) {
279 0 : SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno);
280 0 : return -1;
281 : }
282 :
283 0 : if (cport) {
284 0 : if (sa.ss_family == AF_INET) {
285 0 : *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
286 0 : } else if (sa.ss_family == AF_INET6) {
287 0 : *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
288 : }
289 : }
290 :
291 0 : return 0;
292 : }
293 :
/* Selects what posix_sock_create() does with the new descriptor. */
enum posix_sock_create_type {
	/* bind() + listen() */
	SPDK_SOCK_CREATE_LISTEN,
	/* connect() */
	SPDK_SOCK_CREATE_CONNECT,
};
298 :
299 : static int
300 1 : posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz)
301 : {
302 1 : uint8_t *new_buf;
303 : struct spdk_pipe *new_pipe;
304 1 : struct iovec siov[2];
305 1 : struct iovec diov[2];
306 : int sbytes;
307 : ssize_t bytes;
308 : int rc;
309 :
310 1 : if (sock->recv_buf_sz == sz) {
311 0 : return 0;
312 : }
313 :
314 : /* If the new size is 0, just free the pipe */
315 1 : if (sz == 0) {
316 0 : spdk_pipe_destroy(sock->recv_pipe);
317 0 : free(sock->recv_buf);
318 0 : sock->recv_pipe = NULL;
319 0 : sock->recv_buf = NULL;
320 0 : return 0;
321 1 : } else if (sz < MIN_SOCK_PIPE_SIZE) {
322 0 : SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE);
323 0 : return -1;
324 : }
325 :
326 : /* Round up to next 64 byte multiple */
327 1 : rc = posix_memalign((void **)&new_buf, 64, sz);
328 1 : if (rc != 0) {
329 0 : SPDK_ERRLOG("socket recv buf allocation failed\n");
330 0 : return -ENOMEM;
331 : }
332 1 : memset(new_buf, 0, sz);
333 :
334 1 : new_pipe = spdk_pipe_create(new_buf, sz);
335 1 : if (new_pipe == NULL) {
336 0 : SPDK_ERRLOG("socket pipe allocation failed\n");
337 0 : free(new_buf);
338 0 : return -ENOMEM;
339 : }
340 :
341 1 : if (sock->recv_pipe != NULL) {
342 : /* Pull all of the data out of the old pipe */
343 0 : sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
344 0 : if (sbytes > sz) {
345 : /* Too much data to fit into the new pipe size */
346 0 : spdk_pipe_destroy(new_pipe);
347 0 : free(new_buf);
348 0 : return -EINVAL;
349 : }
350 :
351 0 : sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov);
352 0 : assert(sbytes == sz);
353 :
354 0 : bytes = spdk_iovcpy(siov, 2, diov, 2);
355 0 : spdk_pipe_writer_advance(new_pipe, bytes);
356 :
357 0 : spdk_pipe_destroy(sock->recv_pipe);
358 0 : free(sock->recv_buf);
359 : }
360 :
361 1 : sock->recv_buf_sz = sz;
362 1 : sock->recv_buf = new_buf;
363 1 : sock->recv_pipe = new_pipe;
364 :
365 1 : return 0;
366 : }
367 :
368 : static int
369 1 : posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz)
370 : {
371 1 : struct spdk_posix_sock *sock = __posix_sock(_sock);
372 : int min_size;
373 : int rc;
374 :
375 1 : assert(sock != NULL);
376 :
377 1 : if (_sock->impl_opts.enable_recv_pipe) {
378 1 : rc = posix_sock_alloc_pipe(sock, sz);
379 1 : if (rc) {
380 0 : return rc;
381 : }
382 : }
383 :
384 : /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and
385 : * _sock->impl_opts.recv_buf_size. */
386 1 : min_size = spdk_max(MIN_SO_RCVBUF_SIZE, _sock->impl_opts.recv_buf_size);
387 :
388 1 : if (sz < min_size) {
389 1 : sz = min_size;
390 : }
391 :
392 1 : rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
393 1 : if (rc < 0) {
394 0 : return rc;
395 : }
396 :
397 1 : _sock->impl_opts.recv_buf_size = sz;
398 :
399 1 : return 0;
400 : }
401 :
402 : static int
403 1 : posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz)
404 : {
405 1 : struct spdk_posix_sock *sock = __posix_sock(_sock);
406 : int min_size;
407 : int rc;
408 :
409 1 : assert(sock != NULL);
410 :
411 : /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and
412 : * _sock->impl_opts.send_buf_size. */
413 1 : min_size = spdk_max(MIN_SO_SNDBUF_SIZE, _sock->impl_opts.send_buf_size);
414 :
415 1 : if (sz < min_size) {
416 1 : sz = min_size;
417 : }
418 :
419 1 : rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
420 1 : if (rc < 0) {
421 0 : return rc;
422 : }
423 :
424 1 : _sock->impl_opts.send_buf_size = sz;
425 :
426 1 : return 0;
427 : }
428 :
429 : static void
430 21 : posix_sock_init(struct spdk_posix_sock *sock, bool enable_zero_copy)
431 : {
432 : #if defined(SPDK_ZEROCOPY) || defined(__linux__)
433 21 : int flag;
434 : int rc;
435 : #endif
436 :
437 : #if defined(SPDK_ZEROCOPY)
438 : flag = 1;
439 :
440 : if (enable_zero_copy) {
441 : /* Try to turn on zero copy sends */
442 : rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag));
443 : if (rc == 0) {
444 : sock->zcopy = true;
445 : }
446 : }
447 : #endif
448 :
449 : #if defined(__linux__)
450 21 : flag = 1;
451 :
452 21 : if (sock->base.impl_opts.enable_quickack) {
453 0 : rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag));
454 0 : if (rc != 0) {
455 0 : SPDK_ERRLOG("quickack was failed to set\n");
456 : }
457 : }
458 :
459 21 : spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id,
460 : &sock->placement_id);
461 :
462 21 : if (sock->base.impl_opts.enable_placement_id == PLACEMENT_MARK) {
463 : /* Save placement_id */
464 0 : spdk_sock_map_insert(&g_map, sock->placement_id, NULL);
465 : }
466 : #endif
467 21 : }
468 :
469 : static struct spdk_posix_sock *
470 21 : posix_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy)
471 : {
472 : struct spdk_posix_sock *sock;
473 :
474 21 : sock = calloc(1, sizeof(*sock));
475 21 : if (sock == NULL) {
476 0 : SPDK_ERRLOG("sock allocation failed\n");
477 0 : return NULL;
478 : }
479 :
480 21 : sock->fd = fd;
481 21 : memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts));
482 21 : posix_sock_init(sock, enable_zero_copy);
483 :
484 21 : return sock;
485 : }
486 :
487 : static int
488 14 : posix_fd_create(struct addrinfo *res, struct spdk_sock_opts *opts,
489 : struct spdk_sock_impl_opts *impl_opts)
490 : {
491 : int fd;
492 14 : int val = 1;
493 14 : int rc, sz;
494 : #if defined(__linux__)
495 14 : int to;
496 : #endif
497 :
498 14 : fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
499 14 : if (fd < 0) {
500 : /* error */
501 0 : return -1;
502 : }
503 :
504 14 : sz = impl_opts->recv_buf_size;
505 14 : rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
506 : if (rc) {
507 : /* Not fatal */
508 : }
509 :
510 14 : sz = impl_opts->send_buf_size;
511 14 : rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
512 : if (rc) {
513 : /* Not fatal */
514 : }
515 :
516 14 : rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val);
517 14 : if (rc != 0) {
518 0 : close(fd);
519 : /* error */
520 0 : return -1;
521 : }
522 14 : rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val);
523 14 : if (rc != 0) {
524 0 : close(fd);
525 : /* error */
526 0 : return -1;
527 : }
528 :
529 : #if defined(SO_PRIORITY)
530 14 : if (opts->priority) {
531 0 : rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val);
532 0 : if (rc != 0) {
533 0 : close(fd);
534 : /* error */
535 0 : return -1;
536 : }
537 : }
538 : #endif
539 :
540 14 : if (res->ai_family == AF_INET6) {
541 0 : rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val);
542 0 : if (rc != 0) {
543 0 : close(fd);
544 : /* error */
545 0 : return -1;
546 : }
547 : }
548 :
549 14 : if (opts->ack_timeout) {
550 : #if defined(__linux__)
551 0 : to = opts->ack_timeout;
552 0 : rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &to, sizeof(to));
553 0 : if (rc != 0) {
554 0 : close(fd);
555 : /* error */
556 0 : return -1;
557 : }
558 : #else
559 : SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n");
560 : #endif
561 : }
562 :
563 14 : return fd;
564 : }
565 :
566 : static int
567 0 : posix_sock_psk_find_session_server_cb(SSL *ssl, const unsigned char *identity,
568 : size_t identity_len, SSL_SESSION **sess)
569 : {
570 0 : struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl);
571 0 : uint8_t key[SSL_MAX_MASTER_KEY_LENGTH] = {};
572 : int keylen;
573 : int rc, i;
574 : STACK_OF(SSL_CIPHER) *ciphers;
575 : const SSL_CIPHER *cipher;
576 : const char *cipher_name;
577 0 : const char *user_cipher = NULL;
578 0 : bool found = false;
579 :
580 0 : if (impl_opts->get_key) {
581 0 : rc = impl_opts->get_key(key, sizeof(key), &user_cipher, identity, impl_opts->get_key_ctx);
582 0 : if (rc < 0) {
583 0 : SPDK_ERRLOG("Unable to find PSK for identity: %s\n", identity);
584 0 : return 0;
585 : }
586 0 : keylen = rc;
587 : } else {
588 0 : if (impl_opts->psk_key == NULL) {
589 0 : SPDK_ERRLOG("PSK is not set\n");
590 0 : return 0;
591 : }
592 :
593 0 : SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK ID %lu\n", strlen(impl_opts->psk_identity));
594 0 : if (strcmp(impl_opts->psk_identity, identity) != 0) {
595 0 : SPDK_ERRLOG("Unknown Client's PSK ID\n");
596 0 : return 0;
597 : }
598 0 : keylen = impl_opts->psk_key_size;
599 :
600 0 : memcpy(key, impl_opts->psk_key, keylen);
601 0 : user_cipher = impl_opts->tls_cipher_suites;
602 : }
603 :
604 0 : if (user_cipher == NULL) {
605 0 : SPDK_ERRLOG("Cipher suite not set\n");
606 0 : return 0;
607 : }
608 :
609 0 : *sess = SSL_SESSION_new();
610 0 : if (*sess == NULL) {
611 0 : SPDK_ERRLOG("Unable to allocate new SSL session\n");
612 0 : return 0;
613 : }
614 :
615 0 : ciphers = SSL_get_ciphers(ssl);
616 0 : for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) {
617 0 : cipher = sk_SSL_CIPHER_value(ciphers, i);
618 0 : cipher_name = SSL_CIPHER_get_name(cipher);
619 :
620 0 : if (strcmp(user_cipher, cipher_name) == 0) {
621 0 : rc = SSL_SESSION_set_cipher(*sess, cipher);
622 0 : if (rc != 1) {
623 0 : SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name);
624 0 : goto err;
625 : }
626 0 : found = true;
627 0 : break;
628 : }
629 : }
630 0 : if (found == false) {
631 0 : SPDK_ERRLOG("No suitable cipher found\n");
632 0 : goto err;
633 : }
634 :
635 0 : SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name);
636 :
637 0 : rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION);
638 0 : if (rc != 1) {
639 0 : SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION);
640 0 : goto err;
641 : }
642 :
643 0 : rc = SSL_SESSION_set1_master_key(*sess, key, keylen);
644 0 : if (rc != 1) {
645 0 : SPDK_ERRLOG("Unable to set PSK for session\n");
646 0 : goto err;
647 : }
648 :
649 0 : return 1;
650 :
651 0 : err:
652 0 : SSL_SESSION_free(*sess);
653 0 : *sess = NULL;
654 0 : return 0;
655 : }
656 :
657 : static int
658 0 : posix_sock_psk_use_session_client_cb(SSL *ssl, const EVP_MD *md, const unsigned char **identity,
659 : size_t *identity_len, SSL_SESSION **sess)
660 : {
661 0 : struct spdk_sock_impl_opts *impl_opts = SSL_get_app_data(ssl);
662 : int rc, i;
663 : STACK_OF(SSL_CIPHER) *ciphers;
664 : const SSL_CIPHER *cipher;
665 : const char *cipher_name;
666 : long keylen;
667 0 : bool found = false;
668 :
669 0 : if (impl_opts->psk_key == NULL) {
670 0 : SPDK_ERRLOG("PSK is not set\n");
671 0 : return 0;
672 : }
673 0 : if (impl_opts->psk_key_size > SSL_MAX_MASTER_KEY_LENGTH) {
674 0 : SPDK_ERRLOG("PSK too long\n");
675 0 : return 0;
676 : }
677 0 : keylen = impl_opts->psk_key_size;
678 :
679 0 : if (impl_opts->tls_cipher_suites == NULL) {
680 0 : SPDK_ERRLOG("Cipher suite not set\n");
681 0 : return 0;
682 : }
683 0 : *sess = SSL_SESSION_new();
684 0 : if (*sess == NULL) {
685 0 : SPDK_ERRLOG("Unable to allocate new SSL session\n");
686 0 : return 0;
687 : }
688 :
689 0 : ciphers = SSL_get_ciphers(ssl);
690 0 : for (i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) {
691 0 : cipher = sk_SSL_CIPHER_value(ciphers, i);
692 0 : cipher_name = SSL_CIPHER_get_name(cipher);
693 :
694 0 : if (strcmp(impl_opts->tls_cipher_suites, cipher_name) == 0) {
695 0 : rc = SSL_SESSION_set_cipher(*sess, cipher);
696 0 : if (rc != 1) {
697 0 : SPDK_ERRLOG("Unable to set cipher: %s\n", cipher_name);
698 0 : goto err;
699 : }
700 0 : found = true;
701 0 : break;
702 : }
703 : }
704 0 : if (found == false) {
705 0 : SPDK_ERRLOG("No suitable cipher found\n");
706 0 : goto err;
707 : }
708 :
709 0 : SPDK_DEBUGLOG(sock_posix, "Cipher selected: %s\n", cipher_name);
710 :
711 0 : rc = SSL_SESSION_set_protocol_version(*sess, TLS1_3_VERSION);
712 0 : if (rc != 1) {
713 0 : SPDK_ERRLOG("Unable to set TLS version: %d\n", TLS1_3_VERSION);
714 0 : goto err;
715 : }
716 :
717 0 : rc = SSL_SESSION_set1_master_key(*sess, impl_opts->psk_key, keylen);
718 0 : if (rc != 1) {
719 0 : SPDK_ERRLOG("Unable to set PSK for session\n");
720 0 : goto err;
721 : }
722 :
723 0 : *identity_len = strlen(impl_opts->psk_identity);
724 0 : *identity = impl_opts->psk_identity;
725 :
726 0 : return 1;
727 :
728 0 : err:
729 0 : SSL_SESSION_free(*sess);
730 0 : *sess = NULL;
731 0 : return 0;
732 : }
733 :
734 : static SSL_CTX *
735 0 : posix_sock_create_ssl_context(const SSL_METHOD *method, struct spdk_sock_opts *opts,
736 : struct spdk_sock_impl_opts *impl_opts)
737 : {
738 : SSL_CTX *ctx;
739 0 : int tls_version = 0;
740 0 : bool ktls_enabled = false;
741 : #ifdef SSL_OP_ENABLE_KTLS
742 : long options;
743 : #endif
744 :
745 0 : SSL_library_init();
746 0 : OpenSSL_add_all_algorithms();
747 0 : SSL_load_error_strings();
748 : /* Produce a SSL CTX in SSL V2 and V3 standards compliant way */
749 0 : ctx = SSL_CTX_new(method);
750 0 : if (!ctx) {
751 0 : SPDK_ERRLOG("SSL_CTX_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL));
752 0 : return NULL;
753 : }
754 0 : SPDK_DEBUGLOG(sock_posix, "SSL context created\n");
755 :
756 0 : switch (impl_opts->tls_version) {
757 0 : case 0:
758 : /* auto-negotioation */
759 0 : break;
760 0 : case SPDK_TLS_VERSION_1_3:
761 0 : tls_version = TLS1_3_VERSION;
762 0 : break;
763 0 : default:
764 0 : SPDK_ERRLOG("Incorrect TLS version provided: %d\n", impl_opts->tls_version);
765 0 : goto err;
766 : }
767 :
768 0 : if (tls_version) {
769 0 : SPDK_DEBUGLOG(sock_posix, "Hardening TLS version to '%d'='0x%X'\n", impl_opts->tls_version,
770 : tls_version);
771 0 : if (!SSL_CTX_set_min_proto_version(ctx, tls_version)) {
772 0 : SPDK_ERRLOG("Unable to set Min TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version);
773 0 : goto err;
774 : }
775 0 : if (!SSL_CTX_set_max_proto_version(ctx, tls_version)) {
776 0 : SPDK_ERRLOG("Unable to set Max TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version);
777 0 : goto err;
778 : }
779 : }
780 0 : if (impl_opts->enable_ktls) {
781 0 : SPDK_DEBUGLOG(sock_posix, "Enabling kTLS offload\n");
782 : #ifdef SSL_OP_ENABLE_KTLS
783 : options = SSL_CTX_set_options(ctx, SSL_OP_ENABLE_KTLS);
784 : ktls_enabled = options & SSL_OP_ENABLE_KTLS;
785 : #else
786 0 : ktls_enabled = false;
787 : #endif
788 0 : if (!ktls_enabled) {
789 0 : SPDK_ERRLOG("Unable to set kTLS offload via SSL_CTX_set_options(). Configure openssl with 'enable-ktls'\n");
790 0 : goto err;
791 : }
792 : }
793 :
794 : /* SSL_CTX_set_ciphersuites() return 1 if the requested
795 : * cipher suite list was configured, and 0 otherwise. */
796 0 : if (impl_opts->tls_cipher_suites != NULL &&
797 0 : SSL_CTX_set_ciphersuites(ctx, impl_opts->tls_cipher_suites) != 1) {
798 0 : SPDK_ERRLOG("Unable to set TLS cipher suites for SSL'\n");
799 0 : goto err;
800 : }
801 :
802 0 : return ctx;
803 :
804 0 : err:
805 0 : SSL_CTX_free(ctx);
806 0 : return NULL;
807 : }
808 :
809 : static SSL *
810 0 : ssl_sock_setup_connect(SSL_CTX *ctx, int fd)
811 : {
812 : SSL *ssl;
813 :
814 0 : ssl = SSL_new(ctx);
815 0 : if (!ssl) {
816 0 : SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL));
817 0 : return NULL;
818 : }
819 0 : SSL_set_fd(ssl, fd);
820 0 : SSL_set_connect_state(ssl);
821 0 : SSL_set_psk_use_session_callback(ssl, posix_sock_psk_use_session_client_cb);
822 0 : SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl);
823 0 : SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl);
824 0 : SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl);
825 0 : SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n",
826 : SSL_CIPHER_get_name(SSL_get_current_cipher(ssl)));
827 0 : return ssl;
828 : }
829 :
830 : static SSL *
831 0 : ssl_sock_setup_accept(SSL_CTX *ctx, int fd)
832 : {
833 : SSL *ssl;
834 :
835 0 : ssl = SSL_new(ctx);
836 0 : if (!ssl) {
837 0 : SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL));
838 0 : return NULL;
839 : }
840 0 : SSL_set_fd(ssl, fd);
841 0 : SSL_set_accept_state(ssl);
842 0 : SSL_set_psk_find_session_callback(ssl, posix_sock_psk_find_session_server_cb);
843 0 : SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl);
844 0 : SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl);
845 0 : SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl);
846 0 : SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n",
847 : SSL_CIPHER_get_name(SSL_get_current_cipher(ssl)));
848 0 : return ssl;
849 : }
850 :
851 : static ssize_t
852 0 : SSL_readv(SSL *ssl, const struct iovec *iov, int iovcnt)
853 : {
854 0 : int i, rc = 0;
855 0 : ssize_t total = 0;
856 :
857 0 : for (i = 0; i < iovcnt; i++) {
858 0 : rc = SSL_read(ssl, iov[i].iov_base, iov[i].iov_len);
859 :
860 0 : if (rc > 0) {
861 0 : total += rc;
862 : }
863 0 : if (rc != (int)iov[i].iov_len) {
864 0 : break;
865 : }
866 : }
867 0 : if (total > 0) {
868 0 : errno = 0;
869 0 : return total;
870 : }
871 0 : switch (SSL_get_error(ssl, rc)) {
872 0 : case SSL_ERROR_ZERO_RETURN:
873 0 : errno = ENOTCONN;
874 0 : return 0;
875 0 : case SSL_ERROR_WANT_READ:
876 : case SSL_ERROR_WANT_WRITE:
877 : case SSL_ERROR_WANT_CONNECT:
878 : case SSL_ERROR_WANT_ACCEPT:
879 : case SSL_ERROR_WANT_X509_LOOKUP:
880 : case SSL_ERROR_WANT_ASYNC:
881 : case SSL_ERROR_WANT_ASYNC_JOB:
882 : case SSL_ERROR_WANT_CLIENT_HELLO_CB:
883 0 : errno = EAGAIN;
884 0 : return -1;
885 0 : case SSL_ERROR_SYSCALL:
886 : case SSL_ERROR_SSL:
887 0 : errno = ENOTCONN;
888 0 : return -1;
889 0 : default:
890 0 : errno = ENOTCONN;
891 0 : return -1;
892 : }
893 : }
894 :
895 : static ssize_t
896 0 : SSL_writev(SSL *ssl, struct iovec *iov, int iovcnt)
897 : {
898 0 : int i, rc = 0;
899 0 : ssize_t total = 0;
900 :
901 0 : for (i = 0; i < iovcnt; i++) {
902 0 : rc = SSL_write(ssl, iov[i].iov_base, iov[i].iov_len);
903 :
904 0 : if (rc > 0) {
905 0 : total += rc;
906 : }
907 0 : if (rc != (int)iov[i].iov_len) {
908 0 : break;
909 : }
910 : }
911 0 : if (total > 0) {
912 0 : errno = 0;
913 0 : return total;
914 : }
915 0 : switch (SSL_get_error(ssl, rc)) {
916 0 : case SSL_ERROR_ZERO_RETURN:
917 0 : errno = ENOTCONN;
918 0 : return 0;
919 0 : case SSL_ERROR_WANT_READ:
920 : case SSL_ERROR_WANT_WRITE:
921 : case SSL_ERROR_WANT_CONNECT:
922 : case SSL_ERROR_WANT_ACCEPT:
923 : case SSL_ERROR_WANT_X509_LOOKUP:
924 : case SSL_ERROR_WANT_ASYNC:
925 : case SSL_ERROR_WANT_ASYNC_JOB:
926 : case SSL_ERROR_WANT_CLIENT_HELLO_CB:
927 0 : errno = EAGAIN;
928 0 : return -1;
929 0 : case SSL_ERROR_SYSCALL:
930 : case SSL_ERROR_SSL:
931 0 : errno = ENOTCONN;
932 0 : return -1;
933 0 : default:
934 0 : errno = ENOTCONN;
935 0 : return -1;
936 : }
937 : }
938 :
/*
 * Create a listening or connected posix socket.
 *
 * ip:         numeric IPv4/IPv6 address; an IPv6 address may be wrapped in
 *             '[' ... ']'. NULL makes the function fail.
 * port:       numeric port.
 * type:       SPDK_SOCK_CREATE_LISTEN (bind + listen) or
 *             SPDK_SOCK_CREATE_CONNECT (connect).
 * opts:       generic socket options; must not be NULL. Its impl_opts, when
 *             present, override the implementation defaults.
 * enable_ssl: use the TLS defaults and, for connecting sockets, set up an
 *             SSL context and SSL object on top of the descriptor.
 *
 * The descriptor is switched to non-blocking mode. Zero copy is requested
 * only when opts->zcopy is set, the socket is neither loopback nor TLS, and
 * the implementation option for the given role allows it.
 *
 * Returns the generic socket on success, NULL on failure.
 */
static struct spdk_sock *
posix_sock_create(const char *ip, int port,
		  enum posix_sock_create_type type,
		  struct spdk_sock_opts *opts,
		  bool enable_ssl)
{
	struct spdk_posix_sock *sock;
	struct spdk_sock_impl_opts impl_opts;
	char host[MAX_TMPBUF];
	char service[PORTNUMLEN];
	char *bracket;
	struct addrinfo hints, *ai, *ai_list;
	int fd, fl;
	int rc;
	int bind_errno;
	bool zcopy_user;
	bool zcopy_impl = true;
	SSL_CTX *ctx = NULL;
	SSL *ssl = NULL;

	assert(opts != NULL);
	_opts_get_impl_opts(opts, &impl_opts, enable_ssl ? &g_ssl_impl_opts : &g_posix_impl_opts);

	if (ip == NULL) {
		return NULL;
	}

	/* Strip the brackets of a "[v6-address]" literal. */
	if (ip[0] == '[') {
		snprintf(host, sizeof(host), "%s", ip + 1);
		bracket = strchr(host, ']');
		if (bracket != NULL) {
			*bracket = '\0';
		}
		ip = host;
	}

	snprintf(service, sizeof service, "%d", port);
	memset(&hints, 0, sizeof hints);
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_flags = AI_NUMERICSERV | AI_PASSIVE | AI_NUMERICHOST;
	rc = getaddrinfo(ip, service, &hints, &ai_list);
	if (rc != 0) {
		SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc);
		return NULL;
	}

	/* Walk the resolved addresses until one of them works. */
	fd = -1;
	for (ai = ai_list; ai != NULL; ai = ai->ai_next) {
retry:
		fd = posix_fd_create(ai, opts, &impl_opts);
		if (fd < 0) {
			continue;
		}

		if (type == SPDK_SOCK_CREATE_LISTEN) {
			rc = bind(fd, ai->ai_addr, ai->ai_addrlen);
			if (rc != 0) {
				SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno);
				bind_errno = errno;
				if (bind_errno == EINTR) {
					/* interrupted? try the same address again */
					close(fd);
					goto retry;
				}
				if (bind_errno == EADDRNOTAVAIL) {
					SPDK_ERRLOG("IP address %s not available. "
						    "Verify IP address in config file "
						    "and make sure setup script is "
						    "run before starting spdk app.\n", ip);
				}
				/* try next family */
				close(fd);
				fd = -1;
				continue;
			}

			/* bind OK */
			rc = listen(fd, 512);
			if (rc != 0) {
				SPDK_ERRLOG("listen() failed, errno = %d\n", errno);
				close(fd);
				fd = -1;
				break;
			}
			zcopy_impl = impl_opts.enable_zerocopy_send_server;
		} else if (type == SPDK_SOCK_CREATE_CONNECT) {
			rc = connect(fd, ai->ai_addr, ai->ai_addrlen);
			if (rc != 0) {
				SPDK_ERRLOG("connect() failed, errno = %d\n", errno);
				/* try next family */
				close(fd);
				fd = -1;
				continue;
			}
			zcopy_impl = impl_opts.enable_zerocopy_send_client;

			if (enable_ssl) {
				ctx = posix_sock_create_ssl_context(TLS_client_method(), opts, &impl_opts);
				if (!ctx) {
					SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno);
					close(fd);
					fd = -1;
					break;
				}
				ssl = ssl_sock_setup_connect(ctx, fd);
				if (!ssl) {
					SPDK_ERRLOG("ssl_sock_setup_connect() failed, errno = %d\n", errno);
					close(fd);
					fd = -1;
					SSL_CTX_free(ctx);
					break;
				}
			}
		}

		fl = fcntl(fd, F_GETFL);
		if (fcntl(fd, F_SETFL, fl | O_NONBLOCK) < 0) {
			SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
			SSL_free(ssl);
			SSL_CTX_free(ctx);
			close(fd);
			fd = -1;
		}
		/* Either way the search ends with the first usable address. */
		break;
	}
	freeaddrinfo(ai_list);

	if (fd < 0) {
		return NULL;
	}

	/* Only enable zero copy for non-loopback and non-ssl sockets. */
	zcopy_user = opts->zcopy && !sock_is_loopback(fd) && !enable_ssl;

	sock = posix_sock_alloc(fd, &impl_opts, zcopy_user && zcopy_impl);
	if (sock == NULL) {
		SPDK_ERRLOG("sock allocation failed\n");
		SSL_free(ssl);
		SSL_CTX_free(ctx);
		close(fd);
		return NULL;
	}

	if (ctx) {
		sock->ctx = ctx;
	}

	if (ssl) {
		sock->ssl = ssl;
		/* The PSK callbacks read the socket's impl_opts from the app data. */
		SSL_set_app_data(ssl, &sock->base.impl_opts);
	}

	return &sock->base;
}
1098 :
1099 : static struct spdk_sock *
1100 6 : posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts)
1101 : {
1102 6 : return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, false);
1103 : }
1104 :
1105 : static struct spdk_sock *
1106 8 : posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts)
1107 : {
1108 8 : return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, false);
1109 : }
1110 :
1111 : static struct spdk_sock *
1112 9 : _posix_sock_accept(struct spdk_sock *_sock, bool enable_ssl)
1113 : {
1114 9 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1115 9 : struct sockaddr_storage sa;
1116 9 : socklen_t salen;
1117 : int rc, fd;
1118 : struct spdk_posix_sock *new_sock;
1119 : int flag;
1120 9 : SSL_CTX *ctx = 0;
1121 9 : SSL *ssl = 0;
1122 :
1123 9 : memset(&sa, 0, sizeof(sa));
1124 9 : salen = sizeof(sa);
1125 :
1126 9 : assert(sock != NULL);
1127 :
1128 9 : rc = accept(sock->fd, (struct sockaddr *)&sa, &salen);
1129 :
1130 9 : if (rc == -1) {
1131 2 : return NULL;
1132 : }
1133 :
1134 7 : fd = rc;
1135 :
1136 7 : flag = fcntl(fd, F_GETFL);
1137 7 : if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) {
1138 0 : SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
1139 0 : close(fd);
1140 0 : return NULL;
1141 : }
1142 :
1143 : #if defined(SO_PRIORITY)
1144 : /* The priority is not inherited, so call this function again */
1145 7 : if (sock->base.opts.priority) {
1146 0 : rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int));
1147 0 : if (rc != 0) {
1148 0 : close(fd);
1149 0 : return NULL;
1150 : }
1151 : }
1152 : #endif
1153 :
1154 : /* Establish SSL connection */
1155 7 : if (enable_ssl) {
1156 0 : ctx = posix_sock_create_ssl_context(TLS_server_method(), &sock->base.opts, &sock->base.impl_opts);
1157 0 : if (!ctx) {
1158 0 : SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno);
1159 0 : close(fd);
1160 0 : return NULL;
1161 : }
1162 0 : ssl = ssl_sock_setup_accept(ctx, fd);
1163 0 : if (!ssl) {
1164 0 : SPDK_ERRLOG("ssl_sock_setup_accept() failed, errno = %d\n", errno);
1165 0 : close(fd);
1166 0 : SSL_CTX_free(ctx);
1167 0 : return NULL;
1168 : }
1169 : }
1170 :
1171 : /* Inherit the zero copy feature from the listen socket */
1172 7 : new_sock = posix_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy);
1173 7 : if (new_sock == NULL) {
1174 0 : close(fd);
1175 0 : SSL_free(ssl);
1176 0 : SSL_CTX_free(ctx);
1177 0 : return NULL;
1178 : }
1179 :
1180 7 : if (ctx) {
1181 0 : new_sock->ctx = ctx;
1182 : }
1183 :
1184 7 : if (ssl) {
1185 0 : new_sock->ssl = ssl;
1186 0 : SSL_set_app_data(ssl, &new_sock->base.impl_opts);
1187 : }
1188 :
1189 7 : return &new_sock->base;
1190 : }
1191 :
/*
 * Accept a connection on _sock without SSL; forwards to
 * _posix_sock_accept() with enable_ssl = false.
 */
static struct spdk_sock *
posix_sock_accept(struct spdk_sock *_sock)
{
	struct spdk_sock *accepted;

	accepted = _posix_sock_accept(_sock, false);

	return accepted;
}
1197 :
1198 : static int
1199 21 : posix_sock_close(struct spdk_sock *_sock)
1200 : {
1201 21 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1202 :
1203 21 : assert(TAILQ_EMPTY(&_sock->pending_reqs));
1204 :
1205 21 : if (sock->ssl != NULL) {
1206 0 : SSL_shutdown(sock->ssl);
1207 : }
1208 :
1209 : /* If the socket fails to close, the best choice is to
1210 : * leak the fd but continue to free the rest of the sock
1211 : * memory. */
1212 21 : close(sock->fd);
1213 :
1214 21 : SSL_free(sock->ssl);
1215 21 : SSL_CTX_free(sock->ctx);
1216 :
1217 21 : spdk_pipe_destroy(sock->recv_pipe);
1218 21 : free(sock->recv_buf);
1219 21 : free(sock);
1220 :
1221 21 : return 0;
1222 : }
1223 :
1224 : #ifdef SPDK_ZEROCOPY
/*
 * Drain the socket's error queue and complete the pending requests that the
 * received notifications cover.
 *
 * Each message read with recvmsg(MSG_ERRQUEUE) must carry an IP_RECVERR or
 * IPV6_RECVERR control message holding a sock_extended_err whose origin is
 * SO_EE_ORIGIN_ZEROCOPY and whose ee_errno is 0. Its ee_info..ee_data fields
 * are walked as an inclusive range of sendmsg indices; pending requests whose
 * internal.offset equals the current index are completed with
 * spdk_sock_request_put().
 *
 * Returns 0 when the error queue is empty, when recvmsg() fails for another
 * reason, or when an unexpected control message is seen (the latter two are
 * only logged). Returns the negative value from spdk_sock_request_put() if
 * completing a request fails.
 */
1225 : static int
1226 : _sock_check_zcopy(struct spdk_sock *sock)
1227 : {
1228 : struct spdk_posix_sock *psock = __posix_sock(sock);
1229 : struct msghdr msgh = {};
1230 : uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)];
1231 : ssize_t rc;
1232 : struct sock_extended_err *serr;
1233 : struct cmsghdr *cm;
1234 : uint32_t idx;
1235 : struct spdk_sock_request *req, *treq;
1236 : bool found;
1237 :
/* Only control data is requested; no iovec is attached to msgh. */
1238 : msgh.msg_control = buf;
1239 : msgh.msg_controllen = sizeof(buf);
1240 :
/* One iteration per error-queue message; the loop ends only via return. */
1241 : while (true) {
1242 : rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE);
1243 :
1244 : if (rc < 0) {
/* EAGAIN/EWOULDBLOCK: nothing (more) queued, which is the normal exit. */
1245 : if (errno == EWOULDBLOCK || errno == EAGAIN) {
1246 : return 0;
1247 : }
1248 :
/* Any other errno is only logged; the caller still receives 0. */
1249 : if (!TAILQ_EMPTY(&sock->pending_reqs)) {
1250 : SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n");
1251 : } else {
1252 : SPDK_WARNLOG("Recvmsg yielded an error!\n");
1253 : }
1254 : return 0;
1255 : }
1256 :
/* Accept only IP_RECVERR / IPV6_RECVERR control messages. */
1257 : cm = CMSG_FIRSTHDR(&msgh);
1258 : if (!(cm &&
1259 : ((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
1260 : (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR)))) {
1261 : SPDK_WARNLOG("Unexpected cmsg level or type!\n");
1262 : return 0;
1263 : }
1264 :
/* The payload must be an error-free zerocopy notification. */
1265 : serr = (struct sock_extended_err *)CMSG_DATA(cm);
1266 : if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
1267 : SPDK_WARNLOG("Unexpected extended error origin\n");
1268 : return 0;
1269 : }
1270 :
1271 : /* Most of the time, the pending_reqs array is in the exact
1272 : * order we need such that all of the requests to complete are
1273 : * in order, in the front. It is guaranteed that all requests
1274 : * belonging to the same sendmsg call are sequential, so once
1275 : * we encounter one match we can stop looping as soon as a
1276 : * non-match is found.
1277 : */
/* Walk every index from ee_info up to and including ee_data. */
1278 : idx = serr->ee_info;
1279 : while (true) {
1280 : found = false;
1281 : TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) {
1282 : if (!req->internal.is_zcopy) {
1283 : /* This wasn't a zcopy request. It was just waiting in line to complete */
1284 : rc = spdk_sock_request_put(sock, req, 0);
1285 : if (rc < 0) {
1286 : return rc;
1287 : }
1288 : } else if (req->internal.offset == idx) {
/* For zcopy requests, internal.offset holds the sendmsg index (set in _sock_flush). */
1289 : found = true;
1290 : rc = spdk_sock_request_put(sock, req, 0);
1291 : if (rc < 0) {
1292 : return rc;
1293 : }
1294 : } else if (found) {
1295 : break;
1296 : }
1297 : }
1298 :
/* The range is inclusive: stop after ee_data has been processed. */
1299 : if (idx == serr->ee_data) {
1300 : break;
1301 : }
1302 :
/* Advance the index, wrapping from UINT32_MAX back to 0. */
1303 : if (idx == UINT32_MAX) {
1304 : idx = 0;
1305 : } else {
1306 : idx++;
1307 : }
1308 : }
1309 : }
1310 :
/* Not reached: the outer loop above has no break. */
1311 : return 0;
1312 : }
1313 : #endif
1314 :
/*
 * Send as many queued requests as a single vectored write will take.
 *
 * The iovec batch is built by spdk_sock_prep_reqs() and written with
 * SSL_writev() when the socket has an SSL session, otherwise with sendmsg().
 * Fully written requests are moved to the pending list; a partially written
 * request stays queued with its internal.offset advanced.
 *
 * Returns:
 *   0   when there was nothing to send;
 *   >0  the number of bytes the write call reported;
 *   -1  when called from inside a callback (errno = EAGAIN) or when the write
 *       returned <= 0. In the latter case errno is forced to EAGAIN for a
 *       zero return, for EAGAIN/EWOULDBLOCK, and for ENOBUFS on a zcopy
 *       socket; any other errno is left as set by the write call.
 */
1315 : static int
1316 29 : _sock_flush(struct spdk_sock *sock)
1317 : {
1318 29 : struct spdk_posix_sock *psock = __posix_sock(sock);
1319 29 : struct msghdr msg = {};
1320 29 : int flags;
1321 29 : struct iovec iovs[IOV_BATCH_SIZE];
1322 : int iovcnt;
1323 : int retval;
1324 : struct spdk_sock_request *req;
1325 : int i;
1326 : ssize_t rc, sent;
1327 : unsigned int offset;
1328 : size_t len;
1329 29 : bool is_zcopy = false;
1330 :
1331 : /* Can't flush from within a callback or we end up with recursive calls */
1332 29 : if (sock->cb_cnt > 0) {
1333 0 : errno = EAGAIN;
1334 0 : return -1;
1335 : }
1336 :
/* Start from the socket-wide zcopy setting; spdk_sock_prep_reqs() receives &flags and may adjust it. */
1337 : #ifdef SPDK_ZEROCOPY
1338 : if (psock->zcopy) {
1339 : flags = MSG_ZEROCOPY | MSG_NOSIGNAL;
1340 : } else
1341 : #endif
1342 : {
1343 29 : flags = MSG_NOSIGNAL;
1344 : }
1345 :
1346 29 : iovcnt = spdk_sock_prep_reqs(sock, iovs, 0, NULL, &flags);
1347 29 : if (iovcnt == 0) {
1348 22 : return 0;
1349 : }
1350 :
/* Re-read the zerocopy bit after prep_reqs had the chance to change flags. */
1351 : #ifdef SPDK_ZEROCOPY
1352 : is_zcopy = flags & MSG_ZEROCOPY;
1353 : #endif
1354 :
1355 : /* Perform the vectored write */
1356 7 : msg.msg_iov = iovs;
1357 7 : msg.msg_iovlen = iovcnt;
1358 :
1359 7 : if (psock->ssl) {
1360 0 : rc = SSL_writev(psock->ssl, iovs, iovcnt);
1361 : } else {
1362 7 : rc = sendmsg(psock->fd, &msg, flags);
1363 : }
/* Normalize "try again later" conditions to EAGAIN; other errno values pass through. */
1364 7 : if (rc <= 0) {
1365 0 : if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) {
1366 0 : errno = EAGAIN;
1367 : }
1368 0 : return -1;
1369 : }
1370 :
/* Keep the total for the return value; rc is consumed as a remaining-byte counter below. */
1371 7 : sent = rc;
1372 :
1373 7 : if (is_zcopy) {
1374 : /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the
1375 : * req->internal.offset, so sendmsg_idx should not be zero */
1376 0 : if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) {
1377 0 : psock->sendmsg_idx = 1;
1378 : } else {
1379 0 : psock->sendmsg_idx++;
1380 : }
1381 : }
1382 :
1383 : /* Consume the requests that were actually written */
1384 7 : req = TAILQ_FIRST(&sock->queued_reqs);
1385 8 : while (req) {
/* internal.offset counts the bytes of this request already sent by earlier flushes. */
1386 8 : offset = req->internal.offset;
1387 :
1388 : /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */
1389 8 : req->internal.is_zcopy = is_zcopy;
1390 :
1391 20 : for (i = 0; i < req->iovcnt; i++) {
1392 : /* Advance by the offset first */
1393 14 : if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
1394 1 : offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
1395 1 : continue;
1396 : }
1397 :
1398 : /* Calculate the remaining length of this element */
1399 13 : len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
1400 :
1401 13 : if (len > (size_t)rc) {
1402 : /* This element was partially sent. */
1403 2 : req->internal.offset += rc;
1404 2 : return sent;
1405 : }
1406 :
1407 11 : offset = 0;
1408 11 : req->internal.offset += len;
1409 11 : rc -= len;
1410 : }
1411 :
1412 : /* Handled a full request. */
1413 6 : spdk_sock_request_pend(sock, req);
1414 :
/* A non-zcopy request at the head of the pending list is completed right away. */
1415 6 : if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) {
1416 : /* The sendmsg syscall above isn't currently asynchronous,
1417 : * so it's already done. */
1418 6 : retval = spdk_sock_request_put(sock, req, 0);
/* A non-zero result from spdk_sock_request_put() stops the loop; sent is still returned. */
1419 6 : if (retval) {
1420 1 : break;
1421 : }
1422 : } else {
1423 : /* Re-use the offset field to hold the sendmsg call index. The
1424 : * index is 0 based, so subtract one here because we've already
1425 : * incremented above. */
1426 0 : req->internal.offset = psock->sendmsg_idx - 1;
1427 : }
1428 :
/* Every written byte has been attributed to a request. */
1429 5 : if (rc == 0) {
1430 4 : break;
1431 : }
1432 :
1433 1 : req = TAILQ_FIRST(&sock->queued_reqs);
1434 : }
1435 :
1436 5 : return sent;
1437 : }
1438 :
/*
 * Flush entry point for the net_impl table.
 *
 * On builds with zero-copy support, a zcopy socket that still has pending
 * requests first has its error queue checked via _sock_check_zcopy(); the
 * result of that check is ignored. The queued requests are then written with
 * _sock_flush(), whose return value is passed through.
 */
static int
posix_sock_flush(struct spdk_sock *sock)
{
#ifdef SPDK_ZEROCOPY
	struct spdk_posix_sock *psock = __posix_sock(sock);
	bool has_pending = !TAILQ_EMPTY(&sock->pending_reqs);

	if (psock->zcopy && has_pending) {
		_sock_check_zcopy(sock);
	}
#endif

	return _sock_flush(sock);
}
1452 :
1453 : static ssize_t
1454 0 : posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt)
1455 : {
1456 0 : struct iovec siov[2];
1457 : int sbytes;
1458 : ssize_t bytes;
1459 : struct spdk_posix_sock_group_impl *group;
1460 :
1461 0 : sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
1462 0 : if (sbytes < 0) {
1463 0 : errno = EINVAL;
1464 0 : return -1;
1465 0 : } else if (sbytes == 0) {
1466 0 : errno = EAGAIN;
1467 0 : return -1;
1468 : }
1469 :
1470 0 : bytes = spdk_iovcpy(siov, 2, diov, diovcnt);
1471 :
1472 0 : if (bytes == 0) {
1473 : /* The only way this happens is if diov is 0 length */
1474 0 : errno = EINVAL;
1475 0 : return -1;
1476 : }
1477 :
1478 0 : spdk_pipe_reader_advance(sock->recv_pipe, bytes);
1479 :
1480 : /* If we drained the pipe, mark it appropriately */
1481 0 : if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
1482 0 : assert(sock->pipe_has_data == true);
1483 :
1484 0 : group = __posix_group_impl(sock->base.group_impl);
1485 0 : if (group && !sock->socket_has_data) {
1486 0 : TAILQ_REMOVE(&group->socks_with_data, sock, link);
1487 : }
1488 :
1489 0 : sock->pipe_has_data = false;
1490 : }
1491 :
1492 0 : return bytes;
1493 : }
1494 :
1495 : static inline ssize_t
1496 0 : posix_sock_read(struct spdk_posix_sock *sock)
1497 : {
1498 0 : struct iovec iov[2];
1499 : int bytes_avail, bytes_recvd;
1500 : struct spdk_posix_sock_group_impl *group;
1501 :
1502 0 : bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);
1503 :
1504 0 : if (bytes_avail <= 0) {
1505 0 : return bytes_avail;
1506 : }
1507 :
1508 0 : if (sock->ssl) {
1509 0 : bytes_recvd = SSL_readv(sock->ssl, iov, 2);
1510 : } else {
1511 0 : bytes_recvd = readv(sock->fd, iov, 2);
1512 : }
1513 :
1514 0 : assert(sock->pipe_has_data == false);
1515 :
1516 0 : if (bytes_recvd <= 0) {
1517 : /* Errors count as draining the socket data */
1518 0 : if (sock->base.group_impl && sock->socket_has_data) {
1519 0 : group = __posix_group_impl(sock->base.group_impl);
1520 0 : TAILQ_REMOVE(&group->socks_with_data, sock, link);
1521 : }
1522 :
1523 0 : sock->socket_has_data = false;
1524 :
1525 0 : return bytes_recvd;
1526 : }
1527 :
1528 0 : spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd);
1529 :
1530 : #if DEBUG
1531 0 : if (sock->base.group_impl) {
1532 0 : assert(sock->socket_has_data == true);
1533 : }
1534 : #endif
1535 :
1536 0 : sock->pipe_has_data = true;
1537 0 : if (bytes_recvd < bytes_avail) {
1538 : /* We drained the kernel socket entirely. */
1539 0 : sock->socket_has_data = false;
1540 : }
1541 :
1542 0 : return bytes_recvd;
1543 : }
1544 :
1545 : static ssize_t
1546 9 : posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
1547 : {
1548 9 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1549 9 : struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl);
1550 : int rc, i;
1551 : size_t len;
1552 :
1553 9 : if (sock->recv_pipe == NULL) {
1554 9 : assert(sock->pipe_has_data == false);
1555 9 : if (group && sock->socket_has_data) {
1556 5 : sock->socket_has_data = false;
1557 5 : TAILQ_REMOVE(&group->socks_with_data, sock, link);
1558 : }
1559 9 : if (sock->ssl) {
1560 0 : return SSL_readv(sock->ssl, iov, iovcnt);
1561 : } else {
1562 9 : return readv(sock->fd, iov, iovcnt);
1563 : }
1564 : }
1565 :
1566 : /* If the socket is not in a group, we must assume it always has
1567 : * data waiting for us because it is not epolled */
1568 0 : if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) {
1569 : /* If the user is receiving a sufficiently large amount of data,
1570 : * receive directly to their buffers. */
1571 0 : len = 0;
1572 0 : for (i = 0; i < iovcnt; i++) {
1573 0 : len += iov[i].iov_len;
1574 : }
1575 :
1576 0 : if (len >= MIN_SOCK_PIPE_SIZE) {
1577 : /* TODO: Should this detect if kernel socket is drained? */
1578 0 : if (sock->ssl) {
1579 0 : return SSL_readv(sock->ssl, iov, iovcnt);
1580 : } else {
1581 0 : return readv(sock->fd, iov, iovcnt);
1582 : }
1583 : }
1584 :
1585 : /* Otherwise, do a big read into our pipe */
1586 0 : rc = posix_sock_read(sock);
1587 0 : if (rc <= 0) {
1588 0 : return rc;
1589 : }
1590 : }
1591 :
1592 0 : return posix_sock_recv_from_pipe(sock, iov, iovcnt);
1593 : }
1594 :
/*
 * Receive up to len bytes into buf by wrapping the buffer in a single iovec
 * and delegating to posix_sock_readv().
 */
static ssize_t
posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
{
	struct iovec single = {
		.iov_base = buf,
		.iov_len = len,
	};

	return posix_sock_readv(sock, &single, 1);
}
1605 :
1606 : static ssize_t
1607 7 : posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
1608 : {
1609 7 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1610 : int rc;
1611 :
1612 : /* In order to process a writev, we need to flush any asynchronous writes
1613 : * first. */
1614 7 : rc = _sock_flush(_sock);
1615 7 : if (rc < 0) {
1616 0 : return rc;
1617 : }
1618 :
1619 7 : if (!TAILQ_EMPTY(&_sock->queued_reqs)) {
1620 : /* We weren't able to flush all requests */
1621 0 : errno = EAGAIN;
1622 0 : return -1;
1623 : }
1624 :
1625 7 : if (sock->ssl) {
1626 0 : return SSL_writev(sock->ssl, iov, iovcnt);
1627 : } else {
1628 7 : return writev(sock->fd, iov, iovcnt);
1629 : }
1630 : }
1631 :
1632 : static int
1633 0 : posix_sock_recv_next(struct spdk_sock *_sock, void **buf, void **ctx)
1634 : {
1635 0 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1636 0 : struct iovec iov;
1637 : ssize_t rc;
1638 :
1639 0 : if (sock->recv_pipe != NULL) {
1640 0 : errno = ENOTSUP;
1641 0 : return -1;
1642 : }
1643 :
1644 0 : iov.iov_len = spdk_sock_group_get_buf(_sock->group_impl->group, &iov.iov_base, ctx);
1645 0 : if (iov.iov_len == 0) {
1646 0 : errno = ENOBUFS;
1647 0 : return -1;
1648 : }
1649 :
1650 0 : rc = posix_sock_readv(_sock, &iov, 1);
1651 0 : if (rc <= 0) {
1652 0 : spdk_sock_group_provide_buf(_sock->group_impl->group, iov.iov_base, iov.iov_len, *ctx);
1653 0 : return rc;
1654 : }
1655 :
1656 0 : *buf = iov.iov_base;
1657 :
1658 0 : return rc;
1659 : }
1660 :
1661 : static void
1662 2 : posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req)
1663 : {
1664 : int rc;
1665 :
1666 2 : spdk_sock_request_queue(sock, req);
1667 :
1668 : /* If there are a sufficient number queued, just flush them out immediately. */
1669 2 : if (sock->queued_iovcnt >= IOV_BATCH_SIZE) {
1670 0 : rc = _sock_flush(sock);
1671 0 : if (rc < 0 && errno != EAGAIN) {
1672 0 : spdk_sock_abort_requests(sock);
1673 : }
1674 : }
1675 2 : }
1676 :
1677 : static int
1678 1 : posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes)
1679 : {
1680 1 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1681 1 : int val;
1682 : int rc;
1683 :
1684 1 : assert(sock != NULL);
1685 :
1686 1 : val = nbytes;
1687 1 : rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val);
1688 1 : if (rc != 0) {
1689 0 : return -1;
1690 : }
1691 1 : return 0;
1692 : }
1693 :
1694 : static bool
1695 1 : posix_sock_is_ipv6(struct spdk_sock *_sock)
1696 : {
1697 1 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1698 1 : struct sockaddr_storage sa;
1699 1 : socklen_t salen;
1700 : int rc;
1701 :
1702 1 : assert(sock != NULL);
1703 :
1704 1 : memset(&sa, 0, sizeof sa);
1705 1 : salen = sizeof sa;
1706 1 : rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
1707 1 : if (rc != 0) {
1708 0 : SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
1709 0 : return false;
1710 : }
1711 :
1712 1 : return (sa.ss_family == AF_INET6);
1713 : }
1714 :
1715 : static bool
1716 1 : posix_sock_is_ipv4(struct spdk_sock *_sock)
1717 : {
1718 1 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1719 1 : struct sockaddr_storage sa;
1720 1 : socklen_t salen;
1721 : int rc;
1722 :
1723 1 : assert(sock != NULL);
1724 :
1725 1 : memset(&sa, 0, sizeof sa);
1726 1 : salen = sizeof sa;
1727 1 : rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
1728 1 : if (rc != 0) {
1729 0 : SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
1730 0 : return false;
1731 : }
1732 :
1733 1 : return (sa.ss_family == AF_INET);
1734 : }
1735 :
1736 : static bool
1737 3 : posix_sock_is_connected(struct spdk_sock *_sock)
1738 : {
1739 3 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1740 3 : uint8_t byte;
1741 : int rc;
1742 :
1743 3 : rc = recv(sock->fd, &byte, 1, MSG_PEEK);
1744 3 : if (rc == 0) {
1745 1 : return false;
1746 : }
1747 :
1748 2 : if (rc < 0) {
1749 2 : if (errno == EAGAIN || errno == EWOULDBLOCK) {
1750 2 : return true;
1751 : }
1752 :
1753 0 : return false;
1754 : }
1755 :
1756 0 : return true;
1757 : }
1758 :
1759 : static struct spdk_sock_group_impl *
1760 1 : posix_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint)
1761 : {
1762 1 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1763 1 : struct spdk_sock_group_impl *group_impl;
1764 :
1765 1 : if (sock->placement_id != -1) {
1766 0 : spdk_sock_map_lookup(&g_map, sock->placement_id, &group_impl, hint);
1767 0 : return group_impl;
1768 : }
1769 :
1770 1 : return NULL;
1771 : }
1772 :
1773 : static struct spdk_sock_group_impl *
1774 12 : _sock_group_impl_create(uint32_t enable_placement_id)
1775 : {
1776 : struct spdk_posix_sock_group_impl *group_impl;
1777 : int fd;
1778 :
1779 : #if defined(SPDK_EPOLL)
1780 12 : fd = epoll_create1(0);
1781 : #elif defined(SPDK_KEVENT)
1782 : fd = kqueue();
1783 : #endif
1784 12 : if (fd == -1) {
1785 0 : return NULL;
1786 : }
1787 :
1788 12 : group_impl = calloc(1, sizeof(*group_impl));
1789 12 : if (group_impl == NULL) {
1790 0 : SPDK_ERRLOG("group_impl allocation failed\n");
1791 0 : close(fd);
1792 0 : return NULL;
1793 : }
1794 :
1795 12 : group_impl->fd = fd;
1796 12 : TAILQ_INIT(&group_impl->socks_with_data);
1797 12 : group_impl->placement_id = -1;
1798 :
1799 12 : if (enable_placement_id == PLACEMENT_CPU) {
1800 0 : spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base);
1801 0 : group_impl->placement_id = spdk_env_get_current_core();
1802 : }
1803 :
1804 12 : return &group_impl->base;
1805 : }
1806 :
1807 : static struct spdk_sock_group_impl *
1808 6 : posix_sock_group_impl_create(void)
1809 : {
1810 6 : return _sock_group_impl_create(g_posix_impl_opts.enable_placement_id);
1811 : }
1812 :
1813 : static struct spdk_sock_group_impl *
1814 6 : ssl_sock_group_impl_create(void)
1815 : {
1816 6 : return _sock_group_impl_create(g_ssl_impl_opts.enable_placement_id);
1817 : }
1818 :
/*
 * Tag the socket with placement_id through SO_MARK and register the group in
 * g_map under that id.
 *
 * Both steps are best effort: a failure is logged and the socket's
 * placement_id is left untouched. The function does nothing on platforms
 * without SO_MARK.
 */
static void
posix_sock_mark(struct spdk_posix_sock_group_impl *group, struct spdk_posix_sock *sock,
		int placement_id)
{
#if defined(SO_MARK)
	int err;

	err = setsockopt(sock->fd, SOL_SOCKET, SO_MARK, &placement_id, sizeof(placement_id));
	if (err != 0) {
		/* Not fatal */
		SPDK_ERRLOG("Error setting SO_MARK\n");
		return;
	}

	err = spdk_sock_map_insert(&g_map, placement_id, &group->base);
	if (err == 0) {
		sock->placement_id = placement_id;
	} else {
		/* Not fatal */
		SPDK_ERRLOG("Failed to insert sock group into map: %d\n", err);
	}
#endif
}
1844 :
1845 : static void
1846 0 : posix_sock_update_mark(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock)
1847 : {
1848 0 : struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
1849 :
1850 0 : if (group->placement_id == -1) {
1851 0 : group->placement_id = spdk_sock_map_find_free(&g_map);
1852 :
1853 : /* If a free placement id is found, update existing sockets in this group */
1854 0 : if (group->placement_id != -1) {
1855 : struct spdk_sock *sock, *tmp;
1856 :
1857 0 : TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) {
1858 0 : posix_sock_mark(group, __posix_sock(sock), group->placement_id);
1859 : }
1860 : }
1861 : }
1862 :
1863 0 : if (group->placement_id != -1) {
1864 : /*
1865 : * group placement id is already determined for this poll group.
1866 : * Mark socket with group's placement id.
1867 : */
1868 0 : posix_sock_mark(group, __posix_sock(_sock), group->placement_id);
1869 : }
1870 0 : }
1871 :
1872 : static int
1873 5 : posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock)
1874 : {
1875 5 : struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
1876 5 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1877 : int rc;
1878 :
1879 : #if defined(SPDK_EPOLL)
1880 5 : struct epoll_event event;
1881 :
1882 5 : memset(&event, 0, sizeof(event));
1883 : /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */
1884 5 : event.events = EPOLLIN | EPOLLERR;
1885 5 : event.data.ptr = sock;
1886 :
1887 5 : rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event);
1888 : #elif defined(SPDK_KEVENT)
1889 : struct kevent event;
1890 : struct timespec ts = {0};
1891 :
1892 : EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock);
1893 :
1894 : rc = kevent(group->fd, &event, 1, NULL, 0, &ts);
1895 : #endif
1896 :
1897 5 : if (rc != 0) {
1898 0 : return rc;
1899 : }
1900 :
1901 : /* switched from another polling group due to scheduling */
1902 5 : if (spdk_unlikely(sock->recv_pipe != NULL &&
1903 : (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
1904 0 : sock->pipe_has_data = true;
1905 0 : sock->socket_has_data = false;
1906 0 : TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link);
1907 : }
1908 :
1909 5 : if (_sock->impl_opts.enable_placement_id == PLACEMENT_MARK) {
1910 0 : posix_sock_update_mark(_group, _sock);
1911 5 : } else if (sock->placement_id != -1) {
1912 0 : rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base);
1913 0 : if (rc != 0) {
1914 0 : SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc);
1915 : /* Do not treat this as an error. The system will continue running. */
1916 : }
1917 : }
1918 :
1919 5 : return rc;
1920 : }
1921 :
1922 : static int
1923 5 : posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock)
1924 : {
1925 5 : struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
1926 5 : struct spdk_posix_sock *sock = __posix_sock(_sock);
1927 : int rc;
1928 :
1929 5 : if (sock->pipe_has_data || sock->socket_has_data) {
1930 0 : TAILQ_REMOVE(&group->socks_with_data, sock, link);
1931 0 : sock->pipe_has_data = false;
1932 0 : sock->socket_has_data = false;
1933 : }
1934 :
1935 5 : if (sock->placement_id != -1) {
1936 0 : spdk_sock_map_release(&g_map, sock->placement_id);
1937 : }
1938 :
1939 : #if defined(SPDK_EPOLL)
1940 5 : struct epoll_event event;
1941 :
1942 : /* Event parameter is ignored but some old kernel version still require it. */
1943 5 : rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event);
1944 : #elif defined(SPDK_KEVENT)
1945 : struct kevent event;
1946 : struct timespec ts = {0};
1947 :
1948 : EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
1949 :
1950 : rc = kevent(group->fd, &event, 1, NULL, 0, &ts);
1951 : if (rc == 0 && event.flags & EV_ERROR) {
1952 : rc = -1;
1953 : errno = event.data;
1954 : }
1955 : #endif
1956 :
1957 5 : spdk_sock_abort_requests(_sock);
1958 :
1959 5 : return rc;
1960 : }
1961 :
/*
 * Poll the group once without blocking.
 *
 * Steps, in order:
 *   1. (zero-copy builds) call poll() on zcopy sockets that still sit on the
 *      socks_with_data list, once per run of equal placement ids;
 *   2. flush queued writes on every socket of the group, aborting a socket's
 *      requests when the flush fails with something other than EAGAIN;
 *   3. collect ready events from epoll/kqueue with a zero timeout;
 *   4. put each socket with a read event on the socks_with_data list;
 *   5. copy up to max_events sockets from that list into socks[];
 *   6. if the list was not exhausted, rotate it so the first unreported
 *      socket becomes the head for the next call.
 *
 * max_events must be > 0 (asserted). The local events[] array holds
 * MAX_EVENTS_PER_POLL entries and max_events is passed unchanged to
 * epoll_wait()/kevent(), so the caller must not pass a larger value.
 *
 * Returns the number of sockets stored in socks[], or -1 if
 * epoll_wait()/kevent() fails.
 */
1962 : static int
1963 7 : posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
1964 : struct spdk_sock **socks)
1965 : {
1966 7 : struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
1967 : struct spdk_sock *sock, *tmp;
1968 : int num_events, i, rc;
1969 : struct spdk_posix_sock *psock, *ptmp;
1970 : #if defined(SPDK_EPOLL)
1971 7 : struct epoll_event events[MAX_EVENTS_PER_POLL];
1972 : #elif defined(SPDK_KEVENT)
1973 : struct kevent events[MAX_EVENTS_PER_POLL];
1974 : struct timespec ts = {0};
1975 : #endif
1976 :
1977 : #ifdef SPDK_ZEROCOPY
1978 : /* When all of the following conditions are met
1979 : * - non-blocking socket
1980 : * - zero copy is enabled
1981 : * - interrupts suppressed (i.e. busy polling)
1982 : * - the NIC tx queue is full at the time sendmsg() is called
1983 : * - epoll_wait determines there is an EPOLLIN event for the socket
1984 : * then we can get into a situation where data we've sent is queued
1985 : * up in the kernel network stack, but interrupts have been suppressed
1986 : * because other traffic is flowing so the kernel misses the signal
1987 : * to flush the software tx queue. If there wasn't incoming data
1988 : * pending on the socket, then epoll_wait would have been sufficient
1989 : * to kick off the send operation, but since there is a pending event
1990 : * epoll_wait does not trigger the necessary operation.
1991 : *
1992 : * We deal with this by checking for all of the above conditions and
1993 : * additionally looking for EPOLLIN events that were not consumed from
1994 : * the last poll loop. We take this to mean that the upper layer is
1995 : * unable to consume them because it is blocked waiting for resources
1996 : * to free up, and those resources are most likely freed in response
1997 : * to a pending asynchronous write completing.
1998 : *
1999 : * Additionally, sockets that have the same placement_id actually share
2000 : * an underlying hardware queue. That means polling one of them is
2001 : * equivalent to polling all of them. As a quick mechanism to avoid
2002 : * making extra poll() calls, stash the last placement_id during the loop
2003 : * and only poll if it's not the same. The overwhelmingly common case
2004 : * is that all sockets in this list have the same placement_id because
2005 : * SPDK is intentionally grouping sockets by that value, so even
2006 : * though this won't stop all extra calls to poll(), it's very fast
2007 : * and will catch all of them in practice.
2008 : */
2009 : int last_placement_id = -1;
2010 :
2011 : TAILQ_FOREACH(psock, &group->socks_with_data, link) {
2012 : if (psock->zcopy && psock->placement_id >= 0 &&
2013 : psock->placement_id != last_placement_id) {
2014 : struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0};
2015 :
/* Return value and revents are ignored; the call is made only for its side effect. */
2016 : poll(&pfd, 1, 0);
2017 : last_placement_id = psock->placement_id;
2018 : }
2019 : }
2020 : #endif
2021 :
2022 : /* This must be a TAILQ_FOREACH_SAFE because while flushing,
2023 : * a completion callback could remove the sock from the
2024 : * group. */
2025 22 : TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) {
2026 15 : rc = _sock_flush(sock);
2027 15 : if (rc < 0 && errno != EAGAIN) {
2028 0 : spdk_sock_abort_requests(sock);
2029 : }
2030 : }
2031 :
2032 7 : assert(max_events > 0);
2033 :
/* Zero timeout: collect whatever is ready right now without blocking. */
2034 : #if defined(SPDK_EPOLL)
2035 7 : num_events = epoll_wait(group->fd, events, max_events, 0);
2036 : #elif defined(SPDK_KEVENT)
2037 : num_events = kevent(group->fd, NULL, 0, events, max_events, &ts);
2038 : #endif
2039 :
2040 7 : if (num_events == -1) {
2041 0 : return -1;
2042 7 : } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) {
2043 1 : sock = TAILQ_FIRST(&_group->socks);
2044 1 : psock = __posix_sock(sock);
2045 : /* poll() is called here to busy poll the queue associated with
2046 : * first socket in list and potentially reap incoming data.
2047 : */
2048 1 : if (sock->opts.priority) {
2049 0 : struct pollfd pfd = {0, 0, 0};
2050 :
2051 0 : pfd.fd = psock->fd;
2052 0 : pfd.events = POLLIN | POLLERR;
2053 0 : poll(&pfd, 1, 0);
2054 : }
2055 : }
2056 :
/* Translate each returned event into has-data bookkeeping on its socket. */
2057 12 : for (i = 0; i < num_events; i++) {
2058 : #if defined(SPDK_EPOLL)
2059 5 : sock = events[i].data.ptr;
2060 5 : psock = __posix_sock(sock);
2061 :
2062 : #ifdef SPDK_ZEROCOPY
2063 : if (events[i].events & EPOLLERR) {
2064 : rc = _sock_check_zcopy(sock);
2065 : /* If the socket was closed or removed from
2066 : * the group in response to a send ack, don't
2067 : * add it to the array here. */
2068 : if (rc || sock->cb_fn == NULL) {
2069 : continue;
2070 : }
2071 : }
2072 : #endif
/* An event without EPOLLIN carries no readable data for this socket. */
2073 5 : if ((events[i].events & EPOLLIN) == 0) {
2074 0 : continue;
2075 : }
2076 :
2077 : #elif defined(SPDK_KEVENT)
2078 : sock = events[i].udata;
2079 : psock = __posix_sock(sock);
2080 : #endif
2081 :
2082 : /* If the socket is not already in the list, add it now */
2083 5 : if (!psock->socket_has_data && !psock->pipe_has_data) {
2084 5 : TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link);
2085 : }
2086 5 : psock->socket_has_data = true;
2087 : }
2088 :
/* num_events is reused from here on as the count of entries written to socks[]. */
2089 7 : num_events = 0;
2090 :
2091 12 : TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) {
2092 5 : if (num_events == max_events) {
2093 0 : break;
2094 : }
2095 :
2096 : /* If the socket's cb_fn is NULL, just remove it from the
2097 : * list and do not add it to socks array */
2098 5 : if (spdk_unlikely(psock->base.cb_fn == NULL)) {
2099 0 : psock->socket_has_data = false;
2100 0 : psock->pipe_has_data = false;
2101 0 : TAILQ_REMOVE(&group->socks_with_data, psock, link);
2102 0 : continue;
2103 : }
2104 :
2105 5 : socks[num_events++] = &psock->base;
2106 : }
2107 :
2108 : /* Cycle the has_data list so that each time we poll things aren't
2109 : * in the same order. Say we have 6 sockets in the list, named as follows:
2110 : * A B C D E F
2111 : * And all 6 sockets had epoll events, but max_events is only 3. That means
2112 : * psock currently points at D. We want to rearrange the list to the following:
2113 : * D E F A B C
2114 : *
2115 : * The variables below are named according to this example to make it easier to
2116 : * follow the swaps.
2117 : */
/* psock is non-NULL only when the loop above exited through the break at max_events. */
2118 7 : if (psock != NULL) {
2119 : struct spdk_posix_sock *pa, *pc, *pd, *pf;
2120 :
2121 : /* Capture pointers to the elements we need */
2122 0 : pd = psock;
2123 0 : pc = TAILQ_PREV(pd, spdk_has_data_list, link);
2124 0 : pa = TAILQ_FIRST(&group->socks_with_data);
2125 0 : pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list);
2126 :
2127 : /* Break the link between C and D */
2128 0 : pc->link.tqe_next = NULL;
2129 :
2130 : /* Connect F to A */
2131 0 : pf->link.tqe_next = pa;
2132 0 : pa->link.tqe_prev = &pf->link.tqe_next;
2133 :
2134 : /* Fix up the list first/last pointers */
2135 0 : group->socks_with_data.tqh_first = pd;
2136 0 : group->socks_with_data.tqh_last = &pc->link.tqe_next;
2137 :
2138 : /* D is in front of the list, make tqe prev pointer point to the head of list */
2139 0 : pd->link.tqe_prev = &group->socks_with_data.tqh_first;
2140 : }
2141 :
2142 7 : return num_events;
2143 : }
2144 :
2145 : static int
2146 12 : _sock_group_impl_close(struct spdk_sock_group_impl *_group, uint32_t enable_placement_id)
2147 : {
2148 12 : struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
2149 : int rc;
2150 :
2151 12 : if (enable_placement_id == PLACEMENT_CPU) {
2152 0 : spdk_sock_map_release(&g_map, spdk_env_get_current_core());
2153 : }
2154 :
2155 12 : rc = close(group->fd);
2156 12 : free(group);
2157 12 : return rc;
2158 : }
2159 :
2160 : static int
2161 6 : posix_sock_group_impl_close(struct spdk_sock_group_impl *_group)
2162 : {
2163 6 : return _sock_group_impl_close(_group, g_posix_impl_opts.enable_placement_id);
2164 : }
2165 :
2166 : static int
2167 6 : ssl_sock_group_impl_close(struct spdk_sock_group_impl *_group)
2168 : {
2169 6 : return _sock_group_impl_close(_group, g_ssl_impl_opts.enable_placement_id);
2170 : }
2171 :
2172 : static struct spdk_net_impl g_posix_net_impl = {
2173 : .name = "posix",
2174 : .getaddr = posix_sock_getaddr,
2175 : .connect = posix_sock_connect,
2176 : .listen = posix_sock_listen,
2177 : .accept = posix_sock_accept,
2178 : .close = posix_sock_close,
2179 : .recv = posix_sock_recv,
2180 : .readv = posix_sock_readv,
2181 : .writev = posix_sock_writev,
2182 : .recv_next = posix_sock_recv_next,
2183 : .writev_async = posix_sock_writev_async,
2184 : .flush = posix_sock_flush,
2185 : .set_recvlowat = posix_sock_set_recvlowat,
2186 : .set_recvbuf = posix_sock_set_recvbuf,
2187 : .set_sendbuf = posix_sock_set_sendbuf,
2188 : .is_ipv6 = posix_sock_is_ipv6,
2189 : .is_ipv4 = posix_sock_is_ipv4,
2190 : .is_connected = posix_sock_is_connected,
2191 : .group_impl_get_optimal = posix_sock_group_impl_get_optimal,
2192 : .group_impl_create = posix_sock_group_impl_create,
2193 : .group_impl_add_sock = posix_sock_group_impl_add_sock,
2194 : .group_impl_remove_sock = posix_sock_group_impl_remove_sock,
2195 : .group_impl_poll = posix_sock_group_impl_poll,
2196 : .group_impl_close = posix_sock_group_impl_close,
2197 : .get_opts = posix_sock_impl_get_opts,
2198 : .set_opts = posix_sock_impl_set_opts,
2199 : };
2200 :
2201 2 : SPDK_NET_IMPL_REGISTER(posix, &g_posix_net_impl, DEFAULT_SOCK_PRIORITY + 1);
2202 :
2203 : static struct spdk_sock *
2204 0 : ssl_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts)
2205 : {
2206 0 : return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, true);
2207 : }
2208 :
2209 : static struct spdk_sock *
2210 0 : ssl_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts)
2211 : {
2212 0 : return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, true);
2213 : }
2214 :
/*
 * accept callback for the "ssl" implementation: forwards to
 * _posix_sock_accept() with its second argument set to true.
 */
static struct spdk_sock *
ssl_sock_accept(struct spdk_sock *_sock)
{
	struct spdk_sock *accepted;

	accepted = _posix_sock_accept(_sock, true);

	return accepted;
}
2220 :
2221 : static struct spdk_net_impl g_ssl_net_impl = {
2222 : .name = "ssl",
2223 : .getaddr = posix_sock_getaddr,
2224 : .connect = ssl_sock_connect,
2225 : .listen = ssl_sock_listen,
2226 : .accept = ssl_sock_accept,
2227 : .close = posix_sock_close,
2228 : .recv = posix_sock_recv,
2229 : .readv = posix_sock_readv,
2230 : .writev = posix_sock_writev,
2231 : .recv_next = posix_sock_recv_next,
2232 : .writev_async = posix_sock_writev_async,
2233 : .flush = posix_sock_flush,
2234 : .set_recvlowat = posix_sock_set_recvlowat,
2235 : .set_recvbuf = posix_sock_set_recvbuf,
2236 : .set_sendbuf = posix_sock_set_sendbuf,
2237 : .is_ipv6 = posix_sock_is_ipv6,
2238 : .is_ipv4 = posix_sock_is_ipv4,
2239 : .is_connected = posix_sock_is_connected,
2240 : .group_impl_get_optimal = posix_sock_group_impl_get_optimal,
2241 : .group_impl_create = ssl_sock_group_impl_create,
2242 : .group_impl_add_sock = posix_sock_group_impl_add_sock,
2243 : .group_impl_remove_sock = posix_sock_group_impl_remove_sock,
2244 : .group_impl_poll = posix_sock_group_impl_poll,
2245 : .group_impl_close = ssl_sock_group_impl_close,
2246 : .get_opts = ssl_sock_impl_get_opts,
2247 : .set_opts = ssl_sock_impl_set_opts,
2248 : };
2249 :
2250 2 : SPDK_NET_IMPL_REGISTER(ssl, &g_ssl_net_impl, DEFAULT_SOCK_PRIORITY);
2251 2 : SPDK_LOG_REGISTER_COMPONENT(sock_posix)
|