/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_aio.h"

#include "spdk/stdinc.h"

#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/bdev_module.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk/log.h"

#include <sys/eventfd.h>

#ifndef __FreeBSD__
#include <libaio.h>
#endif

struct bdev_aio_io_channel {
	uint64_t io_inflight;
#ifdef __FreeBSD__
	int kqfd;
#else
	io_context_t io_ctx;
#endif
	struct bdev_aio_group_channel *group_ch;
	TAILQ_ENTRY(bdev_aio_io_channel) link;
};

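/*
 * One group channel is created per SPDK thread that uses this module. It owns the
 * poller (and, in interrupt mode, the eventfd-based interrupt handler) and keeps a
 * list of every bdev_aio_io_channel opened on that thread, so a single poller can
 * reap completions for all aio bdevs on the thread.
 */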
struct bdev_aio_group_channel {
	/* eventfd for io completion notification in interrupt mode.
	 * Negative value like '-1' indicates it is invalid or unused.
	 */
	int efd;
	struct spdk_interrupt *intr;
	struct spdk_poller *poller;
	TAILQ_HEAD(, bdev_aio_io_channel) io_ch_head;
};

struct bdev_aio_task {
#ifdef __FreeBSD__
	struct aiocb aiocb;
#else
	struct iocb iocb;
#endif
	uint64_t len;
	struct bdev_aio_io_channel *ch;
};

struct file_disk {
	struct bdev_aio_task *reset_task;
	struct spdk_poller *reset_retry_timer;
	struct spdk_bdev disk;
	char *filename;
	int fd;
	bool use_nowait;
	TAILQ_ENTRY(file_disk) link;
	bool block_size_override;
	bool readonly;
	bool fallocate;
};

/* For user space reaping of completions */
struct spdk_aio_ring {
	uint32_t id;
	uint32_t size;
	uint32_t head;
	uint32_t tail;

	uint32_t version;
	uint32_t compat_features;
	uint32_t incompat_features;
	uint32_t header_length;
};

#define SPDK_AIO_RING_VERSION 0xa10a10a1

static int bdev_aio_initialize(void);
static void bdev_aio_fini(void);
static void aio_free_disk(struct file_disk *fdisk);
static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);

#define SPDK_AIO_QUEUE_DEPTH 128
#define MAX_EVENTS_PER_POLL 32

static int
bdev_aio_get_ctx_size(void)
{
	return sizeof(struct bdev_aio_task);
}

static struct spdk_bdev_module aio_if = {
	.name = "aio",
	.module_init = bdev_aio_initialize,
	.module_fini = bdev_aio_fini,
	.get_ctx_size = bdev_aio_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)

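/*
 * Open the backing file or block device. O_DIRECT is attempted first; if that fails
 * (for example for files on file systems that do not support it), the open is
 * retried without O_DIRECT.
 */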
static int
bdev_aio_open(struct file_disk *disk)
{
	int fd;
	int io_flag = disk->readonly ? O_RDONLY : O_RDWR;
	struct stat st;

	fd = open(disk->filename, io_flag | O_DIRECT);
	if (fd < 0) {
		/* Try without O_DIRECT for non-disk files */
		fd = open(disk->filename, io_flag);
		if (fd < 0) {
			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
				    disk->filename, errno, spdk_strerror(errno));
			disk->fd = -1;
			return -1;
		}
	}

	disk->fd = fd;
	/* Some aio operations can block, for example if the number of outstanding
	 * I/Os exceeds the number of block layer tags. But not all files support
	 * the RWF_NOWAIT flag, so use RWF_NOWAIT on block devices only.
	 */
	disk->use_nowait = fstat(fd, &st) == 0 && S_ISBLK(st.st_mode);

	return 0;
}

static int
bdev_aio_close(struct file_disk *disk)
{
	int rc;

	if (disk->fd == -1) {
		return 0;
	}

	rc = close(disk->fd);
	if (rc < 0) {
		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
			    disk->fd, errno, spdk_strerror(errno));
		return -1;
	}

	disk->fd = -1;

	return 0;
}

#ifdef __FreeBSD__
static int
bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
		   struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
		   struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct aiocb *aiocb = &aio_task->aiocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);

	memset(aiocb, 0, sizeof(struct aiocb));
	aiocb->aio_fildes = fdisk->fd;
	aiocb->aio_iov = iov;
	aiocb->aio_iovcnt = iovcnt;
	aiocb->aio_offset = offset;
	aiocb->aio_sigevent.sigev_notify_kqueue = aio_ch->kqfd;
	aiocb->aio_sigevent.sigev_value.sival_ptr = aio_task;
	aiocb->aio_sigevent.sigev_notify = SIGEV_KEVENT;

	aio_task->len = nbytes;
	aio_task->ch = aio_ch;

	if (type == SPDK_BDEV_IO_TYPE_READ) {
		return aio_readv(aiocb);
	}

	return aio_writev(aiocb);
}
#else
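/*
 * Linux path: prepare a preadv/pwritev iocb for submission. When the group channel
 * owns an eventfd (interrupt mode), completions are also signalled through it.
 * RWF_NOWAIT is requested for block devices so that an I/O that would block is
 * failed with -EAGAIN (treated as a retryable NOMEM condition by the callers)
 * instead of stalling the submitting thread.
 */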
static int
bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
		   struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
		   struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);

	if (type == SPDK_BDEV_IO_TYPE_READ) {
		io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
	} else {
		io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
	}

	if (aio_ch->group_ch->efd >= 0) {
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	iocb->data = aio_task;
	if (fdisk->use_nowait) {
		iocb->aio_rw_flags = RWF_NOWAIT;
	}
	aio_task->len = nbytes;
	aio_task->ch = aio_ch;

	return io_submit(aio_ch->io_ctx, 1, &iocb);
}
#endif

static void
bdev_aio_rw(enum spdk_bdev_io_type type, struct file_disk *fdisk,
	    struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
	    struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	if (type == SPDK_BDEV_IO_TYPE_READ) {
		SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
			      iovcnt, nbytes, offset);
	} else {
		SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
			      iovcnt, nbytes, offset);
	}

	rc = bdev_aio_submit_io(type, fdisk, ch, aio_task, iov, iovcnt, nbytes, offset);
	if (spdk_unlikely(rc < 0)) {
		if (rc == -EAGAIN) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
	} else {
		aio_ch->io_inflight++;
	}
}

static void
bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
{
	int rc = fsync(fdisk->fd);

	if (rc == 0) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
	}
}

#ifndef __FreeBSD__
static void
bdev_aio_fallocate(struct spdk_bdev_io *bdev_io, int mode)
{
	struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;
	struct bdev_aio_task *aio_task = (struct bdev_aio_task *)bdev_io->driver_ctx;
	uint64_t offset_bytes = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;
	uint64_t length_bytes = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	int rc;

	if (!fdisk->fallocate) {
		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -ENOTSUP);
		return;
	}

	rc = fallocate(fdisk->fd, mode, offset_bytes, length_bytes);
	if (rc == 0) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
	}
}

static void
bdev_aio_unmap(struct spdk_bdev_io *bdev_io)
{
	int mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;

	bdev_aio_fallocate(bdev_io, mode);
}


static void
bdev_aio_write_zeros(struct spdk_bdev_io *bdev_io)
{
	int mode = FALLOC_FL_ZERO_RANGE;

	bdev_aio_fallocate(bdev_io, mode);
}
#endif

static void
bdev_aio_destruct_cb(void *io_device)
{
	struct file_disk *fdisk = io_device;
	int rc = 0;

	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
	rc = bdev_aio_close(fdisk);
	if (rc < 0) {
		SPDK_ERRLOG("bdev_aio_close() failed\n");
	}
	aio_free_disk(fdisk);
}

static int
bdev_aio_destruct(void *ctx)
{
	struct file_disk *fdisk = ctx;

	spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);

	return 0;
}

#ifdef __FreeBSD__
static int
bdev_user_io_getevents(int kq, unsigned int max, struct kevent *events)
{
	struct timespec ts;
	int count;

	memset(events, 0, max * sizeof(struct kevent));
	memset(&ts, 0, sizeof(ts));

	count = kevent(kq, NULL, 0, events, max, &ts);
	if (count < 0) {
		SPDK_ERRLOG("failed to get kevents: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	return count;
}

static int
bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
{
	int nr, i, res = 0;
	struct bdev_aio_task *aio_task;
	struct kevent events[SPDK_AIO_QUEUE_DEPTH];

	nr = bdev_user_io_getevents(io_ch->kqfd, SPDK_AIO_QUEUE_DEPTH, events);
	if (nr < 0) {
		return 0;
	}

	for (i = 0; i < nr; i++) {
		aio_task = events[i].udata;
		if (aio_task == NULL) {
			/* No task is associated with this event, so there is nothing to complete. */
			SPDK_ERRLOG("received kevent without an associated aio task\n");
			break;
		}
		aio_task->ch->io_inflight--;
		if ((uint64_t)aio_return(&aio_task->aiocb) == aio_task->len) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
		} else {
			SPDK_ERRLOG("failed to complete aio: rc %d\n", aio_error(&aio_task->aiocb));
			res = aio_error(&aio_task->aiocb);
			if (res != 0) {
				spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
			} else {
				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	}

	return nr;
}
#else
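/*
 * Reap completions from user space without a system call: the io_context_t returned
 * by io_setup() points at a kernel-maintained ring of struct io_event entries that
 * can be consumed directly. If the ring does not have the expected version or
 * advertises incompatible features, fall back to a non-blocking io_getevents() call.
 */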
static int
bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
{
	uint32_t head, tail, count;
	struct spdk_aio_ring *ring;
	struct timespec timeout;
	struct io_event *kevents;

	ring = (struct spdk_aio_ring *)io_ctx;

	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;

		return io_getevents(io_ctx, 0, max, uevents, &timeout);
	}

	/* Read the current state out of the ring */
	head = ring->head;
	tail = ring->tail;

	/* This memory barrier is required to prevent the loads above
	 * from being re-ordered with stores to the events array
	 * potentially occurring on other threads. */
	spdk_smp_rmb();

	/* Calculate how many items are in the circular ring */
	count = tail - head;
	if (tail < head) {
		count += ring->size;
	}

	/* Reduce the count to the limit provided by the user */
	count = spdk_min(max, count);

	/* Grab the memory location of the event array */
	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);

	/* Copy the events out of the ring. */
	if ((head + count) <= ring->size) {
		/* Only one copy is required */
		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
	} else {
		uint32_t first_part = ring->size - head;
		/* Two copies are required */
		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
	}

	/* Update the head pointer. On x86, stores will not be reordered with older loads,
	 * so the copies out of the event array will always be complete prior to this
	 * update becoming visible. On other architectures this is not guaranteed, so
	 * add a barrier. */
#if defined(__i386__) || defined(__x86_64__)
	spdk_compiler_barrier();
#else
	spdk_smp_mb();
#endif
	ring->head = (head + count) % ring->size;

	return count;
}

static int
bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
{
	int nr, i, res = 0;
	struct bdev_aio_task *aio_task;
	struct io_event events[SPDK_AIO_QUEUE_DEPTH];

	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
	if (nr < 0) {
		return 0;
	}

	for (i = 0; i < nr; i++) {
		aio_task = events[i].data;
		aio_task->ch->io_inflight--;
		if (events[i].res == aio_task->len) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
		} else {
			/* In the kernel ABI (aio_abi.h), io_event.res is defined as __s64 and
			 * a negative errno is stored in it on error. libaio.h, however, declares
			 * io_event.res as unsigned long, so convert it to a signed value for
			 * error detection.
			 */
			res = (int)events[i].res;
			if (res < 0) {
				if (res == -EAGAIN) {
					spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
				} else {
					SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
					spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
				}
			} else {
				SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	}

	return nr;
}
#endif

static int
bdev_aio_group_poll(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	struct bdev_aio_io_channel *io_ch;
	int nr = 0;

	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
		nr += bdev_aio_io_channel_poll(io_ch);
	}

	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

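/*
 * Interrupt-mode entry point. Reading the eventfd returns (and clears) the number of
 * completions signalled since the last read. A single poll pass reaps at most
 * SPDK_AIO_QUEUE_DEPTH events per io channel, so any excess is written back to the
 * eventfd to trigger this handler again for the remainder.
 */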
static int
bdev_aio_group_interrupt(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	int rc;
	uint64_t num_events;

	assert(group_ch->efd >= 0);

	/* If the number of completed I/Os is larger than SPDK_AIO_QUEUE_DEPTH,
	 * io_getevents() must be called again to ensure all completed I/Os are processed.
	 */
	rc = read(group_ch->efd, &num_events, sizeof(num_events));
	if (rc < 0) {
		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
		num_events -= SPDK_AIO_QUEUE_DEPTH;
		rc = write(group_ch->efd, &num_events, sizeof(num_events));
		if (rc < 0) {
			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
		}
	}

	return bdev_aio_group_poll(group_ch);
}

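/*
 * Reset handling: a reset is completed only after every io channel has drained its
 * in-flight I/O. The helpers below walk all channels; if any still has outstanding
 * I/O, a 500 us retry timer repeats the check until the in-flight count reaches
 * zero, at which point the reset bdev_io is completed successfully.
 */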
static void
_bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);

	if (aio_ch->io_inflight) {
		spdk_for_each_channel_continue(i, -1);
		return;
	}

	spdk_for_each_channel_continue(i, 0);
}

static int bdev_aio_reset_retry_timer(void *arg);

static void
_bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
{
	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);

	if (status == -1) {
		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
		return;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
}

static int
bdev_aio_reset_retry_timer(void *arg)
{
	struct file_disk *fdisk = arg;

	if (fdisk->reset_retry_timer) {
		spdk_poller_unregister(&fdisk->reset_retry_timer);
	}

	spdk_for_each_channel(fdisk,
			      _bdev_aio_get_io_inflight,
			      fdisk,
			      _bdev_aio_get_io_inflight_done);

	return SPDK_POLLER_BUSY;
}

static void
bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
{
	fdisk->reset_task = aio_task;

	bdev_aio_reset_retry_timer(fdisk);
}

static void
bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		    bool success)
{
	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		bdev_aio_rw(bdev_io->type,
			    (struct file_disk *)bdev_io->bdev->ctxt,
			    ch,
			    (struct bdev_aio_task *)bdev_io->driver_ctx,
			    bdev_io->u.bdev.iovs,
			    bdev_io->u.bdev.iovcnt,
			    bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
			    bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	default:
		SPDK_ERRLOG("Wrong io type\n");
		break;
	}
}

static int
_bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;

	switch (bdev_io->type) {
	/* Read and write operations must be performed on buffers aligned to
	 * bdev->required_alignment. If user specified unaligned buffers,
	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
	case SPDK_BDEV_IO_TYPE_READ:
		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		return 0;
	case SPDK_BDEV_IO_TYPE_WRITE:
		if (fdisk->readonly) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		}
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;

#ifndef __FreeBSD__
	case SPDK_BDEV_IO_TYPE_UNMAP:
		bdev_aio_unmap(bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		bdev_aio_write_zeros(bdev_io);
		return 0;
#endif

	default:
		return -1;
	}
}

static void
bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct file_disk *fdisk = ctx;

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
		return true;

	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		return fdisk->fallocate;

	default:
		return false;
	}
}

#ifdef __FreeBSD__
static int
bdev_aio_create_io(struct bdev_aio_io_channel *ch)
{
	ch->kqfd = kqueue();
	if (ch->kqfd < 0) {
		SPDK_ERRLOG("async I/O context setup failure: %s.\n", spdk_strerror(errno));
		return -1;
	}

	return 0;
}

static void
bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
{
	close(ch->kqfd);
}
#else
static int
bdev_aio_create_io(struct bdev_aio_io_channel *ch)
{
	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
		SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n");
		SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n");
		return -1;
	}

	return 0;
}

static void
bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
{
	io_destroy(ch->io_ctx);
}
#endif

static int
bdev_aio_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_io_channel *ch = ctx_buf;
	int rc;

	rc = bdev_aio_create_io(ch);
	if (rc < 0) {
		return rc;
	}

	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);

	return 0;
}

static void
bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_io_channel *ch = ctx_buf;

	bdev_aio_destroy_io(ch);

	assert(ch->group_ch);
	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);

	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
}

static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *fdisk = ctx;

	return spdk_get_io_channel(fdisk);
}


static int
bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct file_disk *fdisk = ctx;

	spdk_json_write_named_object_begin(w, "aio");

	spdk_json_write_named_string(w, "filename", fdisk->filename);

	spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override);

	spdk_json_write_named_bool(w, "readonly", fdisk->readonly);

	spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);

	spdk_json_write_object_end(w);

	return 0;
}

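/*
 * Emit the RPC call needed to recreate this bdev when the configuration is saved.
 * For illustration only (names and values below are hypothetical), the emitted
 * object looks roughly like:
 *
 *   {
 *     "method": "bdev_aio_create",
 *     "params": { "name": "aio0", "filename": "/dev/sdb", "readonly": false, "fallocate": false }
 *   }
 *
 * "block_size" is written only when it was explicitly overridden and "uuid" only
 * when it is non-null.
 */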
780 0 : bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
781 : {
782 0 : struct file_disk *fdisk = bdev->ctxt;
783 0 : const struct spdk_uuid *uuid = spdk_bdev_get_uuid(bdev);
784 :
785 0 : spdk_json_write_object_begin(w);
786 :
787 0 : spdk_json_write_named_string(w, "method", "bdev_aio_create");
788 :
789 0 : spdk_json_write_named_object_begin(w, "params");
790 0 : spdk_json_write_named_string(w, "name", bdev->name);
791 0 : if (fdisk->block_size_override) {
792 0 : spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
793 : }
794 0 : spdk_json_write_named_string(w, "filename", fdisk->filename);
795 0 : spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
796 0 : spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);
797 0 : if (!spdk_uuid_is_null(uuid)) {
798 0 : spdk_json_write_named_uuid(w, "uuid", uuid);
799 : }
800 0 : spdk_json_write_object_end(w);
801 :
802 0 : spdk_json_write_object_end(w);
803 0 : }
804 :
static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct = bdev_aio_destruct,
	.submit_request = bdev_aio_submit_request,
	.io_type_supported = bdev_aio_io_type_supported,
	.get_io_channel = bdev_aio_get_io_channel,
	.dump_info_json = bdev_aio_dump_info_json,
	.write_config_json = bdev_aio_write_json_config,
};

static void
aio_free_disk(struct file_disk *fdisk)
{
	if (fdisk == NULL) {
		return;
	}
	free(fdisk->filename);
	free(fdisk->disk.name);
	free(fdisk);
}

static int
bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
{
	int efd;

	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (efd < 0) {
		return -1;
	}

	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
	if (ch->intr == NULL) {
		close(efd);
		return -1;
	}
	ch->efd = efd;

	return 0;
}

static void
bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
{
	spdk_interrupt_unregister(&ch->intr);
	close(ch->efd);
	ch->efd = -1;
}

static int
bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_group_channel *ch = ctx_buf;
	int rc;

	TAILQ_INIT(&ch->io_ch_head);
	/* Initialize ch->efd to be invalid and unused. */
	ch->efd = -1;
	if (spdk_interrupt_mode_is_enabled()) {
		rc = bdev_aio_register_interrupt(ch);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n");
			return rc;
		}
	}

	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
	spdk_poller_register_interrupt(ch->poller, NULL, NULL);

	return 0;
}

static void
bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_group_channel *ch = ctx_buf;

	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
		SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n");
	}

	spdk_poller_unregister(&ch->poller);
	if (spdk_interrupt_mode_is_enabled()) {
		bdev_aio_unregister_interrupt(ch);
	}
}

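/*
 * Create and register an AIO bdev on top of an existing file or block device: open
 * the backing file, take the block size from the caller or auto-detect it, and
 * register the resulting bdev with the bdev layer. Returns 0 on success or a
 * negative errno on failure.
 */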
int
create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly,
		bool fallocate, const struct spdk_uuid *uuid)
{
	struct file_disk *fdisk;
	uint32_t detected_block_size;
	uint64_t disk_size;
	int rc;

#ifdef __FreeBSD__
	if (fallocate) {
		SPDK_ERRLOG("Unable to support fallocate on this platform\n");
		return -ENOTSUP;
	}
#endif

	fdisk = calloc(1, sizeof(*fdisk));
	if (!fdisk) {
		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
		return -ENOMEM;
	}
	fdisk->readonly = readonly;
	fdisk->fallocate = fallocate;

	fdisk->filename = strdup(filename);
	if (!fdisk->filename) {
		rc = -ENOMEM;
		goto error_return;
	}

	if (bdev_aio_open(fdisk)) {
		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
		rc = -errno;
		goto error_return;
	}

	disk_size = spdk_fd_get_size(fdisk->fd);

	fdisk->disk.name = strdup(name);
	if (!fdisk->disk.name) {
		rc = -ENOMEM;
		goto error_return;
	}
	fdisk->disk.product_name = "AIO disk";
	fdisk->disk.module = &aio_if;

	fdisk->disk.write_cache = 1;

	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
	if (block_size == 0) {
		/* User did not specify block size - use autodetected block size. */
		if (detected_block_size == 0) {
			SPDK_ERRLOG("Block size could not be auto-detected\n");
			rc = -EINVAL;
			goto error_return;
		}
		fdisk->block_size_override = false;
		block_size = detected_block_size;
	} else {
		if (block_size < detected_block_size) {
			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
				    "auto-detected block size %" PRIu32 "\n",
				    block_size, detected_block_size);
			rc = -EINVAL;
			goto error_return;
		} else if (detected_block_size != 0 && block_size != detected_block_size) {
			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
				     "auto-detected block size %" PRIu32 "\n",
				     block_size, detected_block_size);
		}
		fdisk->block_size_override = true;
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blocklen = block_size;
	if (fdisk->block_size_override && detected_block_size) {
		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
	} else {
		fdisk->disk.required_alignment = spdk_u32log2(block_size);
	}

	if (disk_size % fdisk->disk.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    disk_size, fdisk->disk.blocklen);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
	fdisk->disk.ctxt = fdisk;
	spdk_uuid_copy(&fdisk->disk.uuid, uuid);

	fdisk->disk.fn_table = &aio_fn_table;

	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
				sizeof(struct bdev_aio_io_channel),
				fdisk->disk.name);
	rc = spdk_bdev_register(&fdisk->disk);
	if (rc) {
		spdk_io_device_unregister(fdisk, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
	return 0;

error_return:
	bdev_aio_close(fdisk);
	aio_free_disk(fdisk);
	return rc;
}

static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}

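/*
 * Re-read the size of the backing file and, if it changed, notify the bdev layer so
 * the bdev's block count is updated in place.
 */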
int
bdev_aio_rescan(const char *name)
{
	struct spdk_bdev_desc *desc;
	struct spdk_bdev *bdev;
	struct file_disk *fdisk;
	uint64_t disk_size, blockcnt;
	int rc;

	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(desc);
	if (bdev->module != &aio_if) {
		rc = -ENODEV;
		goto exit;
	}

	fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk);
	disk_size = spdk_fd_get_size(fdisk->fd);
	blockcnt = disk_size / bdev->blocklen;

	if (bdev->blockcnt != blockcnt) {
		SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %"
			       PRIu64 "\n",
			       fdisk->filename,
			       bdev->blockcnt,
			       blockcnt);
		rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
		if (rc != 0) {
			SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n",
				    fdisk->filename, rc);
			goto exit;
		}
	}

exit:
	spdk_bdev_close(desc);
	return rc;
}

struct delete_aio_bdev_ctx {
	delete_aio_bdev_complete cb_fn;
	void *cb_arg;
};

static void
aio_bdev_unregister_cb(void *arg, int bdeverrno)
{
	struct delete_aio_bdev_ctx *ctx = arg;

	ctx->cb_fn(ctx->cb_arg, bdeverrno);
	free(ctx);
}

void
bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg)
{
	struct delete_aio_bdev_ctx *ctx;
	int rc;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx);
	if (rc != 0) {
		aio_bdev_unregister_cb(ctx, rc);
	}
}

static int
bdev_aio_initialize(void)
{
	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
				sizeof(struct bdev_aio_group_channel), "aio_module");

	return 0;
}

static void
bdev_aio_fini(void)
{
	spdk_io_device_unregister(&aio_if, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(aio)