Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2017 Intel Corporation.
3 : * All rights reserved.
4 : * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : */
6 :
7 : #include "bdev_aio.h"
8 :
9 : #include "spdk/stdinc.h"
10 :
11 : #include "spdk/barrier.h"
12 : #include "spdk/bdev.h"
13 : #include "spdk/bdev_module.h"
14 : #include "spdk/env.h"
15 : #include "spdk/fd.h"
16 : #include "spdk/likely.h"
17 : #include "spdk/thread.h"
18 : #include "spdk/json.h"
19 : #include "spdk/util.h"
20 : #include "spdk/string.h"
21 :
22 : #include "spdk/log.h"
23 :
24 : #include <sys/eventfd.h>
25 :
26 : #ifndef __FreeBSD__
27 : #include <libaio.h>
28 : #endif
29 :
30 : struct bdev_aio_io_channel {
31 : uint64_t io_inflight;
32 : #ifdef __FreeBSD__
33 : int kqfd;
34 : #else
35 : io_context_t io_ctx;
36 : #endif
37 : struct bdev_aio_group_channel *group_ch;
38 : TAILQ_ENTRY(bdev_aio_io_channel) link;
39 : };
40 :
/* Module-level channel that polls every bdev_aio_io_channel linked to it. */
struct bdev_aio_group_channel {
	/* eventfd for io completion notification in interrupt mode.
	 * Negative value like '-1' indicates it is invalid or unused.
	 */
	int					efd;
	/* Interrupt registered on efd; only set up in interrupt mode. */
	struct spdk_interrupt			*intr;
	/* Poller running bdev_aio_group_poll() for this group. */
	struct spdk_poller			*poller;
	/* All I/O channels serviced by this group. */
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;
};
50 :
51 : struct bdev_aio_task {
52 : #ifdef __FreeBSD__
53 : struct aiocb aiocb;
54 : #else
55 : struct iocb iocb;
56 : #endif
57 : uint64_t len;
58 : struct bdev_aio_io_channel *ch;
59 : };
60 :
61 : struct file_disk {
62 : struct bdev_aio_task *reset_task;
63 : struct spdk_poller *reset_retry_timer;
64 : struct spdk_bdev disk;
65 : char *filename;
66 : int fd;
67 : TAILQ_ENTRY(file_disk) link;
68 : bool block_size_override;
69 : bool readonly;
70 : bool fallocate;
71 : };
72 :
/* For user space reaping of completions.
 *
 * bdev_user_io_getevents() casts the io_context_t to this header, so the
 * field order and widths must not be changed.
 */
struct spdk_aio_ring {
	uint32_t	id;
	/* Number of event slots in the ring. */
	uint32_t	size;
	/* Index of the next event to consume. */
	uint32_t	head;
	/* Index one past the last event available. */
	uint32_t	tail;

	/* Compared against SPDK_AIO_RING_VERSION before the ring is read directly. */
	uint32_t	version;
	uint32_t	compat_features;
	/* Must be 0 for the ring to be read directly. */
	uint32_t	incompat_features;
	/* Byte offset from the start of the ring to the event array. */
	uint32_t	header_length;
};
85 :
/* Ring version for which completions are read straight out of the ring. */
#define SPDK_AIO_RING_VERSION	0xa10a10a1

/* Module hooks and helpers defined further down in this file. */
static int bdev_aio_initialize(void);
static void bdev_aio_fini(void);
static void aio_free_disk(struct file_disk *fdisk);

/* Every AIO bdev that is currently registered. */
static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);

/* Number of events requested from io_setup() and reaped per channel poll. */
#define SPDK_AIO_QUEUE_DEPTH	128
#define MAX_EVENTS_PER_POLL	32
95 :
96 : static int
97 0 : bdev_aio_get_ctx_size(void)
98 : {
99 0 : return sizeof(struct bdev_aio_task);
100 : }
101 :
102 : static struct spdk_bdev_module aio_if = {
103 : .name = "aio",
104 : .module_init = bdev_aio_initialize,
105 : .module_fini = bdev_aio_fini,
106 : .get_ctx_size = bdev_aio_get_ctx_size,
107 : };
108 :
109 0 : SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
110 :
111 : static int
112 0 : bdev_aio_open(struct file_disk *disk)
113 : {
114 : int fd;
115 0 : int io_flag = disk->readonly ? O_RDONLY : O_RDWR;
116 :
117 0 : fd = open(disk->filename, io_flag | O_DIRECT);
118 0 : if (fd < 0) {
119 : /* Try without O_DIRECT for non-disk files */
120 0 : fd = open(disk->filename, io_flag);
121 0 : if (fd < 0) {
122 0 : SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
123 : disk->filename, errno, spdk_strerror(errno));
124 0 : disk->fd = -1;
125 0 : return -1;
126 : }
127 : }
128 :
129 0 : disk->fd = fd;
130 :
131 0 : return 0;
132 : }
133 :
134 : static int
135 0 : bdev_aio_close(struct file_disk *disk)
136 : {
137 : int rc;
138 :
139 0 : if (disk->fd == -1) {
140 0 : return 0;
141 : }
142 :
143 0 : rc = close(disk->fd);
144 0 : if (rc < 0) {
145 0 : SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
146 : disk->fd, errno, spdk_strerror(errno));
147 0 : return -1;
148 : }
149 :
150 0 : disk->fd = -1;
151 :
152 0 : return 0;
153 : }
154 :
155 : #ifdef __FreeBSD__
156 : static int
157 : bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
158 : struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
159 : struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
160 : {
161 : struct aiocb *aiocb = &aio_task->aiocb;
162 : struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
163 :
164 : memset(aiocb, 0, sizeof(struct aiocb));
165 : aiocb->aio_fildes = fdisk->fd;
166 : aiocb->aio_iov = iov;
167 : aiocb->aio_iovcnt = iovcnt;
168 : aiocb->aio_offset = offset;
169 : aiocb->aio_sigevent.sigev_notify_kqueue = aio_ch->kqfd;
170 : aiocb->aio_sigevent.sigev_value.sival_ptr = aio_task;
171 : aiocb->aio_sigevent.sigev_notify = SIGEV_KEVENT;
172 :
173 : aio_task->len = nbytes;
174 : aio_task->ch = aio_ch;
175 :
176 : if (type == SPDK_BDEV_IO_TYPE_READ) {
177 : return aio_readv(aiocb);
178 : }
179 :
180 : return aio_writev(aiocb);
181 : }
182 : #else
183 : static int
184 0 : bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
185 : struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
186 : struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
187 : {
188 0 : struct iocb *iocb = &aio_task->iocb;
189 0 : struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
190 :
191 0 : if (type == SPDK_BDEV_IO_TYPE_READ) {
192 0 : io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
193 : } else {
194 0 : io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
195 : }
196 :
197 0 : if (aio_ch->group_ch->efd >= 0) {
198 0 : io_set_eventfd(iocb, aio_ch->group_ch->efd);
199 : }
200 0 : iocb->data = aio_task;
201 0 : aio_task->len = nbytes;
202 0 : aio_task->ch = aio_ch;
203 :
204 0 : return io_submit(aio_ch->io_ctx, 1, &iocb);
205 : }
206 : #endif
207 :
208 : static void
209 0 : bdev_aio_rw(enum spdk_bdev_io_type type, struct file_disk *fdisk,
210 : struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
211 : struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
212 : {
213 0 : struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
214 : int rc;
215 :
216 0 : if (type == SPDK_BDEV_IO_TYPE_READ) {
217 0 : SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
218 : iovcnt, nbytes, offset);
219 : } else {
220 0 : SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
221 : iovcnt, nbytes, offset);
222 : }
223 :
224 0 : rc = bdev_aio_submit_io(type, fdisk, ch, aio_task, iov, iovcnt, nbytes, offset);
225 0 : if (spdk_unlikely(rc < 0)) {
226 0 : if (rc == -EAGAIN) {
227 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
228 : } else {
229 0 : spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
230 0 : SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
231 : }
232 : } else {
233 0 : aio_ch->io_inflight++;
234 : }
235 0 : }
236 :
237 : static void
238 0 : bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
239 : {
240 0 : int rc = fsync(fdisk->fd);
241 :
242 0 : if (rc == 0) {
243 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
244 : } else {
245 0 : spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
246 : }
247 0 : }
248 :
249 : #ifndef __FreeBSD__
250 : static void
251 0 : bdev_aio_fallocate(struct spdk_bdev_io *bdev_io, int mode)
252 : {
253 0 : struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;
254 0 : struct bdev_aio_task *aio_task = (struct bdev_aio_task *)bdev_io->driver_ctx;
255 0 : uint64_t offset_bytes = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;
256 0 : uint64_t length_bytes = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
257 : int rc;
258 :
259 0 : if (!fdisk->fallocate) {
260 0 : spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -ENOTSUP);
261 0 : return;
262 : }
263 :
264 0 : rc = fallocate(fdisk->fd, mode, offset_bytes, length_bytes);
265 0 : if (rc == 0) {
266 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
267 : } else {
268 0 : spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
269 : }
270 : }
271 :
272 : static void
273 0 : bdev_aio_unmap(struct spdk_bdev_io *bdev_io)
274 : {
275 0 : int mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;
276 :
277 0 : bdev_aio_fallocate(bdev_io, mode);
278 0 : }
279 :
280 :
281 : static void
282 0 : bdev_aio_write_zeros(struct spdk_bdev_io *bdev_io)
283 : {
284 0 : int mode = FALLOC_FL_ZERO_RANGE;
285 :
286 0 : bdev_aio_fallocate(bdev_io, mode);
287 0 : }
288 : #endif
289 :
290 : static void
291 0 : bdev_aio_destruct_cb(void *io_device)
292 : {
293 0 : struct file_disk *fdisk = io_device;
294 0 : int rc = 0;
295 :
296 0 : TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
297 0 : rc = bdev_aio_close(fdisk);
298 0 : if (rc < 0) {
299 0 : SPDK_ERRLOG("bdev_aio_close() failed\n");
300 : }
301 0 : aio_free_disk(fdisk);
302 0 : }
303 :
304 : static int
305 0 : bdev_aio_destruct(void *ctx)
306 : {
307 0 : struct file_disk *fdisk = ctx;
308 :
309 0 : spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);
310 :
311 0 : return 0;
312 : }
313 :
314 : #ifdef __FreeBSD__
315 : static int
316 : bdev_user_io_getevents(int kq, unsigned int max, struct kevent *events)
317 : {
318 : struct timespec ts;
319 : int count;
320 :
321 : memset(events, 0, max * sizeof(struct kevent));
322 : memset(&ts, 0, sizeof(ts));
323 :
324 : count = kevent(kq, NULL, 0, events, max, &ts);
325 : if (count < 0) {
326 : SPDK_ERRLOG("failed to get kevents: %s.\n", spdk_strerror(errno));
327 : return -errno;
328 : }
329 :
330 : return count;
331 : }
332 :
333 : static int
334 : bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
335 : {
336 : int nr, i, res = 0;
337 : struct bdev_aio_task *aio_task;
338 : struct kevent events[SPDK_AIO_QUEUE_DEPTH];
339 :
340 : nr = bdev_user_io_getevents(io_ch->kqfd, SPDK_AIO_QUEUE_DEPTH, events);
341 : if (nr < 0) {
342 : return 0;
343 : }
344 :
345 : for (i = 0; i < nr; i++) {
346 : aio_task = events[i].udata;
347 : aio_task->ch->io_inflight--;
348 : if (aio_task == NULL) {
349 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
350 : break;
351 : } else if ((uint64_t)aio_return(&aio_task->aiocb) == aio_task->len) {
352 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
353 : } else {
354 : SPDK_ERRLOG("failed to complete aio: rc %d\n", aio_error(&aio_task->aiocb));
355 : res = aio_error(&aio_task->aiocb);
356 : if (res != 0) {
357 : spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
358 : } else {
359 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
360 : }
361 : }
362 : }
363 :
364 : return nr;
365 : }
366 : #else
367 : static int
368 0 : bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
369 : {
370 : uint32_t head, tail, count;
371 : struct spdk_aio_ring *ring;
372 0 : struct timespec timeout;
373 : struct io_event *kevents;
374 :
375 0 : ring = (struct spdk_aio_ring *)io_ctx;
376 :
377 0 : if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
378 0 : timeout.tv_sec = 0;
379 0 : timeout.tv_nsec = 0;
380 :
381 0 : return io_getevents(io_ctx, 0, max, uevents, &timeout);
382 : }
383 :
384 : /* Read the current state out of the ring */
385 0 : head = ring->head;
386 0 : tail = ring->tail;
387 :
388 : /* This memory barrier is required to prevent the loads above
389 : * from being re-ordered with stores to the events array
390 : * potentially occurring on other threads. */
391 0 : spdk_smp_rmb();
392 :
393 : /* Calculate how many items are in the circular ring */
394 0 : count = tail - head;
395 0 : if (tail < head) {
396 0 : count += ring->size;
397 : }
398 :
399 : /* Reduce the count to the limit provided by the user */
400 0 : count = spdk_min(max, count);
401 :
402 : /* Grab the memory location of the event array */
403 0 : kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);
404 :
405 : /* Copy the events out of the ring. */
406 0 : if ((head + count) <= ring->size) {
407 : /* Only one copy is required */
408 0 : memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
409 : } else {
410 0 : uint32_t first_part = ring->size - head;
411 : /* Two copies are required */
412 0 : memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
413 0 : memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
414 : }
415 :
416 : /* Update the head pointer. On x86, stores will not be reordered with older loads,
417 : * so the copies out of the event array will always be complete prior to this
418 : * update becoming visible. On other architectures this is not guaranteed, so
419 : * add a barrier. */
420 : #if defined(__i386__) || defined(__x86_64__)
421 0 : spdk_compiler_barrier();
422 : #else
423 : spdk_smp_mb();
424 : #endif
425 0 : ring->head = (head + count) % ring->size;
426 :
427 0 : return count;
428 : }
429 :
430 : static int
431 0 : bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
432 : {
433 0 : int nr, i, res = 0;
434 : struct bdev_aio_task *aio_task;
435 0 : struct io_event events[SPDK_AIO_QUEUE_DEPTH];
436 :
437 0 : nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
438 0 : if (nr < 0) {
439 0 : return 0;
440 : }
441 :
442 0 : for (i = 0; i < nr; i++) {
443 0 : aio_task = events[i].data;
444 0 : aio_task->ch->io_inflight--;
445 0 : if (events[i].res == aio_task->len) {
446 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
447 : } else {
448 : /* From aio_abi.h, io_event.res is defined __s64, negative errno
449 : * will be assigned to io_event.res for error situation.
450 : * But from libaio.h, io_event.res is defined unsigned long, so
451 : * convert it to signed value for error detection.
452 : */
453 0 : SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
454 0 : res = (int)events[i].res;
455 0 : if (res < 0) {
456 0 : spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
457 : } else {
458 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
459 : }
460 : }
461 : }
462 :
463 0 : return nr;
464 : }
465 : #endif
466 :
467 : static int
468 0 : bdev_aio_group_poll(void *arg)
469 : {
470 0 : struct bdev_aio_group_channel *group_ch = arg;
471 : struct bdev_aio_io_channel *io_ch;
472 0 : int nr = 0;
473 :
474 0 : TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
475 0 : nr += bdev_aio_io_channel_poll(io_ch);
476 : }
477 :
478 0 : return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
479 : }
480 :
481 : static int
482 0 : bdev_aio_group_interrupt(void *arg)
483 : {
484 0 : struct bdev_aio_group_channel *group_ch = arg;
485 : int rc;
486 0 : uint64_t num_events;
487 :
488 0 : assert(group_ch->efd >= 0);
489 :
490 : /* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH,
491 : * io_getevent should be called again to ensure all completed IO are processed.
492 : */
493 0 : rc = read(group_ch->efd, &num_events, sizeof(num_events));
494 0 : if (rc < 0) {
495 0 : SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
496 0 : return -errno;
497 : }
498 :
499 0 : if (num_events > SPDK_AIO_QUEUE_DEPTH) {
500 0 : num_events -= SPDK_AIO_QUEUE_DEPTH;
501 0 : rc = write(group_ch->efd, &num_events, sizeof(num_events));
502 0 : if (rc < 0) {
503 0 : SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
504 : }
505 : }
506 :
507 0 : return bdev_aio_group_poll(group_ch);
508 : }
509 :
510 : static void
511 0 : _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
512 : {
513 0 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
514 0 : struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
515 :
516 0 : if (aio_ch->io_inflight) {
517 0 : spdk_for_each_channel_continue(i, -1);
518 0 : return;
519 : }
520 :
521 0 : spdk_for_each_channel_continue(i, 0);
522 : }
523 :
524 : static int bdev_aio_reset_retry_timer(void *arg);
525 :
526 : static void
527 0 : _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
528 : {
529 0 : struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
530 :
531 0 : if (status == -1) {
532 0 : fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
533 0 : return;
534 : }
535 :
536 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
537 : }
538 :
539 : static int
540 0 : bdev_aio_reset_retry_timer(void *arg)
541 : {
542 0 : struct file_disk *fdisk = arg;
543 :
544 0 : if (fdisk->reset_retry_timer) {
545 0 : spdk_poller_unregister(&fdisk->reset_retry_timer);
546 : }
547 :
548 0 : spdk_for_each_channel(fdisk,
549 : _bdev_aio_get_io_inflight,
550 : fdisk,
551 : _bdev_aio_get_io_inflight_done);
552 :
553 0 : return SPDK_POLLER_BUSY;
554 : }
555 :
556 : static void
557 0 : bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
558 : {
559 0 : fdisk->reset_task = aio_task;
560 :
561 0 : bdev_aio_reset_retry_timer(fdisk);
562 0 : }
563 :
564 : static void
565 0 : bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
566 : bool success)
567 : {
568 0 : if (!success) {
569 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
570 0 : return;
571 : }
572 :
573 0 : switch (bdev_io->type) {
574 0 : case SPDK_BDEV_IO_TYPE_READ:
575 : case SPDK_BDEV_IO_TYPE_WRITE:
576 0 : bdev_aio_rw(bdev_io->type,
577 0 : (struct file_disk *)bdev_io->bdev->ctxt,
578 : ch,
579 0 : (struct bdev_aio_task *)bdev_io->driver_ctx,
580 : bdev_io->u.bdev.iovs,
581 : bdev_io->u.bdev.iovcnt,
582 0 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
583 0 : bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
584 0 : break;
585 0 : default:
586 0 : SPDK_ERRLOG("Wrong io type\n");
587 0 : break;
588 : }
589 : }
590 :
591 : static int
592 0 : _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
593 : {
594 0 : struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;
595 :
596 0 : switch (bdev_io->type) {
597 : /* Read and write operations must be performed on buffers aligned to
598 : * bdev->required_alignment. If user specified unaligned buffers,
599 : * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
600 0 : case SPDK_BDEV_IO_TYPE_READ:
601 0 : spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
602 0 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
603 0 : return 0;
604 0 : case SPDK_BDEV_IO_TYPE_WRITE:
605 0 : if (fdisk->readonly) {
606 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
607 : } else {
608 0 : spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
609 0 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
610 : }
611 0 : return 0;
612 :
613 0 : case SPDK_BDEV_IO_TYPE_FLUSH:
614 0 : bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
615 0 : (struct bdev_aio_task *)bdev_io->driver_ctx);
616 0 : return 0;
617 :
618 0 : case SPDK_BDEV_IO_TYPE_RESET:
619 0 : bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
620 0 : (struct bdev_aio_task *)bdev_io->driver_ctx);
621 0 : return 0;
622 :
623 : #ifndef __FreeBSD__
624 0 : case SPDK_BDEV_IO_TYPE_UNMAP:
625 0 : bdev_aio_unmap(bdev_io);
626 0 : return 0;
627 :
628 0 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
629 0 : bdev_aio_write_zeros(bdev_io);
630 0 : return 0;
631 : #endif
632 :
633 0 : default:
634 0 : return -1;
635 : }
636 : }
637 :
638 : static void
639 0 : bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
640 : {
641 0 : if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
642 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
643 : }
644 0 : }
645 :
646 : static bool
647 0 : bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
648 : {
649 0 : struct file_disk *fdisk = ctx;
650 :
651 0 : switch (io_type) {
652 0 : case SPDK_BDEV_IO_TYPE_READ:
653 : case SPDK_BDEV_IO_TYPE_WRITE:
654 : case SPDK_BDEV_IO_TYPE_FLUSH:
655 : case SPDK_BDEV_IO_TYPE_RESET:
656 0 : return true;
657 :
658 0 : case SPDK_BDEV_IO_TYPE_UNMAP:
659 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
660 0 : return fdisk->fallocate;
661 :
662 0 : default:
663 0 : return false;
664 : }
665 : }
666 :
667 : #ifdef __FreeBSD__
668 : static int
669 : bdev_aio_create_io(struct bdev_aio_io_channel *ch)
670 : {
671 : ch->kqfd = kqueue();
672 : if (ch->kqfd < 0) {
673 : SPDK_ERRLOG("async I/O context setup failure: %s.\n", spdk_strerror(errno));
674 : return -1;
675 : }
676 :
677 : return 0;
678 : }
679 :
680 : static void
681 : bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
682 : {
683 : close(ch->kqfd);
684 : }
685 : #else
686 : static int
687 0 : bdev_aio_create_io(struct bdev_aio_io_channel *ch)
688 : {
689 0 : if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
690 0 : SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n");
691 0 : SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n");
692 0 : return -1;
693 : }
694 :
695 0 : return 0;
696 : }
697 :
698 : static void
699 0 : bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
700 : {
701 0 : io_destroy(ch->io_ctx);
702 0 : }
703 : #endif
704 :
705 : static int
706 0 : bdev_aio_create_cb(void *io_device, void *ctx_buf)
707 : {
708 0 : struct bdev_aio_io_channel *ch = ctx_buf;
709 : int rc;
710 :
711 0 : rc = bdev_aio_create_io(ch);
712 0 : if (rc < 0) {
713 0 : return rc;
714 : }
715 :
716 0 : ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
717 0 : TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);
718 :
719 0 : return 0;
720 : }
721 :
722 : static void
723 0 : bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
724 : {
725 0 : struct bdev_aio_io_channel *ch = ctx_buf;
726 :
727 0 : bdev_aio_destroy_io(ch);
728 :
729 0 : assert(ch->group_ch);
730 0 : TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);
731 :
732 0 : spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
733 0 : }
734 :
/* bdev get_io_channel hook: the disk itself is the io_device key. */
static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *disk = ctx;

	return spdk_get_io_channel(disk);
}
742 :
743 :
744 : static int
745 0 : bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
746 : {
747 0 : struct file_disk *fdisk = ctx;
748 :
749 0 : spdk_json_write_named_object_begin(w, "aio");
750 :
751 0 : spdk_json_write_named_string(w, "filename", fdisk->filename);
752 :
753 0 : spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override);
754 :
755 0 : spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
756 :
757 0 : spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);
758 :
759 0 : spdk_json_write_object_end(w);
760 :
761 0 : return 0;
762 : }
763 :
764 : static void
765 0 : bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
766 : {
767 0 : struct file_disk *fdisk = bdev->ctxt;
768 :
769 0 : spdk_json_write_object_begin(w);
770 :
771 0 : spdk_json_write_named_string(w, "method", "bdev_aio_create");
772 :
773 0 : spdk_json_write_named_object_begin(w, "params");
774 0 : spdk_json_write_named_string(w, "name", bdev->name);
775 0 : if (fdisk->block_size_override) {
776 0 : spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
777 : }
778 0 : spdk_json_write_named_string(w, "filename", fdisk->filename);
779 0 : spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
780 0 : spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);
781 0 : spdk_json_write_object_end(w);
782 :
783 0 : spdk_json_write_object_end(w);
784 0 : }
785 :
786 : static const struct spdk_bdev_fn_table aio_fn_table = {
787 : .destruct = bdev_aio_destruct,
788 : .submit_request = bdev_aio_submit_request,
789 : .io_type_supported = bdev_aio_io_type_supported,
790 : .get_io_channel = bdev_aio_get_io_channel,
791 : .dump_info_json = bdev_aio_dump_info_json,
792 : .write_config_json = bdev_aio_write_json_config,
793 : };
794 :
795 : static void
796 0 : aio_free_disk(struct file_disk *fdisk)
797 : {
798 0 : if (fdisk == NULL) {
799 0 : return;
800 : }
801 0 : free(fdisk->filename);
802 0 : free(fdisk->disk.name);
803 0 : free(fdisk);
804 : }
805 :
806 : static int
807 0 : bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
808 : {
809 : int efd;
810 :
811 0 : efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
812 0 : if (efd < 0) {
813 0 : return -1;
814 : }
815 :
816 0 : ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
817 0 : if (ch->intr == NULL) {
818 0 : close(efd);
819 0 : return -1;
820 : }
821 0 : ch->efd = efd;
822 :
823 0 : return 0;
824 : }
825 :
826 : static void
827 0 : bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
828 : {
829 0 : spdk_interrupt_unregister(&ch->intr);
830 0 : close(ch->efd);
831 0 : ch->efd = -1;
832 0 : }
833 :
/* Interrupt-mode switch callback of the group poller; intentionally does nothing. */
static void
bdev_aio_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
{
}
839 :
840 : static int
841 0 : bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
842 : {
843 0 : struct bdev_aio_group_channel *ch = ctx_buf;
844 : int rc;
845 :
846 0 : TAILQ_INIT(&ch->io_ch_head);
847 : /* Initialize ch->efd to be invalid and unused. */
848 0 : ch->efd = -1;
849 0 : if (spdk_interrupt_mode_is_enabled()) {
850 0 : rc = bdev_aio_register_interrupt(ch);
851 0 : if (rc < 0) {
852 0 : SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n");
853 0 : return rc;
854 : }
855 : }
856 :
857 0 : ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
858 0 : spdk_poller_register_interrupt(ch->poller, bdev_aio_poller_set_interrupt_mode, NULL);
859 :
860 0 : return 0;
861 : }
862 :
863 : static void
864 0 : bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
865 : {
866 0 : struct bdev_aio_group_channel *ch = ctx_buf;
867 :
868 0 : if (!TAILQ_EMPTY(&ch->io_ch_head)) {
869 0 : SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n");
870 : }
871 :
872 0 : spdk_poller_unregister(&ch->poller);
873 0 : if (spdk_interrupt_mode_is_enabled()) {
874 0 : bdev_aio_unregister_interrupt(ch);
875 : }
876 0 : }
877 :
878 : int
879 0 : create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly,
880 : bool fallocate)
881 : {
882 : struct file_disk *fdisk;
883 : uint32_t detected_block_size;
884 : uint64_t disk_size;
885 : int rc;
886 :
887 : #ifdef __FreeBSD__
888 : if (fallocate) {
889 : SPDK_ERRLOG("Unable to support fallocate on this platform\n");
890 : return -ENOTSUP;
891 : }
892 : #endif
893 :
894 0 : fdisk = calloc(1, sizeof(*fdisk));
895 0 : if (!fdisk) {
896 0 : SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
897 0 : return -ENOMEM;
898 : }
899 0 : fdisk->readonly = readonly;
900 0 : fdisk->fallocate = fallocate;
901 :
902 0 : fdisk->filename = strdup(filename);
903 0 : if (!fdisk->filename) {
904 0 : rc = -ENOMEM;
905 0 : goto error_return;
906 : }
907 :
908 0 : if (bdev_aio_open(fdisk)) {
909 0 : SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
910 0 : rc = -errno;
911 0 : goto error_return;
912 : }
913 :
914 0 : disk_size = spdk_fd_get_size(fdisk->fd);
915 :
916 0 : fdisk->disk.name = strdup(name);
917 0 : if (!fdisk->disk.name) {
918 0 : rc = -ENOMEM;
919 0 : goto error_return;
920 : }
921 0 : fdisk->disk.product_name = "AIO disk";
922 0 : fdisk->disk.module = &aio_if;
923 :
924 0 : fdisk->disk.write_cache = 1;
925 :
926 0 : detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
927 0 : if (block_size == 0) {
928 : /* User did not specify block size - use autodetected block size. */
929 0 : if (detected_block_size == 0) {
930 0 : SPDK_ERRLOG("Block size could not be auto-detected\n");
931 0 : rc = -EINVAL;
932 0 : goto error_return;
933 : }
934 0 : fdisk->block_size_override = false;
935 0 : block_size = detected_block_size;
936 : } else {
937 0 : if (block_size < detected_block_size) {
938 0 : SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
939 : "auto-detected block size %" PRIu32 "\n",
940 : block_size, detected_block_size);
941 0 : rc = -EINVAL;
942 0 : goto error_return;
943 0 : } else if (detected_block_size != 0 && block_size != detected_block_size) {
944 0 : SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
945 : "auto-detected block size %" PRIu32 "\n",
946 : block_size, detected_block_size);
947 : }
948 0 : fdisk->block_size_override = true;
949 : }
950 :
951 0 : if (block_size < 512) {
952 0 : SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
953 0 : rc = -EINVAL;
954 0 : goto error_return;
955 : }
956 :
957 0 : if (!spdk_u32_is_pow2(block_size)) {
958 0 : SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
959 0 : rc = -EINVAL;
960 0 : goto error_return;
961 : }
962 :
963 0 : fdisk->disk.blocklen = block_size;
964 0 : if (fdisk->block_size_override && detected_block_size) {
965 0 : fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
966 : } else {
967 0 : fdisk->disk.required_alignment = spdk_u32log2(block_size);
968 : }
969 :
970 0 : if (disk_size % fdisk->disk.blocklen != 0) {
971 0 : SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
972 : disk_size, fdisk->disk.blocklen);
973 0 : rc = -EINVAL;
974 0 : goto error_return;
975 : }
976 :
977 0 : fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
978 0 : fdisk->disk.ctxt = fdisk;
979 :
980 0 : fdisk->disk.fn_table = &aio_fn_table;
981 :
982 0 : spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
983 : sizeof(struct bdev_aio_io_channel),
984 0 : fdisk->disk.name);
985 0 : rc = spdk_bdev_register(&fdisk->disk);
986 0 : if (rc) {
987 0 : spdk_io_device_unregister(fdisk, NULL);
988 0 : goto error_return;
989 : }
990 :
991 0 : TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
992 0 : return 0;
993 :
994 0 : error_return:
995 0 : bdev_aio_close(fdisk);
996 0 : aio_free_disk(fdisk);
997 0 : return rc;
998 : }
999 :
1000 : static void
1001 0 : dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
1002 : {
1003 0 : }
1004 :
1005 : int
1006 0 : bdev_aio_rescan(const char *name)
1007 : {
1008 0 : struct spdk_bdev_desc *desc;
1009 : struct spdk_bdev *bdev;
1010 : struct file_disk *fdisk;
1011 : uint64_t disk_size, blockcnt;
1012 : int rc;
1013 :
1014 0 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
1015 0 : if (rc != 0) {
1016 0 : return rc;
1017 : }
1018 :
1019 0 : bdev = spdk_bdev_desc_get_bdev(desc);
1020 0 : if (bdev->module != &aio_if) {
1021 0 : rc = -ENODEV;
1022 0 : goto exit;
1023 : }
1024 :
1025 0 : fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk);
1026 0 : disk_size = spdk_fd_get_size(fdisk->fd);
1027 0 : blockcnt = disk_size / bdev->blocklen;
1028 :
1029 0 : if (bdev->blockcnt != blockcnt) {
1030 0 : SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %"
1031 : PRIu64 "\n",
1032 : fdisk->filename,
1033 : bdev->blockcnt,
1034 : blockcnt);
1035 0 : rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
1036 0 : if (rc != 0) {
1037 0 : SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n",
1038 : fdisk->filename, rc);
1039 0 : goto exit;
1040 : }
1041 : }
1042 :
1043 0 : exit:
1044 0 : spdk_bdev_close(desc);
1045 0 : return rc;
1046 : }
1047 :
1048 : struct delete_aio_bdev_ctx {
1049 : delete_aio_bdev_complete cb_fn;
1050 : void *cb_arg;
1051 : };
1052 :
1053 : static void
1054 0 : aio_bdev_unregister_cb(void *arg, int bdeverrno)
1055 : {
1056 0 : struct delete_aio_bdev_ctx *ctx = arg;
1057 :
1058 0 : ctx->cb_fn(ctx->cb_arg, bdeverrno);
1059 0 : free(ctx);
1060 0 : }
1061 :
1062 : void
1063 0 : bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg)
1064 : {
1065 : struct delete_aio_bdev_ctx *ctx;
1066 : int rc;
1067 :
1068 0 : ctx = calloc(1, sizeof(*ctx));
1069 0 : if (ctx == NULL) {
1070 0 : cb_fn(cb_arg, -ENOMEM);
1071 0 : return;
1072 : }
1073 :
1074 0 : ctx->cb_fn = cb_fn;
1075 0 : ctx->cb_arg = cb_arg;
1076 0 : rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx);
1077 0 : if (rc != 0) {
1078 0 : aio_bdev_unregister_cb(ctx, rc);
1079 : }
1080 : }
1081 :
1082 : static int
1083 0 : bdev_aio_initialize(void)
1084 : {
1085 0 : spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
1086 : sizeof(struct bdev_aio_group_channel), "aio_module");
1087 :
1088 0 : return 0;
1089 : }
1090 :
1091 : static void
1092 0 : bdev_aio_fini(void)
1093 : {
1094 0 : spdk_io_device_unregister(&aio_if, NULL);
1095 0 : }
1096 :
1097 0 : SPDK_LOG_REGISTER_COMPONENT(aio)
|