Branch data Line data Source code
1 : : /* SPDX-License-Identifier: BSD-3-Clause
2 : : * Copyright (C) 2019 Intel Corporation.
3 : : * All rights reserved.
4 : : * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : : */
6 : :
7 : : #include "bdev_raid.h"
8 : :
9 : : #include "spdk/env.h"
10 : : #include "spdk/thread.h"
11 : : #include "spdk/string.h"
12 : : #include "spdk/util.h"
13 : :
14 : : #include "spdk/log.h"
15 : :
16 : : /*
17 : : * brief:
18 : : * raid0_bdev_io_completion function is called by lower layers to notify raid
19 : : * module that particular bdev_io is completed.
20 : : * params:
21 : : * bdev_io - pointer to bdev io submitted to lower layers, like child io
22 : : * success - bdev_io status
23 : : * cb_arg - function callback context (parent raid_bdev_io)
24 : : * returns:
25 : : * none
26 : : */
27 : : static void
28 : 4417147 : raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
29 : : {
30 : 4417147 : struct raid_bdev_io *raid_io = cb_arg;
31 : : int rc;
32 : :
33 [ + + ]: 4417147 : if (success) {
34 [ + + + + : 4417117 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
+ + + - ]
35 : : spdk_bdev_get_dif_type(bdev_io->bdev) != SPDK_DIF_DISABLE &&
36 : : bdev_io->bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
37 : :
38 : 6 : rc = raid_bdev_verify_dix_reftag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
39 : : bdev_io->u.bdev.md_buf, bdev_io->u.bdev.num_blocks, bdev_io->bdev,
40 : 6 : bdev_io->u.bdev.offset_blocks);
41 [ - + ]: 6 : if (rc != 0) {
42 : 0 : SPDK_ERRLOG("Reftag verify failed.\n");
43 : 0 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
44 : 0 : return;
45 : : }
46 : : }
47 : :
48 : 4417117 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
49 : : } else {
50 : 30 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
51 : : }
52 : :
53 : 4417147 : spdk_bdev_free_io(bdev_io);
54 : : }
55 : :
static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

/*
 * Adapter with a void * argument: raid0_submit_rw_request hands this to
 * raid_bdev_queue_io_wait so that the same raid_io can be submitted again.
 */
static void
_raid0_submit_rw_request(void *_raid_io)
{
	raid0_submit_rw_request((struct raid_bdev_io *)_raid_io);
}
65 : :
66 : : /*
67 : : * brief:
68 : : * raid0_submit_rw_request function is used to submit I/O to the correct
69 : : * member disk for raid0 bdevs.
70 : : * params:
71 : : * raid_io
72 : : * returns:
73 : : * none
74 : : */
75 : : static void
76 : 4417147 : raid0_submit_rw_request(struct raid_bdev_io *raid_io)
77 : : {
78 : 4417147 : struct spdk_bdev_ext_io_opts io_opts = {};
79 : 4417147 : struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
80 : 4417147 : struct raid_bdev *raid_bdev = raid_io->raid_bdev;
81 : : uint64_t pd_strip;
82 : : uint32_t offset_in_strip;
83 : : uint64_t pd_lba;
84 : : uint64_t pd_blocks;
85 : : uint8_t pd_idx;
86 : 4417147 : int ret = 0;
87 : : uint64_t start_strip;
88 : : uint64_t end_strip;
89 : : struct raid_base_bdev_info *base_info;
90 : : struct spdk_io_channel *base_ch;
91 : :
92 [ - + ]: 4417147 : start_strip = raid_io->offset_blocks >> raid_bdev->strip_size_shift;
93 [ - + ]: 4417147 : end_strip = (raid_io->offset_blocks + raid_io->num_blocks - 1) >>
94 : 4417147 : raid_bdev->strip_size_shift;
95 [ - + - - ]: 4417147 : if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
96 : 0 : assert(false);
97 : : SPDK_ERRLOG("I/O spans strip boundary!\n");
98 : : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
99 : 0 : return;
100 : : }
101 : :
102 [ - + ]: 4417147 : pd_strip = start_strip / raid_bdev->num_base_bdevs;
103 [ - + ]: 4417147 : pd_idx = start_strip % raid_bdev->num_base_bdevs;
104 : 4417147 : offset_in_strip = raid_io->offset_blocks & (raid_bdev->strip_size - 1);
105 [ - + ]: 4417147 : pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
106 : 4417147 : pd_blocks = raid_io->num_blocks;
107 : 4417147 : base_info = &raid_bdev->base_bdev_info[pd_idx];
108 [ - + ]: 4417147 : if (base_info->desc == NULL) {
109 : 0 : SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
110 : 0 : assert(0);
111 : : }
112 : :
113 : : /*
114 : : * Submit child io to bdev layer with using base bdev descriptors, base
115 : : * bdev lba, base bdev child io length in blocks, buffer, completion
116 : : * function and function callback context
117 : : */
118 [ - + ]: 4417147 : assert(raid_ch != NULL);
119 : 4417147 : base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);
120 : :
121 : 4417147 : io_opts.size = sizeof(io_opts);
122 : 4417147 : io_opts.memory_domain = raid_io->memory_domain;
123 : 4417147 : io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
124 : 4417147 : io_opts.metadata = raid_io->md_buf;
125 : :
126 [ + + ]: 4417147 : if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
127 : 1448630 : ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
128 : : raid_io->iovs, raid_io->iovcnt,
129 : : pd_lba, pd_blocks, raid0_bdev_io_completion,
130 : : raid_io, &io_opts);
131 [ + - ]: 2968517 : } else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
132 : 2968517 : struct spdk_bdev *bdev = &base_info->raid_bdev->bdev;
133 : :
134 [ + + + - ]: 2968517 : if (spdk_unlikely(spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE &&
135 : : bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
136 : 9 : ret = raid_bdev_verify_dix_reftag(raid_io->iovs, raid_io->iovcnt, io_opts.metadata,
137 : 9 : pd_blocks, bdev, raid_io->offset_blocks);
138 [ - + ]: 9 : if (ret != 0) {
139 : 0 : SPDK_ERRLOG("bdev io submit error due to DIX verify failure\n");
140 : 0 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
141 : 0 : return;
142 : : }
143 : : }
144 : :
145 : 2968517 : ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
146 : : raid_io->iovs, raid_io->iovcnt,
147 : : pd_lba, pd_blocks, raid0_bdev_io_completion,
148 : : raid_io, &io_opts);
149 : : } else {
150 : 0 : SPDK_ERRLOG("Recvd not supported io type %u\n", raid_io->type);
151 : 0 : assert(0);
152 : : }
153 : :
154 [ - + ]: 4417147 : if (ret == -ENOMEM) {
155 : 0 : raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
156 : : base_ch, _raid0_submit_rw_request);
157 [ - + ]: 4417147 : } else if (ret != 0) {
158 : 0 : SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
159 : 0 : assert(false);
160 : : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
161 : : }
162 : : }
163 : :
/*
 * raid0 IO range: how a block range on the raid bdev maps onto the base
 * bdevs. Filled in by _raid0_get_io_range and consumed by
 * _raid0_split_io_range.
 */
struct raid_bdev_io_range {
	/* Strip size, in the same units as the I/O offset (blocks) */
	uint64_t	strip_size;
	/* Per-disk index of the strip that holds the first block */
	uint64_t	start_strip_in_disk;
	/* Per-disk index of the strip that holds the last block */
	uint64_t	end_strip_in_disk;
	/* Offset of the first block inside its strip */
	uint64_t	start_offset_in_strip;
	/* Offset of the last block inside its strip (inclusive) */
	uint64_t	end_offset_in_strip;
	/* Index of the base bdev that holds the first strip */
	uint8_t		start_disk;
	/* Index of the base bdev that holds the last strip */
	uint8_t		end_disk;
	/* Number of base bdevs touched, capped at the number of base bdevs */
	uint8_t		n_disks_involved;
};
175 : :
176 : : static inline void
177 : 540638 : _raid0_get_io_range(struct raid_bdev_io_range *io_range,
178 : : uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
179 : : uint64_t offset_blocks, uint64_t num_blocks)
180 : : {
181 : : uint64_t start_strip;
182 : : uint64_t end_strip;
183 : : uint64_t total_blocks;
184 : :
185 : 540638 : io_range->strip_size = strip_size;
186 : 540638 : total_blocks = offset_blocks + num_blocks - (num_blocks > 0);
187 : :
188 : : /* The start and end strip index in raid0 bdev scope */
189 [ - + ]: 540638 : start_strip = offset_blocks >> strip_size_shift;
190 [ - + ]: 540638 : end_strip = total_blocks >> strip_size_shift;
191 [ - + ]: 540638 : io_range->start_strip_in_disk = start_strip / num_base_bdevs;
192 [ - + ]: 540638 : io_range->end_strip_in_disk = end_strip / num_base_bdevs;
193 : :
194 : : /* The first strip may have unaligned start LBA offset.
195 : : * The end strip may have unaligned end LBA offset.
196 : : * Strips between them certainly have aligned offset and length to boundaries.
197 : : */
198 [ - + ]: 540638 : io_range->start_offset_in_strip = offset_blocks % strip_size;
199 [ - + ]: 540638 : io_range->end_offset_in_strip = total_blocks % strip_size;
200 : :
201 : : /* The base bdev indexes in which start and end strips are located */
202 [ - + ]: 540638 : io_range->start_disk = start_strip % num_base_bdevs;
203 [ - + ]: 540638 : io_range->end_disk = end_strip % num_base_bdevs;
204 : :
205 : : /* Calculate how many base_bdevs are involved in io operation.
206 : : * Number of base bdevs involved is between 1 and num_base_bdevs.
207 : : * It will be 1 if the first strip and last strip are the same one.
208 : : */
209 : 540638 : io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
210 : 540638 : }
211 : :
212 : : static inline void
213 : 641133 : _raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
214 : : uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
215 : : {
216 : : uint64_t n_strips_in_disk;
217 : : uint64_t start_offset_in_disk;
218 : : uint64_t end_offset_in_disk;
219 : : uint64_t offset_in_disk;
220 : : uint64_t nblocks_in_disk;
221 : : uint64_t start_strip_in_disk;
222 : : uint64_t end_strip_in_disk;
223 : :
224 : 641133 : start_strip_in_disk = io_range->start_strip_in_disk;
225 [ + + ]: 641133 : if (disk_idx < io_range->start_disk) {
226 : 44280 : start_strip_in_disk += 1;
227 : : }
228 : :
229 : 641133 : end_strip_in_disk = io_range->end_strip_in_disk;
230 [ + + ]: 641133 : if (disk_idx > io_range->end_disk) {
231 : 20902 : end_strip_in_disk -= 1;
232 : : }
233 : :
234 [ - + ]: 641133 : assert(end_strip_in_disk >= start_strip_in_disk);
235 : 641133 : n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
236 : :
237 [ + + ]: 641133 : if (disk_idx == io_range->start_disk) {
238 : 540638 : start_offset_in_disk = io_range->start_offset_in_strip;
239 : : } else {
240 : 100495 : start_offset_in_disk = 0;
241 : : }
242 : :
243 [ + + ]: 641133 : if (disk_idx == io_range->end_disk) {
244 : 540638 : end_offset_in_disk = io_range->end_offset_in_strip;
245 : : } else {
246 : 100495 : end_offset_in_disk = io_range->strip_size - 1;
247 : : }
248 : :
249 : 641133 : offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
250 : 641133 : nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
251 : 641133 : + end_offset_in_disk - start_offset_in_disk + 1;
252 : :
253 [ - + - + ]: 641133 : SPDK_DEBUGLOG(bdev_raid0,
254 : : "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
255 : : ").\n",
256 : : io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);
257 : :
258 : 641133 : *_offset_in_disk = offset_in_disk;
259 : 641133 : *_nblocks_in_disk = nblocks_in_disk;
260 : 641133 : }
261 : :
static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

/*
 * Adapter with a void * argument: raid0_submit_null_payload_request hands
 * this to raid_bdev_queue_io_wait so that submission of the same raid_io can
 * be resumed.
 */
static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	raid0_submit_null_payload_request((struct raid_bdev_io *)_raid_io);
}
271 : :
272 : : static void
273 : 641133 : raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
274 : : {
275 : 641133 : struct raid_bdev_io *raid_io = cb_arg;
276 : :
277 [ + - ]: 641133 : raid_bdev_io_complete_part(raid_io, 1, success ?
278 : : SPDK_BDEV_IO_STATUS_SUCCESS :
279 : : SPDK_BDEV_IO_STATUS_FAILED);
280 : :
281 : 641133 : spdk_bdev_free_io(bdev_io);
282 : 641133 : }
283 : :
284 : : /*
285 : : * brief:
286 : : * raid0_submit_null_payload_request function submits the next batch of
287 : : * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
288 : : * it will submit as many as possible unless one base io request fails with -ENOMEM,
289 : : * in which case it will queue itself for later submission.
290 : : * params:
291 : : * bdev_io - pointer to parent bdev_io on raid bdev device
292 : : * returns:
293 : : * none
294 : : */
295 : : static void
296 : 540638 : raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
297 : : {
298 : : struct raid_bdev *raid_bdev;
299 : 460617 : struct raid_bdev_io_range io_range;
300 : : int ret;
301 : : struct raid_base_bdev_info *base_info;
302 : : struct spdk_io_channel *base_ch;
303 : :
304 : 540638 : raid_bdev = raid_io->raid_bdev;
305 : :
306 : 831569 : _raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
307 : 540638 : raid_bdev->strip_size, raid_bdev->strip_size_shift,
308 : : raid_io->offset_blocks, raid_io->num_blocks);
309 : :
310 [ + - ]: 540638 : if (raid_io->base_bdev_io_remaining == 0) {
311 : 540638 : raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
312 : : }
313 : :
314 [ + + ]: 1181771 : while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
315 : : uint8_t disk_idx;
316 : 561063 : uint64_t offset_in_disk;
317 : 561063 : uint64_t nblocks_in_disk;
318 : :
319 : : /* base_bdev is started from start_disk to end_disk.
320 : : * It is possible that index of start_disk is larger than end_disk's.
321 : : */
322 [ - + ]: 641133 : disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
323 : 641133 : base_info = &raid_bdev->base_bdev_info[disk_idx];
324 : 641133 : base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, disk_idx);
325 : :
326 : 641133 : _raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
327 : :
328 [ + - - ]: 641133 : switch (raid_io->type) {
329 : 641133 : case SPDK_BDEV_IO_TYPE_UNMAP:
330 : 641133 : ret = raid_bdev_unmap_blocks(base_info, base_ch,
331 : : offset_in_disk, nblocks_in_disk,
332 : : raid0_base_io_complete, raid_io);
333 : 641133 : break;
334 : :
335 : 0 : case SPDK_BDEV_IO_TYPE_FLUSH:
336 : 0 : ret = raid_bdev_flush_blocks(base_info, base_ch,
337 : : offset_in_disk, nblocks_in_disk,
338 : : raid0_base_io_complete, raid_io);
339 : 0 : break;
340 : :
341 : 0 : default:
342 : 0 : SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", raid_io->type);
343 : 0 : assert(false);
344 : : ret = -EIO;
345 : : }
346 : :
347 [ + - ]: 641133 : if (ret == 0) {
348 : 641133 : raid_io->base_bdev_io_submitted++;
349 [ # # ]: 0 : } else if (ret == -ENOMEM) {
350 : 0 : raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
351 : : base_ch, _raid0_submit_null_payload_request);
352 : 0 : return;
353 : : } else {
354 : 0 : SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
355 : 0 : assert(false);
356 : : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
357 : : return;
358 : : }
359 : : }
360 : : }
361 : :
362 : : static int
363 : 173 : raid0_start(struct raid_bdev *raid_bdev)
364 : : {
365 : 173 : uint64_t min_blockcnt = UINT64_MAX;
366 : : uint64_t base_bdev_data_size;
367 : : struct raid_base_bdev_info *base_info;
368 : :
369 [ + + ]: 1335 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
370 : : /* Calculate minimum block count from all base bdevs */
371 : 1162 : min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
372 : : }
373 : :
374 [ - + - + ]: 173 : base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
375 : :
376 [ + + ]: 1335 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
377 : 1162 : base_info->data_size = base_bdev_data_size;
378 : : }
379 : :
380 : : /*
381 : : * Take the minimum block count based approach where total block count
382 : : * of raid bdev is the number of base bdev times the minimum block count
383 : : * of any base bdev.
384 : : */
385 [ - + - + ]: 173 : SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
386 : : min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
387 : :
388 : 173 : raid_bdev->bdev.blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;
389 : :
390 [ + - ]: 173 : if (raid_bdev->num_base_bdevs > 1) {
391 : 173 : raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
392 : 173 : raid_bdev->bdev.split_on_optimal_io_boundary = true;
393 : : } else {
394 : : /* Do not need to split reads/writes on single bdev RAID modules. */
395 : 0 : raid_bdev->bdev.optimal_io_boundary = 0;
396 : 0 : raid_bdev->bdev.split_on_optimal_io_boundary = false;
397 : : }
398 : :
399 : 173 : return 0;
400 : : }
401 : :
402 : : static bool
403 : 8 : raid0_resize(struct raid_bdev *raid_bdev)
404 : : {
405 : : uint64_t blockcnt;
406 : : int rc;
407 : 8 : uint64_t min_blockcnt = UINT64_MAX;
408 : : struct raid_base_bdev_info *base_info;
409 : : uint64_t base_bdev_data_size;
410 : :
411 [ + + ]: 24 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
412 : 16 : struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);
413 : :
414 : 16 : min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
415 : : }
416 : :
417 [ - + - + ]: 8 : base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
418 : 8 : blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;
419 : :
420 [ + + ]: 8 : if (blockcnt == raid_bdev->bdev.blockcnt) {
421 : 4 : return false;
422 : : }
423 : :
424 : 4 : rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
425 [ - + ]: 4 : if (rc != 0) {
426 : 0 : SPDK_ERRLOG("Failed to notify blockcount change\n");
427 : 0 : return false;
428 : : }
429 : :
430 [ + + ]: 12 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
431 : 8 : base_info->data_size = base_bdev_data_size;
432 : : }
433 : :
434 : 4 : return true;
435 : : }
436 : :
437 : : static struct raid_bdev_module g_raid0_module = {
438 : : .level = RAID0,
439 : : .base_bdevs_min = 1,
440 : : .memory_domains_supported = true,
441 : : .dif_supported = true,
442 : : .start = raid0_start,
443 : : .submit_rw_request = raid0_submit_rw_request,
444 : : .submit_null_payload_request = raid0_submit_null_payload_request,
445 : : .resize = raid0_resize,
446 : : };
447 : 2148 : RAID_MODULE_REGISTER(&g_raid0_module)
448 : :
449 : 2148 : SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)
|