Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2017 Intel Corporation.
3 : * All rights reserved.
4 : */
5 :
6 : #include "spdk/stdinc.h"
7 :
8 : #include "env_internal.h"
9 : #include "pci_dpdk.h"
10 :
11 : #include <rte_config.h>
12 : #include <rte_memory.h>
13 : #include <rte_eal_memconfig.h>
14 : #include <rte_dev.h>
15 : #include <rte_pci.h>
16 :
17 : #include "spdk_internal/assert.h"
18 :
19 : #include "spdk/assert.h"
20 : #include "spdk/likely.h"
21 : #include "spdk/queue.h"
22 : #include "spdk/util.h"
23 : #include "spdk/memory.h"
24 : #include "spdk/env_dpdk.h"
25 : #include "spdk/log.h"
26 :
27 : #ifdef __linux__
28 : #include <linux/version.h>
29 : #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 : #include <linux/vfio.h>
31 : #include <rte_vfio.h>
32 :
33 : struct spdk_vfio_dma_map {
34 : struct vfio_iommu_type1_dma_map map;
35 : TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
36 : };
37 :
38 : struct vfio_cfg {
39 : int fd;
40 : bool enabled;
41 : bool noiommu_enabled;
42 : unsigned device_ref;
43 : TAILQ_HEAD(, spdk_vfio_dma_map) maps;
44 : pthread_mutex_t mutex;
45 : };
46 :
47 : static struct vfio_cfg g_vfio = {
48 : .fd = -1,
49 : .enabled = false,
50 : .noiommu_enabled = false,
51 : .device_ref = 0,
52 : .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
53 : .mutex = PTHREAD_MUTEX_INITIALIZER
54 : };
55 : #endif
56 : #endif
57 :
58 : #if DEBUG
59 : #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
60 : #else
61 : #define DEBUG_PRINT(...)
62 : #endif
63 :
64 : struct map_page_cfg {
65 : uint64_t shift;
66 : uint64_t size;
67 : uint64_t mask;
68 : uint64_t num_pages_per_gb;
69 : };
70 :
71 : /**
72 : * g_map_page_cfg cannot be static because it is used in the inline function
73 : * spdk_mem_map_translate.
74 : */
75 : struct map_page_cfg g_map_page_cfg = {
76 : .shift = SHIFT_2MB,
77 : .size = VALUE_2MB,
78 : .mask = MASK_2MB,
79 : .num_pages_per_gb = 1UL << (SHIFT_1GB - SHIFT_2MB),
80 : };
81 :
82 : #define MAP_PAGE_SHIFT (g_map_page_cfg.shift)
83 : #define MAP_PAGE_SIZE (g_map_page_cfg.size)
84 : #define MAP_PAGE_MASK (g_map_page_cfg.mask)
85 : #define MAP_NUM_PAGES_PER_GB (g_map_page_cfg.num_pages_per_gb)
86 :
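 : /* Decompose a virtual page frame number (vaddr >> MAP_PAGE_SHIFT) into the
 : * top-level index (which 1GB region of the 256TB space) and the second-level
 : * index (which page within that 1GB region).
 : */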
87 : #define MAP_256TB_IDX(vfn_page) ((vfn_page) >> (SHIFT_1GB - MAP_PAGE_SHIFT))
88 : #define MAP_1GB_IDX(vfn_page) ((vfn_page) & ((1ULL << (SHIFT_1GB - MAP_PAGE_SHIFT)) - 1))
89 : #define MAP_PAGE_OFFSET(ptr) (((uintptr_t)(ptr)) & MAP_PAGE_MASK)
90 :
91 : /* Page is registered */
92 : #define REG_MAP_REGISTERED (1ULL << 62)
93 :
94 : /* A notification region barrier. The page translation entry that's marked
95 : * with this flag must be unregistered separately. This allows contiguous
96 : * regions to be unregistered in the same chunks they were registered.
97 : */
98 : #define REG_MAP_NOTIFY_START (1ULL << 63)
99 :
100 : /* Translation of a single page. */
101 : struct map_page {
102 : uint64_t translation;
103 : };
104 :
105 : /* Second-level map table indexed by bits [page_shift..29] of the virtual address.
106 : * Each entry contains the address translation, or the map's default
107 : * translation for pages whose translation hasn't been set yet.
108 : */
109 : struct map_1gb {
110 : struct map_page map[0];
111 : /**
112 : * Page table space.
113 : * Do not put any fields after this!
114 : */
115 : };
116 :
117 : #define MAP_SIZE_OF_MAP_1GB (sizeof(struct map_1gb) + MAP_NUM_PAGES_PER_GB * sizeof(struct map_page))
118 :
119 : /* Top-level map table indexed by bits [30..47] of the virtual address.
120 : * Each entry points to a second-level map table or NULL.
121 : */
122 : struct map_256tb {
123 : struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
124 : };
125 :
126 : /* Page-granularity memory address translation */
127 : struct spdk_mem_map {
128 : struct map_256tb map_256tb;
129 : pthread_mutex_t mutex;
130 : uint64_t default_translation;
131 : struct spdk_mem_map_ops ops;
132 : void *cb_ctx;
133 : TAILQ_ENTRY(spdk_mem_map) tailq;
134 : };
135 :
136 : /* Registrations map. The 64-bit translations are bit fields with the
137 : * following layout (starting with the low bits):
138 : * 0 - 61 : reserved
139 : * 62 - 63 : flags
140 : */
141 : static struct spdk_mem_map *g_mem_reg_map;
142 : static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
143 : TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
144 : static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
145 :
146 : static bool g_legacy_mem;
147 : static bool g_huge_pages = true;
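 : /* g_legacy_mem mirrors DPDK's legacy memory mode (static hugepage allocation,
 : * no memory hotplug events); g_huge_pages is cleared via mem_disable_huge_pages()
 : * when the caller opts out of hugepage-backed memory.
 : */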
148 :
149 : /*
150 : * Walk the currently registered memory via the main memory registration map
151 : * and call the new map's notify callback for each virtually contiguous region.
152 : */
153 : static int
154 0 : mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
155 : {
156 : size_t idx_256tb;
157 : uint64_t idx_1gb;
158 0 : uint64_t contig_start = UINT64_MAX;
159 0 : uint64_t contig_end = UINT64_MAX;
160 : struct map_1gb *map_1gb;
161 : int rc;
162 :
163 0 : if (!g_mem_reg_map) {
164 0 : return -EINVAL;
165 : }
166 :
167 : /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
168 0 : pthread_mutex_lock(&g_mem_reg_map->mutex);
169 :
170 0 : for (idx_256tb = 0;
171 0 : idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
172 0 : idx_256tb++) {
173 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
174 :
175 0 : if (!map_1gb) {
176 0 : if (contig_start != UINT64_MAX) {
177 : /* End of a virtually contiguous range */
178 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
179 0 : (void *)contig_start,
180 0 : contig_end - contig_start + MAP_PAGE_SIZE);
181 : /* Don't bother handling unregister failures. It can't be any worse */
182 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
183 0 : goto err_unregister;
184 : }
185 0 : }
186 0 : contig_start = UINT64_MAX;
187 0 : continue;
188 : }
189 :
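 :         /* Scan this 1GB region page by page, coalescing registered pages into
 :          * virtually contiguous runs. A page flagged REG_MAP_NOTIFY_START begins
 :          * a new registration, so it also terminates the current run.
 :          */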
190 0 : for (idx_1gb = 0; idx_1gb < MAP_NUM_PAGES_PER_GB; idx_1gb++) {
191 0 : if ((map_1gb->map[idx_1gb].translation & REG_MAP_REGISTERED) &&
192 0 : (contig_start == UINT64_MAX ||
193 0 : (map_1gb->map[idx_1gb].translation & REG_MAP_NOTIFY_START) == 0)) {
194 : /* Rebuild the virtual address from the indexes */
195 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << MAP_PAGE_SHIFT);
196 :
197 0 : if (contig_start == UINT64_MAX) {
198 0 : contig_start = vaddr;
199 0 : }
200 :
201 0 : contig_end = vaddr;
202 0 : } else {
203 0 : if (contig_start != UINT64_MAX) {
204 : /* End of a virtually contiguous range */
205 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
206 0 : (void *)contig_start,
207 0 : contig_end - contig_start + MAP_PAGE_SIZE);
208 : /* Don't bother handling unregister failures. It can't be any worse */
209 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
210 0 : goto err_unregister;
211 : }
212 :
213 : /* This page might be part of a neighbouring region, so process
214 : * it again; the for loop will increment idx_1gb right away.
215 : */
216 0 : idx_1gb--;
217 0 : }
218 0 : contig_start = UINT64_MAX;
219 : }
220 0 : }
221 0 : }
222 :
223 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
224 0 : return 0;
225 :
226 : err_unregister:
227 : /* Unwind to the first empty translation so we don't unregister
228 : * a region that just failed to register.
229 : */
230 0 : idx_256tb = MAP_256TB_IDX((contig_start >> MAP_PAGE_SHIFT) - 1);
231 0 : idx_1gb = MAP_1GB_IDX((contig_start >> MAP_PAGE_SHIFT) - 1);
232 0 : contig_start = UINT64_MAX;
233 0 : contig_end = UINT64_MAX;
234 :
235 : /* Unregister any memory we managed to register before the failure */
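 : /* Walk the map backwards from the failure point. The unsigned indexes wrap
 : * around past zero (to SIZE_MAX / UINT64_MAX), which terminates the loops.
 : */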
236 0 : for (; idx_256tb < SIZE_MAX; idx_256tb--) {
237 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
238 :
239 0 : if (!map_1gb) {
240 0 : if (contig_end != UINT64_MAX) {
241 : /* End of a virtually contiguous range */
242 0 : map->ops.notify_cb(map->cb_ctx, map,
243 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
244 0 : (void *)contig_start,
245 0 : contig_end - contig_start + MAP_PAGE_SIZE);
246 0 : }
247 0 : contig_end = UINT64_MAX;
248 0 : continue;
249 : }
250 :
251 0 : for (; idx_1gb < UINT64_MAX; idx_1gb--) {
252 : /* Rebuild the virtual address from the indexes */
253 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << MAP_PAGE_SHIFT);
254 0 : if ((map_1gb->map[idx_1gb].translation & REG_MAP_REGISTERED) &&
255 0 : (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation & REG_MAP_NOTIFY_START) == 0)) {
256 :
257 0 : if (contig_end == UINT64_MAX) {
258 0 : contig_end = vaddr;
259 0 : }
260 0 : contig_start = vaddr;
261 0 : } else {
262 0 : if (contig_end != UINT64_MAX) {
263 0 : if (map_1gb->map[idx_1gb].translation & REG_MAP_NOTIFY_START) {
264 0 : contig_start = vaddr;
265 0 : }
266 : /* End of a virtually contiguous range */
267 0 : map->ops.notify_cb(map->cb_ctx, map,
268 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
269 0 : (void *)contig_start,
270 0 : contig_end - contig_start + MAP_PAGE_SIZE);
271 0 : }
272 0 : contig_end = UINT64_MAX;
273 : }
274 0 : }
275 0 : idx_1gb = MAP_NUM_PAGES_PER_GB - 1;
276 0 : }
277 :
278 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
279 0 : return rc;
280 0 : }
281 :
282 : struct spdk_mem_map *
283 0 : spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
284 : {
285 : struct spdk_mem_map *map;
286 : int rc;
287 : size_t i;
288 :
289 0 : map = calloc(1, sizeof(*map));
290 0 : if (map == NULL) {
291 0 : return NULL;
292 : }
293 :
294 0 : if (pthread_mutex_init(&map->mutex, NULL)) {
295 0 : free(map);
296 0 : return NULL;
297 : }
298 :
299 0 : map->default_translation = default_translation;
300 0 : map->cb_ctx = cb_ctx;
301 0 : if (ops) {
302 0 : map->ops = *ops;
303 0 : }
304 :
305 0 : if (ops && ops->notify_cb) {
306 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
307 0 : rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
308 0 : if (rc != 0) {
309 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
310 0 : DEBUG_PRINT("Initial mem_map notify failed\n");
311 0 : pthread_mutex_destroy(&map->mutex);
312 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
313 0 : free(map->map_256tb.map[i]);
314 0 : }
315 0 : free(map);
316 0 : return NULL;
317 : }
318 0 : TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
319 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
320 0 : }
321 :
322 0 : return map;
323 0 : }
324 :
325 : void
326 0 : spdk_mem_map_free(struct spdk_mem_map **pmap)
327 : {
328 : struct spdk_mem_map *map;
329 : size_t i;
330 :
331 0 : if (!pmap) {
332 0 : return;
333 : }
334 :
335 0 : map = *pmap;
336 :
337 0 : if (!map) {
338 0 : return;
339 : }
340 :
341 0 : if (map->ops.notify_cb) {
342 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
343 0 : mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
344 0 : TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
345 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
346 0 : }
347 :
348 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
349 0 : free(map->map_256tb.map[i]);
350 0 : }
351 :
352 0 : pthread_mutex_destroy(&map->mutex);
353 :
354 0 : free(map);
355 0 : *pmap = NULL;
356 0 : }
357 :
358 : uint64_t
359 0 : spdk_mem_map_get_page_size(void)
360 : {
361 0 : return g_map_page_cfg.size;
362 : }
363 :
364 : int
365 0 : spdk_mem_register(void *_vaddr, size_t len)
366 : {
367 : struct spdk_mem_map *map;
368 : int rc;
369 0 : uint64_t vaddr = (uintptr_t)_vaddr;
370 : uint64_t seg_vaddr;
371 : size_t seg_len;
372 : uint64_t reg;
373 :
374 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
375 0 : DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
376 0 : return -EINVAL;
377 : }
378 :
379 0 : if (((uintptr_t)vaddr & MAP_PAGE_MASK) || (len & MAP_PAGE_MASK)) {
380 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
381 : __func__, vaddr, len);
382 0 : return -EINVAL;
383 : }
384 :
385 0 : if (len == 0) {
386 0 : return 0;
387 : }
388 :
389 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
390 :
391 0 : seg_vaddr = vaddr;
392 0 : seg_len = len;
393 0 : while (seg_len > 0) {
394 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
395 0 : if (reg & REG_MAP_REGISTERED) {
396 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
397 0 : return -EBUSY;
398 : }
399 0 : seg_vaddr += MAP_PAGE_SIZE;
400 0 : seg_len -= MAP_PAGE_SIZE;
401 : }
402 :
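 :     /* Mark every page in the range as registered. The first page also gets
 :      * REG_MAP_NOTIFY_START, so the range can later only be unregistered on
 :      * whole-region boundaries.
 :      */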
403 0 : seg_vaddr = vaddr;
404 0 : seg_len = 0;
405 0 : while (len > 0) {
406 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, MAP_PAGE_SIZE,
407 0 : seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
408 0 : seg_len += MAP_PAGE_SIZE;
409 0 : vaddr += MAP_PAGE_SIZE;
410 0 : len -= MAP_PAGE_SIZE;
411 : }
412 :
413 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
414 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
415 0 : (void *)seg_vaddr, seg_len);
416 0 : if (rc != 0) {
417 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
418 0 : return rc;
419 : }
420 0 : }
421 :
422 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
423 0 : return 0;
424 0 : }
425 :
426 : int
427 0 : spdk_mem_unregister(void *_vaddr, size_t len)
428 : {
429 : struct spdk_mem_map *map;
430 : int rc;
431 0 : uint64_t vaddr = (uintptr_t)_vaddr;
432 : uint64_t seg_vaddr;
433 : size_t seg_len;
434 : uint64_t reg, newreg;
435 :
436 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
437 0 : DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
438 0 : return -EINVAL;
439 : }
440 :
441 0 : if (((uintptr_t)vaddr & MAP_PAGE_MASK) || (len & MAP_PAGE_MASK)) {
442 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
443 : __func__, vaddr, len);
444 0 : return -EINVAL;
445 : }
446 :
447 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
448 :
449 : /* The first page must be a start of a region. Also check if it's
450 : * registered to make sure we don't return -ERANGE for non-registered
451 : * regions.
452 : */
453 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
454 0 : if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
455 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
456 0 : return -ERANGE;
457 : }
458 :
459 0 : seg_vaddr = vaddr;
460 0 : seg_len = len;
461 0 : while (seg_len > 0) {
462 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
463 0 : if ((reg & REG_MAP_REGISTERED) == 0) {
464 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
465 0 : return -EINVAL;
466 : }
467 0 : seg_vaddr += MAP_PAGE_SIZE;
468 0 : seg_len -= MAP_PAGE_SIZE;
469 : }
470 :
471 0 : newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
472 : /* If the next page is registered, it must be a start of a region as well,
473 : * otherwise we'd be unregistering only a part of a region.
474 : */
475 0 : if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
476 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
477 0 : return -ERANGE;
478 : }
479 0 : seg_vaddr = vaddr;
480 0 : seg_len = 0;
481 :
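 :     /* Clear the translations page by page. Each time a REG_MAP_NOTIFY_START
 :      * boundary is crossed, notify every map (in reverse order) about the
 :      * region that just ended.
 :      */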
482 0 : while (len > 0) {
483 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
484 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, MAP_PAGE_SIZE, 0);
485 :
486 0 : if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
487 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
488 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
489 0 : (void *)seg_vaddr, seg_len);
490 0 : if (rc != 0) {
491 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
492 0 : return rc;
493 : }
494 0 : }
495 :
496 0 : seg_vaddr = vaddr;
497 0 : seg_len = MAP_PAGE_SIZE;
498 0 : } else {
499 0 : seg_len += MAP_PAGE_SIZE;
500 : }
501 :
502 0 : vaddr += MAP_PAGE_SIZE;
503 0 : len -= MAP_PAGE_SIZE;
504 : }
505 :
506 0 : if (seg_len > 0) {
507 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
508 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
509 0 : (void *)seg_vaddr, seg_len);
510 0 : if (rc != 0) {
511 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
512 0 : return rc;
513 : }
514 0 : }
515 0 : }
516 :
517 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
518 0 : return 0;
519 0 : }
520 :
521 : int
522 0 : spdk_mem_reserve(void *vaddr, size_t len)
523 : {
524 : struct spdk_mem_map *map;
525 : void *seg_vaddr;
526 : size_t seg_len;
527 : uint64_t reg;
528 :
529 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
530 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
531 0 : return -EINVAL;
532 : }
533 :
534 0 : if (((uintptr_t)vaddr & MAP_PAGE_MASK) || (len & MAP_PAGE_MASK)) {
535 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
536 : __func__, vaddr, len);
537 0 : return -EINVAL;
538 : }
539 :
540 0 : if (len == 0) {
541 0 : return 0;
542 : }
543 :
544 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
545 :
546 : /* Check if any part of this range is already registered */
547 0 : seg_vaddr = vaddr;
548 0 : seg_len = len;
549 0 : while (seg_len > 0) {
550 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
551 0 : if (reg & REG_MAP_REGISTERED) {
552 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
553 0 : return -EBUSY;
554 : }
555 0 : seg_vaddr += MAP_PAGE_SIZE;
556 0 : seg_len -= MAP_PAGE_SIZE;
557 : }
558 :
559 : /* Simply set the translation to the memory map's default. This allocates the space in the
560 : * map but does not provide a valid translation. */
561 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
562 0 : g_mem_reg_map->default_translation);
563 :
564 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
565 0 : spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
566 0 : }
567 :
568 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
569 0 : return 0;
570 0 : }
571 :
572 : static struct map_1gb *
573 0 : mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_page)
574 : {
575 : struct map_1gb *map_1gb;
576 0 : uint64_t idx_256tb = MAP_256TB_IDX(vfn_page);
577 : size_t i;
578 :
579 0 : if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
580 0 : return NULL;
581 : }
582 :
583 0 : map_1gb = map->map_256tb.map[idx_256tb];
584 :
585 0 : if (!map_1gb) {
586 0 : pthread_mutex_lock(&map->mutex);
587 :
588 : /* Recheck to make sure nobody else got the mutex first. */
589 0 : map_1gb = map->map_256tb.map[idx_256tb];
590 0 : if (!map_1gb) {
591 0 : map_1gb = malloc(MAP_SIZE_OF_MAP_1GB);
592 0 : if (map_1gb) {
593 : /* initialize all entries to default translation */
594 0 : for (i = 0; i < MAP_NUM_PAGES_PER_GB; i++) {
595 0 : map_1gb->map[i].translation = map->default_translation;
596 0 : }
597 0 : map->map_256tb.map[idx_256tb] = map_1gb;
598 0 : }
599 0 : }
600 :
601 0 : pthread_mutex_unlock(&map->mutex);
602 :
603 0 : if (!map_1gb) {
604 0 : DEBUG_PRINT("allocation failed\n");
605 0 : return NULL;
606 : }
607 0 : }
608 :
609 0 : return map_1gb;
610 0 : }
611 :
612 : int
613 0 : spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
614 : uint64_t translation)
615 : {
616 : uint64_t vfn_page;
617 : struct map_1gb *map_1gb;
618 : uint64_t idx_1gb;
619 : struct map_page *map_page;
620 :
621 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
622 0 : DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
623 0 : return -EINVAL;
624 : }
625 :
626 : /* Only page-aligned registrations are supported */
627 0 : if (((uintptr_t)vaddr & MAP_PAGE_MASK) || (size & MAP_PAGE_MASK)) {
628 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
629 : __func__, vaddr, size);
630 0 : return -EINVAL;
631 : }
632 :
633 0 : vfn_page = vaddr >> MAP_PAGE_SHIFT;
634 :
635 0 : while (size) {
636 0 : map_1gb = mem_map_get_map_1gb(map, vfn_page);
637 0 : if (!map_1gb) {
638 0 : DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
639 0 : return -ENOMEM;
640 : }
641 :
642 0 : idx_1gb = MAP_1GB_IDX(vfn_page);
643 0 : map_page = &map_1gb->map[idx_1gb];
644 0 : map_page->translation = translation;
645 :
646 0 : size -= MAP_PAGE_SIZE;
647 0 : vfn_page++;
648 : }
649 :
650 0 : return 0;
651 0 : }
652 :
653 : int
654 0 : spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
655 : {
656 0 : return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
657 : }
658 :
659 : inline uint64_t
660 0 : spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
661 : {
662 : const struct map_1gb *map_1gb;
663 : const struct map_page *map_page;
664 : uint64_t idx_256tb;
665 : uint64_t idx_1gb;
666 : uint64_t vfn_page;
667 : uint64_t cur_size;
668 : uint64_t prev_translation;
669 : uint64_t orig_translation;
670 :
671 0 : if (spdk_unlikely(vaddr & ~MASK_256TB)) {
672 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
673 0 : return map->default_translation;
674 : }
675 :
676 0 : vfn_page = vaddr >> MAP_PAGE_SHIFT;
677 0 : idx_256tb = MAP_256TB_IDX(vfn_page);
678 0 : idx_1gb = MAP_1GB_IDX(vfn_page);
679 :
680 0 : map_1gb = map->map_256tb.map[idx_256tb];
681 0 : if (spdk_unlikely(!map_1gb)) {
682 0 : return map->default_translation;
683 : }
684 :
685 0 : cur_size = MAP_PAGE_SIZE - MAP_PAGE_OFFSET(vaddr);
686 0 : map_page = &map_1gb->map[idx_1gb];
687 0 : if (size == NULL || map->ops.are_contiguous == NULL ||
688 0 : map_page->translation == map->default_translation) {
689 0 : if (size != NULL) {
690 0 : *size = spdk_min(*size, cur_size);
691 0 : }
692 0 : return map_page->translation;
693 : }
694 :
695 0 : orig_translation = map_page->translation;
696 0 : prev_translation = orig_translation;
697 0 : while (cur_size < *size) {
698 0 : vfn_page++;
699 0 : idx_256tb = MAP_256TB_IDX(vfn_page);
700 0 : idx_1gb = MAP_1GB_IDX(vfn_page);
701 :
702 0 : map_1gb = map->map_256tb.map[idx_256tb];
703 0 : if (spdk_unlikely(!map_1gb)) {
704 0 : break;
705 : }
706 :
707 0 : map_page = &map_1gb->map[idx_1gb];
708 0 : if (!map->ops.are_contiguous(prev_translation, map_page->translation)) {
709 0 : break;
710 : }
711 :
712 0 : cur_size += MAP_PAGE_SIZE;
713 0 : prev_translation = map_page->translation;
714 : }
715 :
716 0 : *size = spdk_min(*size, cur_size);
717 0 : return orig_translation;
718 0 : }
719 :
720 : static void
721 0 : memory_hotplug_cb(enum rte_mem_event event_type,
722 : const void *addr, size_t len, void *arg)
723 : {
724 0 : if (event_type == RTE_MEM_EVENT_ALLOC) {
725 0 : spdk_mem_register((void *)addr, len);
726 :
727 0 : if (!spdk_env_dpdk_external_init()) {
728 0 : return;
729 : }
730 :
731 : /* When the user initializes DPDK separately, we can't
732 : * be sure that the --match-allocations RTE flag was specified.
733 : * Without this flag, DPDK can free memory in different units
734 : * than it was allocated in, which doesn't work with things like RDMA MRs.
735 : *
736 : * For such cases, we mark the segments so they aren't freed.
737 : */
738 0 : while (len > 0) {
739 : struct rte_memseg *seg;
740 :
741 0 : seg = rte_mem_virt2memseg(addr, NULL);
742 0 : assert(seg != NULL);
743 0 : seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
744 0 : addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
745 0 : len -= seg->hugepage_sz;
746 : }
747 0 : } else if (event_type == RTE_MEM_EVENT_FREE) {
748 0 : spdk_mem_unregister((void *)addr, len);
749 0 : }
750 0 : }
751 :
752 : static int
753 0 : memory_iter_cb(const struct rte_memseg_list *msl,
754 : const struct rte_memseg *ms, size_t len, void *arg)
755 : {
756 0 : return spdk_mem_register(ms->addr, len);
757 : }
758 :
759 : static bool g_mem_event_cb_registered = false;
760 :
761 : static int
762 0 : mem_map_mem_event_callback_register(void)
763 : {
764 : int rc;
765 :
766 0 : rc = rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
767 0 : if (rc != 0) {
768 0 : DEBUG_PRINT("memory event callback registration failed, rc = %d\n", rc);
769 0 : return -errno;
770 : }
771 :
772 0 : g_mem_event_cb_registered = true;
773 0 : return 0;
774 0 : }
775 :
776 : static void
777 0 : mem_map_mem_event_callback_unregister(void)
778 : {
779 0 : if (g_mem_event_cb_registered) {
780 0 : g_mem_event_cb_registered = false;
781 0 : rte_mem_event_callback_unregister("spdk", NULL);
782 0 : }
783 0 : }
784 :
785 : int
786 0 : mem_map_init(bool legacy_mem)
787 : {
788 : int rc;
789 :
790 0 : g_legacy_mem = legacy_mem;
791 :
792 0 : g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
793 0 : if (g_mem_reg_map == NULL) {
794 0 : DEBUG_PRINT("memory registration map allocation failed\n");
795 0 : return -ENOMEM;
796 : }
797 :
798 0 : if (!g_huge_pages) {
799 0 : return 0;
800 : }
801 :
802 0 : if (!g_legacy_mem) {
803 : /**
804 : * To prevent DPDK from complaining, only register the callback when
805 : * we are not in legacy mem mode.
806 : */
807 0 : rc = mem_map_mem_event_callback_register();
808 0 : if (rc != 0) {
809 0 : DEBUG_PRINT("memory event callback registration failed, rc = %d\n", rc);
810 0 : goto err_free_reg_map;
811 : }
812 0 : }
813 :
814 : /*
815 : * Walk all DPDK memory segments and register them
816 : * with the main memory map
817 : */
818 0 : rc = rte_memseg_contig_walk(memory_iter_cb, NULL);
819 0 : if (rc != 0) {
820 0 : DEBUG_PRINT("memory segments walking failed, rc = %d\n", rc);
821 0 : goto err_unregister_mem_cb;
822 : }
823 :
824 0 : return 0;
825 :
826 : err_unregister_mem_cb:
827 0 : mem_map_mem_event_callback_unregister();
828 : err_free_reg_map:
829 0 : spdk_mem_map_free(&g_mem_reg_map);
830 0 : return rc;
831 0 : }
832 :
833 : void
834 0 : mem_map_fini(void)
835 : {
836 0 : mem_map_mem_event_callback_unregister();
837 0 : spdk_mem_map_free(&g_mem_reg_map);
838 0 : }
839 :
840 : bool
841 0 : spdk_iommu_is_enabled(void)
842 : {
843 : #if VFIO_ENABLED
844 : return g_vfio.enabled && !g_vfio.noiommu_enabled;
845 : #else
846 0 : return false;
847 : #endif
848 : }
849 :
850 : struct spdk_vtophys_pci_device {
851 : struct rte_pci_device *pci_device;
852 : TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
853 : };
854 :
855 : static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
856 : static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
857 : TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
858 :
859 : static struct spdk_mem_map *g_vtophys_map;
860 : static struct spdk_mem_map *g_phys_ref_map;
861 : static struct spdk_mem_map *g_numa_map;
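 : /* g_vtophys_map holds virtual-to-physical translations for registered memory,
 : * g_phys_ref_map holds per-IOVA reference counts for VFIO DMA mappings, and
 : * g_numa_map holds the NUMA node id of each registered region.
 : */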
862 :
863 : #if VFIO_ENABLED
864 : static int
865 : _vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
866 : {
867 : struct spdk_vfio_dma_map *dma_map;
868 : int ret;
869 :
870 : dma_map = calloc(1, sizeof(*dma_map));
871 : if (dma_map == NULL) {
872 : return -ENOMEM;
873 : }
874 :
875 : dma_map->map.argsz = sizeof(dma_map->map);
876 : dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
877 : dma_map->map.vaddr = vaddr;
878 : dma_map->map.iova = iova;
879 : dma_map->map.size = size;
880 :
881 : if (g_vfio.device_ref == 0) {
882 : /* VFIO requires at least one device (IOMMU group) to be added to
883 : * a VFIO container before it is possible to perform any IOMMU
884 : * operations on that container. This memory will be mapped once
885 : * the first device (IOMMU group) is hotplugged.
886 : *
887 : * Since the vfio container is managed internally by DPDK, it is
888 : * also possible that some device is already in that container, but
889 : * it's not managed by SPDK - e.g. a NIC attached internally
890 : * inside DPDK. We could map the memory straight away in such
891 : * scenario, but there's no need to do it. DPDK devices clearly
892 : * don't need our mappings and hence we defer the mapping
893 : * unconditionally until the first SPDK-managed device is
894 : * hotplugged.
895 : */
896 : goto out_insert;
897 : }
898 :
899 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
900 : if (ret) {
901 : /* In some cases the vfio container has no IOMMU group, in which case this failure is safe to ignore */
902 : SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
903 : }
904 :
905 : out_insert:
906 : TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
907 : return 0;
908 : }
909 :
910 :
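 : /* Map a region into the IOMMU. A per-IOVA reference count is kept in
 : * g_phys_ref_map, so a range that is mapped more than once is only submitted
 : * to VFIO the first time and only unmapped when the last reference is dropped.
 : */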
911 : static int
912 : vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
913 : {
914 : uint64_t refcount;
915 : int ret;
916 :
917 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
918 : assert(refcount < UINT64_MAX);
919 : if (refcount > 0) {
920 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
921 : return 0;
922 : }
923 :
924 : pthread_mutex_lock(&g_vfio.mutex);
925 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
926 : pthread_mutex_unlock(&g_vfio.mutex);
927 : if (ret) {
928 : return ret;
929 : }
930 :
931 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
932 : return 0;
933 : }
934 :
935 : int
936 : vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
937 : {
938 : int ret;
939 :
940 : pthread_mutex_lock(&g_vfio.mutex);
941 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
942 : pthread_mutex_unlock(&g_vfio.mutex);
943 :
944 : return ret;
945 : }
946 :
947 : static int
948 : _vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
949 : {
950 : struct vfio_iommu_type1_dma_unmap unmap = {};
951 : int ret;
952 :
953 : if (g_vfio.device_ref == 0) {
954 : /* Memory is not mapped anymore, just remove its references */
955 : goto out_remove;
956 : }
957 :
958 : unmap.argsz = sizeof(unmap);
959 : unmap.flags = 0;
960 : unmap.iova = dma_map->map.iova;
961 : unmap.size = dma_map->map.size;
962 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
963 : if (ret) {
964 : SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
965 : }
966 :
967 : out_remove:
968 : TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
969 : free(dma_map);
970 : return 0;
971 : }
972 :
973 : static int
974 : vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
975 : {
976 : struct spdk_vfio_dma_map *dma_map;
977 : uint64_t refcount;
978 : int ret;
979 :
980 : pthread_mutex_lock(&g_vfio.mutex);
981 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
982 : if (dma_map->map.iova == iova) {
983 : break;
984 : }
985 : }
986 :
987 : if (dma_map == NULL) {
988 : DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
989 : pthread_mutex_unlock(&g_vfio.mutex);
990 : return -ENXIO;
991 : }
992 :
993 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
994 : assert(refcount < UINT64_MAX);
995 : if (refcount > 0) {
996 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
997 : }
998 :
999 : /* We still have outstanding references, don't clear the mapping. */
1000 : if (refcount > 1) {
1001 : pthread_mutex_unlock(&g_vfio.mutex);
1002 : return 0;
1003 : }
1004 :
1005 : /** don't support partial or multiple-page unmap for now */
1006 : assert(dma_map->map.size == size);
1007 :
1008 : ret = _vfio_iommu_unmap_dma(dma_map);
1009 : pthread_mutex_unlock(&g_vfio.mutex);
1010 :
1011 : return ret;
1012 : }
1013 :
1014 : int
1015 : vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
1016 : {
1017 : struct spdk_vfio_dma_map *dma_map;
1018 : int ret;
1019 :
1020 : pthread_mutex_lock(&g_vfio.mutex);
1021 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1022 : if (dma_map->map.vaddr == vaddr) {
1023 : break;
1024 : }
1025 : }
1026 :
1027 : if (dma_map == NULL) {
1028 : DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
1029 : pthread_mutex_unlock(&g_vfio.mutex);
1030 : return -ENXIO;
1031 : }
1032 :
1033 : ret = _vfio_iommu_unmap_dma(dma_map);
1034 : pthread_mutex_unlock(&g_vfio.mutex);
1035 : return ret;
1036 : }
1037 : #endif
1038 :
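 : /* Look up the IOVA/physical address of a DPDK-managed virtual address via
 : * the memseg that contains it.
 : */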
1039 : static uint64_t
1040 0 : vtophys_get_paddr_memseg(uint64_t vaddr)
1041 : {
1042 : uintptr_t paddr;
1043 : struct rte_memseg *seg;
1044 :
1045 0 : seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
1046 0 : if (seg != NULL) {
1047 0 : paddr = seg->iova;
1048 0 : if (paddr == RTE_BAD_IOVA) {
1049 0 : return SPDK_VTOPHYS_ERROR;
1050 : }
1051 0 : paddr += (vaddr - (uintptr_t)seg->addr);
1052 0 : return paddr;
1053 : }
1054 :
1055 0 : return SPDK_VTOPHYS_ERROR;
1056 0 : }
1057 :
1058 : /* Try to get the paddr from /proc/self/pagemap */
1059 : static uint64_t
1060 0 : vtophys_get_paddr_pagemap(uint64_t vaddr)
1061 : {
1062 : uintptr_t paddr;
1063 :
1064 : /* Silence static analyzers */
1065 0 : assert(vaddr != 0);
1066 0 : paddr = rte_mem_virt2iova((void *)vaddr);
1067 0 : if (paddr == RTE_BAD_IOVA) {
1068 : /*
1069 : * The vaddr may be valid but doesn't have a backing page
1070 : * assigned yet. Touch the page to ensure a backing page
1071 : * gets assigned, then try to translate again.
1072 : */
1073 0 : rte_atomic64_read((rte_atomic64_t *)vaddr);
1074 0 : paddr = rte_mem_virt2iova((void *)vaddr);
1075 0 : }
1076 0 : if (paddr == RTE_BAD_IOVA) {
1077 : /* Unable to get to the physical address. */
1078 0 : return SPDK_VTOPHYS_ERROR;
1079 : }
1080 :
1081 0 : return paddr;
1082 0 : }
1083 :
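 : /* Translate a virtual address that falls within one of the device's BAR
 : * mappings into a physical (bus) address.
 : */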
1084 : static uint64_t
1085 0 : pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
1086 : {
1087 : struct rte_mem_resource *res;
1088 : uint64_t paddr;
1089 : unsigned r;
1090 :
1091 0 : for (r = 0; r < PCI_MAX_RESOURCE; r++) {
1092 0 : res = dpdk_pci_device_get_mem_resource(dev, r);
1093 :
1094 0 : if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
1095 0 : (vaddr + len) >= (uint64_t)res->addr + res->len) {
1096 0 : continue;
1097 : }
1098 :
1099 : #if VFIO_ENABLED
1100 : if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1101 : /*
1102 : * The IOMMU is on and we're using IOVA == VA. The BAR was
1103 : * automatically registered when it was mapped, so just return
1104 : * the virtual address here.
1105 : */
1106 : return vaddr;
1107 : }
1108 : #endif
1109 0 : paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1110 0 : return paddr;
1111 : }
1112 :
1113 0 : return SPDK_VTOPHYS_ERROR;
1114 0 : }
1115 :
1116 : /* Try to get the paddr from pci devices */
1117 : static uint64_t
1118 0 : vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1119 : {
1120 : struct spdk_vtophys_pci_device *vtophys_dev;
1121 : uintptr_t paddr;
1122 : struct rte_pci_device *dev;
1123 :
1124 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1125 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1126 0 : dev = vtophys_dev->pci_device;
1127 0 : paddr = pci_device_vtophys(dev, vaddr, len);
1128 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1129 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1130 0 : return paddr;
1131 : }
1132 0 : }
1133 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1134 :
1135 0 : return SPDK_VTOPHYS_ERROR;
1136 0 : }
1137 :
1138 : static int
1139 0 : vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1140 : enum spdk_mem_map_notify_action action,
1141 : void *vaddr, size_t len)
1142 : {
1143 0 : int rc = 0;
1144 : uint64_t paddr;
1145 :
1146 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
1147 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1148 0 : return -EINVAL;
1149 : }
1150 :
1151 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1152 0 : DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
1153 : vaddr, len);
1154 0 : return -EINVAL;
1155 : }
1156 :
1157 : /* Get the physical address from the DPDK memsegs */
1158 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1159 :
1160 0 : switch (action) {
1161 : case SPDK_MEM_MAP_NOTIFY_REGISTER:
1162 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1163 : /* This is not an address that DPDK is managing. */
1164 :
1165 : /* Check if this is a PCI BAR. They need special handling */
1166 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1167 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1168 : /* Get paddr for each 2MB chunk in this address range */
1169 0 : while (len > 0) {
1170 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1171 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1172 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1173 0 : return -EFAULT;
1174 : }
1175 :
1176 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1177 0 : if (rc != 0) {
1178 0 : return rc;
1179 : }
1180 :
1181 0 : vaddr += VALUE_2MB;
1182 0 : len -= VALUE_2MB;
1183 : }
1184 :
1185 0 : return 0;
1186 : }
1187 :
1188 : #if VFIO_ENABLED
1189 : enum rte_iova_mode iova_mode;
1190 :
1191 : iova_mode = rte_eal_iova_mode();
1192 :
1193 : if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1194 : /* We'll use the virtual address as the iova to match DPDK. */
1195 : paddr = (uint64_t)vaddr;
1196 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1197 : if (rc) {
1198 : return -EFAULT;
1199 : }
1200 : while (len > 0) {
1201 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1202 : if (rc != 0) {
1203 : return rc;
1204 : }
1205 : vaddr += VALUE_2MB;
1206 : paddr += VALUE_2MB;
1207 : len -= VALUE_2MB;
1208 : }
1209 : } else
1210 : #endif
1211 : {
1212 : /* Get the physical address from /proc/self/pagemap. */
1213 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1214 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1215 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1216 0 : return -EFAULT;
1217 : }
1218 :
1219 : /* Get paddr for each 2MB chunk in this address range */
1220 0 : while (len > 0) {
1221 : /* Get the physical address from /proc/self/pagemap. */
1222 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1223 :
1224 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1225 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1226 0 : return -EFAULT;
1227 : }
1228 :
1229 0 : if (paddr & MASK_2MB) {
1230 0 : DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1231 0 : return -EINVAL;
1232 : }
1233 : #if VFIO_ENABLED
1234 : /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
1235 : * with the IOMMU using the physical address to match. */
1236 : if (spdk_iommu_is_enabled()) {
1237 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1238 : if (rc) {
1239 : DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1240 : return -EFAULT;
1241 : }
1242 : }
1243 : #endif
1244 :
1245 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1246 0 : if (rc != 0) {
1247 0 : return rc;
1248 : }
1249 :
1250 0 : vaddr += VALUE_2MB;
1251 0 : len -= VALUE_2MB;
1252 : }
1253 : }
1254 0 : } else {
1255 : /* This is an address managed by DPDK. Just setup the translations. */
1256 0 : while (len > 0) {
1257 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1258 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1259 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1260 0 : return -EFAULT;
1261 : }
1262 :
1263 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1264 0 : if (rc != 0) {
1265 0 : return rc;
1266 : }
1267 :
1268 0 : vaddr += VALUE_2MB;
1269 0 : len -= VALUE_2MB;
1270 : }
1271 : }
1272 :
1273 0 : break;
1274 : case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1275 : #if VFIO_ENABLED
1276 : if (paddr == SPDK_VTOPHYS_ERROR) {
1277 : /*
1278 : * This is not an address that DPDK is managing.
1279 : */
1280 :
1281 : /* Check if this is a PCI BAR. They need special handling */
1282 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1283 : if (paddr != SPDK_VTOPHYS_ERROR) {
1284 : /* Get paddr for each 2MB chunk in this address range */
1285 : while (len > 0) {
1286 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1287 : if (paddr == SPDK_VTOPHYS_ERROR) {
1288 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1289 : return -EFAULT;
1290 : }
1291 :
1292 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1293 : if (rc != 0) {
1294 : return rc;
1295 : }
1296 :
1297 : vaddr += VALUE_2MB;
1298 : len -= VALUE_2MB;
1299 : }
1300 :
1301 : return 0;
1302 : }
1303 :
1304 : /* If vfio is enabled,
1305 : * we need to unmap the range from the IOMMU
1306 : */
1307 : if (spdk_iommu_is_enabled()) {
1308 : uint64_t buffer_len = len;
1309 : uint8_t *va = vaddr;
1310 : enum rte_iova_mode iova_mode;
1311 :
1312 : iova_mode = rte_eal_iova_mode();
1313 : /*
1314 : * In virtual address mode, the region is contiguous and can be unmapped
1315 : * in a single call.
1316 : */
1317 : if (iova_mode == RTE_IOVA_VA) {
1318 : paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1319 : if (buffer_len != len || paddr != (uintptr_t)va) {
1320 : DEBUG_PRINT("Unmapping %p with length %lu failed because "
1321 : "translation had address 0x%" PRIx64 " and length %lu\n",
1322 : va, len, paddr, buffer_len);
1323 : return -EINVAL;
1324 : }
1325 : rc = vtophys_iommu_unmap_dma(paddr, len);
1326 : if (rc) {
1327 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1328 : return -EFAULT;
1329 : }
1330 : } else if (iova_mode == RTE_IOVA_PA) {
1331 : /* Get paddr for each 2MB chunk in this address range */
1332 : while (buffer_len > 0) {
1333 : paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1334 :
1335 : if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1336 : DEBUG_PRINT("could not get phys addr for %p\n", va);
1337 : return -EFAULT;
1338 : }
1339 :
1340 : rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1341 : if (rc) {
1342 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1343 : return -EFAULT;
1344 : }
1345 :
1346 : va += VALUE_2MB;
1347 : buffer_len -= VALUE_2MB;
1348 : }
1349 : }
1350 : }
1351 : }
1352 : #endif
1353 0 : while (len > 0) {
1354 0 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1355 0 : if (rc != 0) {
1356 0 : return rc;
1357 : }
1358 :
1359 0 : vaddr += VALUE_2MB;
1360 0 : len -= VALUE_2MB;
1361 : }
1362 :
1363 0 : break;
1364 : default:
1365 0 : SPDK_UNREACHABLE();
1366 : }
1367 :
1368 0 : return rc;
1369 0 : }
1370 :
1371 : static int
1372 0 : numa_notify(void *cb_ctx, struct spdk_mem_map *map,
1373 : enum spdk_mem_map_notify_action action,
1374 : void *vaddr, size_t len)
1375 : {
1376 : struct rte_memseg *seg;
1377 :
1378 : /* We always return 0 from here, even if we aren't able to get a
1379 : * memseg for the address. This can happen in non-DPDK memory
1380 : * registration paths, for example vhost or vfio-user. That is OK,
1381 : * spdk_mem_get_numa_id() just returns SPDK_ENV_NUMA_ID_ANY for
1382 : * that kind of memory. If we return an error here, the
1383 : * spdk_mem_register() from vhost or vfio-user would fail which is
1384 : * not what we want.
1385 : */
1386 0 : seg = rte_mem_virt2memseg(vaddr, NULL);
1387 0 : if (seg == NULL) {
1388 0 : return 0;
1389 : }
1390 :
1391 0 : switch (action) {
1392 : case SPDK_MEM_MAP_NOTIFY_REGISTER:
1393 0 : spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, seg->socket_id);
1394 0 : break;
1395 : case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1396 0 : spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
1397 0 : break;
1398 : default:
1399 0 : break;
1400 : }
1401 :
1402 0 : return 0;
1403 0 : }
1404 :
1405 : static int
1406 0 : vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1407 : {
1408 : /* This function is always called with paddrs for two subsequent
1409 : * 2MB chunks in virtual address space, so those chunks will be only
1410 : * physically contiguous if the physical addresses are 2MB apart
1411 : * from each other as well.
1412 : */
1413 0 : return (paddr2 - paddr1 == VALUE_2MB);
1414 : }
1415 :
1416 : #if VFIO_ENABLED
1417 :
1418 : static bool
1419 : vfio_enabled(void)
1420 : {
1421 : return rte_vfio_is_enabled("vfio_pci");
1422 : }
1423 :
1424 : /* Check if IOMMU is enabled on the system */
1425 : static bool
1426 : has_iommu_groups(void)
1427 : {
1428 : int count = 0;
1429 : DIR *dir = opendir("/sys/kernel/iommu_groups");
1430 :
1431 : if (dir == NULL) {
1432 : return false;
1433 : }
1434 :
1435 : while (count < 3 && readdir(dir) != NULL) {
1436 : count++;
1437 : }
1438 :
1439 : closedir(dir);
1440 : /* there will always be ./ and ../ entries */
1441 : return count > 2;
1442 : }
1443 :
1444 : static bool
1445 : vfio_noiommu_enabled(void)
1446 : {
1447 : return rte_vfio_noiommu_is_enabled();
1448 : }
1449 :
1450 : static void
1451 : vtophys_iommu_init(void)
1452 : {
1453 : char proc_fd_path[PATH_MAX + 1];
1454 : char link_path[PATH_MAX + 1];
1455 : const char vfio_path[] = "/dev/vfio/vfio";
1456 : DIR *dir;
1457 : struct dirent *d;
1458 :
1459 : if (!vfio_enabled()) {
1460 : return;
1461 : }
1462 :
1463 : if (vfio_noiommu_enabled()) {
1464 : g_vfio.noiommu_enabled = true;
1465 : } else if (!has_iommu_groups()) {
1466 : return;
1467 : }
1468 :
1469 : dir = opendir("/proc/self/fd");
1470 : if (!dir) {
1471 : DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1472 : return;
1473 : }
1474 :
1475 : while ((d = readdir(dir)) != NULL) {
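 :     /* DPDK has already opened /dev/vfio/vfio; find that file descriptor by
 :      * scanning our own open fds so we can issue ioctls on the same container.
 :      */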
1476 : if (d->d_type != DT_LNK) {
1477 : continue;
1478 : }
1479 :
1480 : snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1481 : if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1482 : continue;
1483 : }
1484 :
1485 : if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1486 : sscanf(d->d_name, "%d", &g_vfio.fd);
1487 : break;
1488 : }
1489 : }
1490 :
1491 : closedir(dir);
1492 :
1493 : if (g_vfio.fd < 0) {
1494 : DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1495 : return;
1496 : }
1497 :
1498 : g_vfio.enabled = true;
1499 :
1500 : return;
1501 : }
1502 :
1503 : #endif
1504 :
1505 : void
1506 0 : vtophys_pci_device_added(struct rte_pci_device *pci_device)
1507 : {
1508 : struct spdk_vtophys_pci_device *vtophys_dev;
1509 :
1510 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1511 :
1512 0 : vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1513 0 : if (vtophys_dev) {
1514 0 : vtophys_dev->pci_device = pci_device;
1515 0 : TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1516 0 : } else {
1517 0 : DEBUG_PRINT("Memory allocation error\n");
1518 : }
1519 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1520 :
1521 : #if VFIO_ENABLED
1522 : struct spdk_vfio_dma_map *dma_map;
1523 : int ret;
1524 :
1525 : if (!g_vfio.enabled) {
1526 : return;
1527 : }
1528 :
1529 : pthread_mutex_lock(&g_vfio.mutex);
1530 : g_vfio.device_ref++;
1531 : if (g_vfio.device_ref > 1) {
1532 : pthread_mutex_unlock(&g_vfio.mutex);
1533 : return;
1534 : }
1535 :
1536 : /* This is the first SPDK device using DPDK vfio. This means that the first
1537 : * IOMMU group might have just been added to the DPDK vfio container.
1538 : * From this point on, it is certain that the memory can be mapped.
1539 : */
1540 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1541 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1542 : if (ret) {
1543 : DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1544 : break;
1545 : }
1546 : }
1547 : pthread_mutex_unlock(&g_vfio.mutex);
1548 : #endif
1549 0 : }
1550 :
1551 : void
1552 0 : vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1553 : {
1554 : struct spdk_vtophys_pci_device *vtophys_dev;
1555 :
1556 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1557 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1558 0 : if (vtophys_dev->pci_device == pci_device) {
1559 0 : TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1560 0 : free(vtophys_dev);
1561 0 : break;
1562 : }
1563 0 : }
1564 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1565 :
1566 : #if VFIO_ENABLED
1567 : struct spdk_vfio_dma_map *dma_map;
1568 : int ret;
1569 :
1570 : if (!g_vfio.enabled) {
1571 : return;
1572 : }
1573 :
1574 : pthread_mutex_lock(&g_vfio.mutex);
1575 : assert(g_vfio.device_ref > 0);
1576 : g_vfio.device_ref--;
1577 : if (g_vfio.device_ref > 0) {
1578 : pthread_mutex_unlock(&g_vfio.mutex);
1579 : return;
1580 : }
1581 :
1582 : /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1583 : * any additional devices using it's vfio container, all the mappings
1584 : * will be automatically removed by the Linux vfio driver. We unmap
1585 : * the memory manually to be able to easily re-map it later regardless
1586 : * of other, external factors.
1587 : */
1588 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1589 : struct vfio_iommu_type1_dma_unmap unmap = {};
1590 : unmap.argsz = sizeof(unmap);
1591 : unmap.flags = 0;
1592 : unmap.iova = dma_map->map.iova;
1593 : unmap.size = dma_map->map.size;
1594 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1595 : if (ret) {
1596 : DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1597 : break;
1598 : }
1599 : }
1600 : pthread_mutex_unlock(&g_vfio.mutex);
1601 : #endif
1602 0 : }
1603 :
1604 : int
1605 0 : vtophys_init(void)
1606 : {
1607 0 : const struct spdk_mem_map_ops vtophys_map_ops = {
1608 : .notify_cb = vtophys_notify,
1609 : .are_contiguous = vtophys_check_contiguous_entries,
1610 : };
1611 :
1612 0 : const struct spdk_mem_map_ops phys_ref_map_ops = {
1613 : .notify_cb = NULL,
1614 : .are_contiguous = NULL,
1615 : };
1616 :
1617 0 : const struct spdk_mem_map_ops numa_map_ops = {
1618 : .notify_cb = numa_notify,
1619 : .are_contiguous = NULL,
1620 : };
1621 :
1622 : #if VFIO_ENABLED
1623 : vtophys_iommu_init();
1624 : #endif
1625 :
1626 0 : g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1627 0 : if (g_phys_ref_map == NULL) {
1628 0 : DEBUG_PRINT("phys_ref map allocation failed.\n");
1629 0 : return -ENOMEM;
1630 : }
1631 :
1632 0 : g_numa_map = spdk_mem_map_alloc(SPDK_ENV_NUMA_ID_ANY, &numa_map_ops, NULL);
1633 0 : if (g_numa_map == NULL) {
1634 0 : DEBUG_PRINT("numa map allocation failed.\n");
1635 0 : spdk_mem_map_free(&g_phys_ref_map);
1636 0 : return -ENOMEM;
1637 : }
1638 :
1639 0 : if (g_huge_pages) {
1640 0 : g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1641 0 : if (g_vtophys_map == NULL) {
1642 0 : DEBUG_PRINT("vtophys map allocation failed\n");
1643 0 : spdk_mem_map_free(&g_numa_map);
1644 0 : spdk_mem_map_free(&g_phys_ref_map);
1645 0 : return -ENOMEM;
1646 : }
1647 0 : }
1648 0 : return 0;
1649 0 : }
1650 :
1651 : void
1652 0 : vtophys_fini(void)
1653 : {
1654 0 : spdk_mem_map_free(&g_vtophys_map);
1655 0 : spdk_mem_map_free(&g_numa_map);
1656 0 : spdk_mem_map_free(&g_phys_ref_map);
1657 0 : }
1658 :
1659 : uint64_t
1660 0 : spdk_vtophys(const void *buf, uint64_t *size)
1661 : {
1662 : uint64_t vaddr, paddr_2mb;
1663 :
1664 0 : if (!g_huge_pages) {
1665 0 : return SPDK_VTOPHYS_ERROR;
1666 : }
1667 :
1668 0 : vaddr = (uint64_t)buf;
1669 0 : paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1670 :
1671 : /*
1672 : * SPDK_VTOPHYS_ERROR has all bits set, so bitwise-or'ing the buf offset into it
1673 : * would still yield SPDK_VTOPHYS_ERROR. However, since we now add the offset
1674 : * rather than or it in (PCI vtophys translations can be unaligned), we must
1675 : * check the return value before the addition.
1676 : */
1677 : SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1678 0 : if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1679 0 : return SPDK_VTOPHYS_ERROR;
1680 : } else {
1681 0 : return paddr_2mb + (vaddr & MASK_2MB);
1682 : }
1683 0 : }
1684 :
1685 : int32_t
1686 0 : spdk_mem_get_numa_id(const void *buf, uint64_t *size)
1687 : {
1688 0 : return spdk_mem_map_translate(g_numa_map, (uint64_t)buf, size);
1689 : }
1690 :
1691 : int
1692 0 : spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1693 : {
1694 : struct rte_memseg *seg;
1695 : int ret, fd;
1696 :
1697 0 : seg = rte_mem_virt2memseg(vaddr, NULL);
1698 0 : if (!seg) {
1699 0 : SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1700 0 : return -ENOENT;
1701 : }
1702 :
1703 0 : fd = rte_memseg_get_fd_thread_unsafe(seg);
1704 0 : if (fd < 0) {
1705 0 : return fd;
1706 : }
1707 :
1708 0 : ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1709 0 : if (ret < 0) {
1710 0 : return ret;
1711 : }
1712 :
1713 0 : return fd;
1714 0 : }
1715 :
1716 : void
1717 0 : mem_disable_huge_pages(void)
1718 : {
1719 0 : g_huge_pages = false;
1720 0 : }
1721 :
1722 : void
1723 0 : mem_map_use_page_shift(uint32_t page_shift)
1724 : {
1725 0 : g_map_page_cfg.shift = page_shift;
1726 0 : g_map_page_cfg.size = 1UL << page_shift;
1727 0 : g_map_page_cfg.mask = g_map_page_cfg.size - 1;
1728 0 : g_map_page_cfg.num_pages_per_gb = 1UL << (SHIFT_1GB - page_shift);
1729 0 : }
|