Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2017 Intel Corporation.
3 : * All rights reserved.
4 : */
5 :
6 : #include "spdk/stdinc.h"
7 :
8 : #include "env_internal.h"
9 : #include "pci_dpdk.h"
10 :
11 : #include <rte_config.h>
12 : #include <rte_memory.h>
13 : #include <rte_eal_memconfig.h>
14 : #include <rte_dev.h>
15 : #include <rte_pci.h>
16 :
17 : #include "spdk_internal/assert.h"
18 :
19 : #include "spdk/assert.h"
20 : #include "spdk/likely.h"
21 : #include "spdk/queue.h"
22 : #include "spdk/util.h"
23 : #include "spdk/memory.h"
24 : #include "spdk/env_dpdk.h"
25 : #include "spdk/log.h"
26 :
27 : #ifdef __linux__
28 : #include <linux/version.h>
29 : #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 : #include <linux/vfio.h>
31 : #include <rte_vfio.h>
32 :
33 : struct spdk_vfio_dma_map {
34 : struct vfio_iommu_type1_dma_map map;
35 : TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
36 : };
37 :
38 : struct vfio_cfg {
39 : int fd;
40 : bool enabled;
41 : bool noiommu_enabled;
42 : unsigned device_ref;
43 : TAILQ_HEAD(, spdk_vfio_dma_map) maps;
44 : pthread_mutex_t mutex;
45 : };
46 :
47 : static struct vfio_cfg g_vfio = {
48 : .fd = -1,
49 : .enabled = false,
50 : .noiommu_enabled = false,
51 : .device_ref = 0,
52 : .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
53 : .mutex = PTHREAD_MUTEX_INITIALIZER
54 : };
55 : #endif
56 : #endif
57 :
58 : #if DEBUG
59 : #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
60 : #else
61 : #define DEBUG_PRINT(...)
62 : #endif
63 :
64 : #define FN_2MB_TO_4KB(fn) ((fn) << (SHIFT_2MB - SHIFT_4KB))
65 : #define FN_4KB_TO_2MB(fn) ((fn) >> (SHIFT_2MB - SHIFT_4KB))
66 :
67 : #define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
68 : #define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
69 :
70 : /* Page is registered */
71 : #define REG_MAP_REGISTERED (1ULL << 62)
72 :
73 : /* A notification region barrier. The 2MB translation entry that's marked
74 : * with this flag must be unregistered separately. This allows contiguous
75 : * regions to be unregistered in the same chunks they were registered.
76 : */
77 : #define REG_MAP_NOTIFY_START (1ULL << 63)
78 :
79 : /* Translation of a single 2MB page. */
80 : struct map_2mb {
81 : uint64_t translation_2mb;
82 : };
83 :
84 : /* Second-level map table indexed by bits [21..29] of the virtual address.
85 : * Each entry contains the address translation, or the map's default translation
86 : * for entries that haven't been set yet.
87 : */
88 : struct map_1gb {
89 : struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
90 : };
91 :
92 : /* Top-level map table indexed by bits [30..47] of the virtual address.
93 : * Each entry points to a second-level map table or NULL.
94 : */
95 : struct map_256tb {
96 : struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
97 : };
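/*
 * Illustrative sketch (not part of the original file): how a virtual address is
 * decomposed by the two-level map above, using the MAP_256TB_IDX/MAP_1GB_IDX
 * macros and the bit ranges noted in the comments ([30..47] and [21..29]):
 *
 *   uint64_t vaddr = 0xC0A00000;                  // 3 GiB + 10 MiB, for example
 *   uint64_t vfn_2mb = vaddr >> SHIFT_2MB;        // 1541: index of the 2MB page
 *   uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);  // 3: which 1GB region
 *   uint64_t idx_1gb = MAP_1GB_IDX(vfn_2mb);      // 5: which 2MB page within it
 *   // translation = map->map_256tb.map[idx_256tb]->map[idx_1gb].translation_2mb
 */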
98 :
99 : /* Page-granularity memory address translation */
100 : struct spdk_mem_map {
101 : struct map_256tb map_256tb;
102 : pthread_mutex_t mutex;
103 : uint64_t default_translation;
104 : struct spdk_mem_map_ops ops;
105 : void *cb_ctx;
106 : TAILQ_ENTRY(spdk_mem_map) tailq;
107 : };
108 :
109 : /* Registrations map. The 64-bit translations are bit fields with the
110 : * following layout (starting with the low bits):
111 : * 0 - 61 : reserved
112 : * 62 - 63 : flags
113 : */
114 : static struct spdk_mem_map *g_mem_reg_map;
115 : static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
116 : TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
117 : static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
118 :
119 : static bool g_legacy_mem;
120 : static bool g_huge_pages = true;
121 :
122 : /*
123 : * Walk the currently registered memory via the main memory registration map
124 : * and call the new map's notify callback for each virtually contiguous region.
125 : */
126 : static int
127 0 : mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
128 : {
129 : size_t idx_256tb;
130 : uint64_t idx_1gb;
131 0 : uint64_t contig_start = UINT64_MAX;
132 0 : uint64_t contig_end = UINT64_MAX;
133 : struct map_1gb *map_1gb;
134 : int rc;
135 :
136 0 : if (!g_mem_reg_map) {
137 0 : return -EINVAL;
138 : }
139 :
140 : /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
141 0 : pthread_mutex_lock(&g_mem_reg_map->mutex);
142 :
143 0 : for (idx_256tb = 0;
144 0 : idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
145 0 : idx_256tb++) {
146 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
147 :
148 0 : if (!map_1gb) {
149 0 : if (contig_start != UINT64_MAX) {
150 : /* End of a virtually contiguous range */
151 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
152 0 : (void *)contig_start,
153 0 : contig_end - contig_start + VALUE_2MB);
154 : /* Don't bother handling unregister failures. It can't be any worse */
155 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
156 0 : goto err_unregister;
157 : }
158 0 : }
159 0 : contig_start = UINT64_MAX;
160 0 : continue;
161 : }
162 :
163 0 : for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
164 0 : if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
165 0 : (contig_start == UINT64_MAX ||
166 0 : (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
167 : /* Rebuild the virtual address from the indexes */
168 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
169 :
170 0 : if (contig_start == UINT64_MAX) {
171 0 : contig_start = vaddr;
172 0 : }
173 :
174 0 : contig_end = vaddr;
175 0 : } else {
176 0 : if (contig_start != UINT64_MAX) {
177 : /* End of a virtually contiguous range */
178 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
179 0 : (void *)contig_start,
180 0 : contig_end - contig_start + VALUE_2MB);
181 : /* Don't bother handling unregister failures. It can't be any worse */
182 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
183 0 : goto err_unregister;
184 : }
185 :
186 : /* This page might be a part of a neighbour region, so process
187 : * it again. The idx_1gb will be incremented immediately.
188 : */
189 0 : idx_1gb--;
190 0 : }
191 0 : contig_start = UINT64_MAX;
192 : }
193 0 : }
194 0 : }
195 :
196 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
197 0 : return 0;
198 :
199 : err_unregister:
200 : /* Unwind to the first empty translation so we don't unregister
201 : * a region that just failed to register.
202 : */
203 0 : idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
204 0 : idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
205 0 : contig_start = UINT64_MAX;
206 0 : contig_end = UINT64_MAX;
207 :
208 : /* Unregister any memory we managed to register before the failure */
209 0 : for (; idx_256tb < SIZE_MAX; idx_256tb--) {
210 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
211 :
212 0 : if (!map_1gb) {
213 0 : if (contig_end != UINT64_MAX) {
214 : /* End of a virtually contiguous range */
215 0 : map->ops.notify_cb(map->cb_ctx, map,
216 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
217 0 : (void *)contig_start,
218 0 : contig_end - contig_start + VALUE_2MB);
219 0 : }
220 0 : contig_end = UINT64_MAX;
221 0 : continue;
222 : }
223 :
224 0 : for (; idx_1gb < UINT64_MAX; idx_1gb--) {
225 : /* Rebuild the virtual address from the indexes */
226 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
227 0 : if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
228 0 : (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
229 :
230 0 : if (contig_end == UINT64_MAX) {
231 0 : contig_end = vaddr;
232 0 : }
233 0 : contig_start = vaddr;
234 0 : } else {
235 0 : if (contig_end != UINT64_MAX) {
236 0 : if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
237 0 : contig_start = vaddr;
238 0 : }
239 : /* End of a virtually contiguous range */
240 0 : map->ops.notify_cb(map->cb_ctx, map,
241 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
242 0 : (void *)contig_start,
243 0 : contig_end - contig_start + VALUE_2MB);
244 0 : }
245 0 : contig_end = UINT64_MAX;
246 : }
247 0 : }
248 0 : idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
249 0 : }
250 :
251 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
252 0 : return rc;
253 0 : }
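/*
 * Example of the REG_MAP_NOTIFY_START barrier used by the walk above (an
 * illustrative sketch, not part of the original file): if a caller registers
 * two adjacent ranges separately, the first 2MB entry of each range carries
 * REG_MAP_NOTIFY_START, so the walk reports them as two regions instead of
 * merging them, and they can later be unregistered in the same chunks:
 *
 *   spdk_mem_register(base, 4 * VALUE_2MB);                  // region A
 *   spdk_mem_register(base + 4 * VALUE_2MB, 2 * VALUE_2MB);  // region B, adjacent to A
 *   // mem_map_notify_walk() issues two notify_cb() calls, one per region.
 */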
254 :
255 : struct spdk_mem_map *
256 0 : spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
257 : {
258 : struct spdk_mem_map *map;
259 : int rc;
260 : size_t i;
261 :
262 0 : map = calloc(1, sizeof(*map));
263 0 : if (map == NULL) {
264 0 : return NULL;
265 : }
266 :
267 0 : if (pthread_mutex_init(&map->mutex, NULL)) {
268 0 : free(map);
269 0 : return NULL;
270 : }
271 :
272 0 : map->default_translation = default_translation;
273 0 : map->cb_ctx = cb_ctx;
274 0 : if (ops) {
275 0 : map->ops = *ops;
276 0 : }
277 :
278 0 : if (ops && ops->notify_cb) {
279 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
280 0 : rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
281 0 : if (rc != 0) {
282 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
283 0 : DEBUG_PRINT("Initial mem_map notify failed\n");
284 0 : pthread_mutex_destroy(&map->mutex);
285 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
286 0 : free(map->map_256tb.map[i]);
287 0 : }
288 0 : free(map);
289 0 : return NULL;
290 : }
291 0 : TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
292 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
293 0 : }
294 :
295 0 : return map;
296 0 : }
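/*
 * Minimal usage sketch for spdk_mem_map_alloc() (illustrative only, not part of
 * the original file). A consumer supplies a notify callback that is invoked for
 * every currently registered region and for future (un)registrations; the names
 * my_notify/my_ops/my_map below are hypothetical.
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           // e.g. create or destroy an IOMMU/RDMA mapping for [vaddr, vaddr + len)
 *           return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *my_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &my_ops, NULL);
 *   ...
 *   spdk_mem_map_free(&my_map);
 */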
297 :
298 : void
299 0 : spdk_mem_map_free(struct spdk_mem_map **pmap)
300 : {
301 : struct spdk_mem_map *map;
302 : size_t i;
303 :
304 0 : if (!pmap) {
305 0 : return;
306 : }
307 :
308 0 : map = *pmap;
309 :
310 0 : if (!map) {
311 0 : return;
312 : }
313 :
314 0 : if (map->ops.notify_cb) {
315 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
316 0 : mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
317 0 : TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
318 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
319 0 : }
320 :
321 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
322 0 : free(map->map_256tb.map[i]);
323 0 : }
324 :
325 0 : pthread_mutex_destroy(&map->mutex);
326 :
327 0 : free(map);
328 0 : *pmap = NULL;
329 0 : }
330 :
331 : int
332 0 : spdk_mem_register(void *_vaddr, size_t len)
333 : {
334 : struct spdk_mem_map *map;
335 : int rc;
336 0 : uint64_t vaddr = (uintptr_t)_vaddr;
337 : uint64_t seg_vaddr;
338 : size_t seg_len;
339 : uint64_t reg;
340 :
341 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
342 0 : DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
343 0 : return -EINVAL;
344 : }
345 :
346 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
347 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
348 : __func__, vaddr, len);
349 0 : return -EINVAL;
350 : }
351 :
352 0 : if (len == 0) {
353 0 : return 0;
354 : }
355 :
356 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
357 :
358 0 : seg_vaddr = vaddr;
359 0 : seg_len = len;
360 0 : while (seg_len > 0) {
361 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
362 0 : if (reg & REG_MAP_REGISTERED) {
363 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
364 0 : return -EBUSY;
365 : }
366 0 : seg_vaddr += VALUE_2MB;
367 0 : seg_len -= VALUE_2MB;
368 : }
369 :
370 0 : seg_vaddr = vaddr;
371 0 : seg_len = 0;
372 0 : while (len > 0) {
373 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
374 0 : seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
375 0 : seg_len += VALUE_2MB;
376 0 : vaddr += VALUE_2MB;
377 0 : len -= VALUE_2MB;
378 : }
379 :
380 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
381 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
382 0 : (void *)seg_vaddr, seg_len);
383 0 : if (rc != 0) {
384 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
385 0 : return rc;
386 : }
387 0 : }
388 :
389 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
390 0 : return 0;
391 0 : }
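/*
 * Usage sketch (illustrative only, not part of the original file):
 * spdk_mem_register() requires a 2MB-aligned address and length and notifies
 * every allocated mem map. A typical caller registers memory it obtained
 * outside of DPDK, e.g. an anonymous hugepage mapping (assuming 2MB hugepages
 * are configured on the system):
 *
 *   size_t len = 4 * VALUE_2MB;
 *   void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *   if (buf != MAP_FAILED && spdk_mem_register(buf, len) == 0) {
 *           // buf is now visible to all mem maps (vtophys, IOMMU, NUMA, ...)
 *           spdk_mem_unregister(buf, len);
 *   }
 */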
392 :
393 : int
394 0 : spdk_mem_unregister(void *_vaddr, size_t len)
395 : {
396 : struct spdk_mem_map *map;
397 : int rc;
398 0 : uint64_t vaddr = (uintptr_t)_vaddr;
399 : uint64_t seg_vaddr;
400 : size_t seg_len;
401 : uint64_t reg, newreg;
402 :
403 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
404 0 : DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
405 0 : return -EINVAL;
406 : }
407 :
408 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
409 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
410 : __func__, vaddr, len);
411 0 : return -EINVAL;
412 : }
413 :
414 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
415 :
416 : /* The first page must be a start of a region. Also check if it's
417 : * registered to make sure we don't return -ERANGE for non-registered
418 : * regions.
419 : */
420 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
421 0 : if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
422 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
423 0 : return -ERANGE;
424 : }
425 :
426 0 : seg_vaddr = vaddr;
427 0 : seg_len = len;
428 0 : while (seg_len > 0) {
429 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
430 0 : if ((reg & REG_MAP_REGISTERED) == 0) {
431 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
432 0 : return -EINVAL;
433 : }
434 0 : seg_vaddr += VALUE_2MB;
435 0 : seg_len -= VALUE_2MB;
436 : }
437 :
438 0 : newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
439 : /* If the next page is registered, it must be a start of a region as well,
440 : * otherwise we'd be unregistering only a part of a region.
441 : */
442 0 : if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
443 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
444 0 : return -ERANGE;
445 : }
446 0 : seg_vaddr = vaddr;
447 0 : seg_len = 0;
448 :
449 0 : while (len > 0) {
450 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
451 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
452 :
453 0 : if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
454 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
455 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
456 0 : (void *)seg_vaddr, seg_len);
457 0 : if (rc != 0) {
458 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
459 0 : return rc;
460 : }
461 0 : }
462 :
463 0 : seg_vaddr = vaddr;
464 0 : seg_len = VALUE_2MB;
465 0 : } else {
466 0 : seg_len += VALUE_2MB;
467 : }
468 :
469 0 : vaddr += VALUE_2MB;
470 0 : len -= VALUE_2MB;
471 : }
472 :
473 0 : if (seg_len > 0) {
474 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
475 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
476 0 : (void *)seg_vaddr, seg_len);
477 0 : if (rc != 0) {
478 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
479 0 : return rc;
480 : }
481 0 : }
482 0 : }
483 :
484 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
485 0 : return 0;
486 0 : }
487 :
488 : int
489 0 : spdk_mem_reserve(void *vaddr, size_t len)
490 : {
491 : struct spdk_mem_map *map;
492 : void *seg_vaddr;
493 : size_t seg_len;
494 : uint64_t reg;
495 :
496 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
497 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
498 0 : return -EINVAL;
499 : }
500 :
501 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
502 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
503 : __func__, vaddr, len);
504 0 : return -EINVAL;
505 : }
506 :
507 0 : if (len == 0) {
508 0 : return 0;
509 : }
510 :
511 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
512 :
513 : /* Check if any part of this range is already registered */
514 0 : seg_vaddr = vaddr;
515 0 : seg_len = len;
516 0 : while (seg_len > 0) {
517 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
518 0 : if (reg & REG_MAP_REGISTERED) {
519 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
520 0 : return -EBUSY;
521 : }
522 0 : seg_vaddr += VALUE_2MB;
523 0 : seg_len -= VALUE_2MB;
524 : }
525 :
526 : /* Simply set the translation to the memory map's default. This allocates the space in the
527 : * map but does not provide a valid translation. */
528 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
529 0 : g_mem_reg_map->default_translation);
530 :
531 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
532 0 : spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
533 0 : }
534 :
535 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
536 0 : return 0;
537 0 : }
538 :
539 : static struct map_1gb *
540 0 : mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
541 : {
542 : struct map_1gb *map_1gb;
543 0 : uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
544 : size_t i;
545 :
546 0 : if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
547 0 : return NULL;
548 : }
549 :
550 0 : map_1gb = map->map_256tb.map[idx_256tb];
551 :
552 0 : if (!map_1gb) {
553 0 : pthread_mutex_lock(&map->mutex);
554 :
555 : /* Recheck to make sure nobody else got the mutex first. */
556 0 : map_1gb = map->map_256tb.map[idx_256tb];
557 0 : if (!map_1gb) {
558 0 : map_1gb = malloc(sizeof(struct map_1gb));
559 0 : if (map_1gb) {
560 : /* initialize all entries to default translation */
561 0 : for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
562 0 : map_1gb->map[i].translation_2mb = map->default_translation;
563 0 : }
564 0 : map->map_256tb.map[idx_256tb] = map_1gb;
565 0 : }
566 0 : }
567 :
568 0 : pthread_mutex_unlock(&map->mutex);
569 :
570 0 : if (!map_1gb) {
571 0 : DEBUG_PRINT("allocation failed\n");
572 0 : return NULL;
573 : }
574 0 : }
575 :
576 0 : return map_1gb;
577 0 : }
578 :
579 : int
580 0 : spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
581 : uint64_t translation)
582 : {
583 : uint64_t vfn_2mb;
584 : struct map_1gb *map_1gb;
585 : uint64_t idx_1gb;
586 : struct map_2mb *map_2mb;
587 :
588 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
589 0 : DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
590 0 : return -EINVAL;
591 : }
592 :
593 : /* For now, only 2 MB-aligned registrations are supported */
594 0 : if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
595 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
596 : __func__, vaddr, size);
597 0 : return -EINVAL;
598 : }
599 :
600 0 : vfn_2mb = vaddr >> SHIFT_2MB;
601 :
602 0 : while (size) {
603 0 : map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
604 0 : if (!map_1gb) {
605 0 : DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
606 0 : return -ENOMEM;
607 : }
608 :
609 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
610 0 : map_2mb = &map_1gb->map[idx_1gb];
611 0 : map_2mb->translation_2mb = translation;
612 :
613 0 : size -= VALUE_2MB;
614 0 : vfn_2mb++;
615 : }
616 :
617 0 : return 0;
618 0 : }
619 :
620 : int
621 0 : spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
622 : {
623 0 : return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
624 : }
625 :
626 : inline uint64_t
627 0 : spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
628 : {
629 : const struct map_1gb *map_1gb;
630 : const struct map_2mb *map_2mb;
631 : uint64_t idx_256tb;
632 : uint64_t idx_1gb;
633 : uint64_t vfn_2mb;
634 : uint64_t cur_size;
635 : uint64_t prev_translation;
636 : uint64_t orig_translation;
637 :
638 0 : if (spdk_unlikely(vaddr & ~MASK_256TB)) {
639 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
640 0 : return map->default_translation;
641 : }
642 :
643 0 : vfn_2mb = vaddr >> SHIFT_2MB;
644 0 : idx_256tb = MAP_256TB_IDX(vfn_2mb);
645 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
646 :
647 0 : map_1gb = map->map_256tb.map[idx_256tb];
648 0 : if (spdk_unlikely(!map_1gb)) {
649 0 : return map->default_translation;
650 : }
651 :
652 0 : cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
653 0 : map_2mb = &map_1gb->map[idx_1gb];
654 0 : if (size == NULL || map->ops.are_contiguous == NULL ||
655 0 : map_2mb->translation_2mb == map->default_translation) {
656 0 : if (size != NULL) {
657 0 : *size = spdk_min(*size, cur_size);
658 0 : }
659 0 : return map_2mb->translation_2mb;
660 : }
661 :
662 0 : orig_translation = map_2mb->translation_2mb;
663 0 : prev_translation = orig_translation;
664 0 : while (cur_size < *size) {
665 0 : vfn_2mb++;
666 0 : idx_256tb = MAP_256TB_IDX(vfn_2mb);
667 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
668 :
669 0 : map_1gb = map->map_256tb.map[idx_256tb];
670 0 : if (spdk_unlikely(!map_1gb)) {
671 0 : break;
672 : }
673 :
674 0 : map_2mb = &map_1gb->map[idx_1gb];
675 0 : if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
676 0 : break;
677 : }
678 :
679 0 : cur_size += VALUE_2MB;
680 0 : prev_translation = map_2mb->translation_2mb;
681 : }
682 :
683 0 : *size = spdk_min(*size, cur_size);
684 0 : return orig_translation;
685 0 : }
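/*
 * Sketch of how the translation accessors above fit together (illustrative
 * only, not part of the original file). spdk_mem_map_translate() can merge
 * neighbouring 2MB entries when the map supplies an are_contiguous callback,
 * as the vtophys map does later in this file:
 *
 *   uint64_t size = 4 * VALUE_2MB;
 *   uint64_t trans = spdk_mem_map_translate(map, (uint64_t)buf, &size);
 *   // On return, size is clamped to the largest run starting at buf whose
 *   // per-2MB translations the callback reports as contiguous; trans is the
 *   // translation of the first 2MB page (or map->default_translation).
 */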
686 :
687 : static void
688 0 : memory_hotplug_cb(enum rte_mem_event event_type,
689 : const void *addr, size_t len, void *arg)
690 : {
691 0 : if (event_type == RTE_MEM_EVENT_ALLOC) {
692 0 : spdk_mem_register((void *)addr, len);
693 :
694 0 : if (!spdk_env_dpdk_external_init()) {
695 0 : return;
696 : }
697 :
698 : /* When the user initialized DPDK separately, we can't
699 : * be sure that --match-allocations RTE flag was specified.
700 : * Without this flag, DPDK can free memory in different units
701 : * than it was allocated in, which doesn't work with things like RDMA MRs.
702 : *
703 : * For such cases, we mark segments so they aren't freed.
704 : */
705 0 : while (len > 0) {
706 : struct rte_memseg *seg;
707 :
708 0 : seg = rte_mem_virt2memseg(addr, NULL);
709 0 : assert(seg != NULL);
710 0 : seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
711 0 : addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
712 0 : len -= seg->hugepage_sz;
713 : }
714 0 : } else if (event_type == RTE_MEM_EVENT_FREE) {
715 0 : spdk_mem_unregister((void *)addr, len);
716 0 : }
717 0 : }
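/*
 * Note on the DO_NOT_FREE marking above (an illustrative sketch, not from the
 * original file): when SPDK drives the DPDK initialization itself it typically
 * passes the EAL option "--match-allocations", which makes DPDK free
 * dynamically allocated hugepage memory in the same units it was allocated in,
 * so the segments don't need to be pinned. An externally initialized DPDK
 * application can opt into the same behaviour, for example:
 *
 *   char *eal_args[] = { "app", "--match-allocations" };
 *   rte_eal_init(SPDK_COUNTOF(eal_args), eal_args);
 */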
718 :
719 : static int
720 0 : memory_iter_cb(const struct rte_memseg_list *msl,
721 : const struct rte_memseg *ms, size_t len, void *arg)
722 : {
723 0 : return spdk_mem_register(ms->addr, len);
724 : }
725 :
726 : static bool g_mem_event_cb_registered = false;
727 :
728 : static int
729 0 : mem_map_mem_event_callback_register(void)
730 : {
731 : int rc;
732 :
733 0 : rc = rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
734 0 : if (rc != 0) {
735 0 : return rc;
736 : }
737 :
738 0 : g_mem_event_cb_registered = true;
739 0 : return 0;
740 0 : }
741 :
742 : static void
743 0 : mem_map_mem_event_callback_unregister(void)
744 : {
745 0 : if (g_mem_event_cb_registered) {
746 0 : g_mem_event_cb_registered = false;
747 0 : rte_mem_event_callback_unregister("spdk", NULL);
748 0 : }
749 0 : }
750 :
751 : int
752 0 : mem_map_init(bool legacy_mem)
753 : {
754 : int rc;
755 :
756 0 : g_legacy_mem = legacy_mem;
757 :
758 0 : g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
759 0 : if (g_mem_reg_map == NULL) {
760 0 : DEBUG_PRINT("memory registration map allocation failed\n");
761 0 : return -ENOMEM;
762 : }
763 :
764 0 : if (!g_huge_pages) {
765 0 : return 0;
766 : }
767 :
768 0 : if (!g_legacy_mem) {
769 : /**
770 : * To prevent DPDK from complaining, only register the callback when
771 : * we are not in legacy mem mode.
772 : */
773 0 : rc = mem_map_mem_event_callback_register();
774 0 : if (rc != 0) {
775 0 : DEBUG_PRINT("memory event callback registration failed, rc = %d\n", rc);
776 0 : goto err_free_reg_map;
777 : }
778 0 : }
779 :
780 : /*
781 : * Walk all DPDK memory segments and register them
782 : * with the main memory map
783 : */
784 0 : rc = rte_memseg_contig_walk(memory_iter_cb, NULL);
785 0 : if (rc != 0) {
786 0 : DEBUG_PRINT("memory segments walking failed, rc = %d\n", rc);
787 0 : goto err_unregister_mem_cb;
788 : }
789 :
790 0 : return 0;
791 :
792 : err_unregister_mem_cb:
793 0 : mem_map_mem_event_callback_unregister();
794 : err_free_reg_map:
795 0 : spdk_mem_map_free(&g_mem_reg_map);
796 0 : return rc;
797 0 : }
798 :
799 : void
800 0 : mem_map_fini(void)
801 : {
802 0 : mem_map_mem_event_callback_unregister();
803 0 : spdk_mem_map_free(&g_mem_reg_map);
804 0 : }
805 :
806 : bool
807 0 : spdk_iommu_is_enabled(void)
808 : {
809 : #if VFIO_ENABLED
810 : return g_vfio.enabled && !g_vfio.noiommu_enabled;
811 : #else
812 0 : return false;
813 : #endif
814 : }
815 :
816 : struct spdk_vtophys_pci_device {
817 : struct rte_pci_device *pci_device;
818 : TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
819 : };
820 :
821 : static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
822 : static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
823 : TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
824 :
825 : static struct spdk_mem_map *g_vtophys_map;
826 : static struct spdk_mem_map *g_phys_ref_map;
827 : static struct spdk_mem_map *g_numa_map;
828 :
829 : #if VFIO_ENABLED
830 : static int
831 : _vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
832 : {
833 : struct spdk_vfio_dma_map *dma_map;
834 : int ret;
835 :
836 : dma_map = calloc(1, sizeof(*dma_map));
837 : if (dma_map == NULL) {
838 : return -ENOMEM;
839 : }
840 :
841 : dma_map->map.argsz = sizeof(dma_map->map);
842 : dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
843 : dma_map->map.vaddr = vaddr;
844 : dma_map->map.iova = iova;
845 : dma_map->map.size = size;
846 :
847 : if (g_vfio.device_ref == 0) {
848 : /* VFIO requires at least one device (IOMMU group) to be added to
849 : * a VFIO container before it is possible to perform any IOMMU
850 : * operations on that container. This memory will be mapped once
851 : * the first device (IOMMU group) is hotplugged.
852 : *
853 : * Since the vfio container is managed internally by DPDK, it is
854 : * also possible that some device is already in that container, but
855 : * it's not managed by SPDK - e.g. a NIC attached internally
856 : * inside DPDK. We could map the memory straight away in such a
857 : * scenario, but there's no need to do it. DPDK devices clearly
858 : * don't need our mappings and hence we defer the mapping
859 : * unconditionally until the first SPDK-managed device is
860 : * hotplugged.
861 : */
862 : goto out_insert;
863 : }
864 :
865 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
866 : if (ret) {
867 : /* There are cases where the vfio container doesn't have an IOMMU group; it's safe to ignore the error then */
868 : SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
869 : }
870 :
871 : out_insert:
872 : TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
873 : return 0;
874 : }
875 :
876 :
877 : static int
878 : vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
879 : {
880 : uint64_t refcount;
881 : int ret;
882 :
883 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
884 : assert(refcount < UINT64_MAX);
885 : if (refcount > 0) {
886 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
887 : return 0;
888 : }
889 :
890 : pthread_mutex_lock(&g_vfio.mutex);
891 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
892 : pthread_mutex_unlock(&g_vfio.mutex);
893 : if (ret) {
894 : return ret;
895 : }
896 :
897 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
898 : return 0;
899 : }
900 :
901 : int
902 : vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
903 : {
904 : int ret;
905 :
906 : pthread_mutex_lock(&g_vfio.mutex);
907 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
908 : pthread_mutex_unlock(&g_vfio.mutex);
909 :
910 : return ret;
911 : }
912 :
913 : static int
914 : _vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
915 : {
916 : struct vfio_iommu_type1_dma_unmap unmap = {};
917 : int ret;
918 :
919 : if (g_vfio.device_ref == 0) {
920 : /* Memory is not mapped anymore, just remove its references */
921 : goto out_remove;
922 : }
923 :
924 : unmap.argsz = sizeof(unmap);
925 : unmap.flags = 0;
926 : unmap.iova = dma_map->map.iova;
927 : unmap.size = dma_map->map.size;
928 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
929 : if (ret) {
930 : SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
931 : }
932 :
933 : out_remove:
934 : TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
935 : free(dma_map);
936 : return 0;
937 : }
938 :
939 : static int
940 : vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
941 : {
942 : struct spdk_vfio_dma_map *dma_map;
943 : uint64_t refcount;
944 : int ret;
945 :
946 : pthread_mutex_lock(&g_vfio.mutex);
947 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
948 : if (dma_map->map.iova == iova) {
949 : break;
950 : }
951 : }
952 :
953 : if (dma_map == NULL) {
954 : DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
955 : pthread_mutex_unlock(&g_vfio.mutex);
956 : return -ENXIO;
957 : }
958 :
959 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
960 : assert(refcount < UINT64_MAX);
961 : if (refcount > 0) {
962 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
963 : }
964 :
965 : /* We still have outstanding references, don't clear it. */
966 : if (refcount > 1) {
967 : pthread_mutex_unlock(&g_vfio.mutex);
968 : return 0;
969 : }
970 :
971 : /** don't support partial or multiple-page unmap for now */
972 : assert(dma_map->map.size == size);
973 :
974 : ret = _vfio_iommu_unmap_dma(dma_map);
975 : pthread_mutex_unlock(&g_vfio.mutex);
976 :
977 : return ret;
978 : }
979 :
980 : int
981 : vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
982 : {
983 : struct spdk_vfio_dma_map *dma_map;
984 : int ret;
985 :
986 : pthread_mutex_lock(&g_vfio.mutex);
987 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
988 : if (dma_map->map.vaddr == vaddr) {
989 : break;
990 : }
991 : }
992 :
993 : if (dma_map == NULL) {
994 : DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
995 : pthread_mutex_unlock(&g_vfio.mutex);
996 : return -ENXIO;
997 : }
998 :
999 : ret = _vfio_iommu_unmap_dma(dma_map);
1000 : pthread_mutex_unlock(&g_vfio.mutex);
1001 : return ret;
1002 : }
1003 : #endif
1004 :
1005 : static uint64_t
1006 0 : vtophys_get_paddr_memseg(uint64_t vaddr)
1007 : {
1008 : uintptr_t paddr;
1009 : struct rte_memseg *seg;
1010 :
1011 0 : seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
1012 0 : if (seg != NULL) {
1013 0 : paddr = seg->iova;
1014 0 : if (paddr == RTE_BAD_IOVA) {
1015 0 : return SPDK_VTOPHYS_ERROR;
1016 : }
1017 0 : paddr += (vaddr - (uintptr_t)seg->addr);
1018 0 : return paddr;
1019 : }
1020 :
1021 0 : return SPDK_VTOPHYS_ERROR;
1022 0 : }
1023 :
1024 : /* Try to get the paddr from /proc/self/pagemap */
1025 : static uint64_t
1026 0 : vtophys_get_paddr_pagemap(uint64_t vaddr)
1027 : {
1028 : uintptr_t paddr;
1029 :
1030 : /* Silence static analyzers */
1031 0 : assert(vaddr != 0);
1032 0 : paddr = rte_mem_virt2iova((void *)vaddr);
1033 0 : if (paddr == RTE_BAD_IOVA) {
1034 : /*
1035 : * The vaddr may be valid but doesn't have a backing page
1036 : * assigned yet. Touch the page to ensure a backing page
1037 : * gets assigned, then try to translate again.
1038 : */
1039 0 : rte_atomic64_read((rte_atomic64_t *)vaddr);
1040 0 : paddr = rte_mem_virt2iova((void *)vaddr);
1041 0 : }
1042 0 : if (paddr == RTE_BAD_IOVA) {
1043 : /* Unable to get to the physical address. */
1044 0 : return SPDK_VTOPHYS_ERROR;
1045 : }
1046 :
1047 0 : return paddr;
1048 0 : }
1049 :
1050 : static uint64_t
1051 0 : pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
1052 : {
1053 : struct rte_mem_resource *res;
1054 : uint64_t paddr;
1055 : unsigned r;
1056 :
1057 0 : for (r = 0; r < PCI_MAX_RESOURCE; r++) {
1058 0 : res = dpdk_pci_device_get_mem_resource(dev, r);
1059 :
1060 0 : if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
1061 0 : (vaddr + len) >= (uint64_t)res->addr + res->len) {
1062 0 : continue;
1063 : }
1064 :
1065 : #if VFIO_ENABLED
1066 : if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1067 : /*
1068 : * The IOMMU is on and we're using IOVA == VA. The BAR was
1069 : * automatically registered when it was mapped, so just return
1070 : * the virtual address here.
1071 : */
1072 : return vaddr;
1073 : }
1074 : #endif
1075 0 : paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1076 0 : return paddr;
1077 : }
1078 :
1079 0 : return SPDK_VTOPHYS_ERROR;
1080 0 : }
1081 :
1082 : /* Try to get the paddr from pci devices */
1083 : static uint64_t
1084 0 : vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1085 : {
1086 : struct spdk_vtophys_pci_device *vtophys_dev;
1087 : uintptr_t paddr;
1088 : struct rte_pci_device *dev;
1089 :
1090 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1091 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1092 0 : dev = vtophys_dev->pci_device;
1093 0 : paddr = pci_device_vtophys(dev, vaddr, len);
1094 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1095 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1096 0 : return paddr;
1097 : }
1098 0 : }
1099 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1100 :
1101 0 : return SPDK_VTOPHYS_ERROR;
1102 0 : }
1103 :
1104 : static int
1105 0 : vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1106 : enum spdk_mem_map_notify_action action,
1107 : void *vaddr, size_t len)
1108 : {
1109 0 : int rc = 0;
1110 : uint64_t paddr;
1111 :
1112 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
1113 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1114 0 : return -EINVAL;
1115 : }
1116 :
1117 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1118 0 : DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
1119 : vaddr, len);
1120 0 : return -EINVAL;
1121 : }
1122 :
1123 : /* Get the physical address from the DPDK memsegs */
1124 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1125 :
1126 0 : switch (action) {
1127 : case SPDK_MEM_MAP_NOTIFY_REGISTER:
1128 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1129 : /* This is not an address that DPDK is managing. */
1130 :
1131 : /* Check if this is a PCI BAR. They need special handling */
1132 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1133 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1134 : /* Get paddr for each 2MB chunk in this address range */
1135 0 : while (len > 0) {
1136 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1137 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1138 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1139 0 : return -EFAULT;
1140 : }
1141 :
1142 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1143 0 : if (rc != 0) {
1144 0 : return rc;
1145 : }
1146 :
1147 0 : vaddr += VALUE_2MB;
1148 0 : len -= VALUE_2MB;
1149 : }
1150 :
1151 0 : return 0;
1152 : }
1153 :
1154 : #if VFIO_ENABLED
1155 : enum rte_iova_mode iova_mode;
1156 :
1157 : iova_mode = rte_eal_iova_mode();
1158 :
1159 : if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1160 : /* We'll use the virtual address as the iova to match DPDK. */
1161 : paddr = (uint64_t)vaddr;
1162 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1163 : if (rc) {
1164 : return -EFAULT;
1165 : }
1166 : while (len > 0) {
1167 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1168 : if (rc != 0) {
1169 : return rc;
1170 : }
1171 : vaddr += VALUE_2MB;
1172 : paddr += VALUE_2MB;
1173 : len -= VALUE_2MB;
1174 : }
1175 : } else
1176 : #endif
1177 : {
1178 : /* Get the physical address from /proc/self/pagemap. */
1179 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1180 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1181 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1182 0 : return -EFAULT;
1183 : }
1184 :
1185 : /* Get paddr for each 2MB chunk in this address range */
1186 0 : while (len > 0) {
1187 : /* Get the physical address from /proc/self/pagemap. */
1188 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1189 :
1190 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1191 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1192 0 : return -EFAULT;
1193 : }
1194 :
1195 0 : if (paddr & MASK_2MB) {
1196 0 : DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1197 0 : return -EINVAL;
1198 : }
1199 : #if VFIO_ENABLED
1200 : /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
1201 : * with the IOMMU using the physical address to match. */
1202 : if (spdk_iommu_is_enabled()) {
1203 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1204 : if (rc) {
1205 : DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1206 : return -EFAULT;
1207 : }
1208 : }
1209 : #endif
1210 :
1211 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1212 0 : if (rc != 0) {
1213 0 : return rc;
1214 : }
1215 :
1216 0 : vaddr += VALUE_2MB;
1217 0 : len -= VALUE_2MB;
1218 : }
1219 : }
1220 0 : } else {
1221 : /* This is an address managed by DPDK. Just setup the translations. */
1222 0 : while (len > 0) {
1223 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1224 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1225 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1226 0 : return -EFAULT;
1227 : }
1228 :
1229 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1230 0 : if (rc != 0) {
1231 0 : return rc;
1232 : }
1233 :
1234 0 : vaddr += VALUE_2MB;
1235 0 : len -= VALUE_2MB;
1236 : }
1237 : }
1238 :
1239 0 : break;
1240 : case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1241 : #if VFIO_ENABLED
1242 : if (paddr == SPDK_VTOPHYS_ERROR) {
1243 : /*
1244 : * This is not an address that DPDK is managing.
1245 : */
1246 :
1247 : /* Check if this is a PCI BAR. They need special handling */
1248 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1249 : if (paddr != SPDK_VTOPHYS_ERROR) {
1250 : /* Get paddr for each 2MB chunk in this address range */
1251 : while (len > 0) {
1252 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1253 : if (paddr == SPDK_VTOPHYS_ERROR) {
1254 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1255 : return -EFAULT;
1256 : }
1257 :
1258 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1259 : if (rc != 0) {
1260 : return rc;
1261 : }
1262 :
1263 : vaddr += VALUE_2MB;
1264 : len -= VALUE_2MB;
1265 : }
1266 :
1267 : return 0;
1268 : }
1269 :
1270 : /* If vfio is enabled,
1271 : * we need to unmap the range from the IOMMU
1272 : */
1273 : if (spdk_iommu_is_enabled()) {
1274 : uint64_t buffer_len = len;
1275 : uint8_t *va = vaddr;
1276 : enum rte_iova_mode iova_mode;
1277 :
1278 : iova_mode = rte_eal_iova_mode();
1279 : /*
1280 : * In virtual address mode, the region is contiguous and can be done in
1281 : * one unmap.
1282 : */
1283 : if (iova_mode == RTE_IOVA_VA) {
1284 : paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1285 : if (buffer_len != len || paddr != (uintptr_t)va) {
1286 : DEBUG_PRINT("Unmapping %p with length %lu failed because "
1287 : "translation had address 0x%" PRIx64 " and length %lu\n",
1288 : va, len, paddr, buffer_len);
1289 : return -EINVAL;
1290 : }
1291 : rc = vtophys_iommu_unmap_dma(paddr, len);
1292 : if (rc) {
1293 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1294 : return -EFAULT;
1295 : }
1296 : } else if (iova_mode == RTE_IOVA_PA) {
1297 : /* Get paddr for each 2MB chunk in this address range */
1298 : while (buffer_len > 0) {
1299 : paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1300 :
1301 : if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1302 : DEBUG_PRINT("could not get phys addr for %p\n", va);
1303 : return -EFAULT;
1304 : }
1305 :
1306 : rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1307 : if (rc) {
1308 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1309 : return -EFAULT;
1310 : }
1311 :
1312 : va += VALUE_2MB;
1313 : buffer_len -= VALUE_2MB;
1314 : }
1315 : }
1316 : }
1317 : }
1318 : #endif
1319 0 : while (len > 0) {
1320 0 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1321 0 : if (rc != 0) {
1322 0 : return rc;
1323 : }
1324 :
1325 0 : vaddr += VALUE_2MB;
1326 0 : len -= VALUE_2MB;
1327 : }
1328 :
1329 0 : break;
1330 : default:
1331 0 : SPDK_UNREACHABLE();
1332 : }
1333 :
1334 0 : return rc;
1335 0 : }
1336 :
1337 : static int
1338 0 : numa_notify(void *cb_ctx, struct spdk_mem_map *map,
1339 : enum spdk_mem_map_notify_action action,
1340 : void *vaddr, size_t len)
1341 : {
1342 : struct rte_memseg *seg;
1343 :
1344 : /* We always return 0 from here, even if we aren't able to get a
1345 : * memseg for the address. This can happen in non-DPDK memory
1346 : * registration paths, for example vhost or vfio-user. That is OK,
1347 : * spdk_mem_get_numa_id() just returns SPDK_ENV_NUMA_ID_ANY for
1348 : * that kind of memory. If we return an error here, the
1349 : * spdk_mem_register() from vhost or vfio-user would fail which is
1350 : * not what we want.
1351 : */
1352 0 : seg = rte_mem_virt2memseg(vaddr, NULL);
1353 0 : if (seg == NULL) {
1354 0 : return 0;
1355 : }
1356 :
1357 0 : switch (action) {
1358 : case SPDK_MEM_MAP_NOTIFY_REGISTER:
1359 0 : spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, seg->socket_id);
1360 0 : break;
1361 : case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1362 0 : spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
1363 0 : break;
1364 : default:
1365 0 : break;
1366 : }
1367 :
1368 0 : return 0;
1369 0 : }
1370 :
1371 : static int
1372 0 : vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1373 : {
1374 : /* This function is always called with paddrs for two subsequent
1375 : * 2MB chunks in virtual address space, so those chunks will be only
1376 : * physically contiguous if the physical addresses are 2MB apart
1377 : * from each other as well.
1378 : */
1379 0 : return (paddr2 - paddr1 == VALUE_2MB);
1380 : }
1381 :
1382 : #if VFIO_ENABLED
1383 :
1384 : static bool
1385 : vfio_enabled(void)
1386 : {
1387 : return rte_vfio_is_enabled("vfio_pci");
1388 : }
1389 :
1390 : /* Check if IOMMU is enabled on the system */
1391 : static bool
1392 : has_iommu_groups(void)
1393 : {
1394 : int count = 0;
1395 : DIR *dir = opendir("/sys/kernel/iommu_groups");
1396 :
1397 : if (dir == NULL) {
1398 : return false;
1399 : }
1400 :
1401 : while (count < 3 && readdir(dir) != NULL) {
1402 : count++;
1403 : }
1404 :
1405 : closedir(dir);
1406 : /* there will always be ./ and ../ entries */
1407 : return count > 2;
1408 : }
1409 :
1410 : static bool
1411 : vfio_noiommu_enabled(void)
1412 : {
1413 : return rte_vfio_noiommu_is_enabled();
1414 : }
1415 :
1416 : static void
1417 : vtophys_iommu_init(void)
1418 : {
1419 : char proc_fd_path[PATH_MAX + 1];
1420 : char link_path[PATH_MAX + 1];
1421 : const char vfio_path[] = "/dev/vfio/vfio";
1422 : DIR *dir;
1423 : struct dirent *d;
1424 :
1425 : if (!vfio_enabled()) {
1426 : return;
1427 : }
1428 :
1429 : if (vfio_noiommu_enabled()) {
1430 : g_vfio.noiommu_enabled = true;
1431 : } else if (!has_iommu_groups()) {
1432 : return;
1433 : }
1434 :
1435 : dir = opendir("/proc/self/fd");
1436 : if (!dir) {
1437 : DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1438 : return;
1439 : }
1440 :
1441 : while ((d = readdir(dir)) != NULL) {
1442 : if (d->d_type != DT_LNK) {
1443 : continue;
1444 : }
1445 :
1446 : snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1447 : if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1448 : continue;
1449 : }
1450 :
1451 : if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1452 : sscanf(d->d_name, "%d", &g_vfio.fd);
1453 : break;
1454 : }
1455 : }
1456 :
1457 : closedir(dir);
1458 :
1459 : if (g_vfio.fd < 0) {
1460 : DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1461 : return;
1462 : }
1463 :
1464 : g_vfio.enabled = true;
1465 :
1466 : return;
1467 : }
1468 :
1469 : #endif
1470 :
1471 : void
1472 0 : vtophys_pci_device_added(struct rte_pci_device *pci_device)
1473 : {
1474 : struct spdk_vtophys_pci_device *vtophys_dev;
1475 :
1476 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1477 :
1478 0 : vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1479 0 : if (vtophys_dev) {
1480 0 : vtophys_dev->pci_device = pci_device;
1481 0 : TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1482 0 : } else {
1483 0 : DEBUG_PRINT("Memory allocation error\n");
1484 : }
1485 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1486 :
1487 : #if VFIO_ENABLED
1488 : struct spdk_vfio_dma_map *dma_map;
1489 : int ret;
1490 :
1491 : if (!g_vfio.enabled) {
1492 : return;
1493 : }
1494 :
1495 : pthread_mutex_lock(&g_vfio.mutex);
1496 : g_vfio.device_ref++;
1497 : if (g_vfio.device_ref > 1) {
1498 : pthread_mutex_unlock(&g_vfio.mutex);
1499 : return;
1500 : }
1501 :
1502 : /* This is the first SPDK device using DPDK vfio. This means that the first
1503 : * IOMMU group might have just been added to the DPDK vfio container.
1504 : * From this point it is certain that the memory can be mapped now.
1505 : */
1506 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1507 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1508 : if (ret) {
1509 : DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1510 : break;
1511 : }
1512 : }
1513 : pthread_mutex_unlock(&g_vfio.mutex);
1514 : #endif
1515 0 : }
1516 :
1517 : void
1518 0 : vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1519 : {
1520 : struct spdk_vtophys_pci_device *vtophys_dev;
1521 :
1522 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1523 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1524 0 : if (vtophys_dev->pci_device == pci_device) {
1525 0 : TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1526 0 : free(vtophys_dev);
1527 0 : break;
1528 : }
1529 0 : }
1530 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1531 :
1532 : #if VFIO_ENABLED
1533 : struct spdk_vfio_dma_map *dma_map;
1534 : int ret;
1535 :
1536 : if (!g_vfio.enabled) {
1537 : return;
1538 : }
1539 :
1540 : pthread_mutex_lock(&g_vfio.mutex);
1541 : assert(g_vfio.device_ref > 0);
1542 : g_vfio.device_ref--;
1543 : if (g_vfio.device_ref > 0) {
1544 : pthread_mutex_unlock(&g_vfio.mutex);
1545 : return;
1546 : }
1547 :
1548 : /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1549 : * any additional devices using its vfio container, all the mappings
1550 : * will be automatically removed by the Linux vfio driver. We unmap
1551 : * the memory manually to be able to easily re-map it later regardless
1552 : * of other, external factors.
1553 : */
1554 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1555 : struct vfio_iommu_type1_dma_unmap unmap = {};
1556 : unmap.argsz = sizeof(unmap);
1557 : unmap.flags = 0;
1558 : unmap.iova = dma_map->map.iova;
1559 : unmap.size = dma_map->map.size;
1560 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1561 : if (ret) {
1562 : DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1563 : break;
1564 : }
1565 : }
1566 : pthread_mutex_unlock(&g_vfio.mutex);
1567 : #endif
1568 0 : }
1569 :
1570 : int
1571 0 : vtophys_init(void)
1572 : {
1573 0 : const struct spdk_mem_map_ops vtophys_map_ops = {
1574 : .notify_cb = vtophys_notify,
1575 : .are_contiguous = vtophys_check_contiguous_entries,
1576 : };
1577 :
1578 0 : const struct spdk_mem_map_ops phys_ref_map_ops = {
1579 : .notify_cb = NULL,
1580 : .are_contiguous = NULL,
1581 : };
1582 :
1583 0 : const struct spdk_mem_map_ops numa_map_ops = {
1584 : .notify_cb = numa_notify,
1585 : .are_contiguous = NULL,
1586 : };
1587 :
1588 : #if VFIO_ENABLED
1589 : vtophys_iommu_init();
1590 : #endif
1591 :
1592 0 : g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1593 0 : if (g_phys_ref_map == NULL) {
1594 0 : DEBUG_PRINT("phys_ref map allocation failed.\n");
1595 0 : return -ENOMEM;
1596 : }
1597 :
1598 0 : g_numa_map = spdk_mem_map_alloc(SPDK_ENV_NUMA_ID_ANY, &numa_map_ops, NULL);
1599 0 : if (g_numa_map == NULL) {
1600 0 : DEBUG_PRINT("numa map allocation failed.\n");
1601 0 : spdk_mem_map_free(&g_phys_ref_map);
1602 0 : return -ENOMEM;
1603 : }
1604 :
1605 0 : if (g_huge_pages) {
1606 0 : g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1607 0 : if (g_vtophys_map == NULL) {
1608 0 : DEBUG_PRINT("vtophys map allocation failed\n");
1609 0 : spdk_mem_map_free(&g_numa_map);
1610 0 : spdk_mem_map_free(&g_phys_ref_map);
1611 0 : return -ENOMEM;
1612 : }
1613 0 : }
1614 0 : return 0;
1615 0 : }
1616 :
1617 : void
1618 0 : vtophys_fini(void)
1619 : {
1620 0 : spdk_mem_map_free(&g_vtophys_map);
1621 0 : spdk_mem_map_free(&g_numa_map);
1622 0 : spdk_mem_map_free(&g_phys_ref_map);
1623 0 : }
1624 :
1625 : uint64_t
1626 0 : spdk_vtophys(const void *buf, uint64_t *size)
1627 : {
1628 : uint64_t vaddr, paddr_2mb;
1629 :
1630 0 : if (!g_huge_pages) {
1631 0 : return SPDK_VTOPHYS_ERROR;
1632 : }
1633 :
1634 0 : vaddr = (uint64_t)buf;
1635 0 : paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1636 :
1637 : /*
1638 : * SPDK_VTOPHYS_ERROR has all bits set. The 2MB translation used to be combined
1639 : * with the buffer offset via bitwise-or, which preserved SPDK_VTOPHYS_ERROR
1640 : * automatically. Now that we add the offset instead (PCI BAR translations can
1641 : * be unaligned), we must check the return value before the addition.
1642 : */
1643 : SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1644 0 : if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1645 0 : return SPDK_VTOPHYS_ERROR;
1646 : } else {
1647 0 : return paddr_2mb + (vaddr & MASK_2MB);
1648 : }
1649 0 : }
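/*
 * Usage sketch (illustrative only, not part of the original file): translating
 * a DMA buffer before handing it to hardware. Memory from spdk_dma_zmalloc() is
 * already registered, so the lookup succeeds unless hugepages are disabled:
 *
 *   uint64_t size = 4096;
 *   void *buf = spdk_dma_zmalloc(size, 0x1000, NULL);
 *   uint64_t phys = spdk_vtophys(buf, &size);
 *   if (phys != SPDK_VTOPHYS_ERROR) {
 *           // program the device with 'phys'; 'size' now holds the number of
 *           // physically contiguous bytes starting at 'buf' (at most the input)
 *   }
 *   spdk_dma_free(buf);
 */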
1650 :
1651 : int32_t
1652 0 : spdk_mem_get_numa_id(const void *buf, uint64_t *size)
1653 : {
1654 0 : return spdk_mem_map_translate(g_numa_map, (uint64_t)buf, size);
1655 : }
1656 :
1657 : int
1658 0 : spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1659 : {
1660 : struct rte_memseg *seg;
1661 : int ret, fd;
1662 :
1663 0 : seg = rte_mem_virt2memseg(vaddr, NULL);
1664 0 : if (!seg) {
1665 0 : SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1666 0 : return -ENOENT;
1667 : }
1668 :
1669 0 : fd = rte_memseg_get_fd_thread_unsafe(seg);
1670 0 : if (fd < 0) {
1671 0 : return fd;
1672 : }
1673 :
1674 0 : ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1675 0 : if (ret < 0) {
1676 0 : return ret;
1677 : }
1678 :
1679 0 : return fd;
1680 0 : }
1681 :
1682 : void
1683 0 : mem_disable_huge_pages(void)
1684 : {
1685 0 : g_huge_pages = false;
1686 0 : }
|