Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2017 Intel Corporation.
3 : * All rights reserved.
4 : */
5 :
6 : #include "spdk/stdinc.h"
7 :
8 : #include "env_internal.h"
9 : #include "pci_dpdk.h"
10 :
11 : #include <rte_config.h>
12 : #include <rte_memory.h>
13 : #include <rte_eal_memconfig.h>
14 : #include <rte_dev.h>
15 : #include <rte_pci.h>
16 :
17 : #include "spdk_internal/assert.h"
18 :
19 : #include "spdk/assert.h"
20 : #include "spdk/likely.h"
21 : #include "spdk/queue.h"
22 : #include "spdk/util.h"
23 : #include "spdk/memory.h"
24 : #include "spdk/env_dpdk.h"
25 : #include "spdk/log.h"
26 :
27 : #ifdef __linux__
28 : #include <linux/version.h>
29 : #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 : #include <linux/vfio.h>
31 : #include <rte_vfio.h>
32 :
33 : struct spdk_vfio_dma_map {
34 : struct vfio_iommu_type1_dma_map map;
35 : TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
36 : };
37 :
38 : struct vfio_cfg {
39 : int fd;
40 : bool enabled;
41 : bool noiommu_enabled;
42 : unsigned device_ref;
43 : TAILQ_HEAD(, spdk_vfio_dma_map) maps;
44 : pthread_mutex_t mutex;
45 : };
46 :
47 : static struct vfio_cfg g_vfio = {
48 : .fd = -1,
49 : .enabled = false,
50 : .noiommu_enabled = false,
51 : .device_ref = 0,
52 : .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
53 : .mutex = PTHREAD_MUTEX_INITIALIZER
54 : };
55 : #endif
56 : #endif
57 :
58 : #if DEBUG
59 : #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
60 : #else
61 : #define DEBUG_PRINT(...)
62 : #endif
63 :
64 : #define FN_2MB_TO_4KB(fn) ((fn) << (SHIFT_2MB - SHIFT_4KB))
65 : #define FN_4KB_TO_2MB(fn) ((fn) >> (SHIFT_2MB - SHIFT_4KB))
66 :
67 : #define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
68 : #define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
69 :
70 : /* Page is registered */
71 : #define REG_MAP_REGISTERED (1ULL << 62)
72 :
73 : /* A notification region barrier. The 2MB translation entry that's marked
74 : * with this flag must be unregistered separately. This allows contiguous
75 : * regions to be unregistered in the same chunks they were registered.
76 : */
77 : #define REG_MAP_NOTIFY_START (1ULL << 63)
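
/* Illustrative sketch (not part of this file): for a hypothetical 6 MB registration,
 * spdk_mem_register() below marks only the first 2 MB entry with REG_MAP_NOTIFY_START,
 * so a later spdk_mem_unregister() can recognize the original region boundary. Here
 * "entry[]" stands for three consecutive struct map_2mb slots in the registration map:
 *
 *   entry[0].translation_2mb = REG_MAP_REGISTERED | REG_MAP_NOTIFY_START;
 *   entry[1].translation_2mb = REG_MAP_REGISTERED;
 *   entry[2].translation_2mb = REG_MAP_REGISTERED;
 */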
78 :
79 : /* Translation of a single 2MB page. */
80 : struct map_2mb {
81 : uint64_t translation_2mb;
82 : };
83 :
84 : /* Second-level map table indexed by bits [21..29] of the virtual address.
85 : * Each entry holds the address translation, or the map's default translation
86 : * for entries that have not been set yet.
87 : */
88 : struct map_1gb {
89 : struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
90 : };
91 :
92 : /* Top-level map table indexed by bits [30..47] of the virtual address.
93 : * Each entry points to a second-level map table or NULL.
94 : */
95 : struct map_256tb {
96 : struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
97 : };
98 :
99 : /* Page-granularity memory address translation */
100 : struct spdk_mem_map {
101 : struct map_256tb map_256tb;
102 : pthread_mutex_t mutex;
103 : uint64_t default_translation;
104 : struct spdk_mem_map_ops ops;
105 : void *cb_ctx;
106 : TAILQ_ENTRY(spdk_mem_map) tailq;
107 : };
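
/* Illustrative sketch (not part of this file): decomposing a virtual address into the
 * trie indexes used throughout this file, assuming SHIFT_2MB == 21 and SHIFT_1GB == 30
 * as defined in spdk/memory.h. "map" is any struct spdk_mem_map pointer whose second
 * level table for this range has already been allocated:
 *
 *   uint64_t vaddr     = 0x00007f52a3c00000ULL;   // any user-space address
 *   uint64_t vfn_2mb   = vaddr >> SHIFT_2MB;      // 2 MB virtual frame number
 *   uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);  // bits [30..47] -> map_1gb table
 *   uint64_t idx_1gb   = MAP_1GB_IDX(vfn_2mb);    // bits [21..29] -> map_2mb entry
 *   struct map_2mb *e  = &map->map_256tb.map[idx_256tb]->map[idx_1gb];
 */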
108 :
109 : /* Registrations map. The 64-bit translations are bit fields with the
110 : * following layout (starting with the low bits):
111 : * 0 - 61 : reserved
112 : * 62 - 63 : flags
113 : */
114 : static struct spdk_mem_map *g_mem_reg_map;
115 : static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
116 : TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
117 : static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
118 :
119 : static bool g_legacy_mem;
120 : static bool g_huge_pages = true;
121 : static bool g_mem_event_cb_registered = false;
122 :
123 : /*
124 : * Walk the currently registered memory via the main memory registration map
125 : * and call the new map's notify callback for each virtually contiguous region.
126 : */
127 : static int
128 0 : mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
129 : {
130 : size_t idx_256tb;
131 : uint64_t idx_1gb;
132 0 : uint64_t contig_start = UINT64_MAX;
133 0 : uint64_t contig_end = UINT64_MAX;
134 : struct map_1gb *map_1gb;
135 : int rc;
136 :
137 0 : if (!g_mem_reg_map) {
138 0 : return -EINVAL;
139 : }
140 :
141 : /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
142 0 : pthread_mutex_lock(&g_mem_reg_map->mutex);
143 :
144 0 : for (idx_256tb = 0;
145 0 : idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
146 0 : idx_256tb++) {
147 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
148 :
149 0 : if (!map_1gb) {
150 0 : if (contig_start != UINT64_MAX) {
151 : /* End of a virtually contiguous range */
152 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
153 0 : (void *)contig_start,
154 0 : contig_end - contig_start + VALUE_2MB);
155 : /* Don't bother handling unregister failures. It can't be any worse */
156 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
157 0 : goto err_unregister;
158 : }
159 0 : }
160 0 : contig_start = UINT64_MAX;
161 0 : continue;
162 : }
163 :
164 0 : for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
165 0 : if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
166 0 : (contig_start == UINT64_MAX ||
167 0 : (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
168 : /* Rebuild the virtual address from the indexes */
169 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
170 :
171 0 : if (contig_start == UINT64_MAX) {
172 0 : contig_start = vaddr;
173 0 : }
174 :
175 0 : contig_end = vaddr;
176 0 : } else {
177 0 : if (contig_start != UINT64_MAX) {
178 : /* End of a virtually contiguous range */
179 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
180 0 : (void *)contig_start,
181 0 : contig_end - contig_start + VALUE_2MB);
182 : /* Don't bother handling unregister failures. It can't be any worse */
183 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
184 0 : goto err_unregister;
185 : }
186 :
187 : /* This page might be part of a neighbouring region, so process
188 : * it again. idx_1gb will be incremented immediately by the loop.
189 : */
190 0 : idx_1gb--;
191 0 : }
192 0 : contig_start = UINT64_MAX;
193 : }
194 0 : }
195 0 : }
196 :
197 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
198 0 : return 0;
199 :
200 : err_unregister:
201 : /* Unwind to the first empty translation so we don't unregister
202 : * a region that just failed to register.
203 : */
204 0 : idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
205 0 : idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
206 0 : contig_start = UINT64_MAX;
207 0 : contig_end = UINT64_MAX;
208 :
209 : /* Unregister any memory we managed to register before the failure */
210 0 : for (; idx_256tb < SIZE_MAX; idx_256tb--) {
211 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
212 :
213 0 : if (!map_1gb) {
214 0 : if (contig_end != UINT64_MAX) {
215 : /* End of a virtually contiguous range */
216 0 : map->ops.notify_cb(map->cb_ctx, map,
217 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
218 0 : (void *)contig_start,
219 0 : contig_end - contig_start + VALUE_2MB);
220 0 : }
221 0 : contig_end = UINT64_MAX;
222 0 : continue;
223 : }
224 :
225 0 : for (; idx_1gb < UINT64_MAX; idx_1gb--) {
226 : /* Rebuild the virtual address from the indexes */
227 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
228 0 : if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
229 0 : (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
230 :
231 0 : if (contig_end == UINT64_MAX) {
232 0 : contig_end = vaddr;
233 0 : }
234 0 : contig_start = vaddr;
235 0 : } else {
236 0 : if (contig_end != UINT64_MAX) {
237 0 : if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
238 0 : contig_start = vaddr;
239 0 : }
240 : /* End of a virtually contiguous range */
241 0 : map->ops.notify_cb(map->cb_ctx, map,
242 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
243 0 : (void *)contig_start,
244 0 : contig_end - contig_start + VALUE_2MB);
245 0 : }
246 0 : contig_end = UINT64_MAX;
247 : }
248 0 : }
249 0 : idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
250 0 : }
251 :
252 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
253 0 : return rc;
254 0 : }
255 :
256 : struct spdk_mem_map *
257 0 : spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
258 : {
259 : struct spdk_mem_map *map;
260 : int rc;
261 : size_t i;
262 :
263 0 : map = calloc(1, sizeof(*map));
264 0 : if (map == NULL) {
265 0 : return NULL;
266 : }
267 :
268 0 : if (pthread_mutex_init(&map->mutex, NULL)) {
269 0 : free(map);
270 0 : return NULL;
271 : }
272 :
273 0 : map->default_translation = default_translation;
274 0 : map->cb_ctx = cb_ctx;
275 0 : if (ops) {
276 0 : map->ops = *ops;
277 0 : }
278 :
279 0 : if (ops && ops->notify_cb) {
280 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
281 0 : rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
282 0 : if (rc != 0) {
283 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
284 0 : DEBUG_PRINT("Initial mem_map notify failed\n");
285 0 : pthread_mutex_destroy(&map->mutex);
286 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
287 0 : free(map->map_256tb.map[i]);
288 0 : }
289 0 : free(map);
290 0 : return NULL;
291 : }
292 0 : TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
293 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
294 0 : }
295 :
296 0 : return map;
297 0 : }
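
/* Usage sketch (illustrative, not part of this file): a consumer allocates a map with a
 * notify callback; existing registrations are replayed through mem_map_notify_walk()
 * before the map is added to g_spdk_mem_maps. The names my_notify and my_ops are
 * hypothetical:
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           // react to the REGISTER/UNREGISTER of a virtually contiguous region
 *           return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = { .notify_cb = my_notify };
 *
 *   struct spdk_mem_map *m = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &my_ops, NULL);
 *   ...
 *   spdk_mem_map_free(&m);
 */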
298 :
299 : void
300 0 : spdk_mem_map_free(struct spdk_mem_map **pmap)
301 : {
302 : struct spdk_mem_map *map;
303 : size_t i;
304 :
305 0 : if (!pmap) {
306 0 : return;
307 : }
308 :
309 0 : map = *pmap;
310 :
311 0 : if (!map) {
312 0 : return;
313 : }
314 :
315 0 : if (map->ops.notify_cb) {
316 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
317 0 : mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
318 0 : TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
319 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
320 0 : }
321 :
322 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
323 0 : free(map->map_256tb.map[i]);
324 0 : }
325 :
326 0 : pthread_mutex_destroy(&map->mutex);
327 :
328 0 : free(map);
329 0 : *pmap = NULL;
330 0 : }
331 :
332 : int
333 0 : spdk_mem_register(void *_vaddr, size_t len)
334 : {
335 : struct spdk_mem_map *map;
336 : int rc;
337 0 : uint64_t vaddr = (uintptr_t)_vaddr;
338 : uint64_t seg_vaddr;
339 : size_t seg_len;
340 : uint64_t reg;
341 :
342 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
343 0 : DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
344 0 : return -EINVAL;
345 : }
346 :
347 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
348 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
349 : __func__, vaddr, len);
350 0 : return -EINVAL;
351 : }
352 :
353 0 : if (len == 0) {
354 0 : return 0;
355 : }
356 :
357 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
358 :
359 0 : seg_vaddr = vaddr;
360 0 : seg_len = len;
361 0 : while (seg_len > 0) {
362 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
363 0 : if (reg & REG_MAP_REGISTERED) {
364 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
365 0 : return -EBUSY;
366 : }
367 0 : seg_vaddr += VALUE_2MB;
368 0 : seg_len -= VALUE_2MB;
369 : }
370 :
371 0 : seg_vaddr = vaddr;
372 0 : seg_len = 0;
373 0 : while (len > 0) {
374 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
375 0 : seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
376 0 : seg_len += VALUE_2MB;
377 0 : vaddr += VALUE_2MB;
378 0 : len -= VALUE_2MB;
379 : }
380 :
381 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
382 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER,
383 0 : (void *)seg_vaddr, seg_len);
384 0 : if (rc != 0) {
385 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
386 0 : return rc;
387 : }
388 0 : }
389 :
390 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
391 0 : return 0;
392 0 : }
393 :
394 : int
395 0 : spdk_mem_unregister(void *_vaddr, size_t len)
396 : {
397 : struct spdk_mem_map *map;
398 : int rc;
399 0 : uint64_t vaddr = (uintptr_t)_vaddr;
400 : uint64_t seg_vaddr;
401 : size_t seg_len;
402 : uint64_t reg, newreg;
403 :
404 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
405 0 : DEBUG_PRINT("invalid usermode virtual address %jx\n", vaddr);
406 0 : return -EINVAL;
407 : }
408 :
409 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
410 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%jx len=%ju\n",
411 : __func__, vaddr, len);
412 0 : return -EINVAL;
413 : }
414 :
415 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
416 :
417 : /* The first page must be the start of a region. Also check that it's
418 : * registered, so that we don't return -ERANGE for regions that were
419 : * never registered at all.
420 : */
421 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
422 0 : if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
423 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
424 0 : return -ERANGE;
425 : }
426 :
427 0 : seg_vaddr = vaddr;
428 0 : seg_len = len;
429 0 : while (seg_len > 0) {
430 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
431 0 : if ((reg & REG_MAP_REGISTERED) == 0) {
432 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
433 0 : return -EINVAL;
434 : }
435 0 : seg_vaddr += VALUE_2MB;
436 0 : seg_len -= VALUE_2MB;
437 : }
438 :
439 0 : newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
440 : /* If the next page is registered, it must be the start of a region as well,
441 : * otherwise we'd be unregistering only part of a region.
442 : */
443 0 : if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
444 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
445 0 : return -ERANGE;
446 : }
447 0 : seg_vaddr = vaddr;
448 0 : seg_len = 0;
449 :
450 0 : while (len > 0) {
451 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
452 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
453 :
454 0 : if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
455 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
456 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
457 0 : (void *)seg_vaddr, seg_len);
458 0 : if (rc != 0) {
459 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
460 0 : return rc;
461 : }
462 0 : }
463 :
464 0 : seg_vaddr = vaddr;
465 0 : seg_len = VALUE_2MB;
466 0 : } else {
467 0 : seg_len += VALUE_2MB;
468 : }
469 :
470 0 : vaddr += VALUE_2MB;
471 0 : len -= VALUE_2MB;
472 : }
473 :
474 0 : if (seg_len > 0) {
475 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
476 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER,
477 0 : (void *)seg_vaddr, seg_len);
478 0 : if (rc != 0) {
479 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
480 0 : return rc;
481 : }
482 0 : }
483 0 : }
484 :
485 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
486 0 : return 0;
487 0 : }
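
/* Usage sketch (illustrative, not part of this file): both the address and the length
 * must be 2 MB aligned, and an unregister must cover exactly one or more previously
 * registered regions. "buf" is a hypothetical 2 MB-aligned allocation:
 *
 *   if (spdk_mem_register(buf, 4 * VALUE_2MB) == 0) {
 *           // ... memory is now visible to every registered mem map ...
 *           spdk_mem_unregister(buf, 4 * VALUE_2MB);
 *   }
 */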
488 :
489 : int
490 0 : spdk_mem_reserve(void *vaddr, size_t len)
491 : {
492 : struct spdk_mem_map *map;
493 : void *seg_vaddr;
494 : size_t seg_len;
495 : uint64_t reg;
496 :
497 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
498 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
499 0 : return -EINVAL;
500 : }
501 :
502 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
503 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
504 : __func__, vaddr, len);
505 0 : return -EINVAL;
506 : }
507 :
508 0 : if (len == 0) {
509 0 : return 0;
510 : }
511 :
512 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
513 :
514 : /* Check if any part of this range is already registered */
515 0 : seg_vaddr = vaddr;
516 0 : seg_len = len;
517 0 : while (seg_len > 0) {
518 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
519 0 : if (reg & REG_MAP_REGISTERED) {
520 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
521 0 : return -EBUSY;
522 : }
523 0 : seg_vaddr += VALUE_2MB;
524 0 : seg_len -= VALUE_2MB;
525 : }
526 :
527 : /* Simply set the translation to the memory map's default. This allocates the space in the
528 : * map but does not provide a valid translation. */
529 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
530 0 : g_mem_reg_map->default_translation);
531 :
532 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
533 0 : spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
534 0 : }
535 :
536 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
537 0 : return 0;
538 0 : }
539 :
540 : static struct map_1gb *
541 0 : mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
542 : {
543 : struct map_1gb *map_1gb;
544 0 : uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
545 : size_t i;
546 :
547 0 : if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
548 0 : return NULL;
549 : }
550 :
551 0 : map_1gb = map->map_256tb.map[idx_256tb];
552 :
553 0 : if (!map_1gb) {
554 0 : pthread_mutex_lock(&map->mutex);
555 :
556 : /* Recheck to make sure nobody else got the mutex first. */
557 0 : map_1gb = map->map_256tb.map[idx_256tb];
558 0 : if (!map_1gb) {
559 0 : map_1gb = malloc(sizeof(struct map_1gb));
560 0 : if (map_1gb) {
561 : /* initialize all entries to default translation */
562 0 : for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
563 0 : map_1gb->map[i].translation_2mb = map->default_translation;
564 0 : }
565 0 : map->map_256tb.map[idx_256tb] = map_1gb;
566 0 : }
567 0 : }
568 :
569 0 : pthread_mutex_unlock(&map->mutex);
570 :
571 0 : if (!map_1gb) {
572 0 : DEBUG_PRINT("allocation failed\n");
573 0 : return NULL;
574 : }
575 0 : }
576 :
577 0 : return map_1gb;
578 0 : }
579 :
580 : int
581 0 : spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
582 : uint64_t translation)
583 : {
584 : uint64_t vfn_2mb;
585 : struct map_1gb *map_1gb;
586 : uint64_t idx_1gb;
587 : struct map_2mb *map_2mb;
588 :
589 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
590 0 : DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
591 0 : return -EINVAL;
592 : }
593 :
594 : /* For now, only 2 MB-aligned registrations are supported */
595 0 : if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
596 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
597 : __func__, vaddr, size);
598 0 : return -EINVAL;
599 : }
600 :
601 0 : vfn_2mb = vaddr >> SHIFT_2MB;
602 :
603 0 : while (size) {
604 0 : map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
605 0 : if (!map_1gb) {
606 0 : DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
607 0 : return -ENOMEM;
608 : }
609 :
610 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
611 0 : map_2mb = &map_1gb->map[idx_1gb];
612 0 : map_2mb->translation_2mb = translation;
613 :
614 0 : size -= VALUE_2MB;
615 0 : vfn_2mb++;
616 : }
617 :
618 0 : return 0;
619 0 : }
620 :
621 : int
622 0 : spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
623 : {
624 0 : return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
625 : }
626 :
627 : inline uint64_t
628 0 : spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
629 : {
630 : const struct map_1gb *map_1gb;
631 : const struct map_2mb *map_2mb;
632 : uint64_t idx_256tb;
633 : uint64_t idx_1gb;
634 : uint64_t vfn_2mb;
635 : uint64_t cur_size;
636 : uint64_t prev_translation;
637 : uint64_t orig_translation;
638 :
639 0 : if (spdk_unlikely(vaddr & ~MASK_256TB)) {
640 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
641 0 : return map->default_translation;
642 : }
643 :
644 0 : vfn_2mb = vaddr >> SHIFT_2MB;
645 0 : idx_256tb = MAP_256TB_IDX(vfn_2mb);
646 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
647 :
648 0 : map_1gb = map->map_256tb.map[idx_256tb];
649 0 : if (spdk_unlikely(!map_1gb)) {
650 0 : return map->default_translation;
651 : }
652 :
653 0 : cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
654 0 : map_2mb = &map_1gb->map[idx_1gb];
655 0 : if (size == NULL || map->ops.are_contiguous == NULL ||
656 0 : map_2mb->translation_2mb == map->default_translation) {
657 0 : if (size != NULL) {
658 0 : *size = spdk_min(*size, cur_size);
659 0 : }
660 0 : return map_2mb->translation_2mb;
661 : }
662 :
663 0 : orig_translation = map_2mb->translation_2mb;
664 0 : prev_translation = orig_translation;
665 0 : while (cur_size < *size) {
666 0 : vfn_2mb++;
667 0 : idx_256tb = MAP_256TB_IDX(vfn_2mb);
668 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
669 :
670 0 : map_1gb = map->map_256tb.map[idx_256tb];
671 0 : if (spdk_unlikely(!map_1gb)) {
672 0 : break;
673 : }
674 :
675 0 : map_2mb = &map_1gb->map[idx_1gb];
676 0 : if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
677 0 : break;
678 : }
679 :
680 0 : cur_size += VALUE_2MB;
681 0 : prev_translation = map_2mb->translation_2mb;
682 : }
683 :
684 0 : *size = spdk_min(*size, cur_size);
685 0 : return orig_translation;
686 0 : }
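
/* Usage sketch (illustrative, not part of this file): when a size is passed in, it is an
 * in/out parameter - on return it is clamped to the length of the contiguous run (as
 * judged by ops.are_contiguous) starting at vaddr. "map" and "vaddr" are hypothetical:
 *
 *   uint64_t size = 8 * VALUE_2MB;
 *   uint64_t translation = spdk_mem_map_translate(map, vaddr, &size);
 *   // "size" now holds how many of the requested bytes this translation covers
 */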
687 :
688 : static void
689 0 : memory_hotplug_cb(enum rte_mem_event event_type,
690 : const void *addr, size_t len, void *arg)
691 : {
692 0 : if (event_type == RTE_MEM_EVENT_ALLOC) {
693 0 : spdk_mem_register((void *)addr, len);
694 :
695 0 : if (!spdk_env_dpdk_external_init()) {
696 0 : return;
697 : }
698 :
699 : /* When the user initialized DPDK separately, we can't
700 : * be sure that the --match-allocations RTE flag was specified.
701 : * Without this flag, DPDK can free memory in different units
702 : * than it was allocated in, which doesn't work with things like RDMA MRs.
703 : *
704 : * For such cases, we mark segments so they aren't freed.
705 : */
706 0 : while (len > 0) {
707 : struct rte_memseg *seg;
708 :
709 0 : seg = rte_mem_virt2memseg(addr, NULL);
710 0 : assert(seg != NULL);
711 0 : seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
712 0 : addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
713 0 : len -= seg->hugepage_sz;
714 : }
715 0 : } else if (event_type == RTE_MEM_EVENT_FREE) {
716 0 : spdk_mem_unregister((void *)addr, len);
717 0 : }
718 0 : }
719 :
720 : static int
721 0 : memory_iter_cb(const struct rte_memseg_list *msl,
722 : const struct rte_memseg *ms, size_t len, void *arg)
723 : {
724 0 : return spdk_mem_register(ms->addr, len);
725 : }
726 :
727 : int
728 0 : mem_map_init(bool legacy_mem)
729 : {
730 : int rc;
731 :
732 0 : g_legacy_mem = legacy_mem;
733 :
734 0 : g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
735 0 : if (g_mem_reg_map == NULL) {
736 0 : DEBUG_PRINT("memory registration map allocation failed\n");
737 0 : return -ENOMEM;
738 : }
739 :
740 0 : if (!g_huge_pages) {
741 0 : return 0;
742 : }
743 :
744 0 : rc = rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
745 0 : if (rc != 0) {
746 0 : DEBUG_PRINT("memory event callback registration failed, rc = %d\n", rc);
747 0 : goto err_free_reg_map;
748 : }
749 0 : g_mem_event_cb_registered = true;
750 :
751 : /*
752 : * Walk all DPDK memory segments and register them
753 : * with the main memory map
754 : */
755 0 : rc = rte_memseg_contig_walk(memory_iter_cb, NULL);
756 0 : if (rc != 0) {
757 0 : DEBUG_PRINT("memory segments walking failed, rc = %d\n", rc);
758 0 : goto err_unregister_mem_cb;
759 : }
760 :
761 0 : return 0;
762 :
763 : err_unregister_mem_cb:
764 0 : g_mem_event_cb_registered = false;
765 0 : rte_mem_event_callback_unregister("spdk", NULL);
766 : err_free_reg_map:
767 0 : spdk_mem_map_free(&g_mem_reg_map);
768 0 : return rc;
769 0 : }
770 :
771 : void
772 0 : mem_map_fini(void)
773 : {
774 0 : if (g_mem_event_cb_registered) {
775 0 : g_mem_event_cb_registered = false;
776 0 : rte_mem_event_callback_unregister("spdk", NULL);
777 0 : }
778 0 : spdk_mem_map_free(&g_mem_reg_map);
779 0 : }
780 :
781 : bool
782 0 : spdk_iommu_is_enabled(void)
783 : {
784 : #if VFIO_ENABLED
785 : return g_vfio.enabled && !g_vfio.noiommu_enabled;
786 : #else
787 0 : return false;
788 : #endif
789 : }
790 :
791 : struct spdk_vtophys_pci_device {
792 : struct rte_pci_device *pci_device;
793 : TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
794 : };
795 :
796 : static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
797 : static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
798 : TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
799 :
800 : static struct spdk_mem_map *g_vtophys_map = NULL;
801 : static struct spdk_mem_map *g_phys_ref_map = NULL;
802 : static struct spdk_mem_map *g_numa_map = NULL;
803 :
804 : #if VFIO_ENABLED
805 : static int
806 : _vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
807 : {
808 : struct spdk_vfio_dma_map *dma_map;
809 : int ret;
810 :
811 : dma_map = calloc(1, sizeof(*dma_map));
812 : if (dma_map == NULL) {
813 : return -ENOMEM;
814 : }
815 :
816 : dma_map->map.argsz = sizeof(dma_map->map);
817 : dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
818 : dma_map->map.vaddr = vaddr;
819 : dma_map->map.iova = iova;
820 : dma_map->map.size = size;
821 :
822 : if (g_vfio.device_ref == 0) {
823 : /* VFIO requires at least one device (IOMMU group) to be added to
824 : * a VFIO container before it is possible to perform any IOMMU
825 : * operations on that container. This memory will be mapped once
826 : * the first device (IOMMU group) is hotplugged.
827 : *
828 : * Since the vfio container is managed internally by DPDK, it is
829 : * also possible that some device is already in that container, but
830 : * it's not managed by SPDK - e.g. a NIC used internally
831 : * by DPDK. We could map the memory straight away in such a
832 : * scenario, but there's no need to do it. DPDK devices clearly
833 : * don't need our mappings and hence we defer the mapping
834 : * unconditionally until the first SPDK-managed device is
835 : * hotplugged.
836 : */
837 : goto out_insert;
838 : }
839 :
840 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
841 : if (ret) {
842 : /* In some cases the vfio container has no IOMMU group; it's safe to ignore the error then */
843 : SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
844 : }
845 :
846 : out_insert:
847 : TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
848 : return 0;
849 : }
850 :
851 :
852 : static int
853 : vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
854 : {
855 : uint64_t refcount;
856 : int ret;
857 :
858 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
859 : assert(refcount < UINT64_MAX);
860 : if (refcount > 0) {
861 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
862 : return 0;
863 : }
864 :
865 : pthread_mutex_lock(&g_vfio.mutex);
866 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
867 : pthread_mutex_unlock(&g_vfio.mutex);
868 : if (ret) {
869 : return ret;
870 : }
871 :
872 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
873 : return 0;
874 : }
875 :
876 : int
877 : vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
878 : {
879 : int ret;
880 :
881 : pthread_mutex_lock(&g_vfio.mutex);
882 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
883 : pthread_mutex_unlock(&g_vfio.mutex);
884 :
885 : return ret;
886 : }
887 :
888 : static int
889 : _vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
890 : {
891 : struct vfio_iommu_type1_dma_unmap unmap = {};
892 : int ret;
893 :
894 : if (g_vfio.device_ref == 0) {
895 : /* Memory is not mapped anymore, just remove its references */
896 : goto out_remove;
897 : }
898 :
899 : unmap.argsz = sizeof(unmap);
900 : unmap.flags = 0;
901 : unmap.iova = dma_map->map.iova;
902 : unmap.size = dma_map->map.size;
903 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
904 : if (ret) {
905 : SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
906 : }
907 :
908 : out_remove:
909 : TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
910 : free(dma_map);
911 : return 0;
912 : }
913 :
914 : static int
915 : vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
916 : {
917 : struct spdk_vfio_dma_map *dma_map;
918 : uint64_t refcount;
919 : int ret;
920 :
921 : pthread_mutex_lock(&g_vfio.mutex);
922 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
923 : if (dma_map->map.iova == iova) {
924 : break;
925 : }
926 : }
927 :
928 : if (dma_map == NULL) {
929 : DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
930 : pthread_mutex_unlock(&g_vfio.mutex);
931 : return -ENXIO;
932 : }
933 :
934 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
935 : assert(refcount < UINT64_MAX);
936 : if (refcount > 0) {
937 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
938 : }
939 :
940 : /* We still have outstanding references, don't clear it. */
941 : if (refcount > 1) {
942 : pthread_mutex_unlock(&g_vfio.mutex);
943 : return 0;
944 : }
945 :
946 : /* Partial or multiple-page unmap is not supported for now */
947 : assert(dma_map->map.size == size);
948 :
949 : ret = _vfio_iommu_unmap_dma(dma_map);
950 : pthread_mutex_unlock(&g_vfio.mutex);
951 :
952 : return ret;
953 : }
954 :
955 : int
956 : vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
957 : {
958 : struct spdk_vfio_dma_map *dma_map;
959 : int ret;
960 :
961 : pthread_mutex_lock(&g_vfio.mutex);
962 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
963 : if (dma_map->map.vaddr == vaddr) {
964 : break;
965 : }
966 : }
967 :
968 : if (dma_map == NULL) {
969 : DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
970 : pthread_mutex_unlock(&g_vfio.mutex);
971 : return -ENXIO;
972 : }
973 :
974 : ret = _vfio_iommu_unmap_dma(dma_map);
975 : pthread_mutex_unlock(&g_vfio.mutex);
976 : return ret;
977 : }
978 : #endif
979 :
980 : static uint64_t
981 0 : vtophys_get_paddr_memseg(uint64_t vaddr)
982 : {
983 : uintptr_t paddr;
984 : struct rte_memseg *seg;
985 :
986 0 : seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
987 0 : if (seg != NULL) {
988 0 : paddr = seg->iova;
989 0 : if (paddr == RTE_BAD_IOVA) {
990 0 : return SPDK_VTOPHYS_ERROR;
991 : }
992 0 : paddr += (vaddr - (uintptr_t)seg->addr);
993 0 : return paddr;
994 : }
995 :
996 0 : return SPDK_VTOPHYS_ERROR;
997 0 : }
998 :
999 : /* Try to get the paddr from /proc/self/pagemap */
1000 : static uint64_t
1001 0 : vtophys_get_paddr_pagemap(uint64_t vaddr)
1002 : {
1003 : uintptr_t paddr;
1004 :
1005 : /* Silence static analyzers */
1006 0 : assert(vaddr != 0);
1007 0 : paddr = rte_mem_virt2iova((void *)vaddr);
1008 0 : if (paddr == RTE_BAD_IOVA) {
1009 : /*
1010 : * The vaddr may be valid but doesn't have a backing page
1011 : * assigned yet. Touch the page to ensure a backing page
1012 : * gets assigned, then try to translate again.
1013 : */
1014 0 : rte_atomic64_read((rte_atomic64_t *)vaddr);
1015 0 : paddr = rte_mem_virt2iova((void *)vaddr);
1016 0 : }
1017 0 : if (paddr == RTE_BAD_IOVA) {
1018 : /* Unable to get to the physical address. */
1019 0 : return SPDK_VTOPHYS_ERROR;
1020 : }
1021 :
1022 0 : return paddr;
1023 0 : }
1024 :
1025 : static uint64_t
1026 0 : pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
1027 : {
1028 : struct rte_mem_resource *res;
1029 : uint64_t paddr;
1030 : unsigned r;
1031 :
1032 0 : for (r = 0; r < PCI_MAX_RESOURCE; r++) {
1033 0 : res = dpdk_pci_device_get_mem_resource(dev, r);
1034 :
1035 0 : if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
1036 0 : (vaddr + len) >= (uint64_t)res->addr + res->len) {
1037 0 : continue;
1038 : }
1039 :
1040 : #if VFIO_ENABLED
1041 : if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1042 : /*
1043 : * The IOMMU is on and we're using IOVA == VA. The BAR was
1044 : * automatically registered when it was mapped, so just return
1045 : * the virtual address here.
1046 : */
1047 : return vaddr;
1048 : }
1049 : #endif
1050 0 : paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1051 0 : return paddr;
1052 : }
1053 :
1054 0 : return SPDK_VTOPHYS_ERROR;
1055 0 : }
1056 :
1057 : /* Try to get the paddr from pci devices */
1058 : static uint64_t
1059 0 : vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1060 : {
1061 : struct spdk_vtophys_pci_device *vtophys_dev;
1062 : uintptr_t paddr;
1063 : struct rte_pci_device *dev;
1064 :
1065 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1066 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1067 0 : dev = vtophys_dev->pci_device;
1068 0 : paddr = pci_device_vtophys(dev, vaddr, len);
1069 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1070 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1071 0 : return paddr;
1072 : }
1073 0 : }
1074 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1075 :
1076 0 : return SPDK_VTOPHYS_ERROR;
1077 0 : }
1078 :
1079 : static int
1080 0 : vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1081 : enum spdk_mem_map_notify_action action,
1082 : void *vaddr, size_t len)
1083 : {
1084 0 : int rc = 0;
1085 : uint64_t paddr;
1086 :
1087 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
1088 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1089 0 : return -EINVAL;
1090 : }
1091 :
1092 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1093 0 : DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
1094 : vaddr, len);
1095 0 : return -EINVAL;
1096 : }
1097 :
1098 : /* Get the physical address from the DPDK memsegs */
1099 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1100 :
1101 0 : switch (action) {
1102 : case SPDK_MEM_MAP_NOTIFY_REGISTER:
1103 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1104 : /* This is not an address that DPDK is managing. */
1105 :
1106 : /* Check if this is a PCI BAR. They need special handling */
1107 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1108 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1109 : /* Get paddr for each 2MB chunk in this address range */
1110 0 : while (len > 0) {
1111 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1112 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1113 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1114 0 : return -EFAULT;
1115 : }
1116 :
1117 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1118 0 : if (rc != 0) {
1119 0 : return rc;
1120 : }
1121 :
1122 0 : vaddr += VALUE_2MB;
1123 0 : len -= VALUE_2MB;
1124 : }
1125 :
1126 0 : return 0;
1127 : }
1128 :
1129 : #if VFIO_ENABLED
1130 : enum rte_iova_mode iova_mode;
1131 :
1132 : iova_mode = rte_eal_iova_mode();
1133 :
1134 : if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1135 : /* We'll use the virtual address as the iova to match DPDK. */
1136 : paddr = (uint64_t)vaddr;
1137 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1138 : if (rc) {
1139 : return -EFAULT;
1140 : }
1141 : while (len > 0) {
1142 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1143 : if (rc != 0) {
1144 : return rc;
1145 : }
1146 : vaddr += VALUE_2MB;
1147 : paddr += VALUE_2MB;
1148 : len -= VALUE_2MB;
1149 : }
1150 : } else
1151 : #endif
1152 : {
1153 : /* Get the physical address from /proc/self/pagemap. */
1154 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1155 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1156 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1157 0 : return -EFAULT;
1158 : }
1159 :
1160 : /* Get paddr for each 2MB chunk in this address range */
1161 0 : while (len > 0) {
1162 : /* Get the physical address from /proc/self/pagemap. */
1163 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1164 :
1165 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1166 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1167 0 : return -EFAULT;
1168 : }
1169 :
1170 0 : if (paddr & MASK_2MB) {
1171 0 : DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1172 0 : return -EINVAL;
1173 : }
1174 : #if VFIO_ENABLED
1175 : /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
1176 : * with the IOMMU using the physical address to match. */
1177 : if (spdk_iommu_is_enabled()) {
1178 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1179 : if (rc) {
1180 : DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1181 : return -EFAULT;
1182 : }
1183 : }
1184 : #endif
1185 :
1186 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1187 0 : if (rc != 0) {
1188 0 : return rc;
1189 : }
1190 :
1191 0 : vaddr += VALUE_2MB;
1192 0 : len -= VALUE_2MB;
1193 : }
1194 : }
1195 0 : } else {
1196 : /* This is an address managed by DPDK. Just set up the translations. */
1197 0 : while (len > 0) {
1198 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1199 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1200 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1201 0 : return -EFAULT;
1202 : }
1203 :
1204 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1205 0 : if (rc != 0) {
1206 0 : return rc;
1207 : }
1208 :
1209 0 : vaddr += VALUE_2MB;
1210 0 : len -= VALUE_2MB;
1211 : }
1212 : }
1213 :
1214 0 : break;
1215 : case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1216 : #if VFIO_ENABLED
1217 : if (paddr == SPDK_VTOPHYS_ERROR) {
1218 : /*
1219 : * This is not an address that DPDK is managing.
1220 : */
1221 :
1222 : /* Check if this is a PCI BAR. They need special handling */
1223 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1224 : if (paddr != SPDK_VTOPHYS_ERROR) {
1225 : /* Get paddr for each 2MB chunk in this address range */
1226 : while (len > 0) {
1227 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1228 : if (paddr == SPDK_VTOPHYS_ERROR) {
1229 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1230 : return -EFAULT;
1231 : }
1232 :
1233 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1234 : if (rc != 0) {
1235 : return rc;
1236 : }
1237 :
1238 : vaddr += VALUE_2MB;
1239 : len -= VALUE_2MB;
1240 : }
1241 :
1242 : return 0;
1243 : }
1244 :
1245 : /* If vfio is enabled,
1246 : * we need to unmap the range from the IOMMU
1247 : */
1248 : if (spdk_iommu_is_enabled()) {
1249 : uint64_t buffer_len = len;
1250 : uint8_t *va = vaddr;
1251 : enum rte_iova_mode iova_mode;
1252 :
1253 : iova_mode = rte_eal_iova_mode();
1254 : /*
1255 : * In virtual address mode, the region is contiguous and can be
1256 : * unmapped in a single call.
1257 : */
1258 : if (iova_mode == RTE_IOVA_VA) {
1259 : paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1260 : if (buffer_len != len || paddr != (uintptr_t)va) {
1261 : DEBUG_PRINT("Unmapping %p with length %lu failed because "
1262 : "translation had address 0x%" PRIx64 " and length %lu\n",
1263 : va, len, paddr, buffer_len);
1264 : return -EINVAL;
1265 : }
1266 : rc = vtophys_iommu_unmap_dma(paddr, len);
1267 : if (rc) {
1268 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1269 : return -EFAULT;
1270 : }
1271 : } else if (iova_mode == RTE_IOVA_PA) {
1272 : /* Get paddr for each 2MB chunk in this address range */
1273 : while (buffer_len > 0) {
1274 : paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1275 :
1276 : if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1277 : DEBUG_PRINT("could not get phys addr for %p\n", va);
1278 : return -EFAULT;
1279 : }
1280 :
1281 : rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1282 : if (rc) {
1283 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1284 : return -EFAULT;
1285 : }
1286 :
1287 : va += VALUE_2MB;
1288 : buffer_len -= VALUE_2MB;
1289 : }
1290 : }
1291 : }
1292 : }
1293 : #endif
1294 0 : while (len > 0) {
1295 0 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1296 0 : if (rc != 0) {
1297 0 : return rc;
1298 : }
1299 :
1300 0 : vaddr += VALUE_2MB;
1301 0 : len -= VALUE_2MB;
1302 : }
1303 :
1304 0 : break;
1305 : default:
1306 0 : SPDK_UNREACHABLE();
1307 : }
1308 :
1309 0 : return rc;
1310 0 : }
1311 :
1312 : static int
1313 0 : numa_notify(void *cb_ctx, struct spdk_mem_map *map,
1314 : enum spdk_mem_map_notify_action action,
1315 : void *vaddr, size_t len)
1316 : {
1317 : struct rte_memseg *seg;
1318 :
1319 : /* We always return 0 from here, even if we aren't able to get a
1320 : * memseg for the address. This can happen in non-DPDK memory
1321 : * registration paths, for example vhost or vfio-user. That is OK,
1322 : * spdk_mem_get_numa_id() just returns SPDK_ENV_NUMA_ID_ANY for
1323 : * that kind of memory. If we return an error here, the
1324 : * spdk_mem_register() from vhost or vfio-user would fail, which is
1325 : * not what we want.
1326 : */
1327 0 : seg = rte_mem_virt2memseg(vaddr, NULL);
1328 0 : if (seg == NULL) {
1329 0 : return 0;
1330 : }
1331 :
1332 0 : switch (action) {
1333 : case SPDK_MEM_MAP_NOTIFY_REGISTER:
1334 0 : spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, seg->socket_id);
1335 0 : break;
1336 : case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1337 0 : spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
1338 0 : break;
1339 : default:
1340 0 : break;
1341 : }
1342 :
1343 0 : return 0;
1344 0 : }
1345 :
1346 : static int
1347 0 : vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1348 : {
1349 : /* This function is always called with paddrs for two subsequent
1350 : * 2MB chunks in virtual address space, so those chunks will only be
1351 : * physically contiguous if the physical addresses are 2MB apart
1352 : * from each other as well.
1353 : */
1354 0 : return (paddr2 - paddr1 == VALUE_2MB);
1355 : }
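
/* Worked example (illustrative): paddr1 == 0x100000000 and paddr2 == 0x100200000 differ
 * by exactly VALUE_2MB (0x200000), so the two neighbouring 2 MB virtual chunks form one
 * physically contiguous 4 MB run; any other difference ends the run.
 */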
1356 :
1357 : #if VFIO_ENABLED
1358 :
1359 : static bool
1360 : vfio_enabled(void)
1361 : {
1362 : return rte_vfio_is_enabled("vfio_pci");
1363 : }
1364 :
1365 : /* Check if IOMMU is enabled on the system */
1366 : static bool
1367 : has_iommu_groups(void)
1368 : {
1369 : int count = 0;
1370 : DIR *dir = opendir("/sys/kernel/iommu_groups");
1371 :
1372 : if (dir == NULL) {
1373 : return false;
1374 : }
1375 :
1376 : while (count < 3 && readdir(dir) != NULL) {
1377 : count++;
1378 : }
1379 :
1380 : closedir(dir);
1381 : /* there will always be ./ and ../ entries */
1382 : return count > 2;
1383 : }
1384 :
1385 : static bool
1386 : vfio_noiommu_enabled(void)
1387 : {
1388 : return rte_vfio_noiommu_is_enabled();
1389 : }
1390 :
1391 : static void
1392 : vtophys_iommu_init(void)
1393 : {
1394 : char proc_fd_path[PATH_MAX + 1];
1395 : char link_path[PATH_MAX + 1];
1396 : const char vfio_path[] = "/dev/vfio/vfio";
1397 : DIR *dir;
1398 : struct dirent *d;
1399 :
1400 : if (!vfio_enabled()) {
1401 : return;
1402 : }
1403 :
1404 : if (vfio_noiommu_enabled()) {
1405 : g_vfio.noiommu_enabled = true;
1406 : } else if (!has_iommu_groups()) {
1407 : return;
1408 : }
1409 :
1410 : dir = opendir("/proc/self/fd");
1411 : if (!dir) {
1412 : DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1413 : return;
1414 : }
1415 :
1416 : while ((d = readdir(dir)) != NULL) {
1417 : if (d->d_type != DT_LNK) {
1418 : continue;
1419 : }
1420 :
1421 : snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1422 : if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1423 : continue;
1424 : }
1425 :
1426 : if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1427 : sscanf(d->d_name, "%d", &g_vfio.fd);
1428 : break;
1429 : }
1430 : }
1431 :
1432 : closedir(dir);
1433 :
1434 : if (g_vfio.fd < 0) {
1435 : DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1436 : return;
1437 : }
1438 :
1439 : g_vfio.enabled = true;
1440 :
1441 : return;
1442 : }
1443 :
1444 : #endif
1445 :
1446 : void
1447 0 : vtophys_pci_device_added(struct rte_pci_device *pci_device)
1448 : {
1449 : struct spdk_vtophys_pci_device *vtophys_dev;
1450 :
1451 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1452 :
1453 0 : vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1454 0 : if (vtophys_dev) {
1455 0 : vtophys_dev->pci_device = pci_device;
1456 0 : TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1457 0 : } else {
1458 0 : DEBUG_PRINT("Memory allocation error\n");
1459 : }
1460 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1461 :
1462 : #if VFIO_ENABLED
1463 : struct spdk_vfio_dma_map *dma_map;
1464 : int ret;
1465 :
1466 : if (!g_vfio.enabled) {
1467 : return;
1468 : }
1469 :
1470 : pthread_mutex_lock(&g_vfio.mutex);
1471 : g_vfio.device_ref++;
1472 : if (g_vfio.device_ref > 1) {
1473 : pthread_mutex_unlock(&g_vfio.mutex);
1474 : return;
1475 : }
1476 :
1477 : /* This is the first SPDK device using DPDK vfio. This means that the first
1478 : * IOMMU group might have just been added to the DPDK vfio container.
1479 : * From this point on, it is certain that the memory can be mapped.
1480 : */
1481 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1482 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1483 : if (ret) {
1484 : DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1485 : break;
1486 : }
1487 : }
1488 : pthread_mutex_unlock(&g_vfio.mutex);
1489 : #endif
1490 0 : }
1491 :
1492 : void
1493 0 : vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1494 : {
1495 : struct spdk_vtophys_pci_device *vtophys_dev;
1496 :
1497 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1498 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1499 0 : if (vtophys_dev->pci_device == pci_device) {
1500 0 : TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1501 0 : free(vtophys_dev);
1502 0 : break;
1503 : }
1504 0 : }
1505 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1506 :
1507 : #if VFIO_ENABLED
1508 : struct spdk_vfio_dma_map *dma_map;
1509 : int ret;
1510 :
1511 : if (!g_vfio.enabled) {
1512 : return;
1513 : }
1514 :
1515 : pthread_mutex_lock(&g_vfio.mutex);
1516 : assert(g_vfio.device_ref > 0);
1517 : g_vfio.device_ref--;
1518 : if (g_vfio.device_ref > 0) {
1519 : pthread_mutex_unlock(&g_vfio.mutex);
1520 : return;
1521 : }
1522 :
1523 : /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1524 : * any additional devices using its vfio container, all the mappings
1525 : * will be automatically removed by the Linux vfio driver. We unmap
1526 : * the memory manually to be able to easily re-map it later regardless
1527 : * of other, external factors.
1528 : */
1529 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1530 : struct vfio_iommu_type1_dma_unmap unmap = {};
1531 : unmap.argsz = sizeof(unmap);
1532 : unmap.flags = 0;
1533 : unmap.iova = dma_map->map.iova;
1534 : unmap.size = dma_map->map.size;
1535 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1536 : if (ret) {
1537 : DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1538 : break;
1539 : }
1540 : }
1541 : pthread_mutex_unlock(&g_vfio.mutex);
1542 : #endif
1543 0 : }
1544 :
1545 : int
1546 0 : vtophys_init(void)
1547 : {
1548 0 : const struct spdk_mem_map_ops vtophys_map_ops = {
1549 : .notify_cb = vtophys_notify,
1550 : .are_contiguous = vtophys_check_contiguous_entries,
1551 : };
1552 :
1553 0 : const struct spdk_mem_map_ops phys_ref_map_ops = {
1554 : .notify_cb = NULL,
1555 : .are_contiguous = NULL,
1556 : };
1557 :
1558 0 : const struct spdk_mem_map_ops numa_map_ops = {
1559 : .notify_cb = numa_notify,
1560 : .are_contiguous = NULL,
1561 : };
1562 :
1563 : #if VFIO_ENABLED
1564 : vtophys_iommu_init();
1565 : #endif
1566 :
1567 0 : g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1568 0 : if (g_phys_ref_map == NULL) {
1569 0 : DEBUG_PRINT("phys_ref map allocation failed.\n");
1570 0 : return -ENOMEM;
1571 : }
1572 :
1573 0 : g_numa_map = spdk_mem_map_alloc(SPDK_ENV_NUMA_ID_ANY, &numa_map_ops, NULL);
1574 0 : if (g_numa_map == NULL) {
1575 0 : DEBUG_PRINT("numa map allocation failed.\n");
1576 0 : spdk_mem_map_free(&g_phys_ref_map);
1577 0 : return -ENOMEM;
1578 : }
1579 :
1580 0 : if (g_huge_pages) {
1581 0 : g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1582 0 : if (g_vtophys_map == NULL) {
1583 0 : DEBUG_PRINT("vtophys map allocation failed\n");
1584 0 : spdk_mem_map_free(&g_numa_map);
1585 0 : spdk_mem_map_free(&g_phys_ref_map);
1586 0 : return -ENOMEM;
1587 : }
1588 0 : }
1589 0 : return 0;
1590 0 : }
1591 :
1592 : void
1593 0 : vtophys_fini(void)
1594 : {
1595 0 : spdk_mem_map_free(&g_vtophys_map);
1596 0 : spdk_mem_map_free(&g_numa_map);
1597 0 : spdk_mem_map_free(&g_phys_ref_map);
1598 0 : }
1599 :
1600 : uint64_t
1601 0 : spdk_vtophys(const void *buf, uint64_t *size)
1602 : {
1603 : uint64_t vaddr, paddr_2mb;
1604 :
1605 0 : if (!g_huge_pages) {
1606 0 : return SPDK_VTOPHYS_ERROR;
1607 : }
1608 :
1609 0 : vaddr = (uint64_t)buf;
1610 0 : paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1611 :
1612 : /*
1613 : * SPDK_VTOPHYS_ERROR has all bits set. When the offset used to be bitwise-ORed
1614 : * in below, an error translation would pass through unchanged. Now that we add
1615 : * the offset instead (because PCI vtophys results may be unaligned), we must
1616 : * check for the error value before the addition.
1617 : */
1618 : SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1619 0 : if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1620 0 : return SPDK_VTOPHYS_ERROR;
1621 : } else {
1622 0 : return paddr_2mb + (vaddr & MASK_2MB);
1623 : }
1624 0 : }
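
/* Usage sketch (illustrative, not part of this file): translating a buffer that was
 * previously registered, e.g. memory obtained from spdk_dma_malloc(). "buf" is
 * hypothetical:
 *
 *   uint64_t len = 4096;
 *   uint64_t phys = spdk_vtophys(buf, &len);
 *   if (phys == SPDK_VTOPHYS_ERROR) {
 *           // buf is not backed by a physical address known to SPDK
 *   }
 */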
1625 :
1626 : int32_t
1627 0 : spdk_mem_get_numa_id(const void *buf, uint64_t *size)
1628 : {
1629 0 : return spdk_mem_map_translate(g_numa_map, (uint64_t)buf, size);
1630 : }
1631 :
1632 : int
1633 0 : spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1634 : {
1635 : struct rte_memseg *seg;
1636 : int ret, fd;
1637 :
1638 0 : seg = rte_mem_virt2memseg(vaddr, NULL);
1639 0 : if (!seg) {
1640 0 : SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1641 0 : return -ENOENT;
1642 : }
1643 :
1644 0 : fd = rte_memseg_get_fd_thread_unsafe(seg);
1645 0 : if (fd < 0) {
1646 0 : return fd;
1647 : }
1648 :
1649 0 : ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1650 0 : if (ret < 0) {
1651 0 : return ret;
1652 : }
1653 :
1654 0 : return fd;
1655 0 : }
1656 :
1657 : void
1658 0 : mem_disable_huge_pages(void)
1659 : {
1660 0 : g_huge_pages = false;
1661 0 : }