From 36c01248049abba6d79b5ff1ddfe38306e0ddbff Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 6 May 2020 14:46:13 -0400 Subject: [PATCH] winsys/amdgpu: add RADEON_FLAG_UNCACHED for faster blits over PCIe Small blits benefit more. Good access pattern is required. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeon/radeon_winsys.h | 89 +++++++++++++++++----- src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 3 + 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index f183fe34cbb..9aba2a9d95f 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -67,7 +67,8 @@ enum radeon_bo_flag RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4), RADEON_FLAG_READ_ONLY = (1 << 5), RADEON_FLAG_32BIT = (1 << 6), - RADEON_FLAG_ENCRYPTED = (1 << 7) + RADEON_FLAG_ENCRYPTED = (1 << 7), + RADEON_FLAG_UNCACHED = (1 << 8), /* only gfx9 and newer */ }; enum radeon_dependency_flag @@ -712,6 +713,11 @@ enum radeon_heap RADEON_HEAP_GTT_WC_READ_ONLY_32BIT, RADEON_HEAP_GTT_WC_32BIT, RADEON_HEAP_GTT, + RADEON_HEAP_GTT_UNCACHED_WC, + RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY, + RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT, + RADEON_HEAP_GTT_UNCACHED_WC_32BIT, + RADEON_HEAP_GTT_UNCACHED, RADEON_MAX_SLAB_HEAPS, RADEON_MAX_CACHED_HEAPS = RADEON_MAX_SLAB_HEAPS, }; @@ -730,6 +736,11 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT: case RADEON_HEAP_GTT_WC_32BIT: case RADEON_HEAP_GTT: + case RADEON_HEAP_GTT_UNCACHED_WC: + case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY: + case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT: + case RADEON_HEAP_GTT_UNCACHED_WC_32BIT: + case RADEON_HEAP_GTT_UNCACHED: return RADEON_DOMAIN_GTT; default: assert(0); @@ -739,36 +750,69 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea static inline unsigned radeon_flags_from_heap(enum radeon_heap heap) { - unsigned flags = - RADEON_FLAG_NO_INTERPROCESS_SHARING | (heap != RADEON_HEAP_GTT ? RADEON_FLAG_GTT_WC : 0); + unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING; switch (heap) { - case RADEON_HEAP_VRAM_NO_CPU_ACCESS: - return flags | RADEON_FLAG_NO_CPU_ACCESS; + case RADEON_HEAP_GTT: + case RADEON_HEAP_GTT_UNCACHED: + break; + default: + flags |= RADEON_FLAG_GTT_WC; + } - case RADEON_HEAP_VRAM_READ_ONLY: - case RADEON_HEAP_GTT_WC_READ_ONLY: - return flags | RADEON_FLAG_READ_ONLY; + switch (heap) { + case RADEON_HEAP_GTT_UNCACHED_WC: + case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY: + case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT: + case RADEON_HEAP_GTT_UNCACHED_WC_32BIT: + case RADEON_HEAP_GTT_UNCACHED: + flags |= RADEON_FLAG_UNCACHED; + break; + default: + break; + } + switch (heap) { + case RADEON_HEAP_VRAM_READ_ONLY: case RADEON_HEAP_VRAM_READ_ONLY_32BIT: + case RADEON_HEAP_GTT_WC_READ_ONLY: case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT: - return flags | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT; + case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY: + case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT: + flags |= RADEON_FLAG_READ_ONLY; + break; + default: + break; + } + switch (heap) { + case RADEON_HEAP_VRAM_READ_ONLY_32BIT: case RADEON_HEAP_VRAM_32BIT: + case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT: case RADEON_HEAP_GTT_WC_32BIT: - return flags | RADEON_FLAG_32BIT; + case RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT: + case RADEON_HEAP_GTT_UNCACHED_WC_32BIT: + flags |= RADEON_FLAG_32BIT; + default: + break; + } - case RADEON_HEAP_VRAM: - case RADEON_HEAP_GTT_WC: - case RADEON_HEAP_GTT: + switch (heap) { + case RADEON_HEAP_VRAM_NO_CPU_ACCESS: + flags |= RADEON_FLAG_NO_CPU_ACCESS; + break; default: - return flags; + break; } + + return flags; } /* Return the heap index for winsys allocators, or -1 on failure. */ static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeon_bo_flag flags) { + bool uncached; + /* VRAM implies WC (write combining) */ assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC); /* NO_CPU_ACCESS implies VRAM only. */ @@ -779,7 +823,7 @@ static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeo return -1; /* Unsupported flags: NO_SUBALLOC, SPARSE. */ - if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS | + if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_UNCACHED | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT)) return -1; @@ -806,15 +850,20 @@ static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeo } break; case RADEON_DOMAIN_GTT: + uncached = flags & RADEON_FLAG_UNCACHED; + switch (flags & (RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT)) { case RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT: - return RADEON_HEAP_GTT_WC_READ_ONLY_32BIT; + return uncached ? RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY_32BIT + : RADEON_HEAP_GTT_WC_READ_ONLY_32BIT; case RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY: - return RADEON_HEAP_GTT_WC_READ_ONLY; + return uncached ? RADEON_HEAP_GTT_UNCACHED_WC_READ_ONLY + : RADEON_HEAP_GTT_WC_READ_ONLY; case RADEON_FLAG_GTT_WC | RADEON_FLAG_32BIT: - return RADEON_HEAP_GTT_WC_32BIT; + return uncached ? RADEON_HEAP_GTT_UNCACHED_WC_32BIT + : RADEON_HEAP_GTT_WC_32BIT; case RADEON_FLAG_GTT_WC: - return RADEON_HEAP_GTT_WC; + return uncached ? RADEON_HEAP_GTT_UNCACHED_WC : RADEON_HEAP_GTT_WC; case RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT: case RADEON_FLAG_READ_ONLY: assert(!"READ_ONLY without WC is disallowed"); @@ -823,7 +872,7 @@ static inline int radeon_get_heap_index(enum radeon_bo_domain domain, enum radeo assert(!"32BIT without WC is disallowed"); return -1; case 0: - return RADEON_HEAP_GTT; + return uncached ? RADEON_HEAP_GTT_UNCACHED : RADEON_HEAP_GTT; } break; default: diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index 1a30c9a3cf3..0aef34acd2d 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -554,6 +554,9 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, if (!(flags & RADEON_FLAG_READ_ONLY)) vm_flags |= AMDGPU_VM_PAGE_WRITEABLE; + if (flags & RADEON_FLAG_UNCACHED) + vm_flags |= AMDGPU_VM_MTYPE_UC; + r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags, AMDGPU_VA_OP_MAP); if (r) -- 2.30.2