From f4e499ec79147f4172f3669ae9dafd941aaeeb65 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 7 Oct 2016 09:16:09 +1000 Subject: [PATCH] radv: add initial non-conformant radv vulkan driver MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This squashes all the radv development up until now into one for merging. History can be found: https://github.com/airlied/mesa/tree/semi-interesting This requires llvm 3.9 and is in no way considered a conformant vulkan implementation. It can run a number of vulkan applications, and supports all GPUs using the amdgpu kernel driver. Thanks to Intel for providing anv and spirv->nir, and Emil Velikov for reviewing build integration. Parts of this are: Reviewed-by: Nicolai Hähnle Acked-by: Edward O'Callaghan Authors: Bas Nieuwenhuizen and Dave Airlie Signed-off-by: Dave Airlie --- configure.ac | 33 +- src/Makefile.am | 8 +- src/amd/common/Makefile.am | 51 + src/amd/common/Makefile.sources | 29 + src/amd/common/ac_binary.c | 288 ++ src/amd/common/ac_binary.h | 88 + src/amd/common/ac_llvm_helper.cpp | 46 + src/amd/common/ac_llvm_util.c | 142 + src/amd/common/ac_llvm_util.h | 31 + src/amd/common/ac_nir_to_llvm.c | 4547 +++++++++++++++++ src/amd/common/ac_nir_to_llvm.h | 102 + src/amd/vulkan/.gitignore | 6 + src/amd/vulkan/Makefile.am | 165 + src/amd/vulkan/Makefile.sources | 67 + src/amd/vulkan/dev_icd.json.in | 7 + src/amd/vulkan/radeon_icd.json | 7 + src/amd/vulkan/radv_cmd_buffer.c | 2413 +++++++++ src/amd/vulkan/radv_cs.h | 117 + src/amd/vulkan/radv_descriptor_set.c | 716 +++ src/amd/vulkan/radv_descriptor_set.h | 81 + src/amd/vulkan/radv_device.c | 1782 +++++++ src/amd/vulkan/radv_device_info.h | 32 + src/amd/vulkan/radv_entrypoints_gen.py | 351 ++ src/amd/vulkan/radv_formats.c | 1085 ++++ src/amd/vulkan/radv_image.c | 1030 ++++ src/amd/vulkan/radv_meta.c | 388 ++ src/amd/vulkan/radv_meta.h | 190 + src/amd/vulkan/radv_meta_blit.c | 1270 +++++ src/amd/vulkan/radv_meta_blit2d.c | 1282 +++++ src/amd/vulkan/radv_meta_buffer.c | 543 ++ src/amd/vulkan/radv_meta_bufimage.c | 396 ++ src/amd/vulkan/radv_meta_clear.c | 1192 +++++ src/amd/vulkan/radv_meta_copy.c | 399 ++ src/amd/vulkan/radv_meta_decompress.c | 463 ++ src/amd/vulkan/radv_meta_fast_clear.c | 536 ++ src/amd/vulkan/radv_meta_resolve.c | 670 +++ src/amd/vulkan/radv_meta_resolve_cs.c | 461 ++ src/amd/vulkan/radv_pass.c | 183 + src/amd/vulkan/radv_pipeline.c | 1408 +++++ src/amd/vulkan/radv_pipeline_cache.c | 475 ++ src/amd/vulkan/radv_private.h | 1402 +++++ src/amd/vulkan/radv_query.c | 415 ++ src/amd/vulkan/radv_radeon_winsys.h | 336 ++ src/amd/vulkan/radv_util.c | 204 + src/amd/vulkan/radv_util.h | 9 + src/amd/vulkan/radv_wsi.c | 246 + src/amd/vulkan/radv_wsi.h | 79 + src/amd/vulkan/radv_wsi_wayland.c | 880 ++++ src/amd/vulkan/radv_wsi_x11.c | 963 ++++ src/amd/vulkan/si_cmd_buffer.c | 1119 ++++ src/amd/vulkan/vk_format.h | 449 ++ src/amd/vulkan/vk_format_layout.csv | 188 + src/amd/vulkan/vk_format_parse.py | 384 ++ src/amd/vulkan/vk_format_table.py | 173 + src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c | 297 ++ src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h | 50 + src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 778 +++ src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h | 51 + .../winsys/amdgpu/radv_amdgpu_surface.c | 523 ++ .../winsys/amdgpu/radv_amdgpu_surface.h | 29 + .../vulkan/winsys/amdgpu/radv_amdgpu_winsys.c | 359 ++ .../vulkan/winsys/amdgpu/radv_amdgpu_winsys.h | 57 + .../winsys/amdgpu/radv_amdgpu_winsys_public.h | 30 + 63 files changed, 32093 insertions(+), 
8 deletions(-) create mode 100644 src/amd/common/Makefile.am create mode 100644 src/amd/common/Makefile.sources create mode 100644 src/amd/common/ac_binary.c create mode 100644 src/amd/common/ac_binary.h create mode 100644 src/amd/common/ac_llvm_helper.cpp create mode 100644 src/amd/common/ac_llvm_util.c create mode 100644 src/amd/common/ac_llvm_util.h create mode 100644 src/amd/common/ac_nir_to_llvm.c create mode 100644 src/amd/common/ac_nir_to_llvm.h create mode 100644 src/amd/vulkan/.gitignore create mode 100644 src/amd/vulkan/Makefile.am create mode 100644 src/amd/vulkan/Makefile.sources create mode 100644 src/amd/vulkan/dev_icd.json.in create mode 100644 src/amd/vulkan/radeon_icd.json create mode 100644 src/amd/vulkan/radv_cmd_buffer.c create mode 100644 src/amd/vulkan/radv_cs.h create mode 100644 src/amd/vulkan/radv_descriptor_set.c create mode 100644 src/amd/vulkan/radv_descriptor_set.h create mode 100644 src/amd/vulkan/radv_device.c create mode 100644 src/amd/vulkan/radv_device_info.h create mode 100644 src/amd/vulkan/radv_entrypoints_gen.py create mode 100644 src/amd/vulkan/radv_formats.c create mode 100644 src/amd/vulkan/radv_image.c create mode 100644 src/amd/vulkan/radv_meta.c create mode 100644 src/amd/vulkan/radv_meta.h create mode 100644 src/amd/vulkan/radv_meta_blit.c create mode 100644 src/amd/vulkan/radv_meta_blit2d.c create mode 100644 src/amd/vulkan/radv_meta_buffer.c create mode 100644 src/amd/vulkan/radv_meta_bufimage.c create mode 100644 src/amd/vulkan/radv_meta_clear.c create mode 100644 src/amd/vulkan/radv_meta_copy.c create mode 100644 src/amd/vulkan/radv_meta_decompress.c create mode 100644 src/amd/vulkan/radv_meta_fast_clear.c create mode 100644 src/amd/vulkan/radv_meta_resolve.c create mode 100644 src/amd/vulkan/radv_meta_resolve_cs.c create mode 100644 src/amd/vulkan/radv_pass.c create mode 100644 src/amd/vulkan/radv_pipeline.c create mode 100644 src/amd/vulkan/radv_pipeline_cache.c create mode 100644 src/amd/vulkan/radv_private.h create mode 100644 src/amd/vulkan/radv_query.c create mode 100644 src/amd/vulkan/radv_radeon_winsys.h create mode 100644 src/amd/vulkan/radv_util.c create mode 100644 src/amd/vulkan/radv_util.h create mode 100644 src/amd/vulkan/radv_wsi.c create mode 100644 src/amd/vulkan/radv_wsi.h create mode 100644 src/amd/vulkan/radv_wsi_wayland.c create mode 100644 src/amd/vulkan/radv_wsi_x11.c create mode 100644 src/amd/vulkan/si_cmd_buffer.c create mode 100644 src/amd/vulkan/vk_format.h create mode 100644 src/amd/vulkan/vk_format_layout.csv create mode 100755 src/amd/vulkan/vk_format_parse.py create mode 100755 src/amd/vulkan/vk_format_table.py create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h create mode 100644 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h diff --git a/configure.ac b/configure.ac index 99abcd7a5fe..c8aa829a8ca 100644 --- a/configure.ac +++ b/configure.ac @@ -1715,6 +1715,10 @@ if test -n "$with_vulkan_drivers"; then HAVE_INTEL_VULKAN=yes; ;; + xradeon) + PKG_CHECK_MODULES([AMDGPU], [libdrm_amdgpu >= $LIBDRM_AMDGPU_REQUIRED]) + 
HAVE_RADEON_VULKAN=yes; + ;; *) AC_MSG_ERROR([Vulkan driver '$driver' does not exist]) ;; @@ -2198,7 +2202,7 @@ if test "x$enable_gallium_llvm" = xauto; then i*86|x86_64|amd64) enable_gallium_llvm=yes;; esac fi -if test "x$enable_gallium_llvm" = xyes; then +if test "x$enable_gallium_llvm" = xyes || test "x$HAVE_RADEON_VULKAN" = xyes; then if test -n "$llvm_prefix"; then AC_PATH_TOOL([LLVM_CONFIG], [llvm-config], [no], ["$llvm_prefix/bin"]) else @@ -2368,10 +2372,7 @@ radeon_llvm_check() { else amdgpu_llvm_target_name='amdgpu' fi - if test "x$enable_gallium_llvm" != "xyes"; then - AC_MSG_ERROR([--enable-gallium-llvm is required when building $1]) - fi - llvm_check_version_for "3" "6" "0" $1 + llvm_check_version_for $2 $3 $4 $1 if test true && $LLVM_CONFIG --targets-built | grep -iqvw $amdgpu_llvm_target_name ; then AC_MSG_ERROR([LLVM $amdgpu_llvm_target_name not enabled in your LLVM build.]) fi @@ -2382,6 +2383,13 @@ radeon_llvm_check() { fi } +radeon_gallium_llvm_check() { + if test "x$enable_gallium_llvm" != "xyes"; then + AC_MSG_ERROR([--enable-gallium-llvm is required when building $1]) + fi + radeon_llvm_check $* +} + swr_llvm_check() { gallium_require_llvm $1 if test ${LLVM_VERSION_INT} -lt 306; then @@ -2466,7 +2474,7 @@ if test -n "$with_gallium_drivers"; then gallium_require_drm "Gallium R600" gallium_require_drm_loader if test "x$enable_opencl" = xyes; then - radeon_llvm_check "r600g" + radeon_gallium_llvm_check "r600g" "3" "6" "0" LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser" fi ;; @@ -2476,7 +2484,7 @@ if test -n "$with_gallium_drivers"; then PKG_CHECK_MODULES([AMDGPU], [libdrm_amdgpu >= $LIBDRM_AMDGPU_REQUIRED]) gallium_require_drm "radeonsi" gallium_require_drm_loader - radeon_llvm_check "radeonsi" + radeon_gallium_llvm_check "radeonsi" "3" "6" "0" require_egl_drm "radeonsi" ;; xnouveau) @@ -2541,6 +2549,10 @@ if test -n "$with_gallium_drivers"; then done fi +if test "x$HAVE_RADEON_VULKAN" != "x0"; then + radeon_llvm_check "radv" "3" "9" "0" +fi + dnl Set LLVM_LIBS - This is done after the driver configuration so dnl that drivers can add additional components to LLVM_COMPONENTS. dnl Previously, gallium drivers were updating LLVM_LIBS directly @@ -2632,8 +2644,13 @@ AM_CONDITIONAL(HAVE_R200_DRI, test x$HAVE_R200_DRI = xyes) AM_CONDITIONAL(HAVE_RADEON_DRI, test x$HAVE_RADEON_DRI = xyes) AM_CONDITIONAL(HAVE_SWRAST_DRI, test x$HAVE_SWRAST_DRI = xyes) +AM_CONDITIONAL(HAVE_RADEON_VULKAN, test "x$HAVE_RADEON_VULKAN" = xyes) AM_CONDITIONAL(HAVE_INTEL_VULKAN, test "x$HAVE_INTEL_VULKAN" = xyes) +AM_CONDITIONAL(HAVE_AMD_DRIVERS, test "x$HAVE_GALLIUM_R600" = xyes -o \ + "x$HAVE_GALLIUM_RADEONSI" = xyes -o \ + "x$HAVE_RADEON_VULKAN" = xyes) + AM_CONDITIONAL(HAVE_INTEL_DRIVERS, test "x$HAVE_INTEL_VULKAN" = xyes -o \ "x$HAVE_I965_DRI" = xyes) @@ -2726,6 +2743,8 @@ dnl Substitute the config AC_CONFIG_FILES([Makefile src/Makefile src/amd/Makefile + src/amd/common/Makefile + src/amd/vulkan/Makefile src/compiler/Makefile src/egl/Makefile src/egl/main/egl.pc diff --git a/src/Makefile.am b/src/Makefile.am index 91d6a7adb31..17c8798323a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -74,7 +74,7 @@ endif # include only conditionally ? 
SUBDIRS += compiler -if HAVE_GALLIUM_RADEON_COMMON +if HAVE_AMD_DRIVERS SUBDIRS += amd endif @@ -120,6 +120,12 @@ if HAVE_INTEL_VULKAN SUBDIRS += intel/vulkan endif +# Requires wayland-drm +if HAVE_RADEON_VULKAN +SUBDIRS += amd/common +SUBDIRS += amd/vulkan +endif + if HAVE_GALLIUM SUBDIRS += gallium endif diff --git a/src/amd/common/Makefile.am b/src/amd/common/Makefile.am new file mode 100644 index 00000000000..788152d850c --- /dev/null +++ b/src/amd/common/Makefile.am @@ -0,0 +1,51 @@ +# Copyright © 2016 Bas Nieuwenhuizen +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include Makefile.sources + +# TODO cleanup these +AM_CPPFLAGS = \ + $(VALGRIND_CFLAGS) \ + $(DEFINES) \ + -I$(top_srcdir)/include \ + -I$(top_builddir)/src \ + -I$(top_srcdir)/src \ + -I$(top_builddir)/src/compiler \ + -I$(top_builddir)/src/compiler/nir \ + -I$(top_srcdir)/src/compiler \ + -I$(top_srcdir)/src/mapi \ + -I$(top_srcdir)/src/mesa \ + -I$(top_srcdir)/src/mesa/drivers/dri/common \ + -I$(top_srcdir)/src/gallium/auxiliary \ + -I$(top_srcdir)/src/gallium/include + +AM_CFLAGS = $(VISIBILITY_CFLAGS) \ + $(PTHREAD_CFLAGS) \ + $(LLVM_CFLAGS) \ + $(LIBELF_CFLAGS) + +AM_CXXFLAGS = \ + $(VISIBILITY_CXXFLAGS) \ + $(LLVM_CXXFLAGS) + +noinst_LTLIBRARIES = libamd_common.la + +libamd_common_la_SOURCES = $(AMD_COMPILER_SOURCES) diff --git a/src/amd/common/Makefile.sources b/src/amd/common/Makefile.sources new file mode 100644 index 00000000000..380dba08621 --- /dev/null +++ b/src/amd/common/Makefile.sources @@ -0,0 +1,29 @@ +# Copyright © 2016 Bas Nieuwenhuizen +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +AMD_COMPILER_SOURCES := \ + ac_binary.c \ + ac_binary.h \ + ac_llvm_helper.cpp \ + ac_llvm_util.c \ + ac_llvm_util.h \ + ac_nir_to_llvm.c \ + ac_nir_to_llvm.h diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c new file mode 100644 index 00000000000..01cf000d9be --- /dev/null +++ b/src/amd/common/ac_binary.c @@ -0,0 +1,288 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Tom Stellard + * + * Based on radeon_elf_util.c. + */ + +#include "ac_binary.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include +#include +#include + +#include + +#define SPILLED_SGPRS 0x4 +#define SPILLED_VGPRS 0x8 + +static void parse_symbol_table(Elf_Data *symbol_table_data, + const GElf_Shdr *symbol_table_header, + struct ac_shader_binary *binary) +{ + GElf_Sym symbol; + unsigned i = 0; + unsigned symbol_count = + symbol_table_header->sh_size / symbol_table_header->sh_entsize; + + /* We are over allocating this list, because symbol_count gives the + * total number of symbols, and we will only be filling the list + * with offsets of global symbols. The memory savings from + * allocating the correct size of this list will be small, and + * I don't think it is worth the cost of pre-computing the number + * of global symbols. + */ + binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t)); + + while (gelf_getsym(symbol_table_data, i++, &symbol)) { + unsigned i; + if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL || + symbol.st_shndx == 0 /* Undefined symbol */) { + continue; + } + + binary->global_symbol_offsets[binary->global_symbol_count] = + symbol.st_value; + + /* Sort the list using bubble sort. This list will usually + * be small. 
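 * For example, inserting 0x30 into an already-sorted {0x10, 0x20, 0x40}
 * bubbles the new offset down one slot, giving {0x10, 0x20, 0x30, 0x40}.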
*/ + for (i = binary->global_symbol_count; i > 0; --i) { + uint64_t lhs = binary->global_symbol_offsets[i - 1]; + uint64_t rhs = binary->global_symbol_offsets[i]; + if (lhs < rhs) { + break; + } + binary->global_symbol_offsets[i] = lhs; + binary->global_symbol_offsets[i - 1] = rhs; + } + ++binary->global_symbol_count; + } +} + +static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols, + unsigned symbol_sh_link, + struct ac_shader_binary *binary) +{ + unsigned i; + + if (!relocs || !symbols || !binary->reloc_count) { + return; + } + binary->relocs = CALLOC(binary->reloc_count, + sizeof(struct ac_shader_reloc)); + for (i = 0; i < binary->reloc_count; i++) { + GElf_Sym symbol; + GElf_Rel rel; + char *symbol_name; + struct ac_shader_reloc *reloc = &binary->relocs[i]; + + gelf_getrel(relocs, i, &rel); + gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol); + symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name); + + reloc->offset = rel.r_offset; + strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1); + reloc->name[sizeof(reloc->name)-1] = 0; + } +} + +void ac_elf_read(const char *elf_data, unsigned elf_size, + struct ac_shader_binary *binary) +{ + char *elf_buffer; + Elf *elf; + Elf_Scn *section = NULL; + Elf_Data *symbols = NULL, *relocs = NULL; + size_t section_str_index; + unsigned symbol_sh_link = 0; + + /* One of the libelf implementations + * (http://www.mr511.de/software/english.htm) requires calling + * elf_version() before elf_memory(). + */ + elf_version(EV_CURRENT); + elf_buffer = MALLOC(elf_size); + memcpy(elf_buffer, elf_data, elf_size); + + elf = elf_memory(elf_buffer, elf_size); + + elf_getshdrstrndx(elf, §ion_str_index); + + while ((section = elf_nextscn(elf, section))) { + const char *name; + Elf_Data *section_data = NULL; + GElf_Shdr section_header; + if (gelf_getshdr(section, §ion_header) != §ion_header) { + fprintf(stderr, "Failed to read ELF section header\n"); + return; + } + name = elf_strptr(elf, section_str_index, section_header.sh_name); + if (!strcmp(name, ".text")) { + section_data = elf_getdata(section, section_data); + binary->code_size = section_data->d_size; + binary->code = MALLOC(binary->code_size * sizeof(unsigned char)); + memcpy(binary->code, section_data->d_buf, binary->code_size); + } else if (!strcmp(name, ".AMDGPU.config")) { + section_data = elf_getdata(section, section_data); + binary->config_size = section_data->d_size; + binary->config = MALLOC(binary->config_size * sizeof(unsigned char)); + memcpy(binary->config, section_data->d_buf, binary->config_size); + } else if (!strcmp(name, ".AMDGPU.disasm")) { + /* Always read disassembly if it's available. 
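 * The section payload is not NUL-terminated, which is why it is copied
 * below with strndup() bounded by the section size.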
*/ + section_data = elf_getdata(section, section_data); + binary->disasm_string = strndup(section_data->d_buf, + section_data->d_size); + } else if (!strncmp(name, ".rodata", 7)) { + section_data = elf_getdata(section, section_data); + binary->rodata_size = section_data->d_size; + binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char)); + memcpy(binary->rodata, section_data->d_buf, binary->rodata_size); + } else if (!strncmp(name, ".symtab", 7)) { + symbols = elf_getdata(section, section_data); + symbol_sh_link = section_header.sh_link; + parse_symbol_table(symbols, §ion_header, binary); + } else if (!strcmp(name, ".rel.text")) { + relocs = elf_getdata(section, section_data); + binary->reloc_count = section_header.sh_size / + section_header.sh_entsize; + } + } + + parse_relocs(elf, relocs, symbols, symbol_sh_link, binary); + + if (elf){ + elf_end(elf); + } + FREE(elf_buffer); + + /* Cache the config size per symbol */ + if (binary->global_symbol_count) { + binary->config_size_per_symbol = + binary->config_size / binary->global_symbol_count; + } else { + binary->global_symbol_count = 1; + binary->config_size_per_symbol = binary->config_size; + } +} + +static +const unsigned char *ac_shader_binary_config_start( + const struct ac_shader_binary *binary, + uint64_t symbol_offset) +{ + unsigned i; + for (i = 0; i < binary->global_symbol_count; ++i) { + if (binary->global_symbol_offsets[i] == symbol_offset) { + unsigned offset = i * binary->config_size_per_symbol; + return binary->config + offset; + } + } + return binary->config; +} + + +static const char *scratch_rsrc_dword0_symbol = + "SCRATCH_RSRC_DWORD0"; + +static const char *scratch_rsrc_dword1_symbol = + "SCRATCH_RSRC_DWORD1"; + +void ac_shader_binary_read_config(struct ac_shader_binary *binary, + struct ac_shader_config *conf, + unsigned symbol_offset) +{ + unsigned i; + const unsigned char *config = + ac_shader_binary_config_start(binary, symbol_offset); + bool really_needs_scratch = false; + + /* LLVM adds SGPR spills to the scratch size. + * Find out if we really need the scratch buffer. + */ + for (i = 0; i < binary->reloc_count; i++) { + const struct ac_shader_reloc *reloc = &binary->relocs[i]; + + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || + !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { + really_needs_scratch = true; + break; + } + } + + for (i = 0; i < binary->config_size_per_symbol; i+= 8) { + unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); + unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); + switch (reg) { + case R_00B028_SPI_SHADER_PGM_RSRC1_PS: + case R_00B128_SPI_SHADER_PGM_RSRC1_VS: + case R_00B228_SPI_SHADER_PGM_RSRC1_GS: + case R_00B848_COMPUTE_PGM_RSRC1: + conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); + conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); + conf->float_mode = G_00B028_FLOAT_MODE(value); + break; + case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: + conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); + break; + case R_00B84C_COMPUTE_PGM_RSRC2: + conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); + break; + case R_0286CC_SPI_PS_INPUT_ENA: + conf->spi_ps_input_ena = value; + break; + case R_0286D0_SPI_PS_INPUT_ADDR: + conf->spi_ps_input_addr = value; + break; + case R_0286E8_SPI_TMPRING_SIZE: + case R_00B860_COMPUTE_TMPRING_SIZE: + /* WAVESIZE is in units of 256 dwords. 
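 * e.g. a WAVESIZE of 2 means 2 * 256 dwords = 2048 bytes of scratch
 * per wave, which is what the calculation below produces.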
*/ + if (really_needs_scratch) + conf->scratch_bytes_per_wave = + G_00B860_WAVESIZE(value) * 256 * 4; + break; + case SPILLED_SGPRS: + conf->spilled_sgprs = value; + break; + case SPILLED_VGPRS: + conf->spilled_vgprs = value; + break; + default: + { + static bool printed; + + if (!printed) { + fprintf(stderr, "Warning: LLVM emitted unknown " + "config register: 0x%x\n", reg); + printed = true; + } + } + break; + } + + if (!conf->spi_ps_input_addr) + conf->spi_ps_input_addr = conf->spi_ps_input_ena; + } +} diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h new file mode 100644 index 00000000000..282f33d22b9 --- /dev/null +++ b/src/amd/common/ac_binary.h @@ -0,0 +1,88 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Tom Stellard + * + */ + +#pragma once + +#include + +struct ac_shader_reloc { + char name[32]; + uint64_t offset; +}; + +struct ac_shader_binary { + /** Shader code */ + unsigned char *code; + unsigned code_size; + + /** Config/Context register state that accompanies this shader. + * This is a stream of dword pairs. First dword contains the + * register address, the second dword contains the value.*/ + unsigned char *config; + unsigned config_size; + + /** The number of bytes of config information for each global symbol. + */ + unsigned config_size_per_symbol; + + /** Constant data accessed by the shader. This will be uploaded + * into a constant buffer. */ + unsigned char *rodata; + unsigned rodata_size; + + /** List of symbol offsets for the shader */ + uint64_t *global_symbol_offsets; + unsigned global_symbol_count; + + struct ac_shader_reloc *relocs; + unsigned reloc_count; + + /** Disassembled shader in a string. */ + char *disasm_string; +}; + +struct ac_shader_config { + unsigned num_sgprs; + unsigned num_vgprs; + unsigned spilled_sgprs; + unsigned spilled_vgprs; + unsigned lds_size; + unsigned spi_ps_input_ena; + unsigned spi_ps_input_addr; + unsigned float_mode; + unsigned scratch_bytes_per_wave; +}; + +/* + * Parse the elf binary stored in \p elf_data and create a + * ac_shader_binary object. 
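 *
 * A minimal usage sketch (hypothetical caller, error handling and
 * cleanup omitted):
 *
 *    struct ac_shader_binary binary = {0};
 *    struct ac_shader_config config = {0};
 *    ac_elf_read(elf_data, elf_size, &binary);
 *    ac_shader_binary_read_config(&binary, &config, 0);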
+ */ +void ac_elf_read(const char *elf_data, unsigned elf_size, + struct ac_shader_binary *binary); + +void ac_shader_binary_read_config(struct ac_shader_binary *binary, + struct ac_shader_config *conf, + unsigned symbol_offset); diff --git a/src/amd/common/ac_llvm_helper.cpp b/src/amd/common/ac_llvm_helper.cpp new file mode 100644 index 00000000000..062f0aad92b --- /dev/null +++ b/src/amd/common/ac_llvm_helper.cpp @@ -0,0 +1,46 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ + +/* based on Marek's patch to lp_bld_misc.cpp */ + +// Workaround http://llvm.org/PR23628 +#if HAVE_LLVM >= 0x0307 +# pragma push_macro("DEBUG") +# undef DEBUG +#endif + +#include "ac_nir_to_llvm.h" +#include +#include +#include + +extern "C" void +ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) +{ + llvm::Argument *A = llvm::unwrap(val); + llvm::AttrBuilder B; + B.addDereferenceableAttr(bytes); + A->addAttr(llvm::AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); +} diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c new file mode 100644 index 00000000000..a8408dd79e6 --- /dev/null +++ b/src/amd/common/ac_llvm_util.c @@ -0,0 +1,142 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + */ +/* based on pieces from si_pipe.c and radeon_llvm_emit.c */ +#include "ac_llvm_util.h" + +#include + +#include "c11/threads.h" + +#include +#include + +static void ac_init_llvm_target() +{ +#if HAVE_LLVM < 0x0307 + LLVMInitializeR600TargetInfo(); + LLVMInitializeR600Target(); + LLVMInitializeR600TargetMC(); + LLVMInitializeR600AsmPrinter(); +#else + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); +#endif +} + +static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT; + +static LLVMTargetRef ac_get_llvm_target(const char *triple) +{ + LLVMTargetRef target = NULL; + char *err_message = NULL; + + call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target); + + if (LLVMGetTargetFromTriple(triple, &target, &err_message)) { + fprintf(stderr, "Cannot find target for triple %s ", triple); + if (err_message) { + fprintf(stderr, "%s\n", err_message); + } + LLVMDisposeMessage(err_message); + return NULL; + } + return target; +} + +static const char *ac_get_llvm_processor_name(enum radeon_family family) +{ + switch (family) { + case CHIP_TAHITI: + return "tahiti"; + case CHIP_PITCAIRN: + return "pitcairn"; + case CHIP_VERDE: + return "verde"; + case CHIP_OLAND: + return "oland"; + case CHIP_HAINAN: + return "hainan"; + case CHIP_BONAIRE: + return "bonaire"; + case CHIP_KABINI: + return "kabini"; + case CHIP_KAVERI: + return "kaveri"; + case CHIP_HAWAII: + return "hawaii"; + case CHIP_MULLINS: + return "mullins"; + case CHIP_TONGA: + return "tonga"; + case CHIP_ICELAND: + return "iceland"; + case CHIP_CARRIZO: + return "carrizo"; +#if HAVE_LLVM <= 0x0307 + case CHIP_FIJI: + return "tonga"; + case CHIP_STONEY: + return "carrizo"; +#else + case CHIP_FIJI: + return "fiji"; + case CHIP_STONEY: + return "stoney"; +#endif +#if HAVE_LLVM <= 0x0308 + case CHIP_POLARIS10: + return "tonga"; + case CHIP_POLARIS11: + return "tonga"; +#else + case CHIP_POLARIS10: + return "polaris10"; + case CHIP_POLARIS11: + return "polaris11"; +#endif + default: + return ""; + } +} + +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family) +{ + assert(family >= CHIP_TAHITI); + + const char *triple = "amdgcn--"; + LLVMTargetRef target = ac_get_llvm_target(triple); + LLVMTargetMachineRef tm = LLVMCreateTargetMachine( + target, + triple, + ac_get_llvm_processor_name(family), + "+DumpCode,+vgpr-spilling", + LLVMCodeGenLevelDefault, + LLVMRelocDefault, + LLVMCodeModelDefault); + + return tm; +} diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h new file mode 100644 index 00000000000..8357fbf3f3e --- /dev/null +++ b/src/amd/common/ac_llvm_util.h @@ -0,0 +1,31 @@ +/* + * Copyright 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +#pragma once + +#include + +#include "amd_family.h" + +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family); diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c new file mode 100644 index 00000000000..e6ff7c84362 --- /dev/null +++ b/src/amd/common/ac_nir_to_llvm.c @@ -0,0 +1,4547 @@ +/* + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "ac_nir_to_llvm.h" +#include "ac_binary.h" +#include "sid.h" +#include "nir/nir.h" +#include "../vulkan/radv_descriptor_set.h" +#include "util/bitscan.h" +#include + +enum radeon_llvm_calling_convention { + RADEON_LLVM_AMDGPU_VS = 87, + RADEON_LLVM_AMDGPU_GS = 88, + RADEON_LLVM_AMDGPU_PS = 89, + RADEON_LLVM_AMDGPU_CS = 90, +}; + +#define CONST_ADDR_SPACE 2 +#define LOCAL_ADDR_SPACE 3 + +#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1) +#define RADEON_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1) + +enum desc_type { + DESC_IMAGE, + DESC_FMASK, + DESC_SAMPLER, + DESC_BUFFER, +}; + +struct nir_to_llvm_context { + const struct ac_nir_compiler_options *options; + struct ac_shader_variant_info *shader_info; + + LLVMContextRef context; + LLVMModuleRef module; + LLVMBuilderRef builder; + LLVMValueRef main_function; + + struct hash_table *defs; + struct hash_table *phis; + + LLVMValueRef descriptor_sets[4]; + LLVMValueRef push_constants; + LLVMValueRef num_work_groups; + LLVMValueRef workgroup_ids; + LLVMValueRef local_invocation_ids; + LLVMValueRef tg_size; + + LLVMValueRef vertex_buffers; + LLVMValueRef base_vertex; + LLVMValueRef start_instance; + LLVMValueRef vertex_id; + LLVMValueRef rel_auto_id; + LLVMValueRef vs_prim_id; + LLVMValueRef instance_id; + + LLVMValueRef prim_mask; + LLVMValueRef sample_positions; + LLVMValueRef persp_sample, persp_center, persp_centroid; + LLVMValueRef linear_sample, linear_center, linear_centroid; + LLVMValueRef front_face; + LLVMValueRef ancillary; + LLVMValueRef frag_pos[4]; + + LLVMBasicBlockRef continue_block; + LLVMBasicBlockRef break_block; + + LLVMTypeRef i1; + LLVMTypeRef i8; + LLVMTypeRef i16; + LLVMTypeRef i32; + LLVMTypeRef i64; + LLVMTypeRef v2i32; + LLVMTypeRef v3i32; + LLVMTypeRef v4i32; + LLVMTypeRef v8i32; + LLVMTypeRef f32; + LLVMTypeRef f16; + LLVMTypeRef v2f32; + LLVMTypeRef v4f32; + LLVMTypeRef v16i8; + LLVMTypeRef voidt; + + LLVMValueRef i32zero; + LLVMValueRef i32one; + LLVMValueRef f32zero; + LLVMValueRef f32one; + LLVMValueRef v4f32empty; + + unsigned range_md_kind; + unsigned uniform_md_kind; + unsigned fpmath_md_kind; + unsigned invariant_load_md_kind; + LLVMValueRef empty_md; + LLVMValueRef fpmath_md_2p5_ulp; + gl_shader_stage stage; + + LLVMValueRef lds; + LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; + LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS * 4]; + + LLVMValueRef shared_memory; + uint64_t input_mask; + uint64_t output_mask; + int num_locals; + LLVMValueRef *locals; + bool has_ddxy; + unsigned num_clips; + unsigned num_culls; +}; + +struct ac_tex_info { + LLVMValueRef args[12]; + int arg_count; + LLVMTypeRef dst_type; + bool has_offset; +}; + +static LLVMValueRef +emit_llvm_intrinsic(struct nir_to_llvm_context *ctx, const char *name, + LLVMTypeRef return_type, LLVMValueRef *params, + unsigned param_count, LLVMAttribute attribs); +static LLVMValueRef get_sampler_desc(struct nir_to_llvm_context *ctx, + nir_deref_var *deref, + enum desc_type desc_type); +static unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan) +{ + return (index * 4) + chan; +} + +static unsigned llvm_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMFloatTypeKind: + return 4; + case LLVMPointerTypeKind: + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * + llvm_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; + } +} + +static void 
set_llvm_calling_convention(LLVMValueRef func, + gl_shader_stage stage) +{ + enum radeon_llvm_calling_convention calling_conv; + + switch (stage) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + calling_conv = RADEON_LLVM_AMDGPU_VS; + break; + case MESA_SHADER_GEOMETRY: + calling_conv = RADEON_LLVM_AMDGPU_GS; + break; + case MESA_SHADER_FRAGMENT: + calling_conv = RADEON_LLVM_AMDGPU_PS; + break; + case MESA_SHADER_COMPUTE: + calling_conv = RADEON_LLVM_AMDGPU_CS; + break; + default: + unreachable("Unhandle shader type"); + } + + LLVMSetFunctionCallConv(func, calling_conv); +} + +static LLVMValueRef +create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, + LLVMBuilderRef builder, LLVMTypeRef *return_types, + unsigned num_return_elems, LLVMTypeRef *param_types, + unsigned param_count, unsigned array_params, + unsigned sgpr_params, bool unsafe_math) +{ + LLVMTypeRef main_function_type, ret_type; + LLVMBasicBlockRef main_function_body; + + if (num_return_elems) + ret_type = LLVMStructTypeInContext(ctx, return_types, + num_return_elems, true); + else + ret_type = LLVMVoidTypeInContext(ctx); + + /* Setup the function */ + main_function_type = + LLVMFunctionType(ret_type, param_types, param_count, 0); + LLVMValueRef main_function = + LLVMAddFunction(module, "main", main_function_type); + main_function_body = + LLVMAppendBasicBlockInContext(ctx, main_function, "main_body"); + LLVMPositionBuilderAtEnd(builder, main_function_body); + + LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS); + for (unsigned i = 0; i < sgpr_params; ++i) { + LLVMValueRef P = LLVMGetParam(main_function, i); + + if (i < array_params) { + LLVMAddAttribute(P, LLVMByValAttribute); + ac_add_attr_dereferenceable(P, UINT64_MAX); + } + else + LLVMAddAttribute(P, LLVMInRegAttribute); + } + + if (unsafe_math) { + /* These were copied from some LLVM test. 
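 * Roughly speaking they let LLVM relax IEEE strictness for the whole
 * function: "no-infs-fp-math" and "no-nans-fp-math" assume infinities
 * and NaNs do not occur, while "less-precise-fpmad" and "unsafe-fp-math"
 * allow fused multiply-adds and value-changing reassociation.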
*/ + LLVMAddTargetDependentFunctionAttr(main_function, + "less-precise-fpmad", + "true"); + LLVMAddTargetDependentFunctionAttr(main_function, + "no-infs-fp-math", + "true"); + LLVMAddTargetDependentFunctionAttr(main_function, + "no-nans-fp-math", + "true"); + LLVMAddTargetDependentFunctionAttr(main_function, + "unsafe-fp-math", + "true"); + } + return main_function; +} + +static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) +{ + return LLVMPointerType(LLVMArrayType(elem_type, num_elements), + CONST_ADDR_SPACE); +} + +static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx, + int idx, + LLVMTypeRef type) +{ + LLVMValueRef offset; + LLVMValueRef ptr; + int addr_space; + + offset = LLVMConstInt(ctx->i32, idx, false); + + ptr = ctx->shared_memory; + ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, ""); + addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), ""); + return ptr; +} + +static LLVMValueRef to_integer(struct nir_to_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + if (type == ctx->f32) { + return LLVMBuildBitCast(ctx->builder, v, ctx->i32, ""); + } else if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(type); + if (elem_type == ctx->f32) { + LLVMTypeRef nt = LLVMVectorType(ctx->i32, LLVMGetVectorSize(type)); + return LLVMBuildBitCast(ctx->builder, v, nt, ""); + } + } + return v; +} + +static LLVMValueRef to_float(struct nir_to_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + if (type == ctx->i32) { + return LLVMBuildBitCast(ctx->builder, v, ctx->f32, ""); + } else if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(type); + if (elem_type == ctx->i32) { + LLVMTypeRef nt = LLVMVectorType(ctx->f32, LLVMGetVectorSize(type)); + return LLVMBuildBitCast(ctx->builder, v, nt, ""); + } + } + return v; +} + +static LLVMValueRef build_indexed_load(struct nir_to_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index, + bool uniform) +{ + LLVMValueRef pointer; + LLVMValueRef indices[] = {ctx->i32zero, index}; + + pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); + if (uniform) + LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); + return LLVMBuildLoad(ctx->builder, pointer, ""); +} + +static LLVMValueRef build_indexed_load_const(struct nir_to_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true); + LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); + return result; +} + +static void create_function(struct nir_to_llvm_context *ctx, + struct nir_shader *nir) +{ + LLVMTypeRef arg_types[23]; + unsigned arg_idx = 0; + unsigned array_count = 0; + unsigned sgpr_count = 0, user_sgpr_count; + unsigned i; + for (unsigned i = 0; i < 4; ++i) + arg_types[arg_idx++] = const_array(ctx->i8, 1024 * 1024); + + arg_types[arg_idx++] = const_array(ctx->i8, 1024 * 1024); + + array_count = arg_idx; + switch (nir->stage) { + case MESA_SHADER_COMPUTE: + arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3); /* grid size */ + user_sgpr_count = arg_idx; + arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3); + arg_types[arg_idx++] = ctx->i32; + sgpr_count = arg_idx; + + arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3); + break; + case MESA_SHADER_VERTEX: + arg_types[arg_idx++] = const_array(ctx->v16i8, 16); + arg_types[arg_idx++] = 
ctx->i32; // base vertex + arg_types[arg_idx++] = ctx->i32; // start instance + user_sgpr_count = sgpr_count = arg_idx; + arg_types[arg_idx++] = ctx->i32; // vertex id + arg_types[arg_idx++] = ctx->i32; // rel auto id + arg_types[arg_idx++] = ctx->i32; // vs prim id + arg_types[arg_idx++] = ctx->i32; // instance id + break; + case MESA_SHADER_FRAGMENT: + arg_types[arg_idx++] = const_array(ctx->f32, 32); + user_sgpr_count = arg_idx; + arg_types[arg_idx++] = ctx->i32; /* prim mask */ + sgpr_count = arg_idx; + arg_types[arg_idx++] = ctx->v2i32; /* persp sample */ + arg_types[arg_idx++] = ctx->v2i32; /* persp center */ + arg_types[arg_idx++] = ctx->v2i32; /* persp centroid */ + arg_types[arg_idx++] = ctx->v3i32; /* persp pull model */ + arg_types[arg_idx++] = ctx->v2i32; /* linear sample */ + arg_types[arg_idx++] = ctx->v2i32; /* linear center */ + arg_types[arg_idx++] = ctx->v2i32; /* linear centroid */ + arg_types[arg_idx++] = ctx->f32; /* line stipple tex */ + arg_types[arg_idx++] = ctx->f32; /* pos x float */ + arg_types[arg_idx++] = ctx->f32; /* pos y float */ + arg_types[arg_idx++] = ctx->f32; /* pos z float */ + arg_types[arg_idx++] = ctx->f32; /* pos w float */ + arg_types[arg_idx++] = ctx->i32; /* front face */ + arg_types[arg_idx++] = ctx->i32; /* ancillary */ + arg_types[arg_idx++] = ctx->f32; /* sample coverage */ + arg_types[arg_idx++] = ctx->i32; /* fixed pt */ + break; + default: + unreachable("Shader stage not implemented"); + } + + ctx->main_function = create_llvm_function( + ctx->context, ctx->module, ctx->builder, NULL, 0, arg_types, + arg_idx, array_count, sgpr_count, ctx->options->unsafe_math); + set_llvm_calling_convention(ctx->main_function, nir->stage); + + + ctx->shader_info->num_input_sgprs = 0; + ctx->shader_info->num_input_vgprs = 0; + + for (i = 0; i < user_sgpr_count; i++) + ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4; + + ctx->shader_info->num_input_sgprs = ctx->shader_info->num_user_sgprs; + for (; i < sgpr_count; i++) + ctx->shader_info->num_input_sgprs += llvm_get_type_size(arg_types[i]) / 4; + + if (nir->stage != MESA_SHADER_FRAGMENT) + for (; i < arg_idx; ++i) + ctx->shader_info->num_input_vgprs += llvm_get_type_size(arg_types[i]) / 4; + + arg_idx = 0; + for (unsigned i = 0; i < 4; ++i) + ctx->descriptor_sets[i] = + LLVMGetParam(ctx->main_function, arg_idx++); + + ctx->push_constants = LLVMGetParam(ctx->main_function, arg_idx++); + + switch (nir->stage) { + case MESA_SHADER_COMPUTE: + ctx->num_work_groups = + LLVMGetParam(ctx->main_function, arg_idx++); + ctx->workgroup_ids = + LLVMGetParam(ctx->main_function, arg_idx++); + ctx->tg_size = + LLVMGetParam(ctx->main_function, arg_idx++); + ctx->local_invocation_ids = + LLVMGetParam(ctx->main_function, arg_idx++); + break; + case MESA_SHADER_VERTEX: + ctx->vertex_buffers = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->base_vertex = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->start_instance = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->vertex_id = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->rel_auto_id = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->vs_prim_id = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->instance_id = LLVMGetParam(ctx->main_function, arg_idx++); + break; + case MESA_SHADER_FRAGMENT: + ctx->sample_positions = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->persp_center = 
LLVMGetParam(ctx->main_function, arg_idx++); + ctx->persp_centroid = LLVMGetParam(ctx->main_function, arg_idx++); + arg_idx++; + ctx->linear_sample = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->linear_center = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->linear_centroid = LLVMGetParam(ctx->main_function, arg_idx++); + arg_idx++; /* line stipple */ + ctx->frag_pos[0] = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->frag_pos[1] = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->frag_pos[2] = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->frag_pos[3] = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->front_face = LLVMGetParam(ctx->main_function, arg_idx++); + ctx->ancillary = LLVMGetParam(ctx->main_function, arg_idx++); + break; + default: + unreachable("Shader stage not implemented"); + } +} + +static void setup_types(struct nir_to_llvm_context *ctx) +{ + LLVMValueRef args[4]; + + ctx->voidt = LLVMVoidTypeInContext(ctx->context); + ctx->i1 = LLVMIntTypeInContext(ctx->context, 1); + ctx->i8 = LLVMIntTypeInContext(ctx->context, 8); + ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); + ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); + ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); + ctx->v3i32 = LLVMVectorType(ctx->i32, 3); + ctx->v4i32 = LLVMVectorType(ctx->i32, 4); + ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + ctx->f32 = LLVMFloatTypeInContext(ctx->context); + ctx->f16 = LLVMHalfTypeInContext(ctx->context); + ctx->v2f32 = LLVMVectorType(ctx->f32, 2); + ctx->v4f32 = LLVMVectorType(ctx->f32, 4); + ctx->v16i8 = LLVMVectorType(ctx->i8, 16); + + ctx->i32zero = LLVMConstInt(ctx->i32, 0, false); + ctx->i32one = LLVMConstInt(ctx->i32, 1, false); + ctx->f32zero = LLVMConstReal(ctx->f32, 0.0); + ctx->f32one = LLVMConstReal(ctx->f32, 1.0); + + args[0] = ctx->f32zero; + args[1] = ctx->f32zero; + args[2] = ctx->f32zero; + args[3] = ctx->f32one; + ctx->v4f32empty = LLVMConstVector(args, 4); + + ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "range", 5); + ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "invariant.load", 14); + ctx->uniform_md_kind = + LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14); + ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); + + ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6); + + args[0] = LLVMConstReal(ctx->f32, 2.5); + ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1); +} + +static int get_llvm_num_components(LLVMValueRef value) +{ + LLVMTypeRef type = LLVMTypeOf(value); + unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind + ? 
LLVMGetVectorSize(type) + : 1; + return num_components; +} + +static LLVMValueRef llvm_extract_elem(struct nir_to_llvm_context *ctx, + LLVMValueRef value, + int index) +{ + int count = get_llvm_num_components(value); + + assert(index < count); + if (count == 1) + return value; + + return LLVMBuildExtractElement(ctx->builder, value, + LLVMConstInt(ctx->i32, index, false), ""); +} + +static LLVMValueRef trim_vector(struct nir_to_llvm_context *ctx, + LLVMValueRef value, unsigned count) +{ + unsigned num_components = get_llvm_num_components(value); + if (count == num_components) + return value; + + LLVMValueRef masks[] = { + LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false), + LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)}; + + if (count == 1) + return LLVMBuildExtractElement(ctx->builder, value, masks[0], + ""); + + LLVMValueRef swizzle = LLVMConstVector(masks, count); + return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); +} + +static LLVMValueRef +build_gather_values_extended(struct nir_to_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + bool load) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef vec; + unsigned i; + + + if (value_count == 1) { + if (load) + return LLVMBuildLoad(builder, values[0], ""); + return values[0]; + } + + for (i = 0; i < value_count; i++) { + LLVMValueRef value = values[i * value_stride]; + if (load) + value = LLVMBuildLoad(builder, value, ""); + + if (!i) + vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); + LLVMValueRef index = LLVMConstInt(ctx->i32, i, false); + vec = LLVMBuildInsertElement(builder, vec, value, index, ""); + } + return vec; +} + + +static void +build_store_values_extended(struct nir_to_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + LLVMValueRef vec) +{ + LLVMBuilderRef builder = ctx->builder; + unsigned i; + + if (value_count == 1) { + LLVMBuildStore(builder, vec, values[0]); + return; + } + + for (i = 0; i < value_count; i++) { + LLVMValueRef ptr = values[i * value_stride]; + LLVMValueRef index = LLVMConstInt(ctx->i32, i, false); + LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, ""); + LLVMBuildStore(builder, value, ptr); + } +} + +static LLVMValueRef +build_gather_values(struct nir_to_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count) +{ + return build_gather_values_extended(ctx, values, value_count, 1, false); +} + +static LLVMTypeRef get_def_type(struct nir_to_llvm_context *ctx, + nir_ssa_def *def) +{ + LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, def->bit_size); + if (def->num_components > 1) { + type = LLVMVectorType(type, def->num_components); + } + return type; +} + +static LLVMValueRef get_src(struct nir_to_llvm_context *ctx, nir_src src) +{ + assert(src.is_ssa); + struct hash_entry *entry = _mesa_hash_table_search(ctx->defs, src.ssa); + return (LLVMValueRef)entry->data; +} + + +static LLVMBasicBlockRef get_block(struct nir_to_llvm_context *ctx, + struct nir_block *b) +{ + struct hash_entry *entry = _mesa_hash_table_search(ctx->defs, b); + return (LLVMBasicBlockRef)entry->data; +} + +static LLVMValueRef get_alu_src(struct nir_to_llvm_context *ctx, + nir_alu_src src, + unsigned num_components) +{ + LLVMValueRef value = get_src(ctx, src.src); + bool need_swizzle = false; + + assert(value); + LLVMTypeRef type = LLVMTypeOf(value); + unsigned src_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind + ? 
LLVMGetVectorSize(type) + : 1; + + for (unsigned i = 0; i < num_components; ++i) { + assert(src.swizzle[i] < src_components); + if (src.swizzle[i] != i) + need_swizzle = true; + } + + if (need_swizzle || num_components != src_components) { + LLVMValueRef masks[] = { + LLVMConstInt(ctx->i32, src.swizzle[0], false), + LLVMConstInt(ctx->i32, src.swizzle[1], false), + LLVMConstInt(ctx->i32, src.swizzle[2], false), + LLVMConstInt(ctx->i32, src.swizzle[3], false)}; + + if (src_components > 1 && num_components == 1) { + value = LLVMBuildExtractElement(ctx->builder, value, + masks[0], ""); + } else if (src_components == 1 && num_components > 1) { + LLVMValueRef values[] = {value, value, value, value}; + value = build_gather_values(ctx, values, num_components); + } else { + LLVMValueRef swizzle = LLVMConstVector(masks, num_components); + value = LLVMBuildShuffleVector(ctx->builder, value, value, + swizzle, ""); + } + } + assert(!src.negate); + assert(!src.abs); + return value; +} + +static LLVMValueRef emit_int_cmp(struct nir_to_llvm_context *ctx, + LLVMIntPredicate pred, LLVMValueRef src0, + LLVMValueRef src1) +{ + LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + LLVMConstInt(ctx->i32, 0, false), ""); +} + +static LLVMValueRef emit_float_cmp(struct nir_to_llvm_context *ctx, + LLVMRealPredicate pred, LLVMValueRef src0, + LLVMValueRef src1) +{ + LLVMValueRef result; + src0 = to_float(ctx, src0); + src1 = to_float(ctx, src1); + result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + LLVMConstInt(ctx->i32, 0, false), ""); +} + +static LLVMValueRef emit_intrin_1f_param(struct nir_to_llvm_context *ctx, + const char *intrin, + LLVMValueRef src0) +{ + LLVMValueRef params[] = { + to_float(ctx, src0), + }; + return emit_llvm_intrinsic(ctx, intrin, ctx->f32, params, 1, LLVMReadNoneAttribute); +} + +static LLVMValueRef emit_intrin_2f_param(struct nir_to_llvm_context *ctx, + const char *intrin, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef params[] = { + to_float(ctx, src0), + to_float(ctx, src1), + }; + return emit_llvm_intrinsic(ctx, intrin, ctx->f32, params, 2, LLVMReadNoneAttribute); +} + +static LLVMValueRef emit_intrin_3f_param(struct nir_to_llvm_context *ctx, + const char *intrin, + LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +{ + LLVMValueRef params[] = { + to_float(ctx, src0), + to_float(ctx, src1), + to_float(ctx, src2), + }; + return emit_llvm_intrinsic(ctx, intrin, ctx->f32, params, 3, LLVMReadNoneAttribute); +} + +static LLVMValueRef emit_bcsel(struct nir_to_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +{ + LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, + ctx->i32zero, ""); + return LLVMBuildSelect(ctx->builder, v, src1, src2, ""); +} + +static LLVMValueRef emit_find_lsb(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef params[2] = { + src0, + + /* The value of 1 means that ffs(x=0) = undef, so LLVM won't + * add special code to check for x=0. The reason is that + * the LLVM behavior for x=0 is different from what we + * need here. + * + * The hardware already implements the correct behavior. 
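 *
 * e.g. find_lsb(0x00000018) = 3, and for an input of 0 the hardware
 * result is the -1 that NIR expects.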
+ */ + LLVMConstInt(ctx->i32, 1, false), + }; + return emit_llvm_intrinsic(ctx, "llvm.cttz.i32", ctx->i32, params, 2, LLVMReadNoneAttribute); +} + +static LLVMValueRef emit_ifind_msb(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef msb = emit_llvm_intrinsic(ctx, "llvm.AMDGPU.flbit.i32", + ctx->i32, &src0, 1, + LLVMReadNoneAttribute); + + /* The HW returns the last bit index from MSB, but NIR wants + * the index from LSB. Invert it by doing "31 - msb". */ + msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), + msb, ""); + + LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); + LLVMValueRef cond = LLVMBuildOr(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + src0, ctx->i32zero, ""), + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + src0, all_ones, ""), ""); + + return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); +} + +static LLVMValueRef emit_ufind_msb(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef args[2] = { + src0, + ctx->i32one, + }; + LLVMValueRef msb = emit_llvm_intrinsic(ctx, "llvm.ctlz.i32", + ctx->i32, args, ARRAY_SIZE(args), + LLVMReadNoneAttribute); + + /* The HW returns the last bit index from MSB, but NIR wants + * the index from LSB. Invert it by doing "31 - msb". */ + msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), + msb, ""); + + return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, + ctx->i32zero, ""), + LLVMConstInt(ctx->i32, -1, true), msb, ""); +} + +static LLVMValueRef emit_minmax_int(struct nir_to_llvm_context *ctx, + LLVMIntPredicate pred, + LLVMValueRef src0, LLVMValueRef src1) +{ + return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, pred, src0, src1, ""), + src0, + src1, ""); + +} +static LLVMValueRef emit_iabs(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + return emit_minmax_int(ctx, LLVMIntSGT, src0, + LLVMBuildNeg(ctx->builder, src0, "")); +} + +static LLVMValueRef emit_fsign(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef cmp, val; + + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, ctx->f32zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, ctx->f32one, src0, ""); + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, ctx->f32zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(ctx->f32, -1.0), ""); + return val; +} + +static LLVMValueRef emit_isign(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef cmp, val; + + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, ctx->i32zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, ctx->i32one, src0, ""); + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, ctx->i32zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(ctx->i32, -1, true), ""); + return val; +} + +static LLVMValueRef emit_ffract(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + const char *intr = "llvm.floor.f32"; + LLVMValueRef fsrc0 = to_float(ctx, src0); + LLVMValueRef params[] = { + fsrc0, + }; + LLVMValueRef floor = emit_llvm_intrinsic(ctx, intr, + ctx->f32, params, 1, + LLVMReadNoneAttribute); + return LLVMBuildFSub(ctx->builder, fsrc0, floor, ""); +} + +static LLVMValueRef emit_uint_carry(struct nir_to_llvm_context *ctx, + const char *intrin, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMTypeRef ret_type; + LLVMTypeRef types[] = { ctx->i32, ctx->i1 }; + LLVMValueRef res; + LLVMValueRef params[] = { src0, src1 }; + ret_type = LLVMStructTypeInContext(ctx->context, types, + 2, true); + + res = 
emit_llvm_intrinsic(ctx, intrin, ret_type, + params, 2, LLVMReadNoneAttribute); + + res = LLVMBuildExtractValue(ctx->builder, res, 1, ""); + res = LLVMBuildZExt(ctx->builder, res, ctx->i32, ""); + return res; +} + +static LLVMValueRef emit_b2f(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + return LLVMBuildAnd(ctx->builder, src0, LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), ""); +} + +static LLVMValueRef emit_umul_high(struct nir_to_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef dst64, result; + src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; +} + +static LLVMValueRef emit_imul_high(struct nir_to_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef dst64, result; + src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; +} + +static LLVMValueRef emit_bitfield_extract(struct nir_to_llvm_context *ctx, + const char *intrin, + LLVMValueRef srcs[3]) +{ + LLVMValueRef result; + LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), ""); + result = emit_llvm_intrinsic(ctx, intrin, ctx->i32, srcs, 3, LLVMReadNoneAttribute); + + result = LLVMBuildSelect(ctx->builder, icond, srcs[0], result, ""); + return result; +} + +static LLVMValueRef emit_bitfield_insert(struct nir_to_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1, + LLVMValueRef src2, LLVMValueRef src3) +{ + LLVMValueRef bfi_args[3], result; + + bfi_args[0] = LLVMBuildShl(ctx->builder, + LLVMBuildSub(ctx->builder, + LLVMBuildShl(ctx->builder, + ctx->i32one, + src3, ""), + ctx->i32one, ""), + src2, ""); + bfi_args[1] = LLVMBuildShl(ctx->builder, src1, src2, ""); + bfi_args[2] = src0; + + LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src3, LLVMConstInt(ctx->i32, 32, false), ""); + + /* Calculate: + * (arg0 & arg1) | (~arg0 & arg2) = arg2 ^ (arg0 & (arg1 ^ arg2) + * Use the right-hand side, which the LLVM backend can convert to V_BFI. 
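+ *
+ * To make the identity concrete, take arg0 = 0x0000ff00 (the mask),
+ * arg1 = 0xaaaaaaaa (bits to insert) and arg2 = 0x12345678 (base):
+ *   (arg0 & arg1) | (~arg0 & arg2) = 0x0000aa00 | 0x12340078 = 0x1234aa78
+ *   arg2 ^ (arg0 & (arg1 ^ arg2))  = 0x12345678 ^ (0x0000ff00 & 0xb89efcd2)
+ *                                  = 0x12345678 ^ 0x0000fc00  = 0x1234aa78
+ * i.e. bits selected by arg0 come from arg1 and the rest from arg2.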
+ */ + result = LLVMBuildXor(ctx->builder, bfi_args[2], + LLVMBuildAnd(ctx->builder, bfi_args[0], + LLVMBuildXor(ctx->builder, bfi_args[1], bfi_args[2], ""), ""), ""); + + result = LLVMBuildSelect(ctx->builder, icond, src1, result, ""); + return result; +} + +static LLVMValueRef emit_pack_half_2x16(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); + int i; + LLVMValueRef comp[2]; + + src0 = to_float(ctx, src0); + comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32zero, ""); + comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32one, ""); + for (i = 0; i < 2; i++) { + comp[i] = LLVMBuildFPTrunc(ctx->builder, comp[i], ctx->f16, ""); + comp[i] = LLVMBuildBitCast(ctx->builder, comp[i], ctx->i16, ""); + comp[i] = LLVMBuildZExt(ctx->builder, comp[i], ctx->i32, ""); + } + + comp[1] = LLVMBuildShl(ctx->builder, comp[1], const16, ""); + comp[0] = LLVMBuildOr(ctx->builder, comp[0], comp[1], ""); + + return comp[0]; +} + +static LLVMValueRef emit_unpack_half_2x16(struct nir_to_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); + LLVMValueRef temps[2], result, val; + int i; + + for (i = 0; i < 2; i++) { + val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0; + val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, ""); + val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, ""); + temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, ""); + } + + result = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(ctx->v2f32), temps[0], + ctx->i32zero, ""); + result = LLVMBuildInsertElement(ctx->builder, result, temps[1], + ctx->i32one, ""); + return result; +} + +/** + * Set range metadata on an instruction. This can only be used on load and + * call instructions. If you know an instruction can only produce the values + * 0, 1, 2, you would do set_range_metadata(value, 0, 3); + * \p lo is the minimum value inclusive. + * \p hi is the maximum value exclusive. + */ +static void set_range_metadata(struct nir_to_llvm_context *ctx, + LLVMValueRef value, unsigned lo, unsigned hi) +{ + LLVMValueRef range_md, md_args[2]; + LLVMTypeRef type = LLVMTypeOf(value); + LLVMContextRef context = LLVMGetTypeContext(type); + + md_args[0] = LLVMConstInt(type, lo, false); + md_args[1] = LLVMConstInt(type, hi, false); + range_md = LLVMMDNodeInContext(context, md_args, 2); + LLVMSetMetadata(value, ctx->range_md_kind, range_md); +} + +static LLVMValueRef get_thread_id(struct nir_to_llvm_context *ctx) +{ + LLVMValueRef tid; + LLVMValueRef tid_args[2]; + tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false); + tid_args[1] = ctx->i32zero; + tid_args[1] = emit_llvm_intrinsic(ctx, + "llvm.amdgcn.mbcnt.lo", ctx->i32, + tid_args, 2, LLVMReadNoneAttribute); + + tid = emit_llvm_intrinsic(ctx, + "llvm.amdgcn.mbcnt.hi", ctx->i32, + tid_args, 2, LLVMReadNoneAttribute); + set_range_metadata(ctx, tid, 0, 64); + return tid; +} + +/* + * SI implements derivatives using the local data store (LDS) + * All writes to the LDS happen in all executing threads at + * the same time. TID is the Thread ID for the current + * thread and is a value between 0 and 63, representing + * the thread's position in the wavefront. + * + * For the pixel shader threads are grouped into quads of four pixels. 
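+ * Quad n therefore occupies the four consecutive lanes 4n .. 4n+3 of
+ * the 64-lane wavefront.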
+ * The TIDs of the pixels of a quad are: + * + * +------+------+ + * |4n + 0|4n + 1| + * +------+------+ + * |4n + 2|4n + 3| + * +------+------+ + * + * So, masking the TID with 0xfffffffc yields the TID of the top left pixel + * of the quad, masking with 0xfffffffd yields the TID of the top pixel of + * the current pixel's column, and masking with 0xfffffffe yields the TID + * of the left pixel of the current pixel's row. + * + * Adding 1 yields the TID of the pixel to the right of the left pixel, and + * adding 2 yields the TID of the pixel below the top pixel. + */ +/* masks for thread ID. */ +#define TID_MASK_TOP_LEFT 0xfffffffc +#define TID_MASK_TOP 0xfffffffd +#define TID_MASK_LEFT 0xfffffffe +static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx, + nir_alu_instr *instr, + LLVMValueRef src0) +{ + LLVMValueRef indices[2]; + LLVMValueRef store_ptr, load_ptr0, load_ptr1; + LLVMValueRef tl, trbl, result; + LLVMValueRef tl_tid, trbl_tid; + LLVMValueRef args[2]; + unsigned mask; + int idx; + ctx->has_ddxy = true; + if (!ctx->lds) + ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module, + LLVMArrayType(ctx->i32, 64), + "ddxy_lds", LOCAL_ADDR_SPACE); + + indices[0] = ctx->i32zero; + indices[1] = get_thread_id(ctx); + store_ptr = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + if (instr->op == nir_op_fddx_fine || instr->op == nir_op_fddx) + mask = TID_MASK_LEFT; + else if (instr->op == nir_op_fddy_fine || instr->op == nir_op_fddy) + mask = TID_MASK_TOP; + else + mask = TID_MASK_TOP_LEFT; + + tl_tid = LLVMBuildAnd(ctx->builder, indices[1], + LLVMConstInt(ctx->i32, mask, false), ""); + indices[1] = tl_tid; + load_ptr0 = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + /* for DDX we want to next X pixel, DDY next Y pixel. */ + if (instr->op == nir_op_fddx_fine || + instr->op == nir_op_fddx_coarse || + instr->op == nir_op_fddx) + idx = 1; + else + idx = 2; + + trbl_tid = LLVMBuildAdd(ctx->builder, indices[1], + LLVMConstInt(ctx->i32, idx, false), ""); + indices[1] = trbl_tid; + load_ptr1 = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + if (ctx->options->family >= CHIP_TONGA) { + args[0] = LLVMBuildMul(ctx->builder, tl_tid, + LLVMConstInt(ctx->i32, 4, false), ""); + args[1] = src0; + tl = emit_llvm_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", + ctx->i32, args, 2, + LLVMReadNoneAttribute); + + args[0] = LLVMBuildMul(ctx->builder, trbl_tid, + LLVMConstInt(ctx->i32, 4, false), ""); + trbl = emit_llvm_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", + ctx->i32, args, 2, + LLVMReadNoneAttribute); + } else { + LLVMBuildStore(ctx->builder, src0, store_ptr); + + tl = LLVMBuildLoad(ctx->builder, load_ptr0, ""); + trbl = LLVMBuildLoad(ctx->builder, load_ptr1, ""); + } + tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); + trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, ""); + result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); + return result; +} + +/* + * this takes an I,J coordinate pair, + * and works out the X and Y derivatives. + * it returns DDX(I), DDX(J), DDY(I), DDY(J). 
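+ *
+ * As a concrete example of the TID masks above: for the pixel in
+ * lane 7 (quad 1, position 4n + 3), 7 & TID_MASK_LEFT = 6 and
+ * 6 + 1 = 7, so DDX(I) = I(lane 7) - I(lane 6); likewise
+ * 7 & TID_MASK_TOP = 5 and 5 + 2 = 7, so DDY(I) = I(lane 7) - I(lane 5).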
+ */ +static LLVMValueRef emit_ddxy_interp( + struct nir_to_llvm_context *ctx, + LLVMValueRef interp_ij) +{ + LLVMValueRef indices[2]; + LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2; + LLVMValueRef tl, tr, bl, result[4]; + unsigned c; + + if (!ctx->lds) + ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module, + LLVMArrayType(ctx->i32, 64), + "ddxy_lds", LOCAL_ADDR_SPACE); + + indices[0] = ctx->i32zero; + indices[1] = get_thread_id(ctx); + store_ptr = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + temp = LLVMBuildAnd(ctx->builder, indices[1], + LLVMConstInt(ctx->i32, TID_MASK_LEFT, false), ""); + + temp2 = LLVMBuildAnd(ctx->builder, indices[1], + LLVMConstInt(ctx->i32, TID_MASK_TOP, false), ""); + + indices[1] = temp; + load_ptr_x = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + indices[1] = temp2; + load_ptr_y = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + indices[1] = LLVMBuildAdd(ctx->builder, temp, + LLVMConstInt(ctx->i32, 1, false), ""); + load_ptr_ddx = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + indices[1] = LLVMBuildAdd(ctx->builder, temp2, + LLVMConstInt(ctx->i32, 2, false), ""); + load_ptr_ddy = LLVMBuildGEP(ctx->builder, ctx->lds, + indices, 2, ""); + + for (c = 0; c < 2; ++c) { + LLVMValueRef store_val; + LLVMValueRef c_ll = LLVMConstInt(ctx->i32, c, false); + + store_val = LLVMBuildExtractElement(ctx->builder, + interp_ij, c_ll, ""); + LLVMBuildStore(ctx->builder, + store_val, + store_ptr); + + tl = LLVMBuildLoad(ctx->builder, load_ptr_x, ""); + tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); + + tr = LLVMBuildLoad(ctx->builder, load_ptr_ddx, ""); + tr = LLVMBuildBitCast(ctx->builder, tr, ctx->f32, ""); + + result[c] = LLVMBuildFSub(ctx->builder, tr, tl, ""); + + tl = LLVMBuildLoad(ctx->builder, load_ptr_y, ""); + tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); + + bl = LLVMBuildLoad(ctx->builder, load_ptr_ddy, ""); + bl = LLVMBuildBitCast(ctx->builder, bl, ctx->f32, ""); + + result[c + 2] = LLVMBuildFSub(ctx->builder, bl, tl, ""); + } + + return build_gather_values(ctx, result, 4); +} + +static LLVMValueRef emit_fdiv(struct nir_to_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef den) +{ + LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, ""); + + if (!LLVMIsConstant(ret)) + LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp); + return ret; +} + +static void visit_alu(struct nir_to_llvm_context *ctx, nir_alu_instr *instr) +{ + LLVMValueRef src[4], result = NULL; + unsigned num_components = instr->dest.dest.ssa.num_components; + unsigned src_components; + + assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src)); + switch (instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + src_components = 1; + break; + case nir_op_pack_half_2x16: + src_components = 2; + break; + case nir_op_unpack_half_2x16: + src_components = 1; + break; + default: + src_components = num_components; + break; + } + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = get_alu_src(ctx, instr->src[i], src_components); + + switch (instr->op) { + case nir_op_fmov: + case nir_op_imov: + result = src[0]; + break; + case nir_op_fneg: + src[0] = to_float(ctx, src[0]); + result = LLVMBuildFNeg(ctx->builder, src[0], ""); + break; + case nir_op_ineg: + result = LLVMBuildNeg(ctx->builder, src[0], ""); + break; + case nir_op_inot: + result = LLVMBuildNot(ctx->builder, src[0], ""); + break; + case nir_op_iadd: + result = 
LLVMBuildAdd(ctx->builder, src[0], src[1], ""); + break; + case nir_op_fadd: + src[0] = to_float(ctx, src[0]); + src[1] = to_float(ctx, src[1]); + result = LLVMBuildFAdd(ctx->builder, src[0], src[1], ""); + break; + case nir_op_fsub: + src[0] = to_float(ctx, src[0]); + src[1] = to_float(ctx, src[1]); + result = LLVMBuildFSub(ctx->builder, src[0], src[1], ""); + break; + case nir_op_isub: + result = LLVMBuildSub(ctx->builder, src[0], src[1], ""); + break; + case nir_op_imul: + result = LLVMBuildMul(ctx->builder, src[0], src[1], ""); + break; + case nir_op_imod: + result = LLVMBuildSRem(ctx->builder, src[0], src[1], ""); + break; + case nir_op_umod: + result = LLVMBuildURem(ctx->builder, src[0], src[1], ""); + break; + case nir_op_fmod: + src[0] = to_float(ctx, src[0]); + src[1] = to_float(ctx, src[1]); + result = emit_fdiv(ctx, src[0], src[1]); + result = emit_intrin_1f_param(ctx, "llvm.floor.f32", result); + result = LLVMBuildFMul(ctx->builder, src[1] , result, ""); + result = LLVMBuildFSub(ctx->builder, src[0], result, ""); + break; + case nir_op_frem: + src[0] = to_float(ctx, src[0]); + src[1] = to_float(ctx, src[1]); + result = LLVMBuildFRem(ctx->builder, src[0], src[1], ""); + break; + case nir_op_idiv: + result = LLVMBuildSDiv(ctx->builder, src[0], src[1], ""); + break; + case nir_op_udiv: + result = LLVMBuildUDiv(ctx->builder, src[0], src[1], ""); + break; + case nir_op_fmul: + src[0] = to_float(ctx, src[0]); + src[1] = to_float(ctx, src[1]); + result = LLVMBuildFMul(ctx->builder, src[0], src[1], ""); + break; + case nir_op_fdiv: + src[0] = to_float(ctx, src[0]); + src[1] = to_float(ctx, src[1]); + result = emit_fdiv(ctx, src[0], src[1]); + break; + case nir_op_frcp: + src[0] = to_float(ctx, src[0]); + result = emit_fdiv(ctx, ctx->f32one, src[0]); + break; + case nir_op_iand: + result = LLVMBuildAnd(ctx->builder, src[0], src[1], ""); + break; + case nir_op_ior: + result = LLVMBuildOr(ctx->builder, src[0], src[1], ""); + break; + case nir_op_ixor: + result = LLVMBuildXor(ctx->builder, src[0], src[1], ""); + break; + case nir_op_ishl: + result = LLVMBuildShl(ctx->builder, src[0], src[1], ""); + break; + case nir_op_ishr: + result = LLVMBuildAShr(ctx->builder, src[0], src[1], ""); + break; + case nir_op_ushr: + result = LLVMBuildLShr(ctx->builder, src[0], src[1], ""); + break; + case nir_op_ilt: + result = emit_int_cmp(ctx, LLVMIntSLT, src[0], src[1]); + break; + case nir_op_ine: + result = emit_int_cmp(ctx, LLVMIntNE, src[0], src[1]); + break; + case nir_op_ieq: + result = emit_int_cmp(ctx, LLVMIntEQ, src[0], src[1]); + break; + case nir_op_ige: + result = emit_int_cmp(ctx, LLVMIntSGE, src[0], src[1]); + break; + case nir_op_ult: + result = emit_int_cmp(ctx, LLVMIntULT, src[0], src[1]); + break; + case nir_op_uge: + result = emit_int_cmp(ctx, LLVMIntUGE, src[0], src[1]); + break; + case nir_op_feq: + result = emit_float_cmp(ctx, LLVMRealUEQ, src[0], src[1]); + break; + case nir_op_fne: + result = emit_float_cmp(ctx, LLVMRealUNE, src[0], src[1]); + break; + case nir_op_flt: + result = emit_float_cmp(ctx, LLVMRealULT, src[0], src[1]); + break; + case nir_op_fge: + result = emit_float_cmp(ctx, LLVMRealUGE, src[0], src[1]); + break; + case nir_op_fabs: + result = emit_intrin_1f_param(ctx, "llvm.fabs.f32", src[0]); + break; + case nir_op_iabs: + result = emit_iabs(ctx, src[0]); + break; + case nir_op_imax: + result = emit_minmax_int(ctx, LLVMIntSGT, src[0], src[1]); + break; + case nir_op_imin: + result = emit_minmax_int(ctx, LLVMIntSLT, src[0], src[1]); + break; + case nir_op_umax: + 
result = emit_minmax_int(ctx, LLVMIntUGT, src[0], src[1]); + break; + case nir_op_umin: + result = emit_minmax_int(ctx, LLVMIntULT, src[0], src[1]); + break; + case nir_op_isign: + result = emit_isign(ctx, src[0]); + break; + case nir_op_fsign: + src[0] = to_float(ctx, src[0]); + result = emit_fsign(ctx, src[0]); + break; + case nir_op_ffloor: + result = emit_intrin_1f_param(ctx, "llvm.floor.f32", src[0]); + break; + case nir_op_ftrunc: + result = emit_intrin_1f_param(ctx, "llvm.trunc.f32", src[0]); + break; + case nir_op_fceil: + result = emit_intrin_1f_param(ctx, "llvm.ceil.f32", src[0]); + break; + case nir_op_fround_even: + result = emit_intrin_1f_param(ctx, "llvm.rint.f32", src[0]); + break; + case nir_op_ffract: + result = emit_ffract(ctx, src[0]); + break; + case nir_op_fsin: + result = emit_intrin_1f_param(ctx, "llvm.sin.f32", src[0]); + break; + case nir_op_fcos: + result = emit_intrin_1f_param(ctx, "llvm.cos.f32", src[0]); + break; + case nir_op_fsqrt: + result = emit_intrin_1f_param(ctx, "llvm.sqrt.f32", src[0]); + break; + case nir_op_fexp2: + result = emit_intrin_1f_param(ctx, "llvm.exp2.f32", src[0]); + break; + case nir_op_flog2: + result = emit_intrin_1f_param(ctx, "llvm.log2.f32", src[0]); + break; + case nir_op_frsq: + result = emit_intrin_1f_param(ctx, "llvm.sqrt.f32", src[0]); + result = emit_fdiv(ctx, ctx->f32one, result); + break; + case nir_op_fpow: + result = emit_intrin_2f_param(ctx, "llvm.pow.f32", src[0], src[1]); + break; + case nir_op_fmax: + result = emit_intrin_2f_param(ctx, "llvm.maxnum.f32", src[0], src[1]); + break; + case nir_op_fmin: + result = emit_intrin_2f_param(ctx, "llvm.minnum.f32", src[0], src[1]); + break; + case nir_op_ffma: + result = emit_intrin_3f_param(ctx, "llvm.fma.f32", src[0], src[1], src[2]); + break; + case nir_op_ibitfield_extract: + result = emit_bitfield_extract(ctx, "llvm.AMDGPU.bfe.i32", src); + break; + case nir_op_ubitfield_extract: + result = emit_bitfield_extract(ctx, "llvm.AMDGPU.bfe.u32", src); + break; + case nir_op_bitfield_insert: + result = emit_bitfield_insert(ctx, src[0], src[1], src[2], src[3]); + break; + case nir_op_bitfield_reverse: + result = emit_llvm_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, src, 1, LLVMReadNoneAttribute); + break; + case nir_op_bit_count: + result = emit_llvm_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, src, 1, LLVMReadNoneAttribute); + break; + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = to_integer(ctx, src[i]); + result = build_gather_values(ctx, src, num_components); + break; + case nir_op_f2i: + src[0] = to_float(ctx, src[0]); + result = LLVMBuildFPToSI(ctx->builder, src[0], ctx->i32, ""); + break; + case nir_op_f2u: + src[0] = to_float(ctx, src[0]); + result = LLVMBuildFPToUI(ctx->builder, src[0], ctx->i32, ""); + break; + case nir_op_i2f: + result = LLVMBuildSIToFP(ctx->builder, src[0], ctx->f32, ""); + break; + case nir_op_u2f: + result = LLVMBuildUIToFP(ctx->builder, src[0], ctx->f32, ""); + break; + case nir_op_bcsel: + result = emit_bcsel(ctx, src[0], src[1], src[2]); + break; + case nir_op_find_lsb: + result = emit_find_lsb(ctx, src[0]); + break; + case nir_op_ufind_msb: + result = emit_ufind_msb(ctx, src[0]); + break; + case nir_op_ifind_msb: + result = emit_ifind_msb(ctx, src[0]); + break; + case nir_op_uadd_carry: + result = emit_uint_carry(ctx, "llvm.uadd.with.overflow.i32", src[0], src[1]); + break; + case nir_op_usub_borrow: + result = emit_uint_carry(ctx, 
"llvm.usub.with.overflow.i32", src[0], src[1]); + break; + case nir_op_b2f: + result = emit_b2f(ctx, src[0]); + break; + case nir_op_fquantize2f16: + src[0] = to_float(ctx, src[0]); + result = LLVMBuildFPTrunc(ctx->builder, src[0], ctx->f16, ""); + /* need to convert back up to f32 */ + result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, ""); + break; + case nir_op_umul_high: + result = emit_umul_high(ctx, src[0], src[1]); + break; + case nir_op_imul_high: + result = emit_imul_high(ctx, src[0], src[1]); + break; + case nir_op_pack_half_2x16: + result = emit_pack_half_2x16(ctx, src[0]); + break; + case nir_op_unpack_half_2x16: + result = emit_unpack_half_2x16(ctx, src[0]); + break; + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + result = emit_ddxy(ctx, instr, src[0]); + break; + default: + fprintf(stderr, "Unknown NIR alu instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + if (result) { + assert(instr->dest.dest.is_ssa); + result = to_integer(ctx, result); + _mesa_hash_table_insert(ctx->defs, &instr->dest.dest.ssa, + result); + } +} + +static void visit_load_const(struct nir_to_llvm_context *ctx, + nir_load_const_instr *instr) +{ + LLVMValueRef values[4], value = NULL; + LLVMTypeRef element_type = + LLVMIntTypeInContext(ctx->context, instr->def.bit_size); + + for (unsigned i = 0; i < instr->def.num_components; ++i) { + switch (instr->def.bit_size) { + case 32: + values[i] = LLVMConstInt(element_type, + instr->value.u32[i], false); + break; + case 64: + values[i] = LLVMConstInt(element_type, + instr->value.u64[i], false); + break; + default: + fprintf(stderr, + "unsupported nir load_const bit_size: %d\n", + instr->def.bit_size); + abort(); + } + } + if (instr->def.num_components > 1) { + value = LLVMConstVector(values, instr->def.num_components); + } else + value = values[0]; + + _mesa_hash_table_insert(ctx->defs, &instr->def, value); +} + +static LLVMValueRef cast_ptr(struct nir_to_llvm_context *ctx, LLVMValueRef ptr, + LLVMTypeRef type) +{ + int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + return LLVMBuildBitCast(ctx->builder, ptr, + LLVMPointerType(type, addr_space), ""); +} + +static LLVMValueRef +emit_llvm_intrinsic(struct nir_to_llvm_context *ctx, const char *name, + LLVMTypeRef return_type, LLVMValueRef *params, + unsigned param_count, LLVMAttribute attribs) +{ + LLVMValueRef function; + + function = LLVMGetNamedFunction(ctx->module, name); + if (!function) { + LLVMTypeRef param_types[32], function_type; + unsigned i; + + assert(param_count <= 32); + + for (i = 0; i < param_count; ++i) { + assert(params[i]); + param_types[i] = LLVMTypeOf(params[i]); + } + function_type = + LLVMFunctionType(return_type, param_types, param_count, 0); + function = LLVMAddFunction(ctx->module, name, function_type); + + LLVMSetFunctionCallConv(function, LLVMCCallConv); + LLVMSetLinkage(function, LLVMExternalLinkage); + + LLVMAddFunctionAttr(function, attribs | LLVMNoUnwindAttribute); + } + return LLVMBuildCall(ctx->builder, function, params, param_count, ""); +} + +static LLVMValueRef +get_buffer_size(struct nir_to_llvm_context *ctx, LLVMValueRef descriptor, bool in_elements) +{ + LLVMValueRef size = + LLVMBuildExtractElement(ctx->builder, descriptor, + LLVMConstInt(ctx->i32, 2, false), ""); + + /* VI only */ + if (ctx->options->chip_class >= VI && in_elements) { + /* On VI, the descriptor contains the size in bytes, + * but TXQ must return the 
size in elements. + * The stride is always non-zero for resources using TXQ. + */ + LLVMValueRef stride = + LLVMBuildExtractElement(ctx->builder, descriptor, + LLVMConstInt(ctx->i32, 1, false), ""); + stride = LLVMBuildLShr(ctx->builder, stride, + LLVMConstInt(ctx->i32, 16, false), ""); + stride = LLVMBuildAnd(ctx->builder, stride, + LLVMConstInt(ctx->i32, 0x3fff, false), ""); + + size = LLVMBuildUDiv(ctx->builder, size, stride, ""); + } + return size; +} + +/** + * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with + * intrinsic names). + */ +static void build_int_type_name( + LLVMTypeRef type, + char *buf, unsigned bufsize) +{ + assert(bufsize >= 6); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) + snprintf(buf, bufsize, "v%ui32", + LLVMGetVectorSize(type)); + else + strcpy(buf, "i32"); +} + +static LLVMValueRef radv_lower_gather4_integer(struct nir_to_llvm_context *ctx, + struct ac_tex_info *tinfo, + nir_tex_instr *instr, + const char *intr_name, + unsigned coord_vgpr_index) +{ + LLVMValueRef coord = tinfo->args[0]; + LLVMValueRef half_texel[2]; + int c; + + //TODO Rect + { + LLVMValueRef txq_args[10]; + int txq_arg_count = 0; + LLVMValueRef size; + bool da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0, false); + txq_args[txq_arg_count++] = tinfo->args[1]; + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0xf, 0); /* dmask */ + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0, 0); /* unorm */ + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0, 0); /* r128 */ + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, da ? 1 : 0, 0); + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0, 0); /* glc */ + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0, 0); /* slc */ + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0, 0); /* tfe */ + txq_args[txq_arg_count++] = LLVMConstInt(ctx->i32, 0, 0); /* lwe */ + size = emit_llvm_intrinsic(ctx, "llvm.SI.getresinfo.i32", ctx->v4i32, + txq_args, txq_arg_count, + LLVMReadNoneAttribute); + + for (c = 0; c < 2; c++) { + half_texel[c] = LLVMBuildExtractElement(ctx->builder, size, + ctx->i32zero, ""); + half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, ""); + half_texel[c] = emit_fdiv(ctx, ctx->f32one, half_texel[c]); + half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c], + LLVMConstReal(ctx->f32, -0.5), ""); + } + } + + for (c = 0; c < 2; c++) { + LLVMValueRef tmp; + LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0); + tmp = LLVMBuildExtractElement(ctx->builder, coord, index, ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, ""); + tmp = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, ""); + coord = LLVMBuildInsertElement(ctx->builder, coord, tmp, index, ""); + } + + tinfo->args[0] = coord; + return emit_llvm_intrinsic(ctx, intr_name, tinfo->dst_type, tinfo->args, tinfo->arg_count, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + +} + +static LLVMValueRef build_tex_intrinsic(struct nir_to_llvm_context *ctx, + nir_tex_instr *instr, + struct ac_tex_info *tinfo) +{ + const char *name = "llvm.SI.image.sample"; + const char *infix = ""; + char intr_name[127]; + char type[64]; + bool is_shadow = instr->is_shadow; + bool has_offset = tinfo->has_offset; + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_samples_identical: + name = instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? 
"llvm.SI.image.load" : + instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? "llvm.SI.vs.load.input" : + "llvm.SI.image.load.mip"; + is_shadow = false; + has_offset = false; + break; + case nir_texop_txb: + infix = ".b"; + break; + case nir_texop_txl: + infix = ".l"; + break; + case nir_texop_txs: + name = "llvm.SI.getresinfo"; + break; + case nir_texop_query_levels: + name = "llvm.SI.getresinfo"; + break; + case nir_texop_tex: + if (ctx->stage != MESA_SHADER_FRAGMENT) + infix = ".lz"; + break; + case nir_texop_txd: + infix = ".d"; + break; + case nir_texop_tg4: + name = "llvm.SI.gather4"; + infix = ".lz"; + break; + case nir_texop_lod: + name = "llvm.SI.getlod"; + is_shadow = false; + has_offset = false; + break; + default: + break; + } + + build_int_type_name(LLVMTypeOf(tinfo->args[0]), type, sizeof(type)); + sprintf(intr_name, "%s%s%s%s.%s", name, is_shadow ? ".c" : "", infix, + has_offset ? ".o" : "", type); + + if (instr->op == nir_texop_tg4) { + enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type); + if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) { + return radv_lower_gather4_integer(ctx, tinfo, instr, intr_name, + (int)has_offset + (int)is_shadow); + } + } + return emit_llvm_intrinsic(ctx, intr_name, tinfo->dst_type, tinfo->args, tinfo->arg_count, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + +} + +static LLVMValueRef visit_vulkan_resource_index(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef index = get_src(ctx, instr->src[0]); + unsigned desc_set = nir_intrinsic_desc_set(instr); + unsigned binding = nir_intrinsic_binding(instr); + LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set]; + struct radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout; + unsigned base_offset = layout->binding[binding].offset; + LLVMValueRef offset, stride; + + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { + desc_ptr = ctx->push_constants; + base_offset = ctx->options->layout->push_constant_size; + base_offset += 16 * layout->binding[binding].dynamic_offset_offset; + stride = LLVMConstInt(ctx->i32, 16, false); + } else + stride = LLVMConstInt(ctx->i32, layout->binding[binding].size, false); + + offset = LLVMConstInt(ctx->i32, base_offset, false); + index = LLVMBuildMul(ctx->builder, index, stride, ""); + offset = LLVMBuildAdd(ctx->builder, offset, index, ""); + + LLVMValueRef indices[] = {ctx->i32zero, offset}; + desc_ptr = LLVMBuildGEP(ctx->builder, desc_ptr, indices, 2, ""); + desc_ptr = cast_ptr(ctx, desc_ptr, ctx->v4i32); + LLVMSetMetadata(desc_ptr, ctx->uniform_md_kind, ctx->empty_md); + + return LLVMBuildLoad(ctx->builder, desc_ptr, ""); +} + +static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef ptr; + + LLVMValueRef indices[] = {ctx->i32zero, get_src(ctx, instr->src[0])}; + ptr = LLVMBuildGEP(ctx->builder, ctx->push_constants, indices, 2, ""); + ptr = cast_ptr(ctx, ptr, get_def_type(ctx, &instr->dest.ssa)); + + return LLVMBuildLoad(ctx->builder, ptr, ""); +} + +static LLVMValueRef visit_get_buffer_size(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef desc = get_src(ctx, instr->src[0]); + + return get_buffer_size(ctx, desc, false); +} +static void visit_store_ssbo(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + const char *store_name; + LLVMTypeRef data_type = 
ctx->f32; + unsigned writemask = nir_intrinsic_write_mask(instr); + LLVMValueRef base_data, base_offset; + LLVMValueRef params[6]; + + if (ctx->stage == MESA_SHADER_FRAGMENT) + ctx->shader_info->fs.writes_memory = true; + + params[1] = get_src(ctx, instr->src[1]); + params[2] = LLVMConstInt(ctx->i32, 0, false); /* vindex */ + params[4] = LLVMConstInt(ctx->i1, 0, false); /* glc */ + params[5] = LLVMConstInt(ctx->i1, 0, false); /* slc */ + + if (instr->num_components > 1) + data_type = LLVMVectorType(ctx->f32, instr->num_components); + + base_data = to_float(ctx, get_src(ctx, instr->src[0])); + base_data = trim_vector(ctx, base_data, instr->num_components); + base_data = LLVMBuildBitCast(ctx->builder, base_data, + data_type, ""); + base_offset = get_src(ctx, instr->src[2]); /* voffset */ + while (writemask) { + int start, count; + LLVMValueRef data; + LLVMValueRef offset; + LLVMValueRef tmp; + u_bit_scan_consecutive_range(&writemask, &start, &count); + + /* Due to an LLVM limitation, split 3-element writes + * into a 2-element and a 1-element write. */ + if (count == 3) { + writemask |= 1 << (start + 2); + count = 2; + } + + if (count == 4) { + store_name = "llvm.amdgcn.buffer.store.v4f32"; + data = base_data; + } else if (count == 2) { + tmp = LLVMBuildExtractElement(ctx->builder, + base_data, LLVMConstInt(ctx->i32, start, false), ""); + data = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(ctx->v2f32), tmp, + ctx->i32zero, ""); + + tmp = LLVMBuildExtractElement(ctx->builder, + base_data, LLVMConstInt(ctx->i32, start + 1, false), ""); + data = LLVMBuildInsertElement(ctx->builder, data, tmp, + ctx->i32one, ""); + store_name = "llvm.amdgcn.buffer.store.v2f32"; + + } else { + assert(count == 1); + if (get_llvm_num_components(base_data) > 1) + data = LLVMBuildExtractElement(ctx->builder, base_data, + LLVMConstInt(ctx->i32, start, false), ""); + else + data = base_data; + store_name = "llvm.amdgcn.buffer.store.f32"; + } + + offset = base_offset; + if (start != 0) { + offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, start * 4, false), ""); + } + params[0] = data; + params[3] = offset; + emit_llvm_intrinsic(ctx, store_name, + LLVMVoidTypeInContext(ctx->context), params, 6, 0); + } +} + +static LLVMValueRef visit_atomic_ssbo(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + const char *name; + LLVMValueRef params[5]; + int arg_count = 0; + if (ctx->stage == MESA_SHADER_FRAGMENT) + ctx->shader_info->fs.writes_memory = true; + + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { + params[arg_count++] = get_src(ctx, instr->src[3]); + } + params[arg_count++] = get_src(ctx, instr->src[2]); + params[arg_count++] = get_src(ctx, instr->src[0]); + params[arg_count++] = LLVMConstInt(ctx->i32, 0, false); /* vindex */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = LLVMConstInt(ctx->i1, 0, false); /* slc */ + + switch (instr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + name = "llvm.amdgcn.buffer.atomic.add"; + break; + case nir_intrinsic_ssbo_atomic_imin: + name = "llvm.amdgcn.buffer.atomic.smin"; + break; + case nir_intrinsic_ssbo_atomic_umin: + name = "llvm.amdgcn.buffer.atomic.umin"; + break; + case nir_intrinsic_ssbo_atomic_imax: + name = "llvm.amdgcn.buffer.atomic.smax"; + break; + case nir_intrinsic_ssbo_atomic_umax: + name = "llvm.amdgcn.buffer.atomic.umax"; + break; + case nir_intrinsic_ssbo_atomic_and: + name = "llvm.amdgcn.buffer.atomic.and"; + break; + case nir_intrinsic_ssbo_atomic_or: + name = 
"llvm.amdgcn.buffer.atomic.or"; + break; + case nir_intrinsic_ssbo_atomic_xor: + name = "llvm.amdgcn.buffer.atomic.xor"; + break; + case nir_intrinsic_ssbo_atomic_exchange: + name = "llvm.amdgcn.buffer.atomic.swap"; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + name = "llvm.amdgcn.buffer.atomic.cmpswap"; + break; + default: + abort(); + } + + return emit_llvm_intrinsic(ctx, name, ctx->i32, params, arg_count, 0); +} + +static LLVMValueRef visit_load_buffer(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + const char *load_name; + LLVMTypeRef data_type = ctx->f32; + if (instr->num_components == 3) + data_type = LLVMVectorType(ctx->f32, 4); + else if (instr->num_components > 1) + data_type = LLVMVectorType(ctx->f32, instr->num_components); + + if (instr->num_components == 4 || instr->num_components == 3) + load_name = "llvm.amdgcn.buffer.load.v4f32"; + else if (instr->num_components == 2) + load_name = "llvm.amdgcn.buffer.load.v2f32"; + else if (instr->num_components == 1) + load_name = "llvm.amdgcn.buffer.load.f32"; + else + abort(); + + LLVMValueRef params[] = { + get_src(ctx, instr->src[0]), + LLVMConstInt(ctx->i32, 0, false), + get_src(ctx, instr->src[1]), + LLVMConstInt(ctx->i1, 0, false), + LLVMConstInt(ctx->i1, 0, false), + }; + + LLVMValueRef ret = + emit_llvm_intrinsic(ctx, load_name, data_type, params, 5, 0); + + if (instr->num_components == 3) + ret = trim_vector(ctx, ret, 3); + + return LLVMBuildBitCast(ctx->builder, ret, + get_def_type(ctx, &instr->dest.ssa), ""); +} + +static void +radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref *tail, + bool vs_in, unsigned *const_out, LLVMValueRef *indir_out) +{ + unsigned const_offset = 0; + LLVMValueRef offset = NULL; + + + while (tail->child != NULL) { + const struct glsl_type *parent_type = tail->type; + tail = tail->child; + + if (tail->deref_type == nir_deref_type_array) { + nir_deref_array *deref_array = nir_deref_as_array(tail); + LLVMValueRef index, stride, local_offset; + unsigned size = glsl_count_attribute_slots(tail->type, vs_in); + + const_offset += size * deref_array->base_offset; + if (deref_array->deref_array_type == nir_deref_array_type_direct) + continue; + + assert(deref_array->deref_array_type == nir_deref_array_type_indirect); + index = get_src(ctx, deref_array->indirect); + stride = LLVMConstInt(ctx->i32, size, 0); + local_offset = LLVMBuildMul(ctx->builder, stride, index, ""); + + if (offset) + offset = LLVMBuildAdd(ctx->builder, offset, local_offset, ""); + else + offset = local_offset; + } else if (tail->deref_type == nir_deref_type_struct) { + nir_deref_struct *deref_struct = nir_deref_as_struct(tail); + + for (unsigned i = 0; i < deref_struct->index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + const_offset += glsl_count_attribute_slots(ft, vs_in); + } + } else + unreachable("unsupported deref type"); + + } + + if (const_offset && offset) + offset = LLVMBuildAdd(ctx->builder, offset, + LLVMConstInt(ctx->i32, const_offset, 0), + ""); + + *const_out = const_offset; + *indir_out = offset; +} + +static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef values[4]; + int idx = instr->variables[0]->var->data.driver_location; + int ve = instr->dest.ssa.num_components; + LLVMValueRef indir_index; + unsigned const_index; + switch (instr->variables[0]->var->data.mode) { + case nir_var_shader_in: + radv_get_deref_offset(ctx, &instr->variables[0]->deref, + ctx->stage == MESA_SHADER_VERTEX, + 
&const_index, &indir_index); + for (unsigned chan = 0; chan < ve; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + instr->variables[0]->var->type, + ctx->stage == MESA_SHADER_VERTEX); + LLVMValueRef tmp_vec = build_gather_values_extended( + ctx, ctx->inputs + idx + chan, count, + 4, false); + + values[chan] = LLVMBuildExtractElement(ctx->builder, + tmp_vec, + indir_index, ""); + } else + values[chan] = ctx->inputs[idx + chan + const_index * 4]; + } + return to_integer(ctx, build_gather_values(ctx, values, ve)); + break; + case nir_var_local: + radv_get_deref_offset(ctx, &instr->variables[0]->deref, false, + &const_index, &indir_index); + for (unsigned chan = 0; chan < ve; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + instr->variables[0]->var->type, false); + LLVMValueRef tmp_vec = build_gather_values_extended( + ctx, ctx->locals + idx + chan, count, + 4, true); + + values[chan] = LLVMBuildExtractElement(ctx->builder, + tmp_vec, + indir_index, ""); + } else { + values[chan] = LLVMBuildLoad(ctx->builder, ctx->locals[idx + chan + const_index * 4], ""); + } + } + return to_integer(ctx, build_gather_values(ctx, values, ve)); + case nir_var_shader_out: + radv_get_deref_offset(ctx, &instr->variables[0]->deref, false, + &const_index, &indir_index); + for (unsigned chan = 0; chan < ve; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + instr->variables[0]->var->type, false); + LLVMValueRef tmp_vec = build_gather_values_extended( + ctx, ctx->outputs + idx + chan, count, + 4, true); + + values[chan] = LLVMBuildExtractElement(ctx->builder, + tmp_vec, + indir_index, ""); + } else { + values[chan] = LLVMBuildLoad(ctx->builder, + ctx->outputs[idx + chan + const_index * 4], + ""); + } + } + return to_integer(ctx, build_gather_values(ctx, values, ve)); + case nir_var_shared: { + radv_get_deref_offset(ctx, &instr->variables[0]->deref, false, + &const_index, &indir_index); + LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32); + LLVMValueRef derived_ptr; + LLVMValueRef index = ctx->i32zero; + if (indir_index) + index = LLVMBuildAdd(ctx->builder, index, indir_index, ""); + derived_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""); + + return to_integer(ctx, LLVMBuildLoad(ctx->builder, derived_ptr, "")); + break; + } + default: + break; + } + return NULL; +} + +static void +visit_store_var(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef temp_ptr, value; + int idx = instr->variables[0]->var->data.driver_location; + LLVMValueRef src = to_float(ctx, get_src(ctx, instr->src[0])); + int writemask = instr->const_index[0]; + LLVMValueRef indir_index; + unsigned const_index; + switch (instr->variables[0]->var->data.mode) { + case nir_var_shader_out: + radv_get_deref_offset(ctx, &instr->variables[0]->deref, false, + &const_index, &indir_index); + for (unsigned chan = 0; chan < 4; chan++) { + int stride = 4; + if (!(writemask & (1 << chan))) + continue; + if (get_llvm_num_components(src) == 1) + value = src; + else + value = LLVMBuildExtractElement(ctx->builder, src, + LLVMConstInt(ctx->i32, + chan, false), + ""); + + if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 || + instr->variables[0]->var->data.location == VARYING_SLOT_CULL_DIST0) + stride = 1; + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + instr->variables[0]->var->type, false); + LLVMValueRef tmp_vec = build_gather_values_extended( + ctx, ctx->outputs + idx + chan, count, + 
stride, true); + + if (get_llvm_num_components(tmp_vec) > 1) { + tmp_vec = LLVMBuildInsertElement(ctx->builder, tmp_vec, + value, indir_index, ""); + } else + tmp_vec = value; + build_store_values_extended(ctx, ctx->outputs + idx + chan, + count, stride, tmp_vec); + + } else { + temp_ptr = ctx->outputs[idx + chan + const_index * stride]; + + LLVMBuildStore(ctx->builder, value, temp_ptr); + } + } + break; + case nir_var_local: + radv_get_deref_offset(ctx, &instr->variables[0]->deref, false, + &const_index, &indir_index); + for (unsigned chan = 0; chan < 4; chan++) { + if (!(writemask & (1 << chan))) + continue; + + if (get_llvm_num_components(src) == 1) + value = src; + else + value = LLVMBuildExtractElement(ctx->builder, src, + LLVMConstInt(ctx->i32, chan, false), ""); + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + instr->variables[0]->var->type, false); + LLVMValueRef tmp_vec = build_gather_values_extended( + ctx, ctx->locals + idx + chan, count, + 4, true); + + tmp_vec = LLVMBuildInsertElement(ctx->builder, tmp_vec, + value, indir_index, ""); + build_store_values_extended(ctx, ctx->locals + idx + chan, + count, 4, tmp_vec); + } else { + temp_ptr = ctx->locals[idx + chan + const_index * 4]; + + LLVMBuildStore(ctx->builder, value, temp_ptr); + } + } + break; + case nir_var_shared: { + LLVMValueRef ptr; + radv_get_deref_offset(ctx, &instr->variables[0]->deref, false, + &const_index, &indir_index); + + ptr = get_shared_memory_ptr(ctx, idx, ctx->i32); + LLVMValueRef index = ctx->i32zero; + LLVMValueRef derived_ptr; + + if (indir_index) + index = LLVMBuildAdd(ctx->builder, index, indir_index, ""); + derived_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""); + LLVMBuildStore(ctx->builder, + to_integer(ctx, src), derived_ptr); + break; + } + default: + break; + } +} + +static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_BUF: + return 1; + case GLSL_SAMPLER_DIM_1D: + return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: + return array ? 
3 : 2; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + return 3; + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_SUBPASS: + return 2; + default: + break; + } + return 0; +} + +static LLVMValueRef get_image_coords(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr, bool add_frag_pos) +{ + const struct glsl_type *type = instr->variables[0]->var->type; + if(instr->variables[0]->deref.child) + type = instr->variables[0]->deref.child->type; + + LLVMValueRef src0 = get_src(ctx, instr->src[0]); + LLVMValueRef coords[4]; + LLVMValueRef masks[] = { + LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false), + LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false), + }; + LLVMValueRef res; + int count; + count = image_type_to_components_count(glsl_get_sampler_dim(type), + glsl_sampler_type_is_array(type)); + + if (count == 1) { + if (instr->src[0].ssa->num_components) + res = LLVMBuildExtractElement(ctx->builder, src0, masks[0], ""); + else + res = src0; + } else { + int chan; + for (chan = 0; chan < count; ++chan) { + coords[chan] = LLVMBuildExtractElement(ctx->builder, src0, masks[chan], ""); + } + + if (add_frag_pos) { + for (chan = 0; chan < count; ++chan) + coords[chan] = LLVMBuildAdd(ctx->builder, coords[chan], LLVMBuildFPToUI(ctx->builder, ctx->frag_pos[chan], ctx->i32, ""), ""); + } + if (count == 3) { + coords[3] = LLVMGetUndef(ctx->i32); + count = 4; + } + res = build_gather_values(ctx, coords, count); + } + return res; +} + +static LLVMValueRef visit_image_load(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef params[7]; + LLVMValueRef res; + char intrinsic_name[32]; + char coords_type[8]; + const nir_variable *var = instr->variables[0]->var; + const struct glsl_type *type = var->type; + if(instr->variables[0]->deref.child) + type = instr->variables[0]->deref.child->type; + + type = glsl_without_array(type); + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { + params[0] = get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER); + params[1] = LLVMBuildExtractElement(ctx->builder, get_src(ctx, instr->src[0]), + LLVMConstInt(ctx->i32, 0, false), ""); /* vindex */ + params[2] = LLVMConstInt(ctx->i32, 0, false); /* voffset */ + params[3] = LLVMConstInt(ctx->i1, 0, false); /* glc */ + params[4] = LLVMConstInt(ctx->i1, 0, false); /* slc */ + res = emit_llvm_intrinsic(ctx, "llvm.amdgcn.buffer.load.format.v4f32", ctx->v4f32, + params, 5, 0); + + res = trim_vector(ctx, res, instr->dest.ssa.num_components); + res = to_integer(ctx, res); + } else { + bool da = glsl_sampler_type_is_array(type) || + glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE; + bool add_frag_pos = glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_SUBPASS; + + params[0] = get_image_coords(ctx, instr, add_frag_pos); + params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE); + params[2] = LLVMConstInt(ctx->i32, 15, false); /* dmask */ + params[3] = LLVMConstInt(ctx->i1, 0, false); /* r128 */ + params[4] = da ? 
ctx->i32one : ctx->i32zero; /* da */ + params[5] = LLVMConstInt(ctx->i1, 0, false); /* glc */ + params[6] = LLVMConstInt(ctx->i1, 0, false); /* slc */ + + build_int_type_name(LLVMTypeOf(params[0]), + coords_type, sizeof(coords_type)); + + snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.image.load.%s", coords_type); + res = emit_llvm_intrinsic(ctx, intrinsic_name, ctx->v4f32, + params, 7, LLVMReadOnlyAttribute); + } + return to_integer(ctx, res); +} + +static void visit_image_store(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef params[8]; + char intrinsic_name[32]; + char coords_type[8]; + const nir_variable *var = instr->variables[0]->var; + LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); + LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); + const struct glsl_type *type = glsl_without_array(var->type); + + if (ctx->stage == MESA_SHADER_FRAGMENT) + ctx->shader_info->fs.writes_memory = true; + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { + params[0] = to_float(ctx, get_src(ctx, instr->src[2])); /* data */ + params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER); + params[2] = LLVMBuildExtractElement(ctx->builder, get_src(ctx, instr->src[0]), + LLVMConstInt(ctx->i32, 0, false), ""); /* vindex */ + params[3] = LLVMConstInt(ctx->i32, 0, false); /* voffset */ + params[4] = i1false; /* glc */ + params[5] = i1false; /* slc */ + emit_llvm_intrinsic(ctx, "llvm.amdgcn.buffer.store.format.v4f32", ctx->voidt, + params, 6, 0); + } else { + bool da = glsl_sampler_type_is_array(type) || + glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE; + + params[0] = get_src(ctx, instr->src[2]); /* coords */ + params[1] = get_image_coords(ctx, instr, false); + params[2] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE); + params[3] = LLVMConstInt(ctx->i32, 15, false); /* dmask */ + params[4] = i1false; /* r128 */ + params[5] = da ? 
i1true : i1false; /* da */ + params[6] = i1false; /* glc */ + params[7] = i1false; /* slc */ + + build_int_type_name(LLVMTypeOf(params[1]), + coords_type, sizeof(coords_type)); + + snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.image.store.%s", coords_type); + emit_llvm_intrinsic(ctx, intrinsic_name, ctx->voidt, + params, 8, 0); + } + +} + +static LLVMValueRef visit_image_atomic(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef params[6]; + int param_count = 0; + const nir_variable *var = instr->variables[0]->var; + LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); + LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); + const char *base_name = "llvm.amdgcn.image.atomic"; + const char *atomic_name; + LLVMValueRef coords; + char intrinsic_name[32], coords_type[8]; + const struct glsl_type *type = glsl_without_array(var->type); + + if (ctx->stage == MESA_SHADER_FRAGMENT) + ctx->shader_info->fs.writes_memory = true; + + params[param_count++] = get_src(ctx, instr->src[2]); + if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) + params[param_count++] = get_src(ctx, instr->src[3]); + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { + params[param_count++] = get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER); + coords = params[param_count++] = LLVMBuildExtractElement(ctx->builder, get_src(ctx, instr->src[0]), + LLVMConstInt(ctx->i32, 0, false), ""); /* vindex */ + params[param_count++] = ctx->i32zero; /* voffset */ + params[param_count++] = i1false; /* glc */ + params[param_count++] = i1false; /* slc */ + } else { + bool da = glsl_sampler_type_is_array(type) || + glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE; + + coords = params[param_count++] = get_image_coords(ctx, instr, false); + params[param_count++] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE); + params[param_count++] = i1false; /* r128 */ + params[param_count++] = da ? 
i1true : i1false; /* da */ + params[param_count++] = i1false; /* slc */ + } + + switch (instr->intrinsic) { + case nir_intrinsic_image_atomic_add: + atomic_name = "add"; + break; + case nir_intrinsic_image_atomic_min: + atomic_name = "smin"; + break; + case nir_intrinsic_image_atomic_max: + atomic_name = "smax"; + break; + case nir_intrinsic_image_atomic_and: + atomic_name = "and"; + break; + case nir_intrinsic_image_atomic_or: + atomic_name = "or"; + break; + case nir_intrinsic_image_atomic_xor: + atomic_name = "xor"; + break; + case nir_intrinsic_image_atomic_exchange: + atomic_name = "swap"; + break; + case nir_intrinsic_image_atomic_comp_swap: + atomic_name = "cmpswap"; + break; + default: + abort(); + } + build_int_type_name(LLVMTypeOf(coords), + coords_type, sizeof(coords_type)); + + snprintf(intrinsic_name, sizeof(intrinsic_name), + "%s.%s.%s", base_name, atomic_name, coords_type); + return emit_llvm_intrinsic(ctx, intrinsic_name, ctx->i32, params, param_count, 0); +} + +static LLVMValueRef visit_image_size(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef res; + LLVMValueRef params[10]; + const nir_variable *var = instr->variables[0]->var; + const struct glsl_type *type = instr->variables[0]->var->type; + bool da = glsl_sampler_type_is_array(var->type) || + glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_CUBE; + if(instr->variables[0]->deref.child) + type = instr->variables[0]->deref.child->type; + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) + return get_buffer_size(ctx, get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER), true); + params[0] = ctx->i32zero; + params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE); + params[2] = LLVMConstInt(ctx->i32, 15, false); + params[3] = ctx->i32zero; + params[4] = ctx->i32zero; + params[5] = da ? 
ctx->i32one : ctx->i32zero; + params[6] = ctx->i32zero; + params[7] = ctx->i32zero; + params[8] = ctx->i32zero; + params[9] = ctx->i32zero; + + res = emit_llvm_intrinsic(ctx, "llvm.SI.getresinfo.i32", ctx->v4i32, + params, 10, LLVMReadNoneAttribute); + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && + glsl_sampler_type_is_array(type)) { + LLVMValueRef two = LLVMConstInt(ctx->i32, 2, false); + LLVMValueRef six = LLVMConstInt(ctx->i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->builder, res, two, ""); + z = LLVMBuildSDiv(ctx->builder, z, six, ""); + res = LLVMBuildInsertElement(ctx->builder, res, z, two, ""); + } + return res; +} + +static void emit_waitcnt(struct nir_to_llvm_context *ctx) +{ + LLVMValueRef args[1] = { + LLVMConstInt(ctx->i32, 0xf70, false), + }; + emit_llvm_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", + ctx->voidt, args, 1, 0); +} + +static void emit_barrier(struct nir_to_llvm_context *ctx) +{ + // TODO tess + emit_llvm_intrinsic(ctx, "llvm.amdgcn.s.barrier", + ctx->voidt, NULL, 0, 0); +} + +static LLVMValueRef +visit_load_local_invocation_index(struct nir_to_llvm_context *ctx) +{ + LLVMValueRef result; + LLVMValueRef thread_id = get_thread_id(ctx); + result = LLVMBuildAnd(ctx->builder, ctx->tg_size, + LLVMConstInt(ctx->i32, 0xfc0, false), ""); + + return LLVMBuildAdd(ctx->builder, result, thread_id, ""); +} + +static LLVMValueRef visit_var_atomic(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef ptr, result; + int idx = instr->variables[0]->var->data.driver_location; + LLVMValueRef src = get_src(ctx, instr->src[0]); + ptr = get_shared_memory_ptr(ctx, idx, ctx->i32); + + if (instr->intrinsic == nir_intrinsic_var_atomic_comp_swap) { + LLVMValueRef src1 = get_src(ctx, instr->src[1]); + result = LLVMBuildAtomicCmpXchg(ctx->builder, + ptr, src, src1, + LLVMAtomicOrderingSequentiallyConsistent, + LLVMAtomicOrderingSequentiallyConsistent, + false); + } else { + LLVMAtomicRMWBinOp op; + switch (instr->intrinsic) { + case nir_intrinsic_var_atomic_add: + op = LLVMAtomicRMWBinOpAdd; + break; + case nir_intrinsic_var_atomic_umin: + op = LLVMAtomicRMWBinOpUMin; + break; + case nir_intrinsic_var_atomic_umax: + op = LLVMAtomicRMWBinOpUMax; + break; + case nir_intrinsic_var_atomic_imin: + op = LLVMAtomicRMWBinOpMin; + break; + case nir_intrinsic_var_atomic_imax: + op = LLVMAtomicRMWBinOpMax; + break; + case nir_intrinsic_var_atomic_and: + op = LLVMAtomicRMWBinOpAnd; + break; + case nir_intrinsic_var_atomic_or: + op = LLVMAtomicRMWBinOpOr; + break; + case nir_intrinsic_var_atomic_xor: + op = LLVMAtomicRMWBinOpXor; + break; + case nir_intrinsic_var_atomic_exchange: + op = LLVMAtomicRMWBinOpXchg; + break; + default: + return NULL; + } + + result = LLVMBuildAtomicRMW(ctx->builder, op, ptr, to_integer(ctx, src), + LLVMAtomicOrderingSequentiallyConsistent, + false); + } + return result; +} + +#define INTERP_CENTER 0 +#define INTERP_CENTROID 1 +#define INTERP_SAMPLE 2 + +static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx, + enum glsl_interp_mode interp, unsigned location) +{ + switch (interp) { + case INTERP_MODE_FLAT: + default: + return NULL; + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (location == INTERP_CENTER) + return ctx->persp_center; + else if (location == INTERP_CENTROID) + return ctx->persp_centroid; + else if (location == INTERP_SAMPLE) + return ctx->persp_sample; + break; + case INTERP_MODE_NOPERSPECTIVE: + if (location == INTERP_CENTER) + return ctx->linear_center; + else if (location == 
INTERP_CENTROID) + return ctx->linear_centroid; + else if (location == INTERP_SAMPLE) + return ctx->linear_sample; + break; + } + return NULL; +} + +static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx, + LLVMValueRef sample_id) +{ + /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ + LLVMValueRef offset0 = LLVMBuildMul(ctx->builder, sample_id, LLVMConstInt(ctx->i32, 8, false), ""); + LLVMValueRef offset1 = LLVMBuildAdd(ctx->builder, offset0, LLVMConstInt(ctx->i32, 4, false), ""); + LLVMValueRef result[2]; + + result[0] = build_indexed_load_const(ctx, ctx->sample_positions, offset0); + result[1] = build_indexed_load_const(ctx, ctx->sample_positions, offset1); + + return build_gather_values(ctx, result, 2); +} + +static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef result[2]; + LLVMValueRef interp_param, attr_number; + unsigned location; + unsigned chan; + LLVMValueRef src_c0, src_c1; + const char *intr_name; + LLVMValueRef src0; + int input_index = instr->variables[0]->var->data.location - VARYING_SLOT_VAR0; + switch (instr->intrinsic) { + case nir_intrinsic_interp_var_at_centroid: + location = INTERP_CENTROID; + break; + case nir_intrinsic_interp_var_at_sample: + case nir_intrinsic_interp_var_at_offset: + location = INTERP_SAMPLE; + src0 = get_src(ctx, instr->src[0]); + break; + default: + break; + } + + if (instr->intrinsic == nir_intrinsic_interp_var_at_offset) { + src_c0 = to_float(ctx, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32zero, "")); + src_c1 = to_float(ctx, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32one, "")); + } else if (instr->intrinsic == nir_intrinsic_interp_var_at_sample) { + LLVMValueRef sample_position; + LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f); + + /* fetch sample ID */ + sample_position = load_sample_position(ctx, src0); + + src_c0 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->i32zero, ""); + src_c0 = LLVMBuildFSub(ctx->builder, src_c0, halfval, ""); + src_c1 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->i32one, ""); + src_c1 = LLVMBuildFSub(ctx->builder, src_c1, halfval, ""); + } + interp_param = lookup_interp_param(ctx, instr->variables[0]->var->data.interpolation, location); + attr_number = LLVMConstInt(ctx->i32, input_index, false); + + if (location == INTERP_SAMPLE) { + LLVMValueRef ij_out[2]; + LLVMValueRef ddxy_out = emit_ddxy_interp(ctx, interp_param); + + /* + * take the I then J parameters, and the DDX/Y for it, and + * calculate the IJ inputs for the interpolator. 
+ * temp1 = ddx * offset/sample.x + I; + * interp_param.I = ddy * offset/sample.y + temp1; + * temp1 = ddx * offset/sample.x + J; + * interp_param.J = ddy * offset/sample.y + temp1; + */ + for (unsigned i = 0; i < 2; i++) { + LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, false); + LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, false); + LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->builder, + ddxy_out, ix_ll, ""); + LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->builder, + ddxy_out, iy_ll, ""); + LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->builder, + interp_param, ix_ll, ""); + LLVMValueRef temp1, temp2; + + interp_el = LLVMBuildBitCast(ctx->builder, interp_el, + ctx->f32, ""); + + temp1 = LLVMBuildFMul(ctx->builder, ddx_el, src_c0, ""); + temp1 = LLVMBuildFAdd(ctx->builder, temp1, interp_el, ""); + + temp2 = LLVMBuildFMul(ctx->builder, ddy_el, src_c1, ""); + temp2 = LLVMBuildFAdd(ctx->builder, temp2, temp1, ""); + + ij_out[i] = LLVMBuildBitCast(ctx->builder, + temp2, ctx->i32, ""); + } + interp_param = build_gather_values(ctx, ij_out, 2); + + } + intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; + for (chan = 0; chan < 2; chan++) { + LLVMValueRef args[4]; + LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, false); + + args[0] = llvm_chan; + args[1] = attr_number; + args[2] = ctx->prim_mask; + args[3] = interp_param; + result[chan] = emit_llvm_intrinsic(ctx, intr_name, + ctx->f32, args, args[3] ? 4 : 3, + LLVMReadNoneAttribute); + } + return build_gather_values(ctx, result, 2); +} + +static void visit_intrinsic(struct nir_to_llvm_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef result = NULL; + + switch (instr->intrinsic) { + case nir_intrinsic_load_work_group_id: { + result = ctx->workgroup_ids; + break; + } + case nir_intrinsic_load_base_vertex: { + result = ctx->base_vertex; + break; + } + case nir_intrinsic_load_vertex_id_zero_base: { + result = ctx->vertex_id; + break; + } + case nir_intrinsic_load_local_invocation_id: { + result = ctx->local_invocation_ids; + break; + } + case nir_intrinsic_load_base_instance: + result = ctx->start_instance; + break; + case nir_intrinsic_load_sample_id: + result = ctx->ancillary; + break; + case nir_intrinsic_load_front_face: + result = ctx->front_face; + break; + case nir_intrinsic_load_instance_id: + result = ctx->instance_id; + ctx->shader_info->vs.vgpr_comp_cnt = MAX2(3, + ctx->shader_info->vs.vgpr_comp_cnt); + break; + case nir_intrinsic_load_num_work_groups: + result = ctx->num_work_groups; + break; + case nir_intrinsic_load_local_invocation_index: + result = visit_load_local_invocation_index(ctx); + break; + case nir_intrinsic_load_push_constant: + result = visit_load_push_constant(ctx, instr); + break; + case nir_intrinsic_vulkan_resource_index: + result = visit_vulkan_resource_index(ctx, instr); + break; + case nir_intrinsic_store_ssbo: + visit_store_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ssbo: + result = visit_load_buffer(ctx, instr); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + result = visit_atomic_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ubo: + result = visit_load_buffer(ctx, instr); + break; + 
case nir_intrinsic_get_buffer_size: + result = visit_get_buffer_size(ctx, instr); + break; + case nir_intrinsic_load_var: + result = visit_load_var(ctx, instr); + break; + case nir_intrinsic_store_var: + visit_store_var(ctx, instr); + break; + case nir_intrinsic_image_load: + result = visit_image_load(ctx, instr); + break; + case nir_intrinsic_image_store: + visit_image_store(ctx, instr); + break; + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_min: + case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: + result = visit_image_atomic(ctx, instr); + break; + case nir_intrinsic_image_size: + result = visit_image_size(ctx, instr); + break; + case nir_intrinsic_discard: + ctx->shader_info->fs.can_discard = true; + emit_llvm_intrinsic(ctx, "llvm.AMDGPU.kilp", + LLVMVoidTypeInContext(ctx->context), + NULL, 0, 0); + break; + case nir_intrinsic_memory_barrier: + emit_waitcnt(ctx); + break; + case nir_intrinsic_barrier: + emit_barrier(ctx); + break; + case nir_intrinsic_var_atomic_add: + case nir_intrinsic_var_atomic_imin: + case nir_intrinsic_var_atomic_umin: + case nir_intrinsic_var_atomic_imax: + case nir_intrinsic_var_atomic_umax: + case nir_intrinsic_var_atomic_and: + case nir_intrinsic_var_atomic_or: + case nir_intrinsic_var_atomic_xor: + case nir_intrinsic_var_atomic_exchange: + case nir_intrinsic_var_atomic_comp_swap: + result = visit_var_atomic(ctx, instr); + break; + case nir_intrinsic_interp_var_at_centroid: + case nir_intrinsic_interp_var_at_sample: + case nir_intrinsic_interp_var_at_offset: + result = visit_interp(ctx, instr); + break; + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } + if (result) { + _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result); + } +} + +static LLVMValueRef get_sampler_desc(struct nir_to_llvm_context *ctx, + nir_deref_var *deref, + enum desc_type desc_type) +{ + unsigned desc_set = deref->var->data.descriptor_set; + LLVMValueRef list = ctx->descriptor_sets[desc_set]; + struct radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout; + struct radv_descriptor_set_binding_layout *binding = layout->binding + deref->var->data.binding; + unsigned offset = binding->offset; + unsigned stride = binding->size; + unsigned type_size; + LLVMBuilderRef builder = ctx->builder; + LLVMTypeRef type; + LLVMValueRef indices[2]; + LLVMValueRef index = NULL; + + assert(deref->var->data.binding < layout->binding_count); + + switch (desc_type) { + case DESC_IMAGE: + type = ctx->v8i32; + type_size = 32; + break; + case DESC_FMASK: + type = ctx->v8i32; + offset += 32; + type_size = 32; + break; + case DESC_SAMPLER: + type = ctx->v4i32; + if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + offset += 64; + + type_size = 16; + break; + case DESC_BUFFER: + type = ctx->v4i32; + type_size = 16; + break; + } + + if (deref->deref.child) { + nir_deref_array *child = (nir_deref_array*)deref->deref.child; + + assert(child->deref_array_type != nir_deref_array_type_wildcard); + offset += child->base_offset * stride; + if (child->deref_array_type == nir_deref_array_type_indirect) { + index = get_src(ctx, child->indirect); + } + } + + assert(stride % type_size == 0); + + if (!index) + index = ctx->i32zero; + + index = LLVMBuildMul(builder, index, 
LLVMConstInt(ctx->i32, stride / type_size, 0), ""); + indices[0] = ctx->i32zero; + indices[1] = LLVMConstInt(ctx->i32, offset, 0); + list = LLVMBuildGEP(builder, list, indices, 2, ""); + list = LLVMBuildPointerCast(builder, list, const_array(type, 0), ""); + + return build_indexed_load_const(ctx, list, index); +} + +static void set_tex_fetch_args(struct nir_to_llvm_context *ctx, + struct ac_tex_info *tinfo, + nir_tex_instr *instr, + nir_texop op, + LLVMValueRef res_ptr, LLVMValueRef samp_ptr, + LLVMValueRef *param, unsigned count, + unsigned dmask) +{ + int num_args; + unsigned is_rect = 0; + bool da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; + + if (op == nir_texop_lod) + da = false; + /* Pad to power of two vector */ + while (count < util_next_power_of_two(count)) + param[count++] = LLVMGetUndef(ctx->i32); + + if (count > 1) + tinfo->args[0] = build_gather_values(ctx, param, count); + else + tinfo->args[0] = param[0]; + + tinfo->args[1] = res_ptr; + num_args = 2; + + if (op == nir_texop_txf || + op == nir_texop_txf_ms || + op == nir_texop_query_levels || + op == nir_texop_texture_samples || + op == nir_texop_txs) + tinfo->dst_type = ctx->v4i32; + else { + tinfo->dst_type = ctx->v4f32; + tinfo->args[num_args++] = samp_ptr; + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF && op == nir_texop_txf) { + tinfo->args[0] = res_ptr; + tinfo->args[1] = LLVMConstInt(ctx->i32, 0, false); + tinfo->args[2] = param[0]; + tinfo->arg_count = 3; + return; + } + + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, dmask, 0); + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, is_rect, 0); /* unorm */ + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* r128 */ + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, da ? 1 : 0, 0); + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* glc */ + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* slc */ + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* tfe */ + tinfo->args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* lwe */ + + tinfo->arg_count = num_args; +} + +static void tex_fetch_ptrs(struct nir_to_llvm_context *ctx, + nir_tex_instr *instr, + LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, + LLVMValueRef *fmask_ptr) +{ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + *res_ptr = get_sampler_desc(ctx, instr->texture, DESC_BUFFER); + else + *res_ptr = get_sampler_desc(ctx, instr->texture, DESC_IMAGE); + if (samp_ptr) { + if (instr->sampler) + *samp_ptr = get_sampler_desc(ctx, instr->sampler, DESC_SAMPLER); + else + *samp_ptr = get_sampler_desc(ctx, instr->texture, DESC_SAMPLER); + } + if (fmask_ptr && !instr->sampler && instr->op == nir_texop_txf_ms) + *fmask_ptr = get_sampler_desc(ctx, instr->texture, DESC_FMASK); +} + +static LLVMValueRef build_cube_intrinsic(struct nir_to_llvm_context *ctx, + LLVMValueRef *in) +{ + + LLVMValueRef v, cube_vec; + + if (1) { + LLVMTypeRef f32 = LLVMTypeOf(in[0]); + LLVMValueRef out[4]; + + out[0] = emit_llvm_intrinsic(ctx, "llvm.amdgcn.cubetc", + f32, in, 3, LLVMReadNoneAttribute); + out[1] = emit_llvm_intrinsic(ctx, "llvm.amdgcn.cubesc", + f32, in, 3, LLVMReadNoneAttribute); + out[2] = emit_llvm_intrinsic(ctx, "llvm.amdgcn.cubema", + f32, in, 3, LLVMReadNoneAttribute); + out[3] = emit_llvm_intrinsic(ctx, "llvm.amdgcn.cubeid", + f32, in, 3, LLVMReadNoneAttribute); + + return build_gather_values(ctx, out, 4); + } else { + LLVMValueRef c[4]; + c[0] = in[0]; + c[1] = in[1]; + c[2] = in[2]; + c[3] = LLVMGetUndef(LLVMTypeOf(in[0])); + cube_vec = build_gather_values(ctx, c, 
4); + v = emit_llvm_intrinsic(ctx, "llvm.AMDGPU.cube", LLVMTypeOf(cube_vec), + &cube_vec, 1, LLVMReadNoneAttribute); + } + return v; +} + +static void cube_to_2d_coords(struct nir_to_llvm_context *ctx, + LLVMValueRef *in, LLVMValueRef *out) +{ + LLVMValueRef coords[4]; + LLVMValueRef mad_args[3]; + LLVMValueRef v; + LLVMValueRef tmp; + int i; + + v = build_cube_intrinsic(ctx, in); + for (i = 0; i < 4; i++) + coords[i] = LLVMBuildExtractElement(ctx->builder, v, + LLVMConstInt(ctx->i32, i, false), ""); + + coords[2] = emit_llvm_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, + &coords[2], 1, LLVMReadNoneAttribute); + coords[2] = emit_fdiv(ctx, ctx->f32one, coords[2]); + + mad_args[1] = coords[2]; + mad_args[2] = LLVMConstReal(ctx->f32, 1.5); + mad_args[0] = coords[0]; + + /* emit MAD */ + tmp = LLVMBuildFMul(ctx->builder, mad_args[0], mad_args[1], ""); + coords[0] = LLVMBuildFAdd(ctx->builder, tmp, mad_args[2], ""); + + mad_args[0] = coords[1]; + + /* emit MAD */ + tmp = LLVMBuildFMul(ctx->builder, mad_args[0], mad_args[1], ""); + coords[1] = LLVMBuildFAdd(ctx->builder, tmp, mad_args[2], ""); + + /* apply xyz = yxw swizzle to coords */ + out[0] = coords[1]; + out[1] = coords[0]; + out[2] = coords[3]; +} + +static void emit_prepare_cube_coords(struct nir_to_llvm_context *ctx, + LLVMValueRef *coords_arg, int num_coords, + bool is_deriv, + bool is_array, LLVMValueRef *derivs_arg) +{ + LLVMValueRef coords[4]; + int i; + cube_to_2d_coords(ctx, coords_arg, coords); + + if (is_deriv && derivs_arg) { + LLVMValueRef derivs[4]; + int axis; + + /* Convert cube derivatives to 2D derivatives. */ + for (axis = 0; axis < 2; axis++) { + LLVMValueRef shifted_cube_coords[4], shifted_coords[4]; + + /* Shift the cube coordinates by the derivatives to get + * the cube coordinates of the "neighboring pixel". + */ + for (i = 0; i < 3; i++) + shifted_cube_coords[i] = + LLVMBuildFAdd(ctx->builder, coords_arg[i], + derivs_arg[axis*3+i], ""); + shifted_cube_coords[3] = LLVMGetUndef(ctx->f32); + + /* Project the shifted cube coordinates onto the face. */ + cube_to_2d_coords(ctx, shifted_cube_coords, + shifted_coords); + + /* Subtract both sets of 2D coordinates to get 2D derivatives. + * This won't work if the shifted coordinates ended up + * in a different face. 
+ */ + for (i = 0; i < 2; i++) + derivs[axis * 2 + i] = + LLVMBuildFSub(ctx->builder, shifted_coords[i], + coords[i], ""); + } + + memcpy(derivs_arg, derivs, sizeof(derivs)); + } + + if (is_array) { + /* for cube arrays coord.z = coord.w(array_index) * 8 + face */ + /* coords_arg.w component - array_index for cube arrays */ + LLVMValueRef tmp = LLVMBuildFMul(ctx->builder, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), ""); + coords[2] = LLVMBuildFAdd(ctx->builder, tmp, coords[2], ""); + } + + memcpy(coords_arg, coords, sizeof(coords)); +} + +static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr) +{ + LLVMValueRef result = NULL; + struct ac_tex_info tinfo = { 0 }; + unsigned dmask = 0xf; + LLVMValueRef address[16]; + LLVMValueRef coords[5]; + LLVMValueRef coord = NULL, lod = NULL, comparitor = NULL, bias, offsets = NULL; + LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL, sample_index = NULL; + LLVMValueRef ddx = NULL, ddy = NULL; + LLVMValueRef derivs[6]; + unsigned chan, count = 0; + unsigned const_src = 0, num_deriv_comp = 0; + + tex_fetch_ptrs(ctx, instr, &res_ptr, &samp_ptr, &fmask_ptr); + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + coord = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_projector: + break; + case nir_tex_src_comparitor: + comparitor = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_offset: + offsets = get_src(ctx, instr->src[i].src); + const_src = i; + break; + case nir_tex_src_bias: + bias = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_lod: + lod = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ms_index: + sample_index = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ms_mcs: + break; + case nir_tex_src_ddx: + ddx = get_src(ctx, instr->src[i].src); + num_deriv_comp = instr->src[i].src.ssa->num_components; + break; + case nir_tex_src_ddy: + ddy = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_texture_offset: + case nir_tex_src_sampler_offset: + case nir_tex_src_plane: + default: + break; + } + } + + if (instr->op == nir_texop_texture_samples) { + LLVMValueRef res, samples; + res = LLVMBuildBitCast(ctx->builder, res_ptr, ctx->v8i32, ""); + samples = LLVMBuildExtractElement(ctx->builder, res, + LLVMConstInt(ctx->i32, 3, false), ""); + samples = LLVMBuildLShr(ctx->builder, samples, + LLVMConstInt(ctx->i32, 16, false), ""); + samples = LLVMBuildAnd(ctx->builder, samples, + LLVMConstInt(ctx->i32, 0xf, false), ""); + samples = LLVMBuildShl(ctx->builder, ctx->i32one, + samples, ""); + + result = samples; + goto write_result; + } + + if (coord) + for (chan = 0; chan < instr->coord_components; chan++) + coords[chan] = llvm_extract_elem(ctx, coord, chan); + + if (offsets && instr->op != nir_texop_txf) { + LLVMValueRef offset[3], pack; + for (chan = 0; chan < 3; ++chan) + offset[chan] = ctx->i32zero; + + tinfo.has_offset = true; + for (chan = 0; chan < get_llvm_num_components(offsets); chan++) { + offset[chan] = llvm_extract_elem(ctx, offsets, chan); + offset[chan] = LLVMBuildAnd(ctx->builder, offset[chan], + LLVMConstInt(ctx->i32, 0x3f, false), ""); + if (chan) + offset[chan] = LLVMBuildShl(ctx->builder, offset[chan], + LLVMConstInt(ctx->i32, chan * 8, false), ""); + } + pack = LLVMBuildOr(ctx->builder, offset[0], offset[1], ""); + pack = LLVMBuildOr(ctx->builder, pack, offset[2], ""); + address[count++] = pack; + + } + /* pack LOD bias value */ + if (instr->op == nir_texop_txb && bias) { + address[count++] 
= bias; + } + + /* Pack depth comparison value */ + if (instr->is_shadow && comparitor) { + address[count++] = llvm_extract_elem(ctx, comparitor, 0); + } + + /* pack derivatives */ + if (ddx || ddy) { + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + num_deriv_comp = 3; + break; + case GLSL_SAMPLER_DIM_2D: + default: + num_deriv_comp = 2; + break; + case GLSL_SAMPLER_DIM_1D: + num_deriv_comp = 1; + break; + } + + for (unsigned i = 0; i < num_deriv_comp; i++) { + derivs[i * 2] = to_float(ctx, llvm_extract_elem(ctx, ddx, i)); + derivs[i * 2 + 1] = to_float(ctx, llvm_extract_elem(ctx, ddy, i)); + } + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && coord) { + for (chan = 0; chan < instr->coord_components; chan++) + coords[chan] = to_float(ctx, coords[chan]); + if (instr->coord_components == 3) + coords[3] = LLVMGetUndef(ctx->f32); + emit_prepare_cube_coords(ctx, coords, instr->coord_components, instr->op == nir_texop_txd, instr->is_array, derivs); + if (num_deriv_comp) + num_deriv_comp--; + } + + if (ddx || ddy) { + for (unsigned i = 0; i < num_deriv_comp * 2; i++) + address[count++] = derivs[i]; + } + + /* Pack texture coordinates */ + if (coord) { + address[count++] = coords[0]; + if (instr->coord_components > 1) + address[count++] = coords[1]; + if (instr->coord_components > 2) { + /* This seems like a bit of a hack - but it passes Vulkan CTS with it */ + if (instr->sampler_dim != GLSL_SAMPLER_DIM_3D && instr->op != nir_texop_txf) { + coords[2] = to_float(ctx, coords[2]); + coords[2] = emit_llvm_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &coords[2], + 1, 0); + coords[2] = to_integer(ctx, coords[2]); + } + address[count++] = coords[2]; + } + } + + /* Pack LOD */ + if ((instr->op == nir_texop_txl || instr->op == nir_texop_txf) && lod) { + address[count++] = lod; + } else if (instr->op == nir_texop_txf_ms && sample_index) { + address[count++] = sample_index; + } else if(instr->op == nir_texop_txs) { + count = 0; + address[count++] = lod; + } + + for (chan = 0; chan < count; chan++) { + address[chan] = LLVMBuildBitCast(ctx->builder, + address[chan], ctx->i32, ""); + } + + if (instr->op == nir_texop_samples_identical) { + LLVMValueRef txf_address[4]; + struct ac_tex_info txf_info = { 0 }; + unsigned txf_count = count; + memcpy(txf_address, address, sizeof(txf_address)); + + if (!instr->is_array) + txf_address[2] = ctx->i32zero; + txf_address[3] = ctx->i32zero; + + set_tex_fetch_args(ctx, &txf_info, instr, nir_texop_txf, + res_ptr, samp_ptr, + txf_address, txf_count, 0xf); + + result = build_tex_intrinsic(ctx, instr, &txf_info); + goto write_result; + } + + /* TODO sample FMASK magic */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_MS) { + LLVMValueRef txf_address[4]; + struct ac_tex_info txf_info = { 0 }; + unsigned txf_count = count; + memcpy(txf_address, address, sizeof(txf_address)); + + if (!instr->is_array) + txf_address[2] = ctx->i32zero; + txf_address[3] = ctx->i32zero; + + set_tex_fetch_args(ctx, &txf_info, instr, nir_texop_txf, + res_ptr, samp_ptr, + txf_address, txf_count, 0xf); + + result = build_tex_intrinsic(ctx, instr, &txf_info); + LLVMValueRef four = LLVMConstInt(ctx->i32, 4, false); + LLVMValueRef F = LLVMConstInt(ctx->i32, 0xf, false); + + LLVMValueRef fmask = LLVMBuildExtractElement(ctx->builder, + result, + ctx->i32zero, ""); + + unsigned sample_chan = instr->is_array ? 
3 : 2; + + LLVMValueRef sample_index4 = + LLVMBuildMul(ctx->builder, address[sample_chan], four, ""); + LLVMValueRef shifted_fmask = + LLVMBuildLShr(ctx->builder, fmask, sample_index4, ""); + LLVMValueRef final_sample = + LLVMBuildAnd(ctx->builder, shifted_fmask, F, ""); + + /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK + * resource descriptor is 0 (invalid), + */ + LLVMValueRef fmask_desc = + LLVMBuildBitCast(ctx->builder, fmask_ptr, + ctx->v8i32, ""); + + LLVMValueRef fmask_word1 = + LLVMBuildExtractElement(ctx->builder, fmask_desc, + ctx->i32one, ""); + + LLVMValueRef word1_is_nonzero = + LLVMBuildICmp(ctx->builder, LLVMIntNE, + fmask_word1, ctx->i32zero, ""); + + /* Replace the MSAA sample index. */ + address[sample_chan] = + LLVMBuildSelect(ctx->builder, word1_is_nonzero, + final_sample, address[sample_chan], ""); + } + + if (offsets && instr->op == nir_texop_txf) { + nir_const_value *const_offset = + nir_src_as_const_value(instr->src[const_src].src); + + assert(const_offset); + if (instr->coord_components > 2) + address[2] = LLVMBuildAdd(ctx->builder, + address[2], LLVMConstInt(ctx->i32, const_offset->i32[2], false), ""); + if (instr->coord_components > 1) + address[1] = LLVMBuildAdd(ctx->builder, + address[1], LLVMConstInt(ctx->i32, const_offset->i32[1], false), ""); + address[0] = LLVMBuildAdd(ctx->builder, + address[0], LLVMConstInt(ctx->i32, const_offset->i32[0], false), ""); + + } + + /* TODO TG4 support */ + if (instr->op == nir_texop_tg4) { + if (instr->is_shadow) + dmask = 1; + else + dmask = 1 << instr->component; + } + set_tex_fetch_args(ctx, &tinfo, instr, instr->op, + res_ptr, samp_ptr, address, count, dmask); + + result = build_tex_intrinsic(ctx, instr, &tinfo); + + if (instr->op == nir_texop_query_levels) + result = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, 3, false), ""); + else if (instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array) { + LLVMValueRef two = LLVMConstInt(ctx->i32, 2, false); + LLVMValueRef six = LLVMConstInt(ctx->i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->builder, result, two, ""); + z = LLVMBuildSDiv(ctx->builder, z, six, ""); + result = LLVMBuildInsertElement(ctx->builder, result, z, two, ""); + } + +write_result: + if (result) { + assert(instr->dest.is_ssa); + result = to_integer(ctx, result); + _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result); + } +} + + +static void visit_phi(struct nir_to_llvm_context *ctx, nir_phi_instr *instr) +{ + LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa); + LLVMValueRef result = LLVMBuildPhi(ctx->builder, type, ""); + + _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result); + _mesa_hash_table_insert(ctx->phis, instr, result); +} + +static void visit_post_phi(struct nir_to_llvm_context *ctx, + nir_phi_instr *instr, + LLVMValueRef llvm_phi) +{ + nir_foreach_phi_src(src, instr) { + LLVMBasicBlockRef block = get_block(ctx, src->pred); + LLVMValueRef llvm_src = get_src(ctx, src->src); + + LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1); + } +} + +static void phi_post_pass(struct nir_to_llvm_context *ctx) +{ + struct hash_entry *entry; + hash_table_foreach(ctx->phis, entry) { + visit_post_phi(ctx, (nir_phi_instr*)entry->key, + (LLVMValueRef)entry->data); + } +} + + +static void visit_ssa_undef(struct nir_to_llvm_context *ctx, + nir_ssa_undef_instr *instr) +{ + unsigned num_components = instr->def.num_components; + LLVMValueRef undef; + + if (num_components == 1) + undef = 
LLVMGetUndef(ctx->i32); + else { + undef = LLVMGetUndef(LLVMVectorType(ctx->i32, num_components)); + } + _mesa_hash_table_insert(ctx->defs, &instr->def, undef); +} + +static void visit_jump(struct nir_to_llvm_context *ctx, + nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + LLVMBuildBr(ctx->builder, ctx->break_block); + LLVMClearInsertionPosition(ctx->builder); + break; + case nir_jump_continue: + LLVMBuildBr(ctx->builder, ctx->continue_block); + LLVMClearInsertionPosition(ctx->builder); + break; + default: + fprintf(stderr, "Unknown NIR jump instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void visit_cf_list(struct nir_to_llvm_context *ctx, + struct exec_list *list); + +static void visit_block(struct nir_to_llvm_context *ctx, nir_block *block) +{ + LLVMBasicBlockRef llvm_block = LLVMGetInsertBlock(ctx->builder); + nir_foreach_instr(instr, block) + { + switch (instr->type) { + case nir_instr_type_alu: + visit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + visit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_intrinsic: + visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + visit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + visit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_ssa_undef: + visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_jump: + visit_jump(ctx, nir_instr_as_jump(instr)); + break; + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + } + + _mesa_hash_table_insert(ctx->defs, block, llvm_block); +} + +static void visit_if(struct nir_to_llvm_context *ctx, nir_if *if_stmt) +{ + LLVMValueRef value = get_src(ctx, if_stmt->condition); + + LLVMBasicBlockRef merge_block = + LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, ""); + LLVMBasicBlockRef if_block = + LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, ""); + LLVMBasicBlockRef else_block = merge_block; + if (!exec_list_is_empty(&if_stmt->else_list)) + else_block = LLVMAppendBasicBlockInContext( + ctx->context, ctx->main_function, ""); + + LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE, value, + LLVMConstInt(ctx->i32, 0, false), ""); + LLVMBuildCondBr(ctx->builder, cond, if_block, else_block); + + LLVMPositionBuilderAtEnd(ctx->builder, if_block); + visit_cf_list(ctx, &if_stmt->then_list); + if (LLVMGetInsertBlock(ctx->builder)) + LLVMBuildBr(ctx->builder, merge_block); + + if (!exec_list_is_empty(&if_stmt->else_list)) { + LLVMPositionBuilderAtEnd(ctx->builder, else_block); + visit_cf_list(ctx, &if_stmt->else_list); + if (LLVMGetInsertBlock(ctx->builder)) + LLVMBuildBr(ctx->builder, merge_block); + } + + LLVMPositionBuilderAtEnd(ctx->builder, merge_block); +} + +static void visit_loop(struct nir_to_llvm_context *ctx, nir_loop *loop) +{ + LLVMBasicBlockRef continue_parent = ctx->continue_block; + LLVMBasicBlockRef break_parent = ctx->break_block; + + ctx->continue_block = + LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, ""); + ctx->break_block = + LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, ""); + + LLVMBuildBr(ctx->builder, ctx->continue_block); + LLVMPositionBuilderAtEnd(ctx->builder, ctx->continue_block); + visit_cf_list(ctx, &loop->body); + + if (LLVMGetInsertBlock(ctx->builder)) + LLVMBuildBr(ctx->builder, 
ctx->continue_block); + LLVMPositionBuilderAtEnd(ctx->builder, ctx->break_block); + + ctx->continue_block = continue_parent; + ctx->break_block = break_parent; +} + +static void visit_cf_list(struct nir_to_llvm_context *ctx, + struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) + { + switch (node->type) { + case nir_cf_node_block: + visit_block(ctx, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + visit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + visit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + default: + assert(0); + } + } +} + +static void +handle_vs_input_decl(struct nir_to_llvm_context *ctx, + struct nir_variable *variable) +{ + LLVMValueRef t_list_ptr = ctx->vertex_buffers; + LLVMValueRef t_offset; + LLVMValueRef t_list; + LLVMValueRef args[3]; + LLVMValueRef input; + LLVMValueRef buffer_index; + int index = variable->data.location - VERT_ATTRIB_GENERIC0; + int idx = variable->data.location; + unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); + + variable->data.driver_location = idx * 4; + + if (ctx->options->key.vs.instance_rate_inputs & (1u << index)) { + buffer_index = LLVMBuildAdd(ctx->builder, ctx->instance_id, + ctx->start_instance, ""); + ctx->shader_info->vs.vgpr_comp_cnt = MAX2(3, + ctx->shader_info->vs.vgpr_comp_cnt); + } else + buffer_index = LLVMBuildAdd(ctx->builder, ctx->vertex_id, + ctx->base_vertex, ""); + + for (unsigned i = 0; i < attrib_count; ++i, ++idx) { + t_offset = LLVMConstInt(ctx->i32, index + i, false); + + t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset); + args[0] = t_list; + args[1] = LLVMConstInt(ctx->i32, 0, false); + args[2] = buffer_index; + input = emit_llvm_intrinsic(ctx, + "llvm.SI.vs.load.input", ctx->v4f32, args, 3, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + + for (unsigned chan = 0; chan < 4; chan++) { + LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, false); + ctx->inputs[radeon_llvm_reg_index_soa(idx, chan)] = + to_integer(ctx, LLVMBuildExtractElement(ctx->builder, + input, llvm_chan, "")); + } + } +} + + +static void interp_fs_input(struct nir_to_llvm_context *ctx, + unsigned attr, + LLVMValueRef interp_param, + LLVMValueRef prim_mask, + LLVMValueRef result[4]) +{ + const char *intr_name; + LLVMValueRef attr_number; + unsigned chan; + + attr_number = LLVMConstInt(ctx->i32, attr, false); + + /* fs.constant returns the param from the middle vertex, so it's not + * really useful for flat shading. It's meant to be used for custom + * interpolation (but the intrinsic can't fetch from the other two + * vertices). + * + * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state + * to do the right thing. The only reason we use fs.constant is that + * fs.interp cannot be used on integers, because they can be equal + * to NaN. + */ + intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; + + for (chan = 0; chan < 4; chan++) { + LLVMValueRef args[4]; + LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, false); + + args[0] = llvm_chan; + args[1] = attr_number; + args[2] = prim_mask; + args[3] = interp_param; + result[chan] = emit_llvm_intrinsic(ctx, intr_name, + ctx->f32, args, args[3] ? 
4 : 3, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + } +} + +static void +handle_fs_input_decl(struct nir_to_llvm_context *ctx, + struct nir_variable *variable) +{ + int idx = variable->data.location; + unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); + LLVMValueRef interp; + + variable->data.driver_location = idx * 4; + ctx->input_mask |= ((1ull << attrib_count) - 1) << variable->data.location; + + if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) + interp = lookup_interp_param(ctx, variable->data.interpolation, INTERP_CENTER); + else + interp = NULL; + + for (unsigned i = 0; i < attrib_count; ++i) + ctx->inputs[radeon_llvm_reg_index_soa(idx + i, 0)] = interp; + +} + +static void +handle_shader_input_decl(struct nir_to_llvm_context *ctx, + struct nir_variable *variable) +{ + switch (ctx->stage) { + case MESA_SHADER_VERTEX: + handle_vs_input_decl(ctx, variable); + break; + case MESA_SHADER_FRAGMENT: + handle_fs_input_decl(ctx, variable); + break; + default: + break; + } + +} + +static void +handle_fs_inputs_pre(struct nir_to_llvm_context *ctx, + struct nir_shader *nir) +{ + unsigned index = 0; + for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) { + LLVMValueRef interp_param; + LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0); + + if (!(ctx->input_mask & (1ull << i))) + continue; + + if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC) { + interp_param = *inputs; + interp_fs_input(ctx, index, interp_param, ctx->prim_mask, + inputs); + + if (!interp_param) + ctx->shader_info->fs.flat_shaded_mask |= 1u << index; + ++index; + } else if (i == VARYING_SLOT_POS) { + for(int i = 0; i < 3; ++i) + inputs[i] = ctx->frag_pos[i]; + + inputs[3] = emit_fdiv(ctx, ctx->f32one, ctx->frag_pos[3]); + } + } + ctx->shader_info->fs.num_interp = index; + if (ctx->input_mask & (1 << VARYING_SLOT_PNTC)) + ctx->shader_info->fs.has_pcoord = true; + ctx->shader_info->fs.input_mask = ctx->input_mask >> VARYING_SLOT_VAR0; +} + +static LLVMValueRef +ac_build_alloca(struct nir_to_llvm_context *ctx, + LLVMTypeRef type, + const char *name) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); + LLVMValueRef function = LLVMGetBasicBlockParent(current_block); + LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); + LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); + LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ctx->context); + LLVMValueRef res; + + if (first_instr) { + LLVMPositionBuilderBefore(first_builder, first_instr); + } else { + LLVMPositionBuilderAtEnd(first_builder, first_block); + } + + res = LLVMBuildAlloca(first_builder, type, name); + LLVMBuildStore(builder, LLVMConstNull(type), res); + + LLVMDisposeBuilder(first_builder); + + return res; +} + +static LLVMValueRef si_build_alloca_undef(struct nir_to_llvm_context *ctx, + LLVMTypeRef type, + const char *name) +{ + LLVMValueRef ptr = ac_build_alloca(ctx, type, name); + LLVMBuildStore(ctx->builder, LLVMGetUndef(type), ptr); + return ptr; +} + +static void +handle_shader_output_decl(struct nir_to_llvm_context *ctx, + struct nir_variable *variable) +{ + int idx = variable->data.location; + unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); + + variable->data.driver_location = idx * 4; + + if (ctx->stage == MESA_SHADER_VERTEX) { + + if (idx == VARYING_SLOT_CLIP_DIST0 || + idx == VARYING_SLOT_CULL_DIST0) { + int length = glsl_get_length(variable->type); + if (idx == 
VARYING_SLOT_CLIP_DIST0) { + ctx->shader_info->vs.clip_dist_mask = (1 << length) - 1; + ctx->num_clips = length; + } else if (idx == VARYING_SLOT_CULL_DIST0) { + ctx->shader_info->vs.cull_dist_mask = (1 << length) - 1; + ctx->num_culls = length; + } + if (length > 4) + attrib_count = 2; + else + attrib_count = 1; + } + } + + for (unsigned i = 0; i < attrib_count; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + ctx->outputs[radeon_llvm_reg_index_soa(idx + i, chan)] = + si_build_alloca_undef(ctx, ctx->f32, ""); + } + } + ctx->output_mask |= ((1ull << attrib_count) - 1) << variable->data.location; +} + +static void +setup_locals(struct nir_to_llvm_context *ctx, + struct nir_function *func) +{ + int i, j; + ctx->num_locals = 0; + nir_foreach_variable(variable, &func->impl->locals) { + unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); + variable->data.driver_location = ctx->num_locals * 4; + ctx->num_locals += attrib_count; + } + ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef)); + if (!ctx->locals) + return; + + for (i = 0; i < ctx->num_locals; i++) { + for (j = 0; j < 4; j++) { + ctx->locals[i * 4 + j] = + si_build_alloca_undef(ctx, ctx->f32, "temp"); + } + } +} + +static LLVMValueRef +emit_float_saturate(struct nir_to_llvm_context *ctx, LLVMValueRef v, float lo, float hi) +{ + v = to_float(ctx, v); + v = emit_intrin_2f_param(ctx, "llvm.maxnum.f32", v, LLVMConstReal(ctx->f32, lo)); + return emit_intrin_2f_param(ctx, "llvm.minnum.f32", v, LLVMConstReal(ctx->f32, hi)); +} + + +static LLVMValueRef emit_pack_int16(struct nir_to_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); + LLVMValueRef comp[2]; + + comp[0] = LLVMBuildAnd(ctx->builder, src0, LLVMConstInt(ctx-> i32, 65535, 0), ""); + comp[1] = LLVMBuildAnd(ctx->builder, src1, LLVMConstInt(ctx-> i32, 65535, 0), ""); + comp[1] = LLVMBuildShl(ctx->builder, comp[1], const16, ""); + return LLVMBuildOr(ctx->builder, comp[0], comp[1], ""); +} + +/* Initialize arguments for the shader export intrinsic */ +static void +si_llvm_init_export_args(struct nir_to_llvm_context *ctx, + LLVMValueRef *values, + unsigned target, + LLVMValueRef *args) +{ + /* Default is 0xf. Adjusted below depending on the format. */ + args[0] = LLVMConstInt(ctx->i32, target != V_008DFC_SQ_EXP_NULL ? 
0xf : 0, false); + /* Specify whether the EXEC mask represents the valid mask */ + args[1] = LLVMConstInt(ctx->i32, 0, false); + + /* Specify whether this is the last export */ + args[2] = LLVMConstInt(ctx->i32, 0, false); + /* Specify the target we are exporting */ + args[3] = LLVMConstInt(ctx->i32, target, false); + + args[4] = LLVMConstInt(ctx->i32, 0, false); /* COMPR flag */ + args[5] = LLVMGetUndef(ctx->f32); + args[6] = LLVMGetUndef(ctx->f32); + args[7] = LLVMGetUndef(ctx->f32); + args[8] = LLVMGetUndef(ctx->f32); + + if (!values) + return; + + if (ctx->stage == MESA_SHADER_FRAGMENT && target >= V_008DFC_SQ_EXP_MRT) { + LLVMValueRef val[4]; + unsigned index = target - V_008DFC_SQ_EXP_MRT; + unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; + bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; + + switch(col_format) { + case V_028714_SPI_SHADER_ZERO: + args[0] = LLVMConstInt(ctx->i32, 0x0, 0); + args[3] = LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_NULL, 0); + break; + + case V_028714_SPI_SHADER_32_R: + args[0] = LLVMConstInt(ctx->i32, 0x1, 0); + args[5] = values[0]; + break; + + case V_028714_SPI_SHADER_32_GR: + args[0] = LLVMConstInt(ctx->i32, 0x3, 0); + args[5] = values[0]; + args[6] = values[1]; + break; + + case V_028714_SPI_SHADER_32_AR: + args[0] = LLVMConstInt(ctx->i32, 0x9, 0); + args[5] = values[0]; + args[8] = values[3]; + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + args[4] = ctx->i32one; + + for (unsigned chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = { + values[2 * chan], + values[2 * chan + 1] + }; + LLVMValueRef packed; + + packed = emit_llvm_intrinsic(ctx, "llvm.SI.packf16", + ctx->i32, pack_args, 2, + LLVMReadNoneAttribute); + args[chan + 5] = packed; + } + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + for (unsigned chan = 0; chan < 4; chan++) { + val[chan] = emit_float_saturate(ctx, values[chan], 0, 1); + val[chan] = LLVMBuildFMul(ctx->builder, val[chan], + LLVMConstReal(ctx->f32, 65535), ""); + val[chan] = LLVMBuildFAdd(ctx->builder, val[chan], + LLVMConstReal(ctx->f32, 0.5), ""); + val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan], + ctx->i32, ""); + } + + args[4] = ctx->i32one; + args[5] = emit_pack_int16(ctx, val[0], val[1]); + args[6] = emit_pack_int16(ctx, val[2], val[3]); + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + for (unsigned chan = 0; chan < 4; chan++) { + val[chan] = emit_float_saturate(ctx, values[chan], -1, 1); + val[chan] = LLVMBuildFMul(ctx->builder, val[chan], + LLVMConstReal(ctx->f32, 32767), ""); + + /* If positive, add 0.5, else add -0.5. */ + val[chan] = LLVMBuildFAdd(ctx->builder, val[chan], + LLVMBuildSelect(ctx->builder, + LLVMBuildFCmp(ctx->builder, LLVMRealOGE, + val[chan], ctx->f32zero, ""), + LLVMConstReal(ctx->f32, 0.5), + LLVMConstReal(ctx->f32, -0.5), ""), ""); + val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->i32, ""); + } + + args[4] = ctx->i32one; + args[5] = emit_pack_int16(ctx, val[0], val[1]); + args[6] = emit_pack_int16(ctx, val[2], val[3]); + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: { + LLVMValueRef max = LLVMConstInt(ctx->i32, is_int8 ? 
255 : 65535, 0); + + for (unsigned chan = 0; chan < 4; chan++) { + val[chan] = to_integer(ctx, values[chan]); + val[chan] = emit_minmax_int(ctx, LLVMIntULT, val[chan], max); + } + + args[4] = ctx->i32one; + args[5] = emit_pack_int16(ctx, val[0], val[1]); + args[6] = emit_pack_int16(ctx, val[2], val[3]); + break; + } + + case V_028714_SPI_SHADER_SINT16_ABGR: { + LLVMValueRef max = LLVMConstInt(ctx->i32, is_int8 ? 127 : 32767, 0); + LLVMValueRef min = LLVMConstInt(ctx->i32, is_int8 ? -128 : -32768, 0); + + /* Clamp. */ + for (unsigned chan = 0; chan < 4; chan++) { + val[chan] = to_integer(ctx, values[chan]); + val[chan] = emit_minmax_int(ctx, LLVMIntSLT, val[chan], max); + val[chan] = emit_minmax_int(ctx, LLVMIntSGT, val[chan], min); + } + + args[4] = ctx->i32one; + args[5] = emit_pack_int16(ctx, val[0], val[1]); + args[6] = emit_pack_int16(ctx, val[2], val[3]); + break; + } + + default: + case V_028714_SPI_SHADER_32_ABGR: + memcpy(&args[5], values, sizeof(values[0]) * 4); + break; + } + } else + memcpy(&args[5], values, sizeof(values[0]) * 4); + + for (unsigned i = 5; i < 9; ++i) + args[i] = to_float(ctx, args[i]); +} + +static void +handle_vs_outputs_post(struct nir_to_llvm_context *ctx, + struct nir_shader *nir) +{ + uint32_t param_count = 0; + unsigned target; + unsigned pos_idx, num_pos_exports = 0; + LLVMValueRef args[9]; + LLVMValueRef pos_args[4][9] = { { 0 } }; + LLVMValueRef psize_value = 0; + int i; + const uint64_t clip_mask = ctx->output_mask & ((1ull << VARYING_SLOT_CLIP_DIST0) | + (1ull << VARYING_SLOT_CLIP_DIST1) | + (1ull << VARYING_SLOT_CULL_DIST0) | + (1ull << VARYING_SLOT_CULL_DIST1)); + + if (clip_mask) { + LLVMValueRef slots[8]; + unsigned j; + + if (ctx->shader_info->vs.cull_dist_mask) + ctx->shader_info->vs.cull_dist_mask <<= ctx->num_clips; + + i = VARYING_SLOT_CLIP_DIST0; + for (j = 0; j < ctx->num_clips; j++) + slots[j] = to_float(ctx, LLVMBuildLoad(ctx->builder, + ctx->outputs[radeon_llvm_reg_index_soa(i, j)], "")); + i = VARYING_SLOT_CULL_DIST0; + for (j = 0; j < ctx->num_culls; j++) + slots[ctx->num_clips + j] = to_float(ctx, LLVMBuildLoad(ctx->builder, + ctx->outputs[radeon_llvm_reg_index_soa(i, j)], "")); + + for (i = ctx->num_clips + ctx->num_culls; i < 8; i++) + slots[i] = LLVMGetUndef(ctx->f32); + + if (ctx->num_clips + ctx->num_culls > 4) { + target = V_008DFC_SQ_EXP_POS + 3; + si_llvm_init_export_args(ctx, &slots[4], target, args); + memcpy(pos_args[target - V_008DFC_SQ_EXP_POS], + args, sizeof(args)); + } + + target = V_008DFC_SQ_EXP_POS + 2; + si_llvm_init_export_args(ctx, &slots[0], target, args); + memcpy(pos_args[target - V_008DFC_SQ_EXP_POS], + args, sizeof(args)); + + } + + for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) { + LLVMValueRef values[4]; + if (!(ctx->output_mask & (1ull << i))) + continue; + + for (unsigned j = 0; j < 4; j++) + values[j] = to_float(ctx, LLVMBuildLoad(ctx->builder, + ctx->outputs[radeon_llvm_reg_index_soa(i, j)], "")); + + if (i == VARYING_SLOT_POS) { + target = V_008DFC_SQ_EXP_POS; + } else if (i == VARYING_SLOT_CLIP_DIST0 || + i == VARYING_SLOT_CLIP_DIST1 || + i == VARYING_SLOT_CULL_DIST0 || + i == VARYING_SLOT_CULL_DIST1) { + continue; + } else if (i == VARYING_SLOT_PSIZ) { + ctx->shader_info->vs.writes_pointsize = true; + psize_value = values[0]; + continue; + } else if (i >= VARYING_SLOT_VAR0) { + ctx->shader_info->vs.export_mask |= 1u << (i - VARYING_SLOT_VAR0); + target = V_008DFC_SQ_EXP_PARAM + param_count; + param_count++; + } + + si_llvm_init_export_args(ctx, values, target, args); + + if (target >= 
V_008DFC_SQ_EXP_POS && + target <= (V_008DFC_SQ_EXP_POS + 3)) { + memcpy(pos_args[target - V_008DFC_SQ_EXP_POS], + args, sizeof(args)); + } else { + emit_llvm_intrinsic(ctx, + "llvm.SI.export", + LLVMVoidTypeInContext(ctx->context), + args, 9, 0); + } + } + + /* We need to add the position output manually if it's missing. */ + if (!pos_args[0][0]) { + pos_args[0][0] = LLVMConstInt(ctx->i32, 0xf, false); + pos_args[0][1] = ctx->i32zero; /* EXEC mask */ + pos_args[0][2] = ctx->i32zero; /* last export? */ + pos_args[0][3] = LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_POS, false); + pos_args[0][4] = ctx->i32zero; /* COMPR flag */ + pos_args[0][5] = ctx->f32zero; /* X */ + pos_args[0][6] = ctx->f32zero; /* Y */ + pos_args[0][7] = ctx->f32zero; /* Z */ + pos_args[0][8] = ctx->f32one; /* W */ + } + + if (ctx->shader_info->vs.writes_pointsize == true) { + pos_args[1][0] = LLVMConstInt(ctx->i32, (ctx->shader_info->vs.writes_pointsize == true), false); /* writemask */ + pos_args[1][1] = ctx->i32zero; /* EXEC mask */ + pos_args[1][2] = ctx->i32zero; /* last export? */ + pos_args[1][3] = LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_POS + 1, false); + pos_args[1][4] = ctx->i32zero; /* COMPR flag */ + pos_args[1][5] = ctx->f32zero; /* X */ + pos_args[1][6] = ctx->f32zero; /* Y */ + pos_args[1][7] = ctx->f32zero; /* Z */ + pos_args[1][8] = ctx->f32zero; /* W */ + + if (ctx->shader_info->vs.writes_pointsize == true) + pos_args[1][5] = psize_value; + } + for (i = 0; i < 4; i++) { + if (pos_args[i][0]) + num_pos_exports++; + } + + pos_idx = 0; + for (i = 0; i < 4; i++) { + if (!pos_args[i][0]) + continue; + + /* Specify the target we are exporting */ + pos_args[i][3] = LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_POS + pos_idx++, false); + if (pos_idx == num_pos_exports) + pos_args[i][2] = ctx->i32one; + emit_llvm_intrinsic(ctx, + "llvm.SI.export", + LLVMVoidTypeInContext(ctx->context), + pos_args[i], 9, 0); + } + + ctx->shader_info->vs.pos_exports = num_pos_exports; + ctx->shader_info->vs.param_exports = param_count; +} + +static void +si_export_mrt_color(struct nir_to_llvm_context *ctx, + LLVMValueRef *color, unsigned param, bool is_last) +{ + LLVMValueRef args[9]; + /* Export */ + si_llvm_init_export_args(ctx, color, param, + args); + + if (is_last) { + args[1] = ctx->i32one; /* whether the EXEC mask is valid */ + args[2] = ctx->i32one; /* DONE bit */ + } else if (args[0] == ctx->i32zero) + return; /* unnecessary NULL export */ + + emit_llvm_intrinsic(ctx, "llvm.SI.export", + ctx->voidt, args, 9, 0); +} + +static void +si_export_mrt_z(struct nir_to_llvm_context *ctx, + LLVMValueRef depth, LLVMValueRef stencil, + LLVMValueRef samplemask) +{ + LLVMValueRef args[9]; + unsigned mask = 0; + args[1] = ctx->i32one; /* whether the EXEC mask is valid */ + args[2] = ctx->i32one; /* DONE bit */ + /* Specify the target we are exporting */ + args[3] = LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_MRTZ, false); + + args[4] = ctx->i32zero; /* COMP flag */ + args[5] = LLVMGetUndef(ctx->f32); /* R, depth */ + args[6] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */ + args[7] = LLVMGetUndef(ctx->f32); /* B, sample mask */ + args[8] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */ + + if (depth) { + args[5] = depth; + mask |= 0x1; + } + + if (stencil) { + args[6] = stencil; + mask |= 0x2; + } + + if (samplemask) { + args[7] = samplemask; + mask |= 0x04; + } + + /* SI (except OLAND) has a bug that it only looks + * at the X writemask component. 
*/ + if (ctx->options->chip_class == SI && + ctx->options->family != CHIP_OLAND) + mask |= 0x01; + + args[0] = LLVMConstInt(ctx->i32, mask, false); + emit_llvm_intrinsic(ctx, "llvm.SI.export", + ctx->voidt, args, 9, 0); +} + +static void +handle_fs_outputs_post(struct nir_to_llvm_context *ctx, + struct nir_shader *nir) +{ + unsigned index = 0; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + + for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) { + LLVMValueRef values[4]; + bool last; + if (!(ctx->output_mask & (1ull << i))) + continue; + + last = ctx->output_mask <= ((1ull << (i + 1)) - 1); + + if (i == FRAG_RESULT_DEPTH) { + ctx->shader_info->fs.writes_z = true; + depth = to_float(ctx, LLVMBuildLoad(ctx->builder, + ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], "")); + } else if (i == FRAG_RESULT_STENCIL) { + ctx->shader_info->fs.writes_stencil = true; + stencil = to_float(ctx, LLVMBuildLoad(ctx->builder, + ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], "")); + } else { + for (unsigned j = 0; j < 4; j++) + values[j] = to_float(ctx, LLVMBuildLoad(ctx->builder, + ctx->outputs[radeon_llvm_reg_index_soa(i, j)], "")); + + si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + index, last); + index++; + } + } + + if (depth || stencil) + si_export_mrt_z(ctx, depth, stencil, samplemask); + else if (!index) + si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true); + + ctx->shader_info->fs.output_mask = index ? ((1ull << index) - 1) : 0; +} + +static void +handle_shader_outputs_post(struct nir_to_llvm_context *ctx, + struct nir_shader *nir) +{ + switch (ctx->stage) { + case MESA_SHADER_VERTEX: + handle_vs_outputs_post(ctx, nir); + break; + case MESA_SHADER_FRAGMENT: + handle_fs_outputs_post(ctx, nir); + break; + default: + break; + } +} + +static void +handle_shared_compute_var(struct nir_to_llvm_context *ctx, + struct nir_variable *variable, uint32_t *offset, int idx) +{ + unsigned size = glsl_count_attribute_slots(variable->type, false); + variable->data.driver_location = *offset; + *offset += size; +} + +static void ac_llvm_finalize_module(struct nir_to_llvm_context * ctx) +{ + LLVMPassManagerRef passmgr; + /* Create the pass manager */ + passmgr = LLVMCreateFunctionPassManagerForModule( + ctx->module); + + /* This pass should eliminate all the load and store instructions */ + LLVMAddPromoteMemoryToRegisterPass(passmgr); + + /* Add some optimization passes */ + LLVMAddScalarReplAggregatesPass(passmgr); + LLVMAddLICMPass(passmgr); + LLVMAddAggressiveDCEPass(passmgr); + LLVMAddCFGSimplificationPass(passmgr); + LLVMAddInstructionCombiningPass(passmgr); + + /* Run the pass */ + LLVMInitializeFunctionPassManager(passmgr); + LLVMRunFunctionPassManager(passmgr, ctx->main_function); + LLVMFinalizeFunctionPassManager(passmgr); + + LLVMDisposeBuilder(ctx->builder); + LLVMDisposePassManager(passmgr); +} + +static +LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, + struct nir_shader *nir, + struct ac_shader_variant_info *shader_info, + const struct ac_nir_compiler_options *options) +{ + struct nir_to_llvm_context ctx = {0}; + struct nir_function *func; + ctx.options = options; + ctx.shader_info = shader_info; + ctx.context = LLVMContextCreate(); + ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context); + + memset(shader_info, 0, sizeof(*shader_info)); + + LLVMSetTarget(ctx.module, "amdgcn--"); + setup_types(&ctx); + + ctx.builder = LLVMCreateBuilderInContext(ctx.context); + ctx.stage = nir->stage; + + create_function(&ctx, nir); + + if (nir->stage == 
MESA_SHADER_COMPUTE) { + int num_shared = 0; + nir_foreach_variable(variable, &nir->shared) + num_shared++; + if (num_shared) { + int idx = 0; + uint32_t shared_size = 0; + LLVMValueRef var; + LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE); + nir_foreach_variable(variable, &nir->shared) { + handle_shared_compute_var(&ctx, variable, &shared_size, idx); + idx++; + } + + shared_size *= 4; + var = LLVMAddGlobalInAddressSpace(ctx.module, + LLVMArrayType(ctx.i8, shared_size), + "compute_lds", + LOCAL_ADDR_SPACE); + LLVMSetAlignment(var, 4); + ctx.shared_memory = LLVMBuildBitCast(ctx.builder, var, i8p, ""); + } + } + + nir_foreach_variable(variable, &nir->inputs) + handle_shader_input_decl(&ctx, variable); + + if (nir->stage == MESA_SHADER_FRAGMENT) + handle_fs_inputs_pre(&ctx, nir); + + nir_foreach_variable(variable, &nir->outputs) + handle_shader_output_decl(&ctx, variable); + + ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + func = (struct nir_function *)exec_list_get_head(&nir->functions); + + setup_locals(&ctx, func); + + visit_cf_list(&ctx, &func->impl->body); + phi_post_pass(&ctx); + + handle_shader_outputs_post(&ctx, nir); + LLVMBuildRetVoid(ctx.builder); + + ac_llvm_finalize_module(&ctx); + free(ctx.locals); + ralloc_free(ctx.defs); + ralloc_free(ctx.phis); + + return ctx.module; +} + +static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) +{ + unsigned *retval = (unsigned *)context; + LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); + char *description = LLVMGetDiagInfoDescription(di); + + if (severity == LLVMDSError) { + *retval = 1; + fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", + description); + } + + LLVMDisposeMessage(description); +} + +static unsigned ac_llvm_compile(LLVMModuleRef M, + struct ac_shader_binary *binary, + LLVMTargetMachineRef tm) +{ + unsigned retval = 0; + char *err; + LLVMContextRef llvm_ctx; + LLVMMemoryBufferRef out_buffer; + unsigned buffer_size; + const char *buffer_data; + LLVMBool mem_err; + + /* Setup Diagnostic Handler*/ + llvm_ctx = LLVMGetModuleContext(M); + + LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, + &retval); + + /* Compile IR*/ + mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile, + &err, &out_buffer); + + /* Process Errors/Warnings */ + if (mem_err) { + fprintf(stderr, "%s: %s", __FUNCTION__, err); + free(err); + retval = 1; + goto out; + } + + /* Extract Shader Code*/ + buffer_size = LLVMGetBufferSize(out_buffer); + buffer_data = LLVMGetBufferStart(out_buffer); + + ac_elf_read(buffer_data, buffer_size, binary); + + /* Clean up */ + LLVMDisposeMemoryBuffer(out_buffer); + +out: + return retval; +} + +void ac_compile_nir_shader(LLVMTargetMachineRef tm, + struct ac_shader_binary *binary, + struct ac_shader_config *config, + struct ac_shader_variant_info *shader_info, + struct nir_shader *nir, + const struct ac_nir_compiler_options *options, + bool dump_shader) +{ + + LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info, + options); + if (dump_shader) + LLVMDumpModule(llvm_module); + + memset(binary, 0, sizeof(*binary)); + int v = ac_llvm_compile(llvm_module, binary, tm); + if (v) { + fprintf(stderr, "compile failed\n"); + } + + if (dump_shader) + fprintf(stderr, "disasm:\n%s\n", binary->disasm_string); + + ac_shader_binary_read_config(binary, config, 0); + + LLVMContextRef ctx = 
LLVMGetModuleContext(llvm_module); + LLVMDisposeModule(llvm_module); + LLVMContextDispose(ctx); + + if (nir->stage == MESA_SHADER_FRAGMENT) { + shader_info->num_input_vgprs = 0; + if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 2; + if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 3; + if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 2; + if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr)) + shader_info->num_input_vgprs += 1; + } + config->num_vgprs = MAX2(config->num_vgprs, shader_info->num_input_vgprs); + + /* +3 for scratch wave offset and VCC */ + config->num_sgprs = MAX2(config->num_sgprs, + shader_info->num_input_sgprs + 3); + if (nir->stage == MESA_SHADER_COMPUTE) { + for (int i = 0; i < 3; ++i) + shader_info->cs.block_size[i] = nir->info.cs.local_size[i]; + } + + if (nir->stage == MESA_SHADER_FRAGMENT) + shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests; +} diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h new file mode 100644 index 00000000000..550fe845074 --- /dev/null +++ b/src/amd/common/ac_nir_to_llvm.h @@ -0,0 +1,102 @@ +/* + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include +#include "llvm-c/Core.h" +#include "llvm-c/TargetMachine.h" +#include "amd_family.h" + +struct ac_shader_binary; +struct ac_shader_config; +struct nir_shader; +struct radv_pipeline_layout; + + +struct ac_vs_variant_key { + uint32_t instance_rate_inputs; +}; + +struct ac_fs_variant_key { + uint32_t col_format; + uint32_t is_int8; +}; + +union ac_shader_variant_key { + struct ac_vs_variant_key vs; + struct ac_fs_variant_key fs; +}; + +struct ac_nir_compiler_options { + struct radv_pipeline_layout *layout; + union ac_shader_variant_key key; + bool unsafe_math; + enum radeon_family family; + enum chip_class chip_class; +}; + +struct ac_shader_variant_info { + unsigned num_user_sgprs; + unsigned num_input_sgprs; + unsigned num_input_vgprs; + union { + struct { + unsigned param_exports; + unsigned pos_exports; + unsigned vgpr_comp_cnt; + uint32_t export_mask; + bool writes_pointsize; + uint8_t clip_dist_mask; + uint8_t cull_dist_mask; + } vs; + struct { + unsigned num_interp; + uint32_t input_mask; + unsigned output_mask; + uint32_t flat_shaded_mask; + bool has_pcoord; + bool can_discard; + bool writes_z; + bool writes_stencil; + bool early_fragment_test; + bool writes_memory; + } fs; + struct { + unsigned block_size[3]; + } cs; + }; +}; + +void ac_compile_nir_shader(LLVMTargetMachineRef tm, + struct ac_shader_binary *binary, + struct ac_shader_config *config, + struct ac_shader_variant_info *shader_info, + struct nir_shader *nir, + const struct ac_nir_compiler_options *options, + bool dump_shader); + +#ifdef __cplusplus +extern "C" +#endif +void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); diff --git a/src/amd/vulkan/.gitignore b/src/amd/vulkan/.gitignore new file mode 100644 index 00000000000..e55e353d8fb --- /dev/null +++ b/src/amd/vulkan/.gitignore @@ -0,0 +1,6 @@ +# Generated source files +/radv_entrypoints.c +/radv_entrypoints.h +/radv_timestamp.h +/dev_icd.json +/vk_format_table.c diff --git a/src/amd/vulkan/Makefile.am b/src/amd/vulkan/Makefile.am new file mode 100644 index 00000000000..387ba4dda8e --- /dev/null +++ b/src/amd/vulkan/Makefile.am @@ -0,0 +1,165 @@ +# Copyright © 2016 Red Hat +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include Makefile.sources + +vulkan_includedir = $(includedir)/vulkan + +vulkan_include_HEADERS = \ + $(top_srcdir)/include/vulkan/vk_platform.h \ + $(top_srcdir)/include/vulkan/vulkan.h + +lib_LTLIBRARIES = libvulkan_radeon.la + +# The gallium includes are for the util/u_math.h include from main/macros.h + +AM_CPPFLAGS = \ + $(AMDGPU_CFLAGS) \ + $(VALGRIND_CFLAGS) \ + $(DEFINES) \ + -I$(top_srcdir)/include \ + -I$(top_builddir)/src \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/amd \ + -I$(top_srcdir)/src/amd/common \ + -I$(top_builddir)/src/compiler \ + -I$(top_builddir)/src/compiler/nir \ + -I$(top_srcdir)/src/compiler \ + -I$(top_srcdir)/src/mapi \ + -I$(top_srcdir)/src/mesa \ + -I$(top_srcdir)/src/mesa/drivers/dri/common \ + -I$(top_srcdir)/src/gallium/auxiliary \ + -I$(top_srcdir)/src/gallium/include + +AM_CFLAGS = $(VISIBILITY_FLAGS) \ + $(PTHREAD_CFLAGS) \ + $(LLVM_CFLAGS) + +VULKAN_SOURCES = \ + $(VULKAN_GENERATED_FILES) \ + $(VULKAN_FILES) + +VULKAN_LIB_DEPS = $(AMDGPU_LIBS) + + +if HAVE_PLATFORM_X11 +AM_CPPFLAGS += \ + $(XCB_DRI3_CFLAGS) \ + -DVK_USE_PLATFORM_XCB_KHR \ + -DVK_USE_PLATFORM_XLIB_KHR + +VULKAN_SOURCES += $(VULKAN_WSI_X11_FILES) + +# FIXME: Use pkg-config for X11-xcb ldflags. +VULKAN_LIB_DEPS += $(XCB_DRI3_LIBS) -lX11-xcb +endif + + +if HAVE_PLATFORM_WAYLAND +AM_CPPFLAGS += \ + -I$(top_builddir)/src/egl/wayland/wayland-drm \ + -I$(top_srcdir)/src/egl/wayland/wayland-drm \ + $(WAYLAND_CFLAGS) \ + -DVK_USE_PLATFORM_WAYLAND_KHR + +VULKAN_SOURCES += $(VULKAN_WSI_WAYLAND_FILES) + +VULKAN_LIB_DEPS += \ + $(top_builddir)/src/egl/wayland/wayland-drm/libwayland-drm.la \ + $(WAYLAND_LIBS) +endif + +noinst_LTLIBRARIES = libvulkan_common.la +libvulkan_common_la_SOURCES = $(VULKAN_SOURCES) + +VULKAN_LIB_DEPS += \ + libvulkan_common.la \ + $(top_builddir)/src/amd/common/libamd_common.la \ + $(top_builddir)/src/compiler/nir/libnir.la \ + $(top_builddir)/src/util/libmesautil.la \ + $(LLVM_LIBS) \ + $(LIBELF_LIBS) \ + $(PTHREAD_LIBS) \ + $(LIBDRM_LIBS) \ + $(PTHREAD_LIBS) \ + $(DLOPEN_LIBS) \ + -lm + +nodist_EXTRA_libvulkan_radeon_la_SOURCES = dummy.cpp +libvulkan_radeon_la_SOURCES = $(VULKAN_GEM_FILES) + +radv_entrypoints.h : radv_entrypoints_gen.py $(vulkan_include_HEADERS) + $(AM_V_GEN) cat $(vulkan_include_HEADERS) |\ + $(PYTHON2) $(srcdir)/radv_entrypoints_gen.py header > $@ + +radv_entrypoints.c : radv_entrypoints_gen.py $(vulkan_include_HEADERS) + $(AM_V_GEN) cat $(vulkan_include_HEADERS) |\ + $(PYTHON2) $(srcdir)/radv_entrypoints_gen.py code > $@ + +.PHONY: radv_timestamp.h + +radv_timestamp.h: + @echo "Updating radv_timestamp.h" + $(AM_V_GEN) echo "#define RADV_TIMESTAMP \"$(TIMESTAMP_CMD)\"" > $@ + +vk_format_table.c: vk_format_table.py \ + vk_format_parse.py \ + vk_format_layout.csv + $(PYTHON2) $(srcdir)/vk_format_table.py $(srcdir)/vk_format_layout.csv > $@ + +BUILT_SOURCES = $(VULKAN_GENERATED_FILES) +CLEANFILES = $(BUILT_SOURCES) dev_icd.json radv_timestamp.h +EXTRA_DIST = \ + $(top_srcdir)/include/vulkan/vk_icd.h \ + radv_entrypoints_gen.py \ + dev_icd.json.in \ + radeon_icd.json + +libvulkan_radeon_la_LIBADD = $(VULKAN_LIB_DEPS) $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la + +libvulkan_radeon_la_LDFLAGS = \ + -shared \ + -module \ + -no-undefined \ + -avoid-version \ + $(BSYMBOLIC) \ + 
$(LLVM_LDFLAGS) \ + $(GC_SECTIONS) \ + $(LD_NO_UNDEFINED) + + +icdconfdir = @VULKAN_ICD_INSTALL_DIR@ +icdconf_DATA = radeon_icd.json +# The following is used for development purposes, by setting VK_ICD_FILENAMES. +noinst_DATA = dev_icd.json + +dev_icd.json : dev_icd.json.in + $(AM_V_GEN) $(SED) \ + -e "s#@build_libdir@#${abs_top_builddir}/${LIB_DIR}#" \ + < $(srcdir)/dev_icd.json.in > $@ + +include $(top_srcdir)/install-lib-links.mk + +noinst_HEADERS = + +LDADD = \ + $(PTHREAD_LIBS) -lm -lstdc++ + diff --git a/src/amd/vulkan/Makefile.sources b/src/amd/vulkan/Makefile.sources new file mode 100644 index 00000000000..97fd0b686fd --- /dev/null +++ b/src/amd/vulkan/Makefile.sources @@ -0,0 +1,67 @@ +# Copyright © 2016 Red Hat +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
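+
+# The file lists below are consumed by Makefile.am, which includes this file.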
+ +RADV_WS_AMDGPU_FILES := \ + winsys/amdgpu/radv_amdgpu_bo.c \ + winsys/amdgpu/radv_amdgpu_cs.c \ + winsys/amdgpu/radv_amdgpu_surface.c \ + winsys/amdgpu/radv_amdgpu_winsys.c \ + winsys/amdgpu/radv_amdgpu_winsys.h + +VULKAN_FILES := \ + radv_cmd_buffer.c \ + radv_device.c \ + radv_descriptor_set.c \ + radv_formats.c \ + radv_image.c \ + radv_meta.c \ + radv_meta.h \ + radv_meta_blit.c \ + radv_meta_blit2d.c \ + radv_meta_buffer.c \ + radv_meta_bufimage.c \ + radv_meta_clear.c \ + radv_meta_copy.c \ + radv_meta_decompress.c \ + radv_meta_fast_clear.c \ + radv_meta_resolve.c \ + radv_meta_resolve_cs.c \ + radv_pass.c \ + radv_pipeline.c \ + radv_pipeline_cache.c \ + radv_query.c \ + radv_util.c \ + radv_wsi.c \ + si_cmd_buffer.c \ + vk_format_table.c \ + $(RADV_WS_AMDGPU_FILES) + +VULKAN_WSI_WAYLAND_FILES := \ + radv_wsi_wayland.c + +VULKAN_WSI_X11_FILES := \ + radv_wsi_x11.c + +VULKAN_GENERATED_FILES := \ + radv_entrypoints.c \ + radv_entrypoints.h \ + radv_timestamp.h + diff --git a/src/amd/vulkan/dev_icd.json.in b/src/amd/vulkan/dev_icd.json.in new file mode 100644 index 00000000000..fd8c7e3d746 --- /dev/null +++ b/src/amd/vulkan/dev_icd.json.in @@ -0,0 +1,7 @@ +{ + "file_format_version": "1.0.0", + "ICD": { + "library_path": "@build_libdir@/libvulkan_radeon.so", + "abi_versions": "1.0.3" + } +} diff --git a/src/amd/vulkan/radeon_icd.json b/src/amd/vulkan/radeon_icd.json new file mode 100644 index 00000000000..2085bd6788a --- /dev/null +++ b/src/amd/vulkan/radeon_icd.json @@ -0,0 +1,7 @@ +{ + "file_format_version": "1.0.0", + "ICD": { + "library_path": "libvulkan_radeon.so", + "abi_versions": "1.0.3" + } +} diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c new file mode 100644 index 00000000000..e3e9e326324 --- /dev/null +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -0,0 +1,2413 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "radv_private.h" +#include "radv_radeon_winsys.h" +#include "radv_cs.h" +#include "sid.h" +#include "vk_format.h" +#include "radv_meta.h" + +static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageLayout src_layout, + VkImageLayout dst_layout, + VkImageSubresourceRange range, + VkImageAspectFlags pending_clears); + +const struct radv_dynamic_state default_dynamic_state = { + .viewport = { + .count = 0, + }, + .scissor = { + .count = 0, + }, + .line_width = 1.0f, + .depth_bias = { + .bias = 0.0f, + .clamp = 0.0f, + .slope = 0.0f, + }, + .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f }, + .depth_bounds = { + .min = 0.0f, + .max = 1.0f, + }, + .stencil_compare_mask = { + .front = ~0u, + .back = ~0u, + }, + .stencil_write_mask = { + .front = ~0u, + .back = ~0u, + }, + .stencil_reference = { + .front = 0u, + .back = 0u, + }, +}; + +void +radv_dynamic_state_copy(struct radv_dynamic_state *dest, + const struct radv_dynamic_state *src, + uint32_t copy_mask) +{ + if (copy_mask & (1 << VK_DYNAMIC_STATE_VIEWPORT)) { + dest->viewport.count = src->viewport.count; + typed_memcpy(dest->viewport.viewports, src->viewport.viewports, + src->viewport.count); + } + + if (copy_mask & (1 << VK_DYNAMIC_STATE_SCISSOR)) { + dest->scissor.count = src->scissor.count; + typed_memcpy(dest->scissor.scissors, src->scissor.scissors, + src->scissor.count); + } + + if (copy_mask & (1 << VK_DYNAMIC_STATE_LINE_WIDTH)) + dest->line_width = src->line_width; + + if (copy_mask & (1 << VK_DYNAMIC_STATE_DEPTH_BIAS)) + dest->depth_bias = src->depth_bias; + + if (copy_mask & (1 << VK_DYNAMIC_STATE_BLEND_CONSTANTS)) + typed_memcpy(dest->blend_constants, src->blend_constants, 4); + + if (copy_mask & (1 << VK_DYNAMIC_STATE_DEPTH_BOUNDS)) + dest->depth_bounds = src->depth_bounds; + + if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK)) + dest->stencil_compare_mask = src->stencil_compare_mask; + + if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) + dest->stencil_write_mask = src->stencil_write_mask; + + if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_REFERENCE)) + dest->stencil_reference = src->stencil_reference; +} + +static VkResult radv_create_cmd_buffer( + struct radv_device * device, + struct radv_cmd_pool * pool, + VkCommandBufferLevel level, + VkCommandBuffer* pCommandBuffer) +{ + struct radv_cmd_buffer *cmd_buffer; + VkResult result; + + cmd_buffer = radv_alloc(&pool->alloc, sizeof(*cmd_buffer), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd_buffer == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(cmd_buffer, 0, sizeof(*cmd_buffer)); + cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + cmd_buffer->device = device; + cmd_buffer->pool = pool; + cmd_buffer->level = level; + + if (pool) { + list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); + } else { + /* Init the pool_link so we can safefly call list_del when we destroy + * the command buffer + */ + list_inithead(&cmd_buffer->pool_link); + } + + cmd_buffer->cs = device->ws->cs_create(device->ws, RING_GFX); + if (!cmd_buffer->cs) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer); + + cmd_buffer->upload.offset = 0; + cmd_buffer->upload.size = 0; + list_inithead(&cmd_buffer->upload.list); + + return VK_SUCCESS; + +fail: + radv_free(&cmd_buffer->pool->alloc, cmd_buffer); + + return result; +} + +static bool +radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, + uint64_t 
min_needed) +{ + uint64_t new_size; + struct radeon_winsys_bo *bo; + struct radv_cmd_buffer_upload *upload; + struct radv_device *device = cmd_buffer->device; + + new_size = MAX2(min_needed, 16 * 1024); + new_size = MAX2(new_size, 2 * cmd_buffer->upload.size); + + bo = device->ws->buffer_create(device->ws, + new_size, 4096, + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS); + + if (!bo) { + cmd_buffer->record_fail = true; + return false; + } + + device->ws->cs_add_buffer(cmd_buffer->cs, bo, 8); + if (cmd_buffer->upload.upload_bo) { + upload = malloc(sizeof(*upload)); + + if (!upload) { + cmd_buffer->record_fail = true; + device->ws->buffer_destroy(bo); + return false; + } + + memcpy(upload, &cmd_buffer->upload, sizeof(*upload)); + list_add(&upload->list, &cmd_buffer->upload.list); + } + + cmd_buffer->upload.upload_bo = bo; + cmd_buffer->upload.size = new_size; + cmd_buffer->upload.offset = 0; + cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo); + + if (!cmd_buffer->upload.map) { + cmd_buffer->record_fail = true; + return false; + } + + return true; +} + +bool +radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, + unsigned size, + unsigned alignment, + unsigned *out_offset, + void **ptr) +{ + uint64_t offset = align(cmd_buffer->upload.offset, alignment); + if (offset + size > cmd_buffer->upload.size) { + if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size)) + return false; + offset = 0; + } + + *out_offset = offset; + *ptr = cmd_buffer->upload.map + offset; + + cmd_buffer->upload.offset = offset + size; + return true; +} + +bool +radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, + unsigned size, unsigned alignment, + const void *data, unsigned *out_offset) +{ + uint8_t *ptr; + + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment, + out_offset, (void **)&ptr)) + return false; + + if (ptr) + memcpy(ptr, data, size); + + return true; +} + +static void +radv_emit_graphics_blend_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + radeon_set_context_reg_seq(cmd_buffer->cs, R_028780_CB_BLEND0_CONTROL, 8); + radeon_emit_array(cmd_buffer->cs, pipeline->graphics.blend.cb_blend_control, + 8); + radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, pipeline->graphics.blend.cb_color_control); + radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, pipeline->graphics.blend.db_alpha_to_mask); +} + +static void +radv_emit_graphics_depth_stencil_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + struct radv_depth_stencil_state *ds = &pipeline->graphics.ds; + radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, ds->db_depth_control); + radeon_set_context_reg(cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL, ds->db_stencil_control); + + radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, ds->db_render_control); + radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2, ds->db_render_override2); +} + +/* 12.4 fixed-point */ +static unsigned radv_pack_float_12p4(float x) +{ + return x <= 0 ? 0 : + x >= 4096 ? 
0xffff : x * 16; +} + +static void +radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + int num_samples = pipeline->graphics.ms.num_samples; + struct radv_multisample_state *ms = &pipeline->graphics.ms; + struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline; + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[0]); + radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[1]); + + if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples) + return; + + radeon_set_context_reg_seq(cmd_buffer->cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2); + radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); + radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config); + + radeon_set_context_reg(cmd_buffer->cs, CM_R_028804_DB_EQAA, ms->db_eqaa); + radeon_set_context_reg(cmd_buffer->cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1); + + radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples); + + uint32_t samples_offset; + void *samples_ptr; + void *src; + radv_cmd_buffer_upload_alloc(cmd_buffer, num_samples * 4 * 2, 256, &samples_offset, + &samples_ptr); + switch (num_samples) { + case 1: + src = cmd_buffer->device->sample_locations_1x; + break; + case 2: + src = cmd_buffer->device->sample_locations_2x; + break; + case 4: + src = cmd_buffer->device->sample_locations_4x; + break; + case 8: + src = cmd_buffer->device->sample_locations_8x; + break; + case 16: + src = cmd_buffer->device->sample_locations_16x; + break; + } + memcpy(samples_ptr, src, num_samples * 4 * 2); + + uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va += samples_offset; + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B030_SPI_SHADER_USER_DATA_PS_0 + 10 * 4, 2); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); +} + +static void +radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + struct radv_raster_state *raster = &pipeline->graphics.raster; + + radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, + raster->pa_cl_clip_cntl); + + radeon_set_context_reg(cmd_buffer->cs, R_0286D4_SPI_INTERP_CONTROL_0, + raster->spi_interp_control); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028A00_PA_SU_POINT_SIZE, 2); + radeon_emit(cmd_buffer->cs, 0); + radeon_emit(cmd_buffer->cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) | + S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2))); /* R_028A04_PA_SU_POINT_MINMAX */ + + radeon_set_context_reg(cmd_buffer->cs, R_028BE4_PA_SU_VTX_CNTL, + raster->pa_su_vtx_cntl); + + radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, + raster->pa_su_sc_mode_cntl); +} + +static void +radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + struct radeon_winsys *ws = cmd_buffer->device->ws; + struct radv_shader_variant *vs; + uint64_t va; + unsigned export_count; + unsigned clip_dist_mask, cull_dist_mask, total_mask; + + assert (pipeline->shaders[MESA_SHADER_VERTEX]); + + vs = pipeline->shaders[MESA_SHADER_VERTEX]; + va = ws->buffer_get_va(vs->bo); + ws->cs_add_buffer(cmd_buffer->cs, vs->bo, 8); + + clip_dist_mask = vs->info.vs.clip_dist_mask; + cull_dist_mask = vs->info.vs.cull_dist_mask; + total_mask = clip_dist_mask | cull_dist_mask; + radeon_set_context_reg(cmd_buffer->cs, R_028A40_VGT_GS_MODE, 0); + radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, 0); + + export_count = MAX2(1, 
vs->info.vs.param_exports); + radeon_set_context_reg(cmd_buffer->cs, R_0286C4_SPI_VS_OUT_CONFIG, + S_0286C4_VS_EXPORT_COUNT(export_count - 1)); + radeon_set_context_reg(cmd_buffer->cs, R_02870C_SPI_SHADER_POS_FORMAT, + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(vs->info.vs.pos_exports > 1 ? + V_02870C_SPI_SHADER_4COMP : + V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(vs->info.vs.pos_exports > 2 ? + V_02870C_SPI_SHADER_4COMP : + V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(vs->info.vs.pos_exports > 3 ? + V_02870C_SPI_SHADER_4COMP : + V_02870C_SPI_SHADER_NONE)); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4); + radeon_emit(cmd_buffer->cs, va >> 8); + radeon_emit(cmd_buffer->cs, va >> 40); + radeon_emit(cmd_buffer->cs, vs->rsrc1); + radeon_emit(cmd_buffer->cs, vs->rsrc2); + + radeon_set_context_reg(cmd_buffer->cs, R_028818_PA_CL_VTE_CNTL, + S_028818_VTX_W0_FMT(1) | + S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1)); + + radeon_set_context_reg(cmd_buffer->cs, R_02881C_PA_CL_VS_OUT_CNTL, + S_02881C_USE_VTX_POINT_SIZE(vs->info.vs.writes_pointsize) | + S_02881C_VS_OUT_MISC_VEC_ENA(vs->info.vs.writes_pointsize) | + S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) | + pipeline->graphics.raster.pa_cl_vs_out_cntl | + cull_dist_mask << 8 | + clip_dist_mask); + +} + + + +static void +radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + struct radeon_winsys *ws = cmd_buffer->device->ws; + struct radv_shader_variant *ps, *vs; + uint64_t va; + unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); + struct radv_blend_state *blend = &pipeline->graphics.blend; + unsigned ps_offset = 0; + unsigned z_order; + assert (pipeline->shaders[MESA_SHADER_FRAGMENT]); + + ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; + vs = pipeline->shaders[MESA_SHADER_VERTEX]; + va = ws->buffer_get_va(ps->bo); + ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4); + radeon_emit(cmd_buffer->cs, va >> 8); + radeon_emit(cmd_buffer->cs, va >> 40); + radeon_emit(cmd_buffer->cs, ps->rsrc1); + radeon_emit(cmd_buffer->cs, ps->rsrc2); + + if (ps->info.fs.early_fragment_test || !ps->info.fs.writes_memory) + z_order = V_02880C_EARLY_Z_THEN_LATE_Z; + else + z_order = V_02880C_LATE_Z; + + + radeon_set_context_reg(cmd_buffer->cs, R_02880C_DB_SHADER_CONTROL, + S_02880C_Z_EXPORT_ENABLE(ps->info.fs.writes_z) | + S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.fs.writes_stencil) | + S_02880C_KILL_ENABLE(!!ps->info.fs.can_discard) | + S_02880C_Z_ORDER(z_order) | + S_02880C_DEPTH_BEFORE_SHADER(ps->info.fs.early_fragment_test) | + S_02880C_EXEC_ON_HIER_FAIL(ps->info.fs.writes_memory) | + S_02880C_EXEC_ON_NOOP(ps->info.fs.writes_memory)); + + radeon_set_context_reg(cmd_buffer->cs, R_0286CC_SPI_PS_INPUT_ENA, + ps->config.spi_ps_input_ena); + + radeon_set_context_reg(cmd_buffer->cs, R_0286D0_SPI_PS_INPUT_ADDR, + ps->config.spi_ps_input_addr); + + spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); + radeon_set_context_reg(cmd_buffer->cs, R_0286D8_SPI_PS_IN_CONTROL, + S_0286D8_NUM_INTERP(ps->info.fs.num_interp)); + + radeon_set_context_reg(cmd_buffer->cs, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl); + + 
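+	/* Pick the depth/stencil export format from what the fragment shader
+	 * writes, then program the color export format and target masks from
+	 * the blend state. */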
radeon_set_context_reg(cmd_buffer->cs, R_028710_SPI_SHADER_Z_FORMAT, + ps->info.fs.writes_stencil ? V_028710_SPI_SHADER_32_GR : + ps->info.fs.writes_z ? V_028710_SPI_SHADER_32_R : + V_028710_SPI_SHADER_ZERO); + + radeon_set_context_reg(cmd_buffer->cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format); + + radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask); + radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask); + + if (ps->info.fs.has_pcoord) { + unsigned val; + val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20); + radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, val); + ps_offset = 1; + } + + for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) { + unsigned vs_offset, flat_shade; + unsigned val; + + if (!(ps->info.fs.input_mask & (1u << i))) + continue; + + + if (!(vs->info.vs.export_mask & (1u << i))) { + radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, + S_028644_OFFSET(0x20)); + ++ps_offset; + continue; + } + + vs_offset = util_bitcount(vs->info.vs.export_mask & ((1u << i) - 1)); + flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset)); + + val = S_028644_OFFSET(vs_offset) | S_028644_FLAT_SHADE(flat_shade); + radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, val); + ++ps_offset; + } +} + +static void +radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) + return; + + radv_emit_graphics_depth_stencil_state(cmd_buffer, pipeline); + radv_emit_graphics_blend_state(cmd_buffer, pipeline); + radv_emit_graphics_raster_state(cmd_buffer, pipeline); + radv_update_multisample_state(cmd_buffer, pipeline); + radv_emit_vertex_shader(cmd_buffer, pipeline); + radv_emit_fragment_shader(cmd_buffer, pipeline); + + radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, + pipeline->graphics.prim_restart_enable); + + cmd_buffer->state.emitted_pipeline = pipeline; +} + +static void +radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer) +{ + si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count, + cmd_buffer->state.dynamic.viewport.viewports); +} + +static void +radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) +{ + uint32_t count = cmd_buffer->state.dynamic.scissor.count; + si_write_scissors(cmd_buffer->cs, 0, count, + cmd_buffer->state.dynamic.scissor.scissors); + radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, + cmd_buffer->state.pipeline->graphics.ms.pa_sc_mode_cntl_0 | S_028A48_VPORT_SCISSOR_ENABLE(count ? 
1 : 0)); +} + +static void +radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, + int index, + struct radv_color_buffer_info *cb) +{ + bool is_vi = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= VI; + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); + radeon_emit(cmd_buffer->cs, cb->cb_color_base); + radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); + radeon_emit(cmd_buffer->cs, cb->cb_color_slice); + radeon_emit(cmd_buffer->cs, cb->cb_color_view); + radeon_emit(cmd_buffer->cs, cb->cb_color_info); + radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); + + if (is_vi) { /* DCC BASE */ + radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); + } +} + +static void +radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_ds_buffer_info *ds, + struct radv_image *image, + VkImageLayout layout) +{ + uint32_t db_z_info = ds->db_z_info; + + if (!radv_layout_has_htile(image, layout)) + db_z_info &= C_028040_TILE_SURFACE_ENABLE; + + if (!radv_layout_can_expclear(image, layout)) + db_z_info &= C_028040_ALLOW_EXPCLEAR & C_028044_ALLOW_EXPCLEAR; + + radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view); + radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */ + radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_info); /* R_028044_DB_STENCIL_INFO */ + radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ + radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ + + radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface); + radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + ds->pa_su_poly_offset_db_fmt_cntl); +} + +/* + * To hw resolve multisample images both src and dst need to have the same + * micro tiling mode. However we don't always know in advance when creating + * the images. This function gets called if we have a resolve attachment, + * and tests if the attachment image has the same tiling mode, then it + * checks if the generated framebuffer data has the same tiling mode, and + * updates it if not. 
+ */ +static void radv_set_optimal_micro_tile_mode(struct radv_device *device, + struct radv_attachment_info *att, + uint32_t micro_tile_mode) +{ + struct radv_image *image = att->attachment->image; + uint32_t tile_mode_index; + if (image->surface.nsamples <= 1) + return; + + if (image->surface.micro_tile_mode != micro_tile_mode) { + radv_image_set_optimal_micro_tile_mode(device, image, micro_tile_mode); + } + + if (att->cb.micro_tile_mode != micro_tile_mode) { + tile_mode_index = image->surface.tiling_index[0]; + + att->cb.cb_color_attrib &= C_028C74_TILE_MODE_INDEX; + att->cb.cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); + att->cb.micro_tile_mode = micro_tile_mode; + } +} + +void +radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkClearDepthStencilValue ds_clear_value, + VkImageAspectFlags aspects) +{ + uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + va += image->offset + image->clear_value_offset; + unsigned reg_offset = 0, reg_count = 0; + + if (!image->htile.size || !aspects) + return; + + if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + ++reg_count; + } else { + ++reg_offset; + va += 4; + } + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) + ++reg_count; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0)); + radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_PFP)); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) + radeon_emit(cmd_buffer->cs, ds_clear_value.stencil); + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) + radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth)); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028028_DB_STENCIL_CLEAR + 4 * reg_offset, reg_count); + if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) + radeon_emit(cmd_buffer->cs, ds_clear_value.stencil); /* R_028028_DB_STENCIL_CLEAR */ + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) + radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth)); /* R_02802C_DB_DEPTH_CLEAR */ +} + +static void +radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image) +{ + uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + va += image->offset + image->clear_value_offset; + + if (!image->htile.size) + return; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG) | + COPY_DATA_COUNT_SEL); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + radeon_emit(cmd_buffer->cs, R_028028_DB_STENCIL_CLEAR >> 2); + radeon_emit(cmd_buffer->cs, 0); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cmd_buffer->cs, 0); +} + +void +radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + int idx, + uint32_t color_values[2]) +{ + uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + va += image->offset + image->clear_value_offset; + + if (!image->cmask.size && !image->surface.dcc_size) + return; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_PFP)); + 
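+	/* WRITE_DATA payload: the destination VA followed by the two clear
+	 * words, which are also written to the CB clear-word registers below. */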
radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + radeon_emit(cmd_buffer->cs, color_values[0]); + radeon_emit(cmd_buffer->cs, color_values[1]); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c, 2); + radeon_emit(cmd_buffer->cs, color_values[0]); + radeon_emit(cmd_buffer->cs, color_values[1]); +} + +static void +radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + int idx) +{ + uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + va += image->offset + image->clear_value_offset; + + if (!image->cmask.size && !image->surface.dcc_size) + return; + + uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c; + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG) | + COPY_DATA_COUNT_SEL); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + radeon_emit(cmd_buffer->cs, reg >> 2); + radeon_emit(cmd_buffer->cs, 0); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cmd_buffer->cs, 0); +} + +void +radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) +{ + int i; + struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; + const struct radv_subpass *subpass = cmd_buffer->state.subpass; + int dst_resolve_micro_tile_mode = -1; + + if (subpass->has_resolve) { + uint32_t a = subpass->resolve_attachments[0].attachment; + const struct radv_image *image = framebuffer->attachments[a].attachment->image; + dst_resolve_micro_tile_mode = image->surface.micro_tile_mode; + } + for (i = 0; i < subpass->color_count; ++i) { + int idx = subpass->color_attachments[i].attachment; + struct radv_attachment_info *att = &framebuffer->attachments[idx]; + + if (dst_resolve_micro_tile_mode != -1) { + radv_set_optimal_micro_tile_mode(cmd_buffer->device, + att, dst_resolve_micro_tile_mode); + } + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8); + + assert(att->attachment->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT); + radv_emit_fb_color_state(cmd_buffer, i, &att->cb); + + radv_load_color_clear_regs(cmd_buffer, att->attachment->image, i); + } + + for (i = subpass->color_count; i < 8; i++) + radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT(V_028C70_COLOR_INVALID)); + + if(subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { + int idx = subpass->depth_stencil_attachment.attachment; + VkImageLayout layout = subpass->depth_stencil_attachment.layout; + struct radv_attachment_info *att = &framebuffer->attachments[idx]; + struct radv_image *image = att->attachment->image; + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8); + + radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout); + + if (att->ds.offset_scale != cmd_buffer->state.offset_scale) { + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; + cmd_buffer->state.offset_scale = att->ds.offset_scale; + } + radv_load_depth_clear_regs(cmd_buffer, image); + } else { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); + radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* R_028040_DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* R_028044_DB_STENCIL_INFO */ + } + radeon_set_context_reg(cmd_buffer->cs, 
R_028208_PA_SC_WINDOW_SCISSOR_BR, + S_028208_BR_X(framebuffer->width) | + S_028208_BR_Y(framebuffer->height)); +} + +void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) +{ + uint32_t db_count_control; + + if(!cmd_buffer->state.active_occlusion_queries) { + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) { + db_count_control = 0; + } else { + db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); + } + } else { + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) { + db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | + S_028004_SAMPLE_RATE(0) | /* TODO: set this to the number of samples of the current framebuffer */ + S_028004_ZPASS_ENABLE(1) | + S_028004_SLICE_EVEN_ENABLE(1) | + S_028004_SLICE_ODD_ENABLE(1); + } else { + db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | + S_028004_SAMPLE_RATE(0); /* TODO: set this to the number of samples of the current framebuffer */ + } + } + + radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control); +} + +static void +radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) { + unsigned width = cmd_buffer->state.dynamic.line_width * 8; + radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL, + S_028A08_WIDTH(CLAMP(width, 0, 0xFFF))); + } + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4); + radeon_emit_array(cmd_buffer->cs, (uint32_t*)d->blend_constants, 4); + } + + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | + RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | + RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2); + radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) | + S_028430_STENCILMASK(d->stencil_compare_mask.front) | + S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) | + S_028430_STENCILOPVAL(1)); + radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) | + S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) | + S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) | + S_028434_STENCILOPVAL_BF(1)); + } + + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_PIPELINE | + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)) { + radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, fui(d->depth_bounds.min)); + radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX, fui(d->depth_bounds.max)); + } + + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_PIPELINE | + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)) { + struct radv_raster_state *raster = &cmd_buffer->state.pipeline->graphics.raster; + unsigned slope = fui(d->depth_bias.slope * 16.0f); + unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale); + + if (G_028814_POLY_OFFSET_FRONT_ENABLE(raster->pa_su_sc_mode_cntl)) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5); + radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */ + radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */ + radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */ + radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */ + radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */ + } + } + + cmd_buffer->state.dirty = 0; +} + +static void +radv_flush_constants(struct 
radv_cmd_buffer *cmd_buffer, + struct radv_pipeline_layout *layout, + VkShaderStageFlags stages) { + unsigned offset; + void *ptr; + uint64_t va; + + stages &= cmd_buffer->push_constant_stages; + if (!stages || !layout || (!layout->push_constant_size && !layout->dynamic_offset_count)) + return; + + radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size + + 16 * layout->dynamic_offset_count, + 256, &offset, &ptr); + + memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size); + memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers, + 16 * layout->dynamic_offset_count); + + va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va += offset; + + if (stages & VK_SHADER_STAGE_VERTEX_BIT) { + radeon_set_sh_reg_seq(cmd_buffer->cs, + R_00B130_SPI_SHADER_USER_DATA_VS_0 + 8 * 4, 2); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + } + + if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) { + radeon_set_sh_reg_seq(cmd_buffer->cs, + R_00B030_SPI_SHADER_USER_DATA_PS_0 + 8 * 4, 2); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + } + + if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { + radeon_set_sh_reg_seq(cmd_buffer->cs, + R_00B900_COMPUTE_USER_DATA_0 + 8 * 4, 2); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + } + + cmd_buffer->push_constant_stages &= ~stages; +} + +static void +radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_device *device = cmd_buffer->device; + uint32_t ia_multi_vgt_param; + uint32_t ls_hs_config = 0; + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, + 4096); + + if ((cmd_buffer->state.vertex_descriptors_dirty || cmd_buffer->state.vb_dirty) && + cmd_buffer->state.pipeline->num_vertex_attribs) { + unsigned vb_offset; + void *vb_ptr; + uint32_t i = 0; + uint32_t num_attribs = cmd_buffer->state.pipeline->num_vertex_attribs; + uint64_t va; + + /* allocate some descriptor state for vertex buffers */ + radv_cmd_buffer_upload_alloc(cmd_buffer, num_attribs * 16, 256, + &vb_offset, &vb_ptr); + + for (i = 0; i < num_attribs; i++) { + uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4]; + uint32_t offset; + int vb = cmd_buffer->state.pipeline->va_binding[i]; + struct radv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; + uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb]; + + device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8); + va = device->ws->buffer_get_va(buffer->bo); + + offset = cmd_buffer->state.vertex_bindings[vb].offset + cmd_buffer->state.pipeline->va_offset[i]; + va += offset + buffer->offset; + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class <= CIK && stride) + desc[2] = (buffer->size - offset - cmd_buffer->state.pipeline->va_format_size[i]) / stride + 1; + else + desc[2] = buffer->size - offset; + desc[3] = cmd_buffer->state.pipeline->va_rsrc_word3[i]; + } + + va = device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va += vb_offset; + radeon_set_sh_reg_seq(cmd_buffer->cs, + R_00B130_SPI_SHADER_USER_DATA_VS_0 + 10 * 4, 2); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + + } + + cmd_buffer->state.vertex_descriptors_dirty = false; + cmd_buffer->state.vb_dirty = 0; + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) + 
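+		/* The bound graphics pipeline changed: re-emit its static register state. */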
radv_emit_graphics_pipeline(cmd_buffer, pipeline); + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RENDER_TARGETS) + radv_emit_framebuffer_state(cmd_buffer); + + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) + radv_emit_viewport(cmd_buffer); + + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) + radv_emit_scissor(cmd_buffer); + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) { + radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, 0); + ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer); + + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) { + radeon_set_context_reg_idx(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); + radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); + radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, cmd_buffer->state.pipeline->graphics.prim); + } else { + radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, cmd_buffer->state.pipeline->graphics.prim); + radeon_set_context_reg(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); + } + radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, cmd_buffer->state.pipeline->graphics.gs_out); + } + + radv_cmd_buffer_flush_dynamic_state(cmd_buffer); + + radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline->layout, + VK_SHADER_STAGE_ALL_GRAPHICS); + + assert(cmd_buffer->cs->cdw <= cdw_max); + + si_emit_cache_flush(cmd_buffer); +} + +static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, + VkPipelineStageFlags src_stage_mask) +{ + if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | + VK_PIPELINE_STAGE_TRANSFER_BIT | + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; + } + + if (src_stage_mask & (VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_TRANSFER_BIT | + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; + } else if (src_stage_mask & (VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT | + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_VERTEX_SHADER_BIT)) { + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; + } +} + +static void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier) +{ + radv_stage_flush(cmd_buffer, barrier->src_stage_mask); + + /* TODO: actual cache flushes */ +} + +static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer, + VkAttachmentReference att) +{ + unsigned idx = att.attachment; + struct radv_image_view *view = cmd_buffer->state.framebuffer->attachments[idx].attachment; + VkImageSubresourceRange range; + range.aspectMask = 0; + range.baseMipLevel = view->base_mip; + range.levelCount = 1; + range.baseArrayLayer = view->base_layer; + range.layerCount = cmd_buffer->state.framebuffer->layers; + + 
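+	/* Transition the attachment image from its current layout to the layout
+	 * required by this subpass; pending clear aspects are passed along as well. */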
radv_handle_image_transition(cmd_buffer, + view->image, + cmd_buffer->state.attachments[idx].current_layout, + att.layout, range, + cmd_buffer->state.attachments[idx].pending_clear_aspects); + + cmd_buffer->state.attachments[idx].current_layout = att.layout; + + +} + +void +radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, + const struct radv_subpass *subpass, bool transitions) +{ + if (transitions) { + radv_subpass_barrier(cmd_buffer, &subpass->start_barrier); + + for (unsigned i = 0; i < subpass->color_count; ++i) { + radv_handle_subpass_image_transition(cmd_buffer, + subpass->color_attachments[i]); + } + + for (unsigned i = 0; i < subpass->input_count; ++i) { + radv_handle_subpass_image_transition(cmd_buffer, + subpass->input_attachments[i]); + } + + if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { + radv_handle_subpass_image_transition(cmd_buffer, + subpass->depth_stencil_attachment); + } + } + + cmd_buffer->state.subpass = subpass; + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RENDER_TARGETS; +} + +static void +radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, + struct radv_render_pass *pass, + const VkRenderPassBeginInfo *info) +{ + struct radv_cmd_state *state = &cmd_buffer->state; + + if (pass->attachment_count == 0) { + state->attachments = NULL; + return; + } + + state->attachments = radv_alloc(&cmd_buffer->pool->alloc, + pass->attachment_count * + sizeof(state->attachments[0]), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (state->attachments == NULL) { + /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ + abort(); + } + + for (uint32_t i = 0; i < pass->attachment_count; ++i) { + struct radv_render_pass_attachment *att = &pass->attachments[i]; + VkImageAspectFlags att_aspects = vk_format_aspects(att->format); + VkImageAspectFlags clear_aspects = 0; + + if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + /* color attachment */ + if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; + } + } else { + /* depthstencil attachment */ + if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && + att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + } + if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + } + + state->attachments[i].pending_clear_aspects = clear_aspects; + if (clear_aspects && info) { + assert(info->clearValueCount > i); + state->attachments[i].clear_value = info->pClearValues[i]; + } + + state->attachments[i].current_layout = att->initial_layout; + } +} + +VkResult radv_AllocateCommandBuffers( + VkDevice _device, + const VkCommandBufferAllocateInfo *pAllocateInfo, + VkCommandBuffer *pCommandBuffers) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool); + + VkResult result = VK_SUCCESS; + uint32_t i; + + for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { + result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, + &pCommandBuffers[i]); + if (result != VK_SUCCESS) + break; + } + + if (result != VK_SUCCESS) + radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, + i, pCommandBuffers); + + return result; +} + +static void +radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer) +{ + list_del(&cmd_buffer->pool_link); + + list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, + &cmd_buffer->upload.list, list) { + 
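+		/* Free every backing BO that was chained onto the upload list when
+		 * the upload buffer had to grow. */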
cmd_buffer->device->ws->buffer_destroy(up->upload_bo); + list_del(&up->list); + free(up); + } + + if (cmd_buffer->upload.upload_bo) + cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo); + cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs); + radv_free(&cmd_buffer->pool->alloc, cmd_buffer); +} + +void radv_FreeCommandBuffers( + VkDevice device, + VkCommandPool commandPool, + uint32_t commandBufferCount, + const VkCommandBuffer *pCommandBuffers) +{ + for (uint32_t i = 0; i < commandBufferCount; i++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]); + + if (cmd_buffer) + radv_cmd_buffer_destroy(cmd_buffer); + } +} + +static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) +{ + + cmd_buffer->device->ws->cs_reset(cmd_buffer->cs); + + list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, + &cmd_buffer->upload.list, list) { + cmd_buffer->device->ws->buffer_destroy(up->upload_bo); + list_del(&up->list); + free(up); + } + + if (cmd_buffer->upload.upload_bo) + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, + cmd_buffer->upload.upload_bo, 8); + cmd_buffer->upload.offset = 0; + + cmd_buffer->record_fail = false; +} + +VkResult radv_ResetCommandBuffer( + VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_reset_cmd_buffer(cmd_buffer); + return VK_SUCCESS; +} + +VkResult radv_BeginCommandBuffer( + VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_reset_cmd_buffer(cmd_buffer); + + memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); + + /* setup initial configuration into command buffer */ + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + /* Flush read caches at the beginning of CS not flushed by the kernel. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_ICACHE | + RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_SMEM_L1 | + RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER | + RADV_CMD_FLAG_INV_GLOBAL_L2; + si_init_config(&cmd_buffer->device->instance->physicalDevice, cmd_buffer); + radv_set_db_count_control(cmd_buffer); + si_emit_cache_flush(cmd_buffer); + } + + if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); + cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); + + struct radv_subpass *subpass = + &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + + radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL); + radv_cmd_buffer_set_subpass(cmd_buffer, subpass, false); + } + + return VK_SUCCESS; +} + +void radv_CmdBindVertexBuffers( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer* pBuffers, + const VkDeviceSize* pOffsets) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; + + /* We have to defer setting up vertex buffer since we need the buffer + * stride from the pipeline. 
*/ + + assert(firstBinding + bindingCount < MAX_VBS); + for (uint32_t i = 0; i < bindingCount; i++) { + vb[firstBinding + i].buffer = radv_buffer_from_handle(pBuffers[i]); + vb[firstBinding + i].offset = pOffsets[i]; + cmd_buffer->state.vb_dirty |= 1 << (firstBinding + i); + } +} + +void radv_CmdBindIndexBuffer( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkIndexType indexType) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->state.index_buffer = radv_buffer_from_handle(buffer); + cmd_buffer->state.index_offset = offset; + cmd_buffer->state.index_type = indexType; /* vk matches hw */ + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->state.index_buffer->bo, 8); +} + + +void radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, + struct radv_descriptor_set *set, + unsigned idx) +{ + struct radeon_winsys *ws = cmd_buffer->device->ws; + + cmd_buffer->state.descriptors[idx] = set; + + if (!set) + return; + + for (unsigned j = 0; j < set->layout->buffer_count; ++j) + if (set->descriptors[j]) + ws->cs_add_buffer(cmd_buffer->cs, set->descriptors[j], 7); + + radeon_set_sh_reg_seq(cmd_buffer->cs, + R_00B030_SPI_SHADER_USER_DATA_PS_0 + 8 * idx, 2); + radeon_emit(cmd_buffer->cs, set->va); + radeon_emit(cmd_buffer->cs, set->va >> 32); + + radeon_set_sh_reg_seq(cmd_buffer->cs, + R_00B130_SPI_SHADER_USER_DATA_VS_0 + 8 * idx, 2); + radeon_emit(cmd_buffer->cs, set->va); + radeon_emit(cmd_buffer->cs, set->va >> 32); + + radeon_set_sh_reg_seq(cmd_buffer->cs, + R_00B900_COMPUTE_USER_DATA_0 + 8 * idx, 2); + radeon_emit(cmd_buffer->cs, set->va); + radeon_emit(cmd_buffer->cs, set->va >> 32); + + if(set->bo) + ws->cs_add_buffer(cmd_buffer->cs, set->bo, 8); +} + +void radv_CmdBindDescriptorSets( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t firstSet, + uint32_t descriptorSetCount, + const VkDescriptorSet* pDescriptorSets, + uint32_t dynamicOffsetCount, + const uint32_t* pDynamicOffsets) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); + unsigned dyn_idx = 0; + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, + MAX_SETS * 4 * 6); + + for (unsigned i = 0; i < descriptorSetCount; ++i) { + unsigned idx = i + firstSet; + RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]); + radv_bind_descriptor_set(cmd_buffer, set, idx); + + for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) { + unsigned idx = j + layout->set[i].dynamic_offset_start; + uint32_t *dst = cmd_buffer->dynamic_buffers + idx * 4; + assert(dyn_idx < dynamicOffsetCount); + + struct radv_descriptor_range *range = set->dynamic_descriptors + j; + uint64_t va = range->va + pDynamicOffsets[dyn_idx]; + dst[0] = va; + dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); + dst[2] = range->size; + dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + cmd_buffer->push_constant_stages |= + set->layout->dynamic_shader_stages; + } + } + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +void radv_CmdPushConstants(VkCommandBuffer commandBuffer, + VkPipelineLayout layout, + VkShaderStageFlags stageFlags, + 
uint32_t offset, + uint32_t size, + const void* pValues) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + memcpy(cmd_buffer->push_constants + offset, pValues, size); + cmd_buffer->push_constant_stages |= stageFlags; +} + +VkResult radv_EndCommandBuffer( + VkCommandBuffer commandBuffer) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + si_emit_cache_flush(cmd_buffer); + if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) || + cmd_buffer->record_fail) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + return VK_SUCCESS; +} + +static void +radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) +{ + struct radeon_winsys *ws = cmd_buffer->device->ws; + struct radv_shader_variant *compute_shader; + struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; + uint64_t va; + + if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline) + return; + + cmd_buffer->state.emitted_compute_pipeline = pipeline; + + compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; + va = ws->buffer_get_va(compute_shader->bo); + + ws->cs_add_buffer(cmd_buffer->cs, compute_shader->bo, 8); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 16); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cmd_buffer->cs, va >> 8); + radeon_emit(cmd_buffer->cs, va >> 40); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(cmd_buffer->cs, compute_shader->rsrc1); + radeon_emit(cmd_buffer->cs, compute_shader->rsrc2); + + /* change these once we have scratch support */ + radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(32) | S_00B860_WAVESIZE(0)); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radeon_emit(cmd_buffer->cs, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0])); + radeon_emit(cmd_buffer->cs, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1])); + radeon_emit(cmd_buffer->cs, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2])); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + + +void radv_CmdBindPipeline( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline _pipeline) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); + + switch (pipelineBindPoint) { + case VK_PIPELINE_BIND_POINT_COMPUTE: + cmd_buffer->state.compute_pipeline = pipeline; + cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT; + break; + case VK_PIPELINE_BIND_POINT_GRAPHICS: + cmd_buffer->state.pipeline = pipeline; + cmd_buffer->state.vertex_descriptors_dirty = true; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE; + cmd_buffer->push_constant_stages |= pipeline->active_stages; + + /* Apply the dynamic state from the pipeline */ + cmd_buffer->state.dirty |= pipeline->dynamic_state_mask; + radv_dynamic_state_copy(&cmd_buffer->state.dynamic, + &pipeline->dynamic_state, + pipeline->dynamic_state_mask); + break; + default: + assert(!"invalid bind point"); + break; + } +} + +void radv_CmdSetViewport( + VkCommandBuffer commandBuffer, + uint32_t firstViewport, + uint32_t viewportCount, + const VkViewport* pViewports) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + const uint32_t total_count = firstViewport + viewportCount; + if (cmd_buffer->state.dynamic.viewport.count < total_count) + cmd_buffer->state.dynamic.viewport.count = total_count; + + 
memcpy(cmd_buffer->state.dynamic.viewport.viewports + firstViewport, + pViewports, viewportCount * sizeof(*pViewports)); + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT; +} + +void radv_CmdSetScissor( + VkCommandBuffer commandBuffer, + uint32_t firstScissor, + uint32_t scissorCount, + const VkRect2D* pScissors) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + const uint32_t total_count = firstScissor + scissorCount; + if (cmd_buffer->state.dynamic.scissor.count < total_count) + cmd_buffer->state.dynamic.scissor.count = total_count; + + memcpy(cmd_buffer->state.dynamic.scissor.scissors + firstScissor, + pScissors, scissorCount * sizeof(*pScissors)); + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; +} + +void radv_CmdSetLineWidth( + VkCommandBuffer commandBuffer, + float lineWidth) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + cmd_buffer->state.dynamic.line_width = lineWidth; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH; +} + +void radv_CmdSetDepthBias( + VkCommandBuffer commandBuffer, + float depthBiasConstantFactor, + float depthBiasClamp, + float depthBiasSlopeFactor) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor; + cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp; + cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor; + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; +} + +void radv_CmdSetBlendConstants( + VkCommandBuffer commandBuffer, + const float blendConstants[4]) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + memcpy(cmd_buffer->state.dynamic.blend_constants, + blendConstants, sizeof(float) * 4); + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; +} + +void radv_CmdSetDepthBounds( + VkCommandBuffer commandBuffer, + float minDepthBounds, + float maxDepthBounds) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds; + cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds; + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS; +} + +void radv_CmdSetStencilCompareMask( + VkCommandBuffer commandBuffer, + VkStencilFaceFlags faceMask, + uint32_t compareMask) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + if (faceMask & VK_STENCIL_FACE_FRONT_BIT) + cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask; + if (faceMask & VK_STENCIL_FACE_BACK_BIT) + cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask; + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK; +} + +void radv_CmdSetStencilWriteMask( + VkCommandBuffer commandBuffer, + VkStencilFaceFlags faceMask, + uint32_t writeMask) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + if (faceMask & VK_STENCIL_FACE_FRONT_BIT) + cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask; + if (faceMask & VK_STENCIL_FACE_BACK_BIT) + cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask; + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK; +} + +void radv_CmdSetStencilReference( + VkCommandBuffer commandBuffer, + VkStencilFaceFlags faceMask, + uint32_t reference) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + if (faceMask & VK_STENCIL_FACE_FRONT_BIT) + cmd_buffer->state.dynamic.stencil_reference.front = reference; + if (faceMask & 
VK_STENCIL_FACE_BACK_BIT) + cmd_buffer->state.dynamic.stencil_reference.back = reference; + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE; +} + + +void radv_CmdExecuteCommands( + VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer* pCmdBuffers) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer); + + for (uint32_t i = 0; i < commandBufferCount; i++) { + RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]); + + primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs); + } + + /* if we execute secondary we need to re-emit out pipelines */ + if (commandBufferCount) { + primary->state.emitted_pipeline = NULL; + primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE; + primary->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL; + } +} + +VkResult radv_CreateCommandPool( + VkDevice _device, + const VkCommandPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkCommandPool* pCmdPool) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_cmd_pool *pool; + + pool = radv_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pool == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + if (pAllocator) + pool->alloc = *pAllocator; + else + pool->alloc = device->alloc; + + list_inithead(&pool->cmd_buffers); + + *pCmdPool = radv_cmd_pool_to_handle(pool); + + return VK_SUCCESS; + +} + +void radv_DestroyCommandPool( + VkDevice _device, + VkCommandPool commandPool, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); + + if (!pool) + return; + + list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, + &pool->cmd_buffers, pool_link) { + radv_cmd_buffer_destroy(cmd_buffer); + } + + radv_free2(&device->alloc, pAllocator, pool); +} + +VkResult radv_ResetCommandPool( + VkDevice device, + VkCommandPool commandPool, + VkCommandPoolResetFlags flags) +{ + RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); + + list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, + &pool->cmd_buffers, pool_link) { + radv_reset_cmd_buffer(cmd_buffer); + } + + return VK_SUCCESS; +} + +void radv_CmdBeginRenderPass( + VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + VkSubpassContents contents) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass); + RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, + 2048); + + cmd_buffer->state.framebuffer = framebuffer; + cmd_buffer->state.pass = pass; + cmd_buffer->state.render_area = pRenderPassBegin->renderArea; + radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin); + + si_emit_cache_flush(cmd_buffer); + + radv_cmd_buffer_set_subpass(cmd_buffer, pass->subpasses, true); + assert(cmd_buffer->cs->cdw <= cdw_max); + + radv_cmd_buffer_clear_subpass(cmd_buffer); +} + +void radv_CmdNextSubpass( + VkCommandBuffer commandBuffer, + VkSubpassContents contents) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + si_emit_cache_flush(cmd_buffer); + radv_cmd_buffer_resolve_subpass(cmd_buffer); + + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, + 2048); + + radv_cmd_buffer_set_subpass(cmd_buffer, cmd_buffer->state.subpass + 1, true); + radv_cmd_buffer_clear_subpass(cmd_buffer); +} + 
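The command-buffer code above and the draw/dispatch paths that follow all use the same low-level pattern: reserve dwords with radeon_check_space(), then append raw PM4 packets with radeon_set_*_reg_seq()/radeon_emit(). Below is a minimal standalone sketch of that pattern, assuming simplified stand-in buffers and illustrative register/opcode constants rather than the real radeon_winsys_cs and sid.h definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative constants; the driver takes the real values from sid.h. */
#define PKT3_SET_SH_REG   0x76
#define SI_SH_REG_OFFSET  0x0000B000
#define R_00B030_SPI_SHADER_USER_DATA_PS_0 0x0000B030

/* PM4 type-3 header: packet type in bits 31:30, payload dword count
 * minus one in bits 29:16, IT opcode in bits 15:8, predicate in bit 0. */
#define PKT3(op, count, predicate) \
	((3u << 30) | ((uint32_t)(count) << 16) | ((uint32_t)(op) << 8) | (predicate))

static uint32_t cs_buf[64];   /* stand-in for the winsys command stream */
static unsigned cs_cdw;       /* stand-in for cs->cdw */

static void emit(uint32_t dw)
{
	cs_buf[cs_cdw++] = dw;
}

/* Mirrors radeon_set_sh_reg_seq(): header, then the register offset
 * relative to the SH register window; 'num' register values follow. */
static void set_sh_reg_seq(unsigned reg, unsigned num)
{
	emit(PKT3(PKT3_SET_SH_REG, num, 0));
	emit((reg - SI_SH_REG_OFFSET) >> 2);
}

int main(void)
{
	/* Load a 64-bit descriptor-set address into two user-data SGPRs,
	 * the way radv_bind_descriptor_set() does for each shader stage. */
	uint64_t va = 0x1234567000ull;

	set_sh_reg_seq(R_00B030_SPI_SHADER_USER_DATA_PS_0, 2);
	emit((uint32_t)va);
	emit((uint32_t)(va >> 32));

	for (unsigned i = 0; i < cs_cdw; i++)
		printf("dw[%u] = 0x%08x\n", i, cs_buf[i]);
	return 0;
}

The inline helpers added in radv_cs.h further down build exactly this header/offset/values sequence for the CONFIG, CONTEXT, SH and UCONFIG register ranges.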
+void radv_CmdDraw( + VkCommandBuffer commandBuffer, + uint32_t vertexCount, + uint32_t instanceCount, + uint32_t firstVertex, + uint32_t firstInstance) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_cmd_buffer_flush_state(cmd_buffer); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + 12 * 4, 2); + radeon_emit(cmd_buffer->cs, firstVertex); + radeon_emit(cmd_buffer->cs, firstInstance); + radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); + radeon_emit(cmd_buffer->cs, instanceCount); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0)); + radeon_emit(cmd_buffer->cs, vertexCount); + radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | + S_0287F0_USE_OPAQUE(0)); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +static void radv_emit_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer) +{ + uint32_t primitive_reset_index = cmd_buffer->state.last_primitive_reset_index ? 0xffffffffu : 0xffffu; + + if (cmd_buffer->state.pipeline->graphics.prim_restart_enable && + primitive_reset_index != cmd_buffer->state.last_primitive_reset_index) { + cmd_buffer->state.last_primitive_reset_index = primitive_reset_index; + radeon_set_context_reg(cmd_buffer->cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, + primitive_reset_index); + } +} + +void radv_CmdDrawIndexed( + VkCommandBuffer commandBuffer, + uint32_t indexCount, + uint32_t instanceCount, + uint32_t firstIndex, + int32_t vertexOffset, + uint32_t firstInstance) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + int index_size = cmd_buffer->state.index_type ? 4 : 2; + uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; + uint64_t index_va; + + radv_cmd_buffer_flush_state(cmd_buffer); + radv_emit_primitive_reset_index(cmd_buffer); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + 12 * 4, 2); + radeon_emit(cmd_buffer->cs, vertexOffset); + radeon_emit(cmd_buffer->cs, firstInstance); + radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); + radeon_emit(cmd_buffer->cs, instanceCount); + + index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo); + index_va += firstIndex * index_size + cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset; + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false)); + radeon_emit(cmd_buffer->cs, index_max_size); + radeon_emit(cmd_buffer->cs, index_va); + radeon_emit(cmd_buffer->cs, (index_va >> 32UL) & 0xFF); + radeon_emit(cmd_buffer->cs, indexCount); + radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +static void +radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t draw_count, + uint32_t stride, + bool indexed) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); + struct radeon_winsys_cs *cs = cmd_buffer->cs; + unsigned di_src_sel = indexed ? 
V_0287F0_DI_SRC_SEL_DMA + : V_0287F0_DI_SRC_SEL_AUTO_INDEX; + uint64_t indirect_va = cmd_buffer->device->ws->buffer_get_va(buffer->bo); + indirect_va += offset + buffer->offset; + + if (!draw_count) + return; + + cmd_buffer->device->ws->cs_add_buffer(cs, buffer->bo, 8); + + radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); + radeon_emit(cs, 1); + radeon_emit(cs, indirect_va); + radeon_emit(cs, indirect_va >> 32); + + radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : + PKT3_DRAW_INDIRECT_MULTI, + 8, false)); + radeon_emit(cs, 0); + radeon_emit(cs, (R_00B160_SPI_SHADER_USER_DATA_VS_12 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, (R_00B164_SPI_SHADER_USER_DATA_VS_13 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, 0); /* draw_index */ + radeon_emit(cs, draw_count); /* count */ + radeon_emit(cs, 0); /* count_addr -- disabled */ + radeon_emit(cs, 0); + radeon_emit(cs, stride); /* stride */ + radeon_emit(cs, di_src_sel); +} + +void radv_CmdDrawIndirect( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_cmd_buffer_flush_state(cmd_buffer); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14); + + radv_emit_indirect_draw(cmd_buffer, _buffer, offset, drawCount, stride, false); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +void radv_CmdDrawIndexedIndirect( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + int index_size = cmd_buffer->state.index_type ? 4 : 2; + uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; + uint64_t index_va; + radv_cmd_buffer_flush_state(cmd_buffer); + radv_emit_primitive_reset_index(cmd_buffer); + + index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo); + index_va += cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset; + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 21); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_BASE, 1, 0)); + radeon_emit(cmd_buffer->cs, index_va); + radeon_emit(cmd_buffer->cs, index_va >> 32); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); + radeon_emit(cmd_buffer->cs, index_max_size); + + radv_emit_indirect_draw(cmd_buffer, _buffer, offset, drawCount, stride, true); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +void radv_CmdDispatch( + VkCommandBuffer commandBuffer, + uint32_t x, + uint32_t y, + uint32_t z) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + radv_emit_compute_pipeline(cmd_buffer); + radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline->layout, + VK_SHADER_STAGE_COMPUTE_BIT); + si_emit_cache_flush(cmd_buffer); + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + 10 * 4, 3); + radeon_emit(cmd_buffer->cs, x); + radeon_emit(cmd_buffer->cs, y); + radeon_emit(cmd_buffer->cs, z); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cmd_buffer->cs, x); + radeon_emit(cmd_buffer->cs, y); + radeon_emit(cmd_buffer->cs, z); + radeon_emit(cmd_buffer->cs, 
1); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +void radv_CmdDispatchIndirect( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); + uint64_t va = cmd_buffer->device->ws->buffer_get_va(buffer->bo); + va += buffer->offset + offset; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8); + + radv_emit_compute_pipeline(cmd_buffer); + radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline->layout, + VK_SHADER_STAGE_COMPUTE_BIT); + si_emit_cache_flush(cmd_buffer); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25); + + for (unsigned i = 0; i < 3; ++i) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG)); + radeon_emit(cmd_buffer->cs, (va + 4 * i)); + radeon_emit(cmd_buffer->cs, (va + 4 * i) >> 32); + radeon_emit(cmd_buffer->cs, (R_00B928_COMPUTE_USER_DATA_10 >> 2) + i); + radeon_emit(cmd_buffer->cs, 0); + } + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cmd_buffer->cs, 1); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cmd_buffer->cs, 0); + radeon_emit(cmd_buffer->cs, 1); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +void radv_unaligned_dispatch( + struct radv_cmd_buffer *cmd_buffer, + uint32_t x, + uint32_t y, + uint32_t z) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; + struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; + uint32_t blocks[3], remainder[3]; + + blocks[0] = round_up_u32(x, compute_shader->info.cs.block_size[0]); + blocks[1] = round_up_u32(y, compute_shader->info.cs.block_size[1]); + blocks[2] = round_up_u32(z, compute_shader->info.cs.block_size[2]); + + /* If aligned, these should be an entire block size, not 0 */ + remainder[0] = x + compute_shader->info.cs.block_size[0] - align_u32_npot(x, compute_shader->info.cs.block_size[0]); + remainder[1] = y + compute_shader->info.cs.block_size[1] - align_u32_npot(y, compute_shader->info.cs.block_size[1]); + remainder[2] = z + compute_shader->info.cs.block_size[2] - align_u32_npot(z, compute_shader->info.cs.block_size[2]); + + radv_emit_compute_pipeline(cmd_buffer); + radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline->layout, + VK_SHADER_STAGE_COMPUTE_BIT); + si_emit_cache_flush(cmd_buffer); + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radeon_emit(cmd_buffer->cs, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); + radeon_emit(cmd_buffer->cs, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); + radeon_emit(cmd_buffer->cs, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + 10 * 4, 3); + radeon_emit(cmd_buffer->cs, blocks[0]); + radeon_emit(cmd_buffer->cs, blocks[1]); + radeon_emit(cmd_buffer->cs, blocks[2]); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + 
PKT3_SHADER_TYPE_S(1)); + radeon_emit(cmd_buffer->cs, blocks[0]); + radeon_emit(cmd_buffer->cs, blocks[1]); + radeon_emit(cmd_buffer->cs, blocks[2]); + radeon_emit(cmd_buffer->cs, S_00B800_COMPUTE_SHADER_EN(1) | + S_00B800_PARTIAL_TG_EN(1)); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +void radv_CmdEndRenderPass( + VkCommandBuffer commandBuffer) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier); + + si_emit_cache_flush(cmd_buffer); + radv_cmd_buffer_resolve_subpass(cmd_buffer); + + for (unsigned i = 0; i < cmd_buffer->state.framebuffer->attachment_count; ++i) { + VkImageLayout layout = cmd_buffer->state.pass->attachments[i].final_layout; + radv_handle_subpass_image_transition(cmd_buffer, + (VkAttachmentReference){i, layout}); + } + + radv_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); + + cmd_buffer->state.pass = NULL; + cmd_buffer->state.subpass = NULL; + cmd_buffer->state.attachments = NULL; + cmd_buffer->state.framebuffer = NULL; +} + + +static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image) +{ + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; + + radv_fill_buffer(cmd_buffer, image->bo, image->offset + image->htile.offset, + image->htile.size, 0xffffffff); + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2; +} + +static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageLayout src_layout, + VkImageLayout dst_layout, + VkImageSubresourceRange range, + VkImageAspectFlags pending_clears) +{ + if (dst_layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL && + (pending_clears & vk_format_aspects(image->vk_format)) == vk_format_aspects(image->vk_format) && + cmd_buffer->state.render_area.offset.x == 0 && cmd_buffer->state.render_area.offset.y == 0 && + cmd_buffer->state.render_area.extent.width == image->extent.width && + cmd_buffer->state.render_area.extent.height == image->extent.height) { + /* The clear will initialize htile. 
*/ + return; + } else if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED && + radv_layout_has_htile(image, dst_layout)) { + /* TODO: merge with the clear if applicable */ + radv_initialize_htile(cmd_buffer, image); + } else if (!radv_layout_has_htile(image, src_layout) && + radv_layout_has_htile(image, dst_layout)) { + radv_initialize_htile(cmd_buffer, image); + } else if ((radv_layout_has_htile(image, src_layout) && + !radv_layout_has_htile(image, dst_layout)) || + (radv_layout_is_htile_compressed(image, src_layout) && + !radv_layout_is_htile_compressed(image, dst_layout))) { + + range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + range.baseMipLevel = 0; + range.levelCount = 1; + + radv_decompress_depth_image_inplace(cmd_buffer, image, &range); + } +} + +void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, uint32_t value) +{ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; + + radv_fill_buffer(cmd_buffer, image->bo, image->offset + image->cmask.offset, + image->cmask.size, value); + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2; +} + +static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageLayout src_layout, + VkImageLayout dst_layout, + VkImageSubresourceRange range, + VkImageAspectFlags pending_clears) +{ + if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + if (image->fmask.size) + radv_initialise_cmask(cmd_buffer, image, 0xccccccccu); + else + radv_initialise_cmask(cmd_buffer, image, 0xffffffffu); + } else if (radv_layout_has_cmask(image, src_layout) && + !radv_layout_has_cmask(image, dst_layout)) { + radv_fast_clear_flush_image_inplace(cmd_buffer, image); + } +} + +void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, uint32_t value) +{ + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; + + radv_fill_buffer(cmd_buffer, image->bo, image->offset + image->dcc_offset, + image->surface.dcc_size, value); + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2; +} + +static void radv_handle_dcc_image_transition(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageLayout src_layout, + VkImageLayout dst_layout, + VkImageSubresourceRange range, + VkImageAspectFlags pending_clears) +{ + if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + radv_initialize_dcc(cmd_buffer, image, 0x20202020u); + } else if(src_layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL && + dst_layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { + radv_fast_clear_flush_image_inplace(cmd_buffer, image); + } +} + +static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageLayout src_layout, + VkImageLayout dst_layout, + VkImageSubresourceRange range, + VkImageAspectFlags pending_clears) +{ + if (image->htile.size) + radv_handle_depth_image_transition(cmd_buffer, image, src_layout, + dst_layout, range, pending_clears); + + if (image->cmask.size) + radv_handle_cmask_image_transition(cmd_buffer, image, src_layout, + dst_layout, range, pending_clears); + + if (image->surface.dcc_size) + radv_handle_dcc_image_transition(cmd_buffer, image, src_layout, + 
dst_layout, range, pending_clears); +} + +void radv_CmdPipelineBarrier( + VkCommandBuffer commandBuffer, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags destStageMask, + VkBool32 byRegion, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + VkAccessFlags src_flags = 0; + VkAccessFlags dst_flags = 0; + uint32_t b; + for (uint32_t i = 0; i < memoryBarrierCount; i++) { + src_flags |= pMemoryBarriers[i].srcAccessMask; + dst_flags |= pMemoryBarriers[i].dstAccessMask; + } + + for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) { + src_flags |= pBufferMemoryBarriers[i].srcAccessMask; + dst_flags |= pBufferMemoryBarriers[i].dstAccessMask; + } + + for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { + RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); + src_flags |= pImageMemoryBarriers[i].srcAccessMask; + dst_flags |= pImageMemoryBarriers[i].dstAccessMask; + + radv_handle_image_transition(cmd_buffer, image, + pImageMemoryBarriers[i].oldLayout, + pImageMemoryBarriers[i].newLayout, + pImageMemoryBarriers[i].subresourceRange, + 0); + } + + enum radv_cmd_flush_bits flush_bits = 0; + + for_each_bit(b, src_flags) { + switch ((VkAccessFlagBits)(1 << b)) { + case VK_ACCESS_SHADER_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2; + break; + case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; + break; + case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; + break; + case VK_ACCESS_TRANSFER_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; + break; + default: + break; + } + } + + for_each_bit(b, dst_flags) { + switch ((VkAccessFlagBits)(1 << b)) { + case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: + case VK_ACCESS_INDEX_READ_BIT: + case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT: + case VK_ACCESS_UNIFORM_READ_BIT: + flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1; + break; + case VK_ACCESS_SHADER_READ_BIT: + flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2; + break; + case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: + case VK_ACCESS_TRANSFER_READ_BIT: + flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER | RADV_CMD_FLAG_INV_GLOBAL_L2; + default: + break; + } + } + + flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_PS_PARTIAL_FLUSH; + + cmd_buffer->state.flush_bits |= flush_bits; +} + + +static void write_event(struct radv_cmd_buffer *cmd_buffer, + struct radv_event *event, + VkPipelineStageFlags stageMask, + unsigned value) +{ + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo); + + cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12); + + /* TODO: this is overkill. Probably should figure something out from + * the stage mask. 
*/ + + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class == CIK) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | + EVENT_INDEX(5)); + radeon_emit(cs, va); + radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); + radeon_emit(cs, 2); + radeon_emit(cs, 0); + } + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | + EVENT_INDEX(5)); + radeon_emit(cs, va); + radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); + radeon_emit(cs, value); + radeon_emit(cs, 0); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + +void radv_CmdSetEvent(VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_event, event, _event); + + write_event(cmd_buffer, event, stageMask, 1); +} + +void radv_CmdResetEvent(VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_event, event, _event); + + write_event(cmd_buffer, event, stageMask, 0); +} + +void radv_CmdWaitEvents(VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radeon_winsys_cs *cs = cmd_buffer->cs; + + for (unsigned i = 0; i < eventCount; ++i) { + RADV_FROM_HANDLE(radv_event, event, pEvents[i]); + uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo); + + cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7); + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 1); /* reference value */ + radeon_emit(cs, 0xffffffff); /* mask */ + radeon_emit(cs, 4); /* poll interval */ + + assert(cmd_buffer->cs->cdw <= cdw_max); + } + + + for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { + RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); + + radv_handle_image_transition(cmd_buffer, image, + pImageMemoryBarriers[i].oldLayout, + pImageMemoryBarriers[i].newLayout, + pImageMemoryBarriers[i].subresourceRange, + 0); + } + + /* TODO: figure out how to do memory barriers without waiting */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER | + RADV_CMD_FLAG_INV_GLOBAL_L2 | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_SMEM_L1; +} diff --git a/src/amd/vulkan/radv_cs.h b/src/amd/vulkan/radv_cs.h new file mode 100644 index 00000000000..6481df1357c --- /dev/null +++ b/src/amd/vulkan/radv_cs.h @@ -0,0 +1,117 @@ +/* + * Copyright © 2016 Red Hat. 
+ * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#include +#include +#include +#include "r600d_common.h" + +static inline unsigned radeon_check_space(struct radeon_winsys *ws, + struct radeon_winsys_cs *cs, + unsigned needed) +{ + if (cs->max_dw - cs->cdw < needed) + ws->cs_grow(cs, needed); + return cs->cdw + needed; +} + +static inline void radeon_set_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg < R600_CONTEXT_REG_OFFSET); + assert(cs->cdw + 2 + num <= cs->max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); + radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2); +} + +static inline void radeon_set_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_config_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + +static inline void radeon_set_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg >= R600_CONTEXT_REG_OFFSET); + assert(cs->cdw + 2 + num <= cs->max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); + radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2); +} + +static inline void radeon_set_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_context_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + + +static inline void radeon_set_context_reg_idx(struct radeon_winsys_cs *cs, + unsigned reg, unsigned idx, + unsigned value) +{ + assert(reg >= R600_CONTEXT_REG_OFFSET); + assert(cs->cdw + 3 <= cs->max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); + radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); +} + +static inline void radeon_set_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); + assert(cs->cdw + 2 + num <= cs->max_dw); + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); + radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); +} + +static inline void radeon_set_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_sh_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + +static inline void radeon_set_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->cdw + 2 + num <= cs->max_dw); + radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); + 
radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); +} + +static inline void radeon_set_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_uconfig_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + +static inline void radeon_set_uconfig_reg_idx(struct radeon_winsys_cs *cs, + unsigned reg, unsigned idx, + unsigned value) +{ + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->cdw + 3 <= cs->max_dw); + radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, 1, 0)); + radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); +} diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c new file mode 100644 index 00000000000..d1d2b1f4ce9 --- /dev/null +++ b/src/amd/vulkan/radv_descriptor_set.c @@ -0,0 +1,716 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include "util/mesa-sha1.h" +#include "radv_private.h" +#include "sid.h" + +VkResult radv_CreateDescriptorSetLayout( + VkDevice _device, + const VkDescriptorSetLayoutCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorSetLayout* pSetLayout) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_descriptor_set_layout *set_layout; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO); + + uint32_t max_binding = 0; + uint32_t immutable_sampler_count = 0; + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + max_binding = MAX(max_binding, pCreateInfo->pBindings[j].binding); + if (pCreateInfo->pBindings[j].pImmutableSamplers) + immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; + } + + size_t size = sizeof(struct radv_descriptor_set_layout) + + (max_binding + 1) * sizeof(set_layout->binding[0]) + + immutable_sampler_count * sizeof(struct radv_sampler *); + + set_layout = radv_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!set_layout) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + /* We just allocate all the samplers at the end of the struct */ + struct radv_sampler **samplers = + (struct radv_sampler **)&set_layout->binding[max_binding + 1]; + + set_layout->binding_count = max_binding + 1; + set_layout->shader_stages = 0; + set_layout->size = 0; + + memset(set_layout->binding, 0, size - sizeof(struct radv_descriptor_set_layout)); + + uint32_t buffer_count = 0; + uint32_t dynamic_offset_count = 0; + + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + uint32_t b = binding->binding; + uint32_t alignment; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + set_layout->binding[b].dynamic_offset_count = 1; + set_layout->dynamic_shader_stages |= binding->stageFlags; + set_layout->binding[b].size = 0; + set_layout->binding[b].buffer_count = 1; + alignment = 1; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + set_layout->binding[b].size = 16; + set_layout->binding[b].buffer_count = 1; + alignment = 16; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + /* main descriptor + fmask descriptor */ + set_layout->binding[b].size = 64; + set_layout->binding[b].buffer_count = 1; + alignment = 32; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* main descriptor + fmask descriptor + sampler */ + set_layout->binding[b].size = 96; + set_layout->binding[b].buffer_count = 1; + alignment = 32; + break; + case VK_DESCRIPTOR_TYPE_SAMPLER: + set_layout->binding[b].size = 16; + alignment = 16; + break; + default: + break; + } + + set_layout->size = align(set_layout->size, alignment); + assert(binding->descriptorCount > 0); + set_layout->binding[b].type = binding->descriptorType; + set_layout->binding[b].array_size = binding->descriptorCount; + set_layout->binding[b].offset = set_layout->size; + set_layout->binding[b].buffer_offset = buffer_count; + set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count; + + set_layout->size += binding->descriptorCount * set_layout->binding[b].size; + buffer_count 
+= binding->descriptorCount * set_layout->binding[b].buffer_count; + dynamic_offset_count += binding->descriptorCount * + set_layout->binding[b].dynamic_offset_count; + + + if (binding->pImmutableSamplers) { + set_layout->binding[b].immutable_samplers = samplers; + samplers += binding->descriptorCount; + + for (uint32_t i = 0; i < binding->descriptorCount; i++) + set_layout->binding[b].immutable_samplers[i] = + radv_sampler_from_handle(binding->pImmutableSamplers[i]); + } else { + set_layout->binding[b].immutable_samplers = NULL; + } + + set_layout->shader_stages |= binding->stageFlags; + } + + set_layout->buffer_count = buffer_count; + set_layout->dynamic_offset_count = dynamic_offset_count; + + *pSetLayout = radv_descriptor_set_layout_to_handle(set_layout); + + return VK_SUCCESS; +} + +void radv_DestroyDescriptorSetLayout( + VkDevice _device, + VkDescriptorSetLayout _set_layout, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_descriptor_set_layout, set_layout, _set_layout); + + if (!set_layout) + return; + + radv_free2(&device->alloc, pAllocator, set_layout); +} + +/* + * Pipeline layouts. These have nothing to do with the pipeline. They are + * just muttiple descriptor set layouts pasted together + */ + +VkResult radv_CreatePipelineLayout( + VkDevice _device, + const VkPipelineLayoutCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineLayout* pPipelineLayout) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_pipeline_layout *layout; + struct mesa_sha1 *ctx; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO); + + layout = radv_alloc2(&device->alloc, pAllocator, sizeof(*layout), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (layout == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + layout->num_sets = pCreateInfo->setLayoutCount; + + unsigned dynamic_offset_count = 0; + + + ctx = _mesa_sha1_init(); + for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { + RADV_FROM_HANDLE(radv_descriptor_set_layout, set_layout, + pCreateInfo->pSetLayouts[set]); + layout->set[set].layout = set_layout; + + layout->set[set].dynamic_offset_start = dynamic_offset_count; + for (uint32_t b = 0; b < set_layout->binding_count; b++) { + dynamic_offset_count += set_layout->binding[b].array_size * set_layout->binding[b].dynamic_offset_count; + } + _mesa_sha1_update(ctx, set_layout->binding, + sizeof(set_layout->binding[0]) * set_layout->binding_count); + } + + layout->dynamic_offset_count = dynamic_offset_count; + layout->push_constant_size = 0; + for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) { + const VkPushConstantRange *range = pCreateInfo->pPushConstantRanges + i; + layout->push_constant_size = MAX2(layout->push_constant_size, + range->offset + range->size); + } + + layout->push_constant_size = align(layout->push_constant_size, 16); + _mesa_sha1_update(ctx, &layout->push_constant_size, + sizeof(layout->push_constant_size)); + _mesa_sha1_final(ctx, layout->sha1); + *pPipelineLayout = radv_pipeline_layout_to_handle(layout); + + return VK_SUCCESS; +} + +void radv_DestroyPipelineLayout( + VkDevice _device, + VkPipelineLayout _pipelineLayout, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, _pipelineLayout); + + if (!pipeline_layout) + return; + radv_free2(&device->alloc, pAllocator, pipeline_layout); +} + +#define EMPTY 1 + 
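The descriptor-set allocator that follows suballocates each set from the pool's buffer object: it bump-allocates from current_offset while the pool still has fresh space, and otherwise takes the first free-list node large enough for the set; destroyed sets push their range back onto that list. Here is a compact standalone model of that bookkeeping, with simplified structs standing in for the radv pool types (node indices are passed explicitly, whereas the driver recycles them through its full_list).

#include <stdint.h>
#include <stdio.h>

#define MAX_NODES 8

struct free_node {
	uint32_t offset;
	uint32_t size;
	int next;
};

struct pool {
	uint32_t size;             /* total BO size */
	uint32_t current_offset;   /* bump pointer for fresh space */
	int free_list;             /* head of reusable ranges, -1 if empty */
	struct free_node nodes[MAX_NODES];
};

/* First try the bump pointer, then a first-fit walk of the free list;
 * returns 0 on success and writes the chosen offset. */
static int pool_alloc(struct pool *p, uint32_t size, uint32_t *offset)
{
	if (p->current_offset + size <= p->size) {
		*offset = p->current_offset;
		p->current_offset += size;
		return 0;
	}

	int prev = -1;
	for (int e = p->free_list; e >= 0; prev = e, e = p->nodes[e].next) {
		if (p->nodes[e].size < size)
			continue;
		if (prev >= 0)
			p->nodes[prev].next = p->nodes[e].next;   /* unlink */
		else
			p->free_list = p->nodes[e].next;
		*offset = p->nodes[e].offset;
		return 0;
	}
	return -1;   /* the driver maps this to VK_ERROR_OUT_OF_DEVICE_MEMORY */
}

/* Mirrors the destroy path: the freed range is pushed onto the free
 * list so a later allocation can reuse it. */
static void pool_free(struct pool *p, int node, uint32_t offset, uint32_t size)
{
	p->nodes[node].offset = offset;
	p->nodes[node].size = size;
	p->nodes[node].next = p->free_list;
	p->free_list = node;
}

int main(void)
{
	struct pool p = { .size = 256, .current_offset = 0, .free_list = -1 };
	uint32_t off;

	for (int i = 0; i < 3; i++) {
		if (pool_alloc(&p, 96, &off) == 0)
			printf("set %d at offset %u\n", i, off);
		else
			printf("set %d: pool exhausted\n", i);
	}

	pool_free(&p, 0, 0, 96);   /* destroy the first set */
	if (pool_alloc(&p, 96, &off) == 0)
		printf("new set reuses offset %u\n", off);
	return 0;
}

Real sets are additionally aligned to 32 bytes and record the BO's GPU VA plus the chosen offset; the sketch only tracks offsets.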
+static VkResult +radv_descriptor_set_create(struct radv_device *device, + struct radv_descriptor_pool *pool, + struct radv_cmd_buffer *cmd_buffer, + const struct radv_descriptor_set_layout *layout, + struct radv_descriptor_set **out_set) +{ + struct radv_descriptor_set *set; + unsigned mem_size = sizeof(struct radv_descriptor_set) + + sizeof(struct radeon_winsys_bo *) * layout->buffer_count; + set = radv_alloc2(&device->alloc, NULL, mem_size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!set) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(set, 0, mem_size); + + if (layout->dynamic_offset_count) { + unsigned size = sizeof(struct radv_descriptor_range) * + layout->dynamic_offset_count; + set->dynamic_descriptors = radv_alloc2(&device->alloc, NULL, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!set->dynamic_descriptors) { + radv_free2(&device->alloc, NULL, set); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + } + + set->layout = layout; + if (layout->size) { + uint32_t layout_size = align_u32(layout->size, 32); + set->size = layout->size; + if (!cmd_buffer) { + if (pool->current_offset + layout_size <= pool->size) { + set->bo = pool->bo; + set->mapped_ptr = (uint32_t*)(pool->mapped_ptr + pool->current_offset); + set->va = device->ws->buffer_get_va(set->bo) + pool->current_offset; + pool->current_offset += layout_size; + + } else { + int entry = pool->free_list, prev_entry = -1; + uint32_t offset; + while (entry >= 0) { + if (pool->free_nodes[entry].size >= layout_size) { + if (prev_entry >= 0) + pool->free_nodes[prev_entry].next = pool->free_nodes[entry].next; + else + pool->free_list = pool->free_nodes[entry].next; + break; + } + prev_entry = entry; + entry = pool->free_nodes[entry].next; + } + + if (entry < 0) { + radv_free2(&device->alloc, NULL, set); + return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + offset = pool->free_nodes[entry].offset; + pool->free_nodes[entry].next = pool->full_list; + pool->full_list = entry; + + set->bo = pool->bo; + set->mapped_ptr = (uint32_t*)(pool->mapped_ptr + offset); + set->va = device->ws->buffer_get_va(set->bo) + offset; + } + } else { + unsigned bo_offset; + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, set->size, 32, + &bo_offset, + (void**)&set->mapped_ptr)) { + radv_free2(&device->alloc, NULL, set->dynamic_descriptors); + radv_free2(&device->alloc, NULL, set); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + + set->va = device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + set->va += bo_offset; + } + } + + if (pool) + list_add(&set->descriptor_pool, &pool->descriptor_sets); + else + list_inithead(&set->descriptor_pool); + + for (unsigned i = 0; i < layout->binding_count; ++i) { + if (!layout->binding[i].immutable_samplers) + continue; + + unsigned offset = layout->binding[i].offset / 4; + if (layout->binding[i].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + offset += 16; + + for (unsigned j = 0; j < layout->binding[i].array_size; ++j) { + struct radv_sampler* sampler = layout->binding[i].immutable_samplers[j]; + + memcpy(set->mapped_ptr + offset, &sampler->state, 16); + offset += layout->binding[i].size / 4; + } + + } + *out_set = set; + return VK_SUCCESS; +} + +static void +radv_descriptor_set_destroy(struct radv_device *device, + struct radv_descriptor_pool *pool, + struct radv_descriptor_set *set, + bool free_bo) +{ + if (free_bo && set->size) { + assert(pool->full_list >= 0); + int next = pool->free_nodes[pool->full_list].next; + pool->free_nodes[pool->full_list].next = pool->free_list; + 
pool->free_nodes[pool->full_list].offset = (uint8_t*)set->mapped_ptr - pool->mapped_ptr; + pool->free_nodes[pool->full_list].size = align_u32(set->size, 32); + pool->free_list = pool->full_list; + pool->full_list = next; + } + if (set->dynamic_descriptors) + radv_free2(&device->alloc, NULL, set->dynamic_descriptors); + if (!list_empty(&set->descriptor_pool)) + list_del(&set->descriptor_pool); + radv_free2(&device->alloc, NULL, set); +} + +VkResult +radv_temp_descriptor_set_create(struct radv_device *device, + struct radv_cmd_buffer *cmd_buffer, + VkDescriptorSetLayout _layout, + VkDescriptorSet *_set) +{ + RADV_FROM_HANDLE(radv_descriptor_set_layout, layout, _layout); + struct radv_descriptor_set *set; + VkResult ret; + + ret = radv_descriptor_set_create(device, NULL, cmd_buffer, layout, &set); + *_set = radv_descriptor_set_to_handle(set); + return ret; +} + +void +radv_temp_descriptor_set_destroy(struct radv_device *device, + VkDescriptorSet _set) +{ + RADV_FROM_HANDLE(radv_descriptor_set, set, _set); + + radv_descriptor_set_destroy(device, NULL, set, false); +} + +VkResult radv_CreateDescriptorPool( + VkDevice _device, + const VkDescriptorPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorPool* pDescriptorPool) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_descriptor_pool *pool; + unsigned max_sets = pCreateInfo->maxSets * 2; + int size = sizeof(struct radv_descriptor_pool) + + max_sets * sizeof(struct radv_descriptor_pool_free_node); + uint64_t bo_size = 0; + pool = radv_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!pool) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(pool, 0, sizeof(*pool)); + + pool->free_list = -1; + pool->full_list = 0; + pool->free_nodes[max_sets - 1].next = -1; + pool->max_sets = max_sets; + + for (int i = 0; i + 1 < max_sets; ++i) + pool->free_nodes[i].next = i + 1; + + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + switch(pCreateInfo->pPoolSizes[i].type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* 32 as we may need to align for images */ + bo_size += 32 * pCreateInfo->pPoolSizes[i].descriptorCount; + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + bo_size += 64 * pCreateInfo->pPoolSizes[i].descriptorCount; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + bo_size += 96 * pCreateInfo->pPoolSizes[i].descriptorCount; + break; + default: + unreachable("unknown descriptor type\n"); + break; + } + } + + if (bo_size) { + pool->bo = device->ws->buffer_create(device->ws, bo_size, + 32, RADEON_DOMAIN_VRAM, 0); + pool->mapped_ptr = (uint8_t*)device->ws->buffer_map(pool->bo); + } + pool->size = bo_size; + + list_inithead(&pool->descriptor_sets); + *pDescriptorPool = radv_descriptor_pool_to_handle(pool); + return VK_SUCCESS; +} + +void radv_DestroyDescriptorPool( + VkDevice _device, + VkDescriptorPool _pool, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_descriptor_pool, pool, _pool); + + if (!pool) + return; + + list_for_each_entry_safe(struct radv_descriptor_set, set, + &pool->descriptor_sets, 
descriptor_pool) { + radv_descriptor_set_destroy(device, pool, set, false); + } + + if (pool->bo) + device->ws->buffer_destroy(pool->bo); + radv_free2(&device->alloc, pAllocator, pool); +} + +VkResult radv_ResetDescriptorPool( + VkDevice _device, + VkDescriptorPool descriptorPool, + VkDescriptorPoolResetFlags flags) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_descriptor_pool, pool, descriptorPool); + + list_for_each_entry_safe(struct radv_descriptor_set, set, + &pool->descriptor_sets, descriptor_pool) { + radv_descriptor_set_destroy(device, pool, set, false); + } + + pool->current_offset = 0; + pool->free_list = -1; + pool->full_list = 0; + pool->free_nodes[pool->max_sets - 1].next = -1; + + for (int i = 0; i + 1 < pool->max_sets; ++i) + pool->free_nodes[i].next = i + 1; + + return VK_SUCCESS; +} + +VkResult radv_AllocateDescriptorSets( + VkDevice _device, + const VkDescriptorSetAllocateInfo* pAllocateInfo, + VkDescriptorSet* pDescriptorSets) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_descriptor_pool, pool, pAllocateInfo->descriptorPool); + + VkResult result = VK_SUCCESS; + uint32_t i; + struct radv_descriptor_set *set; + + /* allocate a set of buffers for each shader to contain descriptors */ + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + RADV_FROM_HANDLE(radv_descriptor_set_layout, layout, + pAllocateInfo->pSetLayouts[i]); + + result = radv_descriptor_set_create(device, pool, NULL, layout, &set); + if (result != VK_SUCCESS) + break; + + pDescriptorSets[i] = radv_descriptor_set_to_handle(set); + } + + if (result != VK_SUCCESS) + radv_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool, + i, pDescriptorSets); + return result; +} + +VkResult radv_FreeDescriptorSets( + VkDevice _device, + VkDescriptorPool descriptorPool, + uint32_t count, + const VkDescriptorSet* pDescriptorSets) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_descriptor_pool, pool, descriptorPool); + + for (uint32_t i = 0; i < count; i++) { + RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]); + + if (set) + radv_descriptor_set_destroy(device, pool, set, true); + } + return VK_SUCCESS; +} + +static void write_texel_buffer_descriptor(struct radv_device *device, + unsigned *dst, + struct radeon_winsys_bo **buffer_list, + const VkBufferView _buffer_view) +{ + RADV_FROM_HANDLE(radv_buffer_view, buffer_view, _buffer_view); + + memcpy(dst, buffer_view->state, 4 * 4); + *buffer_list = buffer_view->bo; +} + +static void write_buffer_descriptor(struct radv_device *device, + unsigned *dst, + struct radeon_winsys_bo **buffer_list, + const VkDescriptorBufferInfo *buffer_info) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, buffer_info->buffer); + uint64_t va = device->ws->buffer_get_va(buffer->bo); + uint32_t range = buffer_info->range; + + if (buffer_info->range == VK_WHOLE_SIZE) + range = buffer->size - buffer_info->offset; + + va += buffer_info->offset + buffer->offset; + dst[0] = va; + dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); + dst[2] = range; + dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + *buffer_list = buffer->bo; +} + +static void write_dynamic_buffer_descriptor(struct radv_device *device, + struct radv_descriptor_range *range, + struct radeon_winsys_bo 
**buffer_list, + const VkDescriptorBufferInfo *buffer_info) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, buffer_info->buffer); + uint64_t va = device->ws->buffer_get_va(buffer->bo); + unsigned size = buffer_info->range; + + if (buffer_info->range == VK_WHOLE_SIZE) + size = buffer->size - buffer_info->offset; + + va += buffer_info->offset + buffer->offset; + range->va = va; + range->size = size; + + *buffer_list = buffer->bo; +} + +static void +write_image_descriptor(struct radv_device *device, + unsigned *dst, + struct radeon_winsys_bo **buffer_list, + const VkDescriptorImageInfo *image_info) +{ + RADV_FROM_HANDLE(radv_image_view, iview, image_info->imageView); + memcpy(dst, iview->descriptor, 8 * 4); + memcpy(dst + 8, iview->fmask_descriptor, 8 * 4); + *buffer_list = iview->bo; +} + +static void +write_combined_image_sampler_descriptor(struct radv_device *device, + unsigned *dst, + struct radeon_winsys_bo **buffer_list, + const VkDescriptorImageInfo *image_info, + bool has_sampler) +{ + RADV_FROM_HANDLE(radv_sampler, sampler, image_info->sampler); + + write_image_descriptor(device, dst, buffer_list, image_info); + /* copy over sampler state */ + if (has_sampler) + memcpy(dst + 16, sampler->state, 16); +} + +static void +write_sampler_descriptor(struct radv_device *device, + unsigned *dst, + const VkDescriptorImageInfo *image_info) +{ + RADV_FROM_HANDLE(radv_sampler, sampler, image_info->sampler); + + memcpy(dst, sampler->state, 16); +} + +void radv_UpdateDescriptorSets( + VkDevice _device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet* pDescriptorCopies) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + uint32_t i, j; + for (i = 0; i < descriptorWriteCount; i++) { + const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i]; + RADV_FROM_HANDLE(radv_descriptor_set, set, writeset->dstSet); + const struct radv_descriptor_set_binding_layout *binding_layout = + set->layout->binding + writeset->dstBinding; + uint32_t *ptr = set->mapped_ptr; + struct radeon_winsys_bo **buffer_list = set->descriptors; + + ptr += binding_layout->offset / 4; + ptr += binding_layout->size * writeset->dstArrayElement / 4; + buffer_list += binding_layout->buffer_offset; + buffer_list += binding_layout->buffer_count * writeset->dstArrayElement; + for (j = 0; j < writeset->descriptorCount; ++j) { + switch(writeset->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + unsigned idx = writeset->dstArrayElement + j; + idx += binding_layout->dynamic_offset_offset; + write_dynamic_buffer_descriptor(device, set->dynamic_descriptors + idx, + buffer_list, writeset->pBufferInfo + j); + break; + } + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + write_buffer_descriptor(device, ptr, buffer_list, + writeset->pBufferInfo + j); + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + write_texel_buffer_descriptor(device, ptr, buffer_list, + writeset->pTexelBufferView[j]); + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + write_image_descriptor(device, ptr, buffer_list, + writeset->pImageInfo + j); + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + write_combined_image_sampler_descriptor(device, ptr, buffer_list, + writeset->pImageInfo + j, + !binding_layout->immutable_samplers); + break; 
+ case VK_DESCRIPTOR_TYPE_SAMPLER: + assert(!binding_layout->immutable_samplers); + write_sampler_descriptor(device, ptr, + writeset->pImageInfo + j); + break; + default: + unreachable("unimplemented descriptor type"); + break; + } + ptr += binding_layout->size / 4; + buffer_list += binding_layout->buffer_count; + } + + } + if (descriptorCopyCount) + radv_finishme("copy descriptors"); +} diff --git a/src/amd/vulkan/radv_descriptor_set.h b/src/amd/vulkan/radv_descriptor_set.h new file mode 100644 index 00000000000..cad1ccf1573 --- /dev/null +++ b/src/amd/vulkan/radv_descriptor_set.h @@ -0,0 +1,81 @@ +/* + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#include + +#define MAX_SETS 8 + +struct radv_descriptor_set_binding_layout { + VkDescriptorType type; + + /* Number of array elements in this binding */ + uint16_t array_size; + + uint16_t offset; + uint16_t buffer_offset; + uint16_t dynamic_offset_offset; + + /* redundant with the type, each for a single array element */ + uint16_t size; + uint16_t buffer_count; + uint16_t dynamic_offset_count; + + /* Immutable samplers (or NULL if no immutable samplers) */ + struct radv_sampler **immutable_samplers; +}; + +struct radv_descriptor_set_layout { + /* Number of bindings in this descriptor set */ + uint16_t binding_count; + + /* Total size of the descriptor set with room for all array entries */ + uint16_t size; + + /* Shader stages affected by this descriptor set */ + uint16_t shader_stages; + uint16_t dynamic_shader_stages; + + /* Number of buffers in this descriptor set */ + uint16_t buffer_count; + + /* Number of dynamic offsets used by this descriptor set */ + uint16_t dynamic_offset_count; + + /* Bindings in this descriptor set */ + struct radv_descriptor_set_binding_layout binding[0]; +}; + +struct radv_pipeline_layout { + struct { + struct radv_descriptor_set_layout *layout; + uint32_t size; + uint32_t dynamic_offset_start; + } set[MAX_SETS]; + + uint32_t num_sets; + uint32_t push_constant_size; + uint32_t dynamic_offset_count; + + unsigned char sha1[20]; +}; diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c new file mode 100644 index 00000000000..e9e00eb9673 --- /dev/null +++ b/src/amd/vulkan/radv_device.c @@ -0,0 +1,1782 @@ +/* + * Copyright © 2016 Red Hat. 
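
The offset and buffer_offset fields in radv_descriptor_set_binding_layout are what the update path above indexes with. A rough sketch of how such per-binding offsets can be accumulated from a VkDescriptorSetLayoutCreateInfo is shown below; the per-type sizes are illustrative placeholders, not the exact values radv assigns, and the real code also tracks buffer counts and dynamic offsets:

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Placeholder descriptor sizes, for illustration only. */
static uint16_t example_descriptor_size(VkDescriptorType type)
{
	switch (type) {
	case VK_DESCRIPTOR_TYPE_SAMPLER:
		return 16;
	case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
		return 80;
	default:
		return 64; /* images, buffers, ... */
	}
}

/* Walk the bindings in creation order and record where each one starts
 * inside the set's mapped memory (simplified: ignores sparse binding
 * numbers, immutable samplers and dynamic descriptors). */
static void example_layout_offsets(const VkDescriptorSetLayoutCreateInfo *info,
                                   uint16_t *offsets /* one per binding */)
{
	uint16_t size = 0;
	for (uint32_t i = 0; i < info->bindingCount; i++) {
		const VkDescriptorSetLayoutBinding *b = &info->pBindings[i];
		offsets[i] = size;
		size += example_descriptor_size(b->descriptorType) *
		        b->descriptorCount;
	}
}
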
+ * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "radv_private.h" +#include "util/strtod.h" + +#include +#include +#include "amdgpu_id.h" +#include "winsys/amdgpu/radv_amdgpu_winsys_public.h" +#include "ac_llvm_util.h" +#include "vk_format.h" +#include "sid.h" +#include "radv_timestamp.h" +#include "util/debug.h" +struct radv_dispatch_table dtable; + +struct radv_fence { + struct radeon_winsys_fence *fence; + bool submitted; + bool signalled; +}; + +static VkResult +radv_physical_device_init(struct radv_physical_device *device, + struct radv_instance *instance, + const char *path) +{ + VkResult result; + int fd; + + fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) + return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to open %s: %m", path); + + device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + device->instance = instance; + assert(strlen(path) < ARRAY_SIZE(device->path)); + strncpy(device->path, path, ARRAY_SIZE(device->path)); + + device->ws = radv_amdgpu_winsys_create(fd); + if (!device->ws) { + result = VK_ERROR_INCOMPATIBLE_DRIVER; + goto fail; + } + device->ws->query_info(device->ws, &device->rad_info); + result = radv_init_wsi(device); + if (result != VK_SUCCESS) + goto fail; + + fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n"); + device->name = device->rad_info.name; + return VK_SUCCESS; + +fail: + close(fd); + return result; +} + +static void +radv_physical_device_finish(struct radv_physical_device *device) +{ + radv_finish_wsi(device); + device->ws->destroy(device->ws); +} + +static const VkExtensionProperties global_extensions[] = { + { + .extensionName = VK_KHR_SURFACE_EXTENSION_NAME, + .specVersion = 25, + }, +#ifdef VK_USE_PLATFORM_XCB_KHR + { + .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME, + .specVersion = 5, + }, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + { + .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME, + .specVersion = 4, + }, +#endif +}; + +static const VkExtensionProperties device_extensions[] = { + { + .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME, + .specVersion = 67, + }, +}; + +static void * +default_alloc_func(void *pUserData, size_t size, size_t align, + VkSystemAllocationScope allocationScope) +{ + return malloc(size); +} + +static void * 
+default_realloc_func(void *pUserData, void *pOriginal, size_t size, + size_t align, VkSystemAllocationScope allocationScope) +{ + return realloc(pOriginal, size); +} + +static void +default_free_func(void *pUserData, void *pMemory) +{ + free(pMemory); +} + +static const VkAllocationCallbacks default_alloc = { + .pUserData = NULL, + .pfnAllocation = default_alloc_func, + .pfnReallocation = default_realloc_func, + .pfnFree = default_free_func, +}; + +VkResult radv_CreateInstance( + const VkInstanceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkInstance* pInstance) +{ + struct radv_instance *instance; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO); + + uint32_t client_version; + if (pCreateInfo->pApplicationInfo && + pCreateInfo->pApplicationInfo->apiVersion != 0) { + client_version = pCreateInfo->pApplicationInfo->apiVersion; + } else { + client_version = VK_MAKE_VERSION(1, 0, 0); + } + + if (VK_MAKE_VERSION(1, 0, 0) > client_version || + client_version > VK_MAKE_VERSION(1, 0, 0xfff)) { + return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER, + "Client requested version %d.%d.%d", + VK_VERSION_MAJOR(client_version), + VK_VERSION_MINOR(client_version), + VK_VERSION_PATCH(client_version)); + } + + for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { + bool found = false; + for (uint32_t j = 0; j < ARRAY_SIZE(global_extensions); j++) { + if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], + global_extensions[j].extensionName) == 0) { + found = true; + break; + } + } + if (!found) + return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT); + } + + instance = radv_alloc2(&default_alloc, pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!instance) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + + if (pAllocator) + instance->alloc = *pAllocator; + else + instance->alloc = default_alloc; + + instance->apiVersion = client_version; + instance->physicalDeviceCount = -1; + + _mesa_locale_init(); + + VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); + + *pInstance = radv_instance_to_handle(instance); + + return VK_SUCCESS; +} + +void radv_DestroyInstance( + VkInstance _instance, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_instance, instance, _instance); + + if (instance->physicalDeviceCount > 0) { + /* We support at most one physical device. */ + assert(instance->physicalDeviceCount == 1); + radv_physical_device_finish(&instance->physicalDevice); + } + + VG(VALGRIND_DESTROY_MEMPOOL(instance)); + + _mesa_locale_fini(); + + radv_free(&instance->alloc, instance); +} + +VkResult radv_EnumeratePhysicalDevices( + VkInstance _instance, + uint32_t* pPhysicalDeviceCount, + VkPhysicalDevice* pPhysicalDevices) +{ + RADV_FROM_HANDLE(radv_instance, instance, _instance); + VkResult result; + + if (instance->physicalDeviceCount < 0) { + char path[20]; + for (unsigned i = 0; i < 8; i++) { + snprintf(path, sizeof(path), "/dev/dri/renderD%d", 128 + i); + result = radv_physical_device_init(&instance->physicalDevice, + instance, path); + if (result != VK_ERROR_INCOMPATIBLE_DRIVER) + break; + } + + if (result == VK_ERROR_INCOMPATIBLE_DRIVER) { + instance->physicalDeviceCount = 0; + } else if (result == VK_SUCCESS) { + instance->physicalDeviceCount = 1; + } else { + return result; + } + } + + /* pPhysicalDeviceCount is an out parameter if pPhysicalDevices is NULL; + * otherwise it's an inout parameter. 
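
radv_EnumeratePhysicalDevices implements the standard count/fill protocol described in the comment above; the usual application-side pattern looks like this (error handling omitted):

#include <stdlib.h>
#include <vulkan/vulkan.h>

/* Typical two-call enumeration; with radv at most one device is returned. */
static VkPhysicalDevice pick_first_device(VkInstance instance)
{
	uint32_t count = 0;
	vkEnumeratePhysicalDevices(instance, &count, NULL);    /* query count */
	if (count == 0)
		return VK_NULL_HANDLE;

	VkPhysicalDevice *devices = malloc(count * sizeof(*devices));
	vkEnumeratePhysicalDevices(instance, &count, devices); /* fill handles */

	VkPhysicalDevice first = devices[0];
	free(devices);
	return first;
}
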
+ * + * The Vulkan spec (git aaed022) says: + * + * pPhysicalDeviceCount is a pointer to an unsigned integer variable + * that is initialized with the number of devices the application is + * prepared to receive handles to. pname:pPhysicalDevices is pointer to + * an array of at least this many VkPhysicalDevice handles [...]. + * + * Upon success, if pPhysicalDevices is NULL, vkEnumeratePhysicalDevices + * overwrites the contents of the variable pointed to by + * pPhysicalDeviceCount with the number of physical devices in in the + * instance; otherwise, vkEnumeratePhysicalDevices overwrites + * pPhysicalDeviceCount with the number of physical handles written to + * pPhysicalDevices. + */ + if (!pPhysicalDevices) { + *pPhysicalDeviceCount = instance->physicalDeviceCount; + } else if (*pPhysicalDeviceCount >= 1) { + pPhysicalDevices[0] = radv_physical_device_to_handle(&instance->physicalDevice); + *pPhysicalDeviceCount = 1; + } else { + *pPhysicalDeviceCount = 0; + } + + return VK_SUCCESS; +} + +void radv_GetPhysicalDeviceFeatures( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures* pFeatures) +{ + // RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + + memset(pFeatures, 0, sizeof(*pFeatures)); + + *pFeatures = (VkPhysicalDeviceFeatures) { + .robustBufferAccess = true, + .fullDrawIndexUint32 = true, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = false, + .tessellationShader = false, + .sampleRateShading = false, + .dualSrcBlend = true, + .logicOp = true, + .multiDrawIndirect = true, + .drawIndirectFirstInstance = true, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = true, + .depthBounds = true, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, + .multiViewport = false, + .samplerAnisotropy = false, /* FINISHME */ + .textureCompressionETC2 = false, + .textureCompressionASTC_LDR = false, + .textureCompressionBC = true, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = false, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = true, + .shaderImageGatherExtended = false, + .shaderStorageImageExtendedFormats = false, + .shaderStorageImageMultisample = false, + .shaderUniformBufferArrayDynamicIndexing = true, + .shaderSampledImageArrayDynamicIndexing = true, + .shaderStorageBufferArrayDynamicIndexing = true, + .shaderStorageImageArrayDynamicIndexing = true, + .shaderStorageImageReadWithoutFormat = false, + .shaderStorageImageWriteWithoutFormat = true, + .shaderClipDistance = true, + .shaderCullDistance = true, + .shaderFloat64 = false, + .shaderInt64 = false, + .shaderInt16 = false, + .alphaToOne = true, + .variableMultisampleRate = false, + .inheritedQueries = false, + }; +} + +void +radv_device_get_cache_uuid(void *uuid) +{ + memset(uuid, 0, VK_UUID_SIZE); + snprintf(uuid, VK_UUID_SIZE, "radv-%s", RADV_TIMESTAMP); +} + +void radv_GetPhysicalDeviceProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties* pProperties) +{ + RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + VkSampleCountFlags sample_counts = 0xf; + VkPhysicalDeviceLimits limits = { + .maxImageDimension1D = (1 << 14), + .maxImageDimension2D = (1 << 14), + .maxImageDimension3D = (1 << 11), + .maxImageDimensionCube = (1 << 14), + .maxImageArrayLayers = (1 << 11), + .maxTexelBufferElements = 128 * 1024 * 1024, + .maxUniformBufferRange = UINT32_MAX, + .maxStorageBufferRange = UINT32_MAX, + .maxPushConstantsSize = 
MAX_PUSH_CONSTANTS_SIZE, + .maxMemoryAllocationCount = UINT32_MAX, + .maxSamplerAllocationCount = 64 * 1024, + .bufferImageGranularity = 64, /* A cache line */ + .sparseAddressSpaceSize = 0, + .maxBoundDescriptorSets = MAX_SETS, + .maxPerStageDescriptorSamplers = 64, + .maxPerStageDescriptorUniformBuffers = 64, + .maxPerStageDescriptorStorageBuffers = 64, + .maxPerStageDescriptorSampledImages = 64, + .maxPerStageDescriptorStorageImages = 64, + .maxPerStageDescriptorInputAttachments = 64, + .maxPerStageResources = 128, + .maxDescriptorSetSamplers = 256, + .maxDescriptorSetUniformBuffers = 256, + .maxDescriptorSetUniformBuffersDynamic = 256, + .maxDescriptorSetStorageBuffers = 256, + .maxDescriptorSetStorageBuffersDynamic = 256, + .maxDescriptorSetSampledImages = 256, + .maxDescriptorSetStorageImages = 256, + .maxDescriptorSetInputAttachments = 256, + .maxVertexInputAttributes = 32, + .maxVertexInputBindings = 32, + .maxVertexInputAttributeOffset = 2047, + .maxVertexInputBindingStride = 2048, + .maxVertexOutputComponents = 128, + .maxTessellationGenerationLevel = 0, + .maxTessellationPatchSize = 0, + .maxTessellationControlPerVertexInputComponents = 0, + .maxTessellationControlPerVertexOutputComponents = 0, + .maxTessellationControlPerPatchOutputComponents = 0, + .maxTessellationControlTotalOutputComponents = 0, + .maxTessellationEvaluationInputComponents = 0, + .maxTessellationEvaluationOutputComponents = 0, + .maxGeometryShaderInvocations = 32, + .maxGeometryInputComponents = 64, + .maxGeometryOutputComponents = 128, + .maxGeometryOutputVertices = 256, + .maxGeometryTotalOutputComponents = 1024, + .maxFragmentInputComponents = 128, + .maxFragmentOutputAttachments = 8, + .maxFragmentDualSrcAttachments = 2, + .maxFragmentCombinedOutputResources = 8, + .maxComputeSharedMemorySize = 32768, + .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, + .maxComputeWorkGroupInvocations = 16 * 1024, + .maxComputeWorkGroupSize = { + 16 * 1024/*devinfo->max_cs_threads*/, + 16 * 1024, + 16 * 1024 + }, + .subPixelPrecisionBits = 4 /* FIXME */, + .subTexelPrecisionBits = 4 /* FIXME */, + .mipmapPrecisionBits = 4 /* FIXME */, + .maxDrawIndexedIndexValue = UINT32_MAX, + .maxDrawIndirectCount = UINT32_MAX, + .maxSamplerLodBias = 16, + .maxSamplerAnisotropy = 16, + .maxViewports = MAX_VIEWPORTS, + .maxViewportDimensions = { (1 << 14), (1 << 14) }, + .viewportBoundsRange = { INT16_MIN, INT16_MAX }, + .viewportSubPixelBits = 13, /* We take a float? 
*/ + .minMemoryMapAlignment = 4096, /* A page */ + .minTexelBufferOffsetAlignment = 1, + .minUniformBufferOffsetAlignment = 4, + .minStorageBufferOffsetAlignment = 4, + .minTexelOffset = -8, + .maxTexelOffset = 7, + .minTexelGatherOffset = -8, + .maxTexelGatherOffset = 7, + .minInterpolationOffset = 0, /* FIXME */ + .maxInterpolationOffset = 0, /* FIXME */ + .subPixelInterpolationOffsetBits = 0, /* FIXME */ + .maxFramebufferWidth = (1 << 14), + .maxFramebufferHeight = (1 << 14), + .maxFramebufferLayers = (1 << 10), + .framebufferColorSampleCounts = sample_counts, + .framebufferDepthSampleCounts = sample_counts, + .framebufferStencilSampleCounts = sample_counts, + .framebufferNoAttachmentsSampleCounts = sample_counts, + .maxColorAttachments = MAX_RTS, + .sampledImageColorSampleCounts = sample_counts, + .sampledImageIntegerSampleCounts = VK_SAMPLE_COUNT_1_BIT, + .sampledImageDepthSampleCounts = sample_counts, + .sampledImageStencilSampleCounts = sample_counts, + .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT, + .maxSampleMaskWords = 1, + .timestampComputeAndGraphics = false, + .timestampPeriod = 100000.0 / pdevice->rad_info.clock_crystal_freq, + .maxClipDistances = 8, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .discreteQueuePriorities = 1, + .pointSizeRange = { 0.125, 255.875 }, + .lineWidthRange = { 0.0, 7.9921875 }, + .pointSizeGranularity = (1.0 / 8.0), + .lineWidthGranularity = (1.0 / 128.0), + .strictLines = false, /* FINISHME */ + .standardSampleLocations = true, + .optimalBufferCopyOffsetAlignment = 128, + .optimalBufferCopyRowPitchAlignment = 128, + .nonCoherentAtomSize = 64, + }; + + *pProperties = (VkPhysicalDeviceProperties) { + .apiVersion = VK_MAKE_VERSION(1, 0, 5), + .driverVersion = 1, + .vendorID = 0x1002, + .deviceID = pdevice->rad_info.pci_id, + .deviceType = VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU, + .limits = limits, + .sparseProperties = {0}, /* Broadwell doesn't do sparse. 
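
These limits are handed to applications verbatim through vkGetPhysicalDeviceProperties; a small sketch of reading a few of them back:

#include <stdio.h>
#include <vulkan/vulkan.h>

/* Print the device name, advertised API version and two of the limits
 * filled in above. */
static void print_some_limits(VkPhysicalDevice pdev)
{
	VkPhysicalDeviceProperties props;
	vkGetPhysicalDeviceProperties(pdev, &props);

	printf("%s: API %u.%u.%u\n", props.deviceName,
	       VK_VERSION_MAJOR(props.apiVersion),
	       VK_VERSION_MINOR(props.apiVersion),
	       VK_VERSION_PATCH(props.apiVersion));
	printf("max bound descriptor sets: %u\n",
	       props.limits.maxBoundDescriptorSets);
	printf("max push constants: %u bytes\n",
	       props.limits.maxPushConstantsSize);
}
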
*/ + }; + + strcpy(pProperties->deviceName, pdevice->name); + radv_device_get_cache_uuid(pProperties->pipelineCacheUUID); +} + +void radv_GetPhysicalDeviceQueueFamilyProperties( + VkPhysicalDevice physicalDevice, + uint32_t* pCount, + VkQueueFamilyProperties* pQueueFamilyProperties) +{ + if (pQueueFamilyProperties == NULL) { + *pCount = 1; + return; + } + assert(*pCount >= 1); + + *pQueueFamilyProperties = (VkQueueFamilyProperties) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = 1, + .timestampValidBits = 64, + .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, + }; +} + +void radv_GetPhysicalDeviceMemoryProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties* pMemoryProperties) +{ + RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); + + pMemoryProperties->memoryTypeCount = 3; + pMemoryProperties->memoryTypes[0] = (VkMemoryType) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .heapIndex = 0, + }; + pMemoryProperties->memoryTypes[1] = (VkMemoryType) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + .heapIndex = 0, + }; + pMemoryProperties->memoryTypes[2] = (VkMemoryType) { + .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT| + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 1, + }; + + pMemoryProperties->memoryHeapCount = 2; + pMemoryProperties->memoryHeaps[0] = (VkMemoryHeap) { + .size = physical_device->rad_info.vram_size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + }; + pMemoryProperties->memoryHeaps[1] = (VkMemoryHeap) { + .size = physical_device->rad_info.gart_size, + .flags = 0, + }; +} + +static VkResult +radv_queue_init(struct radv_device *device, struct radv_queue *queue) +{ + queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + queue->device = device; + + return VK_SUCCESS; +} + +static void +radv_queue_finish(struct radv_queue *queue) +{ +} + +VkResult radv_CreateDevice( + VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDevice* pDevice) +{ + RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); + VkResult result; + struct radv_device *device; + + for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { + bool found = false; + for (uint32_t j = 0; j < ARRAY_SIZE(device_extensions); j++) { + if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], + device_extensions[j].extensionName) == 0) { + found = true; + break; + } + } + if (!found) + return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT); + } + + device = radv_alloc2(&physical_device->instance->alloc, pAllocator, + sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + device->instance = physical_device->instance; + + device->ws = physical_device->ws; + if (pAllocator) + device->alloc = *pAllocator; + else + device->alloc = physical_device->instance->alloc; + + device->hw_ctx = device->ws->ctx_create(device->ws); + if (!device->hw_ctx) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail_free; + } + + radv_queue_init(device, &device->queue); + + result = radv_device_init_meta(device); + if (result != VK_SUCCESS) { + device->ws->ctx_destroy(device->hw_ctx); + goto fail_free; + } + device->allow_fast_clears = env_var_as_boolean("RADV_FAST_CLEARS", 
false); + device->allow_dcc = !env_var_as_boolean("RADV_DCC_DISABLE", false); + + if (device->allow_fast_clears && device->allow_dcc) + radv_finishme("DCC fast clears have not been tested\n"); + + radv_device_init_msaa(device); + device->empty_cs = device->ws->cs_create(device->ws, RING_GFX); + radeon_emit(device->empty_cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + radeon_emit(device->empty_cs, CONTEXT_CONTROL_LOAD_ENABLE(1)); + radeon_emit(device->empty_cs, CONTEXT_CONTROL_SHADOW_ENABLE(1)); + device->ws->cs_finalize(device->empty_cs); + *pDevice = radv_device_to_handle(device); + return VK_SUCCESS; +fail_free: + radv_free(&device->alloc, device); + return result; +} + +void radv_DestroyDevice( + VkDevice _device, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + + device->ws->ctx_destroy(device->hw_ctx); + radv_queue_finish(&device->queue); + radv_device_finish_meta(device); + + radv_free(&device->alloc, device); +} + +VkResult radv_EnumerateInstanceExtensionProperties( + const char* pLayerName, + uint32_t* pPropertyCount, + VkExtensionProperties* pProperties) +{ + unsigned i; + if (pProperties == NULL) { + *pPropertyCount = ARRAY_SIZE(global_extensions); + return VK_SUCCESS; + } + + for (i = 0; i < *pPropertyCount; i++) + memcpy(&pProperties[i], &global_extensions[i], sizeof(VkExtensionProperties)); + + *pPropertyCount = i; + if (i < ARRAY_SIZE(global_extensions)) + return VK_INCOMPLETE; + + return VK_SUCCESS; +} + +VkResult radv_EnumerateDeviceExtensionProperties( + VkPhysicalDevice physicalDevice, + const char* pLayerName, + uint32_t* pPropertyCount, + VkExtensionProperties* pProperties) +{ + unsigned i; + + if (pProperties == NULL) { + *pPropertyCount = ARRAY_SIZE(device_extensions); + return VK_SUCCESS; + } + + for (i = 0; i < *pPropertyCount; i++) + memcpy(&pProperties[i], &device_extensions[i], sizeof(VkExtensionProperties)); + + *pPropertyCount = i; + if (i < ARRAY_SIZE(device_extensions)) + return VK_INCOMPLETE; + return VK_SUCCESS; +} + +VkResult radv_EnumerateInstanceLayerProperties( + uint32_t* pPropertyCount, + VkLayerProperties* pProperties) +{ + if (pProperties == NULL) { + *pPropertyCount = 0; + return VK_SUCCESS; + } + + /* None supported at this time */ + return vk_error(VK_ERROR_LAYER_NOT_PRESENT); +} + +VkResult radv_EnumerateDeviceLayerProperties( + VkPhysicalDevice physicalDevice, + uint32_t* pPropertyCount, + VkLayerProperties* pProperties) +{ + if (pProperties == NULL) { + *pPropertyCount = 0; + return VK_SUCCESS; + } + + /* None supported at this time */ + return vk_error(VK_ERROR_LAYER_NOT_PRESENT); +} + +void radv_GetDeviceQueue( + VkDevice _device, + uint32_t queueNodeIndex, + uint32_t queueIndex, + VkQueue* pQueue) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + + assert(queueIndex == 0); + + *pQueue = radv_queue_to_handle(&device->queue); +} + +VkResult radv_QueueSubmit( + VkQueue _queue, + uint32_t submitCount, + const VkSubmitInfo* pSubmits, + VkFence _fence) +{ + RADV_FROM_HANDLE(radv_queue, queue, _queue); + RADV_FROM_HANDLE(radv_fence, fence, _fence); + struct radeon_winsys_fence *base_fence = fence ? 
fence->fence : NULL; + struct radeon_winsys_ctx *ctx = queue->device->hw_ctx; + int ret; + + for (uint32_t i = 0; i < submitCount; i++) { + struct radeon_winsys_cs **cs_array; + bool can_patch = true; + + if (!pSubmits[i].commandBufferCount) + continue; + + cs_array = malloc(sizeof(struct radeon_winsys_cs *) * + pSubmits[i].commandBufferCount); + + for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, + pSubmits[i].pCommandBuffers[j]); + assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + + cs_array[j] = cmd_buffer->cs; + if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) + can_patch = false; + } + ret = queue->device->ws->cs_submit(ctx, cs_array, + pSubmits[i].commandBufferCount, + can_patch, base_fence); + if (ret) + radv_loge("failed to submit CS %d\n", i); + free(cs_array); + } + + if (fence) { + if (!submitCount) + ret = queue->device->ws->cs_submit(ctx, &queue->device->empty_cs, + 1, false, base_fence); + + fence->submitted = true; + } + + return VK_SUCCESS; +} + +VkResult radv_QueueWaitIdle( + VkQueue _queue) +{ + RADV_FROM_HANDLE(radv_queue, queue, _queue); + + queue->device->ws->ctx_wait_idle(queue->device->hw_ctx); + return VK_SUCCESS; +} + +VkResult radv_DeviceWaitIdle( + VkDevice _device) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + + device->ws->ctx_wait_idle(device->hw_ctx); + return VK_SUCCESS; +} + +PFN_vkVoidFunction radv_GetInstanceProcAddr( + VkInstance instance, + const char* pName) +{ + return radv_lookup_entrypoint(pName); +} + +/* The loader wants us to expose a second GetInstanceProcAddr function + * to work around certain LD_PRELOAD issues seen in apps. + */ +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr( + VkInstance instance, + const char* pName); + +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr( + VkInstance instance, + const char* pName) +{ + return radv_GetInstanceProcAddr(instance, pName); +} + +PFN_vkVoidFunction radv_GetDeviceProcAddr( + VkDevice device, + const char* pName) +{ + return radv_lookup_entrypoint(pName); +} + +VkResult radv_AllocateMemory( + VkDevice _device, + const VkMemoryAllocateInfo* pAllocateInfo, + const VkAllocationCallbacks* pAllocator, + VkDeviceMemory* pMem) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_device_memory *mem; + VkResult result; + enum radeon_bo_domain domain; + uint32_t flags = 0; + assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); + + if (pAllocateInfo->allocationSize == 0) { + /* Apparently, this is allowed */ + *pMem = VK_NULL_HANDLE; + return VK_SUCCESS; + } + + mem = radv_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (mem == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096); + if (pAllocateInfo->memoryTypeIndex == 2) + domain = RADEON_DOMAIN_GTT; + else + domain = RADEON_DOMAIN_VRAM; + + if (pAllocateInfo->memoryTypeIndex == 0) + flags |= RADEON_FLAG_NO_CPU_ACCESS; + else + flags |= RADEON_FLAG_CPU_ACCESS; + mem->bo = device->ws->buffer_create(device->ws, alloc_size, 32768, + domain, flags); + + if (!mem->bo) { + result = VK_ERROR_OUT_OF_DEVICE_MEMORY; + goto fail; + } + mem->type_index = pAllocateInfo->memoryTypeIndex; + + *pMem = radv_device_memory_to_handle(mem); + + return VK_SUCCESS; + +fail: + radv_free2(&device->alloc, pAllocator, mem); + + return result; +} + +void 
radv_FreeMemory( + VkDevice _device, + VkDeviceMemory _mem, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_device_memory, mem, _mem); + + if (mem == NULL) + return; + + device->ws->buffer_destroy(mem->bo); + mem->bo = NULL; + + radv_free2(&device->alloc, pAllocator, mem); +} + +VkResult radv_MapMemory( + VkDevice _device, + VkDeviceMemory _memory, + VkDeviceSize offset, + VkDeviceSize size, + VkMemoryMapFlags flags, + void** ppData) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_device_memory, mem, _memory); + + if (mem == NULL) { + *ppData = NULL; + return VK_SUCCESS; + } + + *ppData = device->ws->buffer_map(mem->bo); + if (*ppData) { + *ppData += offset; + return VK_SUCCESS; + } + + return VK_ERROR_MEMORY_MAP_FAILED; +} + +void radv_UnmapMemory( + VkDevice _device, + VkDeviceMemory _memory) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_device_memory, mem, _memory); + + if (mem == NULL) + return; + + device->ws->buffer_unmap(mem->bo); +} + +VkResult radv_FlushMappedMemoryRanges( + VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange* pMemoryRanges) +{ + return VK_SUCCESS; +} + +VkResult radv_InvalidateMappedMemoryRanges( + VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange* pMemoryRanges) +{ + return VK_SUCCESS; +} + +void radv_GetBufferMemoryRequirements( + VkDevice device, + VkBuffer _buffer, + VkMemoryRequirements* pMemoryRequirements) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); + + /* The Vulkan spec (git aaed022) says: + * + * memoryTypeBits is a bitfield and contains one bit set for every + * supported memory type for the resource. The bit `1<memoryTypeBits = 0x7; + + pMemoryRequirements->size = buffer->size; + pMemoryRequirements->alignment = 16; +} + +void radv_GetImageMemoryRequirements( + VkDevice device, + VkImage _image, + VkMemoryRequirements* pMemoryRequirements) +{ + RADV_FROM_HANDLE(radv_image, image, _image); + + /* The Vulkan spec (git aaed022) says: + * + * memoryTypeBits is a bitfield and contains one bit set for every + * supported memory type for the resource. 
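
With memoryTypeBits reported as 0x7, all three memory types advertised in radv_GetPhysicalDeviceMemoryProperties are candidates; applications typically narrow that down with the usual selection loop, sketched here:

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Return the first memory type allowed by 'type_bits' that has all of the
 * 'wanted' property flags, or -1 if none does. */
static int32_t find_memory_type(VkPhysicalDevice pdev, uint32_t type_bits,
                                VkMemoryPropertyFlags wanted)
{
	VkPhysicalDeviceMemoryProperties mem;
	vkGetPhysicalDeviceMemoryProperties(pdev, &mem);

	for (uint32_t i = 0; i < mem.memoryTypeCount; i++) {
		if ((type_bits & (1u << i)) &&
		    (mem.memoryTypes[i].propertyFlags & wanted) == wanted)
			return (int32_t)i; /* e.g. index 2 for HOST_VISIBLE|HOST_CACHED */
	}
	return -1;
}
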
The bit `1<memoryTypeBits = 0x7; + + pMemoryRequirements->size = image->size; + pMemoryRequirements->alignment = image->alignment; +} + +void radv_GetImageSparseMemoryRequirements( + VkDevice device, + VkImage image, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements* pSparseMemoryRequirements) +{ + stub(); +} + +void radv_GetDeviceMemoryCommitment( + VkDevice device, + VkDeviceMemory memory, + VkDeviceSize* pCommittedMemoryInBytes) +{ + *pCommittedMemoryInBytes = 0; +} + +VkResult radv_BindBufferMemory( + VkDevice device, + VkBuffer _buffer, + VkDeviceMemory _memory, + VkDeviceSize memoryOffset) +{ + RADV_FROM_HANDLE(radv_device_memory, mem, _memory); + RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); + + if (mem) { + buffer->bo = mem->bo; + buffer->offset = memoryOffset; + } else { + buffer->bo = NULL; + buffer->offset = 0; + } + + return VK_SUCCESS; +} + +VkResult radv_BindImageMemory( + VkDevice device, + VkImage _image, + VkDeviceMemory _memory, + VkDeviceSize memoryOffset) +{ + RADV_FROM_HANDLE(radv_device_memory, mem, _memory); + RADV_FROM_HANDLE(radv_image, image, _image); + + if (mem) { + image->bo = mem->bo; + image->offset = memoryOffset; + } else { + image->bo = NULL; + image->offset = 0; + } + + return VK_SUCCESS; +} + +VkResult radv_QueueBindSparse( + VkQueue queue, + uint32_t bindInfoCount, + const VkBindSparseInfo* pBindInfo, + VkFence fence) +{ + stub_return(VK_ERROR_INCOMPATIBLE_DRIVER); +} + +VkResult radv_CreateFence( + VkDevice _device, + const VkFenceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkFence* pFence) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_fence *fence = radv_alloc2(&device->alloc, pAllocator, + sizeof(*fence), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!fence) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + memset(fence, 0, sizeof(*fence)); + fence->submitted = false; + fence->signalled = !!(pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT); + fence->fence = device->ws->create_fence(); + + + *pFence = radv_fence_to_handle(fence); + + return VK_SUCCESS; +} + +void radv_DestroyFence( + VkDevice _device, + VkFence _fence, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_fence, fence, _fence); + + if (!fence) + return; + device->ws->destroy_fence(fence->fence); + radv_free2(&device->alloc, pAllocator, fence); +} + +static uint64_t radv_get_absolute_timeout(uint64_t timeout) +{ + uint64_t current_time; + struct timespec tv; + + clock_gettime(CLOCK_MONOTONIC, &tv); + current_time = tv.tv_nsec + tv.tv_sec*1000000000ull; + + timeout = MIN2(UINT64_MAX - current_time, timeout); + + return current_time + timeout; +} + +VkResult radv_WaitForFences( + VkDevice _device, + uint32_t fenceCount, + const VkFence* pFences, + VkBool32 waitAll, + uint64_t timeout) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + timeout = radv_get_absolute_timeout(timeout); + + if (!waitAll && fenceCount > 1) { + fprintf(stderr, "radv: WaitForFences without waitAll not implemented yet\n"); + } + + for (uint32_t i = 0; i < fenceCount; ++i) { + RADV_FROM_HANDLE(radv_fence, fence, pFences[i]); + bool expired = false; + + if (!fence->submitted) + return VK_TIMEOUT; + + if (fence->signalled) + continue; + + expired = device->ws->fence_wait(device->ws, fence->fence, true, timeout); + if (!expired) + return VK_TIMEOUT; + + fence->signalled = true; + } + + return VK_SUCCESS; +} + +VkResult radv_ResetFences(VkDevice device, + uint32_t fenceCount, + 
const VkFence *pFences) +{ + for (unsigned i = 0; i < fenceCount; ++i) { + RADV_FROM_HANDLE(radv_fence, fence, pFences[i]); + fence->submitted = fence->signalled = false; + } + + return VK_SUCCESS; +} + +VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_fence, fence, _fence); + + if (!fence->submitted) + return VK_NOT_READY; + + if (!device->ws->fence_wait(device->ws, fence->fence, false, 0)) + return VK_NOT_READY; + + return VK_SUCCESS; +} + + +// Queue semaphore functions + +VkResult radv_CreateSemaphore( + VkDevice device, + const VkSemaphoreCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSemaphore* pSemaphore) +{ + /* The DRM execbuffer ioctl always execute in-oder, even between different + * rings. As such, there's nothing to do for the user space semaphore. + */ + + *pSemaphore = (VkSemaphore)1; + + return VK_SUCCESS; +} + +void radv_DestroySemaphore( + VkDevice device, + VkSemaphore semaphore, + const VkAllocationCallbacks* pAllocator) +{ +} + +VkResult radv_CreateEvent( + VkDevice _device, + const VkEventCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkEvent* pEvent) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_event *event = radv_alloc2(&device->alloc, pAllocator, + sizeof(*event), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!event) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + event->bo = device->ws->buffer_create(device->ws, 8, 8, + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS); + if (!event->bo) { + radv_free2(&device->alloc, pAllocator, event); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + event->map = (uint64_t*)device->ws->buffer_map(event->bo); + + *pEvent = radv_event_to_handle(event); + + return VK_SUCCESS; +} + +void radv_DestroyEvent( + VkDevice _device, + VkEvent _event, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_event, event, _event); + + if (!event) + return; + device->ws->buffer_destroy(event->bo); + radv_free2(&device->alloc, pAllocator, event); +} + +VkResult radv_GetEventStatus( + VkDevice _device, + VkEvent _event) +{ + RADV_FROM_HANDLE(radv_event, event, _event); + + if (*event->map == 1) + return VK_EVENT_SET; + return VK_EVENT_RESET; +} + +VkResult radv_SetEvent( + VkDevice _device, + VkEvent _event) +{ + RADV_FROM_HANDLE(radv_event, event, _event); + *event->map = 1; + + return VK_SUCCESS; +} + +VkResult radv_ResetEvent( + VkDevice _device, + VkEvent _event) +{ + RADV_FROM_HANDLE(radv_event, event, _event); + *event->map = 0; + + return VK_SUCCESS; +} + +VkResult radv_CreateBuffer( + VkDevice _device, + const VkBufferCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkBuffer* pBuffer) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_buffer *buffer; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO); + + buffer = radv_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (buffer == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + buffer->size = pCreateInfo->size; + buffer->usage = pCreateInfo->usage; + buffer->bo = NULL; + buffer->offset = 0; + + *pBuffer = radv_buffer_to_handle(buffer); + + return VK_SUCCESS; +} + +void radv_DestroyBuffer( + VkDevice _device, + VkBuffer _buffer, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_buffer, buffer, 
_buffer); + + if (!buffer) + return; + + radv_free2(&device->alloc, pAllocator, buffer); +} + +static inline unsigned +si_tile_mode_index(const struct radv_image *image, unsigned level, bool stencil) +{ + if (stencil) + return image->surface.stencil_tiling_index[level]; + else + return image->surface.tiling_index[level]; +} + +static void +radv_initialise_color_surface(struct radv_device *device, + struct radv_color_buffer_info *cb, + struct radv_image_view *iview) +{ + const struct vk_format_description *desc; + unsigned ntype, format, swap, endian; + unsigned blend_clamp = 0, blend_bypass = 0; + unsigned pitch_tile_max, slice_tile_max, tile_mode_index; + uint64_t va; + const struct radeon_surf *surf = &iview->image->surface; + const struct radeon_surf_level *level_info = &surf->level[iview->base_mip]; + + desc = vk_format_description(iview->vk_format); + + memset(cb, 0, sizeof(*cb)); + + va = device->ws->buffer_get_va(iview->bo) + iview->image->offset; + va += level_info->offset; + cb->cb_color_base = va >> 8; + + /* CMASK variables */ + va = device->ws->buffer_get_va(iview->bo) + iview->image->offset; + va += iview->image->cmask.offset; + cb->cb_color_cmask = va >> 8; + cb->cb_color_cmask_slice = iview->image->cmask.slice_tile_max; + + va = device->ws->buffer_get_va(iview->bo) + iview->image->offset; + va += iview->image->dcc_offset; + cb->cb_dcc_base = va >> 8; + + cb->cb_color_view = S_028C6C_SLICE_START(iview->base_layer) | + S_028C6C_SLICE_MAX(iview->base_layer + iview->extent.depth - 1); + + cb->micro_tile_mode = iview->image->surface.micro_tile_mode; + pitch_tile_max = level_info->nblk_x / 8 - 1; + slice_tile_max = (level_info->nblk_x * level_info->nblk_y) / 64 - 1; + tile_mode_index = si_tile_mode_index(iview->image, iview->base_mip, false); + + cb->cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); + cb->cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); + + /* Intensity is implemented as Red, so treat it that way. */ + cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == VK_SWIZZLE_1) | + S_028C74_TILE_MODE_INDEX(tile_mode_index); + + if (iview->image->samples > 1) { + unsigned log_samples = util_logbase2(iview->image->samples); + + cb->cb_color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | + S_028C74_NUM_FRAGMENTS(log_samples); + } + + if (iview->image->fmask.size) { + va = device->ws->buffer_get_va(iview->bo) + iview->image->offset + iview->image->fmask.offset; + if (device->instance->physicalDevice.rad_info.chip_class >= CIK) + cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(iview->image->fmask.pitch_in_pixels / 8 - 1); + cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(iview->image->fmask.tile_mode_index); + cb->cb_color_fmask = va >> 8; + cb->cb_color_fmask_slice = S_028C88_TILE_MAX(iview->image->fmask.slice_tile_max); + } else { + /* This must be set for fast clear to work without FMASK. 
*/ + if (device->instance->physicalDevice.rad_info.chip_class >= CIK) + cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); + cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); + cb->cb_color_fmask = cb->cb_color_base; + cb->cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); + } + + ntype = radv_translate_color_numformat(iview->vk_format, + desc, + vk_format_get_first_non_void_channel(iview->vk_format)); + format = radv_translate_colorformat(iview->vk_format); + if (format == V_028C70_COLOR_INVALID || ntype == ~0u) + radv_finishme("Illegal color\n"); + swap = radv_translate_colorswap(iview->vk_format, FALSE); + endian = radv_colorformat_endian_swap(format); + + /* blend clamp should be set for all NORM/SRGB types */ + if (ntype == V_028C70_NUMBER_UNORM || + ntype == V_028C70_NUMBER_SNORM || + ntype == V_028C70_NUMBER_SRGB) + blend_clamp = 1; + + /* set blend bypass according to docs if SINT/UINT or + 8/24 COLOR variants */ + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || + format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || + format == V_028C70_COLOR_X24_8_32_FLOAT) { + blend_clamp = 0; + blend_bypass = 1; + } +#if 0 + if ((ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) && + (format == V_028C70_COLOR_8 || + format == V_028C70_COLOR_8_8 || + format == V_028C70_COLOR_8_8_8_8)) + ->color_is_int8 = true; +#endif + cb->cb_color_info = S_028C70_FORMAT(format) | + S_028C70_COMP_SWAP(swap) | + S_028C70_BLEND_CLAMP(blend_clamp) | + S_028C70_BLEND_BYPASS(blend_bypass) | + S_028C70_SIMPLE_FLOAT(1) | + S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && + ntype != V_028C70_NUMBER_SNORM && + ntype != V_028C70_NUMBER_SRGB && + format != V_028C70_COLOR_8_24 && + format != V_028C70_COLOR_24_8) | + S_028C70_NUMBER_TYPE(ntype) | + S_028C70_ENDIAN(endian); + if (iview->image->samples > 1) + if (iview->image->fmask.size) + cb->cb_color_info |= S_028C70_COMPRESSION(1); + + if (iview->image->cmask.size && device->allow_fast_clears) + cb->cb_color_info |= S_028C70_FAST_CLEAR(1); + + if (iview->image->surface.dcc_size && level_info->dcc_enabled) + cb->cb_color_info |= S_028C70_DCC_ENABLE(1); + + if (device->instance->physicalDevice.rad_info.chip_class >= VI) { + unsigned max_uncompressed_block_size = 2; + if (iview->image->samples > 1) { + if (iview->image->surface.bpe == 1) + max_uncompressed_block_size = 0; + else if (iview->image->surface.bpe == 2) + max_uncompressed_block_size = 1; + } + + cb->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(1); + } + + /* This must be set for fast clear to work without FMASK. 
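
The S_*/C_* helpers used throughout these surface setup routines all follow the same pack/clear pattern generated from the register database; as a rough illustration with a made-up field layout (the real shifts and masks live in sid.h):

#include <stdint.h>

/* Hypothetical 5-bit field at bits [6:2] of a 32-bit register. */
#define S_EXAMPLE_FORMAT(x)  (((uint32_t)(x) & 0x1f) << 2)
#define G_EXAMPLE_FORMAT(x)  (((uint32_t)(x) >> 2) & 0x1f)
#define C_EXAMPLE_FORMAT     0xffffff83u /* clears the field */

/* Replace the field's current value with 'fmt'. */
static uint32_t set_example_format(uint32_t reg, uint32_t fmt)
{
	return (reg & C_EXAMPLE_FORMAT) | S_EXAMPLE_FORMAT(fmt);
}
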
*/ + if (!iview->image->fmask.size && + device->instance->physicalDevice.rad_info.chip_class == SI) { + unsigned bankh = util_logbase2(iview->image->surface.bankh); + cb->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); + } +} + +static void +radv_initialise_ds_surface(struct radv_device *device, + struct radv_ds_buffer_info *ds, + struct radv_image_view *iview) +{ + unsigned level = iview->base_mip; + unsigned format; + uint64_t va, s_offs, z_offs; + const struct radeon_surf_level *level_info = &iview->image->surface.level[level]; + memset(ds, 0, sizeof(*ds)); + switch (iview->vk_format) { + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + ds->pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); + ds->offset_scale = 2.0f; + break; + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D16_UNORM_S8_UINT: + ds->pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); + ds->offset_scale = 4.0f; + break; + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + ds->pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | + S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); + ds->offset_scale = 1.0f; + break; + default: + break; + } + + format = radv_translate_dbformat(iview->vk_format); + if (format == V_028040_Z_INVALID) { + fprintf(stderr, "Invalid DB format: %d, disabling DB.\n", iview->vk_format); + } + + va = device->ws->buffer_get_va(iview->bo) + iview->image->offset; + s_offs = z_offs = va; + z_offs += iview->image->surface.level[level].offset; + s_offs += iview->image->surface.stencil_level[level].offset; + + ds->db_depth_view = S_028008_SLICE_START(iview->base_layer) | + S_028008_SLICE_MAX(iview->base_layer + iview->extent.depth - 1); + ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1); + ds->db_z_info = S_028040_FORMAT(format) | S_028040_ZRANGE_PRECISION(1); + + if (iview->image->samples > 1) + ds->db_z_info |= S_028040_NUM_SAMPLES(util_logbase2(iview->image->samples)); + + if (iview->image->surface.flags & RADEON_SURF_SBUFFER) + ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_8); + else + ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_INVALID); + + if (device->instance->physicalDevice.rad_info.chip_class >= CIK) { + struct radeon_info *info = &device->instance->physicalDevice.rad_info; + unsigned tiling_index = iview->image->surface.tiling_index[level]; + unsigned stencil_index = iview->image->surface.stencil_tiling_index[level]; + unsigned macro_index = iview->image->surface.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[tiling_index]; + unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; + unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; + + ds->db_depth_info |= + S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | + S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | + S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | + S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | + S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | + S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); + ds->db_z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); + ds->db_stencil_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); + } else { + unsigned tile_mode_index = si_tile_mode_index(iview->image, level, false); + ds->db_z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); + tile_mode_index = si_tile_mode_index(iview->image, level, true); + ds->db_stencil_info |= 
S_028044_TILE_MODE_INDEX(tile_mode_index); + } + + if (iview->image->htile.size && !level) { + ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1) | + S_028040_ALLOW_EXPCLEAR(1); + + if (iview->image->surface.flags & RADEON_SURF_SBUFFER) { + /* Workaround: For a not yet understood reason, the + * combination of MSAA, fast stencil clear and stencil + * decompress messes with subsequent stencil buffer + * uses. Problem was reproduced on Verde, Bonaire, + * Tonga, and Carrizo. + * + * Disabling EXPCLEAR works around the problem. + * + * Check piglit's arb_texture_multisample-stencil-clear + * test if you want to try changing this. + */ + if (iview->image->samples <= 1) + ds->db_stencil_info |= S_028044_ALLOW_EXPCLEAR(1); + } else + /* Use all of the htile_buffer for depth if there's no stencil. */ + ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); + + va = device->ws->buffer_get_va(iview->bo) + iview->image->offset + + iview->image->htile.offset; + ds->db_htile_data_base = va >> 8; + ds->db_htile_surface = S_028ABC_FULL_CACHE(1); + } else { + ds->db_htile_data_base = 0; + ds->db_htile_surface = 0; + } + + ds->db_z_read_base = ds->db_z_write_base = z_offs >> 8; + ds->db_stencil_read_base = ds->db_stencil_write_base = s_offs >> 8; + + ds->db_depth_size = S_028058_PITCH_TILE_MAX((level_info->nblk_x / 8) - 1) | + S_028058_HEIGHT_TILE_MAX((level_info->nblk_y / 8) - 1); + ds->db_depth_slice = S_02805C_SLICE_TILE_MAX((level_info->nblk_x * level_info->nblk_y) / 64 - 1); +} + +VkResult radv_CreateFramebuffer( + VkDevice _device, + const VkFramebufferCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkFramebuffer* pFramebuffer) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_framebuffer *framebuffer; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO); + + size_t size = sizeof(*framebuffer) + + sizeof(struct radv_attachment_info) * pCreateInfo->attachmentCount; + framebuffer = radv_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (framebuffer == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + framebuffer->attachment_count = pCreateInfo->attachmentCount; + for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { + VkImageView _iview = pCreateInfo->pAttachments[i]; + struct radv_image_view *iview = radv_image_view_from_handle(_iview); + framebuffer->attachments[i].attachment = iview; + if (iview->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) { + radv_initialise_color_surface(device, &framebuffer->attachments[i].cb, iview); + } else if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + radv_initialise_ds_surface(device, &framebuffer->attachments[i].ds, iview); + } + } + + framebuffer->width = pCreateInfo->width; + framebuffer->height = pCreateInfo->height; + framebuffer->layers = pCreateInfo->layers; + + *pFramebuffer = radv_framebuffer_to_handle(framebuffer); + return VK_SUCCESS; +} + +void radv_DestroyFramebuffer( + VkDevice _device, + VkFramebuffer _fb, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_framebuffer, fb, _fb); + + if (!fb) + return; + radv_free2(&device->alloc, pAllocator, fb); +} + +static unsigned radv_tex_wrap(VkSamplerAddressMode address_mode) +{ + switch (address_mode) { + case VK_SAMPLER_ADDRESS_MODE_REPEAT: + return V_008F30_SQ_TEX_WRAP; + case VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT: + return V_008F30_SQ_TEX_MIRROR; + case VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE: + 
return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; + case VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_CLAMP_BORDER; + case VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; + default: + unreachable("illegal tex wrap mode"); + break; + } +} + +static unsigned +radv_tex_compare(VkCompareOp op) +{ + switch (op) { + case VK_COMPARE_OP_NEVER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; + case VK_COMPARE_OP_LESS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; + case VK_COMPARE_OP_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; + case VK_COMPARE_OP_LESS_OR_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; + case VK_COMPARE_OP_GREATER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; + case VK_COMPARE_OP_NOT_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; + case VK_COMPARE_OP_GREATER_OR_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; + case VK_COMPARE_OP_ALWAYS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; + default: + unreachable("illegal compare mode"); + break; + } +} + +static unsigned +radv_tex_filter(VkFilter filter, unsigned max_ansio) +{ + switch (filter) { + case VK_FILTER_NEAREST: + return (max_ansio > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT : + V_008F38_SQ_TEX_XY_FILTER_POINT); + case VK_FILTER_LINEAR: + return (max_ansio > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR : + V_008F38_SQ_TEX_XY_FILTER_BILINEAR); + case VK_FILTER_CUBIC_IMG: + default: + fprintf(stderr, "illegal texture filter"); + return 0; + } +} + +static unsigned +radv_tex_mipfilter(VkSamplerMipmapMode mode) +{ + switch (mode) { + case VK_SAMPLER_MIPMAP_MODE_NEAREST: + return V_008F38_SQ_TEX_Z_FILTER_POINT; + case VK_SAMPLER_MIPMAP_MODE_LINEAR: + return V_008F38_SQ_TEX_Z_FILTER_LINEAR; + default: + return V_008F38_SQ_TEX_Z_FILTER_NONE; + } +} + +static unsigned +radv_tex_bordercolor(VkBorderColor bcolor) +{ + switch (bcolor) { + case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK: + case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK: + return V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK; + case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK: + case VK_BORDER_COLOR_INT_OPAQUE_BLACK: + return V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK; + case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE: + case VK_BORDER_COLOR_INT_OPAQUE_WHITE: + return V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE; + default: + break; + } + return 0; +} + +static void +radv_init_sampler(struct radv_device *device, + struct radv_sampler *sampler, + const VkSamplerCreateInfo *pCreateInfo) +{ + uint32_t max_aniso = 0; + uint32_t max_aniso_ratio = 0;//TODO + bool is_vi; + is_vi = (device->instance->physicalDevice.rad_info.chip_class >= VI); + + sampler->state[0] = (S_008F30_CLAMP_X(radv_tex_wrap(pCreateInfo->addressModeU)) | + S_008F30_CLAMP_Y(radv_tex_wrap(pCreateInfo->addressModeV)) | + S_008F30_CLAMP_Z(radv_tex_wrap(pCreateInfo->addressModeW)) | + S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | + S_008F30_DEPTH_COMPARE_FUNC(radv_tex_compare(pCreateInfo->compareOp)) | + S_008F30_FORCE_UNNORMALIZED(pCreateInfo->unnormalizedCoordinates ? 
1 : 0) | + S_008F30_DISABLE_CUBE_WRAP(0) | + S_008F30_COMPAT_MODE(is_vi)); + sampler->state[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(pCreateInfo->minLod, 0, 15), 8)) | + S_008F34_MAX_LOD(S_FIXED(CLAMP(pCreateInfo->maxLod, 0, 15), 8))); + sampler->state[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(pCreateInfo->mipLodBias, -16, 16), 8)) | + S_008F38_XY_MAG_FILTER(radv_tex_filter(pCreateInfo->magFilter, max_aniso)) | + S_008F38_XY_MIN_FILTER(radv_tex_filter(pCreateInfo->minFilter, max_aniso)) | + S_008F38_MIP_FILTER(radv_tex_mipfilter(pCreateInfo->mipmapMode)) | + S_008F38_MIP_POINT_PRECLAMP(1) | + S_008F38_DISABLE_LSB_CEIL(1) | + S_008F38_FILTER_PREC_FIX(1) | + S_008F38_ANISO_OVERRIDE(is_vi)); + sampler->state[3] = (S_008F3C_BORDER_COLOR_PTR(0) | + S_008F3C_BORDER_COLOR_TYPE(radv_tex_bordercolor(pCreateInfo->borderColor))); +} + +VkResult radv_CreateSampler( + VkDevice _device, + const VkSamplerCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSampler* pSampler) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_sampler *sampler; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO); + + sampler = radv_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!sampler) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + radv_init_sampler(device, sampler, pCreateInfo); + *pSampler = radv_sampler_to_handle(sampler); + + return VK_SUCCESS; +} + +void radv_DestroySampler( + VkDevice _device, + VkSampler _sampler, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_sampler, sampler, _sampler); + + if (!sampler) + return; + radv_free2(&device->alloc, pAllocator, sampler); +} diff --git a/src/amd/vulkan/radv_device_info.h b/src/amd/vulkan/radv_device_info.h new file mode 100644 index 00000000000..c38bd2a99ea --- /dev/null +++ b/src/amd/vulkan/radv_device_info.h @@ -0,0 +1,32 @@ +/* + * Copyright © 2016 Red Hat. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
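
The LOD fields packed above are fixed point with 8 fractional bits; a self-contained sketch of that encoding, using local stand-ins for the driver's CLAMP()/S_FIXED() helpers:

#include <assert.h>
#include <stdio.h>

/* Stand-ins, for illustration only: clamp a value, then encode it as
 * unsigned fixed point with 'frac_bits' fractional bits. */
#define EXAMPLE_CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
#define EXAMPLE_S_FIXED(x, frac_bits) ((unsigned)((x) * (1 << (frac_bits))))

int main(void)
{
	/* maxLod = 9.5, clamped to [0, 15], becomes 9.5 * 256 = 2432 = 0x980 */
	float max_lod = 9.5f;
	unsigned encoded = EXAMPLE_S_FIXED(EXAMPLE_CLAMP(max_lod, 0, 15), 8);
	assert(encoded == 0x980);
	printf("maxLod %.2f -> 0x%x\n", max_lod, encoded);
	return 0;
}
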
+ */ + +#pragma once + +#include + +#include "radv_radeon_winsys.h" +struct radv_device_info { + uint32_t pci_id; + enum chip_class chip_class; +}; diff --git a/src/amd/vulkan/radv_entrypoints_gen.py b/src/amd/vulkan/radv_entrypoints_gen.py new file mode 100644 index 00000000000..e8ef8a4e9eb --- /dev/null +++ b/src/amd/vulkan/radv_entrypoints_gen.py @@ -0,0 +1,351 @@ +# coding=utf-8 +# +# Copyright © 2015 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# + +import fileinput, re, sys + +# Each function typedef in the vulkan.h header is all on one line and matches +# this regepx. We hope that won't change. + +p = re.compile('typedef ([^ ]*) *\((?:VKAPI_PTR)? *\*PFN_vk([^(]*)\)(.*);') + +entrypoints = [] + +# We generate a static hash table for entry point lookup +# (vkGetProcAddress). We use a linear congruential generator for our hash +# function and a power-of-two size table. The prime numbers are determined +# experimentally. + +none = 0xffff +hash_size = 256 +u32_mask = 2**32 - 1 +hash_mask = hash_size - 1 + +prime_factor = 5024183 +prime_step = 19 + +def hash(name): + h = 0; + for c in name: + h = (h * prime_factor + ord(c)) & u32_mask + + return h + +def get_platform_guard_macro(name): + if "Xlib" in name: + return "VK_USE_PLATFORM_XLIB_KHR" + elif "Xcb" in name: + return "VK_USE_PLATFORM_XCB_KHR" + elif "Wayland" in name: + return "VK_USE_PLATFORM_WAYLAND_KHR" + elif "Mir" in name: + return "VK_USE_PLATFORM_MIR_KHR" + elif "Android" in name: + return "VK_USE_PLATFORM_ANDROID_KHR" + elif "Win32" in name: + return "VK_USE_PLATFORM_WIN32_KHR" + else: + return None + +def print_guard_start(name): + guard = get_platform_guard_macro(name) + if guard is not None: + print "#ifdef {0}".format(guard) + +def print_guard_end(name): + guard = get_platform_guard_macro(name) + if guard is not None: + print "#endif // {0}".format(guard) + +opt_header = False +opt_code = False + +if (sys.argv[1] == "header"): + opt_header = True + sys.argv.pop() +elif (sys.argv[1] == "code"): + opt_code = True + sys.argv.pop() + +# Parse the entry points in the header + +i = 0 +for line in fileinput.input(): + m = p.match(line) + if (m): + if m.group(2) == 'VoidFunction': + continue + fullname = "vk" + m.group(2) + h = hash(fullname) + entrypoints.append((m.group(1), m.group(2), m.group(3), i, h)) + i = i + 1 + +# For outputting entrypoints.h we generate a radv_EntryPoint() prototype +# per entry point. 
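
The table this script emits is meant to be probed from C with the same linear-congruential hash; for reference, an equivalent C version of hash() above (constant names here are illustrative):

#include <stdint.h>

#define EXAMPLE_PRIME_FACTOR 5024183u
#define EXAMPLE_HASH_SIZE    256u
#define EXAMPLE_HASH_MASK    (EXAMPLE_HASH_SIZE - 1)

/* Must stay in sync with the constants in this script for the generated
 * table to resolve names; the caller masks with EXAMPLE_HASH_MASK when
 * indexing the table. */
static uint32_t example_entrypoint_hash(const char *name)
{
	uint32_t h = 0;
	while (*name)
		h = h * EXAMPLE_PRIME_FACTOR + (uint8_t)*name++;
	return h;
}
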
+ +if opt_header: + print "/* This file generated from vk_gen.py, don't edit directly. */\n" + + print "struct radv_dispatch_table {" + print " union {" + print " void *entrypoints[%d];" % len(entrypoints) + print " struct {" + + for type, name, args, num, h in entrypoints: + guard = get_platform_guard_macro(name) + if guard is not None: + print "#ifdef {0}".format(guard) + print " PFN_vk{0} {0};".format(name) + print "#else" + print " void *{0};".format(name) + print "#endif" + else: + print " PFN_vk{0} {0};".format(name) + print " };\n" + print " };\n" + print "};\n" + + print "void radv_set_dispatch_devinfo(const struct radv_device_info *info);\n" + + for type, name, args, num, h in entrypoints: + print_guard_start(name) + print "%s radv_%s%s;" % (type, name, args) + print "%s vi_%s%s;" % (type, name, args) + print "%s cik_%s%s;" % (type, name, args) + print "%s si_%s%s;" % (type, name, args) + print "%s radv_validate_%s%s;" % (type, name, args) + print_guard_end(name) + exit() + + + +print """/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* DO NOT EDIT! This is a generated file. */ + +#include "radv_private.h" + +struct radv_entrypoint { + uint32_t name; + uint32_t hash; +}; + +/* We use a big string constant to avoid lots of reloctions from the entry + * point table to lots of little strings. The entries in the entry point table + * store the index into this big string. + */ + +static const char strings[] =""" + +offsets = [] +i = 0; +for type, name, args, num, h in entrypoints: + print " \"vk%s\\0\"" % name + offsets.append(i) + i += 2 + len(name) + 1 +print " ;" + +# Now generate the table of all entry points and their validation functions + +print "\nstatic const struct radv_entrypoint entrypoints[] = {" +for type, name, args, num, h in entrypoints: + print " { %5d, 0x%08x }," % (offsets[num], h) +print "};\n" + +print """ + +/* Weak aliases for all potential implementations. These will resolve to + * NULL if they're not defined, which lets the resolve_entrypoint() function + * either pick the correct entry point. 
+ */ +""" + +for layer in [ "radv", "validate", "si", "cik", "vi" ]: + for type, name, args, num, h in entrypoints: + print_guard_start(name) + print "%s %s_%s%s __attribute__ ((weak));" % (type, layer, name, args) + print_guard_end(name) + print "\nconst struct radv_dispatch_table %s_layer = {" % layer + for type, name, args, num, h in entrypoints: + print_guard_start(name) + print " .%s = %s_%s," % (name, layer, name) + print_guard_end(name) + print "};\n" + +print """ +#ifdef DEBUG +static bool enable_validate = true; +#else +static bool enable_validate = false; +#endif + +/* We can't use symbols that need resolving (like, oh, getenv) in the resolve + * function. This means that we have to determine whether or not to use the + * validation layer sometime before that. The constructor function attribute asks + * the dynamic linker to invoke determine_validate() at dlopen() time which + * works. + */ +static void __attribute__ ((constructor)) +determine_validate(void) +{ + const char *s = getenv("ANV_VALIDATE"); + + if (s) + enable_validate = atoi(s); +} + +static const struct radv_device_info *dispatch_devinfo; + +void +radv_set_dispatch_devinfo(const struct radv_device_info *devinfo) +{ + dispatch_devinfo = devinfo; +} + +void * __attribute__ ((noinline)) +radv_resolve_entrypoint(uint32_t index) +{ + if (enable_validate && validate_layer.entrypoints[index]) + return validate_layer.entrypoints[index]; + + if (dispatch_devinfo == NULL) { + return radv_layer.entrypoints[index]; + } + + switch (dispatch_devinfo->chip_class) { + case VI: + if (vi_layer.entrypoints[index]) + return vi_layer.entrypoints[index]; + /* fall through */ + case CIK: + if (cik_layer.entrypoints[index]) + return cik_layer.entrypoints[index]; + /* fall through */ + case SI: + if (si_layer.entrypoints[index]) + return si_layer.entrypoints[index]; + /* fall through */ + case 0: + return radv_layer.entrypoints[index]; + default: + unreachable("unsupported gen\\n"); + } +} +""" + +# Now generate the hash table used for entry point look up. This is a +# uint16_t table of entry point indices. We use 0xffff to indicate an entry +# in the hash table is empty. + +map = [none for f in xrange(hash_size)] +collisions = [0 for f in xrange(10)] +for type, name, args, num, h in entrypoints: + level = 0 + while map[h & hash_mask] != none: + h = h + prime_step + level = level + 1 + if level > 9: + collisions[9] += 1 + else: + collisions[level] += 1 + map[h & hash_mask] = num + +print "/* Hash table stats:" +print " * size %d entries" % hash_size +print " * collisions entries" +for i in xrange(10): + if (i == 9): + plus = "+" + else: + plus = " " + + print " * %2d%s %4d" % (i, plus, collisions[i]) +print " */\n" + +print "#define none 0x%04x\n" % none + +print "static const uint16_t map[] = {" +for i in xrange(0, hash_size, 8): + print " ", + for j in xrange(i, i + 8): + if map[j] & 0xffff == 0xffff: + print " none,", + else: + print "0x%04x," % (map[j] & 0xffff), + print + +print "};" + +# Finally we generate the hash table lookup function. The hash function and +# linear probing algorithm matches the hash table generated above. 
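The lookup function printed just below probes the same table with the same +19 step, so a miss terminates at a 0xffff ("none") slot and a hit is confirmed first by the stored 32-bit hash and then by comparing the string itself. A minimal sketch of how a GetProcAddr-style caller would consume it; the wrapper name is hypothetical, only radv_lookup_entrypoint() and PFN_vkVoidFunction are taken from the real sources:

#include <vulkan/vulkan.h>

void *radv_lookup_entrypoint(const char *name); /* generated below */

/* Hypothetical wrapper: resolves an application-facing entry point name,
 * returning NULL when the name is not known. The void* -> function-pointer
 * cast mirrors how the dispatch table entries are used. */
static PFN_vkVoidFunction example_get_proc_addr(const char *name)
{
    return (PFN_vkVoidFunction)radv_lookup_entrypoint(name);
}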
+ +print """ +void * +radv_lookup_entrypoint(const char *name) +{ + static const uint32_t prime_factor = %d; + static const uint32_t prime_step = %d; + const struct radv_entrypoint *e; + uint32_t hash, h, i; + const char *p; + + hash = 0; + for (p = name; *p; p++) + hash = hash * prime_factor + *p; + + h = hash; + do { + i = map[h & %d]; + if (i == none) + return NULL; + e = &entrypoints[i]; + h += prime_step; + } while (e->hash != hash); + + if (strcmp(name, strings + e->name) != 0) + return NULL; + + return radv_resolve_entrypoint(i); +} +""" % (prime_factor, prime_step, hash_mask) diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c new file mode 100644 index 00000000000..90c140c18b7 --- /dev/null +++ b/src/amd/vulkan/radv_formats.c @@ -0,0 +1,1085 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "radv_private.h" + +#include "vk_format.h" +#include "sid.h" +#include "r600d_common.h" + +#include "util/u_half.h" +#include "util/format_srgb.h" + +uint32_t radv_translate_buffer_dataformat(const struct vk_format_description *desc, + int first_non_void) +{ + unsigned type; + int i; + + if (desc->format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) + return V_008F0C_BUF_DATA_FORMAT_10_11_11; + + if (first_non_void < 0) + return V_008F0C_BUF_DATA_FORMAT_INVALID; + type = desc->channel[first_non_void].type; + + if (type == VK_FORMAT_TYPE_FIXED) + return V_008F0C_BUF_DATA_FORMAT_INVALID; + if (desc->nr_channels == 4 && + desc->channel[0].size == 10 && + desc->channel[1].size == 10 && + desc->channel[2].size == 10 && + desc->channel[3].size == 2) + return V_008F0C_BUF_DATA_FORMAT_2_10_10_10; + + /* See whether the components are of the same size. 
*/ + for (i = 0; i < desc->nr_channels; i++) { + if (desc->channel[first_non_void].size != desc->channel[i].size) + return V_008F0C_BUF_DATA_FORMAT_INVALID; + } + + switch (desc->channel[first_non_void].size) { + case 8: + switch (desc->nr_channels) { + case 1: + return V_008F0C_BUF_DATA_FORMAT_8; + case 2: + return V_008F0C_BUF_DATA_FORMAT_8_8; + case 4: + return V_008F0C_BUF_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + return V_008F0C_BUF_DATA_FORMAT_16; + case 2: + return V_008F0C_BUF_DATA_FORMAT_16_16; + case 4: + return V_008F0C_BUF_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + /* From the Southern Islands ISA documentation about MTBUF: + * 'Memory reads of data in memory that is 32 or 64 bits do not + * undergo any format conversion.' + */ + if (type != VK_FORMAT_TYPE_FLOAT && + !desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_DATA_FORMAT_INVALID; + + switch (desc->nr_channels) { + case 1: + return V_008F0C_BUF_DATA_FORMAT_32; + case 2: + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 3: + return V_008F0C_BUF_DATA_FORMAT_32_32_32; + case 4: + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + } + break; + } + + return V_008F0C_BUF_DATA_FORMAT_INVALID; +} + +uint32_t radv_translate_buffer_numformat(const struct vk_format_description *desc, + int first_non_void) +{ + if (desc->format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) + return V_008F0C_BUF_NUM_FORMAT_FLOAT; + + if (first_non_void < 0) + return ~0; + + switch (desc->channel[first_non_void].type) { + case VK_FORMAT_TYPE_SIGNED: + if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_SNORM; + else if (desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_SINT; + else + return V_008F0C_BUF_NUM_FORMAT_SSCALED; + break; + case VK_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_UNORM; + else if (desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_UINT; + else + return V_008F0C_BUF_NUM_FORMAT_USCALED; + break; + case VK_FORMAT_TYPE_FLOAT: + default: + return V_008F0C_BUF_NUM_FORMAT_FLOAT; + } +} + +uint32_t radv_translate_tex_dataformat(VkFormat format, + const struct vk_format_description *desc, + int first_non_void) +{ + bool uniform = true; + int i; + + if (!desc) + return ~0; + /* Colorspace (return non-RGB formats directly). 
*/ + switch (desc->colorspace) { + /* Depth stencil formats */ + case VK_FORMAT_COLORSPACE_ZS: + switch (format) { + case VK_FORMAT_D16_UNORM: + return V_008F14_IMG_DATA_FORMAT_16; + case VK_FORMAT_D24_UNORM_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8_24; + case VK_FORMAT_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8; + case VK_FORMAT_D32_SFLOAT: + return V_008F14_IMG_DATA_FORMAT_32; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_X24_8_32; + default: + goto out_unknown; + } + + case VK_FORMAT_COLORSPACE_YUV: + goto out_unknown; /* TODO */ + + case VK_FORMAT_COLORSPACE_SRGB: + if (desc->nr_channels != 4 && desc->nr_channels != 1) + goto out_unknown; + break; + + default: + break; + } + + if (desc->layout == VK_FORMAT_LAYOUT_RGTC) { + switch(format) { + case VK_FORMAT_BC4_UNORM_BLOCK: + case VK_FORMAT_BC4_SNORM_BLOCK: + return V_008F14_IMG_DATA_FORMAT_BC4; + case VK_FORMAT_BC5_UNORM_BLOCK: + case VK_FORMAT_BC5_SNORM_BLOCK: + return V_008F14_IMG_DATA_FORMAT_BC5; + default: + break; + } + } + + if (desc->layout == VK_FORMAT_LAYOUT_S3TC) { + switch(format) { + case VK_FORMAT_BC1_RGB_UNORM_BLOCK: + case VK_FORMAT_BC1_RGB_SRGB_BLOCK: + case VK_FORMAT_BC1_RGBA_UNORM_BLOCK: + case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: + return V_008F14_IMG_DATA_FORMAT_BC1; + case VK_FORMAT_BC2_UNORM_BLOCK: + case VK_FORMAT_BC2_SRGB_BLOCK: + return V_008F14_IMG_DATA_FORMAT_BC2; + case VK_FORMAT_BC3_UNORM_BLOCK: + case VK_FORMAT_BC3_SRGB_BLOCK: + return V_008F14_IMG_DATA_FORMAT_BC3; + default: + break; + } + } + + if (desc->layout == VK_FORMAT_LAYOUT_BPTC) { + switch(format) { + case VK_FORMAT_BC6H_UFLOAT_BLOCK: + case VK_FORMAT_BC6H_SFLOAT_BLOCK: + return V_008F14_IMG_DATA_FORMAT_BC6; + case VK_FORMAT_BC7_UNORM_BLOCK: + case VK_FORMAT_BC7_SRGB_BLOCK: + return V_008F14_IMG_DATA_FORMAT_BC7; + default: + break; + } + } + + if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) { + return V_008F14_IMG_DATA_FORMAT_5_9_9_9; + } else if (format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { + return V_008F14_IMG_DATA_FORMAT_10_11_11; + } + + /* R8G8Bx_SNORM - TODO CxV8U8 */ + + /* hw cannot support mixed formats (except depth/stencil, since only + * depth is read).*/ + if (desc->is_mixed && desc->colorspace != VK_FORMAT_COLORSPACE_ZS) + goto out_unknown; + + /* See whether the components are of the same size. */ + for (i = 1; i < desc->nr_channels; i++) { + uniform = uniform && desc->channel[0].size == desc->channel[i].size; + } + + /* Non-uniform formats. 
*/ + if (!uniform) { + switch(desc->nr_channels) { + case 3: + if (desc->channel[0].size == 5 && + desc->channel[1].size == 6 && + desc->channel[2].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_6_5; + } + goto out_unknown; + case 4: + if (desc->channel[0].size == 5 && + desc->channel[1].size == 5 && + desc->channel[2].size == 5 && + desc->channel[3].size == 1) { + return V_008F14_IMG_DATA_FORMAT_1_5_5_5; + } + if (desc->channel[0].size == 1 && + desc->channel[1].size == 5 && + desc->channel[2].size == 5 && + desc->channel[3].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_5_5_1; + } + if (desc->channel[0].size == 10 && + desc->channel[1].size == 10 && + desc->channel[2].size == 10 && + desc->channel[3].size == 2) { + /* Closed VK driver does this also no 2/10/10/10 snorm */ + if (desc->channel[0].type == VK_FORMAT_TYPE_SIGNED && + desc->channel[0].normalized) + goto out_unknown; + return V_008F14_IMG_DATA_FORMAT_2_10_10_10; + } + goto out_unknown; + } + goto out_unknown; + } + + if (first_non_void < 0 || first_non_void > 3) + goto out_unknown; + + /* uniform formats */ + switch (desc->channel[first_non_void].size) { + case 4: + switch (desc->nr_channels) { +#if 0 /* Not supported for render targets */ + case 2: + return V_008F14_IMG_DATA_FORMAT_4_4; +#endif + case 4: + return V_008F14_IMG_DATA_FORMAT_4_4_4_4; + } + break; + case 8: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_8; + case 2: + return V_008F14_IMG_DATA_FORMAT_8_8; + case 4: + return V_008F14_IMG_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_16; + case 2: + return V_008F14_IMG_DATA_FORMAT_16_16; + case 4: + return V_008F14_IMG_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_32; + case 2: + return V_008F14_IMG_DATA_FORMAT_32_32; +#if 0 /* Not supported for render targets */ + case 3: + return V_008F14_IMG_DATA_FORMAT_32_32_32; +#endif + case 4: + return V_008F14_IMG_DATA_FORMAT_32_32_32_32; + } + } + +out_unknown: + /* R600_ERR("Unable to handle texformat %d %s\n", format, vk_format_name(format)); */ + return ~0; +} + +uint32_t radv_translate_tex_numformat(VkFormat format, + const struct vk_format_description *desc, + int first_non_void) +{ + switch (format) { + case VK_FORMAT_D24_UNORM_S8_UINT: + return V_008F14_IMG_NUM_FORMAT_UNORM; + default: + if (first_non_void < 0) { + if (vk_format_is_compressed(format)) { + switch (format) { + case VK_FORMAT_BC1_RGB_SRGB_BLOCK: + case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: + case VK_FORMAT_BC2_SRGB_BLOCK: + case VK_FORMAT_BC3_SRGB_BLOCK: + case VK_FORMAT_BC7_SRGB_BLOCK: + return V_008F14_IMG_NUM_FORMAT_SRGB; + case VK_FORMAT_BC4_SNORM_BLOCK: + case VK_FORMAT_BC5_SNORM_BLOCK: + case VK_FORMAT_BC6H_SFLOAT_BLOCK: + return V_008F14_IMG_NUM_FORMAT_SNORM; + default: + return V_008F14_IMG_NUM_FORMAT_UNORM; + } + } else if (desc->layout == VK_FORMAT_LAYOUT_SUBSAMPLED) { + return V_008F14_IMG_NUM_FORMAT_UNORM; + } else { + return V_008F14_IMG_NUM_FORMAT_FLOAT; + } + } else if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) { + return V_008F14_IMG_NUM_FORMAT_SRGB; + } else { + switch (desc->channel[first_non_void].type) { + case VK_FORMAT_TYPE_FLOAT: + return V_008F14_IMG_NUM_FORMAT_FLOAT; + case VK_FORMAT_TYPE_SIGNED: + if (desc->channel[first_non_void].normalized) + return V_008F14_IMG_NUM_FORMAT_SNORM; + else if (desc->channel[first_non_void].pure_integer) + return V_008F14_IMG_NUM_FORMAT_SINT; + else + return 
V_008F14_IMG_NUM_FORMAT_SSCALED; + case VK_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].normalized) + return V_008F14_IMG_NUM_FORMAT_UNORM; + else if (desc->channel[first_non_void].pure_integer) + return V_008F14_IMG_NUM_FORMAT_UINT; + else + return V_008F14_IMG_NUM_FORMAT_USCALED; + default: + return V_008F14_IMG_NUM_FORMAT_UNORM; + } + } + } +} + +uint32_t radv_translate_color_numformat(VkFormat format, + const struct vk_format_description *desc, + int first_non_void) +{ + unsigned ntype; + if (first_non_void == 4 || desc->channel[first_non_void].type == VK_FORMAT_TYPE_FLOAT) + ntype = V_028C70_NUMBER_FLOAT; + else { + ntype = V_028C70_NUMBER_UNORM; + if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) + ntype = V_028C70_NUMBER_SRGB; + else if (desc->channel[first_non_void].type == VK_FORMAT_TYPE_SIGNED) { + if (desc->channel[first_non_void].pure_integer) { + ntype = V_028C70_NUMBER_SINT; + } else if (desc->channel[first_non_void].normalized) { + ntype = V_028C70_NUMBER_SNORM; + } else + ntype = ~0u; + } else if (desc->channel[first_non_void].type == VK_FORMAT_TYPE_UNSIGNED) { + if (desc->channel[first_non_void].pure_integer) { + ntype = V_028C70_NUMBER_UINT; + } else if (desc->channel[first_non_void].normalized) { + ntype = V_028C70_NUMBER_UNORM; + } else + ntype = ~0u; + } + } + return ntype; +} + +static bool radv_is_sampler_format_supported(VkFormat format, bool *linear_sampling) +{ + const struct vk_format_description *desc = vk_format_description(format); + uint32_t num_format; + if (!desc || format == VK_FORMAT_UNDEFINED) + return false; + num_format = radv_translate_tex_numformat(format, desc, + vk_format_get_first_non_void_channel(format)); + + if (num_format == V_008F14_IMG_NUM_FORMAT_USCALED || + num_format == V_008F14_IMG_NUM_FORMAT_SSCALED) + return false; + + if (num_format == V_008F14_IMG_NUM_FORMAT_UNORM || + num_format == V_008F14_IMG_NUM_FORMAT_SNORM || + num_format == V_008F14_IMG_NUM_FORMAT_FLOAT || + num_format == V_008F14_IMG_NUM_FORMAT_SRGB) + *linear_sampling = true; + else + *linear_sampling = false; + return radv_translate_tex_dataformat(format, vk_format_description(format), + vk_format_get_first_non_void_channel(format)) != ~0U; +} + + +static bool radv_is_storage_image_format_supported(struct radv_physical_device *physical_device, + VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + unsigned data_format, num_format; + if (!desc || format == VK_FORMAT_UNDEFINED) + return false; + + data_format = radv_translate_tex_dataformat(format, desc, + vk_format_get_first_non_void_channel(format)); + num_format = radv_translate_tex_numformat(format, desc, + vk_format_get_first_non_void_channel(format)); + + if(data_format == ~0 || num_format == ~0) + return false; + + /* Extracted from the GCN3 ISA document. 
*/ + switch(num_format) { + case V_008F14_IMG_NUM_FORMAT_UNORM: + case V_008F14_IMG_NUM_FORMAT_SNORM: + case V_008F14_IMG_NUM_FORMAT_UINT: + case V_008F14_IMG_NUM_FORMAT_SINT: + case V_008F14_IMG_NUM_FORMAT_FLOAT: + break; + default: + return false; + } + + switch(data_format) { + case V_008F14_IMG_DATA_FORMAT_8: + case V_008F14_IMG_DATA_FORMAT_16: + case V_008F14_IMG_DATA_FORMAT_8_8: + case V_008F14_IMG_DATA_FORMAT_32: + case V_008F14_IMG_DATA_FORMAT_16_16: + case V_008F14_IMG_DATA_FORMAT_10_11_11: + case V_008F14_IMG_DATA_FORMAT_11_11_10: + case V_008F14_IMG_DATA_FORMAT_10_10_10_2: + case V_008F14_IMG_DATA_FORMAT_2_10_10_10: + case V_008F14_IMG_DATA_FORMAT_8_8_8_8: + case V_008F14_IMG_DATA_FORMAT_32_32: + case V_008F14_IMG_DATA_FORMAT_16_16_16_16: + case V_008F14_IMG_DATA_FORMAT_32_32_32_32: + case V_008F14_IMG_DATA_FORMAT_5_6_5: + case V_008F14_IMG_DATA_FORMAT_1_5_5_5: + case V_008F14_IMG_DATA_FORMAT_5_5_5_1: + case V_008F14_IMG_DATA_FORMAT_4_4_4_4: + /* TODO: FMASK formats. */ + return true; + default: + return false; + } +} + +static bool radv_is_buffer_format_supported(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + unsigned data_format, num_format; + if (!desc || format == VK_FORMAT_UNDEFINED) + return false; + + data_format = radv_translate_buffer_dataformat(desc, + vk_format_get_first_non_void_channel(format)); + num_format = radv_translate_buffer_numformat(desc, + vk_format_get_first_non_void_channel(format)); + + return data_format != V_008F0C_BUF_DATA_FORMAT_INVALID && + num_format != ~0; +} + +bool radv_is_colorbuffer_format_supported(VkFormat format, bool *blendable) +{ + const struct vk_format_description *desc = vk_format_description(format); + uint32_t color_format = radv_translate_colorformat(format); + uint32_t color_swap = radv_translate_colorswap(format, false); + uint32_t color_num_format = radv_translate_color_numformat(format, + desc, + vk_format_get_first_non_void_channel(format)); + + if (color_num_format == V_028C70_NUMBER_UINT || color_num_format == V_028C70_NUMBER_SINT || + color_format == V_028C70_COLOR_8_24 || color_format == V_028C70_COLOR_24_8 || + color_format == V_028C70_COLOR_X24_8_32_FLOAT) { + *blendable = false; + } else + *blendable = true; + return color_format != V_028C70_COLOR_INVALID && + color_swap != ~0U && + color_num_format != ~0; +} + +static bool radv_is_zs_format_supported(VkFormat format) +{ + return radv_translate_dbformat(format) != V_028040_Z_INVALID; +} + +static void +radv_physical_device_get_format_properties(struct radv_physical_device *physical_device, + VkFormat format, + VkFormatProperties *out_properties) +{ + VkFormatFeatureFlags linear = 0, tiled = 0, buffer = 0; + const struct vk_format_description *desc = vk_format_description(format); + bool blendable; + if (!desc) { + out_properties->linearTilingFeatures = linear; + out_properties->optimalTilingFeatures = tiled; + out_properties->bufferFeatures = buffer; + return; + } + + if (radv_is_storage_image_format_supported(physical_device, format)) { + tiled |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; + linear |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; + } + + if (radv_is_buffer_format_supported(format)) { + buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT | + VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + } + + if (vk_format_is_depth_or_stencil(format)) { + if (radv_is_zs_format_supported(format)) + tiled |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; + tiled |= 
VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; + tiled |= VK_FORMAT_FEATURE_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT; + } else { + bool linear_sampling; + if (radv_is_sampler_format_supported(format, &linear_sampling)) { + linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT; + tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT; + if (linear_sampling) { + linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + } + if (radv_is_colorbuffer_format_supported(format, &blendable)) { + linear |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + if (blendable) { + linear |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + } + } + } + + if (format == VK_FORMAT_R32_UINT || format == VK_FORMAT_R32_SINT) { + buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + linear |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; + tiled |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; + } + + out_properties->linearTilingFeatures = linear; + out_properties->optimalTilingFeatures = tiled; + out_properties->bufferFeatures = buffer; +} + +uint32_t radv_translate_colorformat(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + +#define HAS_SIZE(x,y,z,w) \ + (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ + desc->channel[2].size == (z) && desc->channel[3].size == (w)) + + if (format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) /* isn't plain */ + return V_028C70_COLOR_10_11_11; + + if (desc->layout != VK_FORMAT_LAYOUT_PLAIN) + return V_028C70_COLOR_INVALID; + + /* hw cannot support mixed formats (except depth/stencil, since + * stencil is not written to). */ + if (desc->is_mixed && desc->colorspace != VK_FORMAT_COLORSPACE_ZS) + return V_028C70_COLOR_INVALID; + + switch (desc->nr_channels) { + case 1: + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8; + case 16: + return V_028C70_COLOR_16; + case 32: + return V_028C70_COLOR_32; + } + break; + case 2: + if (desc->channel[0].size == desc->channel[1].size) { + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8_8; + case 16: + return V_028C70_COLOR_16_16; + case 32: + return V_028C70_COLOR_32_32; + } + } else if (HAS_SIZE(8,24,0,0)) { + return V_028C70_COLOR_24_8; + } else if (HAS_SIZE(24,8,0,0)) { + return V_028C70_COLOR_8_24; + } + break; + case 3: + if (HAS_SIZE(5,6,5,0)) { + return V_028C70_COLOR_5_6_5; + } else if (HAS_SIZE(32,8,24,0)) { + return V_028C70_COLOR_X24_8_32_FLOAT; + } + break; + case 4: + if (desc->channel[0].size == desc->channel[1].size && + desc->channel[0].size == desc->channel[2].size && + desc->channel[0].size == desc->channel[3].size) { + switch (desc->channel[0].size) { + case 4: + return V_028C70_COLOR_4_4_4_4; + case 8: + return V_028C70_COLOR_8_8_8_8; + case 16: + return V_028C70_COLOR_16_16_16_16; + case 32: + return V_028C70_COLOR_32_32_32_32; + } + } else if (HAS_SIZE(5,5,5,1)) { + return V_028C70_COLOR_1_5_5_5; + } else if (HAS_SIZE(1,5,5,5)) { + return V_028C70_COLOR_5_5_5_1; + } else if (HAS_SIZE(10,10,10,2)) { + return V_028C70_COLOR_2_10_10_10; + } + break; + } + return V_028C70_COLOR_INVALID; +} + +uint32_t radv_colorformat_endian_swap(uint32_t colorformat) +{ + if (0/*SI_BIG_ENDIAN*/) { + switch(colorformat) { + /* 8-bit buffers. 
*/ + case V_028C70_COLOR_8: + return V_028C70_ENDIAN_NONE; + + /* 16-bit buffers. */ + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_16: + case V_028C70_COLOR_8_8: + return V_028C70_ENDIAN_8IN16; + + /* 32-bit buffers. */ + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_2_10_10_10: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_16_16: + return V_028C70_ENDIAN_8IN32; + + /* 64-bit buffers. */ + case V_028C70_COLOR_16_16_16_16: + return V_028C70_ENDIAN_8IN16; + + case V_028C70_COLOR_32_32: + return V_028C70_ENDIAN_8IN32; + + /* 128-bit buffers. */ + case V_028C70_COLOR_32_32_32_32: + return V_028C70_ENDIAN_8IN32; + default: + return V_028C70_ENDIAN_NONE; /* Unsupported. */ + } + } else { + return V_028C70_ENDIAN_NONE; + } +} + +uint32_t radv_translate_dbformat(VkFormat format) +{ + switch (format) { + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D16_UNORM_S8_UINT: + return V_028040_Z_16; + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + return V_028040_Z_24; /* deprecated on SI */ + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return V_028040_Z_32_FLOAT; + default: + return V_028040_Z_INVALID; + } +} + +unsigned radv_translate_colorswap(VkFormat format, bool do_endian_swap) +{ + const struct vk_format_description *desc = vk_format_description(format); + +#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == VK_SWIZZLE_##swz) + + if (format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) + return V_0280A0_SWAP_STD; + + if (desc->layout != VK_FORMAT_LAYOUT_PLAIN) + return ~0U; + + switch (desc->nr_channels) { + case 1: + if (HAS_SWIZZLE(0,X)) + return V_0280A0_SWAP_STD; /* X___ */ + else if (HAS_SWIZZLE(3,X)) + return V_0280A0_SWAP_ALT_REV; /* ___X */ + break; + case 2: + if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) || + (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) || + (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y))) + return V_0280A0_SWAP_STD; /* XY__ */ + else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || + (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || + (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) + /* YX__ */ + return (do_endian_swap ? V_0280A0_SWAP_STD : V_0280A0_SWAP_STD_REV); + else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) + return V_0280A0_SWAP_ALT; /* X__Y */ + else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) + return V_0280A0_SWAP_ALT_REV; /* Y__X */ + break; + case 3: + if (HAS_SWIZZLE(0,X)) + return (do_endian_swap ? V_0280A0_SWAP_STD_REV : V_0280A0_SWAP_STD); + else if (HAS_SWIZZLE(0,Z)) + return V_0280A0_SWAP_STD_REV; /* ZYX */ + break; + case 4: + /* check the middle channels, the 1st and 4th channel can be NONE */ + if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { + return V_0280A0_SWAP_STD; /* XYZW */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { + return V_0280A0_SWAP_STD_REV; /* WZYX */ + } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { + return V_0280A0_SWAP_ALT; /* ZYXW */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { + /* YZWX */ + if (desc->is_array) + return V_0280A0_SWAP_ALT_REV; + else + return (do_endian_swap ? 
V_0280A0_SWAP_ALT : V_0280A0_SWAP_ALT_REV); + } + break; + } + return ~0U; +} + +bool radv_format_pack_clear_color(VkFormat format, + uint32_t clear_vals[2], + VkClearColorValue *value) +{ + uint8_t r, g, b, a; + const struct vk_format_description *desc = vk_format_description(format); + + if (vk_format_get_component_bits(format, VK_FORMAT_COLORSPACE_RGB, 0) <= 8) { + if (desc->colorspace == VK_FORMAT_COLORSPACE_RGB) { + r = float_to_ubyte(value->float32[0]); + g = float_to_ubyte(value->float32[1]); + b = float_to_ubyte(value->float32[2]); + a = float_to_ubyte(value->float32[3]); + } else if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) { + r = util_format_linear_float_to_srgb_8unorm(value->float32[0]); + g = util_format_linear_float_to_srgb_8unorm(value->float32[1]); + b = util_format_linear_float_to_srgb_8unorm(value->float32[2]); + a = float_to_ubyte(value->float32[3]); + } + } + switch (format) { + case VK_FORMAT_R8_UNORM: + case VK_FORMAT_R8_SRGB: + clear_vals[0] = r; + clear_vals[1] = 0; + break; + case VK_FORMAT_R8G8_UNORM: + case VK_FORMAT_R8G8_SRGB: + clear_vals[0] = r | g << 8; + clear_vals[1] = 0; + break; + case VK_FORMAT_R8G8B8A8_SRGB: + case VK_FORMAT_R8G8B8A8_UNORM: + clear_vals[0] = r | g << 8 | b << 16 | a << 24; + clear_vals[1] = 0; + break; + case VK_FORMAT_B8G8R8A8_SRGB: + case VK_FORMAT_B8G8R8A8_UNORM: + clear_vals[0] = b | g << 8 | r << 16 | a << 24; + clear_vals[1] = 0; + break; + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: + case VK_FORMAT_A8B8G8R8_SRGB_PACK32: + clear_vals[0] = r | g << 8 | b << 16 | a << 24; + clear_vals[1] = 0; + break; + case VK_FORMAT_R8_UINT: + clear_vals[0] = value->uint32[0] & 0xff; + clear_vals[1] = 0; + break; + case VK_FORMAT_R16_UINT: + clear_vals[0] = value->uint32[0] & 0xffff; + clear_vals[1] = 0; + break; + case VK_FORMAT_R8G8_UINT: + clear_vals[0] = value->uint32[0] & 0xff; + clear_vals[0] |= (value->uint32[1] & 0xff) << 8; + clear_vals[1] = 0; + break; + case VK_FORMAT_R8G8B8A8_UINT: + clear_vals[0] = value->uint32[0] & 0xff; + clear_vals[0] |= (value->uint32[1] & 0xff) << 8; + clear_vals[0] |= (value->uint32[2] & 0xff) << 16; + clear_vals[0] |= (value->uint32[3] & 0xff) << 24; + clear_vals[1] = 0; + break; + case VK_FORMAT_A8B8G8R8_UINT_PACK32: + clear_vals[0] = value->uint32[0] & 0xff; + clear_vals[0] |= (value->uint32[1] & 0xff) << 8; + clear_vals[0] |= (value->uint32[2] & 0xff) << 16; + clear_vals[0] |= (value->uint32[3] & 0xff) << 24; + clear_vals[1] = 0; + break; + case VK_FORMAT_R16G16_UINT: + clear_vals[0] = value->uint32[0] & 0xffff; + clear_vals[0] |= (value->uint32[1] & 0xffff) << 16; + clear_vals[1] = 0; + break; + case VK_FORMAT_R16G16B16A16_UINT: + clear_vals[0] = value->uint32[0] & 0xffff; + clear_vals[0] |= (value->uint32[1] & 0xffff) << 16; + clear_vals[1] = value->uint32[2] & 0xffff; + clear_vals[1] |= (value->uint32[3] & 0xffff) << 16; + break; + case VK_FORMAT_R32_UINT: + clear_vals[0] = value->uint32[0]; + clear_vals[1] = 0; + break; + case VK_FORMAT_R32G32_UINT: + clear_vals[0] = value->uint32[0]; + clear_vals[1] = value->uint32[1]; + break; + case VK_FORMAT_R32_SINT: + clear_vals[0] = value->int32[0]; + clear_vals[1] = 0; + break; + case VK_FORMAT_R16_SFLOAT: + clear_vals[0] = util_float_to_half(value->float32[0]); + clear_vals[1] = 0; + break; + case VK_FORMAT_R16G16_SFLOAT: + clear_vals[0] = util_float_to_half(value->float32[0]); + clear_vals[0] |= (uint32_t)util_float_to_half(value->float32[1]) << 16; + clear_vals[1] = 0; + break; + case VK_FORMAT_R16G16B16A16_SFLOAT: + clear_vals[0] = 
util_float_to_half(value->float32[0]); + clear_vals[0] |= (uint32_t)util_float_to_half(value->float32[1]) << 16; + clear_vals[1] = util_float_to_half(value->float32[2]); + clear_vals[1] |= (uint32_t)util_float_to_half(value->float32[3]) << 16; + break; + case VK_FORMAT_R16_UNORM: + clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff; + clear_vals[1] = 0; + break; + case VK_FORMAT_R16G16_UNORM: + clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff; + clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0xffff)) << 16; + clear_vals[1] = 0; + break; + case VK_FORMAT_R16G16B16A16_UNORM: + clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff; + clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0xffff)) << 16; + clear_vals[1] = ((uint16_t)util_iround(CLAMP(value->float32[2], 0.0f, 1.0f) * 0xffff)) & 0xffff; + clear_vals[1] |= ((uint16_t)util_iround(CLAMP(value->float32[3], 0.0f, 1.0f) * 0xffff)) << 16; + break; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + /* TODO */ + return false; + case VK_FORMAT_R32G32_SFLOAT: + clear_vals[0] = fui(value->float32[0]); + clear_vals[1] = fui(value->float32[1]); + break; + case VK_FORMAT_R32_SFLOAT: + clear_vals[1] = 0; + clear_vals[0] = fui(value->float32[0]); + break; + default: + fprintf(stderr, "failed to fast clear %d\n", format); + return false; + } + return true; +} + +void radv_GetPhysicalDeviceFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties* pFormatProperties) +{ + RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); + + radv_physical_device_get_format_properties(physical_device, + format, + pFormatProperties); +} + +VkResult radv_GetPhysicalDeviceImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkImageCreateFlags createFlags, + VkImageFormatProperties* pImageFormatProperties) +{ + RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); + VkFormatProperties format_props; + VkFormatFeatureFlags format_feature_flags; + VkExtent3D maxExtent; + uint32_t maxMipLevels; + uint32_t maxArraySize; + VkSampleCountFlags sampleCounts = VK_SAMPLE_COUNT_1_BIT; + + radv_physical_device_get_format_properties(physical_device, format, + &format_props); + if (tiling == VK_IMAGE_TILING_LINEAR) { + format_feature_flags = format_props.linearTilingFeatures; + } else if (tiling == VK_IMAGE_TILING_OPTIMAL) { + format_feature_flags = format_props.optimalTilingFeatures; + } else { + unreachable("bad VkImageTiling"); + } + + if (format_feature_flags == 0) + goto unsupported; + + switch (type) { + default: + unreachable("bad vkimage type\n"); + case VK_IMAGE_TYPE_1D: + maxExtent.width = 16384; + maxExtent.height = 1; + maxExtent.depth = 1; + maxMipLevels = 15; /* log2(maxWidth) + 1 */ + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_2D: + maxExtent.width = 16384; + maxExtent.height = 16384; + maxExtent.depth = 1; + maxMipLevels = 15; /* log2(maxWidth) + 1 */ + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_3D: + maxExtent.width = 2048; + maxExtent.height = 2048; + maxExtent.depth = 2048; + maxMipLevels = 12; /* log2(maxWidth) + 1 */ + maxArraySize = 1; + break; + } + + if (tiling == VK_IMAGE_TILING_OPTIMAL && + type == VK_IMAGE_TYPE_2D && + (format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + 
VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) && + !(createFlags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) && + !(usage & VK_IMAGE_USAGE_STORAGE_BIT)) { + sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT | VK_SAMPLE_COUNT_8_BIT; + } + + if (usage & VK_IMAGE_USAGE_SAMPLED_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) { + goto unsupported; + } + } + + if (usage & VK_IMAGE_USAGE_STORAGE_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) { + goto unsupported; + } + } + + if (usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) { + goto unsupported; + } + } + + if (usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) { + goto unsupported; + } + } + + *pImageFormatProperties = (VkImageFormatProperties) { + .maxExtent = maxExtent, + .maxMipLevels = maxMipLevels, + .maxArrayLayers = maxArraySize, + .sampleCounts = sampleCounts, + + /* FINISHME: Accurately calculate + * VkImageFormatProperties::maxResourceSize. + */ + .maxResourceSize = UINT32_MAX, + }; + + return VK_SUCCESS; +unsupported: + *pImageFormatProperties = (VkImageFormatProperties) { + .maxExtent = { 0, 0, 0 }, + .maxMipLevels = 0, + .maxArrayLayers = 0, + .sampleCounts = 0, + .maxResourceSize = 0, + }; + + return VK_ERROR_FORMAT_NOT_SUPPORTED; +} + +void radv_GetPhysicalDeviceSparseImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + uint32_t samples, + VkImageUsageFlags usage, + VkImageTiling tiling, + uint32_t* pNumProperties, + VkSparseImageFormatProperties* pProperties) +{ + /* Sparse images are not yet supported. */ + *pNumProperties = 0; +} diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c new file mode 100644 index 00000000000..2223c89cf12 --- /dev/null +++ b/src/amd/vulkan/radv_image.c @@ -0,0 +1,1030 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "radv_private.h" +#include "vk_format.h" +#include "radv_radeon_winsys.h" +#include "sid.h" +#include "util/debug.h" +static unsigned +radv_choose_tiling(struct radv_device *Device, + const struct radv_image_create_info *create_info) +{ + const VkImageCreateInfo *pCreateInfo = create_info->vk_info; + + if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR) { + assert(pCreateInfo->samples <= 1); + return RADEON_SURF_MODE_LINEAR_ALIGNED; + } + + /* MSAA resources must be 2D tiled. */ + if (pCreateInfo->samples > 1) + return RADEON_SURF_MODE_2D; + + return RADEON_SURF_MODE_2D; +} +static int +radv_init_surface(struct radv_device *device, + struct radeon_surf *surface, + const struct radv_image_create_info *create_info) +{ + const VkImageCreateInfo *pCreateInfo = create_info->vk_info; + unsigned array_mode = radv_choose_tiling(device, create_info); + const struct vk_format_description *desc = + vk_format_description(pCreateInfo->format); + bool is_depth, is_stencil, blendable; + + is_depth = vk_format_has_depth(desc); + is_stencil = vk_format_has_stencil(desc); + surface->npix_x = pCreateInfo->extent.width; + surface->npix_y = pCreateInfo->extent.height; + surface->npix_z = pCreateInfo->extent.depth; + + surface->blk_w = vk_format_get_blockwidth(pCreateInfo->format); + surface->blk_h = vk_format_get_blockheight(pCreateInfo->format); + surface->blk_d = 1; + surface->array_size = pCreateInfo->arrayLayers; + surface->last_level = pCreateInfo->mipLevels - 1; + + surface->bpe = vk_format_get_blocksize(pCreateInfo->format); + /* align byte per element on dword */ + if (surface->bpe == 3) { + surface->bpe = 4; + } + surface->nsamples = pCreateInfo->samples ? pCreateInfo->samples : 1; + surface->flags = RADEON_SURF_SET(array_mode, MODE); + + switch (pCreateInfo->imageType){ + case VK_IMAGE_TYPE_1D: + if (pCreateInfo->arrayLayers > 1) + surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D_ARRAY, TYPE); + else + surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D, TYPE); + break; + case VK_IMAGE_TYPE_2D: + if (pCreateInfo->arrayLayers > 1) + surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D_ARRAY, TYPE); + else + surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D, TYPE); + break; + case VK_IMAGE_TYPE_3D: + surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_3D, TYPE); + break; + default: + unreachable("unhandled image type"); + } + + if (is_depth) { + surface->flags |= RADEON_SURF_ZBUFFER; + } + + if (is_stencil) + surface->flags |= RADEON_SURF_SBUFFER | + RADEON_SURF_HAS_SBUFFER_MIPTREE; + + surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX; + + if ((pCreateInfo->usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_STORAGE_BIT)) || + (pCreateInfo->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) || + (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR) || + device->instance->physicalDevice.rad_info.chip_class < VI || + create_info->scanout || !device->allow_dcc || + !radv_is_colorbuffer_format_supported(pCreateInfo->format, &blendable)) + surface->flags |= RADEON_SURF_DISABLE_DCC; + if (create_info->scanout) + surface->flags |= RADEON_SURF_SCANOUT; + return 0; +} +#define ATI_VENDOR_ID 0x1002 +static uint32_t si_get_bo_metadata_word1(struct radv_device *device) +{ + return (ATI_VENDOR_ID << 16) | device->instance->physicalDevice.rad_info.pci_id; +} + +static inline unsigned +si_tile_mode_index(const struct radv_image *image, unsigned level, bool stencil) +{ + if (stencil) + return image->surface.stencil_tiling_index[level]; + else + return image->surface.tiling_index[level]; +} + 
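The BO metadata word built by si_get_bo_metadata_word1() above packs the vendor id into the high half and the PCI device id into the low half of one dword, which lets an importer disambiguate TILE_MODE_INDEX (see the comment in radv_query_opaque_metadata() further down). A small worked example under an assumed device id; 0x67DF is illustrative only, not taken from this patch:

#include <assert.h>
#include <stdint.h>

/* (ATI_VENDOR_ID << 16) | pci_id, as in si_get_bo_metadata_word1() above. */
static void metadata_word1_example(void)
{
    const uint32_t vendor_id = 0x1002;   /* ATI_VENDOR_ID */
    const uint32_t pci_id    = 0x67DF;   /* hypothetical device id */
    uint32_t word1 = (vendor_id << 16) | pci_id;

    assert(word1 == 0x100267DFu);
}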
+static unsigned radv_map_swizzle(unsigned swizzle) +{ + switch (swizzle) { + case VK_SWIZZLE_Y: + return V_008F0C_SQ_SEL_Y; + case VK_SWIZZLE_Z: + return V_008F0C_SQ_SEL_Z; + case VK_SWIZZLE_W: + return V_008F0C_SQ_SEL_W; + case VK_SWIZZLE_0: + return V_008F0C_SQ_SEL_0; + case VK_SWIZZLE_1: + return V_008F0C_SQ_SEL_1; + default: /* VK_SWIZZLE_X */ + return V_008F0C_SQ_SEL_X; + } +} + +static void +radv_make_buffer_descriptor(struct radv_device *device, + struct radv_buffer *buffer, + VkFormat vk_format, + unsigned offset, + unsigned range, + uint32_t *state) +{ + const struct vk_format_description *desc; + unsigned stride; + uint64_t gpu_address = device->ws->buffer_get_va(buffer->bo); + uint64_t va = gpu_address + buffer->offset; + unsigned num_format, data_format; + int first_non_void; + desc = vk_format_description(vk_format); + first_non_void = vk_format_get_first_non_void_channel(vk_format); + stride = desc->block.bits / 8; + + num_format = radv_translate_buffer_numformat(desc, first_non_void); + data_format = radv_translate_buffer_dataformat(desc, first_non_void); + + va += offset; + state[0] = va; + state[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(stride); + state[2] = range; + state[3] = S_008F0C_DST_SEL_X(radv_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(radv_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(radv_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(radv_map_swizzle(desc->swizzle[3])) | + S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); +} + +static void +si_set_mutable_tex_desc_fields(struct radv_device *device, + struct radv_image *image, + const struct radeon_surf_level *base_level_info, + unsigned base_level, unsigned first_level, + unsigned block_width, bool is_stencil, + uint32_t *state) +{ + uint64_t gpu_address = device->ws->buffer_get_va(image->bo) + image->offset; + uint64_t va = gpu_address + base_level_info->offset; + unsigned pitch = base_level_info->nblk_x * block_width; + + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[3] &= C_008F1C_TILING_INDEX; + state[4] &= C_008F20_PITCH; + state[6] &= C_008F28_COMPRESSION_EN; + + assert(!(va & 255)); + + state[0] = va >> 8; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(image, base_level, + is_stencil)); + state[4] |= S_008F20_PITCH(pitch - 1); + + if (image->surface.dcc_size && image->surface.level[first_level].dcc_enabled) { + state[6] |= S_008F28_COMPRESSION_EN(1); + state[7] = (gpu_address + + image->dcc_offset + + base_level_info->dcc_offset) >> 8; + } +} + +static unsigned radv_tex_dim(VkImageType image_type, VkImageViewType view_type, + unsigned nr_layers, unsigned nr_samples, bool is_storage_image) +{ + if (view_type == VK_IMAGE_VIEW_TYPE_CUBE || view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) + return is_storage_image ? V_008F1C_SQ_RSRC_IMG_2D_ARRAY : V_008F1C_SQ_RSRC_IMG_CUBE; + switch (image_type) { + case VK_IMAGE_TYPE_1D: + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_1D_ARRAY : V_008F1C_SQ_RSRC_IMG_1D; + case VK_IMAGE_TYPE_2D: + if (nr_samples > 1) + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_MSAA; + else + return nr_layers > 1 ? V_008F1C_SQ_RSRC_IMG_2D_ARRAY : V_008F1C_SQ_RSRC_IMG_2D; + case VK_IMAGE_TYPE_3D: + if (view_type == VK_IMAGE_VIEW_TYPE_3D) + return V_008F1C_SQ_RSRC_IMG_3D; + else + return V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + default: + unreachable("illegale image type"); + } +} +/** + * Build the sampler view descriptor for a texture. 
+ */ +static void +si_make_texture_descriptor(struct radv_device *device, + struct radv_image *image, + bool sampler, + VkImageViewType view_type, + VkFormat vk_format, + const VkComponentMapping *mapping, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, + uint32_t *state, + uint32_t *fmask_state) +{ + const struct vk_format_description *desc; + enum vk_swizzle swizzle[4]; + int first_non_void; + unsigned num_format, data_format, type; + + desc = vk_format_description(vk_format); + + if (desc->colorspace == VK_FORMAT_COLORSPACE_ZS) { + const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; + const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; + + switch (vk_format) { + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + vk_format_compose_swizzles(mapping, swizzle_yyyy, swizzle); + break; + default: + vk_format_compose_swizzles(mapping, swizzle_xxxx, swizzle); + } + } else { + vk_format_compose_swizzles(mapping, desc->swizzle, swizzle); + } + + first_non_void = vk_format_get_first_non_void_channel(vk_format); + + num_format = radv_translate_tex_numformat(vk_format, desc, first_non_void); + if (num_format == ~0) { + num_format = 0; + } + + data_format = radv_translate_tex_dataformat(vk_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; + } + + type = radv_tex_dim(image->type, view_type, image->array_size, image->samples, + (image->usage & VK_IMAGE_USAGE_STORAGE_BIT)); + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = image->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || + type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (view_type != VK_IMAGE_VIEW_TYPE_3D) + depth = image->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = image->array_size / 6; + + state[0] = 0; + state[1] = (S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1)); + state[3] = (S_008F1C_DST_SEL_X(radv_map_swizzle(swizzle[0])) | + S_008F1C_DST_SEL_Y(radv_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(radv_map_swizzle(swizzle[2])) | + S_008F1C_DST_SEL_W(radv_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(image->samples > 1 ? + 0 : first_level) | + S_008F1C_LAST_LEVEL(image->samples > 1 ? + util_logbase2(image->samples) : + last_level) | + S_008F1C_POW2_PAD(image->levels > 1) | + S_008F1C_TYPE(type)); + state[4] = S_008F20_DEPTH(depth - 1); + state[5] = (S_008F24_BASE_ARRAY(first_layer) | + S_008F24_LAST_ARRAY(last_layer)); + state[6] = 0; + state[7] = 0; + + if (image->dcc_offset) { + unsigned swap = radv_translate_colorswap(vk_format, FALSE); + + state[6] = S_008F28_ALPHA_IS_ON_MSB(swap <= 1); + } else { + /* The last dword is unused by hw. The shader uses it to clear + * bits in the first dword of sampler state. + */ + if (device->instance->physicalDevice.rad_info.chip_class <= CIK && image->samples <= 1) { + if (first_level == last_level) + state[7] = C_008F30_MAX_ANISO_RATIO; + else + state[7] = 0xffffffff; + } + } + + /* Initialize the sampler view for FMASK. 
*/ + if (image->fmask.size) { + uint32_t fmask_format; + uint64_t gpu_address = device->ws->buffer_get_va(image->bo); + uint64_t va; + + va = gpu_address + image->offset + image->fmask.offset; + + switch (image->samples) { + case 2: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; + break; + case 4: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; + break; + case 8: + fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; + break; + default: + assert(0); + fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID; + } + + fmask_state[0] = va >> 8; + fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | + S_008F14_DATA_FORMAT(fmask_format) | + S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_UINT); + fmask_state[2] = S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1); + fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TILING_INDEX(image->fmask.tile_mode_index) | + S_008F1C_TYPE(radv_tex_dim(image->type, view_type, 1, 0, false)); + fmask_state[4] = S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(image->fmask.pitch_in_pixels - 1); + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer) | + S_008F24_LAST_ARRAY(last_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + } +} + +static void +radv_query_opaque_metadata(struct radv_device *device, + struct radv_image *image, + struct radeon_bo_metadata *md) +{ + static const VkComponentMapping fixedmapping; + uint32_t desc[8], i; + + /* Metadata image format format version 1: + * [0] = 1 (metadata format identifier) + * [1] = (VENDOR_ID << 16) | PCI_ID + * [2:9] = image descriptor for the whole resource + * [2] is always 0, because the base address is cleared + * [9] is the DCC offset bits [39:8] from the beginning of + * the buffer + * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level + */ + md->metadata[0] = 1; /* metadata image format version 1 */ + + /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ + md->metadata[1] = si_get_bo_metadata_word1(device); + + + si_make_texture_descriptor(device, image, true, + (VkImageViewType)image->type, image->vk_format, + &fixedmapping, 0, image->levels - 1, 0, + image->array_size, + image->extent.width, image->extent.height, + image->extent.depth, + desc, NULL); + + si_set_mutable_tex_desc_fields(device, image, &image->surface.level[0], 0, 0, + image->surface.blk_w, false, desc); + + /* Clear the base address and set the relative DCC offset. */ + desc[0] = 0; + desc[1] &= C_008F14_BASE_ADDRESS_HI; + desc[7] = image->dcc_offset >> 8; + + /* Dwords [2:9] contain the image descriptor. */ + memcpy(&md->metadata[2], desc, sizeof(desc)); + + /* Dwords [10:..] contain the mipmap level offsets. */ + for (i = 0; i <= image->levels - 1; i++) + md->metadata[10+i] = image->surface.level[i].offset >> 8; + + md->size_metadata = (11 + image->levels - 1) * 4; +} + +void +radv_init_metadata(struct radv_device *device, + struct radv_image *image, + struct radeon_bo_metadata *metadata) +{ + struct radeon_surf *surface = &image->surface; + + memset(metadata, 0, sizeof(*metadata)); + metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ? 
+ RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->pipe_config = surface->pipe_config; + metadata->bankw = surface->bankw; + metadata->bankh = surface->bankh; + metadata->tile_split = surface->tile_split; + metadata->mtilea = surface->mtilea; + metadata->num_banks = surface->num_banks; + metadata->stride = surface->level[0].pitch_bytes; + metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + + radv_query_opaque_metadata(device, image, metadata); +} + +/* The number of samples can be specified independently of the texture. */ +static void +radv_image_get_fmask_info(struct radv_device *device, + struct radv_image *image, + unsigned nr_samples, + struct radv_fmask_info *out) +{ + /* FMASK is allocated like an ordinary texture. */ + struct radeon_surf fmask = image->surface; + + memset(out, 0, sizeof(*out)); + + fmask.bo_alignment = 0; + fmask.bo_size = 0; + fmask.nsamples = 1; + fmask.flags |= RADEON_SURF_FMASK; + + /* Force 2D tiling if it wasn't set. This may occur when creating + * FMASK for MSAA resolve on R6xx. On R6xx, the single-sample + * destination buffer must have an FMASK too. */ + fmask.flags = RADEON_SURF_CLR(fmask.flags, MODE); + fmask.flags |= RADEON_SURF_SET(RADEON_SURF_MODE_2D, MODE); + + fmask.flags |= RADEON_SURF_HAS_TILE_MODE_INDEX; + + switch (nr_samples) { + case 2: + case 4: + fmask.bpe = 1; + break; + case 8: + fmask.bpe = 4; + break; + default: + return; + } + + device->ws->surface_init(device->ws, &fmask); + assert(fmask.level[0].mode == RADEON_SURF_MODE_2D); + + out->slice_tile_max = (fmask.level[0].nblk_x * fmask.level[0].nblk_y) / 64; + if (out->slice_tile_max) + out->slice_tile_max -= 1; + + out->tile_mode_index = fmask.tiling_index[0]; + out->pitch_in_pixels = fmask.level[0].nblk_x; + out->bank_height = fmask.bankh; + out->alignment = MAX2(256, fmask.bo_alignment); + out->size = fmask.bo_size; +} + +static void +radv_image_alloc_fmask(struct radv_device *device, + struct radv_image *image) +{ + radv_image_get_fmask_info(device, image, image->samples, &image->fmask); + + image->fmask.offset = align64(image->size, image->fmask.alignment); + image->size = image->fmask.offset + image->fmask.size; +} + +static void +radv_image_get_cmask_info(struct radv_device *device, + struct radv_image *image, + struct radv_cmask_info *out) +{ + unsigned pipe_interleave_bytes = device->instance->physicalDevice.rad_info.pipe_interleave_bytes; + unsigned num_pipes = device->instance->physicalDevice.rad_info.num_tile_pipes; + unsigned cl_width, cl_height; + + switch (num_pipes) { + case 2: + cl_width = 32; + cl_height = 16; + break; + case 4: + cl_width = 32; + cl_height = 32; + break; + case 8: + cl_width = 64; + cl_height = 32; + break; + case 16: /* Hawaii */ + cl_width = 64; + cl_height = 64; + break; + default: + assert(0); + return; + } + + unsigned base_align = num_pipes * pipe_interleave_bytes; + + unsigned width = align(image->surface.npix_x, cl_width*8); + unsigned height = align(image->surface.npix_y, cl_height*8); + unsigned slice_elements = (width * height) / (8*8); + + /* Each element of CMASK is a nibble. */ + unsigned slice_bytes = slice_elements / 2; + + out->pitch = width; + out->height = height; + out->xalign = cl_width * 8; + out->yalign = cl_height * 8; + out->slice_tile_max = (width * height) / (128*128); + if (out->slice_tile_max) + out->slice_tile_max -= 1; + + out->alignment = MAX2(256, base_align); + out->size = (image->type == VK_IMAGE_TYPE_3D ? 
image->extent.depth : image->array_size) * + align(slice_bytes, base_align); +} + +static void +radv_image_alloc_cmask(struct radv_device *device, + struct radv_image *image) +{ + radv_image_get_cmask_info(device, image, &image->cmask); + + image->cmask.offset = align64(image->size, image->cmask.alignment); + /* + 8 for storing the clear values */ + image->clear_value_offset = image->cmask.offset + image->cmask.size; + image->size = image->cmask.offset + image->cmask.size + 8; +} + +static void +radv_image_alloc_dcc(struct radv_device *device, + struct radv_image *image) +{ + image->dcc_offset = align64(image->size, image->surface.dcc_alignment); + /* + 8 for storing the clear values */ + image->clear_value_offset = image->dcc_offset + image->surface.dcc_size; + image->size = image->dcc_offset + image->surface.dcc_size + 8; +} + +static unsigned +radv_image_get_htile_size(struct radv_device *device, + struct radv_image *image) +{ + unsigned cl_width, cl_height, width, height; + unsigned slice_elements, slice_bytes, base_align; + unsigned num_pipes = device->instance->physicalDevice.rad_info.num_tile_pipes; + unsigned pipe_interleave_bytes = device->instance->physicalDevice.rad_info.pipe_interleave_bytes; + + /* Overalign HTILE on P2 configs to work around GPU hangs in + * piglit/depthstencil-render-miplevels 585. + * + * This has been confirmed to help Kabini & Stoney, where the hangs + * are always reproducible. I think I have seen the test hang + * on Carrizo too, though it was very rare there. + */ + if (device->instance->physicalDevice.rad_info.chip_class >= CIK && num_pipes < 4) + num_pipes = 4; + + switch (num_pipes) { + case 1: + cl_width = 32; + cl_height = 16; + break; + case 2: + cl_width = 32; + cl_height = 32; + break; + case 4: + cl_width = 64; + cl_height = 32; + break; + case 8: + cl_width = 64; + cl_height = 64; + break; + case 16: + cl_width = 128; + cl_height = 64; + break; + default: + assert(0); + return 0; + } + + width = align(image->surface.npix_x, cl_width * 8); + height = align(image->surface.npix_y, cl_height * 8); + + slice_elements = (width * height) / (8 * 8); + slice_bytes = slice_elements * 4; + + base_align = num_pipes * pipe_interleave_bytes; + + image->htile.pitch = width; + image->htile.height = height; + image->htile.xalign = cl_width * 8; + image->htile.yalign = cl_height * 8; + + return image->array_size * + align(slice_bytes, base_align); +} + +static void +radv_image_alloc_htile(struct radv_device *device, + struct radv_image *image) +{ + if (env_var_as_boolean("RADV_HIZ_DISABLE", false)) + return; + + image->htile.size = radv_image_get_htile_size(device, image); + + if (!image->htile.size) + return; + + image->htile.offset = align64(image->size, 32768); + + /* + 8 for storing the clear values */ + image->clear_value_offset = image->htile.offset + image->htile.size; + image->size = image->htile.offset + image->htile.size + 8; + image->alignment = align64(image->alignment, 32768); +} + +VkResult +radv_image_create(VkDevice _device, + const struct radv_image_create_info *create_info, + const VkAllocationCallbacks* alloc, + VkImage *pImage) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + const VkImageCreateInfo *pCreateInfo = create_info->vk_info; + struct radv_image *image = NULL; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); + + radv_assert(pCreateInfo->mipLevels > 0); + radv_assert(pCreateInfo->arrayLayers > 0); + radv_assert(pCreateInfo->samples > 0); + radv_assert(pCreateInfo->extent.width > 0); + 
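/*
 * Worked example for the CMASK sizing above (illustrative only; the
 * pipe_interleave_bytes value of 256 is an assumption and varies per ASIC):
 *
 *   num_pipes = 4          -> cl_width = 32, cl_height = 32
 *   1920x1080 color image  -> width  = align(1920, 32*8) = 2048
 *                             height = align(1080, 32*8) = 1280
 *   slice_elements = (2048 * 1280) / 64 = 40960   (one nibble each)
 *   slice_bytes    = 40960 / 2          = 20480
 *   base_align     = 4 * 256            = 1024
 *   size           = 1 layer * align(20480, 1024) = 20480 bytes
 *   slice_tile_max = (2048 * 1280) / (128 * 128) - 1 = 159
 *
 * Each of the alloc helpers for DCC, CMASK and HTILE appends its buffer to
 * the end of the image, bumping image->size, and reserves 8 extra bytes
 * right after it (clear_value_offset) for the fast-clear value.
 */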
radv_assert(pCreateInfo->extent.height > 0); + radv_assert(pCreateInfo->extent.depth > 0); + + image = radv_alloc2(&device->alloc, alloc, sizeof(*image), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(image, 0, sizeof(*image)); + image->type = pCreateInfo->imageType; + image->extent = pCreateInfo->extent; + image->vk_format = pCreateInfo->format; + image->levels = pCreateInfo->mipLevels; + image->array_size = pCreateInfo->arrayLayers; + image->samples = pCreateInfo->samples; + image->tiling = pCreateInfo->tiling; + image->usage = pCreateInfo->usage; + radv_init_surface(device, &image->surface, create_info); + + device->ws->surface_init(device->ws, &image->surface); + + image->size = image->surface.bo_size; + image->alignment = image->surface.bo_alignment; + + if ((pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) && + image->surface.dcc_size) + radv_image_alloc_dcc(device, image); + else + image->surface.dcc_size = 0; + + if ((pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) && + pCreateInfo->mipLevels == 1 && + !image->surface.dcc_size && image->extent.depth == 1) + radv_image_alloc_cmask(device, image); + if (image->samples > 1 && vk_format_is_color(pCreateInfo->format)) { + radv_image_alloc_fmask(device, image); + } else if (vk_format_is_depth(pCreateInfo->format)) { + + radv_image_alloc_htile(device, image); + } + + + if (create_info->stride && create_info->stride != image->surface.level[0].pitch_bytes) { + image->surface.level[0].nblk_x = create_info->stride / image->surface.bpe; + image->surface.level[0].pitch_bytes = create_info->stride; + image->surface.level[0].slice_size = create_info->stride * image->surface.level[0].nblk_y; + } + *pImage = radv_image_to_handle(image); + + return VK_SUCCESS; +} + +void +radv_image_view_init(struct radv_image_view *iview, + struct radv_device *device, + const VkImageViewCreateInfo* pCreateInfo, + struct radv_cmd_buffer *cmd_buffer, + VkImageUsageFlags usage_mask) +{ + RADV_FROM_HANDLE(radv_image, image, pCreateInfo->image); + const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; + bool is_stencil = false; + switch (image->type) { + default: + unreachable("bad VkImageType"); + case VK_IMAGE_TYPE_1D: + case VK_IMAGE_TYPE_2D: + assert(range->baseArrayLayer + radv_get_layerCount(image, range) - 1 <= image->array_size); + break; + case VK_IMAGE_TYPE_3D: + assert(range->baseArrayLayer + radv_get_layerCount(image, range) - 1 + <= radv_minify(image->extent.depth, range->baseMipLevel)); + break; + } + iview->image = image; + iview->bo = image->bo; + iview->type = pCreateInfo->viewType; + iview->vk_format = pCreateInfo->format; + iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask; + + if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + is_stencil = true; + iview->extent = (VkExtent3D) { + .width = radv_minify(image->extent.width , range->baseMipLevel), + .height = radv_minify(image->extent.height, range->baseMipLevel), + .depth = radv_minify(image->extent.depth , range->baseMipLevel), + }; + + iview->extent.width = round_up_u32(iview->extent.width * vk_format_get_blockwidth(iview->vk_format), + vk_format_get_blockwidth(image->vk_format)); + iview->extent.height = round_up_u32(iview->extent.height * vk_format_get_blockheight(iview->vk_format), + vk_format_get_blockheight(image->vk_format)); + + iview->base_layer = range->baseArrayLayer; + iview->layer_count = radv_get_layerCount(image, range); + iview->base_mip = range->baseMipLevel; + + 
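/*
 * The view extent computed above first applies mip scaling and then
 * rescales by the texel-block sizes of the two formats, which matters when
 * a compressed image is viewed through a differently-blocked format.
 * A minimal worked example, assuming radv_minify() is the usual
 * max(1, dim >> level) mip reduction:
 *
 *   1024x512 image, baseMipLevel = 3  -> view extent 128x64
 *   same image,     baseMipLevel = 10 -> view extent 1x1 (clamped)
 *
 * The descriptor built below bakes this extent plus the selected mip and
 * layer range into the 8-dword image descriptor (and into the FMASK
 * descriptor for multisampled color images).
 */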
si_make_texture_descriptor(device, image, false, + iview->type, + pCreateInfo->format, + &pCreateInfo->components, + 0, radv_get_levelCount(image, range) - 1, + range->baseArrayLayer, + range->baseArrayLayer + radv_get_layerCount(image, range) - 1, + iview->extent.width, + iview->extent.height, + iview->extent.depth, + iview->descriptor, + iview->fmask_descriptor); + si_set_mutable_tex_desc_fields(device, image, + is_stencil ? &image->surface.stencil_level[range->baseMipLevel] : &image->surface.level[range->baseMipLevel], range->baseMipLevel, + range->baseMipLevel, + image->surface.blk_w, is_stencil, iview->descriptor); +} + +void radv_image_set_optimal_micro_tile_mode(struct radv_device *device, + struct radv_image *image, uint32_t micro_tile_mode) +{ + /* These magic numbers were copied from addrlib. It doesn't use any + * definitions for them either. They are all 2D_TILED_THIN1 modes with + * different bpp and micro tile mode. + */ + if (device->instance->physicalDevice.rad_info.chip_class >= CIK) { + switch (micro_tile_mode) { + case 0: /* displayable */ + image->surface.tiling_index[0] = 10; + break; + case 1: /* thin */ + image->surface.tiling_index[0] = 14; + break; + case 3: /* rotated */ + image->surface.tiling_index[0] = 28; + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } else { /* SI */ + switch (micro_tile_mode) { + case 0: /* displayable */ + switch (image->surface.bpe) { + case 8: + image->surface.tiling_index[0] = 10; + break; + case 16: + image->surface.tiling_index[0] = 11; + break; + default: /* 32, 64 */ + image->surface.tiling_index[0] = 12; + break; + } + break; + case 1: /* thin */ + switch (image->surface.bpe) { + case 8: + image->surface.tiling_index[0] = 14; + break; + case 16: + image->surface.tiling_index[0] = 15; + break; + case 32: + image->surface.tiling_index[0] = 16; + break; + default: /* 64, 128 */ + image->surface.tiling_index[0] = 17; + break; + } + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } + + image->surface.micro_tile_mode = micro_tile_mode; +} + +bool radv_layout_has_htile(const struct radv_image *image, + VkImageLayout layout) +{ + return (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); +} + +bool radv_layout_is_htile_compressed(const struct radv_image *image, + VkImageLayout layout) +{ + return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; +} + +bool radv_layout_can_expclear(const struct radv_image *image, + VkImageLayout layout) +{ + return (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); +} + +bool radv_layout_has_cmask(const struct radv_image *image, + VkImageLayout layout) +{ + return (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); +} + +VkResult +radv_CreateImage(VkDevice device, + const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImage *pImage) +{ + return radv_image_create(device, + &(struct radv_image_create_info) { + .vk_info = pCreateInfo, + .scanout = false, + }, + pAllocator, + pImage); +} + +void +radv_DestroyImage(VkDevice _device, VkImage _image, + const VkAllocationCallbacks *pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + + if (!_image) + return; + + radv_free2(&device->alloc, pAllocator, radv_image_from_handle(_image)); +} + +void radv_GetImageSubresourceLayout( + VkDevice device, + 
VkImage _image, + const VkImageSubresource* pSubresource, + VkSubresourceLayout* pLayout) +{ + RADV_FROM_HANDLE(radv_image, image, _image); + int level = pSubresource->mipLevel; + int layer = pSubresource->arrayLayer; + + pLayout->offset = image->surface.level[level].offset + image->surface.level[level].slice_size * layer; + pLayout->rowPitch = image->surface.level[level].pitch_bytes; + pLayout->arrayPitch = image->surface.level[level].slice_size; + pLayout->depthPitch = image->surface.level[level].slice_size; + pLayout->size = image->surface.level[level].slice_size; + if (image->type == VK_IMAGE_TYPE_3D) + pLayout->size *= image->surface.level[level].nblk_z; +} + + +VkResult +radv_CreateImageView(VkDevice _device, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_image_view *view; + + view = radv_alloc2(&device->alloc, pAllocator, sizeof(*view), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (view == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + radv_image_view_init(view, device, pCreateInfo, NULL, ~0); + + *pView = radv_image_view_to_handle(view); + + return VK_SUCCESS; +} + +void +radv_DestroyImageView(VkDevice _device, VkImageView _iview, + const VkAllocationCallbacks *pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_image_view, iview, _iview); + + if (!iview) + return; + radv_free2(&device->alloc, pAllocator, iview); +} + +void radv_buffer_view_init(struct radv_buffer_view *view, + struct radv_device *device, + const VkBufferViewCreateInfo* pCreateInfo, + struct radv_cmd_buffer *cmd_buffer) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, pCreateInfo->buffer); + + view->bo = buffer->bo; + view->range = pCreateInfo->range == VK_WHOLE_SIZE ? 
+ buffer->size - pCreateInfo->offset : pCreateInfo->range; + view->vk_format = pCreateInfo->format; + + radv_make_buffer_descriptor(device, buffer, view->vk_format, + pCreateInfo->offset, view->range, view->state); +} + +VkResult +radv_CreateBufferView(VkDevice _device, + const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBufferView *pView) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_buffer_view *view; + + view = radv_alloc2(&device->alloc, pAllocator, sizeof(*view), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!view) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + radv_buffer_view_init(view, device, pCreateInfo, NULL); + + *pView = radv_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +void +radv_DestroyBufferView(VkDevice _device, VkBufferView bufferView, + const VkAllocationCallbacks *pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_buffer_view, view, bufferView); + + if (!view) + return; + + radv_free2(&device->alloc, pAllocator, view); +} diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c new file mode 100644 index 00000000000..04fa247dd36 --- /dev/null +++ b/src/amd/vulkan/radv_meta.c @@ -0,0 +1,388 @@ +/* + * Copyright © 2016 Red Hat + * based on intel anv code: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "radv_meta.h" + +#include +#include +#include +#include + +void +radv_meta_save(struct radv_meta_saved_state *state, + const struct radv_cmd_buffer *cmd_buffer, + uint32_t dynamic_mask) +{ + state->old_pipeline = cmd_buffer->state.pipeline; + state->old_descriptor_set0 = cmd_buffer->state.descriptors[0]; + memcpy(state->old_vertex_bindings, cmd_buffer->state.vertex_bindings, + sizeof(state->old_vertex_bindings)); + + state->dynamic_mask = dynamic_mask; + radv_dynamic_state_copy(&state->dynamic, &cmd_buffer->state.dynamic, + dynamic_mask); + + memcpy(state->push_constants, cmd_buffer->push_constants, MAX_PUSH_CONSTANTS_SIZE); +} + +void +radv_meta_restore(const struct radv_meta_saved_state *state, + struct radv_cmd_buffer *cmd_buffer) +{ + cmd_buffer->state.pipeline = state->old_pipeline; + radv_bind_descriptor_set(cmd_buffer, state->old_descriptor_set0, 0); + memcpy(cmd_buffer->state.vertex_bindings, state->old_vertex_bindings, + sizeof(state->old_vertex_bindings)); + + cmd_buffer->state.vb_dirty |= (1 << RADV_META_VERTEX_BINDING_COUNT) - 1; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE; + + radv_dynamic_state_copy(&cmd_buffer->state.dynamic, &state->dynamic, + state->dynamic_mask); + cmd_buffer->state.dirty |= state->dynamic_mask; + + memcpy(cmd_buffer->push_constants, state->push_constants, MAX_PUSH_CONSTANTS_SIZE); + cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_COMPUTE_BIT; +} + +void +radv_meta_save_pass(struct radv_meta_saved_pass_state *state, + const struct radv_cmd_buffer *cmd_buffer) +{ + state->pass = cmd_buffer->state.pass; + state->subpass = cmd_buffer->state.subpass; + state->framebuffer = cmd_buffer->state.framebuffer; + state->attachments = cmd_buffer->state.attachments; + state->render_area = cmd_buffer->state.render_area; +} + +void +radv_meta_restore_pass(const struct radv_meta_saved_pass_state *state, + struct radv_cmd_buffer *cmd_buffer) +{ + cmd_buffer->state.pass = state->pass; + cmd_buffer->state.subpass = state->subpass; + cmd_buffer->state.framebuffer = state->framebuffer; + cmd_buffer->state.attachments = state->attachments; + cmd_buffer->state.render_area = state->render_area; + if (state->subpass) + radv_emit_framebuffer_state(cmd_buffer); +} + +void +radv_meta_save_compute(struct radv_meta_saved_compute_state *state, + const struct radv_cmd_buffer *cmd_buffer, + unsigned push_constant_size) +{ + state->old_pipeline = cmd_buffer->state.compute_pipeline; + state->old_descriptor_set0 = cmd_buffer->state.descriptors[0]; + + if (push_constant_size) + memcpy(state->push_constants, cmd_buffer->push_constants, push_constant_size); +} + +void +radv_meta_restore_compute(const struct radv_meta_saved_compute_state *state, + struct radv_cmd_buffer *cmd_buffer, + unsigned push_constant_size) +{ + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + radv_pipeline_to_handle(state->old_pipeline)); + radv_bind_descriptor_set(cmd_buffer, state->old_descriptor_set0, 0); + + if (push_constant_size) { + memcpy(cmd_buffer->push_constants, state->push_constants, push_constant_size); + cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT; + } +} + +VkImageViewType +radv_meta_get_view_type(const struct radv_image *image) +{ + switch (image->type) { + case VK_IMAGE_TYPE_1D: return VK_IMAGE_VIEW_TYPE_1D; + case VK_IMAGE_TYPE_2D: return VK_IMAGE_VIEW_TYPE_2D; + case VK_IMAGE_TYPE_3D: return VK_IMAGE_VIEW_TYPE_3D; + default: + unreachable("bad VkImageViewType"); + } +} + +/** + 
* When creating a destination VkImageView, this function provides the needed + * VkImageViewCreateInfo::subresourceRange::baseArrayLayer. + */ +uint32_t +radv_meta_get_iview_layer(const struct radv_image *dest_image, + const VkImageSubresourceLayers *dest_subresource, + const VkOffset3D *dest_offset) +{ + switch (dest_image->type) { + case VK_IMAGE_TYPE_1D: + case VK_IMAGE_TYPE_2D: + return dest_subresource->baseArrayLayer; + case VK_IMAGE_TYPE_3D: + /* HACK: Vulkan does not allow attaching a 3D image to a framebuffer, + * but meta does it anyway. When doing so, we translate the + * destination's z offset into an array offset. + */ + return dest_offset->z; + default: + assert(!"bad VkImageType"); + return 0; + } +} + +static void * +meta_alloc(void* _device, size_t size, size_t alignment, + VkSystemAllocationScope allocationScope) +{ + struct radv_device *device = _device; + return device->alloc.pfnAllocation(device->alloc.pUserData, size, alignment, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); +} + +static void * +meta_realloc(void* _device, void *original, size_t size, size_t alignment, + VkSystemAllocationScope allocationScope) +{ + struct radv_device *device = _device; + return device->alloc.pfnReallocation(device->alloc.pUserData, original, + size, alignment, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); +} + +static void +meta_free(void* _device, void *data) +{ + struct radv_device *device = _device; + return device->alloc.pfnFree(device->alloc.pUserData, data); +} + +static bool +radv_builtin_cache_path(char *path) +{ + char *xdg_cache_home = getenv("XDG_CACHE_HOME"); + const char *suffix = "/radv_builtin_shaders"; + const char *suffix2 = "/.cache/radv_builtin_shaders"; + struct passwd pwd, *result; + char path2[PATH_MAX + 1]; /* PATH_MAX is not a real max,but suffices here. 
*/ + + if (xdg_cache_home) { + + if (strlen(xdg_cache_home) + strlen(suffix) > PATH_MAX) + return false; + + strcpy(path, xdg_cache_home); + strcat(path, suffix); + return true; + } + + getpwuid_r(getuid(), &pwd, path2, PATH_MAX - strlen(suffix2), &result); + if (!result) + return false; + + strcpy(path, pwd.pw_dir); + strcat(path, "/.cache"); + mkdir(path, 0755); + + strcat(path, suffix); + return true; +} + +static void +radv_load_meta_pipeline(struct radv_device *device) +{ + char path[PATH_MAX + 1]; + struct stat st; + void *data = NULL; + + if (!radv_builtin_cache_path(path)) + return; + + int fd = open(path, O_RDONLY); + if (fd < 0) + return; + if (fstat(fd, &st)) + goto fail; + data = malloc(st.st_size); + if (!data) + goto fail; + if(read(fd, data, st.st_size) == -1) + goto fail; + + radv_pipeline_cache_load(&device->meta_state.cache, data, st.st_size); +fail: + free(data); + close(fd); +} + +static void +radv_store_meta_pipeline(struct radv_device *device) +{ + char path[PATH_MAX + 1], path2[PATH_MAX + 7]; + size_t size; + void *data = NULL; + + if (!device->meta_state.cache.modified) + return; + + if (radv_GetPipelineCacheData(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &size, NULL)) + return; + + if (!radv_builtin_cache_path(path)) + return; + + strcpy(path2, path); + strcat(path2, "XXXXXX"); + int fd = mkstemp(path2);//open(path, O_WRONLY | O_CREAT, 0600); + if (fd < 0) + return; + data = malloc(size); + if (!data) + goto fail; + + if (radv_GetPipelineCacheData(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &size, data)) + goto fail; + if(write(fd, data, size) == -1) + goto fail; + + rename(path2, path); +fail: + free(data); + close(fd); + unlink(path2); +} + +VkResult +radv_device_init_meta(struct radv_device *device) +{ + VkResult result; + + device->meta_state.alloc = (VkAllocationCallbacks) { + .pUserData = device, + .pfnAllocation = meta_alloc, + .pfnReallocation = meta_realloc, + .pfnFree = meta_free, + }; + + device->meta_state.cache.alloc = device->meta_state.alloc; + radv_pipeline_cache_init(&device->meta_state.cache, device); + radv_load_meta_pipeline(device); + + result = radv_device_init_meta_clear_state(device); + if (result != VK_SUCCESS) + goto fail_clear; + + result = radv_device_init_meta_resolve_state(device); + if (result != VK_SUCCESS) + goto fail_resolve; + + result = radv_device_init_meta_blit_state(device); + if (result != VK_SUCCESS) + goto fail_blit; + + result = radv_device_init_meta_blit2d_state(device); + if (result != VK_SUCCESS) + goto fail_blit2d; + + result = radv_device_init_meta_bufimage_state(device); + if (result != VK_SUCCESS) + goto fail_bufimage; + + result = radv_device_init_meta_depth_decomp_state(device); + if (result != VK_SUCCESS) + goto fail_depth_decomp; + + result = radv_device_init_meta_buffer_state(device); + if (result != VK_SUCCESS) + goto fail_buffer; + + result = radv_device_init_meta_fast_clear_flush_state(device); + if (result != VK_SUCCESS) + goto fail_fast_clear; + + result = radv_device_init_meta_resolve_compute_state(device); + if (result != VK_SUCCESS) + goto fail_resolve_compute; + return VK_SUCCESS; + +fail_resolve_compute: + radv_device_finish_meta_fast_clear_flush_state(device); +fail_fast_clear: + radv_device_finish_meta_buffer_state(device); +fail_buffer: + radv_device_finish_meta_depth_decomp_state(device); +fail_depth_decomp: + radv_device_finish_meta_bufimage_state(device); +fail_bufimage: + 
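/*
 * This error ladder is the usual reverse-order unwind: a failure in stage N
 * jumps to its label and falls through the labels beneath it, tearing down
 * stages N-1 .. 1 before finally finishing the pipeline cache that
 * radv_load_meta_pipeline() populated from disk.
 *
 * On the store side, radv_store_meta_pipeline() writes the serialized cache
 * to a mkstemp() temporary and rename()s it over the real path, so a
 * concurrent or interrupted driver instance never observes a half-written
 * cache file.
 */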
radv_device_finish_meta_blit2d_state(device); +fail_blit2d: + radv_device_finish_meta_blit_state(device); +fail_blit: + radv_device_finish_meta_resolve_state(device); +fail_resolve: + radv_device_finish_meta_clear_state(device); +fail_clear: + radv_pipeline_cache_finish(&device->meta_state.cache); + return result; +} + +void +radv_device_finish_meta(struct radv_device *device) +{ + radv_device_finish_meta_clear_state(device); + radv_device_finish_meta_resolve_state(device); + radv_device_finish_meta_blit_state(device); + radv_device_finish_meta_blit2d_state(device); + radv_device_finish_meta_bufimage_state(device); + radv_device_finish_meta_depth_decomp_state(device); + radv_device_finish_meta_buffer_state(device); + radv_device_finish_meta_fast_clear_flush_state(device); + radv_device_finish_meta_resolve_compute_state(device); + + radv_store_meta_pipeline(device); + radv_pipeline_cache_finish(&device->meta_state.cache); +} + +/* + * The most common meta operations all want to have the viewport + * reset and any scissors disabled. The rest of the dynamic state + * should have no effect. + */ +void +radv_meta_save_graphics_reset_vport_scissor(struct radv_meta_saved_state *saved_state, + struct radv_cmd_buffer *cmd_buffer) +{ + uint32_t dirty_state = (1 << VK_DYNAMIC_STATE_VIEWPORT) | (1 << VK_DYNAMIC_STATE_SCISSOR); + radv_meta_save(saved_state, cmd_buffer, dirty_state); + cmd_buffer->state.dynamic.viewport.count = 0; + cmd_buffer->state.dynamic.scissor.count = 0; + cmd_buffer->state.dirty |= dirty_state; +} diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h new file mode 100644 index 00000000000..ae63a308b2d --- /dev/null +++ b/src/amd/vulkan/radv_meta.h @@ -0,0 +1,190 @@ +/* + * Copyright © 2016 Red Hat + * based on intel anv code: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "radv_private.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RADV_META_VERTEX_BINDING_COUNT 2 + +struct radv_meta_saved_state { + struct radv_vertex_binding old_vertex_bindings[RADV_META_VERTEX_BINDING_COUNT]; + struct radv_descriptor_set *old_descriptor_set0; + struct radv_pipeline *old_pipeline; + + /** + * Bitmask of (1 << VK_DYNAMIC_STATE_*). Defines the set of saved dynamic + * state. 
+ */ + uint32_t dynamic_mask; + struct radv_dynamic_state dynamic; + + char push_constants[128]; +}; + +struct radv_meta_saved_pass_state { + struct radv_render_pass *pass; + const struct radv_subpass *subpass; + struct radv_attachment_state *attachments; + struct radv_framebuffer *framebuffer; + VkRect2D render_area; +}; + +struct radv_meta_saved_compute_state { + struct radv_descriptor_set *old_descriptor_set0; + struct radv_pipeline *old_pipeline; + + char push_constants[128]; +}; + +VkResult radv_device_init_meta_clear_state(struct radv_device *device); +void radv_device_finish_meta_clear_state(struct radv_device *device); + +VkResult radv_device_init_meta_resolve_state(struct radv_device *device); +void radv_device_finish_meta_resolve_state(struct radv_device *device); + +VkResult radv_device_init_meta_depth_decomp_state(struct radv_device *device); +void radv_device_finish_meta_depth_decomp_state(struct radv_device *device); + +VkResult radv_device_init_meta_fast_clear_flush_state(struct radv_device *device); +void radv_device_finish_meta_fast_clear_flush_state(struct radv_device *device); + +VkResult radv_device_init_meta_blit_state(struct radv_device *device); +void radv_device_finish_meta_blit_state(struct radv_device *device); + +VkResult radv_device_init_meta_blit2d_state(struct radv_device *device); +void radv_device_finish_meta_blit2d_state(struct radv_device *device); + +VkResult radv_device_init_meta_buffer_state(struct radv_device *device); +void radv_device_finish_meta_buffer_state(struct radv_device *device); + +VkResult radv_device_init_meta_resolve_compute_state(struct radv_device *device); +void radv_device_finish_meta_resolve_compute_state(struct radv_device *device); +void radv_meta_save(struct radv_meta_saved_state *state, + const struct radv_cmd_buffer *cmd_buffer, + uint32_t dynamic_mask); + +void radv_meta_restore(const struct radv_meta_saved_state *state, + struct radv_cmd_buffer *cmd_buffer); + +void radv_meta_save_pass(struct radv_meta_saved_pass_state *state, + const struct radv_cmd_buffer *cmd_buffer); + +void radv_meta_restore_pass(const struct radv_meta_saved_pass_state *state, + struct radv_cmd_buffer *cmd_buffer); + +void radv_meta_save_compute(struct radv_meta_saved_compute_state *state, + const struct radv_cmd_buffer *cmd_buffer, + unsigned push_constant_size); + +void radv_meta_restore_compute(const struct radv_meta_saved_compute_state *state, + struct radv_cmd_buffer *cmd_buffer, + unsigned push_constant_size); + +VkImageViewType radv_meta_get_view_type(const struct radv_image *image); + +uint32_t radv_meta_get_iview_layer(const struct radv_image *dest_image, + const VkImageSubresourceLayers *dest_subresource, + const VkOffset3D *dest_offset); + +struct radv_meta_blit2d_surf { + /** The size of an element in bytes. 
*/ + uint8_t bs; + VkFormat format; + + struct radv_image *image; + unsigned level; + unsigned layer; + VkImageAspectFlags aspect_mask; +}; + +struct radv_meta_blit2d_buffer { + struct radv_buffer *buffer; + uint32_t offset; + uint32_t pitch; + uint8_t bs; + VkFormat format; +}; + +struct radv_meta_blit2d_rect { + uint32_t src_x, src_y; + uint32_t dst_x, dst_y; + uint32_t width, height; +}; + +void radv_meta_begin_blit2d(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_saved_state *save); + +void radv_meta_blit2d(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *src_img, + struct radv_meta_blit2d_buffer *src_buf, + struct radv_meta_blit2d_surf *dst, + unsigned num_rects, + struct radv_meta_blit2d_rect *rects); + +void radv_meta_end_blit2d(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_saved_state *save); + + +VkResult radv_device_init_meta_bufimage_state(struct radv_device *device); +void radv_device_finish_meta_bufimage_state(struct radv_device *device); +void radv_meta_begin_bufimage(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_saved_compute_state *save); +void radv_meta_end_bufimage(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_saved_compute_state *save); + +void radv_meta_image_to_buffer(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *src, + struct radv_meta_blit2d_buffer *dst, + unsigned num_rects, + struct radv_meta_blit2d_rect *rects); + +void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageSubresourceRange *subresourceRange); +void radv_resummarize_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageSubresourceRange *subresourceRange); +void radv_fast_clear_flush_image_inplace(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image); + +void radv_meta_save_graphics_reset_vport_scissor(struct radv_meta_saved_state *saved_state, + struct radv_cmd_buffer *cmd_buffer); + +void radv_meta_resolve_compute_image(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *src_image, + VkImageLayout src_image_layout, + struct radv_image *dest_image, + VkImageLayout dest_image_layout, + uint32_t region_count, + const VkImageResolve *regions); + +#ifdef __cplusplus +} +#endif diff --git a/src/amd/vulkan/radv_meta_blit.c b/src/amd/vulkan/radv_meta_blit.c new file mode 100644 index 00000000000..497e42f8c08 --- /dev/null +++ b/src/amd/vulkan/radv_meta_blit.c @@ -0,0 +1,1270 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "radv_meta.h" +#include "nir/nir_builder.h" + +struct blit_region { + VkOffset3D src_offset; + VkExtent3D src_extent; + VkOffset3D dest_offset; + VkExtent3D dest_extent; +}; + +static nir_shader * +build_nir_vertex_shader(void) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_blit_vs"); + + nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "a_pos"); + pos_in->data.location = VERT_ATTRIB_GENERIC0; + nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "gl_Position"); + pos_out->data.location = VARYING_SLOT_POS; + nir_copy_var(&b, pos_out, pos_in); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "a_tex_pos"); + tex_pos_in->data.location = VERT_ATTRIB_GENERIC1; + nir_variable *tex_pos_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "v_tex_pos"); + tex_pos_out->data.location = VARYING_SLOT_VAR0; + tex_pos_out->data.interpolation = INTERP_MODE_SMOOTH; + nir_copy_var(&b, tex_pos_out, tex_pos_in); + + return b.shader; +} + +static nir_shader * +build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim) +{ + char shader_name[64]; + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + + sprintf(shader_name, "meta_blit_fs.%d", tex_dim); + b.shader->info.name = ralloc_strdup(b.shader, shader_name); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "v_tex_pos"); + tex_pos_in->data.location = VARYING_SLOT_VAR0; + + /* Swizzle the array index which comes in as Z coordinate into the right + * position. + */ + unsigned swz[] = { 0, (tex_dim == GLSL_SAMPLER_DIM_1D ? 2 : 1), 2 }; + nir_ssa_def *const tex_pos = + nir_swizzle(&b, nir_load_var(&b, tex_pos_in), swz, + (tex_dim == GLSL_SAMPLER_DIM_1D ? 
2 : 3), false); + + const struct glsl_type *sampler_type = + glsl_sampler_type(tex_dim, false, tex_dim != GLSL_SAMPLER_DIM_3D, + glsl_get_base_type(vec4)); + nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, + sampler_type, "s_tex"); + sampler->data.descriptor_set = 0; + sampler->data.binding = 0; + + nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1); + tex->sampler_dim = tex_dim; + tex->op = nir_texop_tex; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(tex_pos); + tex->dest_type = nir_type_float; /* TODO */ + tex->is_array = glsl_sampler_type_is_array(sampler_type); + tex->coord_components = tex_pos->num_components; + tex->texture = nir_deref_var_create(tex, sampler); + tex->sampler = nir_deref_var_create(tex, sampler); + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_builder_instr_insert(&b, &tex->instr); + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_DATA0; + nir_store_var(&b, color_out, &tex->dest.ssa, 0xf); + + return b.shader; +} + +static nir_shader * +build_nir_copy_fragment_shader_depth(enum glsl_sampler_dim tex_dim) +{ + char shader_name[64]; + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + + sprintf(shader_name, "meta_blit_depth_fs.%d", tex_dim); + b.shader->info.name = ralloc_strdup(b.shader, shader_name); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "v_tex_pos"); + tex_pos_in->data.location = VARYING_SLOT_VAR0; + + /* Swizzle the array index which comes in as Z coordinate into the right + * position. + */ + unsigned swz[] = { 0, (tex_dim == GLSL_SAMPLER_DIM_1D ? 2 : 1), 2 }; + nir_ssa_def *const tex_pos = + nir_swizzle(&b, nir_load_var(&b, tex_pos_in), swz, + (tex_dim == GLSL_SAMPLER_DIM_1D ? 
2 : 3), false); + + const struct glsl_type *sampler_type = + glsl_sampler_type(tex_dim, false, tex_dim != GLSL_SAMPLER_DIM_3D, + glsl_get_base_type(vec4)); + nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, + sampler_type, "s_tex"); + sampler->data.descriptor_set = 0; + sampler->data.binding = 0; + + nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1); + tex->sampler_dim = tex_dim; + tex->op = nir_texop_tex; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(tex_pos); + tex->dest_type = nir_type_float; /* TODO */ + tex->is_array = glsl_sampler_type_is_array(sampler_type); + tex->coord_components = tex_pos->num_components; + tex->texture = nir_deref_var_create(tex, sampler); + tex->sampler = nir_deref_var_create(tex, sampler); + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_builder_instr_insert(&b, &tex->instr); + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_DEPTH; + nir_store_var(&b, color_out, &tex->dest.ssa, 0x1); + + return b.shader; +} + +static nir_shader * +build_nir_copy_fragment_shader_stencil(enum glsl_sampler_dim tex_dim) +{ + char shader_name[64]; + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + + sprintf(shader_name, "meta_blit_stencil_fs.%d", tex_dim); + b.shader->info.name = ralloc_strdup(b.shader, shader_name); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "v_tex_pos"); + tex_pos_in->data.location = VARYING_SLOT_VAR0; + + /* Swizzle the array index which comes in as Z coordinate into the right + * position. + */ + unsigned swz[] = { 0, (tex_dim == GLSL_SAMPLER_DIM_1D ? 2 : 1), 2 }; + nir_ssa_def *const tex_pos = + nir_swizzle(&b, nir_load_var(&b, tex_pos_in), swz, + (tex_dim == GLSL_SAMPLER_DIM_1D ? 
2 : 3), false); + + const struct glsl_type *sampler_type = + glsl_sampler_type(tex_dim, false, tex_dim != GLSL_SAMPLER_DIM_3D, + glsl_get_base_type(vec4)); + nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, + sampler_type, "s_tex"); + sampler->data.descriptor_set = 0; + sampler->data.binding = 0; + + nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1); + tex->sampler_dim = tex_dim; + tex->op = nir_texop_tex; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(tex_pos); + tex->dest_type = nir_type_float; /* TODO */ + tex->is_array = glsl_sampler_type_is_array(sampler_type); + tex->coord_components = tex_pos->num_components; + tex->texture = nir_deref_var_create(tex, sampler); + tex->sampler = nir_deref_var_create(tex, sampler); + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_builder_instr_insert(&b, &tex->instr); + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_STENCIL; + nir_store_var(&b, color_out, &tex->dest.ssa, 0x1); + + return b.shader; +} + +static void +meta_emit_blit(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *src_image, + struct radv_image_view *src_iview, + VkOffset3D src_offset, + VkExtent3D src_extent, + struct radv_image *dest_image, + struct radv_image_view *dest_iview, + VkOffset3D dest_offset, + VkExtent3D dest_extent, + VkFilter blit_filter) +{ + struct radv_device *device = cmd_buffer->device; + unsigned offset = 0; + struct blit_vb_data { + float pos[2]; + float tex_coord[3]; + } vb_data[3]; + + assert(src_image->samples == dest_image->samples); + unsigned vb_size = 3 * sizeof(*vb_data); + vb_data[0] = (struct blit_vb_data) { + .pos = { + dest_offset.x, + dest_offset.y, + }, + .tex_coord = { + (float)(src_offset.x) / (float)src_iview->extent.width, + (float)(src_offset.y) / (float)src_iview->extent.height, + (float)src_offset.z / (float)src_iview->extent.depth, + }, + }; + + vb_data[1] = (struct blit_vb_data) { + .pos = { + dest_offset.x, + dest_offset.y + dest_extent.height, + }, + .tex_coord = { + (float)src_offset.x / (float)src_iview->extent.width, + (float)(src_offset.y + src_extent.height) / + (float)src_iview->extent.height, + (float)src_offset.z / (float)src_iview->extent.depth, + }, + }; + + vb_data[2] = (struct blit_vb_data) { + .pos = { + dest_offset.x + dest_extent.width, + dest_offset.y, + }, + .tex_coord = { + (float)(src_offset.x + src_extent.width) / (float)src_iview->extent.width, + (float)src_offset.y / (float)src_iview->extent.height, + (float)src_offset.z / (float)src_iview->extent.depth, + }, + }; + radv_cmd_buffer_upload_data(cmd_buffer, vb_size, 16, vb_data, &offset); + + struct radv_buffer vertex_buffer = { + .device = device, + .size = vb_size, + .bo = cmd_buffer->upload.upload_bo, + .offset = offset, + }; + + radv_CmdBindVertexBuffers(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, + (VkBuffer[]) { + radv_buffer_to_handle(&vertex_buffer) + }, + (VkDeviceSize[]) { + 0, + }); + + VkSampler sampler; + RADV_CALL(CreateSampler)(radv_device_to_handle(device), + &(VkSamplerCreateInfo) { + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = blit_filter, + .minFilter = blit_filter, + .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + }, &cmd_buffer->pool->alloc, &sampler); + + VkDescriptorSet set; + radv_temp_descriptor_set_create(cmd_buffer->device, 
cmd_buffer, + device->meta_state.blit.ds_layout, + &set); + + radv_UpdateDescriptorSets(radv_device_to_handle(device), + 1, /* writeCount */ + (VkWriteDescriptorSet[]) { + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = (VkDescriptorImageInfo[]) { + { + .sampler = sampler, + .imageView = radv_image_view_to_handle(src_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + } + } + }, 0, NULL); + + VkFramebuffer fb; + radv_CreateFramebuffer(radv_device_to_handle(device), + &(VkFramebufferCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkImageView[]) { + radv_image_view_to_handle(dest_iview), + }, + .width = dest_iview->extent.width, + .height = dest_iview->extent.height, + .layers = 1, + }, &cmd_buffer->pool->alloc, &fb); + VkPipeline pipeline; + switch (src_iview->aspect_mask) { + case VK_IMAGE_ASPECT_COLOR_BIT: { + unsigned fs_key = radv_format_meta_fs_key(dest_image->vk_format); + + RADV_CALL(CmdBeginRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = device->meta_state.blit.render_pass[fs_key], + .framebuffer = fb, + .renderArea = { + .offset = { dest_offset.x, dest_offset.y }, + .extent = { dest_extent.width, dest_extent.height }, + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, VK_SUBPASS_CONTENTS_INLINE); + switch (src_image->type) { + case VK_IMAGE_TYPE_1D: + pipeline = device->meta_state.blit.pipeline_1d_src[fs_key]; + break; + case VK_IMAGE_TYPE_2D: + pipeline = device->meta_state.blit.pipeline_2d_src[fs_key]; + break; + case VK_IMAGE_TYPE_3D: + pipeline = device->meta_state.blit.pipeline_3d_src[fs_key]; + break; + default: + unreachable(!"bad VkImageType"); + } + break; + } + case VK_IMAGE_ASPECT_DEPTH_BIT: + RADV_CALL(CmdBeginRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = device->meta_state.blit.depth_only_rp, + .framebuffer = fb, + .renderArea = { + .offset = { dest_offset.x, dest_offset.y }, + .extent = { dest_extent.width, dest_extent.height }, + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, VK_SUBPASS_CONTENTS_INLINE); + switch (src_image->type) { + case VK_IMAGE_TYPE_1D: + pipeline = device->meta_state.blit.depth_only_1d_pipeline; + break; + case VK_IMAGE_TYPE_2D: + pipeline = device->meta_state.blit.depth_only_2d_pipeline; + break; + case VK_IMAGE_TYPE_3D: + pipeline = device->meta_state.blit.depth_only_3d_pipeline; + break; + default: + unreachable(!"bad VkImageType"); + } + break; + case VK_IMAGE_ASPECT_STENCIL_BIT: + RADV_CALL(CmdBeginRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = device->meta_state.blit.stencil_only_rp, + .framebuffer = fb, + .renderArea = { + .offset = { dest_offset.x, dest_offset.y }, + .extent = { dest_extent.width, dest_extent.height }, + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, VK_SUBPASS_CONTENTS_INLINE); + switch (src_image->type) { + case VK_IMAGE_TYPE_1D: + pipeline = device->meta_state.blit.stencil_only_1d_pipeline; + break; + case VK_IMAGE_TYPE_2D: + pipeline = device->meta_state.blit.stencil_only_2d_pipeline; + break; + case VK_IMAGE_TYPE_3D: + pipeline = 
device->meta_state.blit.stencil_only_3d_pipeline; + break; + default: + unreachable(!"bad VkImageType"); + } + break; + default: + unreachable(!"bad VkImageType"); + } + + if (cmd_buffer->state.pipeline != radv_pipeline_from_handle(pipeline)) { + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + } + + radv_CmdBindDescriptorSets(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_GRAPHICS, + device->meta_state.blit.pipeline_layout, 0, 1, + &set, 0, NULL); + + RADV_CALL(CmdDraw)(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0); + + RADV_CALL(CmdEndRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer)); + + /* At the point where we emit the draw call, all data from the + * descriptor sets, etc. has been used. We are free to delete it. + */ + /* TODO: above comment is not valid for at least descriptor sets/pools, + * as we may not free them till after execution finishes. Check others. */ + + radv_temp_descriptor_set_destroy(cmd_buffer->device, set); + radv_DestroySampler(radv_device_to_handle(device), sampler, + &cmd_buffer->pool->alloc); + radv_DestroyFramebuffer(radv_device_to_handle(device), fb, + &cmd_buffer->pool->alloc); +} + +void radv_CmdBlitImage( + VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage destImage, + VkImageLayout destImageLayout, + uint32_t regionCount, + const VkImageBlit* pRegions, + VkFilter filter) + +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_image, src_image, srcImage); + RADV_FROM_HANDLE(radv_image, dest_image, destImage); + struct radv_meta_saved_state saved_state; + + /* From the Vulkan 1.0 spec: + * + * vkCmdBlitImage must not be used for multisampled source or + * destination images. Use vkCmdResolveImage for this purpose. 
+ */ + assert(src_image->samples == 1); + assert(dest_image->samples == 1); + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + for (unsigned r = 0; r < regionCount; r++) { + struct radv_image_view src_iview; + radv_image_view_init(&src_iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = srcImage, + .viewType = radv_meta_get_view_type(src_image), + .format = src_image->vk_format, + .subresourceRange = { + .aspectMask = pRegions[r].srcSubresource.aspectMask, + .baseMipLevel = pRegions[r].srcSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = pRegions[r].srcSubresource.baseArrayLayer, + .layerCount = 1 + }, + }, + cmd_buffer, VK_IMAGE_USAGE_SAMPLED_BIT); + + if (pRegions[r].dstOffsets[1].x < pRegions[r].dstOffsets[0].x || + pRegions[r].dstOffsets[1].y < pRegions[r].dstOffsets[0].y || + pRegions[r].srcOffsets[1].x < pRegions[r].srcOffsets[0].x || + pRegions[r].srcOffsets[1].y < pRegions[r].srcOffsets[0].y) + radv_finishme("FINISHME: Allow flipping in blits"); + + const VkExtent3D dest_extent = { + .width = pRegions[r].dstOffsets[1].x - pRegions[r].dstOffsets[0].x, + .height = pRegions[r].dstOffsets[1].y - pRegions[r].dstOffsets[0].y, + .depth = 1, + }; + + const VkExtent3D src_extent = { + .width = pRegions[r].srcOffsets[1].x - pRegions[r].srcOffsets[0].x, + .height = pRegions[r].srcOffsets[1].y - pRegions[r].srcOffsets[0].y, + .depth = pRegions[r].srcOffsets[1].z - pRegions[r].srcOffsets[0].z, + }; + + + if (pRegions[r].srcSubresource.layerCount > 1) + radv_finishme("FINISHME: copy multiple array layers"); + + struct radv_image_view dest_iview; + unsigned usage; + if (pRegions[r].dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT) + usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + else + usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; + + for (unsigned i = pRegions[r].dstOffsets[0].z; i < pRegions[r].dstOffsets[1].z; i++) { + + const VkOffset3D dest_offset = { + .x = pRegions[r].dstOffsets[0].x, + .y = pRegions[r].dstOffsets[0].y, + .z = i, + }; + VkOffset3D src_offset = { + .x = pRegions[r].srcOffsets[0].x, + .y = pRegions[r].srcOffsets[0].y, + .z = i, + }; + const uint32_t dest_array_slice = + radv_meta_get_iview_layer(dest_image, &pRegions[r].dstSubresource, + &dest_offset); + + radv_image_view_init(&dest_iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = destImage, + .viewType = radv_meta_get_view_type(dest_image), + .format = dest_image->vk_format, + .subresourceRange = { + .aspectMask = pRegions[r].dstSubresource.aspectMask, + .baseMipLevel = pRegions[r].dstSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = dest_array_slice, + .layerCount = 1 + }, + }, + cmd_buffer, usage); + meta_emit_blit(cmd_buffer, + src_image, &src_iview, + src_offset, src_extent, + dest_image, &dest_iview, + dest_offset, dest_extent, + filter); + } + } + + radv_meta_restore(&saved_state, cmd_buffer); +} + +void +radv_device_finish_meta_blit_state(struct radv_device *device) +{ + for (unsigned i = 0; i < NUM_META_FS_KEYS; ++i) { + if (device->meta_state.blit.render_pass[i]) + radv_DestroyRenderPass(radv_device_to_handle(device), + device->meta_state.blit.render_pass[i], + &device->meta_state.alloc); + if (device->meta_state.blit.pipeline_1d_src[i]) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.pipeline_1d_src[i], + &device->meta_state.alloc); + if (device->meta_state.blit.pipeline_2d_src[i]) + 
radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.pipeline_2d_src[i], + &device->meta_state.alloc); + if (device->meta_state.blit.pipeline_3d_src[i]) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.pipeline_3d_src[i], + &device->meta_state.alloc); + } + + if (device->meta_state.blit.depth_only_rp) + radv_DestroyRenderPass(radv_device_to_handle(device), + device->meta_state.blit.depth_only_rp, + &device->meta_state.alloc); + if (device->meta_state.blit.depth_only_1d_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.depth_only_1d_pipeline, + &device->meta_state.alloc); + if (device->meta_state.blit.depth_only_2d_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.depth_only_2d_pipeline, + &device->meta_state.alloc); + if (device->meta_state.blit.depth_only_3d_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.depth_only_3d_pipeline, + &device->meta_state.alloc); + if (device->meta_state.blit.stencil_only_rp) + radv_DestroyRenderPass(radv_device_to_handle(device), + device->meta_state.blit.stencil_only_rp, + &device->meta_state.alloc); + if (device->meta_state.blit.stencil_only_1d_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.stencil_only_1d_pipeline, + &device->meta_state.alloc); + if (device->meta_state.blit.stencil_only_2d_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.stencil_only_2d_pipeline, + &device->meta_state.alloc); + if (device->meta_state.blit.stencil_only_3d_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit.stencil_only_3d_pipeline, + &device->meta_state.alloc); + if (device->meta_state.blit.pipeline_layout) + radv_DestroyPipelineLayout(radv_device_to_handle(device), + device->meta_state.blit.pipeline_layout, + &device->meta_state.alloc); + if (device->meta_state.blit.ds_layout) + radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), + device->meta_state.blit.ds_layout, + &device->meta_state.alloc); +} + +static VkFormat pipeline_formats[] = { + VK_FORMAT_R8G8B8A8_UNORM, + VK_FORMAT_R8G8B8A8_UINT, + VK_FORMAT_R8G8B8A8_SINT, + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SINT, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R32G32B32A32_SFLOAT +}; + +static VkResult +radv_device_init_meta_blit_color(struct radv_device *device, + struct radv_shader_module *vs) +{ + struct radv_shader_module fs_1d = {0}, fs_2d = {0}, fs_3d = {0}; + VkResult result; + + fs_1d.nir = build_nir_copy_fragment_shader(GLSL_SAMPLER_DIM_1D); + fs_2d.nir = build_nir_copy_fragment_shader(GLSL_SAMPLER_DIM_2D); + fs_3d.nir = build_nir_copy_fragment_shader(GLSL_SAMPLER_DIM_3D); + + for (unsigned i = 0; i < ARRAY_SIZE(pipeline_formats); ++i) { + unsigned key = radv_format_meta_fs_key(pipeline_formats[i]); + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = pipeline_formats[i], + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = 
VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = VK_ATTACHMENT_UNUSED, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 }, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, &device->meta_state.blit.render_pass[key]); + if (result != VK_SUCCESS) + goto fail; + + VkPipelineVertexInputStateCreateInfo vi_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = 5 * sizeof(float), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 2, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = 0 + }, + { + /* Texture Coordinate */ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32B32_SFLOAT, + .offset = 8 + } + } + }; + + VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = radv_shader_module_to_handle(vs), + .pName = "main", + .pSpecializationInfo = NULL + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = VK_NULL_HANDLE, /* TEMPLATE VALUE! FILL ME IN! */ + .pName = "main", + .pSpecializationInfo = NULL + }, + }; + + const VkGraphicsPipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = ARRAY_SIZE(pipeline_shader_stages), + .pStages = pipeline_shader_stages, + .pVertexInputState = &vi_create_info, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = (VkSampleMask[]) { UINT32_MAX }, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkPipelineColorBlendAttachmentState []) { + { .colorWriteMask = + VK_COLOR_COMPONENT_A_BIT | + VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT }, + } + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]) { + 
VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + }, + }, + .flags = 0, + .layout = device->meta_state.blit.pipeline_layout, + .renderPass = device->meta_state.blit.render_pass[key], + .subpass = 0, + }; + + const struct radv_graphics_pipeline_create_info radv_pipeline_info = { + .use_rectlist = true + }; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_1d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.pipeline_1d_src[key]); + if (result != VK_SUCCESS) + goto fail; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_2d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.pipeline_2d_src[key]); + if (result != VK_SUCCESS) + goto fail; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_3d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.pipeline_3d_src[key]); + if (result != VK_SUCCESS) + goto fail; + + } + + result = VK_SUCCESS; +fail: + ralloc_free(fs_1d.nir); + ralloc_free(fs_2d.nir); + ralloc_free(fs_3d.nir); + return result; +} + +static VkResult +radv_device_init_meta_blit_depth(struct radv_device *device, + struct radv_shader_module *vs) +{ + struct radv_shader_module fs_1d = {0}, fs_2d = {0}, fs_3d = {0}; + VkResult result; + + fs_1d.nir = build_nir_copy_fragment_shader_depth(GLSL_SAMPLER_DIM_1D); + fs_2d.nir = build_nir_copy_fragment_shader_depth(GLSL_SAMPLER_DIM_2D); + fs_3d.nir = build_nir_copy_fragment_shader_depth(GLSL_SAMPLER_DIM_3D); + + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = 0, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 0, + .pColorAttachments = NULL, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 }, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, &device->meta_state.blit.depth_only_rp); + if (result != VK_SUCCESS) + goto fail; + + VkPipelineVertexInputStateCreateInfo vi_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = 5 * sizeof(float), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 2, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = 0 + }, + { + /* 
Texture Coordinate */ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32B32_SFLOAT, + .offset = 8 + } + } + }; + + VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = radv_shader_module_to_handle(vs), + .pName = "main", + .pSpecializationInfo = NULL + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = VK_NULL_HANDLE, /* TEMPLATE VALUE! FILL ME IN! */ + .pName = "main", + .pSpecializationInfo = NULL + }, + }; + + const VkGraphicsPipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = ARRAY_SIZE(pipeline_shader_stages), + .pStages = pipeline_shader_stages, + .pVertexInputState = &vi_create_info, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = (VkSampleMask[]) { UINT32_MAX }, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 0, + .pAttachments = NULL, + }, + .pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = true, + .depthWriteEnable = true, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 7, + .pDynamicStates = (VkDynamicState[]) { + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_DEPTH_BOUNDS, + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, + VK_DYNAMIC_STATE_STENCIL_REFERENCE, + }, + }, + .flags = 0, + .layout = device->meta_state.blit.pipeline_layout, + .renderPass = device->meta_state.blit.depth_only_rp, + .subpass = 0, + }; + + const struct radv_graphics_pipeline_create_info radv_pipeline_info = { + .use_rectlist = true + }; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_1d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.depth_only_1d_pipeline); + if (result != VK_SUCCESS) + goto fail; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_2d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, 
&radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.depth_only_2d_pipeline); + if (result != VK_SUCCESS) + goto fail; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_3d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.depth_only_3d_pipeline); + if (result != VK_SUCCESS) + goto fail; + +fail: + ralloc_free(fs_1d.nir); + ralloc_free(fs_2d.nir); + ralloc_free(fs_3d.nir); + return result; +} + +static VkResult +radv_device_init_meta_blit_stencil(struct radv_device *device, + struct radv_shader_module *vs) +{ + struct radv_shader_module fs_1d = {0}, fs_2d = {0}, fs_3d = {0}; + VkResult result; + + fs_1d.nir = build_nir_copy_fragment_shader_stencil(GLSL_SAMPLER_DIM_1D); + fs_2d.nir = build_nir_copy_fragment_shader_stencil(GLSL_SAMPLER_DIM_2D); + fs_3d.nir = build_nir_copy_fragment_shader_stencil(GLSL_SAMPLER_DIM_3D); + + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = 0, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 0, + .pColorAttachments = NULL, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 }, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, &device->meta_state.blit.stencil_only_rp); + if (result != VK_SUCCESS) + goto fail; + + VkPipelineVertexInputStateCreateInfo vi_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = 5 * sizeof(float), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 2, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = 0 + }, + { + /* Texture Coordinate */ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32B32_SFLOAT, + .offset = 8 + } + } + }; + + VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = radv_shader_module_to_handle(vs), + .pName = "main", + .pSpecializationInfo = NULL + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = VK_NULL_HANDLE, /* TEMPLATE VALUE! FILL ME IN! 
*/ + .pName = "main", + .pSpecializationInfo = NULL + }, + }; + + const VkGraphicsPipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = ARRAY_SIZE(pipeline_shader_stages), + .pStages = pipeline_shader_stages, + .pVertexInputState = &vi_create_info, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = (VkSampleMask[]) { UINT32_MAX }, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 0, + .pAttachments = NULL, + }, + .pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = false, + .depthWriteEnable = false, + .stencilTestEnable = true, + .front = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .back = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }, + + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 4, + .pDynamicStates = (VkDynamicState[]) { + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_DEPTH_BOUNDS, + }, + }, + .flags = 0, + .layout = device->meta_state.blit.pipeline_layout, + .renderPass = device->meta_state.blit.stencil_only_rp, + .subpass = 0, + }; + + const struct radv_graphics_pipeline_create_info radv_pipeline_info = { + .use_rectlist = true + }; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_1d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.stencil_only_1d_pipeline); + if (result != VK_SUCCESS) + goto fail; + + pipeline_shader_stages[1].module = radv_shader_module_to_handle(&fs_2d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.stencil_only_2d_pipeline); + if (result != VK_SUCCESS) + goto fail; + + pipeline_shader_stages[1].module = 
radv_shader_module_to_handle(&fs_3d); + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, &device->meta_state.blit.stencil_only_3d_pipeline); + if (result != VK_SUCCESS) + goto fail; + +fail: + ralloc_free(fs_1d.nir); + ralloc_free(fs_2d.nir); + ralloc_free(fs_3d.nir); + return result; +} + +VkResult +radv_device_init_meta_blit_state(struct radv_device *device) +{ + VkResult result; + struct radv_shader_module vs = {0}; + zero(device->meta_state.blit); + + VkDescriptorSetLayoutCreateInfo ds_layout_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = 1, + .pBindings = (VkDescriptorSetLayoutBinding[]) { + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = NULL + }, + } + }; + result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), + &ds_layout_info, + &device->meta_state.alloc, + &device->meta_state.blit.ds_layout); + if (result != VK_SUCCESS) + goto fail; + + result = radv_CreatePipelineLayout(radv_device_to_handle(device), + &(VkPipelineLayoutCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->meta_state.blit.ds_layout, + }, + &device->meta_state.alloc, &device->meta_state.blit.pipeline_layout); + if (result != VK_SUCCESS) + goto fail; + + vs.nir = build_nir_vertex_shader(); + + result = radv_device_init_meta_blit_color(device, &vs); + if (result != VK_SUCCESS) + goto fail; + + result = radv_device_init_meta_blit_depth(device, &vs); + if (result != VK_SUCCESS) + goto fail; + + result = radv_device_init_meta_blit_stencil(device, &vs); + if (result != VK_SUCCESS) + goto fail; + return VK_SUCCESS; + +fail: + ralloc_free(vs.nir); + radv_device_finish_meta_blit_state(device); + return result; +} diff --git a/src/amd/vulkan/radv_meta_blit2d.c b/src/amd/vulkan/radv_meta_blit2d.c new file mode 100644 index 00000000000..ba426e1e9a7 --- /dev/null +++ b/src/amd/vulkan/radv_meta_blit2d.c @@ -0,0 +1,1282 @@ +/* + * Copyright © 2016 Red Hat + * + * based on anv driver: + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "radv_meta.h" +#include "nir/nir_builder.h" + +enum blit2d_dst_type { + /* We can bind this destination as a "normal" render target and render + * to it just like you would anywhere else. + */ + BLIT2D_DST_TYPE_NORMAL, + + /* The destination has a 3-channel RGB format. Since we can't render to + * non-power-of-two textures, we have to bind it as a red texture and + * select the correct component for the given red pixel in the shader. + */ + BLIT2D_DST_TYPE_RGB, + + BLIT2D_NUM_DST_TYPES, +}; + + +enum blit2d_src_type { + BLIT2D_SRC_TYPE_IMAGE, + BLIT2D_SRC_TYPE_BUFFER, + BLIT2D_NUM_SRC_TYPES, +}; + +static void +create_iview(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *surf, + VkImageUsageFlags usage, + struct radv_image_view *iview, VkFormat depth_format) +{ + VkFormat format; + + if (depth_format) + format = depth_format; + else + format = surf->format; + + radv_image_view_init(iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(surf->image), + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = format, + .subresourceRange = { + .aspectMask = surf->aspect_mask, + .baseMipLevel = surf->level, + .levelCount = 1, + .baseArrayLayer = surf->layer, + .layerCount = 1 + }, + }, cmd_buffer, usage); +} + +static void +create_bview(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_buffer *src, + struct radv_buffer_view *bview, VkFormat depth_format) +{ + VkFormat format; + + if (depth_format) + format = depth_format; + else + format = src->format; + radv_buffer_view_init(bview, cmd_buffer->device, + &(VkBufferViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .flags = 0, + .buffer = radv_buffer_to_handle(src->buffer), + .format = format, + .offset = src->offset, + .range = VK_WHOLE_SIZE, + }, cmd_buffer); + +} + +struct blit2d_src_temps { + struct radv_image_view iview; + + VkDescriptorSet set; + struct radv_buffer_view bview; +}; + +static void +blit2d_bind_src(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *src_img, + struct radv_meta_blit2d_buffer *src_buf, + struct radv_meta_blit2d_rect *rect, + struct blit2d_src_temps *tmp, + enum blit2d_src_type src_type, VkFormat depth_format) +{ + struct radv_device *device = cmd_buffer->device; + VkDevice vk_device = radv_device_to_handle(cmd_buffer->device); + + if (src_type == BLIT2D_SRC_TYPE_BUFFER) { + create_bview(cmd_buffer, src_buf, &tmp->bview, depth_format); + + radv_temp_descriptor_set_create(cmd_buffer->device, cmd_buffer, + device->meta_state.blit2d.ds_layouts[src_type], + &tmp->set); + + radv_UpdateDescriptorSets(vk_device, + 1, /* writeCount */ + (VkWriteDescriptorSet[]) { + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = tmp->set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .pTexelBufferView = (VkBufferView[]) { radv_buffer_view_to_handle(&tmp->bview) } + } + }, 0, NULL); + + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + device->meta_state.blit2d.p_layouts[src_type], + VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4, + &src_buf->pitch); + } else { + create_iview(cmd_buffer, src_img, VK_IMAGE_USAGE_SAMPLED_BIT, &tmp->iview, + depth_format); + + radv_temp_descriptor_set_create(cmd_buffer->device, cmd_buffer, + device->meta_state.blit2d.ds_layouts[src_type], + &tmp->set); + + radv_UpdateDescriptorSets(vk_device, + 1, /* writeCount */ + (VkWriteDescriptorSet[]) { + 
{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = tmp->set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .pImageInfo = (VkDescriptorImageInfo[]) { + { + .sampler = NULL, + .imageView = radv_image_view_to_handle(&tmp->iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + } + } + }, 0, NULL); + + } + + radv_CmdBindDescriptorSets(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_GRAPHICS, + device->meta_state.blit2d.p_layouts[src_type], 0, 1, + &tmp->set, 0, NULL); +} + +static void +blit2d_unbind_src(struct radv_cmd_buffer *cmd_buffer, + struct blit2d_src_temps *tmp, + enum blit2d_src_type src_type) +{ + radv_temp_descriptor_set_destroy(cmd_buffer->device, tmp->set); +} + +struct blit2d_dst_temps { + VkImage image; + struct radv_image_view iview; + VkFramebuffer fb; +}; + +static void +blit2d_bind_dst(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *dst, + uint32_t width, + uint32_t height, + VkFormat depth_format, + struct blit2d_dst_temps *tmp) +{ + VkImageUsageFlagBits bits; + + if (dst->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) + bits = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + else + bits = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; + + create_iview(cmd_buffer, dst, bits, + &tmp->iview, depth_format); + + radv_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device), + &(VkFramebufferCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkImageView[]) { + radv_image_view_to_handle(&tmp->iview), + }, + .width = width, + .height = height, + .layers = 1 + }, &cmd_buffer->pool->alloc, &tmp->fb); +} + +static void +blit2d_unbind_dst(struct radv_cmd_buffer *cmd_buffer, + struct blit2d_dst_temps *tmp) +{ + VkDevice vk_device = radv_device_to_handle(cmd_buffer->device); + radv_DestroyFramebuffer(vk_device, tmp->fb, &cmd_buffer->pool->alloc); +} + +static void +bind_pipeline(struct radv_cmd_buffer *cmd_buffer, + enum blit2d_src_type src_type, unsigned fs_key) +{ + VkPipeline pipeline = + cmd_buffer->device->meta_state.blit2d.pipelines[src_type][fs_key]; + + if (cmd_buffer->state.pipeline != radv_pipeline_from_handle(pipeline)) { + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + } +} + +static void +bind_depth_pipeline(struct radv_cmd_buffer *cmd_buffer, + enum blit2d_src_type src_type) +{ + VkPipeline pipeline = + cmd_buffer->device->meta_state.blit2d.depth_only_pipeline[src_type]; + + if (cmd_buffer->state.pipeline != radv_pipeline_from_handle(pipeline)) { + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + } +} + +static void +bind_stencil_pipeline(struct radv_cmd_buffer *cmd_buffer, + enum blit2d_src_type src_type) +{ + VkPipeline pipeline = + cmd_buffer->device->meta_state.blit2d.stencil_only_pipeline[src_type]; + + if (cmd_buffer->state.pipeline != radv_pipeline_from_handle(pipeline)) { + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + } +} + +static void +radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *src_img, + struct radv_meta_blit2d_buffer *src_buf, + struct radv_meta_blit2d_surf *dst, + unsigned num_rects, + struct radv_meta_blit2d_rect *rects, enum blit2d_src_type src_type) +{ + struct radv_device *device = cmd_buffer->device; + + for (unsigned r = 0; r < num_rects; ++r) { + 
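/* Per-rect path: create temporary source/destination views, upload a 3-vertex rectlist covering the destination rect, pick the render pass and pipeline for the destination aspect, draw, then destroy the temporaries. */ +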
VkFormat depth_format = 0; + if (dst->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) + depth_format = dst->image->vk_format; + struct blit2d_src_temps src_temps; + blit2d_bind_src(cmd_buffer, src_img, src_buf, &rects[r], &src_temps, src_type, depth_format); + + uint32_t offset = 0; + struct blit2d_dst_temps dst_temps; + blit2d_bind_dst(cmd_buffer, dst, rects[r].dst_x + rects[r].width, + rects[r].dst_y + rects[r].height, depth_format, &dst_temps); + + struct blit_vb_data { + float pos[2]; + float tex_coord[2]; + } vb_data[3]; + + unsigned vb_size = 3 * sizeof(*vb_data); + + vb_data[0] = (struct blit_vb_data) { + .pos = { + rects[r].dst_x, + rects[r].dst_y, + }, + .tex_coord = { + rects[r].src_x, + rects[r].src_y, + }, + }; + + vb_data[1] = (struct blit_vb_data) { + .pos = { + rects[r].dst_x, + rects[r].dst_y + rects[r].height, + }, + .tex_coord = { + rects[r].src_x, + rects[r].src_y + rects[r].height, + }, + }; + + vb_data[2] = (struct blit_vb_data) { + .pos = { + rects[r].dst_x + rects[r].width, + rects[r].dst_y, + }, + .tex_coord = { + rects[r].src_x + rects[r].width, + rects[r].src_y, + }, + }; + + + radv_cmd_buffer_upload_data(cmd_buffer, vb_size, 16, vb_data, &offset); + + struct radv_buffer vertex_buffer = { + .device = device, + .size = vb_size, + .bo = cmd_buffer->upload.upload_bo, + .offset = offset, + }; + + radv_CmdBindVertexBuffers(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, + (VkBuffer[]) { + radv_buffer_to_handle(&vertex_buffer), + }, + (VkDeviceSize[]) { + 0, + }); + + + if (dst->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) { + unsigned fs_key = radv_format_meta_fs_key(dst_temps.iview.vk_format); + + RADV_CALL(CmdBeginRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = device->meta_state.blit2d.render_passes[fs_key], + .framebuffer = dst_temps.fb, + .renderArea = { + .offset = { rects[r].dst_x, rects[r].dst_y, }, + .extent = { rects[r].width, rects[r].height }, + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, VK_SUBPASS_CONTENTS_INLINE); + + + bind_pipeline(cmd_buffer, src_type, fs_key); + } else if (dst->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) { + RADV_CALL(CmdBeginRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = device->meta_state.blit2d.depth_only_rp, + .framebuffer = dst_temps.fb, + .renderArea = { + .offset = { rects[r].dst_x, rects[r].dst_y, }, + .extent = { rects[r].width, rects[r].height }, + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, VK_SUBPASS_CONTENTS_INLINE); + + + bind_depth_pipeline(cmd_buffer, src_type); + + } else if (dst->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) { + RADV_CALL(CmdBeginRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = device->meta_state.blit2d.stencil_only_rp, + .framebuffer = dst_temps.fb, + .renderArea = { + .offset = { rects[r].dst_x, rects[r].dst_y, }, + .extent = { rects[r].width, rects[r].height }, + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, VK_SUBPASS_CONTENTS_INLINE); + + + bind_stencil_pipeline(cmd_buffer, src_type); + } + + RADV_CALL(CmdDraw)(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0); + RADV_CALL(CmdEndRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer)); + + /* At the point where we emit the draw call, all data from the + * descriptor sets, etc. has been used. We are free to delete it. 
+ */ + blit2d_unbind_src(cmd_buffer, &src_temps, src_type); + blit2d_unbind_dst(cmd_buffer, &dst_temps); + } +} + +void +radv_meta_blit2d(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *src_img, + struct radv_meta_blit2d_buffer *src_buf, + struct radv_meta_blit2d_surf *dst, + unsigned num_rects, + struct radv_meta_blit2d_rect *rects) +{ + enum blit2d_src_type src_type = src_buf ? BLIT2D_SRC_TYPE_BUFFER : + BLIT2D_SRC_TYPE_IMAGE; + radv_meta_blit2d_normal_dst(cmd_buffer, src_img, src_buf, dst, + num_rects, rects, src_type); +} + +static nir_shader * +build_nir_vertex_shader(void) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_blit_vs"); + + nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "a_pos"); + pos_in->data.location = VERT_ATTRIB_GENERIC0; + nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "gl_Position"); + pos_out->data.location = VARYING_SLOT_POS; + nir_copy_var(&b, pos_out, pos_in); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec2, "a_tex_pos"); + tex_pos_in->data.location = VERT_ATTRIB_GENERIC1; + nir_variable *tex_pos_out = nir_variable_create(b.shader, nir_var_shader_out, + vec2, "v_tex_pos"); + tex_pos_out->data.location = VARYING_SLOT_VAR0; + tex_pos_out->data.interpolation = INTERP_MODE_SMOOTH; + nir_copy_var(&b, tex_pos_out, tex_pos_in); + + return b.shader; +} + +typedef nir_ssa_def* (*texel_fetch_build_func)(struct nir_builder *, + struct radv_device *, + nir_ssa_def *); + +static nir_ssa_def * +build_nir_texel_fetch(struct nir_builder *b, struct radv_device *device, + nir_ssa_def *tex_pos) +{ + const struct glsl_type *sampler_type = + glsl_sampler_type(GLSL_SAMPLER_DIM_2D, false, false, GLSL_TYPE_UINT); + nir_variable *sampler = nir_variable_create(b->shader, nir_var_uniform, + sampler_type, "s_tex"); + sampler->data.descriptor_set = 0; + sampler->data.binding = 0; + + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2); + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + tex->op = nir_texop_txf; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(tex_pos); + tex->src[1].src_type = nir_tex_src_lod; + tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); + tex->dest_type = nir_type_uint; + tex->is_array = false; + tex->coord_components = 2; + tex->texture = nir_deref_var_create(tex, sampler); + tex->sampler = NULL; + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + + +static nir_ssa_def * +build_nir_buffer_fetch(struct nir_builder *b, struct radv_device *device, + nir_ssa_def *tex_pos) +{ + const struct glsl_type *sampler_type = + glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT); + nir_variable *sampler = nir_variable_create(b->shader, nir_var_uniform, + sampler_type, "s_tex"); + sampler->data.descriptor_set = 0; + sampler->data.binding = 0; + + nir_intrinsic_instr *width = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant); + width->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); + width->num_components = 1; + nir_ssa_dest_init(&width->instr, &width->dest, 1, 32, "width"); + nir_builder_instr_insert(b, &width->instr); + + nir_ssa_def *pos_x = nir_channel(b, tex_pos, 0); + nir_ssa_def 
*pos_y = nir_channel(b, tex_pos, 1); + pos_y = nir_imul(b, pos_y, &width->dest.ssa); + pos_x = nir_iadd(b, pos_x, pos_y); + //pos_x = nir_iadd(b, pos_x, nir_imm_int(b, 100000)); + + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); + tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; + tex->op = nir_texop_txf; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(pos_x); + tex->dest_type = nir_type_uint; + tex->is_array = false; + tex->coord_components = 1; + tex->texture = nir_deref_var_create(tex, sampler); + tex->sampler = NULL; + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + +static const VkPipelineVertexInputStateCreateInfo normal_vi_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = 4 * sizeof(float), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 2, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = 0 + }, + { + /* Texture Coordinate */ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = 8 + }, + }, +}; + +static nir_shader * +build_nir_copy_fragment_shader(struct radv_device *device, + texel_fetch_build_func txf_func, const char* name) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + b.shader->info.name = ralloc_strdup(b.shader, name); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec2, "v_tex_pos"); + tex_pos_in->data.location = VARYING_SLOT_VAR0; + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_DATA0; + + nir_ssa_def *pos_int = nir_f2i(&b, nir_load_var(&b, tex_pos_in)); + unsigned swiz[4] = { 0, 1 }; + nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false); + + nir_ssa_def *color = txf_func(&b, device, tex_pos); + nir_store_var(&b, color_out, color, 0xf); + + return b.shader; +} + +static nir_shader * +build_nir_copy_fragment_shader_depth(struct radv_device *device, + texel_fetch_build_func txf_func, const char* name) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + b.shader->info.name = ralloc_strdup(b.shader, name); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec2, "v_tex_pos"); + tex_pos_in->data.location = VARYING_SLOT_VAR0; + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_DEPTH; + + nir_ssa_def *pos_int = nir_f2i(&b, nir_load_var(&b, tex_pos_in)); + unsigned swiz[4] = { 0, 1 }; + nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false); + + nir_ssa_def *color = txf_func(&b, device, tex_pos); + nir_store_var(&b, color_out, color, 0x1); + + return b.shader; +} + +static nir_shader * +build_nir_copy_fragment_shader_stencil(struct radv_device *device, + texel_fetch_build_func txf_func, const char* 
name) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2); + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + b.shader->info.name = ralloc_strdup(b.shader, name); + + nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec2, "v_tex_pos"); + tex_pos_in->data.location = VARYING_SLOT_VAR0; + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_STENCIL; + + nir_ssa_def *pos_int = nir_f2i(&b, nir_load_var(&b, tex_pos_in)); + unsigned swiz[4] = { 0, 1 }; + nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false); + + nir_ssa_def *color = txf_func(&b, device, tex_pos); + nir_store_var(&b, color_out, color, 0x1); + + return b.shader; +} + +void +radv_device_finish_meta_blit2d_state(struct radv_device *device) +{ + for(unsigned j = 0; j < NUM_META_FS_KEYS; ++j) { + if (device->meta_state.blit2d.render_passes[j]) { + radv_DestroyRenderPass(radv_device_to_handle(device), + device->meta_state.blit2d.render_passes[j], + &device->meta_state.alloc); + } + } + + radv_DestroyRenderPass(radv_device_to_handle(device), + device->meta_state.blit2d.depth_only_rp, + &device->meta_state.alloc); + radv_DestroyRenderPass(radv_device_to_handle(device), + device->meta_state.blit2d.stencil_only_rp, + &device->meta_state.alloc); + + for (unsigned src = 0; src < BLIT2D_NUM_SRC_TYPES; src++) { + if (device->meta_state.blit2d.p_layouts[src]) { + radv_DestroyPipelineLayout(radv_device_to_handle(device), + device->meta_state.blit2d.p_layouts[src], + &device->meta_state.alloc); + } + + if (device->meta_state.blit2d.ds_layouts[src]) { + radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), + device->meta_state.blit2d.ds_layouts[src], + &device->meta_state.alloc); + } + + for (unsigned j = 0; j < NUM_META_FS_KEYS; ++j) { + if (device->meta_state.blit2d.pipelines[src][j]) { + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit2d.pipelines[src][j], + &device->meta_state.alloc); + } + } + + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit2d.depth_only_pipeline[src], + &device->meta_state.alloc); + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.blit2d.stencil_only_pipeline[src], + &device->meta_state.alloc); + } +} + +static VkResult +blit2d_init_color_pipeline(struct radv_device *device, + enum blit2d_src_type src_type, + VkFormat format) +{ + VkResult result; + unsigned fs_key = radv_format_meta_fs_key(format); + const char *name; + + texel_fetch_build_func src_func; + switch(src_type) { + case BLIT2D_SRC_TYPE_IMAGE: + src_func = build_nir_texel_fetch; + name = "meta_blit2d_image_fs"; + break; + case BLIT2D_SRC_TYPE_BUFFER: + src_func = build_nir_buffer_fetch; + name = "meta_blit2d_buffer_fs"; + break; + default: + unreachable("unknown blit src type\n"); + break; + } + + const VkPipelineVertexInputStateCreateInfo *vi_create_info; + struct radv_shader_module fs = { .nir = NULL }; + + + fs.nir = build_nir_copy_fragment_shader(device, src_func, name); + vi_create_info = &normal_vi_create_info; + + struct radv_shader_module vs = { + .nir = build_nir_vertex_shader(), + }; + + VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = radv_shader_module_to_handle(&vs), + .pName = "main", + 
.pSpecializationInfo = NULL + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = radv_shader_module_to_handle(&fs), + .pName = "main", + .pSpecializationInfo = NULL + }, + }; + + if (!device->meta_state.blit2d.render_passes[fs_key]) { + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = format, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = VK_ATTACHMENT_UNUSED, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 }, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, &device->meta_state.blit2d.render_passes[fs_key]); + } + + const VkGraphicsPipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = ARRAY_SIZE(pipeline_shader_stages), + .pStages = pipeline_shader_stages, + .pVertexInputState = vi_create_info, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = (VkSampleMask[]) { UINT32_MAX }, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkPipelineColorBlendAttachmentState []) { + { .colorWriteMask = + VK_COLOR_COMPONENT_A_BIT | + VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT }, + } + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 7, + .pDynamicStates = (VkDynamicState[]) { + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_DEPTH_BOUNDS, + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, + VK_DYNAMIC_STATE_STENCIL_REFERENCE, + }, + }, + .flags = 0, + .layout = device->meta_state.blit2d.p_layouts[src_type], + .renderPass = device->meta_state.blit2d.render_passes[fs_key], + .subpass = 0, + }; + + const 
struct radv_graphics_pipeline_create_info radv_pipeline_info = { + .use_rectlist = true + }; + + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, + &device->meta_state.blit2d.pipelines[src_type][fs_key]); + + + ralloc_free(vs.nir); + ralloc_free(fs.nir); + + return result; +} + +static VkResult +blit2d_init_depth_only_pipeline(struct radv_device *device, + enum blit2d_src_type src_type) +{ + VkResult result; + const char *name; + + texel_fetch_build_func src_func; + switch(src_type) { + case BLIT2D_SRC_TYPE_IMAGE: + src_func = build_nir_texel_fetch; + name = "meta_blit2d_depth_image_fs"; + break; + case BLIT2D_SRC_TYPE_BUFFER: + src_func = build_nir_buffer_fetch; + name = "meta_blit2d_depth_buffer_fs"; + break; + default: + unreachable("unknown blit src type\n"); + break; + } + + const VkPipelineVertexInputStateCreateInfo *vi_create_info; + struct radv_shader_module fs = { .nir = NULL }; + + fs.nir = build_nir_copy_fragment_shader_depth(device, src_func, name); + vi_create_info = &normal_vi_create_info; + + struct radv_shader_module vs = { + .nir = build_nir_vertex_shader(), + }; + + VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = radv_shader_module_to_handle(&vs), + .pName = "main", + .pSpecializationInfo = NULL + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = radv_shader_module_to_handle(&fs), + .pName = "main", + .pSpecializationInfo = NULL + }, + }; + + if (!device->meta_state.blit2d.depth_only_rp) { + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = 0, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 0, + .pColorAttachments = NULL, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 }, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, &device->meta_state.blit2d.depth_only_rp); + } + + const VkGraphicsPipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = ARRAY_SIZE(pipeline_shader_stages), + .pStages = pipeline_shader_stages, + .pVertexInputState = vi_create_info, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + 
.rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = (VkSampleMask[]) { UINT32_MAX }, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 0, + .pAttachments = NULL, + }, + .pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = true, + .depthWriteEnable = true, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 7, + .pDynamicStates = (VkDynamicState[]) { + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_DEPTH_BOUNDS, + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, + VK_DYNAMIC_STATE_STENCIL_REFERENCE, + }, + }, + .flags = 0, + .layout = device->meta_state.blit2d.p_layouts[src_type], + .renderPass = device->meta_state.blit2d.depth_only_rp, + .subpass = 0, + }; + + const struct radv_graphics_pipeline_create_info radv_pipeline_info = { + .use_rectlist = true + }; + + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + &device->meta_state.alloc, + &device->meta_state.blit2d.depth_only_pipeline[src_type]); + + + ralloc_free(vs.nir); + ralloc_free(fs.nir); + + return result; +} + +static VkResult +blit2d_init_stencil_only_pipeline(struct radv_device *device, + enum blit2d_src_type src_type) +{ + VkResult result; + const char *name; + + texel_fetch_build_func src_func; + switch(src_type) { + case BLIT2D_SRC_TYPE_IMAGE: + src_func = build_nir_texel_fetch; + name = "meta_blit2d_stencil_image_fs"; + break; + case BLIT2D_SRC_TYPE_BUFFER: + src_func = build_nir_buffer_fetch; + name = "meta_blit2d_stencil_buffer_fs"; + break; + default: + unreachable("unknown blit src type\n"); + break; + } + + const VkPipelineVertexInputStateCreateInfo *vi_create_info; + struct radv_shader_module fs = { .nir = NULL }; + + fs.nir = build_nir_copy_fragment_shader_stencil(device, src_func, name); + vi_create_info = &normal_vi_create_info; + + struct radv_shader_module vs = { + .nir = build_nir_vertex_shader(), + }; + + VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = radv_shader_module_to_handle(&vs), + .pName = "main", + .pSpecializationInfo = NULL + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = radv_shader_module_to_handle(&fs), + .pName = "main", + .pSpecializationInfo = NULL + }, + }; + + if (!device->meta_state.blit2d.stencil_only_rp) { + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = 0, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = 
VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 0, + .pColorAttachments = NULL, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 }, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, &device->meta_state.blit2d.stencil_only_rp); + } + + const VkGraphicsPipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = ARRAY_SIZE(pipeline_shader_stages), + .pStages = pipeline_shader_stages, + .pVertexInputState = vi_create_info, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = (VkSampleMask[]) { UINT32_MAX }, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 0, + .pAttachments = NULL, + }, + .pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = false, + .depthWriteEnable = false, + .stencilTestEnable = true, + .front = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .back = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 4, + .pDynamicStates = (VkDynamicState[]) { + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_DEPTH_BOUNDS, + }, + }, + .flags = 0, + .layout = device->meta_state.blit2d.p_layouts[src_type], + .renderPass = device->meta_state.blit2d.stencil_only_rp, + .subpass = 0, + }; + + const struct radv_graphics_pipeline_create_info radv_pipeline_info = { + .use_rectlist = true + }; + + result = radv_graphics_pipeline_create(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &vk_pipeline_info, &radv_pipeline_info, + 
&device->meta_state.alloc, + &device->meta_state.blit2d.stencil_only_pipeline[src_type]); + + + ralloc_free(vs.nir); + ralloc_free(fs.nir); + + return result; +} + +static VkFormat pipeline_formats[] = { + VK_FORMAT_R8G8B8A8_UNORM, + VK_FORMAT_R8G8B8A8_UINT, + VK_FORMAT_R8G8B8A8_SINT, + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SINT, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R32G32B32A32_SFLOAT +}; + +VkResult +radv_device_init_meta_blit2d_state(struct radv_device *device) +{ + VkResult result; + + zero(device->meta_state.blit2d); + + result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), + &(VkDescriptorSetLayoutCreateInfo) { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = 1, + .pBindings = (VkDescriptorSetLayoutBinding[]) { + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = NULL + }, + } + }, &device->meta_state.alloc, &device->meta_state.blit2d.ds_layouts[BLIT2D_SRC_TYPE_IMAGE]); + if (result != VK_SUCCESS) + goto fail; + + result = radv_CreatePipelineLayout(radv_device_to_handle(device), + &(VkPipelineLayoutCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->meta_state.blit2d.ds_layouts[BLIT2D_SRC_TYPE_IMAGE], + }, + &device->meta_state.alloc, &device->meta_state.blit2d.p_layouts[BLIT2D_SRC_TYPE_IMAGE]); + if (result != VK_SUCCESS) + goto fail; + + result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), + &(VkDescriptorSetLayoutCreateInfo) { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = 1, + .pBindings = (VkDescriptorSetLayoutBinding[]) { + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = NULL + }, + } + }, &device->meta_state.alloc, &device->meta_state.blit2d.ds_layouts[BLIT2D_SRC_TYPE_BUFFER]); + if (result != VK_SUCCESS) + goto fail; + + const VkPushConstantRange push_constant_range = {VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4}; + result = radv_CreatePipelineLayout(radv_device_to_handle(device), + &(VkPipelineLayoutCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->meta_state.blit2d.ds_layouts[BLIT2D_SRC_TYPE_BUFFER], + .pushConstantRangeCount = 1, + .pPushConstantRanges = &push_constant_range, + }, + &device->meta_state.alloc, &device->meta_state.blit2d.p_layouts[BLIT2D_SRC_TYPE_BUFFER]); + if (result != VK_SUCCESS) + goto fail; + + for (unsigned src = 0; src < BLIT2D_NUM_SRC_TYPES; src++) { + for (unsigned j = 0; j < ARRAY_SIZE(pipeline_formats); ++j) { + result = blit2d_init_color_pipeline(device, src, pipeline_formats[j]); + if (result != VK_SUCCESS) + goto fail; + } + + result = blit2d_init_depth_only_pipeline(device, src); + if (result != VK_SUCCESS) + goto fail; + + result = blit2d_init_stencil_only_pipeline(device, src); + if (result != VK_SUCCESS) + goto fail; + } + + return VK_SUCCESS; + +fail: + radv_device_finish_meta_blit2d_state(device); + return result; +} diff --git a/src/amd/vulkan/radv_meta_buffer.c b/src/amd/vulkan/radv_meta_buffer.c new file mode 100644 index 00000000000..adea25e02fb --- /dev/null +++ b/src/amd/vulkan/radv_meta_buffer.c @@ -0,0 +1,543 @@ +#include "radv_meta.h" +#include 
"nir/nir_builder.h" + +#include "sid.h" +#include "radv_cs.h" + +static nir_shader * +build_buffer_fill_shader(struct radv_device *dev) +{ + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_buffer_fill"); + b.shader->info.cs.local_size[0] = 64; + b.shader->info.cs.local_size[1] = 1; + b.shader->info.cs.local_size[2] = 1; + + nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0); + nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0); + nir_ssa_def *block_size = nir_imm_ivec4(&b, + b.shader->info.cs.local_size[0], + b.shader->info.cs.local_size[1], + b.shader->info.cs.local_size[2], 0); + + nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + + nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16)); + offset = nir_swizzle(&b, offset, (unsigned[]) {0, 0, 0, 0}, 1, false); + + nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader, + nir_intrinsic_vulkan_resource_index); + dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + nir_intrinsic_set_desc_set(dst_buf, 0); + nir_intrinsic_set_binding(dst_buf, 0); + nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL); + nir_builder_instr_insert(&b, &dst_buf->instr); + + nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant); + load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + load->num_components = 1; + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, "fill_value"); + nir_builder_instr_insert(&b, &load->instr); + + nir_ssa_def *swizzled_load = nir_swizzle(&b, &load->dest.ssa, (unsigned[]) { 0, 0, 0, 0}, 4, false); + + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); + store->src[0] = nir_src_for_ssa(swizzled_load); + store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); + store->src[2] = nir_src_for_ssa(offset); + nir_intrinsic_set_write_mask(store, 0xf); + store->num_components = 4; + nir_builder_instr_insert(&b, &store->instr); + + return b.shader; +} + +static nir_shader * +build_buffer_copy_shader(struct radv_device *dev) +{ + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_buffer_copy"); + b.shader->info.cs.local_size[0] = 64; + b.shader->info.cs.local_size[1] = 1; + b.shader->info.cs.local_size[2] = 1; + + nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0); + nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0); + nir_ssa_def *block_size = nir_imm_ivec4(&b, + b.shader->info.cs.local_size[0], + b.shader->info.cs.local_size[1], + b.shader->info.cs.local_size[2], 0); + + nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + + nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16)); + offset = nir_swizzle(&b, offset, (unsigned[]) {0, 0, 0, 0}, 1, false); + + nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader, + nir_intrinsic_vulkan_resource_index); + dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + nir_intrinsic_set_desc_set(dst_buf, 0); + nir_intrinsic_set_binding(dst_buf, 0); + nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL); + nir_builder_instr_insert(&b, &dst_buf->instr); + + nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader, + nir_intrinsic_vulkan_resource_index); 
+ src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+ nir_intrinsic_set_desc_set(src_buf, 0);
+ nir_intrinsic_set_binding(src_buf, 1);
+ nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, 1, 32, NULL);
+ nir_builder_instr_insert(&b, &src_buf->instr);
+
+ nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
+ load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
+ load->src[1] = nir_src_for_ssa(offset);
+ nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
+ load->num_components = 4;
+ nir_builder_instr_insert(&b, &load->instr);
+
+ nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
+ store->src[0] = nir_src_for_ssa(&load->dest.ssa);
+ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
+ store->src[2] = nir_src_for_ssa(offset);
+ nir_intrinsic_set_write_mask(store, 0xf);
+ store->num_components = 4;
+ nir_builder_instr_insert(&b, &store->instr);
+
+ return b.shader;
+}
+
+
+VkResult radv_device_init_meta_buffer_state(struct radv_device *device)
+{
+ VkResult result;
+ struct radv_shader_module fill_cs = { .nir = NULL };
+ struct radv_shader_module copy_cs = { .nir = NULL };
+
+ zero(device->meta_state.buffer);
+
+ fill_cs.nir = build_buffer_fill_shader(device);
+ copy_cs.nir = build_buffer_copy_shader(device);
+
+ VkDescriptorSetLayoutCreateInfo fill_ds_create_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .bindingCount = 1,
+ .pBindings = (VkDescriptorSetLayoutBinding[]) {
+ {
+ .binding = 0,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .pImmutableSamplers = NULL
+ },
+ }
+ };
+
+ result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
+ &fill_ds_create_info,
+ &device->meta_state.alloc,
+ &device->meta_state.buffer.fill_ds_layout);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ VkDescriptorSetLayoutCreateInfo copy_ds_create_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .bindingCount = 2,
+ .pBindings = (VkDescriptorSetLayoutBinding[]) {
+ {
+ .binding = 0,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .pImmutableSamplers = NULL
+ },
+ {
+ .binding = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .pImmutableSamplers = NULL
+ },
+ }
+ };
+
+ result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
+ &copy_ds_create_info,
+ &device->meta_state.alloc,
+ &device->meta_state.buffer.copy_ds_layout);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+
+ VkPipelineLayoutCreateInfo fill_pl_create_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 1,
+ .pSetLayouts = &device->meta_state.buffer.fill_ds_layout,
+ .pushConstantRangeCount = 1,
+ .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 4},
+ };
+
+ result = radv_CreatePipelineLayout(radv_device_to_handle(device),
+ &fill_pl_create_info,
+ &device->meta_state.alloc,
+ &device->meta_state.buffer.fill_p_layout);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ VkPipelineLayoutCreateInfo copy_pl_create_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 1,
+ .pSetLayouts = &device->meta_state.buffer.copy_ds_layout,
+ .pushConstantRangeCount = 0,
+ };
+
+ result = radv_CreatePipelineLayout(radv_device_to_handle(device),
+ &copy_pl_create_info,
+ &device->meta_state.alloc,
+ &device->meta_state.buffer.copy_p_layout);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ VkPipelineShaderStageCreateInfo fill_pipeline_shader_stage = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = radv_shader_module_to_handle(&fill_cs),
+ .pName = "main",
+ .pSpecializationInfo = NULL,
+ };
+
+ VkComputePipelineCreateInfo fill_vk_pipeline_info = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .stage = fill_pipeline_shader_stage,
+ .flags = 0,
+ .layout = device->meta_state.buffer.fill_p_layout,
+ };
+
+ result = radv_CreateComputePipelines(radv_device_to_handle(device),
+ radv_pipeline_cache_to_handle(&device->meta_state.cache),
+ 1, &fill_vk_pipeline_info, NULL,
+ &device->meta_state.buffer.fill_pipeline);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ VkPipelineShaderStageCreateInfo copy_pipeline_shader_stage = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = radv_shader_module_to_handle(&copy_cs),
+ .pName = "main",
+ .pSpecializationInfo = NULL,
+ };
+
+ VkComputePipelineCreateInfo copy_vk_pipeline_info = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .stage = copy_pipeline_shader_stage,
+ .flags = 0,
+ .layout = device->meta_state.buffer.copy_p_layout,
+ };
+
+ result = radv_CreateComputePipelines(radv_device_to_handle(device),
+ radv_pipeline_cache_to_handle(&device->meta_state.cache),
+ 1, &copy_vk_pipeline_info, NULL,
+ &device->meta_state.buffer.copy_pipeline);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ ralloc_free(fill_cs.nir);
+ ralloc_free(copy_cs.nir);
+ return VK_SUCCESS;
+fail:
+ radv_device_finish_meta_buffer_state(device);
+ ralloc_free(fill_cs.nir);
+ ralloc_free(copy_cs.nir);
+ return result;
+}
+
+void radv_device_finish_meta_buffer_state(struct radv_device *device)
+{
+ if (device->meta_state.buffer.copy_pipeline)
+ radv_DestroyPipeline(radv_device_to_handle(device),
+ device->meta_state.buffer.copy_pipeline,
+ &device->meta_state.alloc);
+
+ if (device->meta_state.buffer.fill_pipeline)
+ radv_DestroyPipeline(radv_device_to_handle(device),
+ device->meta_state.buffer.fill_pipeline,
+ &device->meta_state.alloc);
+
+ if (device->meta_state.buffer.copy_p_layout)
+ radv_DestroyPipelineLayout(radv_device_to_handle(device),
+ device->meta_state.buffer.copy_p_layout,
+ &device->meta_state.alloc);
+
+ if (device->meta_state.buffer.fill_p_layout)
+ radv_DestroyPipelineLayout(radv_device_to_handle(device),
+ device->meta_state.buffer.fill_p_layout,
+ &device->meta_state.alloc);
+
+ if (device->meta_state.buffer.copy_ds_layout)
+ radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
+ device->meta_state.buffer.copy_ds_layout,
+ &device->meta_state.alloc);
+
+ if (device->meta_state.buffer.fill_ds_layout)
+ radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
+ device->meta_state.buffer.fill_ds_layout,
+ &device->meta_state.alloc);
+}
+
+static void fill_buffer_shader(struct radv_cmd_buffer *cmd_buffer,
+ struct radeon_winsys_bo *bo,
+ uint64_t offset, uint64_t size, uint32_t value)
+{
+ struct radv_device *device = cmd_buffer->device;
+ uint64_t block_count = round_up_u64(size, 1024);
+ struct radv_meta_saved_compute_state saved_state;
+ VkDescriptorSet ds;
+
+ radv_meta_save_compute(&saved_state, cmd_buffer, 4);
+
+ radv_temp_descriptor_set_create(device, cmd_buffer,
+ device->meta_state.buffer.fill_ds_layout,
+ &ds);
+
+ struct radv_buffer dst_buffer = {
+ .bo = bo,
+
.offset = offset, + .size = size + }; + + radv_UpdateDescriptorSets(radv_device_to_handle(device), + 1, /* writeCount */ + (VkWriteDescriptorSet[]) { + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &(VkDescriptorBufferInfo) { + .buffer = radv_buffer_to_handle(&dst_buffer), + .offset = 0, + .range = size + } + } + }, 0, NULL); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.buffer.fill_pipeline); + + radv_CmdBindDescriptorSets(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.buffer.fill_p_layout, 0, 1, + &ds, 0, NULL); + + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + device->meta_state.buffer.fill_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, 4, + &value); + + radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1); + + radv_temp_descriptor_set_destroy(device, ds); + + radv_meta_restore_compute(&saved_state, cmd_buffer, 4); +} + +static void copy_buffer_shader(struct radv_cmd_buffer *cmd_buffer, + struct radeon_winsys_bo *src_bo, + struct radeon_winsys_bo *dst_bo, + uint64_t src_offset, uint64_t dst_offset, + uint64_t size) +{ + struct radv_device *device = cmd_buffer->device; + uint64_t block_count = round_up_u64(size, 1024); + struct radv_meta_saved_compute_state saved_state; + VkDescriptorSet ds; + + radv_meta_save_compute(&saved_state, cmd_buffer, 0); + + radv_temp_descriptor_set_create(device, cmd_buffer, + device->meta_state.buffer.copy_ds_layout, + &ds); + + struct radv_buffer dst_buffer = { + .bo = dst_bo, + .offset = dst_offset, + .size = size + }; + + struct radv_buffer src_buffer = { + .bo = src_bo, + .offset = src_offset, + .size = size + }; + + radv_UpdateDescriptorSets(radv_device_to_handle(device), + 2, /* writeCount */ + (VkWriteDescriptorSet[]) { + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &(VkDescriptorBufferInfo) { + .buffer = radv_buffer_to_handle(&dst_buffer), + .offset = 0, + .range = size + } + }, + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &(VkDescriptorBufferInfo) { + .buffer = radv_buffer_to_handle(&src_buffer), + .offset = 0, + .range = size + } + } + }, 0, NULL); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.buffer.copy_pipeline); + + radv_CmdBindDescriptorSets(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.buffer.copy_p_layout, 0, 1, + &ds, 0, NULL); + + + radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1); + + radv_temp_descriptor_set_destroy(device, ds); + + radv_meta_restore_compute(&saved_state, cmd_buffer, 0); +} + + +void radv_fill_buffer(struct radv_cmd_buffer *cmd_buffer, + struct radeon_winsys_bo *bo, + uint64_t offset, uint64_t size, uint32_t value) +{ + assert(!(offset & 3)); + assert(!(size & 3)); + + if (size >= 4096) + fill_buffer_shader(cmd_buffer, bo, offset, size, value); + else if (size) { + uint64_t va = cmd_buffer->device->ws->buffer_get_va(bo); + va += offset; + 
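+ /* Small fills (size < 4096) bypass the compute pipeline and are emitted as
+  * a CP DMA clear; the BO is added to the command stream first so it stays
+  * resident when the packet executes.  Larger fills go through
+  * fill_buffer_shader() above, e.g. the fast color clear path in
+  * radv_meta_clear.c clears a whole CMASK allocation with
+  * radv_fill_buffer(cmd_buffer, iview->image->bo,
+  *                  iview->image->offset + iview->image->cmask.offset,
+  *                  iview->image->cmask.size, 0).
+  */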
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, bo, 8); + si_cp_dma_clear_buffer(cmd_buffer, va, size, value); + } +} + +static +void radv_copy_buffer(struct radv_cmd_buffer *cmd_buffer, + struct radeon_winsys_bo *src_bo, + struct radeon_winsys_bo *dst_bo, + uint64_t src_offset, uint64_t dst_offset, + uint64_t size) +{ + if (size >= 4096 && !(size & 3) && !(src_offset & 3) && !(dst_offset & 3)) + copy_buffer_shader(cmd_buffer, src_bo, dst_bo, + src_offset, dst_offset, size); + else if (size) { + uint64_t src_va = cmd_buffer->device->ws->buffer_get_va(src_bo); + uint64_t dst_va = cmd_buffer->device->ws->buffer_get_va(dst_bo); + src_va += src_offset; + dst_va += dst_offset; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, src_bo, 8); + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, dst_bo, 8); + + si_cp_dma_buffer_copy(cmd_buffer, src_va, dst_va, size); + } +} + +void radv_CmdFillBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize fillSize, + uint32_t data) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); + + if (fillSize == VK_WHOLE_SIZE) + fillSize = (dst_buffer->size - dstOffset) & ~3ull; + + radv_fill_buffer(cmd_buffer, dst_buffer->bo, dst_buffer->offset + dstOffset, + fillSize, data); +} + +void radv_CmdCopyBuffer( + VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkBuffer destBuffer, + uint32_t regionCount, + const VkBufferCopy* pRegions) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_buffer, src_buffer, srcBuffer); + RADV_FROM_HANDLE(radv_buffer, dest_buffer, destBuffer); + + for (unsigned r = 0; r < regionCount; r++) { + uint64_t src_offset = src_buffer->offset + pRegions[r].srcOffset; + uint64_t dest_offset = dest_buffer->offset + pRegions[r].dstOffset; + uint64_t copy_size = pRegions[r].size; + + radv_copy_buffer(cmd_buffer, src_buffer->bo, dest_buffer->bo, + src_offset, dest_offset, copy_size); + } +} + +void radv_CmdUpdateBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize dataSize, + const uint32_t* pData) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); + uint64_t words = dataSize / 4; + uint64_t va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo); + va += dstOffset + dst_buffer->offset; + + assert(!(dataSize & 3)); + assert(!(va & 3)); + + if (dataSize < 4096) { + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, dst_buffer->bo, 8); + + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, words + 4); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + words, 0)); + radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_ME)); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + radeon_emit_array(cmd_buffer->cs, pData, words); + } else { + uint32_t buf_offset; + radv_cmd_buffer_upload_data(cmd_buffer, dataSize, 32, pData, &buf_offset); + radv_copy_buffer(cmd_buffer, cmd_buffer->upload.upload_bo, dst_buffer->bo, + buf_offset, dstOffset + dst_buffer->offset, dataSize); + } +} diff --git a/src/amd/vulkan/radv_meta_bufimage.c b/src/amd/vulkan/radv_meta_bufimage.c new file mode 100644 index 00000000000..287ab3f2570 --- /dev/null +++ b/src/amd/vulkan/radv_meta_bufimage.c @@ -0,0 +1,396 @@ +#include "radv_meta.h" +#include "nir/nir_builder.h" + +static nir_shader 
* +build_nir_itob_compute_shader(struct radv_device *dev) +{ + nir_builder b; + const struct glsl_type *sampler_type = glsl_sampler_type(GLSL_SAMPLER_DIM_2D, + false, + false, + GLSL_TYPE_FLOAT); + const struct glsl_type *img_type = glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, + false, + false, + GLSL_TYPE_FLOAT); + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_itob_cs"); + b.shader->info.cs.local_size[0] = 16; + b.shader->info.cs.local_size[1] = 16; + b.shader->info.cs.local_size[2] = 1; + nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform, + sampler_type, "s_tex"); + input_img->data.descriptor_set = 0; + input_img->data.binding = 0; + + nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, + img_type, "out_img"); + output_img->data.descriptor_set = 0; + output_img->data.binding = 1; + + nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0); + nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0); + nir_ssa_def *block_size = nir_imm_ivec4(&b, + b.shader->info.cs.local_size[0], + b.shader->info.cs.local_size[1], + b.shader->info.cs.local_size[2], 0); + + nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + + + + nir_intrinsic_instr *offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant); + offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + offset->num_components = 2; + nir_ssa_dest_init(&offset->instr, &offset->dest, 2, 32, "offset"); + nir_builder_instr_insert(&b, &offset->instr); + + nir_intrinsic_instr *stride = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant); + stride->src[0] = nir_src_for_ssa(nir_imm_int(&b, 8)); + stride->num_components = 1; + nir_ssa_dest_init(&stride->instr, &stride->dest, 1, 32, "stride"); + nir_builder_instr_insert(&b, &stride->instr); + + nir_ssa_def *img_coord = nir_iadd(&b, global_id, &offset->dest.ssa); + + nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2); + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + tex->op = nir_texop_txf; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(img_coord); + tex->src[1].src_type = nir_tex_src_lod; + tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0)); + tex->dest_type = nir_type_float; + tex->is_array = false; + tex->coord_components = 2; + tex->texture = nir_deref_var_create(tex, input_img); + tex->sampler = NULL; + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_builder_instr_insert(&b, &tex->instr); + + nir_ssa_def *pos_x = nir_channel(&b, global_id, 0); + nir_ssa_def *pos_y = nir_channel(&b, global_id, 1); + + nir_ssa_def *tmp = nir_imul(&b, pos_y, &stride->dest.ssa); + tmp = nir_iadd(&b, tmp, pos_x); + + nir_ssa_def *coord = nir_vec4(&b, tmp, tmp, tmp, tmp); + + nir_ssa_def *outval = &tex->dest.ssa; + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_store); + store->src[0] = nir_src_for_ssa(coord); + store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); + store->src[2] = nir_src_for_ssa(outval); + store->variables[0] = nir_deref_var_create(store, output_img); + + nir_builder_instr_insert(&b, &store->instr); + return b.shader; +} + +/* Image to buffer - don't write use image accessors */ +static VkResult +radv_device_init_meta_itob_state(struct radv_device *device) +{ + VkResult result; + struct radv_shader_module cs = { .nir = NULL }; + + zero(device->meta_state.itob); 
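+ /* The image-to-buffer path is a small compute pipeline: binding 0 is the
+  * source image (sampled with txf), binding 1 is the destination storage
+  * texel buffer, and 12 bytes of push constants carry the source x/y offset
+  * and the destination pitch.  radv_meta_image_to_buffer() covers each rect
+  * with 16x16 workgroups via radv_unaligned_dispatch().
+  */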
+ + cs.nir = build_nir_itob_compute_shader(device); + + /* + * two descriptors one for the image being sampled + * one for the buffer being written. + */ + VkDescriptorSetLayoutCreateInfo ds_create_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = 2, + .pBindings = (VkDescriptorSetLayoutBinding[]) { + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL + }, + } + }; + + result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), + &ds_create_info, + &device->meta_state.alloc, + &device->meta_state.itob.img_ds_layout); + if (result != VK_SUCCESS) + goto fail; + + + VkPipelineLayoutCreateInfo pl_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->meta_state.itob.img_ds_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 12}, + }; + + result = radv_CreatePipelineLayout(radv_device_to_handle(device), + &pl_create_info, + &device->meta_state.alloc, + &device->meta_state.itob.img_p_layout); + if (result != VK_SUCCESS) + goto fail; + + /* compute shader */ + + VkPipelineShaderStageCreateInfo pipeline_shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = radv_shader_module_to_handle(&cs), + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = pipeline_shader_stage, + .flags = 0, + .layout = device->meta_state.itob.img_p_layout, + }; + + result = radv_CreateComputePipelines(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + 1, &vk_pipeline_info, NULL, + &device->meta_state.itob.pipeline); + if (result != VK_SUCCESS) + goto fail; + + ralloc_free(cs.nir); + return VK_SUCCESS; +fail: + ralloc_free(cs.nir); + return result; +} + +static void +radv_device_finish_meta_itob_state(struct radv_device *device) +{ + if (device->meta_state.itob.img_p_layout) { + radv_DestroyPipelineLayout(radv_device_to_handle(device), + device->meta_state.itob.img_p_layout, + &device->meta_state.alloc); + } + if (device->meta_state.itob.img_ds_layout) { + radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), + device->meta_state.itob.img_ds_layout, + &device->meta_state.alloc); + } + if (device->meta_state.itob.pipeline) { + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.itob.pipeline, + &device->meta_state.alloc); + } +} + +void +radv_device_finish_meta_bufimage_state(struct radv_device *device) +{ + radv_device_finish_meta_itob_state(device); +} + +VkResult +radv_device_init_meta_bufimage_state(struct radv_device *device) +{ + VkResult result; + + result = radv_device_init_meta_itob_state(device); + if (result != VK_SUCCESS) + return result; + return VK_SUCCESS; +} + +void +radv_meta_begin_bufimage(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_saved_compute_state *save) +{ + radv_meta_save_compute(save, cmd_buffer, 12); +} + +void +radv_meta_end_bufimage(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_saved_compute_state *save) +{ + 
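+ /* Pairs with radv_meta_begin_bufimage(): restores the compute state that
+  * was saved before the meta dispatch was recorded.
+  */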
radv_meta_restore_compute(save, cmd_buffer, 12); +} + +static void +create_iview(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *surf, + VkImageUsageFlags usage, + struct radv_image_view *iview) +{ + + radv_image_view_init(iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(surf->image), + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = surf->format, + .subresourceRange = { + .aspectMask = surf->aspect_mask, + .baseMipLevel = surf->level, + .levelCount = 1, + .baseArrayLayer = surf->layer, + .layerCount = 1 + }, + }, cmd_buffer, usage); +} + +static void +create_bview(struct radv_cmd_buffer *cmd_buffer, + struct radv_buffer *buffer, + unsigned offset, + VkFormat format, + struct radv_buffer_view *bview) +{ + radv_buffer_view_init(bview, cmd_buffer->device, + &(VkBufferViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .flags = 0, + .buffer = radv_buffer_to_handle(buffer), + .format = format, + .offset = offset, + .range = VK_WHOLE_SIZE, + }, cmd_buffer); + +} + +struct itob_temps { + struct radv_image_view src_iview; + + struct radv_buffer_view dst_bview; + VkDescriptorSet set; +}; + +static void +itob_bind_src_image(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *src, + struct radv_meta_blit2d_rect *rect, + struct itob_temps *tmp) +{ + create_iview(cmd_buffer, src, VK_IMAGE_USAGE_SAMPLED_BIT, &tmp->src_iview); +} + +static void +itob_bind_dst_buffer(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_buffer *dst, + struct radv_meta_blit2d_rect *rect, + struct itob_temps *tmp) +{ + create_bview(cmd_buffer, dst->buffer, dst->offset, dst->format, &tmp->dst_bview); +} + +static void +itob_bind_descriptors(struct radv_cmd_buffer *cmd_buffer, + struct itob_temps *tmp) +{ + struct radv_device *device = cmd_buffer->device; + VkDevice vk_device = radv_device_to_handle(cmd_buffer->device); + + radv_temp_descriptor_set_create(device, cmd_buffer, + device->meta_state.itob.img_ds_layout, + &tmp->set); + + radv_UpdateDescriptorSets(vk_device, + 2, /* writeCount */ + (VkWriteDescriptorSet[]) { + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = tmp->set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .pImageInfo = (VkDescriptorImageInfo[]) { + { + .sampler = NULL, + .imageView = radv_image_view_to_handle(&tmp->src_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + } + }, + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = tmp->set, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .pTexelBufferView = (VkBufferView[]) { radv_buffer_view_to_handle(&tmp->dst_bview) }, + } + }, 0, NULL); + + radv_CmdBindDescriptorSets(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.itob.img_p_layout, 0, 1, + &tmp->set, 0, NULL); +} + +static void +itob_unbind_src_image(struct radv_cmd_buffer *cmd_buffer, + struct itob_temps *temps) +{ +} + +static void +bind_pipeline(struct radv_cmd_buffer *cmd_buffer) +{ + VkPipeline pipeline = + cmd_buffer->device->meta_state.itob.pipeline; + + if (cmd_buffer->state.compute_pipeline != radv_pipeline_from_handle(pipeline)) { + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + } +} + +void +radv_meta_image_to_buffer(struct radv_cmd_buffer 
*cmd_buffer, + struct radv_meta_blit2d_surf *src, + struct radv_meta_blit2d_buffer *dst, + unsigned num_rects, + struct radv_meta_blit2d_rect *rects) +{ + struct radv_device *device = cmd_buffer->device; + + for (unsigned r = 0; r < num_rects; ++r) { + struct itob_temps temps; + + itob_bind_src_image(cmd_buffer, src, &rects[r], &temps); + itob_bind_dst_buffer(cmd_buffer, dst, &rects[r], &temps); + itob_bind_descriptors(cmd_buffer, &temps); + + bind_pipeline(cmd_buffer); + + unsigned push_constants[3] = { + rects[r].src_x, + rects[r].src_y, + dst->pitch + }; + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + device->meta_state.itob.img_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, 12, + push_constants); + + radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); + radv_temp_descriptor_set_destroy(cmd_buffer->device, temps.set); + itob_unbind_src_image(cmd_buffer, &temps); + } + +} diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c new file mode 100644 index 00000000000..1c1b8c6b1e3 --- /dev/null +++ b/src/amd/vulkan/radv_meta_clear.c @@ -0,0 +1,1192 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "radv_meta.h" +#include "radv_private.h" +#include "nir/nir_builder.h" + +#include "util/format_rgb9e5.h" +#include "vk_format.h" +/** Vertex attributes for color clears. */ +struct color_clear_vattrs { + float position[2]; + VkClearColorValue color; +}; + +/** Vertex attributes for depthstencil clears. 
*/ +struct depthstencil_clear_vattrs { + float position[2]; + float depth_clear; +}; + +enum { + DEPTH_CLEAR_SLOW, + DEPTH_CLEAR_FAST_EXPCLEAR, + DEPTH_CLEAR_FAST_NO_EXPCLEAR +}; + +static void +build_color_shaders(struct nir_shader **out_vs, + struct nir_shader **out_fs, + uint32_t frag_output) +{ + nir_builder vs_b; + nir_builder fs_b; + + nir_builder_init_simple_shader(&vs_b, NULL, MESA_SHADER_VERTEX, NULL); + nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL); + + vs_b.shader->info.name = ralloc_strdup(vs_b.shader, "meta_clear_color_vs"); + fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "meta_clear_color_fs"); + + const struct glsl_type *position_type = glsl_vec4_type(); + const struct glsl_type *color_type = glsl_vec4_type(); + + nir_variable *vs_in_pos = + nir_variable_create(vs_b.shader, nir_var_shader_in, position_type, + "a_position"); + vs_in_pos->data.location = VERT_ATTRIB_GENERIC0; + + nir_variable *vs_out_pos = + nir_variable_create(vs_b.shader, nir_var_shader_out, position_type, + "gl_Position"); + vs_out_pos->data.location = VARYING_SLOT_POS; + + nir_variable *vs_in_color = + nir_variable_create(vs_b.shader, nir_var_shader_in, color_type, + "a_color"); + vs_in_color->data.location = VERT_ATTRIB_GENERIC1; + + nir_variable *vs_out_color = + nir_variable_create(vs_b.shader, nir_var_shader_out, color_type, + "v_color"); + vs_out_color->data.location = VARYING_SLOT_VAR0; + vs_out_color->data.interpolation = INTERP_MODE_FLAT; + + nir_variable *fs_in_color = + nir_variable_create(fs_b.shader, nir_var_shader_in, color_type, + "v_color"); + fs_in_color->data.location = vs_out_color->data.location; + fs_in_color->data.interpolation = vs_out_color->data.interpolation; + + nir_variable *fs_out_color = + nir_variable_create(fs_b.shader, nir_var_shader_out, color_type, + "f_color"); + fs_out_color->data.location = FRAG_RESULT_DATA0 + frag_output; + + nir_copy_var(&vs_b, vs_out_pos, vs_in_pos); + nir_copy_var(&vs_b, vs_out_color, vs_in_color); + nir_copy_var(&fs_b, fs_out_color, fs_in_color); + + *out_vs = vs_b.shader; + *out_fs = fs_b.shader; +} + +static VkResult +create_pipeline(struct radv_device *device, + struct radv_render_pass *render_pass, + uint32_t samples, + struct nir_shader *vs_nir, + struct nir_shader *fs_nir, + const VkPipelineVertexInputStateCreateInfo *vi_state, + const VkPipelineDepthStencilStateCreateInfo *ds_state, + const VkPipelineColorBlendStateCreateInfo *cb_state, + const struct radv_graphics_pipeline_create_info *extra, + const VkAllocationCallbacks *alloc, + struct radv_pipeline **pipeline) +{ + VkDevice device_h = radv_device_to_handle(device); + VkResult result; + + struct radv_shader_module vs_m = { .nir = vs_nir }; + struct radv_shader_module fs_m = { .nir = fs_nir }; + + VkPipeline pipeline_h = VK_NULL_HANDLE; + result = radv_graphics_pipeline_create(device_h, + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &(VkGraphicsPipelineCreateInfo) { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = fs_nir ? 
2 : 1, + .pStages = (VkPipelineShaderStageCreateInfo[]) { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = radv_shader_module_to_handle(&vs_m), + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = radv_shader_module_to_handle(&fs_m), + .pName = "main", + }, + }, + .pVertexInputState = vi_state, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE, + .depthBiasEnable = false, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = samples, + .sampleShadingEnable = false, + .pSampleMask = NULL, + .alphaToCoverageEnable = false, + .alphaToOneEnable = false, + }, + .pDepthStencilState = ds_state, + .pColorBlendState = cb_state, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + /* The meta clear pipeline declares all state as dynamic. + * As a consequence, vkCmdBindPipeline writes no dynamic state + * to the cmd buffer. Therefore, at the end of the meta clear, + * we need only restore dynamic state was vkCmdSet. 
+ */ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 6, + .pDynamicStates = (VkDynamicState[]) { + /* Everything except stencil write mask */ + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_DEPTH_BOUNDS, + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, + VK_DYNAMIC_STATE_STENCIL_REFERENCE, + }, + }, + .flags = 0, + .renderPass = radv_render_pass_to_handle(render_pass), + .subpass = 0, + }, + extra, + alloc, + &pipeline_h); + + ralloc_free(vs_nir); + ralloc_free(fs_nir); + + *pipeline = radv_pipeline_from_handle(pipeline_h); + + return result; +} + +static VkResult +create_color_pipeline(struct radv_device *device, + VkFormat vk_format, + uint32_t samples, + uint32_t frag_output, + struct radv_pipeline **pipeline, + VkRenderPass *pass) +{ + struct nir_shader *vs_nir; + struct nir_shader *fs_nir; + VkResult result; + build_color_shaders(&vs_nir, &fs_nir, frag_output); + + const VkPipelineVertexInputStateCreateInfo vi_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = sizeof(struct color_clear_vattrs), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 2, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(struct color_clear_vattrs, position), + }, + { + /* Color */ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32B32A32_SFLOAT, + .offset = offsetof(struct color_clear_vattrs, color), + }, + }, + }; + + const VkPipelineDepthStencilStateCreateInfo ds_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = false, + .depthWriteEnable = false, + .depthBoundsTestEnable = false, + .stencilTestEnable = false, + }; + + VkPipelineColorBlendAttachmentState blend_attachment_state[MAX_RTS] = { 0 }; + blend_attachment_state[frag_output] = (VkPipelineColorBlendAttachmentState) { + .blendEnable = false, + .colorWriteMask = VK_COLOR_COMPONENT_A_BIT | + VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT, + }; + + const VkPipelineColorBlendStateCreateInfo cb_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .logicOpEnable = false, + .attachmentCount = MAX_RTS, + .pAttachments = blend_attachment_state + }; + + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = vk_format, + .samples = samples, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = VK_ATTACHMENT_UNUSED, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 
}, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, pass); + + if (result != VK_SUCCESS) + return result; + struct radv_graphics_pipeline_create_info extra = { + .use_rectlist = true, + }; + result = create_pipeline(device, radv_render_pass_from_handle(*pass), + samples, vs_nir, fs_nir, &vi_state, &ds_state, &cb_state, + &extra, &device->meta_state.alloc, pipeline); + + return result; +} + +static void +destroy_pipeline(struct radv_device *device, struct radv_pipeline *pipeline) +{ + if (!pipeline) + return; + + RADV_CALL(DestroyPipeline)(radv_device_to_handle(device), + radv_pipeline_to_handle(pipeline), + &device->meta_state.alloc); + +} + +static void +destroy_render_pass(struct radv_device *device, VkRenderPass renderpass) +{ + RADV_CALL(DestroyRenderPass)(radv_device_to_handle(device), renderpass, + &device->meta_state.alloc); +} + +void +radv_device_finish_meta_clear_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; + + for (uint32_t i = 0; i < ARRAY_SIZE(state->clear); ++i) { + for (uint32_t j = 0; j < ARRAY_SIZE(state->clear[i].color_pipelines); ++j) { + destroy_pipeline(device, state->clear[i].color_pipelines[j]); + destroy_render_pass(device, state->clear[i].render_pass[j]); + } + + for (uint32_t j = 0; j < NUM_DEPTH_CLEAR_PIPELINES; j++) { + destroy_pipeline(device, state->clear[i].depth_only_pipeline[j]); + destroy_render_pass(device, state->clear[i].depth_only_rp[j]); + destroy_pipeline(device, state->clear[i].stencil_only_pipeline[j]); + destroy_render_pass(device, state->clear[i].stencil_only_rp[j]); + destroy_pipeline(device, state->clear[i].depthstencil_pipeline[j]); + destroy_render_pass(device, state->clear[i].depthstencil_rp[j]); + } + } + +} + +static void +emit_color_clear(struct radv_cmd_buffer *cmd_buffer, + const VkClearAttachment *clear_att, + const VkClearRect *clear_rect) +{ + struct radv_device *device = cmd_buffer->device; + const struct radv_subpass *subpass = cmd_buffer->state.subpass; + const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; + const uint32_t subpass_att = clear_att->colorAttachment; + const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment; + const struct radv_image_view *iview = fb->attachments[pass_att].attachment; + const uint32_t samples = iview->image->samples; + const uint32_t samples_log2 = ffs(samples) - 1; + unsigned fs_key = radv_format_meta_fs_key(iview->vk_format); + struct radv_pipeline *pipeline; + VkClearColorValue clear_value = clear_att->clearValue.color; + VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + VkPipeline pipeline_h; + uint32_t offset; + + if (fs_key == -1) { + radv_finishme("color clears incomplete"); + return; + } + pipeline = device->meta_state.clear[samples_log2].color_pipelines[fs_key]; + pipeline_h = radv_pipeline_to_handle(pipeline); + + if (!pipeline) { + radv_finishme("color clears incomplete"); + return; + } + assert(samples_log2 < ARRAY_SIZE(device->meta_state.clear)); + assert(pipeline); + assert(clear_att->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + assert(clear_att->colorAttachment < subpass->color_count); + + const struct color_clear_vattrs vertex_data[3] = { + { + .position = { + clear_rect->rect.offset.x, + clear_rect->rect.offset.y, + }, + .color = clear_value, + }, + { + .position = { + clear_rect->rect.offset.x, + clear_rect->rect.offset.y + clear_rect->rect.extent.height, + }, + .color = clear_value, + }, + { + .position = { + clear_rect->rect.offset.x + clear_rect->rect.extent.width, 
+ clear_rect->rect.offset.y, + }, + .color = clear_value, + }, + }; + + struct radv_subpass clear_subpass = { + .color_count = 1, + .color_attachments = (VkAttachmentReference[]) { + subpass->color_attachments[clear_att->colorAttachment] + }, + .depth_stencil_attachment = (VkAttachmentReference) { VK_ATTACHMENT_UNUSED, VK_IMAGE_LAYOUT_UNDEFINED } + }; + + radv_cmd_buffer_set_subpass(cmd_buffer, &clear_subpass, false); + + radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset); + struct radv_buffer vertex_buffer = { + .device = device, + .size = sizeof(vertex_data), + .bo = cmd_buffer->upload.upload_bo, + .offset = offset, + }; + + + RADV_CALL(CmdBindVertexBuffers)(cmd_buffer_h, 0, 1, + (VkBuffer[]) { radv_buffer_to_handle(&vertex_buffer) }, + (VkDeviceSize[]) { 0 }); + + if (cmd_buffer->state.pipeline != pipeline) { + RADV_CALL(CmdBindPipeline)(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline_h); + } + + RADV_CALL(CmdDraw)(cmd_buffer_h, 3, 1, 0, 0); + + radv_cmd_buffer_set_subpass(cmd_buffer, subpass, false); +} + + +static void +build_depthstencil_shader(struct nir_shader **out_vs, struct nir_shader **out_fs) +{ + nir_builder vs_b, fs_b; + + nir_builder_init_simple_shader(&vs_b, NULL, MESA_SHADER_VERTEX, NULL); + nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL); + + vs_b.shader->info.name = ralloc_strdup(vs_b.shader, "meta_clear_depthstencil_vs"); + fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "meta_clear_depthstencil_fs"); + const struct glsl_type *position_type = glsl_vec4_type(); + + nir_variable *vs_in_pos = + nir_variable_create(vs_b.shader, nir_var_shader_in, position_type, + "a_position"); + vs_in_pos->data.location = VERT_ATTRIB_GENERIC0; + + nir_variable *vs_out_pos = + nir_variable_create(vs_b.shader, nir_var_shader_out, position_type, + "gl_Position"); + vs_out_pos->data.location = VARYING_SLOT_POS; + + nir_copy_var(&vs_b, vs_out_pos, vs_in_pos); + + *out_vs = vs_b.shader; + *out_fs = fs_b.shader; +} + +static VkResult +create_depthstencil_pipeline(struct radv_device *device, + VkImageAspectFlags aspects, + uint32_t samples, + int index, + struct radv_pipeline **pipeline, + VkRenderPass *render_pass) +{ + struct nir_shader *vs_nir, *fs_nir; + VkResult result; + build_depthstencil_shader(&vs_nir, &fs_nir); + + const VkPipelineVertexInputStateCreateInfo vi_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = sizeof(struct depthstencil_clear_vattrs), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 1, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32B32_SFLOAT, + .offset = offsetof(struct depthstencil_clear_vattrs, position), + }, + }, + }; + + const VkPipelineDepthStencilStateCreateInfo ds_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = (aspects & VK_IMAGE_ASPECT_DEPTH_BIT), + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + .depthWriteEnable = (aspects & VK_IMAGE_ASPECT_DEPTH_BIT), + .depthBoundsTestEnable = false, + .stencilTestEnable = (aspects & VK_IMAGE_ASPECT_STENCIL_BIT), + .front = { + .passOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .writeMask = UINT32_MAX, + .reference = 0, /* dynamic */ + }, + .back = { 0 /* dont care */ 
}, + }; + + const VkPipelineColorBlendStateCreateInfo cb_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .logicOpEnable = false, + .attachmentCount = 0, + .pAttachments = NULL, + }; + + result = radv_CreateRenderPass(radv_device_to_handle(device), + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = VK_FORMAT_UNDEFINED, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 0, + .pColorAttachments = NULL, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + .preserveAttachmentCount = 1, + .pPreserveAttachments = (uint32_t[]) { 0 }, + }, + .dependencyCount = 0, + }, &device->meta_state.alloc, render_pass); + if (result != VK_SUCCESS) + return result; + + struct radv_graphics_pipeline_create_info extra = { + .use_rectlist = true, + }; + + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + extra.db_depth_clear = index == DEPTH_CLEAR_SLOW ? false : true; + extra.db_depth_disable_expclear = index == DEPTH_CLEAR_FAST_NO_EXPCLEAR ? true : false; + } + if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + extra.db_stencil_clear = index == DEPTH_CLEAR_SLOW ? false : true; + extra.db_stencil_disable_expclear = index == DEPTH_CLEAR_FAST_NO_EXPCLEAR ? true : false; + } + result = create_pipeline(device, radv_render_pass_from_handle(*render_pass), + samples, vs_nir, fs_nir, &vi_state, &ds_state, &cb_state, + &extra, &device->meta_state.alloc, pipeline); + return result; +} + +static bool depth_view_can_fast_clear(const struct radv_image_view *iview, + VkImageLayout layout, + const VkClearRect *clear_rect) +{ + if (clear_rect->rect.offset.x || clear_rect->rect.offset.y || + clear_rect->rect.extent.width != iview->extent.width || + clear_rect->rect.extent.height != iview->extent.height) + return false; + if (iview->image->htile.size && + iview->base_mip == 0 && + iview->base_layer == 0 && + radv_layout_can_expclear(iview->image, layout) && + memcmp(&iview->extent, &iview->image->extent, sizeof(iview->extent)) == 0) + return true; + return false; +} + +static struct radv_pipeline * +pick_depthstencil_pipeline(struct radv_meta_state *meta_state, + const struct radv_image_view *iview, + int samples_log2, + VkImageAspectFlags aspects, + VkImageLayout layout, + const VkClearRect *clear_rect, + VkClearDepthStencilValue clear_value) +{ + bool fast = depth_view_can_fast_clear(iview, layout, clear_rect); + int index = DEPTH_CLEAR_SLOW; + + if (fast) { + /* we don't know the previous clear values, so we always have + * the NO_EXPCLEAR path */ + index = DEPTH_CLEAR_FAST_NO_EXPCLEAR; + } + + switch (aspects) { + case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: + return meta_state->clear[samples_log2].depthstencil_pipeline[index]; + case VK_IMAGE_ASPECT_DEPTH_BIT: + return meta_state->clear[samples_log2].depth_only_pipeline[index]; + case VK_IMAGE_ASPECT_STENCIL_BIT: + return meta_state->clear[samples_log2].stencil_only_pipeline[index]; + } + unreachable("expected depth or stencil aspect"); +} + +static void +emit_depthstencil_clear(struct radv_cmd_buffer *cmd_buffer, + const 
VkClearAttachment *clear_att, + const VkClearRect *clear_rect) +{ + struct radv_device *device = cmd_buffer->device; + struct radv_meta_state *meta_state = &device->meta_state; + const struct radv_subpass *subpass = cmd_buffer->state.subpass; + const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; + const uint32_t pass_att = subpass->depth_stencil_attachment.attachment; + VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil; + VkImageAspectFlags aspects = clear_att->aspectMask; + const struct radv_image_view *iview = fb->attachments[pass_att].attachment; + const uint32_t samples = iview->image->samples; + const uint32_t samples_log2 = ffs(samples) - 1; + VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + uint32_t offset; + + assert(aspects == VK_IMAGE_ASPECT_DEPTH_BIT || + aspects == VK_IMAGE_ASPECT_STENCIL_BIT || + aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + assert(pass_att != VK_ATTACHMENT_UNUSED); + + const struct depthstencil_clear_vattrs vertex_data[3] = { + { + .position = { + clear_rect->rect.offset.x, + clear_rect->rect.offset.y, + }, + .depth_clear = clear_value.depth, + }, + { + .position = { + clear_rect->rect.offset.x, + clear_rect->rect.offset.y + clear_rect->rect.extent.height, + }, + .depth_clear = clear_value.depth, + }, + { + .position = { + clear_rect->rect.offset.x + clear_rect->rect.extent.width, + clear_rect->rect.offset.y, + }, + .depth_clear = clear_value.depth, + }, + }; + + radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset); + struct radv_buffer vertex_buffer = { + .device = device, + .size = sizeof(vertex_data), + .bo = cmd_buffer->upload.upload_bo, + .offset = offset, + }; + + if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + RADV_CALL(CmdSetStencilReference)(cmd_buffer_h, VK_STENCIL_FACE_FRONT_BIT, + clear_value.stencil); + } + + RADV_CALL(CmdBindVertexBuffers)(cmd_buffer_h, 0, 1, + (VkBuffer[]) { radv_buffer_to_handle(&vertex_buffer) }, + (VkDeviceSize[]) { 0 }); + + struct radv_pipeline *pipeline = pick_depthstencil_pipeline(meta_state, + iview, + samples_log2, + aspects, + subpass->depth_stencil_attachment.layout, + clear_rect, + clear_value); + if (cmd_buffer->state.pipeline != pipeline) { + RADV_CALL(CmdBindPipeline)(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS, + radv_pipeline_to_handle(pipeline)); + } + + if (depth_view_can_fast_clear(iview, subpass->depth_stencil_attachment.layout, clear_rect)) + radv_set_depth_clear_regs(cmd_buffer, iview->image, clear_value, aspects); + + RADV_CALL(CmdDraw)(cmd_buffer_h, 3, 1, 0, 0); +} + + +static VkFormat pipeline_formats[] = { + VK_FORMAT_R8G8B8A8_UNORM, + VK_FORMAT_R8G8B8A8_UINT, + VK_FORMAT_R8G8B8A8_SINT, + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SINT, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R32G32B32A32_SFLOAT +}; + +VkResult +radv_device_init_meta_clear_state(struct radv_device *device) +{ + VkResult res; + struct radv_meta_state *state = &device->meta_state; + + memset(&device->meta_state.clear, 0, sizeof(device->meta_state.clear)); + + for (uint32_t i = 0; i < ARRAY_SIZE(state->clear); ++i) { + uint32_t samples = 1 << i; + for (uint32_t j = 0; j < ARRAY_SIZE(pipeline_formats); ++j) { + VkFormat format = pipeline_formats[j]; + unsigned fs_key = radv_format_meta_fs_key(format); + assert(!state->clear[i].color_pipelines[fs_key]); + res = create_color_pipeline(device, format, samples, 0, 
&state->clear[i].color_pipelines[fs_key], + &state->clear[i].render_pass[fs_key]); + if (res != VK_SUCCESS) + goto fail; + + } + + for (uint32_t j = 0; j < NUM_DEPTH_CLEAR_PIPELINES; j++) { + res = create_depthstencil_pipeline(device, + VK_IMAGE_ASPECT_DEPTH_BIT, + samples, + j, + &state->clear[i].depth_only_pipeline[j], + &state->clear[i].depth_only_rp[j]); + if (res != VK_SUCCESS) + goto fail; + + res = create_depthstencil_pipeline(device, + VK_IMAGE_ASPECT_STENCIL_BIT, + samples, + j, + &state->clear[i].stencil_only_pipeline[j], + &state->clear[i].stencil_only_rp[j]); + if (res != VK_SUCCESS) + goto fail; + + res = create_depthstencil_pipeline(device, + VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT, + samples, + j, + &state->clear[i].depthstencil_pipeline[j], + &state->clear[i].depthstencil_rp[j]); + if (res != VK_SUCCESS) + goto fail; + } + } + return VK_SUCCESS; + +fail: + radv_device_finish_meta_clear_state(device); + return res; +} + +static bool +emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer, + const VkClearAttachment *clear_att, + const VkClearRect *clear_rect) +{ + const struct radv_subpass *subpass = cmd_buffer->state.subpass; + const uint32_t subpass_att = clear_att->colorAttachment; + const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment; + VkImageLayout image_layout = subpass->color_attachments[subpass_att].layout; + const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; + const struct radv_image_view *iview = fb->attachments[pass_att].attachment; + VkClearColorValue clear_value = clear_att->clearValue.color; + uint32_t clear_color[2]; + bool ret; + + if (!iview->image->cmask.size && !iview->image->surface.dcc_size) + return false; + + if (!cmd_buffer->device->allow_fast_clears) + return false; + + if (!radv_layout_has_cmask(iview->image, image_layout)) + goto fail; + if (vk_format_get_blocksizebits(iview->image->vk_format) > 64) + goto fail; + + /* don't fast clear 3D */ + if (iview->image->type == VK_IMAGE_TYPE_3D) + goto fail; + + /* all layers are bound */ + if (iview->base_layer > 0) + goto fail; + if (iview->image->array_size != iview->layer_count) + goto fail; + + if (iview->image->levels > 1) + goto fail; + + if (iview->image->surface.level[0].mode < RADEON_SURF_MODE_1D) + goto fail; + + if (memcmp(&iview->extent, &iview->image->extent, sizeof(iview->extent))) + goto fail; + + if (clear_rect->rect.offset.x || clear_rect->rect.offset.y || + clear_rect->rect.extent.width != iview->image->extent.width || + clear_rect->rect.extent.height != iview->image->extent.height) + goto fail; + + if (clear_rect->baseArrayLayer != 0) + goto fail; + if (clear_rect->layerCount != iview->image->array_size) + goto fail; + + /* DCC */ + ret = radv_format_pack_clear_color(iview->image->vk_format, + clear_color, &clear_value); + if (ret == false) + goto fail; + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; + si_emit_cache_flush(cmd_buffer); + /* clear cmask buffer */ + if (iview->image->surface.dcc_size) { + radv_fill_buffer(cmd_buffer, iview->image->bo, + iview->image->offset + iview->image->dcc_offset, + iview->image->surface.dcc_size, 0x20202020); + } else { + radv_fill_buffer(cmd_buffer, iview->image->bo, + iview->image->offset + iview->image->cmask.offset, + iview->image->cmask.size, 0); + } + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2; + + radv_set_color_clear_regs(cmd_buffer, 
iview->image, subpass_att, clear_color); + + return true; +fail: + return false; +} + +/** + * The parameters mean that same as those in vkCmdClearAttachments. + */ +static void +emit_clear(struct radv_cmd_buffer *cmd_buffer, + const VkClearAttachment *clear_att, + const VkClearRect *clear_rect) +{ + if (clear_att->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + + if (!emit_fast_color_clear(cmd_buffer, clear_att, clear_rect)) + emit_color_clear(cmd_buffer, clear_att, clear_rect); + } else { + assert(clear_att->aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + emit_depthstencil_clear(cmd_buffer, clear_att, clear_rect); + } +} + +static bool +subpass_needs_clear(const struct radv_cmd_buffer *cmd_buffer) +{ + const struct radv_cmd_state *cmd_state = &cmd_buffer->state; + uint32_t ds; + + if (!cmd_state->subpass) + return false; + ds = cmd_state->subpass->depth_stencil_attachment.attachment; + for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) { + uint32_t a = cmd_state->subpass->color_attachments[i].attachment; + if (cmd_state->attachments[a].pending_clear_aspects) { + return true; + } + } + + if (ds != VK_ATTACHMENT_UNUSED && + cmd_state->attachments[ds].pending_clear_aspects) { + return true; + } + + return false; +} + +/** + * Emit any pending attachment clears for the current subpass. + * + * @see radv_attachment_state::pending_clear_aspects + */ +void +radv_cmd_buffer_clear_subpass(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_cmd_state *cmd_state = &cmd_buffer->state; + struct radv_meta_saved_state saved_state; + + if (!subpass_needs_clear(cmd_buffer)) + return; + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + if (cmd_state->framebuffer->layers > 1) + radv_finishme("clearing multi-layer framebuffer"); + + VkClearRect clear_rect = { + .rect = cmd_state->render_area, + .baseArrayLayer = 0, + .layerCount = 1, /* FINISHME: clear multi-layer framebuffer */ + }; + + for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) { + uint32_t a = cmd_state->subpass->color_attachments[i].attachment; + + if (!cmd_state->attachments[a].pending_clear_aspects) + continue; + + assert(cmd_state->attachments[a].pending_clear_aspects == + VK_IMAGE_ASPECT_COLOR_BIT); + + VkClearAttachment clear_att = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .colorAttachment = i, /* Use attachment index relative to subpass */ + .clearValue = cmd_state->attachments[a].clear_value, + }; + + emit_clear(cmd_buffer, &clear_att, &clear_rect); + cmd_state->attachments[a].pending_clear_aspects = 0; + } + + uint32_t ds = cmd_state->subpass->depth_stencil_attachment.attachment; + + if (ds != VK_ATTACHMENT_UNUSED) { + + if (cmd_state->attachments[ds].pending_clear_aspects) { + + VkClearAttachment clear_att = { + .aspectMask = cmd_state->attachments[ds].pending_clear_aspects, + .clearValue = cmd_state->attachments[ds].clear_value, + }; + + emit_clear(cmd_buffer, &clear_att, &clear_rect); + cmd_state->attachments[ds].pending_clear_aspects = 0; + } + } + + radv_meta_restore(&saved_state, cmd_buffer); +} + +static void +radv_cmd_clear_image(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageLayout image_layout, + const VkClearValue *clear_value, + uint32_t range_count, + const VkImageSubresourceRange *ranges) +{ + VkDevice device_h = radv_device_to_handle(cmd_buffer->device); + VkFormat format = image->vk_format; + VkClearValue internal_clear_value = *clear_value; + + if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) { + uint32_t value; + 
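+ /* E5B9G9R9 cannot be rendered to directly, so pack the requested color
+  * into the shared-exponent layout on the CPU and clear the image through
+  * an R32_UINT view instead.
+  */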
format = VK_FORMAT_R32_UINT; + value = float3_to_rgb9e5(clear_value->color.float32); + internal_clear_value.color.uint32[0] = value; + } + + for (uint32_t r = 0; r < range_count; r++) { + const VkImageSubresourceRange *range = &ranges[r]; + for (uint32_t l = 0; l < radv_get_levelCount(image, range); ++l) { + const uint32_t layer_count = image->type == VK_IMAGE_TYPE_3D ? + radv_minify(image->extent.depth, l) : + radv_get_layerCount(image, range); + for (uint32_t s = 0; s < layer_count; ++s) { + struct radv_image_view iview; + radv_image_view_init(&iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(image), + .viewType = radv_meta_get_view_type(image), + .format = format, + .subresourceRange = { + .aspectMask = range->aspectMask, + .baseMipLevel = range->baseMipLevel + l, + .levelCount = 1, + .baseArrayLayer = range->baseArrayLayer + s, + .layerCount = 1 + }, + }, + cmd_buffer, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); + + VkFramebuffer fb; + radv_CreateFramebuffer(device_h, + &(VkFramebufferCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkImageView[]) { + radv_image_view_to_handle(&iview), + }, + .width = iview.extent.width, + .height = iview.extent.height, + .layers = 1 + }, + &cmd_buffer->pool->alloc, + &fb); + + VkAttachmentDescription att_desc = { + .format = iview.vk_format, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = image_layout, + .finalLayout = image_layout, + }; + + VkSubpassDescription subpass_desc = { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 0, + .pColorAttachments = NULL, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = NULL, + .preserveAttachmentCount = 0, + .pPreserveAttachments = NULL, + }; + + const VkAttachmentReference att_ref = { + .attachment = 0, + .layout = image_layout, + }; + + if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + subpass_desc.colorAttachmentCount = 1; + subpass_desc.pColorAttachments = &att_ref; + } else { + subpass_desc.pDepthStencilAttachment = &att_ref; + } + + VkRenderPass pass; + radv_CreateRenderPass(device_h, + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &att_desc, + .subpassCount = 1, + .pSubpasses = &subpass_desc, + }, + &cmd_buffer->pool->alloc, + &pass); + + RADV_CALL(CmdBeginRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderArea = { + .offset = { 0, 0, }, + .extent = { + .width = iview.extent.width, + .height = iview.extent.height, + }, + }, + .renderPass = pass, + .framebuffer = fb, + .clearValueCount = 0, + .pClearValues = NULL, + }, + VK_SUBPASS_CONTENTS_INLINE); + + VkClearAttachment clear_att = { + .aspectMask = range->aspectMask, + .colorAttachment = 0, + .clearValue = internal_clear_value, + }; + + VkClearRect clear_rect = { + .rect = { + .offset = { 0, 0 }, + .extent = { iview.extent.width, iview.extent.height }, + }, + .baseArrayLayer = range->baseArrayLayer, + .layerCount = 1, /* FINISHME: clear multi-layer framebuffer */ + }; + + emit_clear(cmd_buffer, &clear_att, &clear_rect); + + RADV_CALL(CmdEndRenderPass)(radv_cmd_buffer_to_handle(cmd_buffer)); + 
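[Illustrative aside, not part of the patch] The per-level loop above derives how many slices to clear at each mip: a 3D image contributes the minified depth of that level, while an array image contributes the subresource range's layer count. The standalone sketch below illustrates that arithmetic; the local minify() is an assumption about what radv_minify does, and the image sizes are invented for the example.

/* Illustrative only: assumed behaviour of radv_minify and the per-level
 * slice count used by radv_cmd_clear_image. */
#include <stdio.h>

static unsigned minify(unsigned value, unsigned level)
{
	unsigned v = value >> level;
	return v ? v : 1;	/* never minify below 1 */
}

int main(void)
{
	const unsigned depth_3d = 32;		/* hypothetical 3D image depth  */
	const unsigned array_layers = 6;	/* hypothetical 2D array layers */

	for (unsigned level = 0; level < 4; level++) {
		unsigned slices_3d = minify(depth_3d, level);
		unsigned slices_array = array_layers;	/* constant per level */
		printf("level %u: 3D clears %u slices, 2D array clears %u layers\n",
		       level, slices_3d, slices_array);
	}
	return 0;
}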
RADV_CALL(DestroyRenderPass)(device_h, pass, + &cmd_buffer->pool->alloc); + RADV_CALL(DestroyFramebuffer)(device_h, fb, + &cmd_buffer->pool->alloc); + } + } + } +} + +void radv_CmdClearColorImage( + VkCommandBuffer commandBuffer, + VkImage image_h, + VkImageLayout imageLayout, + const VkClearColorValue* pColor, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_image, image, image_h); + struct radv_meta_saved_state saved_state; + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + radv_cmd_clear_image(cmd_buffer, image, imageLayout, + (const VkClearValue *) pColor, + rangeCount, pRanges); + + radv_meta_restore(&saved_state, cmd_buffer); +} + +void radv_CmdClearDepthStencilImage( + VkCommandBuffer commandBuffer, + VkImage image_h, + VkImageLayout imageLayout, + const VkClearDepthStencilValue* pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_image, image, image_h); + struct radv_meta_saved_state saved_state; + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + radv_cmd_clear_image(cmd_buffer, image, imageLayout, + (const VkClearValue *) pDepthStencil, + rangeCount, pRanges); + + radv_meta_restore(&saved_state, cmd_buffer); +} + +void radv_CmdClearAttachments( + VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkClearAttachment* pAttachments, + uint32_t rectCount, + const VkClearRect* pRects) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_meta_saved_state saved_state; + + if (!cmd_buffer->state.subpass) + return; + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + /* FINISHME: We can do better than this dumb loop. It thrashes too much + * state. + */ + for (uint32_t a = 0; a < attachmentCount; ++a) { + for (uint32_t r = 0; r < rectCount; ++r) { + emit_clear(cmd_buffer, &pAttachments[a], &pRects[r]); + } + } + + radv_meta_restore(&saved_state, cmd_buffer); +} diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c new file mode 100644 index 00000000000..4c01eb7acbb --- /dev/null +++ b/src/amd/vulkan/radv_meta_copy.c @@ -0,0 +1,399 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "radv_meta.h" +#include "vk_format.h" + +static VkExtent3D +meta_image_block_size(const struct radv_image *image) +{ + const struct vk_format_description *desc = vk_format_description(image->vk_format); + return (VkExtent3D) { desc->block.width, desc->block.height, 1 }; +} + +/* Returns the user-provided VkBufferImageCopy::imageExtent in units of + * elements rather than texels. One element equals one texel or one block + * if Image is uncompressed or compressed, respectively. + */ +static struct VkExtent3D +meta_region_extent_el(const struct radv_image *image, + const struct VkExtent3D *extent) +{ + const VkExtent3D block = meta_image_block_size(image); + return radv_sanitize_image_extent(image->type, (VkExtent3D) { + .width = DIV_ROUND_UP(extent->width , block.width), + .height = DIV_ROUND_UP(extent->height, block.height), + .depth = DIV_ROUND_UP(extent->depth , block.depth), + }); +} + +/* Returns the user-provided VkBufferImageCopy::imageOffset in units of + * elements rather than texels. One element equals one texel or one block + * if Image is uncompressed or compressed, respectively. + */ +static struct VkOffset3D +meta_region_offset_el(const struct radv_image *image, + const struct VkOffset3D *offset) +{ + const VkExtent3D block = meta_image_block_size(image); + return radv_sanitize_image_offset(image->type, (VkOffset3D) { + .x = offset->x / block.width, + .y = offset->y / block.height, + .z = offset->z / block.depth, + }); +} + +static VkFormat +vk_format_for_size(int bs) +{ + switch (bs) { + case 1: return VK_FORMAT_R8_UINT; + case 2: return VK_FORMAT_R8G8_UINT; + case 4: return VK_FORMAT_R8G8B8A8_UINT; + case 8: return VK_FORMAT_R16G16B16A16_UINT; + case 16: return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("Invalid format block size"); + } +} + +static struct radv_meta_blit2d_surf +blit_surf_for_image_level_layer(struct radv_image* image, VkImageAspectFlags aspectMask, + int level, int layer) +{ + VkFormat format = image->vk_format; + if (aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) + format = vk_format_depth_only(format); + else if (aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) + format = vk_format_stencil_only(format); + + if (!image->surface.dcc_size) + format = vk_format_for_size(vk_format_get_blocksize(format)); + + return (struct radv_meta_blit2d_surf) { + .format = format, + .bs = vk_format_get_blocksize(format), + .level = level, + .layer = layer, + .image = image, + .aspect_mask = aspectMask, + }; +} + +static void +meta_copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer, + struct radv_buffer* buffer, + struct radv_image* image, + uint32_t regionCount, + const VkBufferImageCopy* pRegions) +{ + struct radv_meta_saved_state saved_state; + + /* The Vulkan 1.0 spec says "dstImage must have a sample count equal to + * VK_SAMPLE_COUNT_1_BIT." + */ + assert(image->samples == 1); + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + for (unsigned r = 0; r < regionCount; r++) { + + /** + * From the Vulkan 1.0.6 spec: 18.3 Copying Data Between Images + * extent is the size in texels of the source image to copy in width, + * height and depth. 1D images use only x and width. 2D images use x, y, + * width and height. 3D images use x, y, z, width, height and depth. + * + * + * Also, convert the offsets and extent from units of texels to units of + * blocks - which is the highest resolution accessible in this command. 
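[Illustrative aside, not part of the patch] The element helpers near the top of this file (meta_region_offset_el / meta_region_extent_el) implement the texel-to-block conversion described in the comment above: offsets divide exactly by the block size, extents round up so partial blocks are preserved. Below is a self-contained sketch of that arithmetic under an assumed 4x4 block size; DIV_ROUND_UP is redefined locally so the snippet compiles on its own.

/* Sketch of the texel -> block ("element") conversion.
 * The 4x4 block size is an assumption (e.g. a BC-style compressed format). */
#include <stdio.h>

#define DIV_ROUND_UP(a, b)  (((a) + (b) - 1) / (b))

int main(void)
{
	const unsigned block_w = 4, block_h = 4;	/* assumed block size */
	const unsigned off_x = 8, off_y = 12;		/* texel offset (block aligned) */
	const unsigned ext_w = 67, ext_h = 33;		/* texel extent */

	printf("offset in blocks: %u,%u\n", off_x / block_w, off_y / block_h);
	printf("extent in blocks: %ux%u\n",
	       DIV_ROUND_UP(ext_w, block_w), DIV_ROUND_UP(ext_h, block_h));
	return 0;
}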
+ */ + const VkOffset3D img_offset_el = + meta_region_offset_el(image, &pRegions[r].imageOffset); + const VkExtent3D bufferExtent = { + .width = pRegions[r].bufferRowLength ? + pRegions[r].bufferRowLength : pRegions[r].imageExtent.width, + .height = pRegions[r].bufferImageHeight ? + pRegions[r].bufferImageHeight : pRegions[r].imageExtent.height, + }; + const VkExtent3D buf_extent_el = + meta_region_extent_el(image, &bufferExtent); + + /* Start creating blit rect */ + const VkExtent3D img_extent_el = + meta_region_extent_el(image, &pRegions[r].imageExtent); + struct radv_meta_blit2d_rect rect = { + .width = img_extent_el.width, + .height = img_extent_el.height, + }; + + /* Create blit surfaces */ + struct radv_meta_blit2d_surf img_bsurf = + blit_surf_for_image_level_layer(image, + pRegions[r].imageSubresource.aspectMask, + pRegions[r].imageSubresource.mipLevel, + pRegions[r].imageSubresource.baseArrayLayer); + + struct radv_meta_blit2d_buffer buf_bsurf = { + .bs = img_bsurf.bs, + .format = img_bsurf.format, + .buffer = buffer, + .offset = pRegions[r].bufferOffset, + .pitch = buf_extent_el.width, + }; + + /* Loop through each 3D or array slice */ + unsigned num_slices_3d = img_extent_el.depth; + unsigned num_slices_array = pRegions[r].imageSubresource.layerCount; + unsigned slice_3d = 0; + unsigned slice_array = 0; + while (slice_3d < num_slices_3d && slice_array < num_slices_array) { + + rect.dst_x = img_offset_el.x; + rect.dst_y = img_offset_el.y; + + + /* Perform Blit */ + radv_meta_blit2d(cmd_buffer, NULL, &buf_bsurf, &img_bsurf, 1, &rect); + + /* Once we've done the blit, all of the actual information about + * the image is embedded in the command buffer so we can just + * increment the offset directly in the image effectively + * re-binding it to different backing memory. + */ + buf_bsurf.offset += buf_extent_el.width * + buf_extent_el.height * buf_bsurf.bs; + img_bsurf.layer++; + if (image->type == VK_IMAGE_TYPE_3D) + slice_3d++; + else + slice_array++; + } + } + radv_meta_restore(&saved_state, cmd_buffer); +} + +void radv_CmdCopyBufferToImage( + VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkImage destImage, + VkImageLayout destImageLayout, + uint32_t regionCount, + const VkBufferImageCopy* pRegions) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_image, dest_image, destImage); + RADV_FROM_HANDLE(radv_buffer, src_buffer, srcBuffer); + + meta_copy_buffer_to_image(cmd_buffer, src_buffer, dest_image, + regionCount, pRegions); +} + +static void +meta_copy_image_to_buffer(struct radv_cmd_buffer *cmd_buffer, + struct radv_buffer* buffer, + struct radv_image* image, + uint32_t regionCount, + const VkBufferImageCopy* pRegions) +{ + struct radv_meta_saved_compute_state saved_state; + + radv_meta_begin_bufimage(cmd_buffer, &saved_state); + for (unsigned r = 0; r < regionCount; r++) { + + /** + * From the Vulkan 1.0.6 spec: 18.3 Copying Data Between Images + * extent is the size in texels of the source image to copy in width, + * height and depth. 1D images use only x and width. 2D images use x, y, + * width and height. 3D images use x, y, z, width, height and depth. + * + * + * Also, convert the offsets and extent from units of texels to units of + * blocks - which is the highest resolution accessible in this command. + */ + const VkOffset3D img_offset_el = + meta_region_offset_el(image, &pRegions[r].imageOffset); + const VkExtent3D bufferExtent = { + .width = pRegions[r].bufferRowLength ? 
+ pRegions[r].bufferRowLength : pRegions[r].imageExtent.width, + .height = pRegions[r].bufferImageHeight ? + pRegions[r].bufferImageHeight : pRegions[r].imageExtent.height, + }; + const VkExtent3D buf_extent_el = + meta_region_extent_el(image, &bufferExtent); + + /* Start creating blit rect */ + const VkExtent3D img_extent_el = + meta_region_extent_el(image, &pRegions[r].imageExtent); + struct radv_meta_blit2d_rect rect = { + .width = img_extent_el.width, + .height = img_extent_el.height, + }; + + /* Create blit surfaces */ + struct radv_meta_blit2d_surf img_info = + blit_surf_for_image_level_layer(image, + pRegions[r].imageSubresource.aspectMask, + pRegions[r].imageSubresource.mipLevel, + pRegions[r].imageSubresource.baseArrayLayer); + struct radv_meta_blit2d_buffer buf_info = { + .bs = img_info.bs, + .format = img_info.format, + .buffer = buffer, + .offset = pRegions[r].bufferOffset, + .pitch = buf_extent_el.width, + }; + + /* Loop through each 3D or array slice */ + unsigned num_slices_3d = img_extent_el.depth; + unsigned num_slices_array = pRegions[r].imageSubresource.layerCount; + unsigned slice_3d = 0; + unsigned slice_array = 0; + while (slice_3d < num_slices_3d && slice_array < num_slices_array) { + + rect.src_x = img_offset_el.x; + rect.src_y = img_offset_el.y; + + + /* Perform Blit */ + radv_meta_image_to_buffer(cmd_buffer, &img_info, &buf_info, 1, &rect); + + buf_info.offset += buf_extent_el.width * + buf_extent_el.height * buf_info.bs; + img_info.layer++; + if (image->type == VK_IMAGE_TYPE_3D) + slice_3d++; + else + slice_array++; + } + } + radv_meta_end_bufimage(cmd_buffer, &saved_state); +} + +void radv_CmdCopyImageToBuffer( + VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkBuffer destBuffer, + uint32_t regionCount, + const VkBufferImageCopy* pRegions) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_image, src_image, srcImage); + RADV_FROM_HANDLE(radv_buffer, dst_buffer, destBuffer); + + meta_copy_image_to_buffer(cmd_buffer, dst_buffer, src_image, + regionCount, pRegions); +} + +void radv_CmdCopyImage( + VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage destImage, + VkImageLayout destImageLayout, + uint32_t regionCount, + const VkImageCopy* pRegions) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_image, src_image, srcImage); + RADV_FROM_HANDLE(radv_image, dest_image, destImage); + struct radv_meta_saved_state saved_state; + + /* From the Vulkan 1.0 spec: + * + * vkCmdCopyImage can be used to copy image data between multisample + * images, but both images must have the same number of samples. 
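[Illustrative aside, not part of the patch] Both buffer copy paths above treat a bufferRowLength or bufferImageHeight of zero as "tightly packed", falling back to the image extent, and advance the buffer offset by one full slice per 3D or array layer. A small sketch of that addressing, with invented sizes and a made-up element size, follows.

/* Sketch of the staging-buffer addressing used by the copy loops above.
 * 'bs' is the per-element byte size; all values are for illustration. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t buffer_row_length = 0;	/* 0 => rows tightly packed   */
	const uint32_t buffer_image_height = 0;	/* 0 => slices tightly packed */
	const uint32_t extent_w = 64, extent_h = 64;	/* copy extent in elements */
	const uint32_t bs = 4;			/* e.g. 4 bytes per element    */

	const uint32_t pitch  = buffer_row_length ? buffer_row_length : extent_w;
	const uint32_t height = buffer_image_height ? buffer_image_height : extent_h;

	/* Per-slice advance applied when walking 3D depth or array layers. */
	uint64_t slice_stride = (uint64_t)pitch * height * bs;
	printf("row pitch: %u elements, slice stride: %llu bytes\n",
	       pitch, (unsigned long long)slice_stride);
	return 0;
}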
+ */ + assert(src_image->samples == dest_image->samples); + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + for (unsigned r = 0; r < regionCount; r++) { + assert(pRegions[r].srcSubresource.aspectMask == + pRegions[r].dstSubresource.aspectMask); + + /* Create blit surfaces */ + struct radv_meta_blit2d_surf b_src = + blit_surf_for_image_level_layer(src_image, + pRegions[r].srcSubresource.aspectMask, + pRegions[r].srcSubresource.mipLevel, + pRegions[r].srcSubresource.baseArrayLayer); + struct radv_meta_blit2d_surf b_dst = + blit_surf_for_image_level_layer(dest_image, + pRegions[r].dstSubresource.aspectMask, + pRegions[r].dstSubresource.mipLevel, + pRegions[r].dstSubresource.baseArrayLayer); + + /* for DCC */ + b_src.format = b_dst.format; + + /** + * From the Vulkan 1.0.6 spec: 18.4 Copying Data Between Buffers and Images + * imageExtent is the size in texels of the image to copy in width, height + * and depth. 1D images use only x and width. 2D images use x, y, width + * and height. 3D images use x, y, z, width, height and depth. + * + * Also, convert the offsets and extent from units of texels to units of + * blocks - which is the highest resolution accessible in this command. + */ + const VkOffset3D dst_offset_el = + meta_region_offset_el(dest_image, &pRegions[r].dstOffset); + const VkOffset3D src_offset_el = + meta_region_offset_el(src_image, &pRegions[r].srcOffset); + const VkExtent3D img_extent_el = + meta_region_extent_el(src_image, &pRegions[r].extent); + + /* Start creating blit rect */ + struct radv_meta_blit2d_rect rect = { + .width = img_extent_el.width, + .height = img_extent_el.height, + }; + + /* Loop through each 3D or array slice */ + unsigned num_slices_3d = img_extent_el.depth; + unsigned num_slices_array = pRegions[r].dstSubresource.layerCount; + unsigned slice_3d = 0; + unsigned slice_array = 0; + while (slice_3d < num_slices_3d && slice_array < num_slices_array) { + + /* Finish creating blit rect */ + rect.dst_x = dst_offset_el.x; + rect.dst_y = dst_offset_el.y; + rect.src_x = src_offset_el.x; + rect.src_y = src_offset_el.y; + + /* Perform Blit */ + radv_meta_blit2d(cmd_buffer, &b_src, NULL, &b_dst, 1, &rect); + + b_src.layer++; + b_dst.layer++; + if (dest_image->type == VK_IMAGE_TYPE_3D) + slice_3d++; + else + slice_array++; + } + } + + radv_meta_restore(&saved_state, cmd_buffer); +} diff --git a/src/amd/vulkan/radv_meta_decompress.c b/src/amd/vulkan/radv_meta_decompress.c new file mode 100644 index 00000000000..498cc239bde --- /dev/null +++ b/src/amd/vulkan/radv_meta_decompress.c @@ -0,0 +1,463 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "radv_meta.h" +#include "radv_private.h" +#include "nir/nir_builder.h" +#include "sid.h" +/** + * Vertex attributes used by all pipelines. + */ +struct vertex_attrs { + float position[2]; /**< 3DPRIM_RECTLIST */ +}; + +/* passthrough vertex shader */ +static nir_shader * +build_nir_vs(void) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + + nir_builder b; + nir_variable *a_position; + nir_variable *v_position; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_depth_decomp_vs"); + + a_position = nir_variable_create(b.shader, nir_var_shader_in, vec4, + "a_position"); + a_position->data.location = VERT_ATTRIB_GENERIC0; + + v_position = nir_variable_create(b.shader, nir_var_shader_out, vec4, + "gl_Position"); + v_position->data.location = VARYING_SLOT_POS; + + nir_copy_var(&b, v_position, a_position); + + return b.shader; +} + +/* simple passthrough shader */ +static nir_shader * +build_nir_fs(void) +{ + nir_builder b; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + b.shader->info.name = ralloc_asprintf(b.shader, + "meta_depth_decomp_noop_fs"); + + return b.shader; +} + +static VkResult +create_pass(struct radv_device *device) +{ + VkResult result; + VkDevice device_h = radv_device_to_handle(device); + const VkAllocationCallbacks *alloc = &device->meta_state.alloc; + VkAttachmentDescription attachment; + + attachment.format = VK_FORMAT_UNDEFINED; + attachment.samples = 1; + attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachment.initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + attachment.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + result = radv_CreateRenderPass(device_h, + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &attachment, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 0, + .pColorAttachments = NULL, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, + }, + .preserveAttachmentCount = 0, + .pPreserveAttachments = NULL, + }, + .dependencyCount = 0, + }, + alloc, + &device->meta_state.depth_decomp.pass); + + return result; +} + +static VkResult +create_pipeline(struct radv_device *device, + VkShaderModule vs_module_h) +{ + VkResult result; + VkDevice device_h = radv_device_to_handle(device); + + struct radv_shader_module fs_module = { + .nir = build_nir_fs(), + }; + + if (!fs_module.nir) { + /* XXX: Need more accurate error */ + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto cleanup; + } + + const VkGraphicsPipelineCreateInfo pipeline_create_info = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = 2, + .pStages = (VkPipelineShaderStageCreateInfo[]) { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = vs_module_h, + .pName = "main", + }, + { + .sType = 
VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = radv_shader_module_to_handle(&fs_module), + .pName = "main", + }, + }, + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = sizeof(struct vertex_attrs), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 1, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(struct vertex_attrs, position), + }, + }, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .depthClampEnable = false, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = NULL, + .alphaToCoverageEnable = false, + .alphaToOneEnable = false, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .logicOpEnable = false, + .attachmentCount = 0, + .pAttachments = NULL, + }, + .pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = false, + .depthWriteEnable = false, + .depthBoundsTestEnable = false, + .stencilTestEnable = false, + }, + .pDynamicState = NULL, + .renderPass = device->meta_state.depth_decomp.pass, + .subpass = 0, + }; + + result = radv_graphics_pipeline_create(device_h, + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &pipeline_create_info, + &(struct radv_graphics_pipeline_create_info) { + .use_rectlist = true, + .db_flush_depth_inplace = true, + .db_flush_stencil_inplace = true, + }, + &device->meta_state.alloc, + &device->meta_state.depth_decomp.decompress_pipeline); + if (result != VK_SUCCESS) + goto cleanup; + + result = radv_graphics_pipeline_create(device_h, + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &pipeline_create_info, + &(struct radv_graphics_pipeline_create_info) { + .use_rectlist = true, + .db_flush_depth_inplace = true, + .db_flush_stencil_inplace = true, + .db_resummarize = true, + }, + &device->meta_state.alloc, + &device->meta_state.depth_decomp.resummarize_pipeline); + if (result != VK_SUCCESS) + goto cleanup; + + goto cleanup; + +cleanup: + ralloc_free(fs_module.nir); + return result; +} + +void +radv_device_finish_meta_depth_decomp_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; + VkDevice device_h = radv_device_to_handle(device); + VkRenderPass 
pass_h = device->meta_state.depth_decomp.pass; + const VkAllocationCallbacks *alloc = &device->meta_state.alloc; + + if (pass_h) + RADV_CALL(DestroyRenderPass)(device_h, pass_h, + &device->meta_state.alloc); + + VkPipeline pipeline_h = state->depth_decomp.decompress_pipeline; + if (pipeline_h) { + RADV_CALL(DestroyPipeline)(device_h, pipeline_h, alloc); + } + pipeline_h = state->depth_decomp.resummarize_pipeline; + if (pipeline_h) { + RADV_CALL(DestroyPipeline)(device_h, pipeline_h, alloc); + } +} + +VkResult +radv_device_init_meta_depth_decomp_state(struct radv_device *device) +{ + VkResult res = VK_SUCCESS; + + zero(device->meta_state.depth_decomp); + + struct radv_shader_module vs_module = { .nir = build_nir_vs() }; + if (!vs_module.nir) { + /* XXX: Need more accurate error */ + res = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + res = create_pass(device); + if (res != VK_SUCCESS) + goto fail; + + VkShaderModule vs_module_h = radv_shader_module_to_handle(&vs_module); + res = create_pipeline(device, vs_module_h); + if (res != VK_SUCCESS) + goto fail; + + goto cleanup; + +fail: + radv_device_finish_meta_depth_decomp_state(device); + +cleanup: + ralloc_free(vs_module.nir); + + return res; +} + +static void +emit_depth_decomp(struct radv_cmd_buffer *cmd_buffer, + const VkOffset2D *dest_offset, + const VkExtent2D *depth_decomp_extent, + VkPipeline pipeline_h) +{ + struct radv_device *device = cmd_buffer->device; + VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + uint32_t offset; + const struct vertex_attrs vertex_data[3] = { + { + .position = { + dest_offset->x, + dest_offset->y, + }, + }, + { + .position = { + dest_offset->x, + dest_offset->y + depth_decomp_extent->height, + }, + }, + { + .position = { + dest_offset->x + depth_decomp_extent->width, + dest_offset->y, + }, + }, + }; + + radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset); + struct radv_buffer vertex_buffer = { + .device = device, + .size = sizeof(vertex_data), + .bo = cmd_buffer->upload.upload_bo, + .offset = offset, + }; + + VkBuffer vertex_buffer_h = radv_buffer_to_handle(&vertex_buffer); + + radv_CmdBindVertexBuffers(cmd_buffer_h, + /*firstBinding*/ 0, + /*bindingCount*/ 1, + (VkBuffer[]) { vertex_buffer_h }, + (VkDeviceSize[]) { 0 }); + + RADV_FROM_HANDLE(radv_pipeline, pipeline, pipeline_h); + + if (cmd_buffer->state.pipeline != pipeline) { + radv_CmdBindPipeline(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline_h); + } + + RADV_CALL(CmdDraw)(cmd_buffer_h, 3, 1, 0, 0); +} + + +static void radv_process_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageSubresourceRange *subresourceRange, + VkPipeline pipeline_h) +{ + struct radv_meta_saved_state saved_state; + struct radv_meta_saved_pass_state saved_pass_state; + VkDevice device_h = radv_device_to_handle(cmd_buffer->device); + VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + uint32_t width = radv_minify(image->extent.width, + subresourceRange->baseMipLevel); + uint32_t height = radv_minify(image->extent.height, + subresourceRange->baseMipLevel); + + if (!image->htile.size) + return; + radv_meta_save_pass(&saved_pass_state, cmd_buffer); + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + for (uint32_t layer = 0; layer < subresourceRange->layerCount; layer++) { + struct radv_image_view iview; + + radv_image_view_init(&iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = 
VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(image), + .format = image->vk_format, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, + .baseMipLevel = subresourceRange->baseMipLevel, + .levelCount = 1, + .baseArrayLayer = subresourceRange->baseArrayLayer + layer, + .layerCount = 1, + }, + }, + cmd_buffer, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT); + + + VkFramebuffer fb_h; + radv_CreateFramebuffer(device_h, + &(VkFramebufferCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkImageView[]) { + radv_image_view_to_handle(&iview) + }, + .width = width, + .height = height, + .layers = 1 + }, + &cmd_buffer->pool->alloc, + &fb_h); + + RADV_CALL(CmdBeginRenderPass)(cmd_buffer_h, + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = cmd_buffer->device->meta_state.depth_decomp.pass, + .framebuffer = fb_h, + .renderArea = { + .offset = { + 0, + 0, + }, + .extent = { + width, + height, + } + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, + VK_SUBPASS_CONTENTS_INLINE); + + emit_depth_decomp(cmd_buffer, &(VkOffset2D){0, 0 }, &(VkExtent2D){width, height}, pipeline_h); + RADV_CALL(CmdEndRenderPass)(cmd_buffer_h); + + radv_DestroyFramebuffer(device_h, fb_h, + &cmd_buffer->pool->alloc); + } + radv_meta_restore(&saved_state, cmd_buffer); + radv_meta_restore_pass(&saved_pass_state, cmd_buffer); +} + +void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageSubresourceRange *subresourceRange) +{ + radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange, + cmd_buffer->device->meta_state.depth_decomp.decompress_pipeline); +} + +void radv_resummarize_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkImageSubresourceRange *subresourceRange) +{ + radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange, + cmd_buffer->device->meta_state.depth_decomp.resummarize_pipeline); +} diff --git a/src/amd/vulkan/radv_meta_fast_clear.c b/src/amd/vulkan/radv_meta_fast_clear.c new file mode 100644 index 00000000000..0fb10c7b3b0 --- /dev/null +++ b/src/amd/vulkan/radv_meta_fast_clear.c @@ -0,0 +1,536 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include + +#include "radv_meta.h" +#include "radv_private.h" +#include "nir/nir_builder.h" +#include "sid.h" +/** + * Vertex attributes used by all pipelines. + */ +struct vertex_attrs { + float position[2]; /**< 3DPRIM_RECTLIST */ + float tex_position[2]; +}; + +/* passthrough vertex shader */ +static nir_shader * +build_nir_vs(void) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + + nir_builder b; + nir_variable *a_position; + nir_variable *v_position; + nir_variable *a_tex_position; + nir_variable *v_tex_position; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_fast_clear_vs"); + + a_position = nir_variable_create(b.shader, nir_var_shader_in, vec4, + "a_position"); + a_position->data.location = VERT_ATTRIB_GENERIC0; + + v_position = nir_variable_create(b.shader, nir_var_shader_out, vec4, + "gl_Position"); + v_position->data.location = VARYING_SLOT_POS; + + a_tex_position = nir_variable_create(b.shader, nir_var_shader_in, vec4, + "a_tex_position"); + a_tex_position->data.location = VERT_ATTRIB_GENERIC1; + + v_tex_position = nir_variable_create(b.shader, nir_var_shader_out, vec4, + "v_tex_position"); + v_tex_position->data.location = VARYING_SLOT_VAR0; + + nir_copy_var(&b, v_position, a_position); + nir_copy_var(&b, v_tex_position, a_tex_position); + + return b.shader; +} + +/* simple passthrough shader */ +static nir_shader * +build_nir_fs(void) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_builder b; + nir_variable *v_tex_position; /* vec4, varying texture coordinate */ + nir_variable *f_color; /* vec4, fragment output color */ + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + b.shader->info.name = ralloc_asprintf(b.shader, + "meta_fast_clear_fs"); + + v_tex_position = nir_variable_create(b.shader, nir_var_shader_in, vec4, + "v_tex_position"); + v_tex_position->data.location = VARYING_SLOT_VAR0; + + f_color = nir_variable_create(b.shader, nir_var_shader_out, vec4, + "f_color"); + f_color->data.location = FRAG_RESULT_DATA0; + + nir_copy_var(&b, f_color, v_tex_position); + + return b.shader; +} + +static VkResult +create_pass(struct radv_device *device) +{ + VkResult result; + VkDevice device_h = radv_device_to_handle(device); + const VkAllocationCallbacks *alloc = &device->meta_state.alloc; + VkAttachmentDescription attachment; + + attachment.format = VK_FORMAT_UNDEFINED; + attachment.samples = 1; + attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachment.initialLayout = VK_IMAGE_LAYOUT_GENERAL; + attachment.finalLayout = VK_IMAGE_LAYOUT_GENERAL; + + result = radv_CreateRenderPass(device_h, + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &attachment, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 1, + .pColorAttachments = (VkAttachmentReference[]) { + { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + }, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = VK_ATTACHMENT_UNUSED, + }, + .preserveAttachmentCount = 0, + .pPreserveAttachments = NULL, + }, + .dependencyCount = 0, + }, + alloc, + &device->meta_state.fast_clear_flush.pass); + + return result; +} + +static VkResult +create_pipeline(struct radv_device *device, + 
VkShaderModule vs_module_h) +{ + VkResult result; + VkDevice device_h = radv_device_to_handle(device); + + struct radv_shader_module fs_module = { + .nir = build_nir_fs(), + }; + + if (!fs_module.nir) { + /* XXX: Need more accurate error */ + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto cleanup; + } + + const VkPipelineShaderStageCreateInfo stages[2] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = vs_module_h, + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = radv_shader_module_to_handle(&fs_module), + .pName = "main", + }, + }; + + const VkPipelineVertexInputStateCreateInfo vi_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = sizeof(struct vertex_attrs), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 2, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(struct vertex_attrs, position), + }, + { + /* Texture Coordinate */ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(struct vertex_attrs, tex_position), + }, + } + }; + + const VkPipelineInputAssemblyStateCreateInfo ia_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }; + + const VkPipelineColorBlendStateCreateInfo blend_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .logicOpEnable = false, + .attachmentCount = 1, + .pAttachments = (VkPipelineColorBlendAttachmentState []) { + { + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }, + } + }; + const VkPipelineRasterizationStateCreateInfo rs_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .depthClampEnable = false, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE, + }; + + result = radv_graphics_pipeline_create(device_h, + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &(VkGraphicsPipelineCreateInfo) { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = 2, + .pStages = stages, + + .pVertexInputState = &vi_state, + .pInputAssemblyState = &ia_state, + + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &rs_state, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = NULL, + .alphaToCoverageEnable = false, + .alphaToOneEnable = false, + }, + .pColorBlendState = &blend_state, + .pDynamicState = NULL, + .renderPass = device->meta_state.fast_clear_flush.pass, + .subpass = 0, + }, + &(struct radv_graphics_pipeline_create_info) { + .use_rectlist = true, + .custom_blend_mode = V_028808_CB_ELIMINATE_FAST_CLEAR, + }, + &device->meta_state.alloc, 
+ &device->meta_state.fast_clear_flush.cmask_eliminate_pipeline); + if (result != VK_SUCCESS) + goto cleanup; + + result = radv_graphics_pipeline_create(device_h, + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &(VkGraphicsPipelineCreateInfo) { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = 2, + .pStages = stages, + + .pVertexInputState = &vi_state, + .pInputAssemblyState = &ia_state, + + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + .pRasterizationState = &rs_state, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = NULL, + .alphaToCoverageEnable = false, + .alphaToOneEnable = false, + }, + .pColorBlendState = &blend_state, + .pDynamicState = NULL, + .renderPass = device->meta_state.fast_clear_flush.pass, + .subpass = 0, + }, + &(struct radv_graphics_pipeline_create_info) { + .use_rectlist = true, + .custom_blend_mode = V_028808_CB_FMASK_DECOMPRESS, + }, + &device->meta_state.alloc, + &device->meta_state.fast_clear_flush.fmask_decompress_pipeline); + if (result != VK_SUCCESS) + goto cleanup_cmask; + + goto cleanup; +cleanup_cmask: + RADV_CALL(DestroyPipeline)(device_h, device->meta_state.fast_clear_flush.cmask_eliminate_pipeline, &device->meta_state.alloc); +cleanup: + ralloc_free(fs_module.nir); + return result; +} + +void +radv_device_finish_meta_fast_clear_flush_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; + VkDevice device_h = radv_device_to_handle(device); + VkRenderPass pass_h = device->meta_state.fast_clear_flush.pass; + const VkAllocationCallbacks *alloc = &device->meta_state.alloc; + + if (pass_h) + RADV_CALL(DestroyRenderPass)(device_h, pass_h, + &device->meta_state.alloc); + + VkPipeline pipeline_h = state->fast_clear_flush.cmask_eliminate_pipeline; + if (pipeline_h) { + RADV_CALL(DestroyPipeline)(device_h, pipeline_h, alloc); + } + + pipeline_h = state->fast_clear_flush.fmask_decompress_pipeline; + if (pipeline_h) { + RADV_CALL(DestroyPipeline)(device_h, pipeline_h, alloc); + } +} + +VkResult +radv_device_init_meta_fast_clear_flush_state(struct radv_device *device) +{ + VkResult res = VK_SUCCESS; + + zero(device->meta_state.fast_clear_flush); + + struct radv_shader_module vs_module = { .nir = build_nir_vs() }; + if (!vs_module.nir) { + /* XXX: Need more accurate error */ + res = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + res = create_pass(device); + if (res != VK_SUCCESS) + goto fail; + + VkShaderModule vs_module_h = radv_shader_module_to_handle(&vs_module); + res = create_pipeline(device, vs_module_h); + if (res != VK_SUCCESS) + goto fail; + + goto cleanup; + +fail: + radv_device_finish_meta_fast_clear_flush_state(device); + +cleanup: + ralloc_free(vs_module.nir); + + return res; +} + +static void +emit_fast_clear_flush(struct radv_cmd_buffer *cmd_buffer, + const VkExtent2D *resolve_extent, + bool fmask_decompress) +{ + struct radv_device *device = cmd_buffer->device; + VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + uint32_t offset; + const struct vertex_attrs vertex_data[3] = { + { + .position = { + 0, + 0, + }, + .tex_position = { + 0, + 0, + }, + }, + { + .position = { + 0, + resolve_extent->height, + }, + .tex_position = { + 0, + resolve_extent->height, + }, + }, + { + 
.position = { + resolve_extent->width, + 0, + }, + .tex_position = { + resolve_extent->width, + 0, + }, + }, + }; + + cmd_buffer->state.flush_bits |= (RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META); + radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset); + struct radv_buffer vertex_buffer = { + .device = device, + .size = sizeof(vertex_data), + .bo = cmd_buffer->upload.upload_bo, + .offset = offset, + }; + + VkBuffer vertex_buffer_h = radv_buffer_to_handle(&vertex_buffer); + + radv_CmdBindVertexBuffers(cmd_buffer_h, + /*firstBinding*/ 0, + /*bindingCount*/ 1, + (VkBuffer[]) { vertex_buffer_h }, + (VkDeviceSize[]) { 0 }); + + VkPipeline pipeline_h; + if (fmask_decompress) + pipeline_h = device->meta_state.fast_clear_flush.fmask_decompress_pipeline; + else + pipeline_h = device->meta_state.fast_clear_flush.cmask_eliminate_pipeline; + RADV_FROM_HANDLE(radv_pipeline, pipeline, pipeline_h); + + if (cmd_buffer->state.pipeline != pipeline) { + radv_CmdBindPipeline(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline_h); + } + + RADV_CALL(CmdDraw)(cmd_buffer_h, 3, 1, 0, 0); + cmd_buffer->state.flush_bits |= (RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META); + si_emit_cache_flush(cmd_buffer); +} + +/** + */ +void +radv_fast_clear_flush_image_inplace(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image) +{ + struct radv_meta_saved_state saved_state; + struct radv_meta_saved_pass_state saved_pass_state; + VkDevice device_h = radv_device_to_handle(cmd_buffer->device); + VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + + if (!image->cmask.size) + return; + + if (!cmd_buffer->device->allow_fast_clears) + return; + + radv_meta_save_pass(&saved_pass_state, cmd_buffer); + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + struct radv_image_view iview; + radv_image_view_init(&iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(image), + .format = image->vk_format, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }, + cmd_buffer, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); + + VkFramebuffer fb_h; + radv_CreateFramebuffer(device_h, + &(VkFramebufferCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkImageView[]) { + radv_image_view_to_handle(&iview) + }, + .width = image->extent.width, + .height = image->extent.height, + .layers = 1 + }, + &cmd_buffer->pool->alloc, + &fb_h); + + RADV_CALL(CmdBeginRenderPass)(cmd_buffer_h, + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = cmd_buffer->device->meta_state.fast_clear_flush.pass, + .framebuffer = fb_h, + .renderArea = { + .offset = { + 0, + 0, + }, + .extent = { + image->extent.width, + image->extent.height, + } + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, + VK_SUBPASS_CONTENTS_INLINE); + + emit_fast_clear_flush(cmd_buffer, + &(VkExtent2D) { image->extent.width, image->extent.height }, + image->fmask.size > 0); + RADV_CALL(CmdEndRenderPass)(cmd_buffer_h); + + radv_DestroyFramebuffer(device_h, fb_h, + &cmd_buffer->pool->alloc); + + radv_meta_restore(&saved_state, cmd_buffer); + radv_meta_restore_pass(&saved_pass_state, cmd_buffer); +} diff --git a/src/amd/vulkan/radv_meta_resolve.c b/src/amd/vulkan/radv_meta_resolve.c 
new file mode 100644 index 00000000000..514aa8c7ef9 --- /dev/null +++ b/src/amd/vulkan/radv_meta_resolve.c @@ -0,0 +1,670 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "radv_meta.h" +#include "radv_private.h" +#include "nir/nir_builder.h" +#include "sid.h" +/** + * Vertex attributes used by all pipelines. + */ +struct vertex_attrs { + float position[2]; /**< 3DPRIM_RECTLIST */ + float tex_position[2]; +}; + +/* passthrough vertex shader */ +static nir_shader * +build_nir_vs(void) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + + nir_builder b; + nir_variable *a_position; + nir_variable *v_position; + nir_variable *a_tex_position; + nir_variable *v_tex_position; + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "meta_resolve_vs"); + + a_position = nir_variable_create(b.shader, nir_var_shader_in, vec4, + "a_position"); + a_position->data.location = VERT_ATTRIB_GENERIC0; + + v_position = nir_variable_create(b.shader, nir_var_shader_out, vec4, + "gl_Position"); + v_position->data.location = VARYING_SLOT_POS; + + a_tex_position = nir_variable_create(b.shader, nir_var_shader_in, vec4, + "a_tex_position"); + a_tex_position->data.location = VERT_ATTRIB_GENERIC1; + + v_tex_position = nir_variable_create(b.shader, nir_var_shader_out, vec4, + "v_tex_position"); + v_tex_position->data.location = VARYING_SLOT_VAR0; + + nir_copy_var(&b, v_position, a_position); + nir_copy_var(&b, v_tex_position, a_tex_position); + + return b.shader; +} + +/* simple passthrough shader */ +static nir_shader * +build_nir_fs(void) +{ + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_builder b; + nir_variable *v_tex_position; /* vec4, varying texture coordinate */ + nir_variable *f_color; /* vec4, fragment output color */ + + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + b.shader->info.name = ralloc_asprintf(b.shader, + "meta_resolve_fs"); + + v_tex_position = nir_variable_create(b.shader, nir_var_shader_in, vec4, + "v_tex_position"); + v_tex_position->data.location = VARYING_SLOT_VAR0; + + f_color = nir_variable_create(b.shader, nir_var_shader_out, vec4, + "f_color"); + f_color->data.location = FRAG_RESULT_DATA0; + + nir_copy_var(&b, f_color, v_tex_position); + + return b.shader; +} + +static VkResult +create_pass(struct radv_device *device) +{ + VkResult result; + 
VkDevice device_h = radv_device_to_handle(device); + const VkAllocationCallbacks *alloc = &device->meta_state.alloc; + VkAttachmentDescription attachments[2]; + int i; + + for (i = 0; i < 2; i++) { + attachments[i].format = VK_FORMAT_UNDEFINED; + attachments[i].samples = 1; + attachments[i].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + attachments[i].storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[i].initialLayout = VK_IMAGE_LAYOUT_GENERAL; + attachments[i].finalLayout = VK_IMAGE_LAYOUT_GENERAL; + } + + result = radv_CreateRenderPass(device_h, + &(VkRenderPassCreateInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 2, + .pAttachments = attachments, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .colorAttachmentCount = 2, + .pColorAttachments = (VkAttachmentReference[]) { + { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + { + .attachment = 1, + .layout = VK_IMAGE_LAYOUT_GENERAL, + }, + }, + .pResolveAttachments = NULL, + .pDepthStencilAttachment = &(VkAttachmentReference) { + .attachment = VK_ATTACHMENT_UNUSED, + }, + .preserveAttachmentCount = 0, + .pPreserveAttachments = NULL, + }, + .dependencyCount = 0, + }, + alloc, + &device->meta_state.resolve.pass); + + return result; +} + +static VkResult +create_pipeline(struct radv_device *device, + VkShaderModule vs_module_h) +{ + VkResult result; + VkDevice device_h = radv_device_to_handle(device); + + struct radv_shader_module fs_module = { + .nir = build_nir_fs(), + }; + + if (!fs_module.nir) { + /* XXX: Need more accurate error */ + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto cleanup; + } + + result = radv_graphics_pipeline_create(device_h, + radv_pipeline_cache_to_handle(&device->meta_state.cache), + &(VkGraphicsPipelineCreateInfo) { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = 2, + .pStages = (VkPipelineShaderStageCreateInfo[]) { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = vs_module_h, + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = radv_shader_module_to_handle(&fs_module), + .pName = "main", + }, + }, + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) { + { + .binding = 0, + .stride = sizeof(struct vertex_attrs), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX + }, + }, + .vertexAttributeDescriptionCount = 2, + .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) { + { + /* Position */ + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(struct vertex_attrs, position), + }, + { + /* Texture Coordinate */ + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(struct vertex_attrs, tex_position), + }, + }, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = false, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 0, + .scissorCount = 0, + }, + 
.pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .depthClampEnable = false, + .rasterizerDiscardEnable = false, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = NULL, + .alphaToCoverageEnable = false, + .alphaToOneEnable = false, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .logicOpEnable = false, + .attachmentCount = 2, + .pAttachments = (VkPipelineColorBlendAttachmentState []) { + { + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }, + { + .colorWriteMask = 0, + + } + }, + }, + .pDynamicState = NULL, + .renderPass = device->meta_state.resolve.pass, + .subpass = 0, + }, + &(struct radv_graphics_pipeline_create_info) { + .use_rectlist = true, + .custom_blend_mode = V_028808_CB_RESOLVE, + }, + &device->meta_state.alloc, + &device->meta_state.resolve.pipeline); + if (result != VK_SUCCESS) + goto cleanup; + + goto cleanup; + +cleanup: + ralloc_free(fs_module.nir); + return result; +} + +void +radv_device_finish_meta_resolve_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; + VkDevice device_h = radv_device_to_handle(device); + VkRenderPass pass_h = device->meta_state.resolve.pass; + const VkAllocationCallbacks *alloc = &device->meta_state.alloc; + + if (pass_h) + RADV_CALL(DestroyRenderPass)(device_h, pass_h, + &device->meta_state.alloc); + + VkPipeline pipeline_h = state->resolve.pipeline; + if (pipeline_h) { + RADV_CALL(DestroyPipeline)(device_h, pipeline_h, alloc); + } +} + +VkResult +radv_device_init_meta_resolve_state(struct radv_device *device) +{ + VkResult res = VK_SUCCESS; + + zero(device->meta_state.resolve); + + struct radv_shader_module vs_module = { .nir = build_nir_vs() }; + if (!vs_module.nir) { + /* XXX: Need more accurate error */ + res = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + res = create_pass(device); + if (res != VK_SUCCESS) + goto fail; + + VkShaderModule vs_module_h = radv_shader_module_to_handle(&vs_module); + res = create_pipeline(device, vs_module_h); + if (res != VK_SUCCESS) + goto fail; + + goto cleanup; + +fail: + radv_device_finish_meta_resolve_state(device); + +cleanup: + ralloc_free(vs_module.nir); + + return res; +} + +static void +emit_resolve(struct radv_cmd_buffer *cmd_buffer, + const VkOffset2D *src_offset, + const VkOffset2D *dest_offset, + const VkExtent2D *resolve_extent) +{ + struct radv_device *device = cmd_buffer->device; + VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + uint32_t offset; + const struct vertex_attrs vertex_data[3] = { + { + .position = { + dest_offset->x, + dest_offset->y, + }, + .tex_position = { + src_offset->x, + src_offset->y, + }, + }, + { + .position = { + dest_offset->x, + dest_offset->y + resolve_extent->height, + }, + .tex_position = { + src_offset->x, + src_offset->y + resolve_extent->height, + }, + }, + { + .position = { + dest_offset->x + resolve_extent->width, + dest_offset->y, + }, + .tex_position = { + src_offset->x + resolve_extent->width, + src_offset->y, + }, + }, + }; + + 
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; + radv_cmd_buffer_upload_data(cmd_buffer, sizeof(vertex_data), 16, vertex_data, &offset); + struct radv_buffer vertex_buffer = { + .device = device, + .size = sizeof(vertex_data), + .bo = cmd_buffer->upload.upload_bo, + .offset = offset, + }; + + VkBuffer vertex_buffer_h = radv_buffer_to_handle(&vertex_buffer); + + radv_CmdBindVertexBuffers(cmd_buffer_h, + /*firstBinding*/ 0, + /*bindingCount*/ 1, + (VkBuffer[]) { vertex_buffer_h }, + (VkDeviceSize[]) { 0 }); + + VkPipeline pipeline_h = device->meta_state.resolve.pipeline; + RADV_FROM_HANDLE(radv_pipeline, pipeline, pipeline_h); + + if (cmd_buffer->state.pipeline != pipeline) { + radv_CmdBindPipeline(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline_h); + } + + RADV_CALL(CmdDraw)(cmd_buffer_h, 3, 1, 0, 0); + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; + si_emit_cache_flush(cmd_buffer); +} + +void radv_CmdResolveImage( + VkCommandBuffer cmd_buffer_h, + VkImage src_image_h, + VkImageLayout src_image_layout, + VkImage dest_image_h, + VkImageLayout dest_image_layout, + uint32_t region_count, + const VkImageResolve* regions) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, cmd_buffer_h); + RADV_FROM_HANDLE(radv_image, src_image, src_image_h); + RADV_FROM_HANDLE(radv_image, dest_image, dest_image_h); + struct radv_device *device = cmd_buffer->device; + struct radv_meta_saved_state saved_state; + VkDevice device_h = radv_device_to_handle(device); + bool use_compute_resolve = false; + + /* we can use the hw resolve only for single full resolves */ + if (region_count == 1) { + if (regions[0].srcOffset.x || + regions[0].srcOffset.y || + regions[0].srcOffset.z) + use_compute_resolve = true; + if (regions[0].dstOffset.x || + regions[0].dstOffset.y || + regions[0].dstOffset.z) + use_compute_resolve = true; + + if (regions[0].extent.width != src_image->extent.width || + regions[0].extent.height != src_image->extent.height || + regions[0].extent.depth != src_image->extent.depth) + use_compute_resolve = true; + } else + use_compute_resolve = true; + + if (use_compute_resolve) { + radv_meta_resolve_compute_image(cmd_buffer, + src_image, + src_image_layout, + dest_image, + dest_image_layout, + region_count, regions); + return; + } + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + assert(src_image->samples > 1); + assert(dest_image->samples == 1); + + if (src_image->samples >= 16) { + /* See commit aa3f9aaf31e9056a255f9e0472ebdfdaa60abe54 for the + * glBlitFramebuffer workaround for samples >= 16. 
+ */ + radv_finishme("vkCmdResolveImage: need interpolation workaround when " + "samples >= 16"); + } + + if (src_image->array_size > 1) + radv_finishme("vkCmdResolveImage: multisample array images"); + + for (uint32_t r = 0; r < region_count; ++r) { + const VkImageResolve *region = &regions[r]; + + /* From the Vulkan 1.0 spec: + * + * - The aspectMask member of srcSubresource and dstSubresource must + * only contain VK_IMAGE_ASPECT_COLOR_BIT + * + * - The layerCount member of srcSubresource and dstSubresource must + * match + */ + assert(region->srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + assert(region->dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + assert(region->srcSubresource.layerCount == + region->dstSubresource.layerCount); + + const uint32_t src_base_layer = + radv_meta_get_iview_layer(src_image, &region->srcSubresource, + &region->srcOffset); + + const uint32_t dest_base_layer = + radv_meta_get_iview_layer(dest_image, &region->dstSubresource, + &region->dstOffset); + + /** + * From Vulkan 1.0.6 spec: 18.6 Resolving Multisample Images + * + * extent is the size in texels of the source image to resolve in width, + * height and depth. 1D images use only x and width. 2D images use x, y, + * width and height. 3D images use x, y, z, width, height and depth. + * + * srcOffset and dstOffset select the initial x, y, and z offsets in + * texels of the sub-regions of the source and destination image data. + * extent is the size in texels of the source image to resolve in width, + * height and depth. 1D images use only x and width. 2D images use x, y, + * width and height. 3D images use x, y, z, width, height and depth. + */ + const struct VkExtent3D extent = + radv_sanitize_image_extent(src_image->type, region->extent); + const struct VkOffset3D srcOffset = + radv_sanitize_image_offset(src_image->type, region->srcOffset); + const struct VkOffset3D dstOffset = + radv_sanitize_image_offset(dest_image->type, region->dstOffset); + + + for (uint32_t layer = 0; layer < region->srcSubresource.layerCount; + ++layer) { + + struct radv_image_view src_iview; + radv_image_view_init(&src_iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = src_image_h, + .viewType = radv_meta_get_view_type(src_image), + .format = src_image->vk_format, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = region->srcSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = src_base_layer + layer, + .layerCount = 1, + }, + }, + cmd_buffer, VK_IMAGE_USAGE_SAMPLED_BIT); + + struct radv_image_view dest_iview; + radv_image_view_init(&dest_iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = dest_image_h, + .viewType = radv_meta_get_view_type(dest_image), + .format = dest_image->vk_format, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = region->dstSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = dest_base_layer + layer, + .layerCount = 1, + }, + }, + cmd_buffer, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); + + VkFramebuffer fb_h; + radv_CreateFramebuffer(device_h, + &(VkFramebufferCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .attachmentCount = 2, + .pAttachments = (VkImageView[]) { + radv_image_view_to_handle(&src_iview), + radv_image_view_to_handle(&dest_iview), + }, + .width = radv_minify(dest_image->extent.width, + region->dstSubresource.mipLevel), + .height =
radv_minify(dest_image->extent.height, + region->dstSubresource.mipLevel), + .layers = 1 + }, + &cmd_buffer->pool->alloc, + &fb_h); + + RADV_CALL(CmdBeginRenderPass)(cmd_buffer_h, + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = device->meta_state.resolve.pass, + .framebuffer = fb_h, + .renderArea = { + .offset = { + dstOffset.x, + dstOffset.y, + }, + .extent = { + extent.width, + extent.height, + } + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, + VK_SUBPASS_CONTENTS_INLINE); + + emit_resolve(cmd_buffer, + &(VkOffset2D) { + .x = srcOffset.x, + .y = srcOffset.y, + }, + &(VkOffset2D) { + .x = dstOffset.x, + .y = dstOffset.y, + }, + &(VkExtent2D) { + .width = extent.width, + .height = extent.height, + }); + + RADV_CALL(CmdEndRenderPass)(cmd_buffer_h); + + radv_DestroyFramebuffer(device_h, fb_h, + &cmd_buffer->pool->alloc); + } + } + + radv_meta_restore(&saved_state, cmd_buffer); +} + +/** + * Emit any needed resolves for the current subpass. + */ +void +radv_cmd_buffer_resolve_subpass(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; + const struct radv_subpass *subpass = cmd_buffer->state.subpass; + struct radv_meta_saved_state saved_state; + + /* FINISHME(perf): Skip clears for resolve attachments. + * + * From the Vulkan 1.0 spec: + * + * If the first use of an attachment in a render pass is as a resolve + * attachment, then the loadOp is effectively ignored as the resolve is + * guaranteed to overwrite all pixels in the render area. + */ + + if (!subpass->has_resolve) + return; + + radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer); + + for (uint32_t i = 0; i < subpass->color_count; ++i) { + VkAttachmentReference src_att = subpass->color_attachments[i]; + VkAttachmentReference dest_att = subpass->resolve_attachments[i]; + if (dest_att.attachment == VK_ATTACHMENT_UNUSED) + continue; + + struct radv_image *dst_img = cmd_buffer->state.framebuffer->attachments[dest_att.attachment].attachment->image; + + if (dst_img->surface.dcc_size) { + radv_initialize_dcc(cmd_buffer, dst_img, 0xffffffff); + cmd_buffer->state.attachments[dest_att.attachment].current_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + } + + struct radv_subpass resolve_subpass = { + .color_count = 2, + .color_attachments = (VkAttachmentReference[]) { src_att, dest_att }, + .depth_stencil_attachment = { .attachment = VK_ATTACHMENT_UNUSED }, + }; + + radv_cmd_buffer_set_subpass(cmd_buffer, &resolve_subpass, false); + + /* Subpass resolves must respect the render area. We can ignore the + * render area here because vkCmdBeginRenderPass set the render area + * with 3DSTATE_DRAWING_RECTANGLE. + * + * XXX(chadv): Does the hardware really respect + * 3DSTATE_DRAWING_RECTANGLE when drawing a 3DPRIM_RECTLIST?
+ */ + emit_resolve(cmd_buffer, + &(VkOffset2D) { 0, 0 }, + &(VkOffset2D) { 0, 0 }, + &(VkExtent2D) { fb->width, fb->height }); + } + + cmd_buffer->state.subpass = subpass; + radv_meta_restore(&saved_state, cmd_buffer); +} diff --git a/src/amd/vulkan/radv_meta_resolve_cs.c b/src/amd/vulkan/radv_meta_resolve_cs.c new file mode 100644 index 00000000000..c6525b6f364 --- /dev/null +++ b/src/amd/vulkan/radv_meta_resolve_cs.c @@ -0,0 +1,461 @@ +/* + * Copyright © 2016 Dave Airlie + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + + +#include +#include + +#include "radv_meta.h" +#include "radv_private.h" +#include "nir/nir_builder.h" +#include "sid.h" +#include "vk_format.h" + +static nir_shader * +build_resolve_compute_shader(struct radv_device *dev, bool is_integer, int samples) +{ + nir_builder b; + char name[64]; + nir_if *outer_if = NULL; + const struct glsl_type *sampler_type = glsl_sampler_type(GLSL_SAMPLER_DIM_MS, + false, + false, + GLSL_TYPE_FLOAT); + const struct glsl_type *img_type = glsl_sampler_type(GLSL_SAMPLER_DIM_2D, + false, + false, + GLSL_TYPE_FLOAT); + snprintf(name, 64, "meta_resolve_cs-%d-%s", samples, is_integer ? 
"int" : "float"); + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); + b.shader->info.name = ralloc_strdup(b.shader, name); + b.shader->info.cs.local_size[0] = 16; + b.shader->info.cs.local_size[1] = 16; + b.shader->info.cs.local_size[2] = 1; + + nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform, + sampler_type, "s_tex"); + input_img->data.descriptor_set = 0; + input_img->data.binding = 0; + + nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, + img_type, "out_img"); + output_img->data.descriptor_set = 0; + output_img->data.binding = 1; + nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0); + nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0); + nir_ssa_def *block_size = nir_imm_ivec4(&b, + b.shader->info.cs.local_size[0], + b.shader->info.cs.local_size[1], + b.shader->info.cs.local_size[2], 0); + + nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + + nir_intrinsic_instr *src_offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant); + src_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + src_offset->num_components = 2; + nir_ssa_dest_init(&src_offset->instr, &src_offset->dest, 2, 32, "src_offset"); + nir_builder_instr_insert(&b, &src_offset->instr); + + nir_intrinsic_instr *dst_offset = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant); + dst_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 8)); + dst_offset->num_components = 2; + nir_ssa_dest_init(&dst_offset->instr, &dst_offset->dest, 2, 32, "dst_offset"); + nir_builder_instr_insert(&b, &dst_offset->instr); + + nir_ssa_def *img_coord = nir_iadd(&b, global_id, &src_offset->dest.ssa); + /* do a txf_ms on each sample */ + nir_ssa_def *tmp; + + nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2); + tex->sampler_dim = GLSL_SAMPLER_DIM_MS; + tex->op = nir_texop_txf_ms; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(img_coord); + tex->src[1].src_type = nir_tex_src_ms_index; + tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0)); + tex->dest_type = nir_type_float; + tex->is_array = false; + tex->coord_components = 2; + tex->texture = nir_deref_var_create(tex, input_img); + tex->sampler = NULL; + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_builder_instr_insert(&b, &tex->instr); + + tmp = &tex->dest.ssa; + nir_variable *color = + nir_local_variable_create(b.impl, glsl_vec4_type(), "color"); + + if (!is_integer && samples > 1) { + nir_tex_instr *tex_all_same = nir_tex_instr_create(b.shader, 1); + tex_all_same->sampler_dim = GLSL_SAMPLER_DIM_MS; + tex_all_same->op = nir_texop_samples_identical; + tex_all_same->src[0].src_type = nir_tex_src_coord; + tex_all_same->src[0].src = nir_src_for_ssa(img_coord); + tex_all_same->dest_type = nir_type_float; + tex_all_same->is_array = false; + tex_all_same->coord_components = 2; + tex_all_same->texture = nir_deref_var_create(tex_all_same, input_img); + tex_all_same->sampler = NULL; + + nir_ssa_dest_init(&tex_all_same->instr, &tex_all_same->dest, 1, 32, "tex"); + nir_builder_instr_insert(&b, &tex_all_same->instr); + + nir_ssa_def *all_same = nir_ine(&b, &tex_all_same->dest.ssa, nir_imm_int(&b, 0)); + nir_if *if_stmt = nir_if_create(b.shader); + if_stmt->condition = nir_src_for_ssa(all_same); + nir_cf_node_insert(b.cursor, &if_stmt->cf_node); + + b.cursor = nir_after_cf_list(&if_stmt->then_list); + for (int i = 1; i < samples; i++) { + 
nir_tex_instr *tex_add = nir_tex_instr_create(b.shader, 2); + tex_add->sampler_dim = GLSL_SAMPLER_DIM_MS; + tex_add->op = nir_texop_txf_ms; + tex_add->src[0].src_type = nir_tex_src_coord; + tex_add->src[0].src = nir_src_for_ssa(img_coord); + tex_add->src[1].src_type = nir_tex_src_ms_index; + tex_add->src[1].src = nir_src_for_ssa(nir_imm_int(&b, i)); + tex_add->dest_type = nir_type_float; + tex_add->is_array = false; + tex_add->coord_components = 2; + tex_add->texture = nir_deref_var_create(tex_add, input_img); + tex_add->sampler = NULL; + + nir_ssa_dest_init(&tex_add->instr, &tex_add->dest, 4, 32, "tex"); + nir_builder_instr_insert(&b, &tex_add->instr); + + tmp = nir_fadd(&b, tmp, &tex_add->dest.ssa); + } + + tmp = nir_fdiv(&b, tmp, nir_imm_float(&b, samples)); + nir_store_var(&b, color, tmp, 0xf); + b.cursor = nir_after_cf_list(&if_stmt->else_list); + outer_if = if_stmt; + } + nir_store_var(&b, color, &tex->dest.ssa, 0xf); + + if (outer_if) + b.cursor = nir_after_cf_node(&outer_if->cf_node); + + nir_ssa_def *newv = nir_load_var(&b, color); + nir_ssa_def *coord = nir_iadd(&b, global_id, &dst_offset->dest.ssa); + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_store); + store->src[0] = nir_src_for_ssa(coord); + store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); + store->src[2] = nir_src_for_ssa(newv); + store->variables[0] = nir_deref_var_create(store, output_img); + nir_builder_instr_insert(&b, &store->instr); + return b.shader; +} + + +static VkResult +create_layout(struct radv_device *device) +{ + VkResult result; + /* + * two descriptors one for the image being sampled + * one for the buffer being written. + */ + VkDescriptorSetLayoutCreateInfo ds_create_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = 2, + .pBindings = (VkDescriptorSetLayoutBinding[]) { + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL + }, + } + }; + + result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), + &ds_create_info, + &device->meta_state.alloc, + &device->meta_state.resolve_compute.ds_layout); + if (result != VK_SUCCESS) + goto fail; + + + VkPipelineLayoutCreateInfo pl_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->meta_state.resolve_compute.ds_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16}, + }; + + result = radv_CreatePipelineLayout(radv_device_to_handle(device), + &pl_create_info, + &device->meta_state.alloc, + &device->meta_state.resolve_compute.p_layout); + if (result != VK_SUCCESS) + goto fail; + return VK_SUCCESS; +fail: + return result; +} + +static VkResult +create_resolve_pipeline(struct radv_device *device, + int samples, + bool is_integer, + VkPipeline *pipeline) +{ + VkResult result; + struct radv_shader_module cs = { .nir = NULL }; + + cs.nir = build_resolve_compute_shader(device, is_integer, samples); + + /* compute shader */ + + VkPipelineShaderStageCreateInfo pipeline_shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = radv_shader_module_to_handle(&cs), + .pName = 
"main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = pipeline_shader_stage, + .flags = 0, + .layout = device->meta_state.resolve_compute.p_layout, + }; + + result = radv_CreateComputePipelines(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + 1, &vk_pipeline_info, NULL, + pipeline); + if (result != VK_SUCCESS) + goto fail; + + ralloc_free(cs.nir); + return VK_SUCCESS; +fail: + ralloc_free(cs.nir); + return result; +} + +VkResult +radv_device_init_meta_resolve_compute_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; + VkResult res; + memset(&device->meta_state.resolve_compute, 0, sizeof(device->meta_state.resolve_compute)); + + res = create_layout(device); + if (res != VK_SUCCESS) + return res; + + for (uint32_t i = 0; i < MAX_SAMPLES_LOG2; ++i) { + uint32_t samples = 1 << i; + + res = create_resolve_pipeline(device, samples, false, + &state->resolve_compute.rc[i].pipeline); + + res = create_resolve_pipeline(device, samples, true, + &state->resolve_compute.rc[i].i_pipeline); + + } + + return res; +} + +void +radv_device_finish_meta_resolve_compute_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; + for (uint32_t i = 0; i < MAX_SAMPLES_LOG2; ++i) { + radv_DestroyPipeline(radv_device_to_handle(device), + state->resolve_compute.rc[i].pipeline, + &state->alloc); + + radv_DestroyPipeline(radv_device_to_handle(device), + state->resolve_compute.rc[i].i_pipeline, + &state->alloc); + } + + radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), + state->resolve_compute.ds_layout, + &state->alloc); + radv_DestroyPipelineLayout(radv_device_to_handle(device), + state->resolve_compute.p_layout, + &state->alloc); +} + +void radv_meta_resolve_compute_image(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *src_image, + VkImageLayout src_image_layout, + struct radv_image *dest_image, + VkImageLayout dest_image_layout, + uint32_t region_count, + const VkImageResolve *regions) +{ + struct radv_device *device = cmd_buffer->device; + struct radv_meta_saved_compute_state saved_state; + const uint32_t samples = src_image->samples; + const uint32_t samples_log2 = ffs(samples) - 1; + radv_meta_save_compute(&saved_state, cmd_buffer, 16); + + for (uint32_t r = 0; r < region_count; ++r) { + const VkImageResolve *region = ®ions[r]; + + assert(region->srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + assert(region->dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + assert(region->srcSubresource.layerCount == region->dstSubresource.layerCount); + + const uint32_t src_base_layer = + radv_meta_get_iview_layer(src_image, ®ion->srcSubresource, + ®ion->srcOffset); + + const uint32_t dest_base_layer = + radv_meta_get_iview_layer(dest_image, ®ion->dstSubresource, + ®ion->dstOffset); + + const struct VkExtent3D extent = + radv_sanitize_image_extent(src_image->type, region->extent); + const struct VkOffset3D srcOffset = + radv_sanitize_image_offset(src_image->type, region->srcOffset); + const struct VkOffset3D dstOffset = + radv_sanitize_image_offset(dest_image->type, region->dstOffset); + + for (uint32_t layer = 0; layer < region->srcSubresource.layerCount; + ++layer) { + + struct radv_image_view src_iview; + VkDescriptorSet set; + radv_image_view_init(&src_iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = 
VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(src_image), + .viewType = radv_meta_get_view_type(src_image), + .format = src_image->vk_format, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = region->srcSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = src_base_layer + layer, + .layerCount = 1, + }, + }, + cmd_buffer, VK_IMAGE_USAGE_SAMPLED_BIT); + + struct radv_image_view dest_iview; + radv_image_view_init(&dest_iview, cmd_buffer->device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(dest_image), + .viewType = radv_meta_get_view_type(dest_image), + .format = dest_image->vk_format, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = region->dstSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = dest_base_layer + layer, + .layerCount = 1, + }, + }, + cmd_buffer, VK_IMAGE_USAGE_STORAGE_BIT); + + + radv_temp_descriptor_set_create(device, cmd_buffer, + device->meta_state.resolve_compute.ds_layout, + &set); + + radv_UpdateDescriptorSets(radv_device_to_handle(device), + 2, /* writeCount */ + (VkWriteDescriptorSet[]) { + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .pImageInfo = (VkDescriptorImageInfo[]) { + { + .sampler = NULL, + .imageView = radv_image_view_to_handle(&src_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + } + }, + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = set, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .pImageInfo = (VkDescriptorImageInfo[]) { + { + .sampler = NULL, + .imageView = radv_image_view_to_handle(&dest_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + } + } + }, 0, NULL); + + radv_CmdBindDescriptorSets(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.resolve_compute.p_layout, 0, 1, + &set, 0, NULL); + + VkPipeline pipeline; + if (vk_format_is_int(src_image->vk_format)) + pipeline = device->meta_state.resolve_compute.rc[samples_log2].i_pipeline; + else + pipeline = device->meta_state.resolve_compute.rc[samples_log2].pipeline; + if (cmd_buffer->state.compute_pipeline != radv_pipeline_from_handle(pipeline)) { + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + } + + unsigned push_constants[4] = { + srcOffset.x, + srcOffset.y, + dstOffset.x, + dstOffset.y, + }; + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + device->meta_state.resolve_compute.p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, 16, + push_constants); + radv_unaligned_dispatch(cmd_buffer, extent.width, extent.height, 1); + radv_temp_descriptor_set_destroy(cmd_buffer->device, set); + } + } + radv_meta_restore_compute(&saved_state, cmd_buffer, 16); +} diff --git a/src/amd/vulkan/radv_pass.c b/src/amd/vulkan/radv_pass.c new file mode 100644 index 00000000000..fa217977caa --- /dev/null +++ b/src/amd/vulkan/radv_pass.c @@ -0,0 +1,183 @@ +/* + * Copyright © 2016 Red Hat. 
+ * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include "radv_private.h" + +VkResult radv_CreateRenderPass( + VkDevice _device, + const VkRenderPassCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_render_pass *pass; + size_t size; + size_t attachments_offset; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO); + + size = sizeof(*pass); + size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]); + attachments_offset = size; + size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]); + + pass = radv_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pass == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(pass, 0, size); + pass->attachment_count = pCreateInfo->attachmentCount; + pass->subpass_count = pCreateInfo->subpassCount; + pass->attachments = (void *) pass + attachments_offset; + + for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { + struct radv_render_pass_attachment *att = &pass->attachments[i]; + + att->format = pCreateInfo->pAttachments[i].format; + att->samples = pCreateInfo->pAttachments[i].samples; + att->load_op = pCreateInfo->pAttachments[i].loadOp; + att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp; + att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; + att->final_layout = pCreateInfo->pAttachments[i].finalLayout; + // att->store_op = pCreateInfo->pAttachments[i].storeOp; + // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; + } + uint32_t subpass_attachment_count = 0; + VkAttachmentReference *p; + for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { + const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; + + subpass_attachment_count += + desc->inputAttachmentCount + + desc->colorAttachmentCount + + /* Count colorAttachmentCount again for resolve_attachments */ + desc->colorAttachmentCount; + } + + if (subpass_attachment_count) { + pass->subpass_attachments = + radv_alloc2(&device->alloc, pAllocator, + subpass_attachment_count * sizeof(VkAttachmentReference), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pass->subpass_attachments == NULL) { + radv_free2(&device->alloc, pAllocator, pass); + return 
vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + } else + pass->subpass_attachments = NULL; + + p = pass->subpass_attachments; + for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { + const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; + struct radv_subpass *subpass = &pass->subpasses[i]; + + subpass->input_count = desc->inputAttachmentCount; + subpass->color_count = desc->colorAttachmentCount; + + if (desc->inputAttachmentCount > 0) { + subpass->input_attachments = p; + p += desc->inputAttachmentCount; + + for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { + subpass->input_attachments[j] + = desc->pInputAttachments[j]; + } + } + + if (desc->colorAttachmentCount > 0) { + subpass->color_attachments = p; + p += desc->colorAttachmentCount; + + for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { + subpass->color_attachments[j] + = desc->pColorAttachments[j]; + } + } + + subpass->has_resolve = false; + if (desc->pResolveAttachments) { + subpass->resolve_attachments = p; + p += desc->colorAttachmentCount; + + for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { + uint32_t a = desc->pResolveAttachments[j].attachment; + subpass->resolve_attachments[j] + = desc->pResolveAttachments[j]; + if (a != VK_ATTACHMENT_UNUSED) + subpass->has_resolve = true; + } + } + + if (desc->pDepthStencilAttachment) { + subpass->depth_stencil_attachment = + *desc->pDepthStencilAttachment; + } else { + subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; + } + } + + for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { + uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass; + if (dst == VK_SUBPASS_EXTERNAL) { + pass->end_barrier.src_stage_mask = pCreateInfo->pDependencies[i].srcStageMask; + pass->end_barrier.src_access_mask = pCreateInfo->pDependencies[i].srcAccessMask; + pass->end_barrier.dst_access_mask = pCreateInfo->pDependencies[i].dstAccessMask; + } else { + pass->subpasses[dst].start_barrier.src_stage_mask = pCreateInfo->pDependencies[i].srcStageMask; + pass->subpasses[dst].start_barrier.src_access_mask = pCreateInfo->pDependencies[i].srcAccessMask; + pass->subpasses[dst].start_barrier.dst_access_mask = pCreateInfo->pDependencies[i].dstAccessMask; + } + } + + *pRenderPass = radv_render_pass_to_handle(pass); + + return VK_SUCCESS; +} + +void radv_DestroyRenderPass( + VkDevice _device, + VkRenderPass _pass, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_render_pass, pass, _pass); + + if (!_pass) + return; + radv_free2(&device->alloc, pAllocator, pass->subpass_attachments); + radv_free2(&device->alloc, pAllocator, pass); +} + +void radv_GetRenderAreaGranularity( + VkDevice device, + VkRenderPass renderPass, + VkExtent2D* pGranularity) +{ + pGranularity->width = 1; + pGranularity->height = 1; +} + diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c new file mode 100644 index 00000000000..89300e59924 --- /dev/null +++ b/src/amd/vulkan/radv_pipeline.c @@ -0,0 +1,1408 @@ +/* + * Copyright © 2016 Red Hat. 
+ * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "util/mesa-sha1.h" +#include "radv_private.h" +#include "nir/nir.h" +#include "nir/nir_builder.h" +#include "spirv/nir_spirv.h" + +#include +#include + +#include "sid.h" +#include "r600d_common.h" +#include "ac_binary.h" +#include "ac_llvm_util.h" +#include "ac_nir_to_llvm.h" +#include "vk_format.h" +#include "util/debug.h" +void radv_shader_variant_destroy(struct radv_device *device, + struct radv_shader_variant *variant); + +static const struct nir_shader_compiler_options nir_options = { + .vertex_id_zero_based = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_fsat = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_extract_byte = true, + .lower_extract_word = true, +}; + +VkResult radv_CreateShaderModule( + VkDevice _device, + const VkShaderModuleCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkShaderModule* pShaderModule) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_shader_module *module; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO); + assert(pCreateInfo->flags == 0); + + module = radv_alloc2(&device->alloc, pAllocator, + sizeof(*module) + pCreateInfo->codeSize, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (module == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + module->nir = NULL; + module->size = pCreateInfo->codeSize; + memcpy(module->data, pCreateInfo->pCode, module->size); + + _mesa_sha1_compute(module->data, module->size, module->sha1); + + *pShaderModule = radv_shader_module_to_handle(module); + + return VK_SUCCESS; +} + +void radv_DestroyShaderModule( + VkDevice _device, + VkShaderModule _module, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_shader_module, module, _module); + + if (!module) + return; + + radv_free2(&device->alloc, pAllocator, module); +} + +void radv_DestroyPipeline( + VkDevice _device, + VkPipeline _pipeline, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + 
RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); + + if (!_pipeline) + return; + + for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) + if (pipeline->shaders[i]) + radv_shader_variant_destroy(device, pipeline->shaders[i]); + + radv_free2(&device->alloc, pAllocator, pipeline); +} + + +static void +radv_optimize_nir(struct nir_shader *shader) +{ + bool progress; + + do { + progress = false; + + NIR_PASS_V(shader, nir_lower_vars_to_ssa); + NIR_PASS_V(shader, nir_lower_alu_to_scalar); + NIR_PASS_V(shader, nir_lower_phis_to_scalar); + + NIR_PASS(progress, shader, nir_copy_prop); + NIR_PASS(progress, shader, nir_opt_remove_phis); + NIR_PASS(progress, shader, nir_opt_dce); + NIR_PASS(progress, shader, nir_opt_dead_cf); + NIR_PASS(progress, shader, nir_opt_cse); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8); + NIR_PASS(progress, shader, nir_opt_algebraic); + NIR_PASS(progress, shader, nir_opt_constant_folding); + NIR_PASS(progress, shader, nir_opt_undef); + } while (progress); +} + +static nir_shader * +radv_shader_compile_to_nir(struct radv_device *device, + struct radv_shader_module *module, + const char *entrypoint_name, + gl_shader_stage stage, + const VkSpecializationInfo *spec_info, + bool dump) +{ + if (strcmp(entrypoint_name, "main") != 0) { + radv_finishme("Multiple shaders per module not really supported"); + } + + nir_shader *nir; + nir_function *entry_point; + if (module->nir) { + /* Some things such as our meta clear/blit code will give us a NIR + * shader directly. In that case, we just ignore the SPIR-V entirely + * and just use the NIR shader */ + nir = module->nir; + nir->options = &nir_options; + nir_validate_shader(nir); + + assert(exec_list_length(&nir->functions) == 1); + struct exec_node *node = exec_list_get_head(&nir->functions); + entry_point = exec_node_data(nir_function, node, node); + } else { + uint32_t *spirv = (uint32_t *) module->data; + assert(module->size % 4 == 0); + + uint32_t num_spec_entries = 0; + struct nir_spirv_specialization *spec_entries = NULL; + if (spec_info && spec_info->mapEntryCount > 0) { + num_spec_entries = spec_info->mapEntryCount; + spec_entries = malloc(num_spec_entries * sizeof(*spec_entries)); + for (uint32_t i = 0; i < num_spec_entries; i++) { + VkSpecializationMapEntry entry = spec_info->pMapEntries[i]; + const void *data = spec_info->pData + entry.offset; + assert(data + entry.size <= spec_info->pData + spec_info->dataSize); + + spec_entries[i].id = spec_info->pMapEntries[i].constantID; + spec_entries[i].data = *(const uint32_t *)data; + } + } + + entry_point = spirv_to_nir(spirv, module->size / 4, + spec_entries, num_spec_entries, + stage, entrypoint_name, &nir_options); + nir = entry_point->shader; + assert(nir->stage == stage); + nir_validate_shader(nir); + + free(spec_entries); + + nir_lower_returns(nir); + nir_validate_shader(nir); + + nir_inline_functions(nir); + nir_validate_shader(nir); + + /* Pick off the single entrypoint that we want */ + foreach_list_typed_safe(nir_function, func, node, &nir->functions) { + if (func != entry_point) + exec_node_remove(&func->node); + } + assert(exec_list_length(&nir->functions) == 1); + entry_point->name = ralloc_strdup(entry_point, "main"); + + nir_remove_dead_variables(nir, nir_var_shader_in); + nir_remove_dead_variables(nir, nir_var_shader_out); + nir_remove_dead_variables(nir, nir_var_system_value); + nir_validate_shader(nir); + + nir_lower_system_values(nir); + nir_validate_shader(nir); + } + + /* Vulkan uses the separate-shader linking model */ + nir->info.separate_shader 
= true; + + // nir = brw_preprocess_nir(compiler, nir); + + nir_shader_gather_info(nir, entry_point->impl); + + nir_variable_mode indirect_mask = 0; + // if (compiler->glsl_compiler_options[stage].EmitNoIndirectInput) + indirect_mask |= nir_var_shader_in; + // if (compiler->glsl_compiler_options[stage].EmitNoIndirectTemp) + indirect_mask |= nir_var_local; + + nir_lower_indirect_derefs(nir, indirect_mask); + + static const nir_lower_tex_options tex_options = { + .lower_txp = ~0, + }; + + nir_lower_tex(nir, &tex_options); + + nir_lower_vars_to_ssa(nir); + nir_lower_var_copies(nir); + nir_lower_global_vars_to_local(nir); + nir_remove_dead_variables(nir, nir_var_local); + radv_optimize_nir(nir); + + if (dump) + nir_print_shader(nir, stderr); + + return nir; +} + +void radv_shader_variant_destroy(struct radv_device *device, + struct radv_shader_variant *variant) +{ + if (__sync_fetch_and_sub(&variant->ref_count, 1) != 1) + return; + + device->ws->buffer_destroy(variant->bo); + free(variant); +} + +static +struct radv_shader_variant *radv_shader_variant_create(struct radv_device *device, + struct nir_shader *shader, + struct radv_pipeline_layout *layout, + const union ac_shader_variant_key *key, + void** code_out, + unsigned *code_size_out, + bool dump) +{ + struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant)); + enum radeon_family chip_family = device->instance->physicalDevice.rad_info.family; + LLVMTargetMachineRef tm; + if (!variant) + return NULL; + + struct ac_nir_compiler_options options = {0}; + options.layout = layout; + if (key) + options.key = *key; + + struct ac_shader_binary binary; + + options.unsafe_math = env_var_as_boolean("RADV_UNSAFE_MATH", false); + options.family = chip_family; + options.chip_class = device->instance->physicalDevice.rad_info.chip_class; + tm = ac_create_target_machine(chip_family); + ac_compile_nir_shader(tm, &binary, &variant->config, + &variant->info, shader, &options, dump); + LLVMDisposeTargetMachine(tm); + + bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0; + unsigned vgpr_comp_cnt = 0; + + if (scratch_enabled) + radv_finishme("shader scratch space"); + switch (shader->stage) { + case MESA_SHADER_VERTEX: + variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) | + S_00B12C_SCRATCH_EN(scratch_enabled); + vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt; + break; + case MESA_SHADER_FRAGMENT: + variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) | + S_00B12C_SCRATCH_EN(scratch_enabled); + break; + case MESA_SHADER_COMPUTE: + variant->rsrc2 = S_00B84C_USER_SGPR(variant->info.num_user_sgprs) | + S_00B84C_SCRATCH_EN(scratch_enabled) | + S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_LDS_SIZE(variant->config.lds_size); + break; + default: + unreachable("unsupported shader type"); + break; + } + + variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) | + S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | + S_00B848_DX10_CLAMP(1) | + S_00B848_FLOAT_MODE(variant->config.float_mode); + + variant->bo = device->ws->buffer_create(device->ws, binary.code_size, 256, + RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS); + + void *ptr = device->ws->buffer_map(variant->bo); + memcpy(ptr, binary.code, binary.code_size); + device->ws->buffer_unmap(variant->bo); + + if (code_out) { + *code_out = binary.code; + *code_size_out = binary.code_size; + } else + 
free(binary.code); + free(binary.config); + free(binary.rodata); + free(binary.global_symbol_offsets); + free(binary.relocs); + free(binary.disasm_string); + variant->ref_count = 1; + return variant; +} + + +static struct radv_shader_variant * +radv_pipeline_compile(struct radv_pipeline *pipeline, + struct radv_pipeline_cache *cache, + struct radv_shader_module *module, + const char *entrypoint, + gl_shader_stage stage, + const VkSpecializationInfo *spec_info, + struct radv_pipeline_layout *layout, + const union ac_shader_variant_key *key, + bool dump) +{ + unsigned char sha1[20]; + struct radv_shader_variant *variant; + nir_shader *nir; + void *code = NULL; + unsigned code_size; + + if (module->nir) + _mesa_sha1_compute(module->nir->info.name, + strlen(module->nir->info.name), + module->sha1); + + radv_hash_shader(sha1, module, entrypoint, spec_info, layout, key); + + if (cache) { + variant = radv_create_shader_variant_from_pipeline_cache(pipeline->device, + cache, + sha1); + if (variant) + return variant; + } + + nir = radv_shader_compile_to_nir(pipeline->device, + module, entrypoint, stage, + spec_info, dump); + if (nir == NULL) + return NULL; + + variant = radv_shader_variant_create(pipeline->device, nir, layout, key, + &code, &code_size, dump); + if (!module->nir) + ralloc_free(nir); + + if (variant && cache) + variant = radv_pipeline_cache_insert_shader(cache, sha1, variant, + code, code_size); + + if (code) + free(code); + return variant; +} + +static uint32_t si_translate_blend_function(VkBlendOp op) +{ + switch (op) { + case VK_BLEND_OP_ADD: + return V_028780_COMB_DST_PLUS_SRC; + case VK_BLEND_OP_SUBTRACT: + return V_028780_COMB_SRC_MINUS_DST; + case VK_BLEND_OP_REVERSE_SUBTRACT: + return V_028780_COMB_DST_MINUS_SRC; + case VK_BLEND_OP_MIN: + return V_028780_COMB_MIN_DST_SRC; + case VK_BLEND_OP_MAX: + return V_028780_COMB_MAX_DST_SRC; + default: + return 0; + } +} + +static uint32_t si_translate_blend_factor(VkBlendFactor factor) +{ + switch (factor) { + case VK_BLEND_FACTOR_ZERO: + return V_028780_BLEND_ZERO; + case VK_BLEND_FACTOR_ONE: + return V_028780_BLEND_ONE; + case VK_BLEND_FACTOR_SRC_COLOR: + return V_028780_BLEND_SRC_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: + return V_028780_BLEND_ONE_MINUS_SRC_COLOR; + case VK_BLEND_FACTOR_DST_COLOR: + return V_028780_BLEND_DST_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: + return V_028780_BLEND_ONE_MINUS_DST_COLOR; + case VK_BLEND_FACTOR_SRC_ALPHA: + return V_028780_BLEND_SRC_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: + return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; + case VK_BLEND_FACTOR_DST_ALPHA: + return V_028780_BLEND_DST_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: + return V_028780_BLEND_ONE_MINUS_DST_ALPHA; + case VK_BLEND_FACTOR_CONSTANT_COLOR: + return V_028780_BLEND_CONSTANT_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: + return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; + case VK_BLEND_FACTOR_CONSTANT_ALPHA: + return V_028780_BLEND_CONSTANT_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: + return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; + case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: + return V_028780_BLEND_SRC_ALPHA_SATURATE; + case VK_BLEND_FACTOR_SRC1_COLOR: + return V_028780_BLEND_SRC1_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: + return V_028780_BLEND_INV_SRC1_COLOR; + case VK_BLEND_FACTOR_SRC1_ALPHA: + return V_028780_BLEND_SRC1_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: + return V_028780_BLEND_INV_SRC1_ALPHA; + default: + return 0; + } +} + +static bool 
is_dual_src(VkBlendFactor factor) +{ + switch (factor) { + case VK_BLEND_FACTOR_SRC1_COLOR: + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: + case VK_BLEND_FACTOR_SRC1_ALPHA: + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: + return true; + default: + return false; + } +} + +static unsigned si_choose_spi_color_format(VkFormat vk_format, + bool blend_enable, + bool blend_need_alpha) +{ + const struct vk_format_description *desc = vk_format_description(vk_format); + unsigned format, ntype, swap; + + /* Alpha is needed for alpha-to-coverage. + * Blending may be with or without alpha. + */ + unsigned normal = 0; /* most optimal, may not support blending or export alpha */ + unsigned alpha = 0; /* exports alpha, but may not support blending */ + unsigned blend = 0; /* supports blending, but may not export alpha */ + unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ + + format = radv_translate_colorformat(vk_format); + ntype = radv_translate_color_numformat(vk_format, desc, + vk_format_get_first_non_void_channel(vk_format)); + swap = radv_translate_colorswap(vk_format, false); + + /* Choose the SPI color formats. These are required values for Stoney/RB+. + * Other chips have multiple choices, though they are not necessarily better. + */ + switch (format) { + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_5_5_5_1: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_10_11_11: + case V_028C70_COLOR_11_11_10: + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_10_10_10_2: + case V_028C70_COLOR_2_10_10_10: + if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + case V_028C70_COLOR_16_16_16_16: + if (ntype == V_028C70_NUMBER_UNORM || + ntype == V_028C70_NUMBER_SNORM) { + /* UNORM16 and SNORM16 don't support blending */ + if (ntype == V_028C70_NUMBER_UNORM) + normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; + else + normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; + + /* Use 32 bits per channel for blending. 
*/ + if (format == V_028C70_COLOR_16) { + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = V_028714_SPI_SHADER_32_R; + blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else if (format == V_028C70_COLOR_16_16) { + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = V_028714_SPI_SHADER_32_GR; + blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else /* 16_16_16_16 */ + blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else if (ntype == V_028C70_NUMBER_FLOAT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + else + assert(0); + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = normal = V_028714_SPI_SHADER_32_R; + alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32: + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = normal = V_028714_SPI_SHADER_32_GR; + alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32_32_32: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_X24_8_32_FLOAT: + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + break; + + default: + unreachable("unhandled blend format"); + } + + if (blend_enable && blend_need_alpha) + return blend_alpha; + else if(blend_need_alpha) + return alpha; + else if(blend_enable) + return blend; + else + return normal; +} + +static unsigned si_get_cb_shader_mask(unsigned spi_shader_col_format) +{ + unsigned i, cb_shader_mask = 0; + + for (i = 0; i < 8; i++) { + switch ((spi_shader_col_format >> (i * 4)) & 0xf) { + case V_028714_SPI_SHADER_ZERO: + break; + case V_028714_SPI_SHADER_32_R: + cb_shader_mask |= 0x1 << (i * 4); + break; + case V_028714_SPI_SHADER_32_GR: + cb_shader_mask |= 0x3 << (i * 4); + break; + case V_028714_SPI_SHADER_32_AR: + cb_shader_mask |= 0x9 << (i * 4); + break; + case V_028714_SPI_SHADER_FP16_ABGR: + case V_028714_SPI_SHADER_UNORM16_ABGR: + case V_028714_SPI_SHADER_SNORM16_ABGR: + case V_028714_SPI_SHADER_UINT16_ABGR: + case V_028714_SPI_SHADER_SINT16_ABGR: + case V_028714_SPI_SHADER_32_ABGR: + cb_shader_mask |= 0xf << (i * 4); + break; + default: + assert(0); + } + } + return cb_shader_mask; +} + +static void +radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + uint32_t blend_enable, + uint32_t blend_need_alpha, + bool single_cb_enable) +{ + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; + struct radv_blend_state *blend = &pipeline->graphics.blend; + unsigned col_format = 0; + + for (unsigned i = 0; i < (single_cb_enable ? 
1 : subpass->color_count); ++i) { + struct radv_render_pass_attachment *attachment; + unsigned cf; + + attachment = pass->attachments + subpass->color_attachments[i].attachment; + + cf = si_choose_spi_color_format(attachment->format, + blend_enable & (1 << i), + blend_need_alpha & (1 << i)); + + col_format |= cf << (4 * i); + } + + blend->cb_shader_mask = si_get_cb_shader_mask(col_format); + + if (!col_format) + col_format |= V_028714_SPI_SHADER_32_R; + blend->spi_shader_col_format = col_format; +} + +static bool +format_is_int8(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + int channel = vk_format_get_first_non_void_channel(format); + + return channel >= 0 && desc->channel[channel].pure_integer && + desc->channel[channel].size == 8; +} + +unsigned radv_format_meta_fs_key(VkFormat format) +{ + unsigned col_format = si_choose_spi_color_format(format, false, false) - 1; + bool is_int8 = format_is_int8(format); + + return col_format + (is_int8 ? 3 : 0); +} + +static unsigned +radv_pipeline_compute_is_int8(const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; + unsigned is_int8 = 0; + + for (unsigned i = 0; i < subpass->color_count; ++i) { + struct radv_render_pass_attachment *attachment; + + attachment = pass->attachments + subpass->color_attachments[i].attachment; + + if (format_is_int8(attachment->format)) + is_int8 |= 1 << i; + } + + return is_int8; +} + +static void +radv_pipeline_init_blend_state(struct radv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_graphics_pipeline_create_info *extra) +{ + const VkPipelineColorBlendStateCreateInfo *vkblend = pCreateInfo->pColorBlendState; + struct radv_blend_state *blend = &pipeline->graphics.blend; + unsigned mode = V_028808_CB_NORMAL; + uint32_t blend_enable = 0, blend_need_alpha = 0; + int i; + bool single_cb_enable = false; + if (extra && extra->custom_blend_mode) { + single_cb_enable = true; + mode = extra->custom_blend_mode; + } + blend->cb_color_control = 0; + if (vkblend->logicOpEnable) + blend->cb_color_control |= S_028808_ROP3(vkblend->logicOp | (vkblend->logicOp << 4)); + else + blend->cb_color_control |= S_028808_ROP3(0xcc); + + blend->db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | + S_028B70_ALPHA_TO_MASK_OFFSET1(2) | + S_028B70_ALPHA_TO_MASK_OFFSET2(2) | + S_028B70_ALPHA_TO_MASK_OFFSET3(2); + + blend->cb_target_mask = 0; + for (i = 0; i < vkblend->attachmentCount; i++) { + const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i]; + unsigned blend_cntl = 0; + VkBlendOp eqRGB = att->colorBlendOp; + VkBlendFactor srcRGB = att->srcColorBlendFactor; + VkBlendFactor dstRGB = att->dstColorBlendFactor; + VkBlendOp eqA = att->alphaBlendOp; + VkBlendFactor srcA = att->srcAlphaBlendFactor; + VkBlendFactor dstA = att->dstAlphaBlendFactor; + + blend->sx_mrt0_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); + + if (!att->colorWriteMask) + continue; + + blend->cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i); + if (!att->blendEnable) { + blend->cb_blend_control[i] = blend_cntl; + continue; + } + + if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA)) + radv_finishme("dual source blending"); + if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) { + srcRGB = 
VK_BLEND_FACTOR_ONE; + dstRGB = VK_BLEND_FACTOR_ONE; + } + if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) { + srcA = VK_BLEND_FACTOR_ONE; + dstA = VK_BLEND_FACTOR_ONE; + } + + blend_cntl |= S_028780_ENABLE(1); + + blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); + blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); + blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); + if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { + blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); + blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); + blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); + blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); + } + blend->cb_blend_control[i] = blend_cntl; + + blend_enable |= 1 << i; + + if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || + dstRGB == VK_BLEND_FACTOR_SRC_ALPHA || + srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || + dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || + srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA || + dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) + blend_need_alpha |= 1 << i; + } + for (i = vkblend->attachmentCount; i < 8; i++) + blend->cb_blend_control[i] = 0; + + if (blend->cb_target_mask) + blend->cb_color_control |= S_028808_MODE(mode); + else + blend->cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE); + + radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, + blend_enable, blend_need_alpha, single_cb_enable); +} + +static uint32_t si_translate_stencil_op(enum VkStencilOp op) +{ + switch (op) { + case VK_STENCIL_OP_KEEP: + return V_02842C_STENCIL_KEEP; + case VK_STENCIL_OP_ZERO: + return V_02842C_STENCIL_ZERO; + case VK_STENCIL_OP_REPLACE: + return V_02842C_STENCIL_REPLACE_TEST; + case VK_STENCIL_OP_INCREMENT_AND_CLAMP: + return V_02842C_STENCIL_ADD_CLAMP; + case VK_STENCIL_OP_DECREMENT_AND_CLAMP: + return V_02842C_STENCIL_SUB_CLAMP; + case VK_STENCIL_OP_INVERT: + return V_02842C_STENCIL_INVERT; + case VK_STENCIL_OP_INCREMENT_AND_WRAP: + return V_02842C_STENCIL_ADD_WRAP; + case VK_STENCIL_OP_DECREMENT_AND_WRAP: + return V_02842C_STENCIL_SUB_WRAP; + default: + return 0; + } +} +static void +radv_pipeline_init_depth_stencil_state(struct radv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_graphics_pipeline_create_info *extra) +{ + const VkPipelineDepthStencilStateCreateInfo *vkds = pCreateInfo->pDepthStencilState; + struct radv_depth_stencil_state *ds = &pipeline->graphics.ds; + + memset(ds, 0, sizeof(*ds)); + if (!vkds) + return; + ds->db_depth_control = S_028800_Z_ENABLE(vkds->depthTestEnable ? 1 : 0) | + S_028800_Z_WRITE_ENABLE(vkds->depthWriteEnable ? 1 : 0) | + S_028800_ZFUNC(vkds->depthCompareOp) | + S_028800_DEPTH_BOUNDS_ENABLE(vkds->depthBoundsTestEnable ? 
1 : 0); + + if (vkds->stencilTestEnable) { + ds->db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1); + ds->db_depth_control |= S_028800_STENCILFUNC(vkds->front.compareOp); + ds->db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(vkds->front.failOp)); + ds->db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(vkds->front.passOp)); + ds->db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(vkds->front.depthFailOp)); + + ds->db_depth_control |= S_028800_STENCILFUNC_BF(vkds->back.compareOp); + ds->db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(vkds->back.failOp)); + ds->db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(vkds->back.passOp)); + ds->db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(vkds->back.depthFailOp)); + } + + if (extra) { + + ds->db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear); + ds->db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear); + + ds->db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->db_resummarize); + ds->db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->db_flush_depth_inplace); + ds->db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->db_flush_stencil_inplace); + ds->db_render_override2 |= S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(extra->db_depth_disable_expclear); + ds->db_render_override2 |= S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(extra->db_stencil_disable_expclear); + } +} + +static uint32_t si_translate_fill(VkPolygonMode func) +{ + switch(func) { + case VK_POLYGON_MODE_FILL: + return V_028814_X_DRAW_TRIANGLES; + case VK_POLYGON_MODE_LINE: + return V_028814_X_DRAW_LINES; + case VK_POLYGON_MODE_POINT: + return V_028814_X_DRAW_POINTS; + default: + assert(0); + return V_028814_X_DRAW_POINTS; + } +} +static void +radv_pipeline_init_raster_state(struct radv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + const VkPipelineRasterizationStateCreateInfo *vkraster = pCreateInfo->pRasterizationState; + struct radv_raster_state *raster = &pipeline->graphics.raster; + + memset(raster, 0, sizeof(*raster)); + + raster->spi_interp_control = + S_0286D4_FLAT_SHADE_ENA(1) | + S_0286D4_PNT_SPRITE_ENA(1) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | + S_0286D4_PNT_SPRITE_TOP_1(0); // vulkan is top to bottom - 1.0 at bottom + + raster->pa_cl_vs_out_cntl = S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1); + raster->pa_cl_clip_cntl = S_028810_PS_UCP_MODE(3) | + S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan uses DX conventions. + S_028810_ZCLIP_NEAR_DISABLE(vkraster->depthClampEnable ? 1 : 0) | + S_028810_ZCLIP_FAR_DISABLE(vkraster->depthClampEnable ? 1 : 0) | + S_028810_DX_RASTERIZATION_KILL(vkraster->rasterizerDiscardEnable ? 
1 : 0) | + S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); + + raster->pa_su_vtx_cntl = + S_028BE4_PIX_CENTER(1) | // TODO verify + S_028BE4_ROUND_MODE(V_028BE4_X_ROUND_TO_EVEN) | + S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH); + + raster->pa_su_sc_mode_cntl = + S_028814_FACE(vkraster->frontFace) | + S_028814_CULL_FRONT(!!(vkraster->cullMode & VK_CULL_MODE_FRONT_BIT)) | + S_028814_CULL_BACK(!!(vkraster->cullMode & VK_CULL_MODE_BACK_BIT)) | + S_028814_POLY_MODE(vkraster->polygonMode != VK_POLYGON_MODE_FILL) | + S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(vkraster->polygonMode)) | + S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(vkraster->polygonMode)) | + S_028814_POLY_OFFSET_FRONT_ENABLE(vkraster->depthBiasEnable ? 1 : 0) | + S_028814_POLY_OFFSET_BACK_ENABLE(vkraster->depthBiasEnable ? 1 : 0) | + S_028814_POLY_OFFSET_PARA_ENABLE(vkraster->depthBiasEnable ? 1 : 0); + +} + +static void +radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState; + struct radv_blend_state *blend = &pipeline->graphics.blend; + struct radv_multisample_state *ms = &pipeline->graphics.ms; + unsigned num_tile_pipes = pipeline->device->instance->physicalDevice.rad_info.num_tile_pipes; + int ps_iter_samples = 1; + uint32_t mask = 0xffff; + + ms->num_samples = vkms->rasterizationSamples; + ms->pa_sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + ms->pa_sc_aa_config = 0; + ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | + S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); + ms->pa_sc_mode_cntl_1 = + S_028A4C_WALK_FENCE_ENABLE(1) | //TODO linear dst fixes + S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | + /* always 1: */ + S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | + S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | + S_028A4C_TILE_WALK_ORDER_ENABLE(1) | + S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1); + + if (vkms->rasterizationSamples > 1) { + unsigned log_samples = util_logbase2(vkms->rasterizationSamples); + unsigned log_ps_iter_samples = util_logbase2(util_next_power_of_two(ps_iter_samples)); + ms->pa_sc_mode_cntl_0 = S_028A48_MSAA_ENABLE(1); + ms->pa_sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); /* CM_R_028BDC_PA_SC_LINE_CNTL */ + ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_samples) | + S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | + S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | + S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); + ms->pa_sc_aa_config |= S_028BE0_MSAA_NUM_SAMPLES(log_samples) | + S_028BE0_MAX_SAMPLE_DIST(radv_cayman_get_maxdist(log_samples)) | + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); /* CM_R_028BE0_PA_SC_AA_CONFIG */ + ms->pa_sc_mode_cntl_1 |= EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); + } + + if (vkms->alphaToCoverageEnable) + blend->db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1); + + if (vkms->pSampleMask) { + mask = vkms->pSampleMask[0] & 0xffff; + } + + ms->pa_sc_aa_mask[0] = mask | (mask << 16); + ms->pa_sc_aa_mask[1] = mask | (mask << 16); +} + +static uint32_t +si_translate_prim(enum VkPrimitiveTopology topology) +{ + switch (topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return V_008958_DI_PT_POINTLIST; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + return V_008958_DI_PT_LINELIST; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return V_008958_DI_PT_LINESTRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + return 
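A minimal sketch of the sample-count arithmetic used by the multisample setup above: the log2 of a power-of-two sample count (what util_logbase2 computes) and the 16-bit sample mask duplicated into both halves of PA_SC_AA_MASK. The helper name here is hypothetical.

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

/* Hypothetical helper mirroring util_logbase2() for the power-of-two
 * sample counts Vulkan allows (1, 2, 4, 8, ...). */
static unsigned log2_samples(unsigned samples)
{
        assert(samples != 0 && (samples & (samples - 1)) == 0);
        return __builtin_ctz(samples);
}

int main(void)
{
        unsigned samples = 8;
        uint32_t mask = 0xffff;            /* lower 16 bits of VkSampleMask */

        /* The register wants the 16-bit mask replicated into both halves. */
        uint32_t aa_mask = mask | (mask << 16);

        printf("log2(%u) = %u, aa_mask = 0x%08x\n",
               samples, log2_samples(samples), (unsigned)aa_mask);
        return 0;
}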
V_008958_DI_PT_TRILIST; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + return V_008958_DI_PT_TRISTRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return V_008958_DI_PT_TRIFAN; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + return V_008958_DI_PT_LINELIST_ADJ; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return V_008958_DI_PT_LINESTRIP_ADJ; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + return V_008958_DI_PT_TRILIST_ADJ; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return V_008958_DI_PT_TRISTRIP_ADJ; + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + return V_008958_DI_PT_PATCH; + default: + assert(0); + return 0; + } +} + +static uint32_t +si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology) +{ + switch (topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + return V_028A6C_OUTPRIM_TYPE_POINTLIST; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return V_028A6C_OUTPRIM_TYPE_LINESTRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return V_028A6C_OUTPRIM_TYPE_TRISTRIP; + default: + assert(0); + return 0; + } +} + +static unsigned si_map_swizzle(unsigned swizzle) +{ + switch (swizzle) { + case VK_SWIZZLE_Y: + return V_008F0C_SQ_SEL_Y; + case VK_SWIZZLE_Z: + return V_008F0C_SQ_SEL_Z; + case VK_SWIZZLE_W: + return V_008F0C_SQ_SEL_W; + case VK_SWIZZLE_0: + return V_008F0C_SQ_SEL_0; + case VK_SWIZZLE_1: + return V_008F0C_SQ_SEL_1; + default: /* VK_SWIZZLE_X */ + return V_008F0C_SQ_SEL_X; + } +} + +static void +radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline, + const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + radv_cmd_dirty_mask_t states = RADV_CMD_DIRTY_DYNAMIC_ALL; + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass]; + + pipeline->dynamic_state = default_dynamic_state; + + if (pCreateInfo->pDynamicState) { + /* Remove all of the states that are marked as dynamic */ + uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount; + for (uint32_t s = 0; s < count; s++) + states &= ~(1 << pCreateInfo->pDynamicState->pDynamicStates[s]); + } + + struct radv_dynamic_state *dynamic = &pipeline->dynamic_state; + + dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount; + if (states & (1 << VK_DYNAMIC_STATE_VIEWPORT)) { + typed_memcpy(dynamic->viewport.viewports, + pCreateInfo->pViewportState->pViewports, + pCreateInfo->pViewportState->viewportCount); + } + + dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount; + if (states & (1 << VK_DYNAMIC_STATE_SCISSOR)) { + typed_memcpy(dynamic->scissor.scissors, + pCreateInfo->pViewportState->pScissors, + pCreateInfo->pViewportState->scissorCount); + } + + if (states & (1 << VK_DYNAMIC_STATE_LINE_WIDTH)) { + assert(pCreateInfo->pRasterizationState); + dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth; + } + + if (states & (1 << VK_DYNAMIC_STATE_DEPTH_BIAS)) { + assert(pCreateInfo->pRasterizationState); + dynamic->depth_bias.bias = + pCreateInfo->pRasterizationState->depthBiasConstantFactor; + dynamic->depth_bias.clamp = + pCreateInfo->pRasterizationState->depthBiasClamp; + 
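The dynamic-state handling above starts from a mask of every baked state and clears the bits the application listed in pDynamicState. A small self-contained sketch of that pattern, with placeholder enum values standing in for VK_DYNAMIC_STATE_*:

#include <stdint.h>
#include <stdio.h>

enum { STATE_VIEWPORT, STATE_SCISSOR, STATE_LINE_WIDTH, STATE_COUNT };

/* Start with "everything is baked into the pipeline", then knock out
 * whatever the app marked as dynamic. */
static uint32_t remaining_static_states(const uint32_t *dynamic, uint32_t n)
{
        uint32_t states = (1u << STATE_COUNT) - 1;   /* analogous to DIRTY_ALL */
        for (uint32_t s = 0; s < n; s++)
                states &= ~(1u << dynamic[s]);
        return states;
}

int main(void)
{
        const uint32_t dyn[] = { STATE_VIEWPORT, STATE_LINE_WIDTH };
        printf("static mask = 0x%x\n", (unsigned)remaining_static_states(dyn, 2));
        return 0;
}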
dynamic->depth_bias.slope = + pCreateInfo->pRasterizationState->depthBiasSlopeFactor; + } + + if (states & (1 << VK_DYNAMIC_STATE_BLEND_CONSTANTS)) { + assert(pCreateInfo->pColorBlendState); + typed_memcpy(dynamic->blend_constants, + pCreateInfo->pColorBlendState->blendConstants, 4); + } + + /* If there is no depthstencil attachment, then don't read + * pDepthStencilState. The Vulkan spec states that pDepthStencilState may + * be NULL in this case. Even if pDepthStencilState is non-NULL, there is + * no need to override the depthstencil defaults in + * radv_pipeline::dynamic_state when there is no depthstencil attachment. + * + * From the Vulkan spec (20 Oct 2015, git-aa308cb): + * + * pDepthStencilState [...] may only be NULL if renderPass and subpass + * specify a subpass that has no depth/stencil attachment. + */ + if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { + if (states & (1 << VK_DYNAMIC_STATE_DEPTH_BOUNDS)) { + assert(pCreateInfo->pDepthStencilState); + dynamic->depth_bounds.min = + pCreateInfo->pDepthStencilState->minDepthBounds; + dynamic->depth_bounds.max = + pCreateInfo->pDepthStencilState->maxDepthBounds; + } + + if (states & (1 << VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK)) { + assert(pCreateInfo->pDepthStencilState); + dynamic->stencil_compare_mask.front = + pCreateInfo->pDepthStencilState->front.compareMask; + dynamic->stencil_compare_mask.back = + pCreateInfo->pDepthStencilState->back.compareMask; + } + + if (states & (1 << VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) { + assert(pCreateInfo->pDepthStencilState); + dynamic->stencil_write_mask.front = + pCreateInfo->pDepthStencilState->front.writeMask; + dynamic->stencil_write_mask.back = + pCreateInfo->pDepthStencilState->back.writeMask; + } + + if (states & (1 << VK_DYNAMIC_STATE_STENCIL_REFERENCE)) { + assert(pCreateInfo->pDepthStencilState); + dynamic->stencil_reference.front = + pCreateInfo->pDepthStencilState->front.reference; + dynamic->stencil_reference.back = + pCreateInfo->pDepthStencilState->back.reference; + } + } + + pipeline->dynamic_state_mask = states; +} + +static union ac_shader_variant_key +radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + union ac_shader_variant_key key; + const VkPipelineVertexInputStateCreateInfo *input_state = + pCreateInfo->pVertexInputState; + + memset(&key, 0, sizeof(key)); + key.vs.instance_rate_inputs = 0; + + for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) { + unsigned binding; + binding = input_state->pVertexAttributeDescriptions[i].binding; + if (input_state->pVertexBindingDescriptions[binding].inputRate) + key.vs.instance_rate_inputs |= 1u << input_state->pVertexAttributeDescriptions[i].location; + } + return key; +} + +VkResult +radv_pipeline_init(struct radv_pipeline *pipeline, + struct radv_device *device, + struct radv_pipeline_cache *cache, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_graphics_pipeline_create_info *extra, + const VkAllocationCallbacks *alloc) +{ + struct radv_shader_module fs_m = {0}; + + bool dump = getenv("RADV_DUMP_SHADERS"); + if (alloc == NULL) + alloc = &device->alloc; + + pipeline->device = device; + pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout); + + radv_pipeline_init_dynamic_state(pipeline, pCreateInfo); + const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, }; + struct radv_shader_module *modules[MESA_SHADER_STAGES] = { 0, }; + for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) { + gl_shader_stage 
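radv_compute_vs_key above reduces the vertex input state to a bitmask of attribute locations fed at instance rate. A stripped-down sketch with tiny stand-in structs instead of the Vulkan create-info types:

#include <stdint.h>
#include <stdio.h>

/* Minimal stand-ins: only the fields the bitmask needs. */
struct attr { uint32_t location; uint32_t binding; };
struct bind { uint32_t input_rate; };   /* 0 = per-vertex, 1 = per-instance */

static uint32_t instance_rate_inputs(const struct attr *attrs, unsigned n,
                                     const struct bind *binds)
{
        uint32_t mask = 0;
        for (unsigned i = 0; i < n; i++)
                if (binds[attrs[i].binding].input_rate)
                        mask |= 1u << attrs[i].location;
        return mask;
}

int main(void)
{
        const struct bind binds[2] = { {0}, {1} };
        const struct attr attrs[3] = { {0, 0}, {1, 1}, {2, 1} };
        printf("instance_rate_inputs = 0x%x\n",
               (unsigned)instance_rate_inputs(attrs, 3, binds));   /* 0x6 */
        return 0;
}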
stage = ffs(pCreateInfo->pStages[i].stage) - 1; + pStages[stage] = &pCreateInfo->pStages[i]; + modules[stage] = radv_shader_module_from_handle(pStages[stage]->module); + } + + radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra); + + /* */ + if (modules[MESA_SHADER_VERTEX]) { + union ac_shader_variant_key key = radv_compute_vs_key(pCreateInfo); + + pipeline->shaders[MESA_SHADER_VERTEX] = + radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_VERTEX], + pStages[MESA_SHADER_VERTEX]->pName, + MESA_SHADER_VERTEX, + pStages[MESA_SHADER_VERTEX]->pSpecializationInfo, + pipeline->layout, &key, dump); + + pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_VERTEX); + } + + if (!modules[MESA_SHADER_FRAGMENT]) { + nir_builder fs_b; + nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL); + fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "noop_fs"); + fs_m.nir = fs_b.shader; + modules[MESA_SHADER_FRAGMENT] = &fs_m; + } + + if (modules[MESA_SHADER_FRAGMENT]) { + union ac_shader_variant_key key; + key.fs.col_format = pipeline->graphics.blend.spi_shader_col_format; + key.fs.is_int8 = radv_pipeline_compute_is_int8(pCreateInfo); + + const VkPipelineShaderStageCreateInfo *stage = pStages[MESA_SHADER_FRAGMENT]; + + pipeline->shaders[MESA_SHADER_FRAGMENT] = + radv_pipeline_compile(pipeline, cache, modules[MESA_SHADER_FRAGMENT], + stage ? stage->pName : "main", + MESA_SHADER_FRAGMENT, + stage ? stage->pSpecializationInfo : NULL, + pipeline->layout, &key, dump); + pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_FRAGMENT); + } + + if (fs_m.nir) + ralloc_free(fs_m.nir); + + radv_pipeline_init_depth_stencil_state(pipeline, pCreateInfo, extra); + radv_pipeline_init_raster_state(pipeline, pCreateInfo); + radv_pipeline_init_multisample_state(pipeline, pCreateInfo); + pipeline->graphics.prim = si_translate_prim(pCreateInfo->pInputAssemblyState->topology); + pipeline->graphics.gs_out = si_conv_prim_to_gs_out(pCreateInfo->pInputAssemblyState->topology); + if (extra && extra->use_rectlist) { + pipeline->graphics.prim = V_008958_DI_PT_RECTLIST; + pipeline->graphics.gs_out = V_028A6C_OUTPRIM_TYPE_TRISTRIP; + } + pipeline->graphics.prim_restart_enable = !!pCreateInfo->pInputAssemblyState->primitiveRestartEnable; + + const VkPipelineVertexInputStateCreateInfo *vi_info = + pCreateInfo->pVertexInputState; + for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { + const VkVertexInputAttributeDescription *desc = + &vi_info->pVertexAttributeDescriptions[i]; + unsigned loc = desc->location; + const struct vk_format_description *format_desc; + int first_non_void; + uint32_t num_format, data_format; + format_desc = vk_format_description(desc->format); + first_non_void = vk_format_get_first_non_void_channel(desc->format); + + num_format = radv_translate_buffer_numformat(format_desc, first_non_void); + data_format = radv_translate_buffer_dataformat(format_desc, first_non_void); + + pipeline->va_rsrc_word3[loc] = S_008F0C_DST_SEL_X(si_map_swizzle(format_desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(format_desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(format_desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(format_desc->swizzle[3])) | + S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); + pipeline->va_format_size[loc] = format_desc->block.bits / 8; + pipeline->va_offset[loc] = desc->offset; + pipeline->va_binding[loc] = desc->binding; + pipeline->num_vertex_attribs = MAX2(pipeline->num_vertex_attribs, loc + 
1); + } + + for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { + const VkVertexInputBindingDescription *desc = + &vi_info->pVertexBindingDescriptions[i]; + + pipeline->binding_stride[desc->binding] = desc->stride; + } + + return VK_SUCCESS; +} + +VkResult +radv_graphics_pipeline_create( + VkDevice _device, + VkPipelineCache _cache, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_graphics_pipeline_create_info *extra, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); + struct radv_pipeline *pipeline; + VkResult result; + + pipeline = radv_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pipeline == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(pipeline, 0, sizeof(*pipeline)); + result = radv_pipeline_init(pipeline, device, cache, + pCreateInfo, extra, pAllocator); + if (result != VK_SUCCESS) { + radv_free2(&device->alloc, pAllocator, pipeline); + return result; + } + + *pPipeline = radv_pipeline_to_handle(pipeline); + + return VK_SUCCESS; +} + +VkResult radv_CreateGraphicsPipelines( + VkDevice _device, + VkPipelineCache pipelineCache, + uint32_t count, + const VkGraphicsPipelineCreateInfo* pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + VkResult result = VK_SUCCESS; + unsigned i = 0; + + for (; i < count; i++) { + result = radv_graphics_pipeline_create(_device, + pipelineCache, + &pCreateInfos[i], + NULL, pAllocator, &pPipelines[i]); + if (result != VK_SUCCESS) { + for (unsigned j = 0; j < i; j++) { + radv_DestroyPipeline(_device, pPipelines[j], pAllocator); + } + + return result; + } + } + + return VK_SUCCESS; +} + +static VkResult radv_compute_pipeline_create( + VkDevice _device, + VkPipelineCache _cache, + const VkComputePipelineCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipeline) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); + RADV_FROM_HANDLE(radv_shader_module, module, pCreateInfo->stage.module); + struct radv_pipeline *pipeline; + bool dump = getenv("RADV_DUMP_SHADERS"); + + pipeline = radv_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pipeline == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(pipeline, 0, sizeof(*pipeline)); + pipeline->device = device; + pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout); + + pipeline->shaders[MESA_SHADER_COMPUTE] = + radv_pipeline_compile(pipeline, cache, module, + pCreateInfo->stage.pName, + MESA_SHADER_COMPUTE, + pCreateInfo->stage.pSpecializationInfo, + pipeline->layout, NULL, dump); + + *pPipeline = radv_pipeline_to_handle(pipeline); + return VK_SUCCESS; +} +VkResult radv_CreateComputePipelines( + VkDevice _device, + VkPipelineCache pipelineCache, + uint32_t count, + const VkComputePipelineCreateInfo* pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + VkResult result = VK_SUCCESS; + + unsigned i = 0; + for (; i < count; i++) { + result = radv_compute_pipeline_create(_device, pipelineCache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + if (result != VK_SUCCESS) { + for (unsigned j = 0; j < i; j++) { + radv_DestroyPipeline(_device, pPipelines[j], pAllocator); + } + + return result; + } + } + + return VK_SUCCESS; +} diff --git 
a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c new file mode 100644 index 00000000000..032a7e46040 --- /dev/null +++ b/src/amd/vulkan/radv_pipeline_cache.c @@ -0,0 +1,475 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "util/mesa-sha1.h" +#include "util/debug.h" +#include "radv_private.h" + +#include "ac_nir_to_llvm.h" + +struct cache_entry { + unsigned char sha1[20]; + uint32_t code_size; + struct ac_shader_variant_info variant_info; + struct ac_shader_config config; + uint32_t rsrc1, rsrc2; + struct radv_shader_variant *variant; + uint32_t code[0]; +}; + +void +radv_pipeline_cache_init(struct radv_pipeline_cache *cache, + struct radv_device *device) +{ + cache->device = device; + pthread_mutex_init(&cache->mutex, NULL); + + cache->modified = false; + cache->kernel_count = 0; + cache->total_size = 0; + cache->table_size = 1024; + const size_t byte_size = cache->table_size * sizeof(cache->hash_table[0]); + cache->hash_table = malloc(byte_size); + + /* We don't consider allocation failure fatal, we just start with a 0-sized + * cache. 
*/ + if (cache->hash_table == NULL || + !env_var_as_boolean("RADV_ENABLE_PIPELINE_CACHE", true)) + cache->table_size = 0; + else + memset(cache->hash_table, 0, byte_size); +} + +void +radv_pipeline_cache_finish(struct radv_pipeline_cache *cache) +{ + for (unsigned i = 0; i < cache->table_size; ++i) + if (cache->hash_table[i]) { + if (cache->hash_table[i]->variant) + radv_shader_variant_destroy(cache->device, + cache->hash_table[i]->variant); + radv_free(&cache->alloc, cache->hash_table[i]); + } + pthread_mutex_destroy(&cache->mutex); + free(cache->hash_table); +} + +static uint32_t +entry_size(struct cache_entry *entry) +{ + return sizeof(*entry) + entry->code_size; +} + +void +radv_hash_shader(unsigned char *hash, struct radv_shader_module *module, + const char *entrypoint, + const VkSpecializationInfo *spec_info, + const struct radv_pipeline_layout *layout, + const union ac_shader_variant_key *key) +{ + struct mesa_sha1 *ctx; + + ctx = _mesa_sha1_init(); + if (key) + _mesa_sha1_update(ctx, key, sizeof(*key)); + _mesa_sha1_update(ctx, module->sha1, sizeof(module->sha1)); + _mesa_sha1_update(ctx, entrypoint, strlen(entrypoint)); + if (layout) + _mesa_sha1_update(ctx, layout->sha1, sizeof(layout->sha1)); + if (spec_info) { + _mesa_sha1_update(ctx, spec_info->pMapEntries, + spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]); + _mesa_sha1_update(ctx, spec_info->pData, spec_info->dataSize); + } + _mesa_sha1_final(ctx, hash); +} + + +static struct cache_entry * +radv_pipeline_cache_search_unlocked(struct radv_pipeline_cache *cache, + const unsigned char *sha1) +{ + const uint32_t mask = cache->table_size - 1; + const uint32_t start = (*(uint32_t *) sha1); + + for (uint32_t i = 0; i < cache->table_size; i++) { + const uint32_t index = (start + i) & mask; + struct cache_entry *entry = cache->hash_table[index]; + + if (!entry) + return NULL; + + if (memcmp(entry->sha1, sha1, sizeof(entry->sha1)) == 0) { + return entry; + } + } + + unreachable("hash table should never be full"); +} + +static struct cache_entry * +radv_pipeline_cache_search(struct radv_pipeline_cache *cache, + const unsigned char *sha1) +{ + struct cache_entry *entry; + + pthread_mutex_lock(&cache->mutex); + + entry = radv_pipeline_cache_search_unlocked(cache, sha1); + + pthread_mutex_unlock(&cache->mutex); + + return entry; +} + +struct radv_shader_variant * +radv_create_shader_variant_from_pipeline_cache(struct radv_device *device, + struct radv_pipeline_cache *cache, + const unsigned char *sha1) +{ + struct cache_entry *entry = radv_pipeline_cache_search(cache, sha1); + + if (!entry) + return NULL; + + if (!entry->variant) { + struct radv_shader_variant *variant; + + variant = calloc(1, sizeof(struct radv_shader_variant)); + if (!variant) + return NULL; + + variant->config = entry->config; + variant->info = entry->variant_info; + variant->rsrc1 = entry->rsrc1; + variant->rsrc2 = entry->rsrc2; + variant->ref_count = 1; + + variant->bo = device->ws->buffer_create(device->ws, entry->code_size, 256, + RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS); + + void *ptr = device->ws->buffer_map(variant->bo); + memcpy(ptr, entry->code, entry->code_size); + device->ws->buffer_unmap(variant->bo); + + entry->variant = variant; + } + + __sync_fetch_and_add(&entry->variant->ref_count, 1); + return entry->variant; +} + + +static void +radv_pipeline_cache_set_entry(struct radv_pipeline_cache *cache, + struct cache_entry *entry) +{ + const uint32_t mask = cache->table_size - 1; + const uint32_t start = (*(uint32_t *) entry->sha1); + + /* We'll 
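The cache lookup above is plain open addressing: the first four bytes of the SHA-1 pick the start slot, collisions probe linearly, and an empty slot ends the search (the table is kept at most half full, so probing terminates). A toy version of the same scheme:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define TABLE_SIZE 16   /* must stay a power of two for the & mask trick */

struct toy_entry { unsigned char sha1[20]; int value; };

static struct toy_entry *slots[TABLE_SIZE];

static struct toy_entry *toy_search(const unsigned char *sha1)
{
        uint32_t start;
        memcpy(&start, sha1, sizeof(start));   /* first 4 bytes choose the start slot */

        for (uint32_t i = 0; i < TABLE_SIZE; i++) {
                struct toy_entry *e = slots[(start + i) & (TABLE_SIZE - 1)];
                if (!e)
                        return NULL;           /* empty slot: key is not present */
                if (memcmp(e->sha1, sha1, 20) == 0)
                        return e;
        }
        return NULL;   /* unreachable while the table is kept half empty */
}

static void toy_insert(struct toy_entry *e)
{
        uint32_t start;
        memcpy(&start, e->sha1, sizeof(start));

        for (uint32_t i = 0; i < TABLE_SIZE; i++) {
                uint32_t idx = (start + i) & (TABLE_SIZE - 1);
                if (!slots[idx]) {
                        slots[idx] = e;
                        return;
                }
        }
}

int main(void)
{
        static struct toy_entry e = { { 0xab, 0xcd }, 42 };
        toy_insert(&e);
        struct toy_entry *found = toy_search(e.sha1);
        printf("found value %d\n", found ? found->value : -1);
        return 0;
}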
always be able to insert when we get here. */ + assert(cache->kernel_count < cache->table_size / 2); + + for (uint32_t i = 0; i < cache->table_size; i++) { + const uint32_t index = (start + i) & mask; + if (!cache->hash_table[index]) { + cache->hash_table[index] = entry; + break; + } + } + + cache->total_size += entry_size(entry); + cache->kernel_count++; +} + + +static VkResult +radv_pipeline_cache_grow(struct radv_pipeline_cache *cache) +{ + const uint32_t table_size = cache->table_size * 2; + const uint32_t old_table_size = cache->table_size; + const size_t byte_size = table_size * sizeof(cache->hash_table[0]); + struct cache_entry **table; + struct cache_entry **old_table = cache->hash_table; + + table = malloc(byte_size); + if (table == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + cache->hash_table = table; + cache->table_size = table_size; + cache->kernel_count = 0; + cache->total_size = 0; + + memset(cache->hash_table, 0, byte_size); + for (uint32_t i = 0; i < old_table_size; i++) { + struct cache_entry *entry = old_table[i]; + if (!entry) + continue; + + radv_pipeline_cache_set_entry(cache, entry); + } + + free(old_table); + + return VK_SUCCESS; +} + +static void +radv_pipeline_cache_add_entry(struct radv_pipeline_cache *cache, + struct cache_entry *entry) +{ + if (cache->kernel_count == cache->table_size / 2) + radv_pipeline_cache_grow(cache); + + /* Failing to grow that hash table isn't fatal, but may mean we don't + * have enough space to add this new kernel. Only add it if there's room. + */ + if (cache->kernel_count < cache->table_size / 2) + radv_pipeline_cache_set_entry(cache, entry); +} + +struct radv_shader_variant * +radv_pipeline_cache_insert_shader(struct radv_pipeline_cache *cache, + const unsigned char *sha1, + struct radv_shader_variant *variant, + const void *code, unsigned code_size) +{ + pthread_mutex_lock(&cache->mutex); + struct cache_entry *entry = radv_pipeline_cache_search_unlocked(cache, sha1); + if (entry) { + if (entry->variant) { + radv_shader_variant_destroy(cache->device, variant); + variant = entry->variant; + } else { + entry->variant = variant; + } + __sync_fetch_and_add(&variant->ref_count, 1); + pthread_mutex_unlock(&cache->mutex); + return variant; + } + + entry = radv_alloc(&cache->alloc, sizeof(*entry) + code_size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_CACHE); + if (!entry) { + pthread_mutex_unlock(&cache->mutex); + return variant; + } + + memcpy(entry->sha1, sha1, 20); + memcpy(entry->code, code, code_size); + entry->config = variant->config; + entry->variant_info = variant->info; + entry->rsrc1 = variant->rsrc1; + entry->rsrc2 = variant->rsrc2; + entry->code_size = code_size; + entry->variant = variant; + __sync_fetch_and_add(&variant->ref_count, 1); + + radv_pipeline_cache_add_entry(cache, entry); + + cache->modified = true; + pthread_mutex_unlock(&cache->mutex); + return variant; +} + +struct cache_header { + uint32_t header_size; + uint32_t header_version; + uint32_t vendor_id; + uint32_t device_id; + uint8_t uuid[VK_UUID_SIZE]; +}; +void +radv_pipeline_cache_load(struct radv_pipeline_cache *cache, + const void *data, size_t size) +{ + struct radv_device *device = cache->device; + struct cache_header header; + uint8_t uuid[VK_UUID_SIZE]; + + if (size < sizeof(header)) + return; + memcpy(&header, data, sizeof(header)); + if (header.header_size < sizeof(header)) + return; + if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE) + return; + if (header.vendor_id != 0x1002) + return; + if (header.device_id != 
device->instance->physicalDevice.rad_info.pci_id) + return; + radv_device_get_cache_uuid(uuid); + if (memcmp(header.uuid, uuid, VK_UUID_SIZE) != 0) + return; + + char *end = (void *) data + size; + char *p = (void *) data + header.header_size; + + while (end - p >= sizeof(struct cache_entry)) { + struct cache_entry *entry = (struct cache_entry*)p; + struct cache_entry *dest_entry; + if(end - p < sizeof(*entry) + entry->code_size) + break; + + dest_entry = radv_alloc(&cache->alloc, sizeof(*entry) + entry->code_size, + 8, VK_SYSTEM_ALLOCATION_SCOPE_CACHE); + if (dest_entry) { + memcpy(dest_entry, entry, sizeof(*entry) + entry->code_size); + dest_entry->variant = NULL; + radv_pipeline_cache_add_entry(cache, dest_entry); + } + p += sizeof (*entry) + entry->code_size; + } +} + +VkResult radv_CreatePipelineCache( + VkDevice _device, + const VkPipelineCacheCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineCache* pPipelineCache) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_pipeline_cache *cache; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO); + assert(pCreateInfo->flags == 0); + + cache = radv_alloc2(&device->alloc, pAllocator, + sizeof(*cache), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cache == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + if (pAllocator) + cache->alloc = *pAllocator; + else + cache->alloc = device->alloc; + + radv_pipeline_cache_init(cache, device); + + if (pCreateInfo->initialDataSize > 0) { + radv_pipeline_cache_load(cache, + pCreateInfo->pInitialData, + pCreateInfo->initialDataSize); + } + + *pPipelineCache = radv_pipeline_cache_to_handle(cache); + + return VK_SUCCESS; +} + +void radv_DestroyPipelineCache( + VkDevice _device, + VkPipelineCache _cache, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); + + if (!cache) + return; + radv_pipeline_cache_finish(cache); + + radv_free2(&device->alloc, pAllocator, cache); +} + +VkResult radv_GetPipelineCacheData( + VkDevice _device, + VkPipelineCache _cache, + size_t* pDataSize, + void* pData) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); + struct cache_header *header; + VkResult result = VK_SUCCESS; + const size_t size = sizeof(*header) + cache->total_size; + if (pData == NULL) { + *pDataSize = size; + return VK_SUCCESS; + } + if (*pDataSize < sizeof(*header)) { + *pDataSize = 0; + return VK_INCOMPLETE; + } + void *p = pData, *end = pData + *pDataSize; + header = p; + header->header_size = sizeof(*header); + header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE; + header->vendor_id = 0x1002; + header->device_id = device->instance->physicalDevice.rad_info.pci_id; + radv_device_get_cache_uuid(header->uuid); + p += header->header_size; + + struct cache_entry *entry; + for (uint32_t i = 0; i < cache->table_size; i++) { + if (!cache->hash_table[i]) + continue; + entry = cache->hash_table[i]; + const uint32_t size = entry_size(entry); + if (end < p + size) { + result = VK_INCOMPLETE; + break; + } + + memcpy(p, entry, size); + ((struct cache_entry*)p)->variant = NULL; + p += size; + } + *pDataSize = p - pData; + + return result; +} + +static void +radv_pipeline_cache_merge(struct radv_pipeline_cache *dst, + struct radv_pipeline_cache *src) +{ + for (uint32_t i = 0; i < src->table_size; i++) { + struct cache_entry *entry = src->hash_table[i]; + if (!entry || 
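radv_GetPipelineCacheData and radv_pipeline_cache_load agree on a blob layout of a Vulkan pipeline-cache header followed by packed entries, and loading only proceeds when the header matches the running device. A sketch of that validation; the device id and UUID contents below are examples only.

#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <stdio.h>

#define AMD_VENDOR_ID 0x1002
#define UUID_SIZE 16

struct blob_header {
        uint32_t header_size;
        uint32_t header_version;   /* VK_PIPELINE_CACHE_HEADER_VERSION_ONE == 1 */
        uint32_t vendor_id;
        uint32_t device_id;
        uint8_t  uuid[UUID_SIZE];
};

static bool blob_header_valid(const void *data, size_t size,
                              uint32_t device_id, const uint8_t *uuid)
{
        struct blob_header h;

        if (size < sizeof(h))
                return false;
        memcpy(&h, data, sizeof(h));

        return h.header_size >= sizeof(h) &&
               h.header_version == 1 &&
               h.vendor_id == AMD_VENDOR_ID &&
               h.device_id == device_id &&
               memcmp(h.uuid, uuid, UUID_SIZE) == 0;
}

int main(void)
{
        uint8_t uuid[UUID_SIZE] = { 1, 2, 3 };
        struct blob_header h = { sizeof(h), 1, AMD_VENDOR_ID, 0x67df, { 1, 2, 3 } };
        printf("valid: %d\n", blob_header_valid(&h, sizeof(h), 0x67df, uuid));
        return 0;
}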
radv_pipeline_cache_search(dst, entry->sha1)) + continue; + + radv_pipeline_cache_add_entry(dst, entry); + + src->hash_table[i] = NULL; + } +} + +VkResult radv_MergePipelineCaches( + VkDevice _device, + VkPipelineCache destCache, + uint32_t srcCacheCount, + const VkPipelineCache* pSrcCaches) +{ + RADV_FROM_HANDLE(radv_pipeline_cache, dst, destCache); + + for (uint32_t i = 0; i < srcCacheCount; i++) { + RADV_FROM_HANDLE(radv_pipeline_cache, src, pSrcCaches[i]); + + radv_pipeline_cache_merge(dst, src); + } + + return VK_SUCCESS; +} diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h new file mode 100644 index 00000000000..e738218fc15 --- /dev/null +++ b/src/amd/vulkan/radv_private.h @@ -0,0 +1,1402 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_VALGRIND +#include +#include +#define VG(x) x +#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x)) +#else +#define VG(x) +#endif + +#include +#include "radv_device_info.h" +#include "compiler/shader_enums.h" +#include "util/macros.h" +#include "util/list.h" +#include "main/macros.h" +#include "radv_radeon_winsys.h" +#include "ac_binary.h" +#include "ac_nir_to_llvm.h" +#include "radv_descriptor_set.h" + +#include + +/* Pre-declarations needed for WSI entrypoints */ +struct wl_surface; +struct wl_display; +typedef struct xcb_connection_t xcb_connection_t; +typedef uint32_t xcb_visualid_t; +typedef uint32_t xcb_window_t; + +#include +#include +#include + +#include "radv_entrypoints.h" + + +#define MAX_VBS 32 +#define MAX_VERTEX_ATTRIBS 32 +#define MAX_RTS 8 +#define MAX_VIEWPORTS 16 +#define MAX_SCISSORS 16 +#define MAX_PUSH_CONSTANTS_SIZE 128 +#define MAX_DYNAMIC_BUFFERS 16 +#define MAX_IMAGES 8 +#define MAX_SAMPLES_LOG2 4 /* SKL supports 16 samples */ +#define NUM_META_FS_KEYS 11 + +#define NUM_DEPTH_CLEAR_PIPELINES 3 + +#define radv_noreturn __attribute__((__noreturn__)) +#define radv_printflike(a, b) __attribute__((__format__(__printf__, a, b))) + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline uint32_t +align_u32_npot(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a * a; +} + +static inline uint64_t +align_u64(uint64_t v, uint64_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline int32_t +align_i32(int32_t v, int32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +/** Alignment must be a power of 2. */ +static inline bool +radv_is_aligned(uintmax_t n, uintmax_t a) +{ + assert(a == (a & -a)); + return (n & (a - 1)) == 0; +} + +static inline uint32_t +round_up_u32(uint32_t v, uint32_t a) +{ + return (v + a - 1) / a; +} + +static inline uint64_t +round_up_u64(uint64_t v, uint64_t a) +{ + return (v + a - 1) / a; +} + +static inline uint32_t +radv_minify(uint32_t n, uint32_t levels) +{ + if (unlikely(n == 0)) + return 0; + else + return MAX(n >> levels, 1); +} +static inline float +radv_clamp_f(float f, float min, float max) +{ + assert(min < max); + + if (f > max) + return max; + else if (f < min) + return min; + else + return f; +} + +static inline bool +radv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) +{ + if (*inout_mask & clear_mask) { + *inout_mask &= ~clear_mask; + return true; + } else { + return false; + } +} + +#define for_each_bit(b, dword) \ + for (uint32_t __dword = (dword); \ + (b) = __builtin_ffs(__dword) - 1, __dword; \ + __dword &= ~(1 << (b))) + +#define typed_memcpy(dest, src, count) ({ \ + static_assert(sizeof(*src) == sizeof(*dest), ""); \ + memcpy((dest), (src), (count) * sizeof(*(src))); \ + }) + +#define zero(x) (memset(&(x), 0, sizeof(x))) + +/* Define no kernel as 1, since that's an illegal offset for a kernel */ +#define NO_KERNEL 1 + +struct radv_common { + VkStructureType sType; + const void* pNext; +}; + +/* Whenever we generate an error, pass it through this function. Useful for + * debugging, where we can break on it. Only call at error site, not when + * propagating errors. Might be useful to plug in a stack trace here. + */ + +VkResult __vk_errorf(VkResult error, const char *file, int line, const char *format, ...); + +#ifdef DEBUG +#define vk_error(error) __vk_errorf(error, __FILE__, __LINE__, NULL); +#define vk_errorf(error, format, ...) __vk_errorf(error, __FILE__, __LINE__, format, ## __VA_ARGS__); +#else +#define vk_error(error) error +#define vk_errorf(error, format, ...) error +#endif + +void __radv_finishme(const char *file, int line, const char *format, ...) + radv_printflike(3, 4); +void radv_loge(const char *format, ...) radv_printflike(1, 2); +void radv_loge_v(const char *format, va_list va); + +/** + * Print a FINISHME message, including its source location. + */ +#define radv_finishme(format, ...) \ + __radv_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); + +/* A non-fatal assert. Useful for debugging. */ +#ifdef DEBUG +#define radv_assert(x) ({ \ + if (unlikely(!(x))) \ + fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \ + }) +#else +#define radv_assert(x) +#endif + +/** + * If a block of code is annotated with radv_validate, then the block runs only + * in debug builds. + */ +#ifdef DEBUG +#define radv_validate if (1) +#else +#define radv_validate if (0) +#endif + +void radv_abortf(const char *format, ...) 
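The align_u32/align_u64 helpers above rely on the standard power-of-two rounding identity: add a-1, then mask off the low bits. A quick standalone check:

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

/* Same rounding trick as align_u32(): only valid when a is a power of two. */
static uint32_t round_up_pow2(uint32_t v, uint32_t a)
{
        assert(a != 0 && (a & (a - 1)) == 0);
        return (v + a - 1) & ~(a - 1);
}

int main(void)
{
        printf("%u %u %u\n",
               (unsigned)round_up_pow2(13, 8),    /* 16 */
               (unsigned)round_up_pow2(16, 8),    /* already aligned stays 16 */
               (unsigned)round_up_pow2(1, 256));  /* 256 */
        return 0;
}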
radv_noreturn radv_printflike(1, 2); +void radv_abortfv(const char *format, va_list va) radv_noreturn; + +#define stub_return(v) \ + do { \ + radv_finishme("stub %s", __func__); \ + return (v); \ + } while (0) + +#define stub() \ + do { \ + radv_finishme("stub %s", __func__); \ + return; \ + } while (0) + +/** + * A dynamically growable, circular buffer. Elements are added at head and + * removed from tail. head and tail are free-running uint32_t indices and we + * only compute the modulo with size when accessing the array. This way, + * number of bytes in the queue is always head - tail, even in case of + * wraparound. + */ + +struct radv_vector { + uint32_t head; + uint32_t tail; + uint32_t element_size; + uint32_t size; + void *data; +}; + +int radv_vector_init(struct radv_vector *queue, uint32_t element_size, uint32_t size); +void *radv_vector_add(struct radv_vector *queue); +void *radv_vector_remove(struct radv_vector *queue); + +static inline int +radv_vector_length(struct radv_vector *queue) +{ + return (queue->head - queue->tail) / queue->element_size; +} + +static inline void * +radv_vector_head(struct radv_vector *vector) +{ + assert(vector->tail < vector->head); + return (void *)((char *)vector->data + + ((vector->head - vector->element_size) & + (vector->size - 1))); +} + +static inline void * +radv_vector_tail(struct radv_vector *vector) +{ + return (void *)((char *)vector->data + (vector->tail & (vector->size - 1))); +} + +static inline void +radv_vector_finish(struct radv_vector *queue) +{ + free(queue->data); +} + +#define radv_vector_foreach(elem, queue) \ + static_assert(__builtin_types_compatible_p(__typeof__(queue), struct radv_vector *), ""); \ + for (uint32_t __radv_vector_offset = (queue)->tail; \ + elem = (queue)->data + (__radv_vector_offset & ((queue)->size - 1)), __radv_vector_offset < (queue)->head; \ + __radv_vector_offset += (queue)->element_size) + +void *radv_resolve_entrypoint(uint32_t index); +void *radv_lookup_entrypoint(const char *name); + +extern struct radv_dispatch_table dtable; + +#define RADV_CALL(func) ({ \ + if (dtable.func == NULL) { \ + size_t idx = offsetof(struct radv_dispatch_table, func) / sizeof(void *); \ + dtable.entrypoints[idx] = radv_resolve_entrypoint(idx); \ + } \ + dtable.func; \ + }) + +static inline void * +radv_alloc(const VkAllocationCallbacks *alloc, + size_t size, size_t align, + VkSystemAllocationScope scope) +{ + return alloc->pfnAllocation(alloc->pUserData, size, align, scope); +} + +static inline void * +radv_realloc(const VkAllocationCallbacks *alloc, + void *ptr, size_t size, size_t align, + VkSystemAllocationScope scope) +{ + return alloc->pfnReallocation(alloc->pUserData, ptr, size, align, scope); +} + +static inline void +radv_free(const VkAllocationCallbacks *alloc, void *data) +{ + alloc->pfnFree(alloc->pUserData, data); +} + +static inline void * +radv_alloc2(const VkAllocationCallbacks *parent_alloc, + const VkAllocationCallbacks *alloc, + size_t size, size_t align, + VkSystemAllocationScope scope) +{ + if (alloc) + return radv_alloc(alloc, size, align, scope); + else + return radv_alloc(parent_alloc, size, align, scope); +} + +static inline void +radv_free2(const VkAllocationCallbacks *parent_alloc, + const VkAllocationCallbacks *alloc, + void *data) +{ + if (alloc) + radv_free(alloc, data); + else + radv_free(parent_alloc, data); +} + +struct radv_wsi_interaface; + +#define VK_ICD_WSI_PLATFORM_MAX 5 + +struct radv_physical_device { + VK_LOADER_DATA _loader_data; + + struct radv_instance * instance; + + struct 
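Why radv_vector can use free-running head/tail indices, as its comment describes: unsigned subtraction still yields the queued byte count across the 2^32 wrap, and masking with the power-of-two size recovers the slot offset. Illustrative numbers only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t size = 64;                      /* buffer size in bytes, power of two */
        uint32_t tail = UINT32_MAX - 8;          /* just before the 2^32 wrap */
        uint32_t head = tail + 24;               /* 24 bytes queued, index wraps past 0 */

        uint32_t queued = head - tail;           /* still 24, despite the wrap */
        uint32_t tail_slot = tail & (size - 1);  /* byte offset inside the buffer */

        printf("queued=%u bytes, tail offset=%u\n", (unsigned)queued, (unsigned)tail_slot);
        return 0;
}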
radeon_winsys *ws; + struct radeon_info rad_info; + uint32_t chipset_id; + char path[20]; + const char * name; + uint64_t aperture_size; + int cmd_parser_version; + uint32_t pci_vendor_id; + uint32_t pci_device_id; + + struct radv_wsi_interface * wsi[VK_ICD_WSI_PLATFORM_MAX]; +}; + +struct radv_instance { + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; + + uint32_t apiVersion; + int physicalDeviceCount; + struct radv_physical_device physicalDevice; +}; + +VkResult radv_init_wsi(struct radv_physical_device *physical_device); +void radv_finish_wsi(struct radv_physical_device *physical_device); + +struct cache_entry; + +struct radv_pipeline_cache { + struct radv_device * device; + pthread_mutex_t mutex; + + uint32_t total_size; + uint32_t table_size; + uint32_t kernel_count; + struct cache_entry ** hash_table; + bool modified; + + VkAllocationCallbacks alloc; +}; + +void +radv_pipeline_cache_init(struct radv_pipeline_cache *cache, + struct radv_device *device); +void +radv_pipeline_cache_finish(struct radv_pipeline_cache *cache); +void +radv_pipeline_cache_load(struct radv_pipeline_cache *cache, + const void *data, size_t size); + +struct radv_shader_variant * +radv_create_shader_variant_from_pipeline_cache(struct radv_device *device, + struct radv_pipeline_cache *cache, + const unsigned char *sha1); + +struct radv_shader_variant * +radv_pipeline_cache_insert_shader(struct radv_pipeline_cache *cache, + const unsigned char *sha1, + struct radv_shader_variant *variant, + const void *code, unsigned code_size); + +void radv_shader_variant_destroy(struct radv_device *device, + struct radv_shader_variant *variant); + +struct radv_meta_state { + VkAllocationCallbacks alloc; + + struct radv_pipeline_cache cache; + + /** + * Use array element `i` for images with `2^i` samples. + */ + struct { + VkRenderPass render_pass[NUM_META_FS_KEYS]; + struct radv_pipeline *color_pipelines[NUM_META_FS_KEYS]; + + VkRenderPass depth_only_rp[NUM_DEPTH_CLEAR_PIPELINES]; + struct radv_pipeline *depth_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; + VkRenderPass stencil_only_rp[NUM_DEPTH_CLEAR_PIPELINES]; + struct radv_pipeline *stencil_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; + VkRenderPass depthstencil_rp[NUM_DEPTH_CLEAR_PIPELINES]; + struct radv_pipeline *depthstencil_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; + } clear[1 + MAX_SAMPLES_LOG2]; + + struct { + VkRenderPass render_pass[NUM_META_FS_KEYS]; + + /** Pipeline that blits from a 1D image. */ + VkPipeline pipeline_1d_src[NUM_META_FS_KEYS]; + + /** Pipeline that blits from a 2D image. */ + VkPipeline pipeline_2d_src[NUM_META_FS_KEYS]; + + /** Pipeline that blits from a 3D image. 
*/ + VkPipeline pipeline_3d_src[NUM_META_FS_KEYS]; + + VkRenderPass depth_only_rp; + VkPipeline depth_only_1d_pipeline; + VkPipeline depth_only_2d_pipeline; + VkPipeline depth_only_3d_pipeline; + + VkRenderPass stencil_only_rp; + VkPipeline stencil_only_1d_pipeline; + VkPipeline stencil_only_2d_pipeline; + VkPipeline stencil_only_3d_pipeline; + VkPipelineLayout pipeline_layout; + VkDescriptorSetLayout ds_layout; + } blit; + + struct { + VkRenderPass render_passes[NUM_META_FS_KEYS]; + + VkPipelineLayout p_layouts[2]; + VkDescriptorSetLayout ds_layouts[2]; + VkPipeline pipelines[2][NUM_META_FS_KEYS]; + + VkRenderPass depth_only_rp; + VkPipeline depth_only_pipeline[2]; + + VkRenderPass stencil_only_rp; + VkPipeline stencil_only_pipeline[2]; + } blit2d; + + struct { + VkPipelineLayout img_p_layout; + VkDescriptorSetLayout img_ds_layout; + VkPipeline pipeline; + } itob; + struct { + VkRenderPass render_pass; + VkPipelineLayout img_p_layout; + VkDescriptorSetLayout img_ds_layout; + VkPipeline pipeline; + } btoi; + + struct { + VkPipeline pipeline; + VkRenderPass pass; + } resolve; + + struct { + VkDescriptorSetLayout ds_layout; + VkPipelineLayout p_layout; + struct { + VkPipeline pipeline; + VkPipeline i_pipeline; + } rc[MAX_SAMPLES_LOG2]; + } resolve_compute; + + struct { + VkPipeline decompress_pipeline; + VkPipeline resummarize_pipeline; + VkRenderPass pass; + } depth_decomp; + + struct { + VkPipeline cmask_eliminate_pipeline; + VkPipeline fmask_decompress_pipeline; + VkRenderPass pass; + } fast_clear_flush; + + struct { + VkPipelineLayout fill_p_layout; + VkPipelineLayout copy_p_layout; + VkDescriptorSetLayout fill_ds_layout; + VkDescriptorSetLayout copy_ds_layout; + VkPipeline fill_pipeline; + VkPipeline copy_pipeline; + } buffer; +}; + +struct radv_queue { + VK_LOADER_DATA _loader_data; + + struct radv_device * device; + + struct radv_state_pool * pool; +}; + +struct radv_device { + VK_LOADER_DATA _loader_data; + + VkAllocationCallbacks alloc; + + struct radv_instance * instance; + struct radeon_winsys *ws; + struct radeon_winsys_ctx *hw_ctx; + + struct radv_meta_state meta_state; + struct radv_queue queue; + struct radeon_winsys_cs *empty_cs; + + bool allow_fast_clears; + bool allow_dcc; + + /* MSAA sample locations. + * The first index is the sample index. + * The second index is the coordinate: X, Y. 
*/ + float sample_locations_1x[1][2]; + float sample_locations_2x[2][2]; + float sample_locations_4x[4][2]; + float sample_locations_8x[8][2]; + float sample_locations_16x[16][2]; +}; + +void radv_device_get_cache_uuid(void *uuid); + +struct radv_device_memory { + struct radeon_winsys_bo *bo; + uint32_t type_index; + VkDeviceSize map_size; + void * map; +}; + + +struct radv_descriptor_range { + uint64_t va; + uint32_t size; +}; + +struct radv_descriptor_set { + const struct radv_descriptor_set_layout *layout; + struct list_head descriptor_pool; + uint32_t size; + + struct radv_buffer_view *buffer_views; + struct radeon_winsys_bo *bo; + uint64_t va; + uint32_t *mapped_ptr; + struct radv_descriptor_range *dynamic_descriptors; + struct radeon_winsys_bo *descriptors[0]; +}; + +struct radv_descriptor_pool_free_node { + int next; + uint32_t offset; + uint32_t size; +}; + +struct radv_descriptor_pool { + struct list_head descriptor_sets; + + struct radeon_winsys_bo *bo; + uint8_t *mapped_ptr; + uint64_t current_offset; + uint64_t size; + + int free_list; + int full_list; + uint32_t max_sets; + struct radv_descriptor_pool_free_node free_nodes[]; +}; + +struct radv_buffer { + struct radv_device * device; + VkDeviceSize size; + + VkBufferUsageFlags usage; + + /* Set when bound */ + struct radeon_winsys_bo * bo; + VkDeviceSize offset; +}; + + +enum radv_cmd_dirty_bits { + RADV_CMD_DIRTY_DYNAMIC_VIEWPORT = 1 << 0, /* VK_DYNAMIC_STATE_VIEWPORT */ + RADV_CMD_DIRTY_DYNAMIC_SCISSOR = 1 << 1, /* VK_DYNAMIC_STATE_SCISSOR */ + RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 2, /* VK_DYNAMIC_STATE_LINE_WIDTH */ + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS = 1 << 3, /* VK_DYNAMIC_STATE_DEPTH_BIAS */ + RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS = 1 << 4, /* VK_DYNAMIC_STATE_BLEND_CONSTANTS */ + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS = 1 << 5, /* VK_DYNAMIC_STATE_DEPTH_BOUNDS */ + RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6, /* VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK */ + RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, /* VK_DYNAMIC_STATE_STENCIL_WRITE_MASK */ + RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 8, /* VK_DYNAMIC_STATE_STENCIL_REFERENCE */ + RADV_CMD_DIRTY_DYNAMIC_ALL = (1 << 9) - 1, + RADV_CMD_DIRTY_PIPELINE = 1 << 9, + RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 10, + RADV_CMD_DIRTY_RENDER_TARGETS = 1 << 11, +}; +typedef uint32_t radv_cmd_dirty_mask_t; + +enum radv_cmd_flush_bits { + RADV_CMD_FLAG_INV_ICACHE = 1 << 0, + /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */ + RADV_CMD_FLAG_INV_SMEM_L1 = 1 << 1, + /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */ + RADV_CMD_FLAG_INV_VMEM_L1 = 1 << 2, + /* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */ + RADV_CMD_FLAG_INV_GLOBAL_L2 = 1 << 3, + /* Framebuffer caches */ + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META = 1 << 4, + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META = 1 << 5, + RADV_CMD_FLAG_FLUSH_AND_INV_DB = 1 << 6, + RADV_CMD_FLAG_FLUSH_AND_INV_CB = 1 << 7, + /* Engine synchronization. 
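A dirty mask like radv_cmd_dirty_bits is typically drained by peeling off one set bit at a time (compare the for_each_bit macro defined earlier in this header). A small sketch with placeholder bit positions:

#include <stdio.h>

int main(void)
{
        /* e.g. viewport, depth bias and pipeline marked dirty */
        unsigned dirty = (1u << 0) | (1u << 3) | (1u << 9);

        while (dirty) {
                int bit = __builtin_ffs(dirty) - 1;   /* lowest set bit, 0-based */
                printf("re-emit state for dirty bit %d\n", bit);
                dirty &= dirty - 1;                   /* clear that bit */
        }
        return 0;
}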
*/ + RADV_CMD_FLAG_VS_PARTIAL_FLUSH = 1 << 8, + RADV_CMD_FLAG_PS_PARTIAL_FLUSH = 1 << 9, + RADV_CMD_FLAG_CS_PARTIAL_FLUSH = 1 << 10, + RADV_CMD_FLAG_VGT_FLUSH = 1 << 11, + + RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER = (RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | + RADV_CMD_FLAG_FLUSH_AND_INV_DB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) +}; + +struct radv_vertex_binding { + struct radv_buffer * buffer; + VkDeviceSize offset; +}; + +struct radv_dynamic_state { + struct { + uint32_t count; + VkViewport viewports[MAX_VIEWPORTS]; + } viewport; + + struct { + uint32_t count; + VkRect2D scissors[MAX_SCISSORS]; + } scissor; + + float line_width; + + struct { + float bias; + float clamp; + float slope; + } depth_bias; + + float blend_constants[4]; + + struct { + float min; + float max; + } depth_bounds; + + struct { + uint32_t front; + uint32_t back; + } stencil_compare_mask; + + struct { + uint32_t front; + uint32_t back; + } stencil_write_mask; + + struct { + uint32_t front; + uint32_t back; + } stencil_reference; +}; + +extern const struct radv_dynamic_state default_dynamic_state; + +void radv_dynamic_state_copy(struct radv_dynamic_state *dest, + const struct radv_dynamic_state *src, + uint32_t copy_mask); +/** + * Attachment state when recording a renderpass instance. + * + * The clear value is valid only if there exists a pending clear. + */ +struct radv_attachment_state { + VkImageAspectFlags pending_clear_aspects; + VkClearValue clear_value; + VkImageLayout current_layout; +}; + +struct radv_cmd_state { + uint32_t vb_dirty; + bool vertex_descriptors_dirty; + radv_cmd_dirty_mask_t dirty; + + struct radv_pipeline * pipeline; + struct radv_pipeline * emitted_pipeline; + struct radv_pipeline * compute_pipeline; + struct radv_pipeline * emitted_compute_pipeline; + struct radv_framebuffer * framebuffer; + struct radv_render_pass * pass; + const struct radv_subpass * subpass; + struct radv_dynamic_state dynamic; + struct radv_vertex_binding vertex_bindings[MAX_VBS]; + struct radv_descriptor_set * descriptors[MAX_SETS]; + struct radv_attachment_state * attachments; + VkRect2D render_area; + struct radv_buffer * index_buffer; + uint32_t index_type; + uint32_t index_offset; + uint32_t last_primitive_reset_index; + enum radv_cmd_flush_bits flush_bits; + unsigned active_occlusion_queries; + float offset_scale; +}; +struct radv_cmd_pool { + VkAllocationCallbacks alloc; + struct list_head cmd_buffers; +}; + +struct radv_cmd_buffer_upload { + uint8_t *map; + unsigned offset; + uint64_t size; + struct radeon_winsys_bo *upload_bo; + struct list_head list; +}; + +struct radv_cmd_buffer { + VK_LOADER_DATA _loader_data; + + struct radv_device * device; + + struct radv_cmd_pool * pool; + struct list_head pool_link; + + VkCommandBufferUsageFlags usage_flags; + VkCommandBufferLevel level; + struct radeon_winsys_cs *cs; + struct radv_cmd_state state; + + uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE]; + uint32_t dynamic_buffers[16 * MAX_DYNAMIC_BUFFERS]; + VkShaderStageFlags push_constant_stages; + + struct radv_cmd_buffer_upload upload; + + bool record_fail; +}; + +struct radv_image; + +void si_init_config(struct radv_physical_device *physical_device, + struct radv_cmd_buffer *cmd_buffer); +void si_write_viewport(struct radeon_winsys_cs *cs, int first_vp, + int count, const VkViewport *viewports); +void si_write_scissors(struct radeon_winsys_cs *cs, int first, + int count, const VkRect2D *scissors); +uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer); +void 
si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer); +void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer, + uint64_t src_va, uint64_t dest_va, + uint64_t size); +void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, + uint64_t size, unsigned value); +void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer); +void radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, + struct radv_descriptor_set *set, + unsigned idx); +bool +radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, + unsigned size, + unsigned alignment, + unsigned *out_offset, + void **ptr); +void +radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, + const struct radv_subpass *subpass, + bool transitions); +bool +radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, + unsigned size, unsigned alignmnet, + const void *data, unsigned *out_offset); +void +radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer); +void radv_cmd_buffer_clear_subpass(struct radv_cmd_buffer *cmd_buffer); +void radv_cmd_buffer_resolve_subpass(struct radv_cmd_buffer *cmd_buffer); +void radv_cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples); +unsigned radv_cayman_get_maxdist(int log_samples); +void radv_device_init_msaa(struct radv_device *device); +void radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkClearDepthStencilValue ds_clear_value, + VkImageAspectFlags aspects); +void radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + int idx, + uint32_t color_values[2]); +void radv_fill_buffer(struct radv_cmd_buffer *cmd_buffer, + struct radeon_winsys_bo *bo, + uint64_t offset, uint64_t size, uint32_t value); + +/* + * Takes x,y,z as exact numbers of invocations, instead of blocks. + * + * Limitations: Can't call normal dispatch functions without binding or rebinding + * the compute pipeline. 
+ */ +void radv_unaligned_dispatch( + struct radv_cmd_buffer *cmd_buffer, + uint32_t x, + uint32_t y, + uint32_t z); + +struct radv_event { + struct radeon_winsys_bo *bo; + uint64_t *map; +}; + +struct nir_shader; + +struct radv_shader_module { + struct nir_shader * nir; + unsigned char sha1[20]; + uint32_t size; + char data[0]; +}; + +union ac_shader_variant_key; + +void +radv_hash_shader(unsigned char *hash, struct radv_shader_module *module, + const char *entrypoint, + const VkSpecializationInfo *spec_info, + const struct radv_pipeline_layout *layout, + const union ac_shader_variant_key *key); + +static inline gl_shader_stage +vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage) +{ + assert(__builtin_popcount(vk_stage) == 1); + return ffs(vk_stage) - 1; +} + +static inline VkShaderStageFlagBits +mesa_to_vk_shader_stage(gl_shader_stage mesa_stage) +{ + return (1 << mesa_stage); +} + +#define RADV_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) + +#define radv_foreach_stage(stage, stage_bits) \ + for (gl_shader_stage stage, \ + __tmp = (gl_shader_stage)((stage_bits) & RADV_STAGE_MASK); \ + stage = __builtin_ffs(__tmp) - 1, __tmp; \ + __tmp &= ~(1 << (stage))) + +struct radv_shader_variant { + uint32_t ref_count; + + struct radeon_winsys_bo *bo; + struct ac_shader_config config; + struct ac_shader_variant_info info; + unsigned rsrc1; + unsigned rsrc2; +}; + +struct radv_depth_stencil_state { + uint32_t db_depth_control; + uint32_t db_stencil_control; + uint32_t db_render_control; + uint32_t db_render_override2; +}; + +struct radv_blend_state { + uint32_t cb_color_control; + uint32_t cb_target_mask; + uint32_t sx_mrt0_blend_opt[8]; + uint32_t cb_blend_control[8]; + + uint32_t spi_shader_col_format; + uint32_t cb_shader_mask; + uint32_t db_alpha_to_mask; +}; + +unsigned radv_format_meta_fs_key(VkFormat format); + +struct radv_raster_state { + uint32_t pa_cl_clip_cntl; + uint32_t pa_cl_vs_out_cntl; + uint32_t spi_interp_control; + uint32_t pa_su_point_size; + uint32_t pa_su_point_minmax; + uint32_t pa_su_line_cntl; + uint32_t pa_su_vtx_cntl; + uint32_t pa_su_sc_mode_cntl; +}; + +struct radv_multisample_state { + uint32_t db_eqaa; + uint32_t pa_sc_line_cntl; + uint32_t pa_sc_mode_cntl_0; + uint32_t pa_sc_mode_cntl_1; + uint32_t pa_sc_aa_config; + uint32_t pa_sc_aa_mask[2]; + unsigned num_samples; +}; + +struct radv_pipeline { + struct radv_device * device; + uint32_t dynamic_state_mask; + struct radv_dynamic_state dynamic_state; + + struct radv_pipeline_layout * layout; + + bool needs_data_cache; + + struct radv_shader_variant * shaders[MESA_SHADER_STAGES]; + VkShaderStageFlags active_stages; + + uint32_t va_rsrc_word3[MAX_VERTEX_ATTRIBS]; + uint32_t va_format_size[MAX_VERTEX_ATTRIBS]; + uint32_t va_binding[MAX_VERTEX_ATTRIBS]; + uint32_t va_offset[MAX_VERTEX_ATTRIBS]; + uint32_t num_vertex_attribs; + uint32_t binding_stride[MAX_VBS]; + + union { + struct { + struct radv_blend_state blend; + struct radv_depth_stencil_state ds; + struct radv_raster_state raster; + struct radv_multisample_state ms; + unsigned prim; + unsigned gs_out; + bool prim_restart_enable; + } graphics; + }; +}; + +struct radv_graphics_pipeline_create_info { + bool use_rectlist; + bool db_depth_clear; + bool db_stencil_clear; + bool db_depth_disable_expclear; + bool db_stencil_disable_expclear; + bool db_flush_depth_inplace; + bool db_flush_stencil_inplace; + bool db_resummarize; + uint32_t custom_blend_mode; +}; + +VkResult +radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, + struct 
radv_pipeline_cache *cache, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_graphics_pipeline_create_info *extra, + const VkAllocationCallbacks *alloc); + +VkResult +radv_graphics_pipeline_create(VkDevice device, + VkPipelineCache cache, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_graphics_pipeline_create_info *extra, + const VkAllocationCallbacks *alloc, + VkPipeline *pPipeline); + +struct vk_format_description; +uint32_t radv_translate_buffer_dataformat(const struct vk_format_description *desc, + int first_non_void); +uint32_t radv_translate_buffer_numformat(const struct vk_format_description *desc, + int first_non_void); +uint32_t radv_translate_colorformat(VkFormat format); +uint32_t radv_translate_color_numformat(VkFormat format, + const struct vk_format_description *desc, + int first_non_void); +uint32_t radv_colorformat_endian_swap(uint32_t colorformat); +unsigned radv_translate_colorswap(VkFormat format, bool do_endian_swap); +uint32_t radv_translate_dbformat(VkFormat format); +uint32_t radv_translate_tex_dataformat(VkFormat format, + const struct vk_format_description *desc, + int first_non_void); +uint32_t radv_translate_tex_numformat(VkFormat format, + const struct vk_format_description *desc, + int first_non_void); +bool radv_format_pack_clear_color(VkFormat format, + uint32_t clear_vals[2], + VkClearColorValue *value); +bool radv_is_colorbuffer_format_supported(VkFormat format, bool *blendable); + +struct radv_fmask_info { + uint64_t offset; + uint64_t size; + unsigned alignment; + unsigned pitch_in_pixels; + unsigned bank_height; + unsigned slice_tile_max; + unsigned tile_mode_index; +}; + +struct radv_cmask_info { + uint64_t offset; + uint64_t size; + unsigned alignment; + unsigned pitch; + unsigned height; + unsigned xalign; + unsigned yalign; + unsigned slice_tile_max; + unsigned base_address_reg; +}; + +struct r600_htile_info { + uint64_t offset; + uint64_t size; + unsigned pitch; + unsigned height; + unsigned xalign; + unsigned yalign; +}; + +struct radv_image { + VkImageType type; + /* The original VkFormat provided by the client. This may not match any + * of the actual surface formats. + */ + VkFormat vk_format; + VkImageAspectFlags aspects; + VkExtent3D extent; + uint32_t levels; + uint32_t array_size; + uint32_t samples; /**< VkImageCreateInfo::samples */ + VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. */ + VkImageTiling tiling; /** VkImageCreateInfo::tiling */ + + VkDeviceSize size; + uint32_t alignment; + + /* Set when bound */ + struct radeon_winsys_bo *bo; + VkDeviceSize offset; + uint32_t dcc_offset; + struct radeon_surf surface; + + struct radv_fmask_info fmask; + struct radv_cmask_info cmask; + uint32_t clear_value_offset; + + /* Depth buffer compression and fast clear. */ + struct r600_htile_info htile; +}; + +bool radv_layout_has_htile(const struct radv_image *image, + VkImageLayout layout); +bool radv_layout_is_htile_compressed(const struct radv_image *image, + VkImageLayout layout); +bool radv_layout_can_expclear(const struct radv_image *image, + VkImageLayout layout); +bool radv_layout_has_cmask(const struct radv_image *image, + VkImageLayout layout); +static inline uint32_t +radv_get_layerCount(const struct radv_image *image, + const VkImageSubresourceRange *range) +{ + return range->layerCount == VK_REMAINING_ARRAY_LAYERS ? 
+ image->array_size - range->baseArrayLayer : range->layerCount; +} + +static inline uint32_t +radv_get_levelCount(const struct radv_image *image, + const VkImageSubresourceRange *range) +{ + return range->levelCount == VK_REMAINING_MIP_LEVELS ? + image->levels - range->baseMipLevel : range->levelCount; +} + +struct radeon_bo_metadata; +void +radv_init_metadata(struct radv_device *device, + struct radv_image *image, + struct radeon_bo_metadata *metadata); + +struct radv_image_view { + struct radv_image *image; /**< VkImageViewCreateInfo::image */ + struct radeon_winsys_bo *bo; + + VkImageViewType type; + VkImageAspectFlags aspect_mask; + VkFormat vk_format; + uint32_t base_layer; + uint32_t layer_count; + uint32_t base_mip; + VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */ + + uint32_t descriptor[8]; + uint32_t fmask_descriptor[8]; +}; + +struct radv_image_create_info { + const VkImageCreateInfo *vk_info; + uint32_t stride; + bool scanout; +}; + +VkResult radv_image_create(VkDevice _device, + const struct radv_image_create_info *info, + const VkAllocationCallbacks* alloc, + VkImage *pImage); + +void radv_image_view_init(struct radv_image_view *view, + struct radv_device *device, + const VkImageViewCreateInfo* pCreateInfo, + struct radv_cmd_buffer *cmd_buffer, + VkImageUsageFlags usage_mask); +void radv_image_set_optimal_micro_tile_mode(struct radv_device *device, + struct radv_image *image, uint32_t micro_tile_mode); +struct radv_buffer_view { + struct radeon_winsys_bo *bo; + VkFormat vk_format; + uint64_t range; /**< VkBufferViewCreateInfo::range */ + uint32_t state[4]; +}; +void radv_buffer_view_init(struct radv_buffer_view *view, + struct radv_device *device, + const VkBufferViewCreateInfo* pCreateInfo, + struct radv_cmd_buffer *cmd_buffer); + +static inline struct VkExtent3D +radv_sanitize_image_extent(const VkImageType imageType, + const struct VkExtent3D imageExtent) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkExtent3D) { imageExtent.width, 1, 1 }; + case VK_IMAGE_TYPE_2D: + return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 }; + case VK_IMAGE_TYPE_3D: + return imageExtent; + default: + unreachable("invalid image type"); + } +} + +static inline struct VkOffset3D +radv_sanitize_image_offset(const VkImageType imageType, + const struct VkOffset3D imageOffset) +{ + switch (imageType) { + case VK_IMAGE_TYPE_1D: + return (VkOffset3D) { imageOffset.x, 0, 0 }; + case VK_IMAGE_TYPE_2D: + return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 }; + case VK_IMAGE_TYPE_3D: + return imageOffset; + default: + unreachable("invalid image type"); + } +} + +struct radv_sampler { + uint32_t state[4]; +}; + +struct radv_color_buffer_info { + uint32_t cb_color_base; + uint32_t cb_color_pitch; + uint32_t cb_color_slice; + uint32_t cb_color_view; + uint32_t cb_color_info; + uint32_t cb_color_attrib; + uint32_t cb_dcc_control; + uint32_t cb_color_cmask; + uint32_t cb_color_cmask_slice; + uint32_t cb_color_fmask; + uint32_t cb_color_fmask_slice; + uint32_t cb_clear_value0; + uint32_t cb_clear_value1; + uint32_t cb_dcc_base; + uint32_t micro_tile_mode; +}; + +struct radv_ds_buffer_info { + uint32_t db_depth_info; + uint32_t db_z_info; + uint32_t db_stencil_info; + uint32_t db_z_read_base; + uint32_t db_stencil_read_base; + uint32_t db_z_write_base; + uint32_t db_stencil_write_base; + uint32_t db_depth_view; + uint32_t db_depth_size; + uint32_t db_depth_slice; + uint32_t db_htile_surface; + uint32_t db_htile_data_base; + uint32_t 
pa_su_poly_offset_db_fmt_cntl; + float offset_scale; +}; + +struct radv_attachment_info { + union { + struct radv_color_buffer_info cb; + struct radv_ds_buffer_info ds; + }; + struct radv_image_view *attachment; +}; + +struct radv_framebuffer { + uint32_t width; + uint32_t height; + uint32_t layers; + + uint32_t attachment_count; + struct radv_attachment_info attachments[0]; +}; + +struct radv_subpass_barrier { + VkPipelineStageFlags src_stage_mask; + VkAccessFlags src_access_mask; + VkAccessFlags dst_access_mask; +}; + +struct radv_subpass { + uint32_t input_count; + VkAttachmentReference * input_attachments; + uint32_t color_count; + VkAttachmentReference * color_attachments; + VkAttachmentReference * resolve_attachments; + VkAttachmentReference depth_stencil_attachment; + + /** Subpass has at least one resolve attachment */ + bool has_resolve; + + struct radv_subpass_barrier start_barrier; +}; + +struct radv_render_pass_attachment { + VkFormat format; + uint32_t samples; + VkAttachmentLoadOp load_op; + VkAttachmentLoadOp stencil_load_op; + VkImageLayout initial_layout; + VkImageLayout final_layout; +}; + +struct radv_render_pass { + uint32_t attachment_count; + uint32_t subpass_count; + VkAttachmentReference * subpass_attachments; + struct radv_render_pass_attachment * attachments; + struct radv_subpass_barrier end_barrier; + struct radv_subpass subpasses[0]; +}; + +VkResult radv_device_init_meta(struct radv_device *device); +void radv_device_finish_meta(struct radv_device *device); + +struct radv_query_pool { + struct radeon_winsys_bo *bo; + uint32_t stride; + uint32_t availability_offset; + char *ptr; + VkQueryType type; +}; + +VkResult +radv_temp_descriptor_set_create(struct radv_device *device, + struct radv_cmd_buffer *cmd_buffer, + VkDescriptorSetLayout _layout, + VkDescriptorSet *_set); + +void +radv_temp_descriptor_set_destroy(struct radv_device *device, + VkDescriptorSet _set); +void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, uint32_t value); +void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, uint32_t value); +#define RADV_DEFINE_HANDLE_CASTS(__radv_type, __VkType) \ + \ + static inline struct __radv_type * \ + __radv_type ## _from_handle(__VkType _handle) \ + { \ + return (struct __radv_type *) _handle; \ + } \ + \ + static inline __VkType \ + __radv_type ## _to_handle(struct __radv_type *_obj) \ + { \ + return (__VkType) _obj; \ + } + +#define RADV_DEFINE_NONDISP_HANDLE_CASTS(__radv_type, __VkType) \ + \ + static inline struct __radv_type * \ + __radv_type ## _from_handle(__VkType _handle) \ + { \ + return (struct __radv_type *)(uintptr_t) _handle; \ + } \ + \ + static inline __VkType \ + __radv_type ## _to_handle(struct __radv_type *_obj) \ + { \ + return (__VkType)(uintptr_t) _obj; \ + } + +#define RADV_FROM_HANDLE(__radv_type, __name, __handle) \ + struct __radv_type *__name = __radv_type ## _from_handle(__handle) + +RADV_DEFINE_HANDLE_CASTS(radv_cmd_buffer, VkCommandBuffer) +RADV_DEFINE_HANDLE_CASTS(radv_device, VkDevice) +RADV_DEFINE_HANDLE_CASTS(radv_instance, VkInstance) +RADV_DEFINE_HANDLE_CASTS(radv_physical_device, VkPhysicalDevice) +RADV_DEFINE_HANDLE_CASTS(radv_queue, VkQueue) + +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_cmd_pool, VkCommandPool) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_buffer, VkBuffer) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_buffer_view, VkBufferView) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_pool, VkDescriptorPool) 
+RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_set, VkDescriptorSet) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_set_layout, VkDescriptorSetLayout) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_device_memory, VkDeviceMemory) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_fence, VkFence) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_event, VkEvent) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_framebuffer, VkFramebuffer) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_image, VkImage) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_image_view, VkImageView) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_cache, VkPipelineCache) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline, VkPipeline) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_layout, VkPipelineLayout) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_query_pool, VkQueryPool) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_render_pass, VkRenderPass) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_sampler, VkSampler) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_shader_module, VkShaderModule) + +#define RADV_DEFINE_STRUCT_CASTS(__radv_type, __VkType) \ + \ + static inline const __VkType * \ + __radv_type ## _to_ ## __VkType(const struct __radv_type *__radv_obj) \ + { \ + return (const __VkType *) __radv_obj; \ + } + +#define RADV_COMMON_TO_STRUCT(__VkType, __vk_name, __common_name) \ + const __VkType *__vk_name = radv_common_to_ ## __VkType(__common_name) + +RADV_DEFINE_STRUCT_CASTS(radv_common, VkMemoryBarrier) +RADV_DEFINE_STRUCT_CASTS(radv_common, VkBufferMemoryBarrier) +RADV_DEFINE_STRUCT_CASTS(radv_common, VkImageMemoryBarrier) + + diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c new file mode 100644 index 00000000000..f60c10a3a8c --- /dev/null +++ b/src/amd/vulkan/radv_query.c @@ -0,0 +1,415 @@ +/* + * Copyright 2016 Red Hat Inc. + * Based on anv: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include "radv_private.h" +#include "radv_cs.h" +#include "sid.h" + +static unsigned get_max_db(struct radv_device *device) +{ + unsigned num_db = device->instance->physicalDevice.rad_info.num_render_backends; + unsigned rb_mask = device->instance->physicalDevice.rad_info.enabled_rb_mask; + + if (device->instance->physicalDevice.rad_info.chip_class == SI) + num_db = 8; + else + num_db = MAX2(8, num_db); + + /* Otherwise we need to change the query reset procedure */ + assert(rb_mask == ((1ull << num_db) - 1)); + + return num_db; +} + +VkResult radv_CreateQueryPool( + VkDevice _device, + const VkQueryPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkQueryPool* pQueryPool) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + uint64_t size; + struct radv_query_pool *pool = radv_alloc2(&device->alloc, pAllocator, + sizeof(*pool), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!pool) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + + switch(pCreateInfo->queryType) { + case VK_QUERY_TYPE_OCCLUSION: + /* 16 bytes tmp. buffer as the compute packet writes 64 bits, but + * the app. may have 32 bits of space. */ + pool->stride = 16 * get_max_db(device) + 16; + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + pool->stride = 16 * 11; + break; + case VK_QUERY_TYPE_TIMESTAMP: + pool->stride = 8; + break; + default: + unreachable("creating unhandled query type"); + } + + pool->type = pCreateInfo->queryType; + pool->availability_offset = pool->stride * pCreateInfo->queryCount; + size = pool->availability_offset + 4 * pCreateInfo->queryCount; + + pool->bo = device->ws->buffer_create(device->ws, size, + 64, RADEON_DOMAIN_GTT, 0); + + if (!pool->bo) { + radv_free2(&device->alloc, pAllocator, pool); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + pool->ptr = device->ws->buffer_map(pool->bo); + + if (!pool->ptr) { + device->ws->buffer_destroy(pool->bo); + radv_free2(&device->alloc, pAllocator, pool); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + memset(pool->ptr, 0, size); + + *pQueryPool = radv_query_pool_to_handle(pool); + return VK_SUCCESS; +} + +void radv_DestroyQueryPool( + VkDevice _device, + VkQueryPool _pool, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_query_pool, pool, _pool); + + if (!pool) + return; + + device->ws->buffer_destroy(pool->bo); + radv_free2(&device->alloc, pAllocator, pool); +} + +VkResult radv_GetQueryPoolResults( + VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + size_t dataSize, + void* pData, + VkDeviceSize stride, + VkQueryResultFlags flags) +{ + RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); + char *data = pData; + VkResult result = VK_SUCCESS; + + for(unsigned i = 0; i < queryCount; ++i, data += stride) { + char *dest = data; + unsigned query = firstQuery + i; + char *src = pool->ptr + query * pool->stride; + uint32_t available; + + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query)) + ; + } + + if (!*(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query) && + !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) { + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + *(uint32_t*)dest = 0; + result = VK_NOT_READY; + continue; + + } + + available = *(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query); + switch (pool->type) { + case VK_QUERY_TYPE_TIMESTAMP: + if (flags & VK_QUERY_RESULT_64_BIT) { + 
*(uint64_t*)dest = *(uint64_t*)src; + dest += 8; + } else { + *(uint32_t*)dest = *(uint32_t*)src; + dest += 4; + } + break; + case VK_QUERY_TYPE_OCCLUSION: { + uint64_t result = *(uint64_t*)(src + pool->stride - 16); + + if (flags & VK_QUERY_RESULT_64_BIT) { + *(uint64_t*)dest = result; + dest += 8; + } else { + *(uint32_t*)dest = result; + dest += 4; + } + break; + default: + unreachable("trying to get results of unhandled query type"); + } + } + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + *(uint32_t*)dest = available; + dest += 4; + } + } + + return result; +} + +void radv_CmdCopyQueryPoolResults( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize stride, + VkQueryResultFlags flags) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); + RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo); + uint64_t dest_va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo); + dest_va += dst_buffer->offset + dstOffset; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8); + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, dst_buffer->bo, 8); + + for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) { + unsigned query = firstQuery + i; + uint64_t local_src_va = va + query * pool->stride; + unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4; + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 26); + + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + /* TODO, not sure if there is any case where we won't always be ready yet */ + uint64_t avail_va = va + pool->availability_offset + 4 * query; + + + /* This waits on the ME. All copies below are done on the ME */ + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); + radeon_emit(cs, avail_va); + radeon_emit(cs, avail_va >> 32); + radeon_emit(cs, 1); /* reference value */ + radeon_emit(cs, 0xffffffff); /* mask */ + radeon_emit(cs, 4); /* poll interval */ + } + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + local_src_va += pool->stride - 16; + + case VK_QUERY_TYPE_TIMESTAMP: + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_MEM) | + ((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0)); + radeon_emit(cs, local_src_va); + radeon_emit(cs, local_src_va >> 32); + radeon_emit(cs, dest_va); + radeon_emit(cs, dest_va >> 32); + break; + default: + unreachable("trying to get results of unhandled query type"); + } + + /* The flag could be still changed while the data copy is busy and we + * then might have invalid data, but a ready flag. However, the availability + * writes happen on the ME too, so they should be synchronized. Might need to + * revisit this with multiple queues. 
+ */ + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + uint64_t avail_va = va + pool->availability_offset + 4 * query; + uint64_t avail_dest_va = dest_va; + if (pool->type != VK_QUERY_TYPE_PIPELINE_STATISTICS) + avail_dest_va += elem_size; + else + abort(); + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_MEM)); + radeon_emit(cs, avail_va); + radeon_emit(cs, avail_va >> 32); + radeon_emit(cs, avail_dest_va); + radeon_emit(cs, avail_dest_va >> 32); + } + + assert(cs->cdw <= cdw_max); + } + +} + +void radv_CmdResetQueryPool( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); + uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo); + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8); + + si_cp_dma_clear_buffer(cmd_buffer, va + firstQuery * pool->stride, + queryCount * pool->stride, 0); + si_cp_dma_clear_buffer(cmd_buffer, va + pool->availability_offset + firstQuery * 4, + queryCount * 4, 0); +} + +void radv_CmdBeginQuery( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo); + va += pool->stride * query; + + cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + radeon_check_space(cmd_buffer->device->ws, cs, 7); + + ++cmd_buffer->state.active_occlusion_queries; + if (cmd_buffer->state.active_occlusion_queries == 1) + radv_set_db_count_control(cmd_buffer); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; + default: + unreachable("beginning unhandled query type"); + } +} + + +void radv_CmdEndQuery( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo); + uint64_t avail_va = va + pool->availability_offset + 4 * query; + va += pool->stride * query; + + cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + radeon_check_space(cmd_buffer->device->ws, cs, 14); + + cmd_buffer->state.active_occlusion_queries--; + if (cmd_buffer->state.active_occlusion_queries == 0) + radv_set_db_count_control(cmd_buffer); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va + 8); + radeon_emit(cs, (va + 8) >> 32); + + radeon_emit(cs, PKT3(PKT3_OCCLUSION_QUERY, 3, 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, va + pool->stride - 16); + radeon_emit(cs, (va + pool->stride - 16) >> 32); + + break; + default: + unreachable("ending unhandled query type"); + } + + radeon_check_space(cmd_buffer->device->ws, cs, 5); + + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | + S_370_WR_CONFIRM(1) | + 
S_370_ENGINE_SEL(V_370_ME)); + radeon_emit(cs, avail_va); + radeon_emit(cs, avail_va >> 32); + radeon_emit(cs, 1); +} + +void radv_CmdWriteTimestamp( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkQueryPool queryPool, + uint32_t query) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo); + uint64_t avail_va = va + pool->availability_offset + 4 * query; + uint64_t query_va = va + pool->stride * query; + + cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 5); + + unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 11); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5)); + radeon_emit(cs, query_va); + radeon_emit(cs, (3 << 29) | ((query_va >> 32) & 0xFFFF)); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_ME)); + radeon_emit(cs, avail_va); + radeon_emit(cs, avail_va >> 32); + radeon_emit(cs, 1); + + assert(cmd_buffer->cs->cdw <= cdw_max); +} diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h new file mode 100644 index 00000000000..29a4ee3e2db --- /dev/null +++ b/src/amd/vulkan/radv_radeon_winsys.h @@ -0,0 +1,336 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Based on radeon_winsys.h which is: + * Copyright 2008 Corbin Simpson + * Copyright 2010 Marek Olšák + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once + +#include +#include +#include +#include "main/macros.h" +#include "amd_family.h" + +#define FREE(x) free(x) + +enum radeon_bo_domain { /* bitfield */ + RADEON_DOMAIN_GTT = 2, + RADEON_DOMAIN_VRAM = 4, + RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT +}; + +enum radeon_bo_flag { /* bitfield */ + RADEON_FLAG_GTT_WC = (1 << 0), + RADEON_FLAG_CPU_ACCESS = (1 << 1), + RADEON_FLAG_NO_CPU_ACCESS = (1 << 2), +}; + +enum radeon_bo_usage { /* bitfield */ + RADEON_USAGE_READ = 2, + RADEON_USAGE_WRITE = 4, + RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE +}; + +enum ring_type { + RING_GFX = 0, + RING_COMPUTE, + RING_DMA, + RING_UVD, + RING_VCE, + RING_LAST, +}; + +struct radeon_winsys_cs { + unsigned cdw; /* Number of used dwords. */ + unsigned max_dw; /* Maximum number of dwords. */ + uint32_t *buf; /* The base pointer of the chunk. */ +}; + +struct radeon_info { + /* PCI info: domain:bus:dev:func */ + uint32_t pci_domain; + uint32_t pci_bus; + uint32_t pci_dev; + uint32_t pci_func; + + /* Device info. */ + uint32_t pci_id; + enum radeon_family family; + const char *name; + enum chip_class chip_class; + uint32_t gart_page_size; + uint64_t gart_size; + uint64_t vram_size; + bool has_dedicated_vram; + bool has_virtual_memory; + bool gfx_ib_pad_with_type2; + bool has_sdma; + bool has_uvd; + uint32_t vce_fw_version; + uint32_t vce_harvest_config; + uint32_t clock_crystal_freq; + + /* Kernel info. */ + uint32_t drm_major; /* version */ + uint32_t drm_minor; + uint32_t drm_patchlevel; + bool has_userptr; + + /* Shader cores. */ + uint32_t r600_max_quad_pipes; /* wave size / 16 */ + uint32_t max_shader_clock; + uint32_t num_good_compute_units; + uint32_t max_se; /* shader engines */ + uint32_t max_sh_per_se; /* shader arrays per shader engine */ + + /* Render backends (color + depth blocks). */ + uint32_t r300_num_gb_pipes; + uint32_t r300_num_z_pipes; + uint32_t r600_gb_backend_map; /* R600 harvest config */ + bool r600_gb_backend_map_valid; + uint32_t r600_num_banks; + uint32_t num_render_backends; + uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */ + uint32_t pipe_interleave_bytes; + uint32_t enabled_rb_mask; /* GCN harvest config */ + + /* Tile modes. 
*/ + uint32_t si_tile_mode_array[32]; + uint32_t cik_macrotile_mode_array[16]; +}; + +#define RADEON_SURF_MAX_LEVEL 32 + +#define RADEON_SURF_TYPE_MASK 0xFF +#define RADEON_SURF_TYPE_SHIFT 0 +#define RADEON_SURF_TYPE_1D 0 +#define RADEON_SURF_TYPE_2D 1 +#define RADEON_SURF_TYPE_3D 2 +#define RADEON_SURF_TYPE_CUBEMAP 3 +#define RADEON_SURF_TYPE_1D_ARRAY 4 +#define RADEON_SURF_TYPE_2D_ARRAY 5 +#define RADEON_SURF_MODE_MASK 0xFF +#define RADEON_SURF_MODE_SHIFT 8 +#define RADEON_SURF_MODE_LINEAR_ALIGNED 1 +#define RADEON_SURF_MODE_1D 2 +#define RADEON_SURF_MODE_2D 3 +#define RADEON_SURF_SCANOUT (1 << 16) +#define RADEON_SURF_ZBUFFER (1 << 17) +#define RADEON_SURF_SBUFFER (1 << 18) +#define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER) +#define RADEON_SURF_HAS_SBUFFER_MIPTREE (1 << 19) +#define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20) +#define RADEON_SURF_FMASK (1 << 21) +#define RADEON_SURF_DISABLE_DCC (1 << 22) + +#define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK) +#define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT) +#define RADEON_SURF_CLR(v, field) ((v) & ~(RADEON_SURF_ ## field ## _MASK << RADEON_SURF_ ## field ## _SHIFT)) + +struct radeon_surf_level { + uint64_t offset; + uint64_t slice_size; + uint32_t npix_x; + uint32_t npix_y; + uint32_t npix_z; + uint32_t nblk_x; + uint32_t nblk_y; + uint32_t nblk_z; + uint32_t pitch_bytes; + uint32_t mode; + uint64_t dcc_offset; + uint64_t dcc_fast_clear_size; + bool dcc_enabled; +}; + + +/* surface definitions from the winsys */ +struct radeon_surf { + /* These are inputs to the calculator. */ + uint32_t npix_x; + uint32_t npix_y; + uint32_t npix_z; + uint32_t blk_w; + uint32_t blk_h; + uint32_t blk_d; + uint32_t array_size; + uint32_t last_level; + uint32_t bpe; + uint32_t nsamples; + uint32_t flags; + + /* These are return values. Some of them can be set by the caller, but + * they will be treated as hints (e.g. bankw, bankh) and might be + * changed by the calculator. + */ + uint64_t bo_size; + uint64_t bo_alignment; + /* This applies to EG and later. */ + uint32_t bankw; + uint32_t bankh; + uint32_t mtilea; + uint32_t tile_split; + uint32_t stencil_tile_split; + uint64_t stencil_offset; + struct radeon_surf_level level[RADEON_SURF_MAX_LEVEL]; + struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL]; + uint32_t tiling_index[RADEON_SURF_MAX_LEVEL]; + uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL]; + uint32_t pipe_config; + uint32_t num_banks; + uint32_t macro_tile_index; + uint32_t micro_tile_mode; /* displayable, thin, depth, rotated */ + + /* Whether the depth miptree or stencil miptree as used by the DB are + * adjusted from their TC compatible form to ensure depth/stencil + * compatibility. If either is true, the corresponding plane cannot be + * sampled from. + */ + bool depth_adjusted; + bool stencil_adjusted; + + uint64_t dcc_size; + uint64_t dcc_alignment; +}; + +enum radeon_bo_layout { + RADEON_LAYOUT_LINEAR = 0, + RADEON_LAYOUT_TILED, + RADEON_LAYOUT_SQUARETILED, + + RADEON_LAYOUT_UNKNOWN +}; + +/* Tiling info for display code, DRI sharing, and other data. */ +struct radeon_bo_metadata { + /* Tiling flags describing the texture layout for display code + * and DRI sharing. 
+ */ + enum radeon_bo_layout microtile; + enum radeon_bo_layout macrotile; + unsigned pipe_config; + unsigned bankw; + unsigned bankh; + unsigned tile_split; + unsigned mtilea; + unsigned num_banks; + unsigned stride; + bool scanout; + + /* Additional metadata associated with the buffer, in bytes. + * The maximum size is 64 * 4. This is opaque for the winsys & kernel. + * Supported by amdgpu only. + */ + uint32_t size_metadata; + uint32_t metadata[64]; +}; + +struct radeon_winsys_bo; +struct radeon_winsys_fence; + +struct radeon_winsys { + void (*destroy)(struct radeon_winsys *ws); + + void (*query_info)(struct radeon_winsys *ws, + struct radeon_info *info); + + struct radeon_winsys_bo *(*buffer_create)(struct radeon_winsys *ws, + uint64_t size, + unsigned alignment, + enum radeon_bo_domain domain, + enum radeon_bo_flag flags); + + void (*buffer_destroy)(struct radeon_winsys_bo *bo); + void *(*buffer_map)(struct radeon_winsys_bo *bo); + + struct radeon_winsys_bo *(*buffer_from_fd)(struct radeon_winsys *ws, + int fd, + unsigned *stride, unsigned *offset); + + bool (*buffer_get_fd)(struct radeon_winsys *ws, + struct radeon_winsys_bo *bo, + int *fd); + + void (*buffer_unmap)(struct radeon_winsys_bo *bo); + + uint64_t (*buffer_get_va)(struct radeon_winsys_bo *bo); + + void (*buffer_set_metadata)(struct radeon_winsys_bo *bo, + struct radeon_bo_metadata *md); + struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws); + void (*ctx_destroy)(struct radeon_winsys_ctx *ctx); + + bool (*ctx_wait_idle)(struct radeon_winsys_ctx *ctx); + + struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws, + enum ring_type ring_type); + + void (*cs_destroy)(struct radeon_winsys_cs *cs); + + void (*cs_reset)(struct radeon_winsys_cs *cs); + + bool (*cs_finalize)(struct radeon_winsys_cs *cs); + + void (*cs_grow)(struct radeon_winsys_cs * cs, size_t min_size); + + int (*cs_submit)(struct radeon_winsys_ctx *ctx, + struct radeon_winsys_cs **cs_array, + unsigned cs_count, + bool can_patch, + struct radeon_winsys_fence *fence); + + void (*cs_add_buffer)(struct radeon_winsys_cs *cs, + struct radeon_winsys_bo *bo, + uint8_t priority); + + void (*cs_execute_secondary)(struct radeon_winsys_cs *parent, + struct radeon_winsys_cs *child); + + int (*surface_init)(struct radeon_winsys *ws, + struct radeon_surf *surf); + + int (*surface_best)(struct radeon_winsys *ws, + struct radeon_surf *surf); + + struct radeon_winsys_fence *(*create_fence)(); + void (*destroy_fence)(struct radeon_winsys_fence *fence); + bool (*fence_wait)(struct radeon_winsys *ws, + struct radeon_winsys_fence *fence, + bool absolute, + uint64_t timeout); +}; + +static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) +{ + cs->buf[cs->cdw++] = value; +} + +static inline void radeon_emit_array(struct radeon_winsys_cs *cs, + const uint32_t *values, unsigned count) +{ + memcpy(cs->buf + cs->cdw, values, count * 4); + cs->cdw += count; +} + diff --git a/src/amd/vulkan/radv_util.c b/src/amd/vulkan/radv_util.c new file mode 100644 index 00000000000..bf7abd49d1d --- /dev/null +++ b/src/amd/vulkan/radv_util.c @@ -0,0 +1,204 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit 
persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "radv_private.h" + +#include "util/u_math.h" + +/** Log an error message. */ +void radv_printflike(1, 2) + radv_loge(const char *format, ...) +{ + va_list va; + + va_start(va, format); + radv_loge_v(format, va); + va_end(va); +} + +/** \see radv_loge() */ +void +radv_loge_v(const char *format, va_list va) +{ + fprintf(stderr, "vk: error: "); + vfprintf(stderr, format, va); + fprintf(stderr, "\n"); +} + +void radv_printflike(3, 4) + __radv_finishme(const char *file, int line, const char *format, ...) +{ + va_list ap; + char buffer[256]; + + va_start(ap, format); + vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + fprintf(stderr, "%s:%d: FINISHME: %s\n", file, line, buffer); +} + +void radv_noreturn radv_printflike(1, 2) + radv_abortf(const char *format, ...) +{ + va_list va; + + va_start(va, format); + radv_abortfv(format, va); + va_end(va); +} + +void radv_noreturn +radv_abortfv(const char *format, va_list va) +{ + fprintf(stderr, "vk: error: "); + vfprintf(stderr, format, va); + fprintf(stderr, "\n"); + abort(); +} + +VkResult +__vk_errorf(VkResult error, const char *file, int line, const char *format, ...) 
+{ + va_list ap; + char buffer[256]; + +#define ERROR_CASE(error) case error: error_str = #error; break; + + const char *error_str; + switch ((int32_t)error) { + + /* Core errors */ + ERROR_CASE(VK_ERROR_OUT_OF_HOST_MEMORY) + ERROR_CASE(VK_ERROR_OUT_OF_DEVICE_MEMORY) + ERROR_CASE(VK_ERROR_INITIALIZATION_FAILED) + ERROR_CASE(VK_ERROR_DEVICE_LOST) + ERROR_CASE(VK_ERROR_MEMORY_MAP_FAILED) + ERROR_CASE(VK_ERROR_LAYER_NOT_PRESENT) + ERROR_CASE(VK_ERROR_EXTENSION_NOT_PRESENT) + ERROR_CASE(VK_ERROR_INCOMPATIBLE_DRIVER) + + /* Extension errors */ + ERROR_CASE(VK_ERROR_OUT_OF_DATE_KHR) + + default: + assert(!"Unknown error"); + error_str = "unknown error"; + } + +#undef ERROR_CASE + + if (format) { + va_start(ap, format); + vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + fprintf(stderr, "%s:%d: %s (%s)\n", file, line, buffer, error_str); + } else { + fprintf(stderr, "%s:%d: %s\n", file, line, error_str); + } + + return error; +} + +int +radv_vector_init(struct radv_vector *vector, uint32_t element_size, uint32_t size) +{ + assert(util_is_power_of_two(size)); + assert(element_size < size && util_is_power_of_two(element_size)); + + vector->head = 0; + vector->tail = 0; + vector->element_size = element_size; + vector->size = size; + vector->data = malloc(size); + + return vector->data != NULL; +} + +void * +radv_vector_add(struct radv_vector *vector) +{ + uint32_t offset, size, split, src_tail, dst_tail; + void *data; + + if (vector->head - vector->tail == vector->size) { + size = vector->size * 2; + data = malloc(size); + if (data == NULL) + return NULL; + src_tail = vector->tail & (vector->size - 1); + dst_tail = vector->tail & (size - 1); + if (src_tail == 0) { + /* Since we know that the vector is full, this means that it's + * linear from start to end so we can do one copy. + */ + memcpy(data + dst_tail, vector->data, vector->size); + } else { + /* In this case, the vector is split into two pieces and we have + * to do two copies. We have to be careful to make sure each + * piece goes to the right locations. Thanks to the change in + * size, it may or may not still wrap around. 
+ */ + split = align_u32(vector->tail, vector->size); + assert(vector->tail <= split && split < vector->head); + memcpy(data + dst_tail, vector->data + src_tail, + split - vector->tail); + memcpy(data + (split & (size - 1)), vector->data, + vector->head - split); + } + free(vector->data); + vector->data = data; + vector->size = size; + } + + assert(vector->head - vector->tail < vector->size); + + offset = vector->head & (vector->size - 1); + vector->head += vector->element_size; + + return vector->data + offset; +} + +void * +radv_vector_remove(struct radv_vector *vector) +{ + uint32_t offset; + + if (vector->head == vector->tail) + return NULL; + + assert(vector->head - vector->tail <= vector->size); + + offset = vector->tail & (vector->size - 1); + vector->tail += vector->element_size; + + return vector->data + offset; +} diff --git a/src/amd/vulkan/radv_util.h b/src/amd/vulkan/radv_util.h new file mode 100644 index 00000000000..57a4c95adb6 --- /dev/null +++ b/src/amd/vulkan/radv_util.h @@ -0,0 +1,9 @@ +#pragma once + +#ifdef HAVE___BUILTIN_POPCOUNT +#define util_bitcount(i) __builtin_popcount(i) +#else +extern unsigned int +util_bitcount(unsigned int n); +#endif + diff --git a/src/amd/vulkan/radv_wsi.c b/src/amd/vulkan/radv_wsi.c new file mode 100644 index 00000000000..5922d6ab8b7 --- /dev/null +++ b/src/amd/vulkan/radv_wsi.c @@ -0,0 +1,246 @@ +/* + * Copyright © 2016 Red Hat + * based on intel anv code: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "radv_wsi.h" + + +VkResult +radv_init_wsi(struct radv_physical_device *physical_device) +{ + VkResult result; + + memset(physical_device->wsi, 0, sizeof(physical_device->wsi)); + +#ifdef VK_USE_PLATFORM_XCB_KHR + result = radv_x11_init_wsi(physical_device); + if (result != VK_SUCCESS) + return result; +#endif + +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + result = radv_wl_init_wsi(physical_device); + if (result != VK_SUCCESS) { +#ifdef VK_USE_PLATFORM_XCB_KHR + radv_x11_finish_wsi(physical_device); +#endif + return result; + } +#endif + + return VK_SUCCESS; +} + +void +radv_finish_wsi(struct radv_physical_device *physical_device) +{ +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + radv_wl_finish_wsi(physical_device); +#endif +#ifdef VK_USE_PLATFORM_XCB_KHR + radv_x11_finish_wsi(physical_device); +#endif +} + +void radv_DestroySurfaceKHR( + VkInstance _instance, + VkSurfaceKHR _surface, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_instance, instance, _instance); + RADV_FROM_HANDLE(_VkIcdSurfaceBase, surface, _surface); + + radv_free2(&instance->alloc, pAllocator, surface); +} + +VkResult radv_GetPhysicalDeviceSurfaceSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + VkSurfaceKHR _surface, + VkBool32* pSupported) +{ + RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); + RADV_FROM_HANDLE(_VkIcdSurfaceBase, surface, _surface); + struct radv_wsi_interface *iface = device->wsi[surface->platform]; + + return iface->get_support(surface, device, queueFamilyIndex, pSupported); +} + +VkResult radv_GetPhysicalDeviceSurfaceCapabilitiesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR _surface, + VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) +{ + RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); + RADV_FROM_HANDLE(_VkIcdSurfaceBase, surface, _surface); + struct radv_wsi_interface *iface = device->wsi[surface->platform]; + + return iface->get_capabilities(surface, device, pSurfaceCapabilities); +} + +VkResult radv_GetPhysicalDeviceSurfaceFormatsKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR _surface, + uint32_t* pSurfaceFormatCount, + VkSurfaceFormatKHR* pSurfaceFormats) +{ + RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); + RADV_FROM_HANDLE(_VkIcdSurfaceBase, surface, _surface); + struct radv_wsi_interface *iface = device->wsi[surface->platform]; + + return iface->get_formats(surface, device, pSurfaceFormatCount, + pSurfaceFormats); +} + +VkResult radv_GetPhysicalDeviceSurfacePresentModesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR _surface, + uint32_t* pPresentModeCount, + VkPresentModeKHR* pPresentModes) +{ + RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); + RADV_FROM_HANDLE(_VkIcdSurfaceBase, surface, _surface); + struct radv_wsi_interface *iface = device->wsi[surface->platform]; + + return iface->get_present_modes(surface, device, pPresentModeCount, + pPresentModes); +} + +VkResult radv_CreateSwapchainKHR( + VkDevice _device, + const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSwapchainKHR* pSwapchain) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(_VkIcdSurfaceBase, surface, pCreateInfo->surface); + struct radv_wsi_interface *iface = + device->instance->physicalDevice.wsi[surface->platform]; + struct radv_swapchain *swapchain; + + VkResult result = iface->create_swapchain(surface, device, pCreateInfo, + pAllocator, &swapchain); + if (result != VK_SUCCESS) + return result; + + if (pAllocator) + 
swapchain->alloc = *pAllocator; + else + swapchain->alloc = device->alloc; + + for (unsigned i = 0; i < ARRAY_SIZE(swapchain->fences); i++) + swapchain->fences[i] = VK_NULL_HANDLE; + + *pSwapchain = radv_swapchain_to_handle(swapchain); + + return VK_SUCCESS; +} + +void radv_DestroySwapchainKHR( + VkDevice device, + VkSwapchainKHR _swapchain, + const VkAllocationCallbacks* pAllocator) +{ + RADV_FROM_HANDLE(radv_swapchain, swapchain, _swapchain); + + for (unsigned i = 0; i < ARRAY_SIZE(swapchain->fences); i++) { + if (swapchain->fences[i] != VK_NULL_HANDLE) + radv_DestroyFence(device, swapchain->fences[i], pAllocator); + } + + swapchain->destroy(swapchain, pAllocator); +} + +VkResult radv_GetSwapchainImagesKHR( + VkDevice device, + VkSwapchainKHR _swapchain, + uint32_t* pSwapchainImageCount, + VkImage* pSwapchainImages) +{ + RADV_FROM_HANDLE(radv_swapchain, swapchain, _swapchain); + + return swapchain->get_images(swapchain, pSwapchainImageCount, + pSwapchainImages); +} + +VkResult radv_AcquireNextImageKHR( + VkDevice device, + VkSwapchainKHR _swapchain, + uint64_t timeout, + VkSemaphore semaphore, + VkFence fence, + uint32_t* pImageIndex) +{ + RADV_FROM_HANDLE(radv_swapchain, swapchain, _swapchain); + + return swapchain->acquire_next_image(swapchain, timeout, semaphore, + pImageIndex); +} + +VkResult radv_QueuePresentKHR( + VkQueue _queue, + const VkPresentInfoKHR* pPresentInfo) +{ + RADV_FROM_HANDLE(radv_queue, queue, _queue); + VkResult result = VK_SUCCESS; + + for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) { + RADV_FROM_HANDLE(radv_swapchain, swapchain, pPresentInfo->pSwapchains[i]); + + assert(swapchain->device == queue->device); + if (swapchain->fences[0] == VK_NULL_HANDLE) { + result = radv_CreateFence(radv_device_to_handle(queue->device), + &(VkFenceCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = 0, + }, &swapchain->alloc, &swapchain->fences[0]); + if (result != VK_SUCCESS) + return result; + } else { + radv_ResetFences(radv_device_to_handle(queue->device), + 1, &swapchain->fences[0]); + } + + radv_QueueSubmit(_queue, 0, NULL, swapchain->fences[0]); + + result = swapchain->queue_present(swapchain, queue, + pPresentInfo->pImageIndices[i]); + /* TODO: What if one of them returns OUT_OF_DATE? */ + if (result != VK_SUCCESS) + return result; + + VkFence last = swapchain->fences[2]; + swapchain->fences[2] = swapchain->fences[1]; + swapchain->fences[1] = swapchain->fences[0]; + swapchain->fences[0] = last; + + if (last != VK_NULL_HANDLE) { + radv_WaitForFences(radv_device_to_handle(queue->device), + 1, &last, true, 1); + } + + } + + return VK_SUCCESS; +} diff --git a/src/amd/vulkan/radv_wsi.h b/src/amd/vulkan/radv_wsi.h new file mode 100644 index 00000000000..3e453fbe773 --- /dev/null +++ b/src/amd/vulkan/radv_wsi.h @@ -0,0 +1,79 @@ +/* + * Copyright © 2016 Red Hat + * based on intel anv code: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "radv_private.h" + +struct radv_swapchain; +struct radv_wsi_interface { + VkResult (*get_support)(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + uint32_t queueFamilyIndex, + VkBool32* pSupported); + VkResult (*get_capabilities)(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + VkSurfaceCapabilitiesKHR* pSurfaceCapabilities); + VkResult (*get_formats)(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + uint32_t* pSurfaceFormatCount, + VkSurfaceFormatKHR* pSurfaceFormats); + VkResult (*get_present_modes)(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + uint32_t* pPresentModeCount, + VkPresentModeKHR* pPresentModes); + VkResult (*create_swapchain)(VkIcdSurfaceBase *surface, + struct radv_device *device, + const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + struct radv_swapchain **swapchain); +}; + +struct radv_swapchain { + struct radv_device *device; + + VkAllocationCallbacks alloc; + + VkFence fences[3]; + + VkResult (*destroy)(struct radv_swapchain *swapchain, + const VkAllocationCallbacks *pAllocator); + VkResult (*get_images)(struct radv_swapchain *swapchain, + uint32_t *pCount, VkImage *pSwapchainImages); + VkResult (*acquire_next_image)(struct radv_swapchain *swap_chain, + uint64_t timeout, VkSemaphore semaphore, + uint32_t *image_index); + VkResult (*queue_present)(struct radv_swapchain *swap_chain, + struct radv_queue *queue, + uint32_t image_index); +}; + +RADV_DEFINE_NONDISP_HANDLE_CASTS(_VkIcdSurfaceBase, VkSurfaceKHR) +RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_swapchain, VkSwapchainKHR) + +VkResult radv_x11_init_wsi(struct radv_physical_device *physical_device); +void radv_x11_finish_wsi(struct radv_physical_device *physical_device); +VkResult radv_wl_init_wsi(struct radv_physical_device *physical_device); +void radv_wl_finish_wsi(struct radv_physical_device *physical_device); diff --git a/src/amd/vulkan/radv_wsi_wayland.c b/src/amd/vulkan/radv_wsi_wayland.c new file mode 100644 index 00000000000..2b4a3d32a45 --- /dev/null +++ b/src/amd/vulkan/radv_wsi_wayland.c @@ -0,0 +1,880 @@ +/* + * Copyright © 2016 Red Hat + * based on intel anv code: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "radv_wsi.h" + +#include "vk_format.h" +#include + +#define MIN_NUM_IMAGES 2 + +struct wsi_wl_display { + struct radv_physical_device *physical_device; + struct wl_display * display; + struct wl_drm * drm; + + /* Vector of VkFormats supported */ + struct radv_vector formats; + + uint32_t capabilities; +}; + +struct wsi_wayland { + struct radv_wsi_interface base; + + struct radv_physical_device * physical_device; + + pthread_mutex_t mutex; + /* Hash table of wl_display -> wsi_wl_display mappings */ + struct hash_table * displays; +}; + +static void +wsi_wl_display_add_vk_format(struct wsi_wl_display *display, VkFormat format) +{ + /* Don't add a format that's already in the list */ + VkFormat *f; + radv_vector_foreach(f, &display->formats) + if (*f == format) + return; + + /* Don't add formats that aren't renderable. */ + VkFormatProperties props; + radv_GetPhysicalDeviceFormatProperties( + radv_physical_device_to_handle(display->physical_device), format, &props); + if (!(props.optimalTilingFeatures & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) + return; + + f = radv_vector_add(&display->formats); + if (f) + *f = format; +} + +static void +drm_handle_device(void *data, struct wl_drm *drm, const char *name) +{ + fprintf(stderr, "wl_drm.device(%s)\n", name); +} + +static uint32_t +wl_drm_format_for_vk_format(VkFormat vk_format, bool alpha) +{ + switch (vk_format) { + /* TODO: Figure out what all the formats mean and make this table + * correct. + */ +#if 0 + case VK_FORMAT_R4G4B4A4_UNORM: + return alpha ? WL_DRM_FORMAT_ABGR4444 : WL_DRM_FORMAT_XBGR4444; + case VK_FORMAT_R5G6B5_UNORM: + return WL_DRM_FORMAT_BGR565; + case VK_FORMAT_R5G5B5A1_UNORM: + return alpha ? WL_DRM_FORMAT_ABGR1555 : WL_DRM_FORMAT_XBGR1555; + case VK_FORMAT_R8G8B8_UNORM: + return WL_DRM_FORMAT_XBGR8888; + case VK_FORMAT_R8G8B8A8_UNORM: + return alpha ? WL_DRM_FORMAT_ABGR8888 : WL_DRM_FORMAT_XBGR8888; + case VK_FORMAT_R10G10B10A2_UNORM: + return alpha ? WL_DRM_FORMAT_ABGR2101010 : WL_DRM_FORMAT_XBGR2101010; + case VK_FORMAT_B4G4R4A4_UNORM: + return alpha ? WL_DRM_FORMAT_ARGB4444 : WL_DRM_FORMAT_XRGB4444; + case VK_FORMAT_B5G6R5_UNORM: + return WL_DRM_FORMAT_RGB565; + case VK_FORMAT_B5G5R5A1_UNORM: + return alpha ? WL_DRM_FORMAT_XRGB1555 : WL_DRM_FORMAT_XRGB1555; +#endif + case VK_FORMAT_B8G8R8_SRGB: + return WL_DRM_FORMAT_BGRX8888; + case VK_FORMAT_B8G8R8A8_SRGB: + return alpha ? WL_DRM_FORMAT_ARGB8888 : WL_DRM_FORMAT_XRGB8888; +#if 0 + case VK_FORMAT_B10G10R10A2_UNORM: + return alpha ? 
WL_DRM_FORMAT_ARGB2101010 : WL_DRM_FORMAT_XRGB2101010; +#endif + + default: + assert(!"Unsupported Vulkan format"); + return 0; + } +} + +static void +drm_handle_format(void *data, struct wl_drm *drm, uint32_t wl_format) +{ + struct wsi_wl_display *display = data; + + switch (wl_format) { +#if 0 + case WL_DRM_FORMAT_ABGR4444: + case WL_DRM_FORMAT_XBGR4444: + wsi_wl_display_add_vk_format(display, VK_FORMAT_R4G4B4A4_UNORM); + break; + case WL_DRM_FORMAT_BGR565: + wsi_wl_display_add_vk_format(display, VK_FORMAT_R5G6B5_UNORM); + break; + case WL_DRM_FORMAT_ABGR1555: + case WL_DRM_FORMAT_XBGR1555: + wsi_wl_display_add_vk_format(display, VK_FORMAT_R5G5B5A1_UNORM); + break; + case WL_DRM_FORMAT_XBGR8888: + wsi_wl_display_add_vk_format(display, VK_FORMAT_R8G8B8_UNORM); + /* fallthrough */ + case WL_DRM_FORMAT_ABGR8888: + wsi_wl_display_add_vk_format(display, VK_FORMAT_R8G8B8A8_UNORM); + break; + case WL_DRM_FORMAT_ABGR2101010: + case WL_DRM_FORMAT_XBGR2101010: + wsi_wl_display_add_vk_format(display, VK_FORMAT_R10G10B10A2_UNORM); + break; + case WL_DRM_FORMAT_ARGB4444: + case WL_DRM_FORMAT_XRGB4444: + wsi_wl_display_add_vk_format(display, VK_FORMAT_B4G4R4A4_UNORM); + break; + case WL_DRM_FORMAT_RGB565: + wsi_wl_display_add_vk_format(display, VK_FORMAT_B5G6R5_UNORM); + break; + case WL_DRM_FORMAT_ARGB1555: + case WL_DRM_FORMAT_XRGB1555: + wsi_wl_display_add_vk_format(display, VK_FORMAT_B5G5R5A1_UNORM); + break; +#endif + case WL_DRM_FORMAT_XRGB8888: + wsi_wl_display_add_vk_format(display, VK_FORMAT_B8G8R8_SRGB); + /* fallthrough */ + case WL_DRM_FORMAT_ARGB8888: + wsi_wl_display_add_vk_format(display, VK_FORMAT_B8G8R8A8_SRGB); + break; +#if 0 + case WL_DRM_FORMAT_ARGB2101010: + case WL_DRM_FORMAT_XRGB2101010: + wsi_wl_display_add_vk_format(display, VK_FORMAT_B10G10R10A2_UNORM); + break; +#endif + } +} + +static void +drm_handle_authenticated(void *data, struct wl_drm *drm) +{ +} + +static void +drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t capabilities) +{ + struct wsi_wl_display *display = data; + + display->capabilities = capabilities; +} + +static const struct wl_drm_listener drm_listener = { + drm_handle_device, + drm_handle_format, + drm_handle_authenticated, + drm_handle_capabilities, +}; + +static void +registry_handle_global(void *data, struct wl_registry *registry, + uint32_t name, const char *interface, uint32_t version) +{ + struct wsi_wl_display *display = data; + + if (strcmp(interface, "wl_drm") == 0) { + assert(display->drm == NULL); + + assert(version >= 2); + display->drm = wl_registry_bind(registry, name, &wl_drm_interface, 2); + + if (display->drm) + wl_drm_add_listener(display->drm, &drm_listener, display); + } +} + +static void +registry_handle_global_remove(void *data, struct wl_registry *registry, + uint32_t name) +{ /* No-op */ } + +static const struct wl_registry_listener registry_listener = { + registry_handle_global, + registry_handle_global_remove +}; + +static void +wsi_wl_display_destroy(struct wsi_wayland *wsi, struct wsi_wl_display *display) +{ + radv_vector_finish(&display->formats); + if (display->drm) + wl_drm_destroy(display->drm); + radv_free(&wsi->physical_device->instance->alloc, display); +} + +static struct wsi_wl_display * +wsi_wl_display_create(struct wsi_wayland *wsi, struct wl_display *wl_display) +{ + struct wsi_wl_display *display = + radv_alloc(&wsi->physical_device->instance->alloc, sizeof(*display), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!display) + return NULL; + + memset(display, 0, sizeof(*display)); + + 
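+ /* The registry handshake below needs two round trips: the first
+  * wl_display_roundtrip() binds the wl_drm global, the second collects
+  * the formats and capabilities it advertises before the PRIME check.
+  */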
display->display = wl_display; + display->physical_device = wsi->physical_device; + + if (!radv_vector_init(&display->formats, sizeof(VkFormat), 8)) + goto fail; + + struct wl_registry *registry = wl_display_get_registry(wl_display); + if (!registry) + return NULL; + + wl_registry_add_listener(registry, ®istry_listener, display); + + /* Round-rip to get the wl_drm global */ + wl_display_roundtrip(wl_display); + + if (!display->drm) + goto fail; + + /* Round-rip to get wl_drm formats and capabilities */ + wl_display_roundtrip(wl_display); + + /* We need prime support */ + if (!(display->capabilities & WL_DRM_CAPABILITY_PRIME)) + goto fail; + + /* We don't need this anymore */ + wl_registry_destroy(registry); + + return display; + +fail: + if (registry) + wl_registry_destroy(registry); + + wsi_wl_display_destroy(wsi, display); + return NULL; +} + +static struct wsi_wl_display * +wsi_wl_get_display(struct radv_physical_device *device, + struct wl_display *wl_display) +{ + struct wsi_wayland *wsi = + (struct wsi_wayland *)device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND]; + + pthread_mutex_lock(&wsi->mutex); + + struct hash_entry *entry = _mesa_hash_table_search(wsi->displays, + wl_display); + if (!entry) { + /* We're about to make a bunch of blocking calls. Let's drop the + * mutex for now so we don't block up too badly. + */ + pthread_mutex_unlock(&wsi->mutex); + + struct wsi_wl_display *display = wsi_wl_display_create(wsi, wl_display); + + pthread_mutex_lock(&wsi->mutex); + + entry = _mesa_hash_table_search(wsi->displays, wl_display); + if (entry) { + /* Oops, someone raced us to it */ + wsi_wl_display_destroy(wsi, display); + } else { + entry = _mesa_hash_table_insert(wsi->displays, wl_display, display); + } + } + + pthread_mutex_unlock(&wsi->mutex); + + return entry->data; +} + +VkBool32 radv_GetPhysicalDeviceWaylandPresentationSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + struct wl_display* display) +{ + RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); + + return wsi_wl_get_display(physical_device, display) != NULL; +} + +static VkResult +wsi_wl_surface_get_support(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + uint32_t queueFamilyIndex, + VkBool32* pSupported) +{ + *pSupported = true; + + return VK_SUCCESS; +} + +static const VkPresentModeKHR present_modes[] = { + VK_PRESENT_MODE_MAILBOX_KHR, + VK_PRESENT_MODE_FIFO_KHR, +}; + +static VkResult +wsi_wl_surface_get_capabilities(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + VkSurfaceCapabilitiesKHR* caps) +{ + caps->minImageCount = MIN_NUM_IMAGES; + caps->maxImageCount = 4; + caps->currentExtent = (VkExtent2D) { -1, -1 }; + caps->minImageExtent = (VkExtent2D) { 1, 1 }; + caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX }; + caps->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; + caps->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; + caps->maxImageArrayLayers = 1; + + caps->supportedCompositeAlpha = + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR | + VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR; + + caps->supportedUsageFlags = + VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + return VK_SUCCESS; +} + +static VkResult +wsi_wl_surface_get_formats(VkIcdSurfaceBase *icd_surface, + struct radv_physical_device *device, + uint32_t* pSurfaceFormatCount, + VkSurfaceFormatKHR* pSurfaceFormats) +{ + VkIcdSurfaceWayland *surface = (VkIcdSurfaceWayland 
*)icd_surface; + struct wsi_wl_display *display = + wsi_wl_get_display(device, surface->display); + + uint32_t count = radv_vector_length(&display->formats); + + if (pSurfaceFormats == NULL) { + *pSurfaceFormatCount = count; + return VK_SUCCESS; + } + + assert(*pSurfaceFormatCount >= count); + *pSurfaceFormatCount = count; + + VkFormat *f; + radv_vector_foreach(f, &display->formats) { + *(pSurfaceFormats++) = (VkSurfaceFormatKHR) { + .format = *f, + /* TODO: We should get this from the compositor somehow */ + .colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR, + }; + } + + return VK_SUCCESS; +} + +static VkResult +wsi_wl_surface_get_present_modes(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + uint32_t* pPresentModeCount, + VkPresentModeKHR* pPresentModes) +{ + if (pPresentModes == NULL) { + *pPresentModeCount = ARRAY_SIZE(present_modes); + return VK_SUCCESS; + } + + assert(*pPresentModeCount >= ARRAY_SIZE(present_modes)); + typed_memcpy(pPresentModes, present_modes, *pPresentModeCount); + *pPresentModeCount = ARRAY_SIZE(present_modes); + + return VK_SUCCESS; +} + +static VkResult +wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *surface, + struct radv_device *device, + const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + struct radv_swapchain **swapchain); + +VkResult radv_CreateWaylandSurfaceKHR( + VkInstance _instance, + const VkWaylandSurfaceCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface) +{ + RADV_FROM_HANDLE(radv_instance, instance, _instance); + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR); + + VkIcdSurfaceWayland *surface; + + surface = radv_alloc2(&instance->alloc, pAllocator, sizeof *surface, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (surface == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + surface->base.platform = VK_ICD_WSI_PLATFORM_WAYLAND; + surface->display = pCreateInfo->display; + surface->surface = pCreateInfo->surface; + + *pSurface = _VkIcdSurfaceBase_to_handle(&surface->base); + + return VK_SUCCESS; +} + +struct wsi_wl_image { + struct radv_image * image; + struct radv_device_memory * memory; + struct wl_buffer * buffer; + bool busy; +}; + +struct wsi_wl_swapchain { + struct radv_swapchain base; + + struct wsi_wl_display * display; + struct wl_event_queue * queue; + struct wl_surface * surface; + + VkExtent2D extent; + VkFormat vk_format; + uint32_t drm_format; + + VkPresentModeKHR present_mode; + bool fifo_ready; + + uint32_t image_count; + struct wsi_wl_image images[0]; +}; + +static VkResult +wsi_wl_swapchain_get_images(struct radv_swapchain *radv_chain, + uint32_t *pCount, VkImage *pSwapchainImages) +{ + struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)radv_chain; + + if (pSwapchainImages == NULL) { + *pCount = chain->image_count; + return VK_SUCCESS; + } + + assert(chain->image_count <= *pCount); + for (uint32_t i = 0; i < chain->image_count; i++) + pSwapchainImages[i] = radv_image_to_handle(chain->images[i].image); + + *pCount = chain->image_count; + + return VK_SUCCESS; +} + +static VkResult +wsi_wl_swapchain_acquire_next_image(struct radv_swapchain *radv_chain, + uint64_t timeout, + VkSemaphore semaphore, + uint32_t *image_index) +{ + struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)radv_chain; + + int ret = wl_display_dispatch_queue_pending(chain->display->display, + chain->queue); + /* XXX: I'm not sure if out-of-date is the right error here. 
If + * wl_display_dispatch_queue_pending fails it most likely means we got + * kicked by the server so this seems more-or-less correct. + */ + if (ret < 0) + return vk_error(VK_ERROR_OUT_OF_DATE_KHR); + + while (1) { + for (uint32_t i = 0; i < chain->image_count; i++) { + if (!chain->images[i].busy) { + /* We found a non-busy image */ + *image_index = i; + return VK_SUCCESS; + } + } + + /* This time we do a blocking dispatch because we can't go + * anywhere until we get an event. + */ + int ret = wl_display_roundtrip_queue(chain->display->display, + chain->queue); + if (ret < 0) + return vk_error(VK_ERROR_OUT_OF_DATE_KHR); + } +} + +static void +frame_handle_done(void *data, struct wl_callback *callback, uint32_t serial) +{ + struct wsi_wl_swapchain *chain = data; + + chain->fifo_ready = true; + + wl_callback_destroy(callback); +} + +static const struct wl_callback_listener frame_listener = { + frame_handle_done, +}; + +static VkResult +wsi_wl_swapchain_queue_present(struct radv_swapchain *radv_chain, + struct radv_queue *queue, + uint32_t image_index) +{ + struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)radv_chain; + + if (chain->present_mode == VK_PRESENT_MODE_FIFO_KHR) { + while (!chain->fifo_ready) { + int ret = wl_display_dispatch_queue(chain->display->display, + chain->queue); + if (ret < 0) + return vk_error(VK_ERROR_OUT_OF_DATE_KHR); + } + } + + assert(image_index < chain->image_count); + wl_surface_attach(chain->surface, chain->images[image_index].buffer, 0, 0); + wl_surface_damage(chain->surface, 0, 0, INT32_MAX, INT32_MAX); + + if (chain->present_mode == VK_PRESENT_MODE_FIFO_KHR) { + struct wl_callback *frame = wl_surface_frame(chain->surface); + wl_proxy_set_queue((struct wl_proxy *)frame, chain->queue); + wl_callback_add_listener(frame, &frame_listener, chain); + chain->fifo_ready = false; + } + + chain->images[image_index].busy = true; + wl_surface_commit(chain->surface); + wl_display_flush(chain->display->display); + + return VK_SUCCESS; +} + +static void +wsi_wl_image_finish(struct wsi_wl_swapchain *chain, struct wsi_wl_image *image, + const VkAllocationCallbacks* pAllocator) +{ + VkDevice vk_device = radv_device_to_handle(chain->base.device); + radv_FreeMemory(vk_device, radv_device_memory_to_handle(image->memory), + pAllocator); + radv_DestroyImage(vk_device, radv_image_to_handle(image->image), + pAllocator); +} + +static void +buffer_handle_release(void *data, struct wl_buffer *buffer) +{ + struct wsi_wl_image *image = data; + + assert(image->buffer == buffer); + + image->busy = false; +} + +static const struct wl_buffer_listener buffer_listener = { + buffer_handle_release, +}; + +static VkResult +wsi_wl_image_init(struct wsi_wl_swapchain *chain, + struct wsi_wl_image *image, + const VkSwapchainCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks* pAllocator) +{ + VkDevice vk_device = radv_device_to_handle(chain->base.device); + VkResult result; + bool bret; + VkImage vk_image; + struct radeon_surf *surface; + int fd; + result = radv_image_create(vk_device, + &(struct radv_image_create_info) { + .vk_info = + &(VkImageCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = VK_IMAGE_TYPE_2D, + .format = chain->vk_format, + .extent = { + .width = chain->extent.width, + .height = chain->extent.height, + .depth = 1 + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = 1, + /* FIXME: Need a way to use X tiling to allow scanout */ + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + 
pCreateInfo->imageUsage), + .flags = 0, + }, + .scanout = true}, + pAllocator, + &vk_image); + + if (result != VK_SUCCESS) + return result; + + image->image = radv_image_from_handle(vk_image); + assert(vk_format_is_color(image->image->vk_format)); + + VkDeviceMemory vk_memory; + result = radv_AllocateMemory(vk_device, + &(VkMemoryAllocateInfo) { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = image->image->size, + .memoryTypeIndex = 0, + }, + pAllocator, + &vk_memory); + + if (result != VK_SUCCESS) + goto fail_image; + + image->memory = radv_device_memory_from_handle(vk_memory); + + result = radv_BindImageMemory(vk_device, vk_image, vk_memory, 0); + + if (result != VK_SUCCESS) + goto fail_mem; + + bret = chain->base.device->ws->buffer_get_fd(chain->base.device->ws, + image->memory->bo, &fd); + if (bret == false) + goto fail_mem; + + { + struct radeon_bo_metadata metadata; + radv_init_metadata(chain->base.device, image->image, &metadata); + chain->base.device->ws->buffer_set_metadata(image->memory->bo, &metadata); + } + surface = &image->image->surface; + + image->buffer = wl_drm_create_prime_buffer(chain->display->drm, + fd, /* name */ + chain->extent.width, + chain->extent.height, + chain->drm_format, + surface->level[0].offset, + surface->level[0].pitch_bytes, + 0, 0, 0, 0 /* unused */); + wl_display_roundtrip(chain->display->display); + close(fd); + + wl_proxy_set_queue((struct wl_proxy *)image->buffer, chain->queue); + wl_buffer_add_listener(image->buffer, &buffer_listener, image); + + return VK_SUCCESS; + +fail_mem: + radv_FreeMemory(vk_device, vk_memory, pAllocator); +fail_image: + radv_DestroyImage(vk_device, vk_image, pAllocator); + + return result; +} + +static VkResult +wsi_wl_swapchain_destroy(struct radv_swapchain *radv_chain, + const VkAllocationCallbacks *pAllocator) +{ + struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)radv_chain; + + for (uint32_t i = 0; i < chain->image_count; i++) { + if (chain->images[i].buffer) + wsi_wl_image_finish(chain, &chain->images[i], pAllocator); + } + + radv_free2(&chain->base.device->alloc, pAllocator, chain); + + return VK_SUCCESS; +} + +static VkResult +wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, + struct radv_device *device, + const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + struct radv_swapchain **swapchain_out) +{ + VkIcdSurfaceWayland *surface = (VkIcdSurfaceWayland *)icd_surface; + struct wsi_wl_swapchain *chain; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR); + + int num_images = pCreateInfo->minImageCount; + + assert(num_images >= MIN_NUM_IMAGES); + + /* For true mailbox mode, we need at least 4 images: + * 1) One to scan out from + * 2) One to have queued for scan-out + * 3) One to be currently held by the Wayland compositor + * 4) One to render to + */ + if (pCreateInfo->presentMode == VK_PRESENT_MODE_MAILBOX_KHR) + num_images = MAX2(num_images, 4); + + size_t size = sizeof(*chain) + num_images * sizeof(chain->images[0]); + chain = radv_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (chain == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + chain->base.device = device; + chain->base.destroy = wsi_wl_swapchain_destroy; + chain->base.get_images = wsi_wl_swapchain_get_images; + chain->base.acquire_next_image = wsi_wl_swapchain_acquire_next_image; + chain->base.queue_present = wsi_wl_swapchain_queue_present; + + chain->surface = surface->surface; 
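+ /* The swapchain always uses the opaque (alpha = false) wl_drm format;
+  * pCreateInfo->compositeAlpha is not consulted here.
+  */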
+ chain->extent = pCreateInfo->imageExtent; + chain->vk_format = pCreateInfo->imageFormat; + chain->drm_format = wl_drm_format_for_vk_format(chain->vk_format, false); + + chain->present_mode = pCreateInfo->presentMode; + chain->fifo_ready = true; + + chain->image_count = num_images; + + /* Mark a bunch of stuff as NULL. This way we can just call + * destroy_swapchain for cleanup. + */ + for (uint32_t i = 0; i < chain->image_count; i++) + chain->images[i].buffer = NULL; + chain->queue = NULL; + + chain->display = wsi_wl_get_display(&device->instance->physicalDevice, + surface->display); + if (!chain->display) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail; + } + + chain->queue = wl_display_create_queue(chain->display->display); + if (!chain->queue) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail; + } + + for (uint32_t i = 0; i < chain->image_count; i++) { + result = wsi_wl_image_init(chain, &chain->images[i], + pCreateInfo, pAllocator); + if (result != VK_SUCCESS) + goto fail; + chain->images[i].busy = false; + } + + *swapchain_out = &chain->base; + + return VK_SUCCESS; + +fail: + wsi_wl_swapchain_destroy(&chain->base, pAllocator); + + return result; +} + +VkResult +radv_wl_init_wsi(struct radv_physical_device *device) +{ + struct wsi_wayland *wsi; + VkResult result; + + wsi = radv_alloc(&device->instance->alloc, sizeof(*wsi), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!wsi) { + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + wsi->physical_device = device; + + int ret = pthread_mutex_init(&wsi->mutex, NULL); + if (ret != 0) { + if (ret == ENOMEM) { + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } else { + /* FINISHME: Choose a better error. */ + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + + goto fail_alloc; + } + + wsi->displays = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + if (!wsi->displays) { + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_mutex; + } + + wsi->base.get_support = wsi_wl_surface_get_support; + wsi->base.get_capabilities = wsi_wl_surface_get_capabilities; + wsi->base.get_formats = wsi_wl_surface_get_formats; + wsi->base.get_present_modes = wsi_wl_surface_get_present_modes; + wsi->base.create_swapchain = wsi_wl_surface_create_swapchain; + + device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND] = &wsi->base; + + return VK_SUCCESS; + +fail_mutex: + pthread_mutex_destroy(&wsi->mutex); + +fail_alloc: + radv_free(&device->instance->alloc, wsi); +fail: + device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND] = NULL; + + return result; +} + +void +radv_wl_finish_wsi(struct radv_physical_device *device) +{ + struct wsi_wayland *wsi = + (struct wsi_wayland *)device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND]; + + if (wsi) { + _mesa_hash_table_destroy(wsi->displays, NULL); + + pthread_mutex_destroy(&wsi->mutex); + + radv_free(&device->instance->alloc, wsi); + } +} diff --git a/src/amd/vulkan/radv_wsi_x11.c b/src/amd/vulkan/radv_wsi_x11.c new file mode 100644 index 00000000000..0aae2a3530d --- /dev/null +++ b/src/amd/vulkan/radv_wsi_x11.c @@ -0,0 +1,963 @@ +/* + * Copyright © 2016 Red Hat. 
+ * Copyright © 2016 Bas Nieuwenhuizen + * + * based mostly on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "radv_wsi.h" +#include "vk_format.h" +#include "util/hash_table.h" + +struct wsi_x11_connection { + bool has_dri3; + bool has_present; +}; + +struct wsi_x11 { + struct radv_wsi_interface base; + + pthread_mutex_t mutex; + /* Hash table of xcb_connection -> wsi_x11_connection mappings */ + struct hash_table *connections; +}; + +static struct wsi_x11_connection * +wsi_x11_connection_create(struct radv_physical_device *device, + xcb_connection_t *conn) +{ + xcb_query_extension_cookie_t dri3_cookie, pres_cookie; + xcb_query_extension_reply_t *dri3_reply, *pres_reply; + + struct wsi_x11_connection *wsi_conn = + radv_alloc(&device->instance->alloc, sizeof(*wsi_conn), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!wsi_conn) + return NULL; + + dri3_cookie = xcb_query_extension(conn, 4, "DRI3"); + pres_cookie = xcb_query_extension(conn, 7, "PRESENT"); + + dri3_reply = xcb_query_extension_reply(conn, dri3_cookie, NULL); + pres_reply = xcb_query_extension_reply(conn, pres_cookie, NULL); + if (dri3_reply == NULL || pres_reply == NULL) { + free(dri3_reply); + free(pres_reply); + radv_free(&device->instance->alloc, wsi_conn); + return NULL; + } + + wsi_conn->has_dri3 = dri3_reply->present != 0; + wsi_conn->has_present = pres_reply->present != 0; + + free(dri3_reply); + free(pres_reply); + + return wsi_conn; +} + +static void +wsi_x11_connection_destroy(struct radv_physical_device *device, + struct wsi_x11_connection *conn) +{ + radv_free(&device->instance->alloc, conn); +} + +static struct wsi_x11_connection * +wsi_x11_get_connection(struct radv_physical_device *device, + xcb_connection_t *conn) +{ + struct wsi_x11 *wsi = + (struct wsi_x11 *)device->wsi[VK_ICD_WSI_PLATFORM_XCB]; + + pthread_mutex_lock(&wsi->mutex); + + struct hash_entry *entry = _mesa_hash_table_search(wsi->connections, conn); + if (!entry) { + /* We're about to make a bunch of blocking calls. Let's drop the + * mutex for now so we don't block up too badly. 
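+ * The table is searched again once the mutex is re-taken; if another
+ * thread created the connection in the meantime, ours is destroyed and
+ * the existing entry is used.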
+ */ + pthread_mutex_unlock(&wsi->mutex); + + struct wsi_x11_connection *wsi_conn = + wsi_x11_connection_create(device, conn); + + pthread_mutex_lock(&wsi->mutex); + + entry = _mesa_hash_table_search(wsi->connections, conn); + if (entry) { + /* Oops, someone raced us to it */ + wsi_x11_connection_destroy(device, wsi_conn); + } else { + entry = _mesa_hash_table_insert(wsi->connections, conn, wsi_conn); + } + } + + pthread_mutex_unlock(&wsi->mutex); + + return entry->data; +} + +static const VkSurfaceFormatKHR formats[] = { + { .format = VK_FORMAT_B8G8R8A8_UNORM, }, + { .format = VK_FORMAT_B8G8R8A8_SRGB, }, +}; + +static const VkPresentModeKHR present_modes[] = { + VK_PRESENT_MODE_MAILBOX_KHR, +}; + +static xcb_screen_t * +get_screen_for_root(xcb_connection_t *conn, xcb_window_t root) +{ + xcb_screen_iterator_t screen_iter = + xcb_setup_roots_iterator(xcb_get_setup(conn)); + + for (; screen_iter.rem; xcb_screen_next (&screen_iter)) { + if (screen_iter.data->root == root) + return screen_iter.data; + } + + return NULL; +} + +static xcb_visualtype_t * +screen_get_visualtype(xcb_screen_t *screen, xcb_visualid_t visual_id, + unsigned *depth) +{ + xcb_depth_iterator_t depth_iter = + xcb_screen_allowed_depths_iterator(screen); + + for (; depth_iter.rem; xcb_depth_next (&depth_iter)) { + xcb_visualtype_iterator_t visual_iter = + xcb_depth_visuals_iterator (depth_iter.data); + + for (; visual_iter.rem; xcb_visualtype_next (&visual_iter)) { + if (visual_iter.data->visual_id == visual_id) { + if (depth) + *depth = depth_iter.data->depth; + return visual_iter.data; + } + } + } + + return NULL; +} + +static xcb_visualtype_t * +connection_get_visualtype(xcb_connection_t *conn, xcb_visualid_t visual_id, + unsigned *depth) +{ + xcb_screen_iterator_t screen_iter = + xcb_setup_roots_iterator(xcb_get_setup(conn)); + + /* For this we have to iterate over all of the screens which is rather + * annoying. Fortunately, there is probably only 1. + */ + for (; screen_iter.rem; xcb_screen_next (&screen_iter)) { + xcb_visualtype_t *visual = screen_get_visualtype(screen_iter.data, + visual_id, depth); + if (visual) + return visual; + } + + return NULL; +} + +static xcb_visualtype_t * +get_visualtype_for_window(xcb_connection_t *conn, xcb_window_t window, + unsigned *depth) +{ + xcb_query_tree_cookie_t tree_cookie; + xcb_get_window_attributes_cookie_t attrib_cookie; + xcb_query_tree_reply_t *tree; + xcb_get_window_attributes_reply_t *attrib; + + tree_cookie = xcb_query_tree(conn, window); + attrib_cookie = xcb_get_window_attributes(conn, window); + + tree = xcb_query_tree_reply(conn, tree_cookie, NULL); + attrib = xcb_get_window_attributes_reply(conn, attrib_cookie, NULL); + if (attrib == NULL || tree == NULL) { + free(attrib); + free(tree); + return NULL; + } + + xcb_window_t root = tree->root; + xcb_visualid_t visual_id = attrib->visual; + free(attrib); + free(tree); + + xcb_screen_t *screen = get_screen_for_root(conn, root); + if (screen == NULL) + return NULL; + + return screen_get_visualtype(screen, visual_id, depth); +} + +static bool +visual_has_alpha(xcb_visualtype_t *visual, unsigned depth) +{ + uint32_t rgb_mask = visual->red_mask | + visual->green_mask | + visual->blue_mask; + + uint32_t all_mask = 0xffffffff >> (32 - depth); + + /* Do we have bits left over after RGB? 
*/ + return (all_mask & ~rgb_mask) != 0; +} + +VkBool32 radv_GetPhysicalDeviceXcbPresentationSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + xcb_connection_t* connection, + xcb_visualid_t visual_id) +{ + RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); + + struct wsi_x11_connection *wsi_conn = + wsi_x11_get_connection(device, connection); + + if (!wsi_conn->has_dri3) { + fprintf(stderr, "vulkan: No DRI3 support\n"); + return false; + } + + unsigned visual_depth; + if (!connection_get_visualtype(connection, visual_id, &visual_depth)) + return false; + + if (visual_depth != 24 && visual_depth != 32) + return false; + + return true; +} + +VkBool32 radv_GetPhysicalDeviceXlibPresentationSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + Display* dpy, + VisualID visualID) +{ + return radv_GetPhysicalDeviceXcbPresentationSupportKHR(physicalDevice, + queueFamilyIndex, + XGetXCBConnection(dpy), + visualID); +} + +static xcb_connection_t* +x11_surface_get_connection(VkIcdSurfaceBase *icd_surface) +{ + if (icd_surface->platform == VK_ICD_WSI_PLATFORM_XLIB) + return XGetXCBConnection(((VkIcdSurfaceXlib *)icd_surface)->dpy); + else + return ((VkIcdSurfaceXcb *)icd_surface)->connection; +} + +static xcb_window_t +x11_surface_get_window(VkIcdSurfaceBase *icd_surface) +{ + if (icd_surface->platform == VK_ICD_WSI_PLATFORM_XLIB) + return ((VkIcdSurfaceXlib *)icd_surface)->window; + else + return ((VkIcdSurfaceXcb *)icd_surface)->window; +} + +static VkResult +x11_surface_get_support(VkIcdSurfaceBase *icd_surface, + struct radv_physical_device *device, + uint32_t queueFamilyIndex, + VkBool32* pSupported) +{ + xcb_connection_t *conn = x11_surface_get_connection(icd_surface); + xcb_window_t window = x11_surface_get_window(icd_surface); + + struct wsi_x11_connection *wsi_conn = + wsi_x11_get_connection(device, conn); + if (!wsi_conn) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + if (!wsi_conn->has_dri3) { + fprintf(stderr, "vulkan: No DRI3 support\n"); + *pSupported = false; + return VK_SUCCESS; + } + + unsigned visual_depth; + if (!get_visualtype_for_window(conn, window, &visual_depth)) { + *pSupported = false; + return VK_SUCCESS; + } + + if (visual_depth != 24 && visual_depth != 32) { + *pSupported = false; + return VK_SUCCESS; + } + + *pSupported = true; + return VK_SUCCESS; +} + +static VkResult +x11_surface_get_capabilities(VkIcdSurfaceBase *icd_surface, + struct radv_physical_device *device, + VkSurfaceCapabilitiesKHR *caps) +{ + xcb_connection_t *conn = x11_surface_get_connection(icd_surface); + xcb_window_t window = x11_surface_get_window(icd_surface); + xcb_get_geometry_cookie_t geom_cookie; + xcb_generic_error_t *err; + xcb_get_geometry_reply_t *geom; + unsigned visual_depth; + + geom_cookie = xcb_get_geometry(conn, window); + + /* This does a round-trip. This is why we do get_geometry first and + * wait to read the reply until after we have a visual. + */ + xcb_visualtype_t *visual = + get_visualtype_for_window(conn, window, &visual_depth); + + geom = xcb_get_geometry_reply(conn, geom_cookie, &err); + if (geom) { + VkExtent2D extent = { geom->width, geom->height }; + caps->currentExtent = extent; + caps->minImageExtent = extent; + caps->maxImageExtent = extent; + } else { + /* This can happen if the client didn't wait for the configure event + * to come back from the compositor. In that case, we don't know the + * size of the window so we just return valid "I don't know" stuff. 
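+ * A currentExtent of (-1, -1), i.e. 0xffffffff per component, is the
+ * special value telling the application that the swapchain extent
+ * determines the surface size.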
+ */ + caps->currentExtent = (VkExtent2D) { -1, -1 }; + caps->minImageExtent = (VkExtent2D) { 1, 1 }; + caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX }; + } + free(err); + free(geom); + + if (visual_has_alpha(visual, visual_depth)) { + caps->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR | + VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR; + } else { + caps->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR | + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + } + + caps->minImageCount = 2; + caps->maxImageCount = 4; + caps->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; + caps->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; + caps->maxImageArrayLayers = 1; + caps->supportedUsageFlags = + VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + return VK_SUCCESS; +} + +static VkResult +x11_surface_get_formats(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + uint32_t *pSurfaceFormatCount, + VkSurfaceFormatKHR *pSurfaceFormats) +{ + if (pSurfaceFormats == NULL) { + *pSurfaceFormatCount = ARRAY_SIZE(formats); + return VK_SUCCESS; + } + + assert(*pSurfaceFormatCount >= ARRAY_SIZE(formats)); + typed_memcpy(pSurfaceFormats, formats, *pSurfaceFormatCount); + *pSurfaceFormatCount = ARRAY_SIZE(formats); + + return VK_SUCCESS; +} + +static VkResult +x11_surface_get_present_modes(VkIcdSurfaceBase *surface, + struct radv_physical_device *device, + uint32_t *pPresentModeCount, + VkPresentModeKHR *pPresentModes) +{ + if (pPresentModes == NULL) { + *pPresentModeCount = ARRAY_SIZE(present_modes); + return VK_SUCCESS; + } + + assert(*pPresentModeCount >= ARRAY_SIZE(present_modes)); + typed_memcpy(pPresentModes, present_modes, *pPresentModeCount); + *pPresentModeCount = ARRAY_SIZE(present_modes); + + return VK_SUCCESS; +} + +static VkResult +x11_surface_create_swapchain(VkIcdSurfaceBase *surface, + struct radv_device *device, + const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + struct radv_swapchain **swapchain); + +VkResult radv_CreateXcbSurfaceKHR( + VkInstance _instance, + const VkXcbSurfaceCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface) +{ + RADV_FROM_HANDLE(radv_instance, instance, _instance); + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR); + + VkIcdSurfaceXcb *surface; + + surface = radv_alloc2(&instance->alloc, pAllocator, sizeof *surface, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (surface == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + surface->base.platform = VK_ICD_WSI_PLATFORM_XCB; + surface->connection = pCreateInfo->connection; + surface->window = pCreateInfo->window; + + *pSurface = _VkIcdSurfaceBase_to_handle(&surface->base); + + return VK_SUCCESS; +} + +VkResult radv_CreateXlibSurfaceKHR( + VkInstance _instance, + const VkXlibSurfaceCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface) +{ + RADV_FROM_HANDLE(radv_instance, instance, _instance); + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR); + + VkIcdSurfaceXlib *surface; + + surface = radv_alloc2(&instance->alloc, pAllocator, sizeof *surface, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (surface == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + surface->base.platform = VK_ICD_WSI_PLATFORM_XLIB; + surface->dpy = pCreateInfo->dpy; + surface->window = pCreateInfo->window; + 
+ *pSurface = _VkIcdSurfaceBase_to_handle(&surface->base); + + return VK_SUCCESS; +} + +struct x11_image { + struct radv_image * image; + struct radv_device_memory * memory; + xcb_pixmap_t pixmap; + bool busy; + struct xshmfence * shm_fence; + uint32_t sync_fence; +}; + +struct x11_swapchain { + struct radv_swapchain base; + + xcb_connection_t * conn; + xcb_window_t window; + xcb_gc_t gc; + VkExtent2D extent; + uint32_t image_count; + + xcb_present_event_t event_id; + xcb_special_event_t * special_event; + uint64_t send_sbc; + uint32_t stamp; + + struct x11_image images[0]; +}; + +static VkResult +x11_get_images(struct radv_swapchain *radv_chain, + uint32_t* pCount, VkImage *pSwapchainImages) +{ + struct x11_swapchain *chain = (struct x11_swapchain *)radv_chain; + + if (pSwapchainImages == NULL) { + *pCount = chain->image_count; + return VK_SUCCESS; + } + + assert(chain->image_count <= *pCount); + for (uint32_t i = 0; i < chain->image_count; i++) + pSwapchainImages[i] = radv_image_to_handle(chain->images[i].image); + + *pCount = chain->image_count; + + return VK_SUCCESS; +} + +static VkResult +x11_handle_dri3_present_event(struct x11_swapchain *chain, + xcb_present_generic_event_t *event) +{ + switch (event->evtype) { + case XCB_PRESENT_CONFIGURE_NOTIFY: { + xcb_present_configure_notify_event_t *config = (void *) event; + + if (config->width != chain->extent.width || + config->height != chain->extent.height) + return vk_error(VK_ERROR_OUT_OF_DATE_KHR); + + break; + } + + case XCB_PRESENT_EVENT_IDLE_NOTIFY: { + xcb_present_idle_notify_event_t *idle = (void *) event; + + for (unsigned i = 0; i < chain->image_count; i++) { + if (chain->images[i].pixmap == idle->pixmap) { + chain->images[i].busy = false; + break; + } + } + + break; + } + + case XCB_PRESENT_COMPLETE_NOTIFY: + default: + break; + } + + return VK_SUCCESS; +} + +static VkResult +x11_acquire_next_image(struct radv_swapchain *radv_chain, + uint64_t timeout, + VkSemaphore semaphore, + uint32_t *image_index) +{ + struct x11_swapchain *chain = (struct x11_swapchain *)radv_chain; + + while (1) { + for (uint32_t i = 0; i < chain->image_count; i++) { + if (!chain->images[i].busy) { + /* We found a non-busy image */ + xshmfence_await(chain->images[i].shm_fence); + *image_index = i; + return VK_SUCCESS; + } + } + + xcb_flush(chain->conn); + xcb_generic_event_t *event = + xcb_wait_for_special_event(chain->conn, chain->special_event); + if (!event) + return vk_error(VK_ERROR_OUT_OF_DATE_KHR); + + VkResult result = x11_handle_dri3_present_event(chain, (void *)event); + free(event); + if (result != VK_SUCCESS) + return result; + } +} + +static VkResult +x11_queue_present(struct radv_swapchain *radv_chain, + struct radv_queue *queue, + uint32_t image_index) +{ + struct x11_swapchain *chain = (struct x11_swapchain *)radv_chain; + struct x11_image *image = &chain->images[image_index]; + + assert(image_index < chain->image_count); + + uint32_t options = XCB_PRESENT_OPTION_NONE; + + int64_t target_msc = 0; + int64_t divisor = 0; + int64_t remainder = 0; + + options |= XCB_PRESENT_OPTION_ASYNC; + + xshmfence_reset(image->shm_fence); + + ++chain->send_sbc; + xcb_void_cookie_t cookie = + xcb_present_pixmap(chain->conn, + chain->window, + image->pixmap, + (uint32_t) chain->send_sbc, + 0, /* valid */ + 0, /* update */ + 0, /* x_off */ + 0, /* y_off */ + XCB_NONE, /* target_crtc */ + XCB_NONE, + image->sync_fence, + options, + target_msc, + divisor, + remainder, 0, NULL); + xcb_discard_reply(chain->conn, cookie.sequence); + image->busy = true; + + 
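+ /* Flush so the PresentPixmap request reaches the X server immediately
+  * instead of waiting for the next blocking call on this connection.
+  */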
xcb_flush(chain->conn); + + return VK_SUCCESS; +} + +static VkResult +x11_image_init(struct radv_device *device, struct x11_swapchain *chain, + const VkSwapchainCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks* pAllocator, + struct x11_image *image) +{ + xcb_void_cookie_t cookie; + VkResult result = VK_SUCCESS; + int fd; + VkImage image_h; + bool bret; + struct radeon_surf *surface; + result = radv_image_create(radv_device_to_handle(device), + &(struct radv_image_create_info) { + .vk_info = + &(VkImageCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = VK_IMAGE_TYPE_2D, + .format = pCreateInfo->imageFormat, + .extent = { + .width = pCreateInfo->imageExtent.width, + .height = pCreateInfo->imageExtent.height, + .depth = 1 + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = 1, + /* FIXME: Need a way to use X tiling to allow scanout */ + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .flags = 0, + }, + .scanout = true}, + NULL, + &image_h); + if (result != VK_SUCCESS) + return result; + + image->image = radv_image_from_handle(image_h); + + VkDeviceMemory memory_h; + result = radv_AllocateMemory(radv_device_to_handle(device), + &(VkMemoryAllocateInfo) { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = image->image->size, + .memoryTypeIndex = 0, + }, + NULL /* XXX: pAllocator */, + &memory_h); + if (result != VK_SUCCESS) + goto fail_create_image; + + image->memory = radv_device_memory_from_handle(memory_h); + // image->memory->bo.is_winsys_bo = true; + + radv_BindImageMemory(VK_NULL_HANDLE, image_h, memory_h, 0); + + bret = device->ws->buffer_get_fd(device->ws, + image->memory->bo, &fd); + if (bret == false) + goto fail_alloc_memory; + + { + struct radeon_bo_metadata metadata; + radv_init_metadata(device, image->image, &metadata); + device->ws->buffer_set_metadata(image->memory->bo, &metadata); + } + surface = &image->image->surface; + uint32_t bpp = 32; + uint32_t depth = 24; + image->pixmap = xcb_generate_id(chain->conn); + + cookie = + xcb_dri3_pixmap_from_buffer_checked(chain->conn, + image->pixmap, + chain->window, + image->image->size, + pCreateInfo->imageExtent.width, + pCreateInfo->imageExtent.height, + surface->level[0].pitch_bytes, + depth, bpp, fd); + xcb_discard_reply(chain->conn, cookie.sequence); + + int fence_fd = xshmfence_alloc_shm(); + if (fence_fd < 0) + goto fail_pixmap; + + image->shm_fence = xshmfence_map_shm(fence_fd); + if (image->shm_fence == NULL) + goto fail_shmfence_alloc; + + image->sync_fence = xcb_generate_id(chain->conn); + xcb_dri3_fence_from_fd(chain->conn, + image->pixmap, + image->sync_fence, + false, + fence_fd); + + image->busy = false; + xshmfence_trigger(image->shm_fence); + + return VK_SUCCESS; + +fail_shmfence_alloc: + close(fence_fd); + +fail_pixmap: + cookie = xcb_free_pixmap(chain->conn, image->pixmap); + xcb_discard_reply(chain->conn, cookie.sequence); + +fail_alloc_memory: + radv_FreeMemory(radv_device_to_handle(chain->base.device), + radv_device_memory_to_handle(image->memory), pAllocator); + +fail_create_image: + radv_DestroyImage(radv_device_to_handle(chain->base.device), + radv_image_to_handle(image->image), pAllocator); + + return result; +} + +static void +x11_image_finish(struct x11_swapchain *chain, + const VkAllocationCallbacks* pAllocator, + struct x11_image *image) +{ + xcb_void_cookie_t cookie; + + cookie = xcb_sync_destroy_fence(chain->conn, image->sync_fence); + xcb_discard_reply(chain->conn, cookie.sequence); + 
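+ /* Unmap the shared-memory fence and free the pixmap before releasing
+  * the image and its backing memory below.
+  */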
xshmfence_unmap_shm(image->shm_fence); + + cookie = xcb_free_pixmap(chain->conn, image->pixmap); + xcb_discard_reply(chain->conn, cookie.sequence); + + radv_DestroyImage(radv_device_to_handle(chain->base.device), + radv_image_to_handle(image->image), pAllocator); + + radv_FreeMemory(radv_device_to_handle(chain->base.device), + radv_device_memory_to_handle(image->memory), pAllocator); + +} + +static VkResult +x11_swapchain_destroy(struct radv_swapchain *radv_chain, + const VkAllocationCallbacks *pAllocator) +{ + struct x11_swapchain *chain = (struct x11_swapchain *)radv_chain; + + for (uint32_t i = 0; i < chain->image_count; i++) + x11_image_finish(chain, pAllocator, &chain->images[i]); + + xcb_unregister_for_special_event(chain->conn, chain->special_event); + + radv_free2(&chain->base.device->alloc, pAllocator, chain); + + return VK_SUCCESS; +} + +static VkResult +x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, + struct radv_device *device, + const VkSwapchainCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks* pAllocator, + struct radv_swapchain **swapchain_out) +{ + struct x11_swapchain *chain; + xcb_void_cookie_t cookie; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR); + + int num_images = pCreateInfo->minImageCount; + + /* For true mailbox mode, we need at least 4 images: + * 1) One to scan out from + * 2) One to have queued for scan-out + * 3) One to be currently held by the Wayland compositor + * 4) One to render to + */ + if (pCreateInfo->presentMode == VK_PRESENT_MODE_MAILBOX_KHR) + num_images = MAX(num_images, 4); + + size_t size = sizeof(*chain) + num_images * sizeof(chain->images[0]); + chain = radv_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (chain == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + chain->base.device = device; + chain->base.destroy = x11_swapchain_destroy; + chain->base.get_images = x11_get_images; + chain->base.acquire_next_image = x11_acquire_next_image; + chain->base.queue_present = x11_queue_present; + + chain->conn = x11_surface_get_connection(icd_surface); + chain->window = x11_surface_get_window(icd_surface); + chain->extent = pCreateInfo->imageExtent; + chain->image_count = num_images; + + chain->send_sbc = 0; + chain->event_id = xcb_generate_id(chain->conn); + xcb_present_select_input(chain->conn, chain->event_id, chain->window, + XCB_PRESENT_EVENT_MASK_CONFIGURE_NOTIFY | + XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY | + XCB_PRESENT_EVENT_MASK_IDLE_NOTIFY); + + /* Create an XCB event queue to hold present events outside of the usual + * application event queue + */ + chain->special_event = + xcb_register_for_special_xge(chain->conn, &xcb_present_id, + chain->event_id, NULL); + + chain->gc = xcb_generate_id(chain->conn); + if (!chain->gc) { + /* FINISHME: Choose a better error. 
*/ + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_register; + } + + cookie = xcb_create_gc(chain->conn, + chain->gc, + chain->window, + XCB_GC_GRAPHICS_EXPOSURES, + (uint32_t []) { 0 }); + xcb_discard_reply(chain->conn, cookie.sequence); + + uint32_t image = 0; + for (; image < chain->image_count; image++) { + result = x11_image_init(device, chain, pCreateInfo, pAllocator, + &chain->images[image]); + if (result != VK_SUCCESS) + goto fail_init_images; + } + + *swapchain_out = &chain->base; + + return VK_SUCCESS; + +fail_init_images: + for (uint32_t j = 0; j < image; j++) + x11_image_finish(chain, pAllocator, &chain->images[j]); + +fail_register: + xcb_unregister_for_special_event(chain->conn, chain->special_event); + + radv_free2(&device->alloc, pAllocator, chain); + + return result; +} + +VkResult +radv_x11_init_wsi(struct radv_physical_device *device) +{ + struct wsi_x11 *wsi; + VkResult result; + + wsi = radv_alloc(&device->instance->alloc, sizeof(*wsi), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!wsi) { + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + int ret = pthread_mutex_init(&wsi->mutex, NULL); + if (ret != 0) { + if (ret == ENOMEM) { + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } else { + /* FINISHME: Choose a better error. */ + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + + goto fail_alloc; + } + + wsi->connections = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + if (!wsi->connections) { + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_mutex; + } + + wsi->base.get_support = x11_surface_get_support; + wsi->base.get_capabilities = x11_surface_get_capabilities; + wsi->base.get_formats = x11_surface_get_formats; + wsi->base.get_present_modes = x11_surface_get_present_modes; + wsi->base.create_swapchain = x11_surface_create_swapchain; + + device->wsi[VK_ICD_WSI_PLATFORM_XCB] = &wsi->base; + device->wsi[VK_ICD_WSI_PLATFORM_XLIB] = &wsi->base; + + return VK_SUCCESS; + +fail_mutex: + pthread_mutex_destroy(&wsi->mutex); +fail_alloc: + radv_free(&device->instance->alloc, wsi); +fail: + device->wsi[VK_ICD_WSI_PLATFORM_XCB] = NULL; + device->wsi[VK_ICD_WSI_PLATFORM_XLIB] = NULL; + + return result; +} + +void +radv_x11_finish_wsi(struct radv_physical_device *device) +{ + struct wsi_x11 *wsi = + (struct wsi_x11 *)device->wsi[VK_ICD_WSI_PLATFORM_XCB]; + + if (wsi) { + _mesa_hash_table_destroy(wsi->connections, NULL); + + pthread_mutex_destroy(&wsi->mutex); + + radv_free(&device->instance->alloc, wsi); + } +} diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c new file mode 100644 index 00000000000..a61a950de68 --- /dev/null +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -0,0 +1,1119 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based on si_state.c + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* command buffer handling for SI */ + +#include "radv_private.h" +#include "radv_cs.h" +#include "sid.h" +#include "radv_util.h" +#include "main/macros.h" + +#define SI_GS_PER_ES 128 + +static void +si_write_harvested_raster_configs(struct radv_physical_device *physical_device, + struct radeon_winsys_cs *cs, + unsigned raster_config, + unsigned raster_config_1) +{ + unsigned sh_per_se = MAX2(physical_device->rad_info.max_sh_per_se, 1); + unsigned num_se = MAX2(physical_device->rad_info.max_se, 1); + unsigned rb_mask = physical_device->rad_info.enabled_rb_mask; + unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16); + unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2); + unsigned rb_per_se = num_rb / num_se; + unsigned se_mask[4]; + unsigned se; + + se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask; + se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask; + se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask; + se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask; + + assert(num_se == 1 || num_se == 2 || num_se == 4); + assert(sh_per_se == 1 || sh_per_se == 2); + assert(rb_per_pkr == 1 || rb_per_pkr == 2); + + /* XXX: I can't figure out what the *_XSEL and *_YSEL + * fields are for, so I'm leaving them as their default + * values. */ + + if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || + (!se_mask[2] && !se_mask[3]))) { + raster_config_1 &= C_028354_SE_PAIR_MAP; + + if (!se_mask[0] && !se_mask[1]) { + raster_config_1 |= + S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3); + } else { + raster_config_1 |= + S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0); + } + } + + for (se = 0; se < num_se; se++) { + unsigned raster_config_se = raster_config; + unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se); + unsigned pkr1_mask = pkr0_mask << rb_per_pkr; + int idx = (se / 2) * 2; + + if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) { + raster_config_se &= C_028350_SE_MAP; + + if (!se_mask[idx]) { + raster_config_se |= + S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); + } else { + raster_config_se |= + S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); + } + } + + pkr0_mask &= rb_mask; + pkr1_mask &= rb_mask; + if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) { + raster_config_se &= C_028350_PKR_MAP; + + if (!pkr0_mask) { + raster_config_se |= + S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3); + } else { + raster_config_se |= + S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0); + } + } + + if (rb_per_se >= 2) { + unsigned rb0_mask = 1 << (se * rb_per_se); + unsigned rb1_mask = rb0_mask << 1; + + rb0_mask &= rb_mask; + rb1_mask &= rb_mask; + if (!rb0_mask || !rb1_mask) { + raster_config_se &= C_028350_RB_MAP_PKR0; + + if (!rb0_mask) { + raster_config_se |= + S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3); + } else { + raster_config_se |= + S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0); + } + } + + if (rb_per_se > 2) { + rb0_mask = 1 << (se * rb_per_se + rb_per_pkr); + rb1_mask = rb0_mask << 1; + rb0_mask &= rb_mask; + rb1_mask &= 
rb_mask; + if (!rb0_mask || !rb1_mask) { + raster_config_se &= C_028350_RB_MAP_PKR1; + + if (!rb0_mask) { + raster_config_se |= + S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3); + } else { + raster_config_se |= + S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0); + } + } + } + } + + /* GRBM_GFX_INDEX has a different offset on SI and CI+ */ + if (physical_device->rad_info.chip_class < CIK) + radeon_set_config_reg(cs, GRBM_GFX_INDEX, + SE_INDEX(se) | SH_BROADCAST_WRITES | + INSTANCE_BROADCAST_WRITES); + else + radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, + S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); + radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se); + if (physical_device->rad_info.chip_class >= CIK) + radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); + } + + /* GRBM_GFX_INDEX has a different offset on SI and CI+ */ + if (physical_device->rad_info.chip_class < CIK) + radeon_set_config_reg(cs, GRBM_GFX_INDEX, + SE_BROADCAST_WRITES | SH_BROADCAST_WRITES | + INSTANCE_BROADCAST_WRITES); + else + radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, + S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); +} + +static void +si_init_compute(struct radv_physical_device *physical_device, + struct radeon_winsys_cs *cs) +{ + radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3); + radeon_emit(cs, 0); + /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */ + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff)); + + if (physical_device->rad_info.chip_class >= CIK) { + /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ + radeon_set_sh_reg_seq(cs, + R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); + radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) | + S_00B864_SH1_CU_EN(0xffff)); + radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) | + S_00B868_SH1_CU_EN(0xffff)); + } + + /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID + * and is now per pipe, so it should be handled in the + * kernel if we want to use something other than the default value, + * which is now 0x22f. + */ + if (physical_device->rad_info.chip_class <= SI) { + /* XXX: This should be: + * (number of compute units) * 4 * (waves per simd) - 1 */ + + radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, + 0x190 /* Default value */); + } +} + + +void si_init_config(struct radv_physical_device *physical_device, + struct radv_cmd_buffer *cmd_buffer) +{ + unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16); + unsigned rb_mask = physical_device->rad_info.enabled_rb_mask; + unsigned raster_config, raster_config_1; + int i; + struct radeon_winsys_cs *cs = cmd_buffer->cs; + radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + radeon_emit(cs, CONTEXT_CONTROL_LOAD_ENABLE(1)); + radeon_emit(cs, CONTEXT_CONTROL_SHADOW_ENABLE(1)); + + radeon_set_context_reg(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); + + /* FIXME calculate these values somehow ??? 
*/ + radeon_set_context_reg(cs, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); + radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40); + radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2); + + radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); + radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); + + radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); + radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0); + if (physical_device->rad_info.chip_class < CIK) + radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | + S_008A14_CLIP_VTX_REORDER_ENA(1)); + + radeon_set_context_reg(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210); + radeon_set_context_reg(cs, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98); + + radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0); + + for (i = 0; i < 16; i++) { + radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i*8, 0); + radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i*8, fui(1.0)); + } + + switch (physical_device->rad_info.family) { + case CHIP_TAHITI: + case CHIP_PITCAIRN: + raster_config = 0x2a00126a; + raster_config_1 = 0x00000000; + break; + case CHIP_VERDE: + raster_config = 0x0000124a; + raster_config_1 = 0x00000000; + break; + case CHIP_OLAND: + raster_config = 0x00000082; + raster_config_1 = 0x00000000; + break; + case CHIP_HAINAN: + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + case CHIP_BONAIRE: + raster_config = 0x16000012; + raster_config_1 = 0x00000000; + break; + case CHIP_HAWAII: + raster_config = 0x3a00161a; + raster_config_1 = 0x0000002e; + break; + case CHIP_FIJI: + if (physical_device->rad_info.cik_macrotile_mode_array[0] == 0x000000e8) { + /* old kernels with old tiling config */ + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + } else { + raster_config = 0x3a00161a; + raster_config_1 = 0x0000002e; + } + break; + case CHIP_POLARIS10: + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + break; + case CHIP_POLARIS11: + raster_config = 0x16000012; + raster_config_1 = 0x00000000; + break; + case CHIP_TONGA: + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + break; + case CHIP_ICELAND: + if (num_rb == 1) + raster_config = 0x00000000; + else + raster_config = 0x00000002; + raster_config_1 = 0x00000000; + break; + case CHIP_CARRIZO: + raster_config = 0x00000002; + raster_config_1 = 0x00000000; + break; + case CHIP_KAVERI: + /* KV should be 0x00000002, but that causes problems with radeon */ + raster_config = 0x00000000; /* 0x00000002 */ + raster_config_1 = 0x00000000; + break; + case CHIP_KABINI: + case CHIP_MULLINS: + case CHIP_STONEY: + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + default: + fprintf(stderr, + "radeonsi: Unknown GPU, using 0 for raster_config\n"); + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + } + + /* Always use the default config when all backends are enabled + * (or when we failed to determine the enabled backends). 
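+ * Harvested parts get a per-shader-engine raster config written by
+ * si_write_harvested_raster_configs() instead.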
+ */ + if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { + radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, + raster_config); + if (physical_device->rad_info.chip_class >= CIK) + radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, + raster_config_1); + } else { + si_write_harvested_raster_configs(physical_device, cs, raster_config, raster_config_1); + } + + radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); + radeon_set_context_reg(cs, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); + radeon_set_context_reg(cs, R_028244_PA_SC_GENERIC_SCISSOR_BR, + S_028244_BR_X(16384) | S_028244_BR_Y(16384)); + radeon_set_context_reg(cs, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); + radeon_set_context_reg(cs, R_028034_PA_SC_SCREEN_SCISSOR_BR, + S_028034_BR_X(16384) | S_028034_BR_Y(16384)); + + radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF); + radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA); + /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */ + radeon_set_context_reg(cs, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0); + radeon_set_context_reg(cs, R_028820_PA_CL_NANINF_CNTL, 0); + + radeon_set_context_reg(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, fui(1.0)); + radeon_set_context_reg(cs, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0)); + radeon_set_context_reg(cs, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0)); + radeon_set_context_reg(cs, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0)); + + radeon_set_context_reg(cs, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); + radeon_set_context_reg(cs, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); + radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, 0x0); + radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE, + S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | + S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE)); + + radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0); + radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0); + radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0); + + if (physical_device->rad_info.chip_class >= CIK) { + radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0); + radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff)); + radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff)); + + if (physical_device->rad_info.num_good_compute_units / + (physical_device->rad_info.max_se * physical_device->rad_info.max_sh_per_se) <= 4) { + /* Too few available compute units per SH. Disallowing + * VS to run on CU0 could hurt us more than late VS + * allocation would help. + * + * LATE_ALLOC_VS = 2 is the highest safe number. + */ + radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff)); + radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff)); + radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2)); + } else { + /* Set LATE_ALLOC_VS == 31. It should be less than + * the number of scratch waves. Limitations: + * - VS can't execute on CU0. + * - If HS writes outputs to LDS, LS can't execute on CU0. 
+ */ + radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffe)); + radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xfffe)); + radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31)); + } + + radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff)); + } + + if (physical_device->rad_info.chip_class >= VI) { + radeon_set_context_reg(cs, R_028424_CB_DCC_CONTROL, + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) | + S_028424_OVERWRITE_COMBINER_WATERMARK(4)); + radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30); + radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); + radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION, + S_028B50_ACCUM_ISOLINE(32) | + S_028B50_ACCUM_TRI(11) | + S_028B50_ACCUM_QUAD(11) | + S_028B50_DONUT_SPLIT(16)); + } else { + radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); + } + + if (physical_device->rad_info.family == CHIP_STONEY) + radeon_set_context_reg(cs, R_028C40_PA_SC_SHADER_CONTROL, 0); + + si_init_compute(physical_device, cs); +} + +static void +get_viewport_xform(const VkViewport *viewport, + float scale[3], float translate[3]) +{ + float x = viewport->x; + float y = viewport->y; + float half_width = 0.5f * viewport->width; + float half_height = 0.5f * viewport->height; + double n = viewport->minDepth; + double f = viewport->maxDepth; + + scale[0] = half_width; + translate[0] = half_width + x; + scale[1] = half_height; + translate[1] = half_height + y; + + scale[2] = (f - n); + translate[2] = n; +} + +void +si_write_viewport(struct radeon_winsys_cs *cs, int first_vp, + int count, const VkViewport *viewports) +{ + int i; + + if (count == 0) { + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); + radeon_emit(cs, fui(1.0)); + radeon_emit(cs, fui(0.0)); + radeon_emit(cs, fui(1.0)); + radeon_emit(cs, fui(0.0)); + radeon_emit(cs, fui(1.0)); + radeon_emit(cs, fui(0.0)); + + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); + radeon_emit(cs, fui(0.0)); + radeon_emit(cs, fui(1.0)); + + return; + } + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + + first_vp * 4 * 6, count * 6); + + for (i = 0; i < count; i++) { + float scale[3], translate[3]; + + + get_viewport_xform(&viewports[i], scale, translate); + radeon_emit(cs, fui(scale[0])); + radeon_emit(cs, fui(translate[0])); + radeon_emit(cs, fui(scale[1])); + radeon_emit(cs, fui(translate[1])); + radeon_emit(cs, fui(scale[2])); + radeon_emit(cs, fui(translate[2])); + } + + for (i = 0; i < count; i++) { + float zmin = MIN2(viewports[i].minDepth, viewports[i].maxDepth); + float zmax = MAX2(viewports[i].minDepth, viewports[i].maxDepth); + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + + first_vp * 4 * 2, count * 2); + radeon_emit(cs, fui(zmin)); + radeon_emit(cs, fui(zmax)); + } +} + +void +si_write_scissors(struct radeon_winsys_cs *cs, int first, + int count, const VkRect2D *scissors) +{ + int i; + if (count == 0) + return; + + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + first * 4 * 2, count * 2); + for (i = 0; i < count; i++) { + radeon_emit(cs, S_028250_TL_X(scissors[i].offset.x) | + S_028250_TL_Y(scissors[i].offset.y) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(cs, S_028254_BR_X(scissors[i].offset.x + scissors[i].extent.width) | + S_028254_BR_Y(scissors[i].offset.y + scissors[i].extent.height)); + } +} + +uint32_t 
+si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer) +{ + enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class; + struct radeon_info *info = &cmd_buffer->device->instance->physicalDevice.rad_info; + unsigned prim = cmd_buffer->state.pipeline->graphics.prim; + unsigned primgroup_size = 128; /* recommended without a GS */ + unsigned max_primgroup_in_wave = 2; + /* SWITCH_ON_EOP(0) is always preferable. */ + bool wd_switch_on_eop = false; + bool ia_switch_on_eop = false; + bool ia_switch_on_eoi = false; + bool partial_vs_wave = false; + bool partial_es_wave = false; + + /* TODO GS */ + + /* TODO TES */ + + /* TODO linestipple */ + + if (chip_class >= CIK) { + /* WD_SWITCH_ON_EOP has no effect on GPUs with less than + * 4 shader engines. Set 1 to pass the assertion below. + * The other cases are hardware requirements. */ + if (info->max_se < 4 || + prim == V_008958_DI_PT_POLYGON || + prim == V_008958_DI_PT_LINELOOP || + prim == V_008958_DI_PT_TRIFAN || + prim == V_008958_DI_PT_TRISTRIP_ADJ) + // info->primitive_restart || + // info->count_from_stream_output) + wd_switch_on_eop = true; + + /* TODO HAWAII */ + + /* Required on CIK and later. */ + if (info->max_se > 2 && !wd_switch_on_eop) + ia_switch_on_eoi = true; + + /* Required by Hawaii and, for some special cases, by VI. */ +#if 0 + if (ia_switch_on_eoi && + (sctx->b.family == CHIP_HAWAII || + (sctx->b.chip_class == VI && + (sctx->gs_shader.cso || max_primgroup_in_wave != 2)))) + partial_vs_wave = true; +#endif + +#if 0 + /* Instancing bug on Bonaire. */ + if (sctx->b.family == CHIP_BONAIRE && ia_switch_on_eoi && + (info->indirect || info->instance_count > 1)) + partial_vs_wave = true; +#endif + /* If the WD switch is false, the IA switch must be false too. */ + assert(wd_switch_on_eop || !ia_switch_on_eop); + } + /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ + if (ia_switch_on_eoi) + partial_es_wave = true; + + /* GS requirement. */ +#if 0 + if (SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3) + partial_es_wave = true; +#endif + + /* Hw bug with single-primitive instances and SWITCH_ON_EOI + * on multi-SE chips. */ +#if 0 + if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi && + (info->indirect || + (info->instance_count > 1 && + si_num_prims_for_vertices(info) <= 1))) + sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; +#endif + return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | + S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | + S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | + S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | + S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) | + S_028AA8_WD_SWITCH_ON_EOP(chip_class >= CIK ? wd_switch_on_eop : 0) | + S_028AA8_MAX_PRIMGRP_IN_WAVE(chip_class >= VI ? 
+ max_primgroup_in_wave : 0); + +} + +void +si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) +{ + enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class; + unsigned cp_coher_cntl = 0; + + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128); + + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_ICACHE) + cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_SMEM_L1) + cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) + cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) { + cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); + if (chip_class >= VI) + cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1); + } + + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { + cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | + S_0085F0_CB0_DEST_BASE_ENA(1) | + S_0085F0_CB1_DEST_BASE_ENA(1) | + S_0085F0_CB2_DEST_BASE_ENA(1) | + S_0085F0_CB3_DEST_BASE_ENA(1) | + S_0085F0_CB4_DEST_BASE_ENA(1) | + S_0085F0_CB5_DEST_BASE_ENA(1) | + S_0085F0_CB6_DEST_BASE_ENA(1) | + S_0085F0_CB7_DEST_BASE_ENA(1); + + /* Necessary for DCC */ + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= VI) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) | + EVENT_INDEX(5)); + radeon_emit(cmd_buffer->cs, 0); + radeon_emit(cmd_buffer->cs, 0); + radeon_emit(cmd_buffer->cs, 0); + radeon_emit(cmd_buffer->cs, 0); + } + } + + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { + cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | + S_0085F0_DB_DEST_BASE_ENA(1); + } + + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + } + + if (!(cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB))) { + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + } else if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + } + } + + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + } + + /* VGT state sync */ + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VGT_FLUSH) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + + /* Make sure ME is idle (it executes most packets) before continuing. + * This prevents read-after-write hazards between PFP and ME. 
+ */ + if (cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cmd_buffer->cs, 0); + } + + /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle. + * Therefore, it should be last. Done in PFP. + */ + if (cp_coher_cntl) { + /* ACQUIRE_MEM is only required on a compute ring. */ + radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); + radeon_emit(cmd_buffer->cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cmd_buffer->cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cmd_buffer->cs, 0); /* CP_COHER_BASE */ + radeon_emit(cmd_buffer->cs, 0x0000000A); /* POLL_INTERVAL */ + } + + cmd_buffer->state.flush_bits = 0; +} + + +/* Set this if you want the 3D engine to wait until CP DMA is done. + * It should be set on the last CP DMA packet. */ +#define R600_CP_DMA_SYNC (1 << 0) /* R600+ */ + +/* Set this if the source data was used as a destination in a previous CP DMA + * packet. It's for preventing a read-after-write (RAW) hazard between two + * CP DMA packets. */ +#define SI_CP_DMA_RAW_WAIT (1 << 1) /* SI+ */ +#define CIK_CP_DMA_USE_L2 (1 << 2) + +/* Alignment for optimal performance. */ +#define CP_DMA_ALIGNMENT 32 +/* The max number of bytes to copy per packet. */ +#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT) + +static void si_emit_cp_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer, + uint64_t dst_va, uint64_t src_va, + unsigned size, unsigned flags) +{ + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; + uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0; + uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? + S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | + S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9); + + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */ + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } + + /* CP DMA is executed in ME, but index buffers are read by PFP. + * This ensures that ME (CP DMA) is idle before PFP starts fetching + * indices. If we wanted to execute CP DMA in PFP, this packet + * should precede it. + */ + if (sync_flag) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } +} + +/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. 
*/ +static void si_emit_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, + uint64_t dst_va, unsigned size, + uint32_t clear_value, unsigned flags) +{ + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; + uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; + uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0; + uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0; + + assert(size); + assert((size & ((1<<21)-1)) == size); + + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9); + + if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, sync_flag | dst_sel | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, 0); + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } else { + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, clear_value); /* DATA [31:0] */ + radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + } + + /* See "copy_buffer" for explanation. */ + if (sync_flag) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } +} + +static void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count, + uint64_t remaining_size, unsigned *flags) +{ + + /* Flush the caches for the first copy only. + * Also wait for the previous CP DMA operations. + */ + if (cmd_buffer->state.flush_bits) { + si_emit_cache_flush(cmd_buffer); + *flags |= SI_CP_DMA_RAW_WAIT; + } + + /* Do the synchronization after the last dma, so that all data + * is written to memory. + */ + if (byte_count == remaining_size) + *flags |= R600_CP_DMA_SYNC; +} + +static void si_cp_dma_realign_engine(struct radv_cmd_buffer *cmd_buffer, unsigned size) +{ + uint64_t va; + uint32_t offset; + unsigned dma_flags = 0; + unsigned buf_size = CP_DMA_ALIGNMENT * 2; + void *ptr; + + assert(size < CP_DMA_ALIGNMENT); + + radv_cmd_buffer_upload_alloc(cmd_buffer, buf_size, CP_DMA_ALIGNMENT, &offset, &ptr); + + va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va += offset; + + si_cp_dma_prepare(cmd_buffer, size, size, &dma_flags); + + si_emit_cp_dma_copy_buffer(cmd_buffer, va, va + CP_DMA_ALIGNMENT, size, + dma_flags); +} + +void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer, + uint64_t src_va, uint64_t dest_va, + uint64_t size) +{ + uint64_t main_src_va, main_dest_va; + uint64_t skipped_size = 0, realign_size = 0; + + + if (cmd_buffer->device->instance->physicalDevice.rad_info.family <= CHIP_CARRIZO || + cmd_buffer->device->instance->physicalDevice.rad_info.family == CHIP_STONEY) { + /* If the size is not aligned, we must add a dummy copy at the end + * just to align the internal counter. Otherwise, the DMA engine + * would slow down by an order of magnitude for following copies. 
+ */ + if (size % CP_DMA_ALIGNMENT) + realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT); + + /* If the copy begins unaligned, we must start copying from the next + * aligned block and the skipped part should be copied after everything + * else has been copied. Only the src alignment matters, not dst. + */ + if (src_va % CP_DMA_ALIGNMENT) { + skipped_size = CP_DMA_ALIGNMENT - (src_va % CP_DMA_ALIGNMENT); + /* The main part will be skipped if the size is too small. */ + skipped_size = MIN2(skipped_size, size); + size -= skipped_size; + } + } + main_src_va = src_va + skipped_size; + main_dest_va = dest_va + skipped_size; + + while (size) { + unsigned dma_flags = 0; + unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + + si_cp_dma_prepare(cmd_buffer, byte_count, + size + skipped_size + realign_size, + &dma_flags); + + si_emit_cp_dma_copy_buffer(cmd_buffer, main_dest_va, main_src_va, + byte_count, dma_flags); + + size -= byte_count; + main_src_va += byte_count; + main_dest_va += byte_count; + } + + if (skipped_size) { + unsigned dma_flags = 0; + + si_cp_dma_prepare(cmd_buffer, skipped_size, + size + skipped_size + realign_size, + &dma_flags); + + si_emit_cp_dma_copy_buffer(cmd_buffer, dest_va, src_va, + skipped_size, dma_flags); + } + if (realign_size) + si_cp_dma_realign_engine(cmd_buffer, realign_size); +} + +void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, + uint64_t size, unsigned value) +{ + + if (!size) + return; + + assert(va % 4 == 0 && size % 4 == 0); + + while (size) { + unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + unsigned dma_flags = 0; + + si_cp_dma_prepare(cmd_buffer, byte_count, size, &dma_flags); + + /* Emit the clear packet. */ + si_emit_cp_dma_clear_buffer(cmd_buffer, va, byte_count, value, + dma_flags); + + size -= byte_count; + va += byte_count; + } +} + +/* For MSAA sample positions. */ +#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ + (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \ + (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \ + (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ + (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) + + +/* 2xMSAA + * There are two locations (4, 4), (-4, -4). */ +const uint32_t eg_sample_locs_2x[4] = { + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), +}; +const unsigned eg_max_dist_2x = 4; +/* 4xMSAA + * There are 4 locations: (-2, -6), (6, -2), (-6, 2), (2, 6). 
*/ +const uint32_t eg_sample_locs_4x[4] = { + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), +}; +const unsigned eg_max_dist_4x = 6; + +/* Cayman 8xMSAA */ +static const uint32_t cm_sample_locs_8x[] = { + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), +}; +static const unsigned cm_max_dist_8x = 8; +/* Cayman 16xMSAA */ +static const uint32_t cm_sample_locs_16x[] = { + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), +}; +static const unsigned cm_max_dist_16x = 8; + +unsigned radv_cayman_get_maxdist(int log_samples) +{ + unsigned max_dist[] = { + 0, + eg_max_dist_2x, + eg_max_dist_4x, + cm_max_dist_8x, + cm_max_dist_16x + }; + return max_dist[log_samples]; +} + +void radv_cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples) +{ + switch (nr_samples) { + default: + case 1: + radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0); + radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0); + radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0); + radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0); + break; + case 2: + radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_2x[0]); + radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_2x[1]); + radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_2x[2]); + radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_2x[3]); + break; + case 4: + radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_4x[0]); + radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_4x[1]); + radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_4x[2]); + radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_4x[3]); + break; + case 8: + radeon_set_context_reg_seq(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14); + radeon_emit(cs, cm_sample_locs_8x[0]); + radeon_emit(cs, cm_sample_locs_8x[4]); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + radeon_emit(cs, cm_sample_locs_8x[1]); + radeon_emit(cs, cm_sample_locs_8x[5]); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + radeon_emit(cs, cm_sample_locs_8x[2]); + radeon_emit(cs, cm_sample_locs_8x[6]); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + radeon_emit(cs, cm_sample_locs_8x[3]); + radeon_emit(cs, cm_sample_locs_8x[7]); + break; + case 16: + 
radeon_set_context_reg_seq(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16); + radeon_emit(cs, cm_sample_locs_16x[0]); + radeon_emit(cs, cm_sample_locs_16x[4]); + radeon_emit(cs, cm_sample_locs_16x[8]); + radeon_emit(cs, cm_sample_locs_16x[12]); + radeon_emit(cs, cm_sample_locs_16x[1]); + radeon_emit(cs, cm_sample_locs_16x[5]); + radeon_emit(cs, cm_sample_locs_16x[9]); + radeon_emit(cs, cm_sample_locs_16x[13]); + radeon_emit(cs, cm_sample_locs_16x[2]); + radeon_emit(cs, cm_sample_locs_16x[6]); + radeon_emit(cs, cm_sample_locs_16x[10]); + radeon_emit(cs, cm_sample_locs_16x[14]); + radeon_emit(cs, cm_sample_locs_16x[3]); + radeon_emit(cs, cm_sample_locs_16x[7]); + radeon_emit(cs, cm_sample_locs_16x[11]); + radeon_emit(cs, cm_sample_locs_16x[15]); + break; + } +} + +static void radv_cayman_get_sample_position(struct radv_device *device, + unsigned sample_count, + unsigned sample_index, float *out_value) +{ + int offset, index; + struct { + int idx:4; + } val; + switch (sample_count) { + case 1: + default: + out_value[0] = out_value[1] = 0.5; + break; + case 2: + offset = 4 * (sample_index * 2); + val.idx = (eg_sample_locs_2x[0] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (eg_sample_locs_2x[0] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + case 4: + offset = 4 * (sample_index * 2); + val.idx = (eg_sample_locs_4x[0] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (eg_sample_locs_4x[0] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + case 8: + offset = 4 * (sample_index % 4 * 2); + index = (sample_index / 4) * 4; + val.idx = (cm_sample_locs_8x[index] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (cm_sample_locs_8x[index] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + case 16: + offset = 4 * (sample_index % 4 * 2); + index = (sample_index / 4) * 4; + val.idx = (cm_sample_locs_16x[index] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (cm_sample_locs_16x[index] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + } +} + +void radv_device_init_msaa(struct radv_device *device) +{ + int i; + radv_cayman_get_sample_position(device, 1, 0, device->sample_locations_1x[0]); + + for (i = 0; i < 2; i++) + radv_cayman_get_sample_position(device, 2, i, device->sample_locations_2x[i]); + for (i = 0; i < 4; i++) + radv_cayman_get_sample_position(device, 4, i, device->sample_locations_4x[i]); + for (i = 0; i < 8; i++) + radv_cayman_get_sample_position(device, 8, i, device->sample_locations_8x[i]); + for (i = 0; i < 16; i++) + radv_cayman_get_sample_position(device, 16, i, device->sample_locations_16x[i]); +} diff --git a/src/amd/vulkan/vk_format.h b/src/amd/vulkan/vk_format.h new file mode 100644 index 00000000000..e0087f1dea5 --- /dev/null +++ b/src/amd/vulkan/vk_format.h @@ -0,0 +1,449 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Based on u_format.h which is: + * Copyright 2009-2010 Vmware, Inc. 
+ * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include <assert.h> +#include <vulkan/vulkan.h> +enum vk_format_layout { + /** + * Formats with vk_format_block::width == vk_format_block::height == 1 + * that can be described as an ordinary data structure. + */ + VK_FORMAT_LAYOUT_PLAIN = 0, + + /** + * Formats with sub-sampled channels. + * + * This is for formats like YVYU where there is less than one sample per + * pixel. + */ + VK_FORMAT_LAYOUT_SUBSAMPLED = 3, + + /** + * S3 Texture Compression formats. + */ + VK_FORMAT_LAYOUT_S3TC = 4, + + /** + * Red-Green Texture Compression formats. + */ + VK_FORMAT_LAYOUT_RGTC = 5, + + /** + * Ericsson Texture Compression + */ + VK_FORMAT_LAYOUT_ETC = 6, + + /** + * BC6/7 Texture Compression + */ + VK_FORMAT_LAYOUT_BPTC = 7, + + /** + * ASTC + */ + VK_FORMAT_LAYOUT_ASTC = 8, + + /** + * Everything else that doesn't fit in any of the above layouts. + */ + VK_FORMAT_LAYOUT_OTHER = 9 +}; + +struct vk_format_block +{ + /** Block width in pixels */ + unsigned width; + + /** Block height in pixels */ + unsigned height; + + /** Block size in bits */ + unsigned bits; +}; + +enum vk_format_type { + VK_FORMAT_TYPE_VOID = 0, + VK_FORMAT_TYPE_UNSIGNED = 1, + VK_FORMAT_TYPE_SIGNED = 2, + VK_FORMAT_TYPE_FIXED = 3, + VK_FORMAT_TYPE_FLOAT = 4 +}; + + +enum vk_format_colorspace { + VK_FORMAT_COLORSPACE_RGB = 0, + VK_FORMAT_COLORSPACE_SRGB = 1, + VK_FORMAT_COLORSPACE_YUV = 2, + VK_FORMAT_COLORSPACE_ZS = 3 +}; + +struct vk_format_channel_description { + unsigned type:5; + unsigned normalized:1; + unsigned pure_integer:1; + unsigned scaled:1; + unsigned size:8; + unsigned shift:16; +}; + +struct vk_format_description +{ + VkFormat format; + const char *name; + const char *short_name; + + struct vk_format_block block; + enum vk_format_layout layout; + + unsigned nr_channels:3; + unsigned is_array:1; + unsigned is_bitmask:1; + unsigned is_mixed:1; + + struct vk_format_channel_description channel[4]; + + unsigned char swizzle[4]; + + enum vk_format_colorspace colorspace; +}; + +extern const struct vk_format_description vk_format_description_table[]; + +const struct vk_format_description *vk_format_description(VkFormat format); + +/** + * Return total bits needed for the pixel format per block. 
+ */ +static inline uint +vk_format_get_blocksizebits(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return 0; + } + + return desc->block.bits; +} + +/** + * Return bytes per block (not pixel) for the given format. + */ +static inline uint +vk_format_get_blocksize(VkFormat format) +{ + uint bits = vk_format_get_blocksizebits(format); + uint bytes = bits / 8; + + assert(bits % 8 == 0); + assert(bytes > 0); + if (bytes == 0) { + bytes = 1; + } + + return bytes; +} + +static inline uint +vk_format_get_blockwidth(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return 1; + } + + return desc->block.width; +} + +static inline uint +vk_format_get_blockheight(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return 1; + } + + return desc->block.height; +} + +/** + * Return the index of the first non-void channel + * -1 if no non-void channels + */ +static inline int +vk_format_get_first_non_void_channel(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + int i; + + for (i = 0; i < 4; i++) + if (desc->channel[i].type != VK_FORMAT_TYPE_VOID) + break; + + if (i == 4) + return -1; + + return i; +} + +enum vk_swizzle { + VK_SWIZZLE_X, + VK_SWIZZLE_Y, + VK_SWIZZLE_Z, + VK_SWIZZLE_W, + VK_SWIZZLE_0, + VK_SWIZZLE_1, + VK_SWIZZLE_NONE, + VK_SWIZZLE_MAX, /**< Number of enums counter (must be last) */ +}; + +static inline VkImageAspectFlags +vk_format_aspects(VkFormat format) +{ + switch (format) { + case VK_FORMAT_UNDEFINED: + return 0; + + case VK_FORMAT_S8_UINT: + return VK_IMAGE_ASPECT_STENCIL_BIT; + + case VK_FORMAT_D16_UNORM_S8_UINT: + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D32_SFLOAT: + return VK_IMAGE_ASPECT_DEPTH_BIT; + + default: + return VK_IMAGE_ASPECT_COLOR_BIT; + } +} + +static inline enum vk_swizzle +radv_swizzle_conv(int idx, const unsigned char chan[4], VkComponentSwizzle vk_swiz) +{ + int x; + switch (vk_swiz) { + case VK_COMPONENT_SWIZZLE_IDENTITY: + return chan[idx]; + case VK_COMPONENT_SWIZZLE_ZERO: + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_ONE: + return VK_SWIZZLE_1; + case VK_COMPONENT_SWIZZLE_R: + for (x = 0; x < 4; x++) + if (chan[x] == 0) + return x; + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_G: + for (x = 0; x < 4; x++) + if (chan[x] == 1) + return x; + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_B: + for (x = 0; x < 4; x++) + if (chan[x] == 2) + return x; + return VK_SWIZZLE_0; + case VK_COMPONENT_SWIZZLE_A: + for (x = 0; x < 4; x++) + if (chan[x] == 3) + return x; + return VK_SWIZZLE_1; + default: + return chan[idx]; + } +} + +static inline void vk_format_compose_swizzles(const VkComponentMapping *mapping, + const unsigned char swz[4], + enum vk_swizzle dst[4]) +{ + dst[0] = radv_swizzle_conv(0, swz, mapping->r); + dst[1] = radv_swizzle_conv(1, swz, mapping->g); + dst[2] = radv_swizzle_conv(2, swz, mapping->b); + dst[3] = radv_swizzle_conv(3, swz, mapping->a); +} + +static inline bool +vk_format_is_compressed(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return false; + } + + switch (desc->layout) { + case 
VK_FORMAT_LAYOUT_S3TC: + case VK_FORMAT_LAYOUT_RGTC: + case VK_FORMAT_LAYOUT_ETC: + case VK_FORMAT_LAYOUT_BPTC: + case VK_FORMAT_LAYOUT_ASTC: + /* XXX add other formats in the future */ + return true; + default: + return false; + } +} + +static inline bool +vk_format_has_depth(const struct vk_format_description *desc) +{ + return desc->colorspace == VK_FORMAT_COLORSPACE_ZS && + desc->swizzle[0] != VK_SWIZZLE_NONE; +} + +static inline bool +vk_format_has_stencil(const struct vk_format_description *desc) +{ + return desc->colorspace == VK_FORMAT_COLORSPACE_ZS && + desc->swizzle[1] != VK_SWIZZLE_NONE; +} + +static inline bool +vk_format_is_depth_or_stencil(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return false; + } + + return vk_format_has_depth(desc) || + vk_format_has_stencil(desc); +} + +static inline bool +vk_format_is_depth(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + + assert(desc); + if (!desc) { + return false; + } + + return vk_format_has_depth(desc); +} + +static inline bool +vk_format_is_color(VkFormat format) +{ + return !vk_format_is_depth_or_stencil(format); +} + +static inline VkFormat +vk_format_depth_only(VkFormat format) +{ + switch (format) { + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_D16_UNORM; + case VK_FORMAT_D24_UNORM_S8_UINT: + return VK_FORMAT_X8_D24_UNORM_PACK32; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return VK_FORMAT_D32_SFLOAT; + default: + return format; + } +} + +static inline bool +vk_format_is_int(VkFormat format) +{ + const struct vk_format_description *desc = vk_format_description(format); + int channel = vk_format_get_first_non_void_channel(format); + + return channel >= 0 && desc->channel[channel].pure_integer; +} + +static inline VkFormat +vk_format_stencil_only(VkFormat format) +{ + return VK_FORMAT_S8_UINT; +} + +static inline uint +vk_format_get_component_bits(VkFormat format, + enum vk_format_colorspace colorspace, + uint component) +{ + const struct vk_format_description *desc = vk_format_description(format); + enum vk_format_colorspace desc_colorspace; + + assert(format); + if (!format) { + return 0; + } + + assert(component < 4); + + /* Treat RGB and SRGB as equivalent. */ + if (colorspace == VK_FORMAT_COLORSPACE_SRGB) { + colorspace = VK_FORMAT_COLORSPACE_RGB; + } + if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) { + desc_colorspace = VK_FORMAT_COLORSPACE_RGB; + } else { + desc_colorspace = desc->colorspace; + } + + if (desc_colorspace != colorspace) { + return 0; + } + + switch (desc->swizzle[component]) { + case VK_SWIZZLE_X: + return desc->channel[0].size; + case VK_SWIZZLE_Y: + return desc->channel[1].size; + case VK_SWIZZLE_Z: + return desc->channel[2].size; + case VK_SWIZZLE_W: + return desc->channel[3].size; + default: + return 0; + } +} +#ifdef __cplusplus +} // extern "C" { +#endif diff --git a/src/amd/vulkan/vk_format_layout.csv b/src/amd/vulkan/vk_format_layout.csv new file mode 100644 index 00000000000..ae9ceda08eb --- /dev/null +++ b/src/amd/vulkan/vk_format_layout.csv @@ -0,0 +1,188 @@ +/* this is pretty much taken from the gallium one. 
*/ + + +VK_FORMAT_UNDEFINED , plain, 1, 1, u8 , , , , x001, rgb +VK_FORMAT_R4G4_UNORM_PACK8 , plain, 1, 1, un4 , un4 , , , xy01, rgb +VK_FORMAT_R4G4B4A4_UNORM_PACK16 , plain, 1, 1, un4 , un4 , un4 , un4 , wzyx, rgb +VK_FORMAT_B4G4R4A4_UNORM_PACK16 , plain, 1, 1, un4 , un4 , un4 , un4 , wxyz, rgb +VK_FORMAT_R5G6B5_UNORM_PACK16 , plain, 1, 1, un5 , un6 , un5 , , zyx1, rgb +VK_FORMAT_B5G6R5_UNORM_PACK16 , plain, 1, 1, un5 , un6 , un5 , , xyz1, rgb +VK_FORMAT_R5G5B5A1_UNORM_PACK16 , plain, 1, 1, un1 , un5 , un5 , un5 , wzyx, rgb +VK_FORMAT_B5G5R5A1_UNORM_PACK16 , plain, 1, 1, un1 , un5 , un5 , un5 , wxyz, rgb +VK_FORMAT_A1R5G5B5_UNORM_PACK16 , plain, 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb +VK_FORMAT_R8_UNORM , plain, 1, 1, un8 , , , , x001, rgb +VK_FORMAT_R8_SNORM , plain, 1, 1, sn8 , , , , x001, rgb +VK_FORMAT_R8_USCALED , plain, 1, 1, us8 , , , , x001, rgb +VK_FORMAT_R8_SSCALED , plain, 1, 1, ss8 , , , , x001, rgb +VK_FORMAT_R8_UINT , plain, 1, 1, up8 , , , , x001, rgb +VK_FORMAT_R8_SINT , plain, 1, 1, sp8 , , , , x001, rgb +VK_FORMAT_R8_SRGB , plain, 1, 1, un8 , , , , x001, srgb +VK_FORMAT_R8G8_UNORM , plain, 1, 1, un8 , un8 , , , xy01, rgb +VK_FORMAT_R8G8_SNORM , plain, 1, 1, sn8 , sn8 , , , xy01, rgb +VK_FORMAT_R8G8_USCALED , plain, 1, 1, us8 , us8 , , , xy01, rgb +VK_FORMAT_R8G8_SSCALED , plain, 1, 1, ss8 , ss8 , , , xy01, rgb +VK_FORMAT_R8G8_UINT , plain, 1, 1, up8 , up8 , , , xy01, rgb +VK_FORMAT_R8G8_SINT , plain, 1, 1, sp8 , sp8 , , , xy01, rgb +VK_FORMAT_R8G8_SRGB , plain, 1, 1, un8 , un8 , , , xy01, srgb +VK_FORMAT_R8G8B8_UNORM , plain, 1, 1, un8 , un8 , un8 , , xyz1, rgb +VK_FORMAT_R8G8B8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , , xyz1, rgb +VK_FORMAT_R8G8B8_USCALED , plain, 1, 1, us8 , us8 , us8 , , xyz1, rgb +VK_FORMAT_R8G8B8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , , xyz1, rgb +VK_FORMAT_R8G8B8_UINT , plain, 1, 1, up8 , up8 , up8 , , xyz1, rgb +VK_FORMAT_R8G8B8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , , xyz1, rgb +VK_FORMAT_R8G8B8_SRGB , plain, 1, 1, un8 , un8 , un8 , , xyz1, srgb +VK_FORMAT_B8G8R8_UNORM , plain, 1, 1, un8 , un8 , un8 , , zyx1, rgb +VK_FORMAT_B8G8R8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , , zyx1, rgb +VK_FORMAT_B8G8R8_USCALED , plain, 1, 1, us8 , us8 , us8 , , zyx1, rgb +VK_FORMAT_B8G8R8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , , zyx1, rgb +VK_FORMAT_B8G8R8_UINT , plain, 1, 1, up8 , up8 , up8 , , zyx1, rgb +VK_FORMAT_B8G8R8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , , zyx1, rgb +VK_FORMAT_B8G8R8_SRGB , plain, 1, 1, un8 , un8 , un8 , , zyx1, srgb +VK_FORMAT_R8G8B8A8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb +VK_FORMAT_R8G8B8A8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb +VK_FORMAT_R8G8B8A8_USCALED , plain, 1, 1, us8 , us8 , us8 , us8 , xyzw, rgb +VK_FORMAT_R8G8B8A8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , ss8 , xyzw, rgb +VK_FORMAT_R8G8B8A8_UINT , plain, 1, 1, up8 , up8 , up8 , up8 , xyzw, rgb +VK_FORMAT_R8G8B8A8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , sp8 , xyzw, rgb +VK_FORMAT_R8G8B8A8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, srgb +VK_FORMAT_B8G8R8A8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, rgb +VK_FORMAT_B8G8R8A8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , zyxw, rgb +VK_FORMAT_B8G8R8A8_USCALED , plain, 1, 1, us8 , us8 , us8 , us8 , zyxw, rgb +VK_FORMAT_B8G8R8A8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , ss8 , zyxw, rgb +VK_FORMAT_B8G8R8A8_UINT , plain, 1, 1, up8 , up8 , up8 , up8 , zyxw, rgb +VK_FORMAT_B8G8R8A8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , sp8 , zyxw, rgb +VK_FORMAT_B8G8R8A8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , 
zyxw, srgb +VK_FORMAT_A8B8G8R8_UNORM_PACK32 , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb +VK_FORMAT_A8B8G8R8_SNORM_PACK32 , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb +VK_FORMAT_A8B8G8R8_USCALED_PACK32 , plain, 1, 1, us8 , us8 , us8 , us8 , xyzw, rgb +VK_FORMAT_A8B8G8R8_SSCALED_PACK32 , plain, 1, 1, ss8 , ss8 , ss8 , ss8 , xyzw, rgb +VK_FORMAT_A8B8G8R8_UINT_PACK32 , plain, 1, 1, up8 , up8 , up8 , up8 , xyzw, rgb +VK_FORMAT_A8B8G8R8_SINT_PACK32 , plain, 1, 1, sp8 , sp8 , sp8 , sp8 , xyzw, rgb +VK_FORMAT_A8B8G8R8_SRGB_PACK32 , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, srgb +VK_FORMAT_A2R10G10B10_UNORM_PACK32 , plain, 1, 1, un10, un10, un10, un2 , zyxw, rgb +VK_FORMAT_A2R10G10B10_SNORM_PACK32 , plain, 1, 1, sn10, sn10, sn10, sn2 , zyxw, rgb +VK_FORMAT_A2R10G10B10_USCALED_PACK32 , plain, 1, 1, us10, us10, us10, us2 , zyxw, rgb +VK_FORMAT_A2R10G10B10_SSCALED_PACK32 , plain, 1, 1, ss10, ss10, ss10, ss2 , zyxw, rgb +VK_FORMAT_A2R10G10B10_UINT_PACK32 , plain, 1, 1, up10, up10, up10, up2 , zyxw, rgb +VK_FORMAT_A2R10G10B10_SINT_PACK32 , plain, 1, 1, sp10, sp10, sp10, sp2 , zyxw, rgb +VK_FORMAT_A2B10G10R10_UNORM_PACK32 , plain, 1, 1, un10, un10, un10, un2 , xyzw, rgb +VK_FORMAT_A2B10G10R10_SNORM_PACK32 , plain, 1, 1, sn10, sn10, sn10, sn2 , xyzw, rgb +VK_FORMAT_A2B10G10R10_USCALED_PACK32 , plain, 1, 1, us10, us10, us10, us2 , xyzw, rgb +VK_FORMAT_A2B10G10R10_SSCALED_PACK32 , plain, 1, 1, ss10, ss10, ss10, ss2 , xyzw, rgb +VK_FORMAT_A2B10G10R10_UINT_PACK32 , plain, 1, 1, up10, up10, up10, up2 , xyzw, rgb +VK_FORMAT_A2B10G10R10_SINT_PACK32 , plain, 1, 1, sp10, sp10, sp10, sp2 , xyzw, rgb +VK_FORMAT_R16_UNORM , plain, 1, 1, un16, , , , x001, rgb +VK_FORMAT_R16_SNORM , plain, 1, 1, sn16, , , , x001, rgb +VK_FORMAT_R16_USCALED , plain, 1, 1, us16, , , , x001, rgb +VK_FORMAT_R16_SSCALED , plain, 1, 1, ss16, , , , x001, rgb +VK_FORMAT_R16_UINT , plain, 1, 1, up16, , , , x001, rgb +VK_FORMAT_R16_SINT , plain, 1, 1, sp16, , , , x001, rgb +VK_FORMAT_R16_SFLOAT , plain, 1, 1, f16 , , , , x001, rgb +VK_FORMAT_R16G16_UNORM , plain, 1, 1, un16, un16, , , xy01, rgb +VK_FORMAT_R16G16_SNORM , plain, 1, 1, sn16, sn16, , , xy01, rgb +VK_FORMAT_R16G16_USCALED , plain, 1, 1, us16, us16, , , xy01, rgb +VK_FORMAT_R16G16_SSCALED , plain, 1, 1, ss16, ss16, , , xy01, rgb +VK_FORMAT_R16G16_UINT , plain, 1, 1, up16, up16, , , xy01, rgb +VK_FORMAT_R16G16_SINT , plain, 1, 1, sp16, sp16, , , xy01, rgb +VK_FORMAT_R16G16_SFLOAT , plain, 1, 1, f16 , f16 , , , xy01, rgb +VK_FORMAT_R16G16B16_UNORM , plain, 1, 1, un16, un16, un16, , xyz1, rgb +VK_FORMAT_R16G16B16_SNORM , plain, 1, 1, sn16, sn16, sn16, , xyz1, rgb +VK_FORMAT_R16G16B16_USCALED , plain, 1, 1, us16, us16, us16, , xyz1, rgb +VK_FORMAT_R16G16B16_SSCALED , plain, 1, 1, ss16, ss16, ss16, , xyz1, rgb +VK_FORMAT_R16G16B16_UINT , plain, 1, 1, up16, up16, up16, , xyz1, rgb +VK_FORMAT_R16G16B16_SINT , plain, 1, 1, sp16, sp16, sp16, , xyz1, rgb +VK_FORMAT_R16G16B16_SFLOAT , plain, 1, 1, f16 , f16 , f16 , , xyz1, rgb +VK_FORMAT_R16G16B16A16_UNORM , plain, 1, 1, un16, un16, un16, un16, xyzw, rgb +VK_FORMAT_R16G16B16A16_SNORM , plain, 1, 1, sn16, sn16, sn16, sn16, xyzw, rgb +VK_FORMAT_R16G16B16A16_USCALED , plain, 1, 1, us16, us16, us16, us16, xyzw, rgb +VK_FORMAT_R16G16B16A16_SSCALED , plain, 1, 1, ss16, ss16, ss16, ss16, xyzw, rgb +VK_FORMAT_R16G16B16A16_UINT , plain, 1, 1, up16, up16, up16, up16, xyzw, rgb +VK_FORMAT_R16G16B16A16_SINT , plain, 1, 1, sp16, sp16, sp16, sp16, xyzw, rgb +VK_FORMAT_R16G16B16A16_SFLOAT , plain, 1, 1, f16 , f16 , f16 , f16 , xyzw, rgb 
+VK_FORMAT_R32_UINT , plain, 1, 1, up32, , , , x001, rgb +VK_FORMAT_R32_SINT , plain, 1, 1, sp32, , , , x001, rgb +VK_FORMAT_R32_SFLOAT , plain, 1, 1, f32 , , , , x001, rgb +VK_FORMAT_R32G32_UINT , plain, 1, 1, up32, up32, , , xy01, rgb +VK_FORMAT_R32G32_SINT , plain, 1, 1, sp32, sp32, , , xy01, rgb +VK_FORMAT_R32G32_SFLOAT , plain, 1, 1, f32 , f32 , , , xy01, rgb +VK_FORMAT_R32G32B32_UINT , plain, 1, 1, up32, up32, up32, , xyz1, rgb +VK_FORMAT_R32G32B32_SINT , plain, 1, 1, sp32, sp32, sp32, , xyz1, rgb +VK_FORMAT_R32G32B32_SFLOAT , plain, 1, 1, f32 , f32 , f32 , , xyz1, rgb +VK_FORMAT_R32G32B32A32_UINT , plain, 1, 1, up32, up32, up32, up32, xyzw, rgb +VK_FORMAT_R32G32B32A32_SINT , plain, 1, 1, sp32, sp32, sp32, sp32, xyzw, rgb +VK_FORMAT_R32G32B32A32_SFLOAT , plain, 1, 1, f32 , f32 , f32 , f32 , xyzw, rgb +VK_FORMAT_R64_UINT , plain, 1, 1, up64, , , , x001, rgb +VK_FORMAT_R64_SINT , plain, 1, 1, sp64, , , , x001, rgb +VK_FORMAT_R64_SFLOAT , plain, 1, 1, f64 , , , , x001, rgb +VK_FORMAT_R64G64_UINT , plain, 1, 1, up64, up64, , , xy01, rgb +VK_FORMAT_R64G64_SINT , plain, 1, 1, sp64, sp64, , , xy01, rgb +VK_FORMAT_R64G64_SFLOAT , plain, 1, 1, f64 , f64 , , , xy01, rgb +VK_FORMAT_R64G64B64_UINT , plain, 1, 1, up64, up64, up64, , xyz1, rgb +VK_FORMAT_R64G64B64_SINT , plain, 1, 1, sp64, sp64, sp64, , xyz1, rgb +VK_FORMAT_R64G64B64_SFLOAT , plain, 1, 1, f64 , f64 , f64 , , xyz1, rgb +VK_FORMAT_R64G64B64A64_UINT , plain, 1, 1, up64, up64, up64, up64, xyzw, rgb +VK_FORMAT_R64G64B64A64_SINT , plain, 1, 1, sp64, sp64, sp64, sp64, xyzw, rgb +VK_FORMAT_R64G64B64A64_SFLOAT , plain, 1, 1, f64 , f64 , f64 , f64 , xyzw, rgb +VK_FORMAT_B10G11R11_UFLOAT_PACK32 , other, 1, 1, x32 , , , , xyz1, rgb +VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 , other, 1, 1, x32 , , , , xyz1, rgb +VK_FORMAT_D16_UNORM , plain, 1, 1, un16, , , , x___, zs +VK_FORMAT_X8_D24_UNORM_PACK32 , plain, 1, 1, un24, x8 , , , x___, zs +VK_FORMAT_D32_SFLOAT , plain, 1, 1, f32 , , , , x___, zs +VK_FORMAT_S8_UINT , plain, 1, 1, up8 , , , , _x__, zs +VK_FORMAT_D16_UNORM_S8_UINT , plain, 1, 1, un16, up8 , , , xy__, zs +VK_FORMAT_D24_UNORM_S8_UINT , plain, 1, 1, un24, up8 , , , xy__, zs +VK_FORMAT_D32_SFLOAT_S8_UINT , plain, 1, 1, f32 , up8 , , , xy__, zs +VK_FORMAT_BC1_RGB_UNORM_BLOCK , s3tc, 4, 4, x64 , , , , xyz1, rgb +VK_FORMAT_BC1_RGB_SRGB_BLOCK , s3tc, 4, 4, x64 , , , , xyz1, srgb +VK_FORMAT_BC1_RGBA_UNORM_BLOCK , s3tc, 4, 4, x64 , , , , xyzw, rgb +VK_FORMAT_BC1_RGBA_SRGB_BLOCK , s3tc, 4, 4, x64 , , , , xyzw, srgb +VK_FORMAT_BC2_UNORM_BLOCK , s3tc, 4, 4, x128, , , , xyzw, rgb +VK_FORMAT_BC2_SRGB_BLOCK , s3tc, 4, 4, x128, , , , xyzw, srgb +VK_FORMAT_BC3_UNORM_BLOCK , s3tc, 4, 4, x128, , , , xyzw, rgb +VK_FORMAT_BC3_SRGB_BLOCK , s3tc, 4, 4, x128, , , , xyzw, srgb +VK_FORMAT_BC4_UNORM_BLOCK , rgtc, 4, 4, x64, , , , x001, rgb +VK_FORMAT_BC4_SNORM_BLOCK , rgtc, 4, 4, x64, , , , x001, rgb +VK_FORMAT_BC5_UNORM_BLOCK , rgtc, 4, 4, x128, , , , xy01, rgb +VK_FORMAT_BC5_SNORM_BLOCK , rgtc, 4, 4, x128, , , , xy01, rgb +VK_FORMAT_BC6H_UFLOAT_BLOCK , bptc, 4, 4, x128, , , , xyz1, rgb +VK_FORMAT_BC6H_SFLOAT_BLOCK , bptc, 4, 4, x128, , , , xyz1, rgb +VK_FORMAT_BC7_UNORM_BLOCK , bptc, 4, 4, x128, , , , xyzw, rgb +VK_FORMAT_BC7_SRGB_BLOCK , bptc, 4, 4, x128, , , , xyzw, srgb +VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, +VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK, +VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK, +VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK, +VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK, +VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK, +VK_FORMAT_EAC_R11_UNORM_BLOCK, +VK_FORMAT_EAC_R11_SNORM_BLOCK, 
+VK_FORMAT_EAC_R11G11_UNORM_BLOCK, +VK_FORMAT_EAC_R11G11_SNORM_BLOCK, +VK_FORMAT_ASTC_4x4_UNORM_BLOCK, +VK_FORMAT_ASTC_4x4_SRGB_BLOCK, +VK_FORMAT_ASTC_5x4_UNORM_BLOCK, +VK_FORMAT_ASTC_5x4_SRGB_BLOCK, +VK_FORMAT_ASTC_5x5_UNORM_BLOCK, +VK_FORMAT_ASTC_5x5_SRGB_BLOCK, +VK_FORMAT_ASTC_6x5_UNORM_BLOCK, +VK_FORMAT_ASTC_6x5_SRGB_BLOCK, +VK_FORMAT_ASTC_6x6_UNORM_BLOCK, +VK_FORMAT_ASTC_6x6_SRGB_BLOCK, +VK_FORMAT_ASTC_8x5_UNORM_BLOCK, +VK_FORMAT_ASTC_8x5_SRGB_BLOCK, +VK_FORMAT_ASTC_8x6_UNORM_BLOCK, +VK_FORMAT_ASTC_8x6_SRGB_BLOCK, +VK_FORMAT_ASTC_8x8_UNORM_BLOCK, +VK_FORMAT_ASTC_8x8_SRGB_BLOCK, +VK_FORMAT_ASTC_10x5_UNORM_BLOCK, +VK_FORMAT_ASTC_10x5_SRGB_BLOCK, +VK_FORMAT_ASTC_10x6_UNORM_BLOCK, +VK_FORMAT_ASTC_10x6_SRGB_BLOCK, +VK_FORMAT_ASTC_10x8_UNORM_BLOCK, +VK_FORMAT_ASTC_10x8_SRGB_BLOCK, +VK_FORMAT_ASTC_10x10_UNORM_BLOCK, +VK_FORMAT_ASTC_10x10_SRGB_BLOCK, +VK_FORMAT_ASTC_12x10_UNORM_BLOCK, +VK_FORMAT_ASTC_12x10_SRGB_BLOCK, +VK_FORMAT_ASTC_12x12_UNORM_BLOCK, +VK_FORMAT_ASTC_12x12_SRGB_BLOCK, diff --git a/src/amd/vulkan/vk_format_parse.py b/src/amd/vulkan/vk_format_parse.py new file mode 100755 index 00000000000..b743fc2bdb4 --- /dev/null +++ b/src/amd/vulkan/vk_format_parse.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python + +''' +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ +''' + + +VOID, UNSIGNED, SIGNED, FIXED, FLOAT = range(5) + +SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_0, SWIZZLE_1, SWIZZLE_NONE, = range(7) + +PLAIN = 'plain' +SCALED = 'scaled' + +RGB = 'rgb' +SRGB = 'srgb' +YUV = 'yuv' +ZS = 'zs' + + +def is_pot(x): + return (x & (x - 1)) == 0 + + +VERY_LARGE = 99999999999999999999999 + + +class Channel: + '''Describe the channel of a color channel.''' + + def __init__(self, type, norm, pure, scaled, size, name = ''): + self.type = type + self.norm = norm + self.pure = pure + self.size = size + self.scaled = scaled + self.sign = type in (SIGNED, FIXED, FLOAT) + self.name = name + + def __str__(self): + s = str(self.type) + if self.norm: + s += 'n' + if self.pure: + s += 'p' + if self.scaled: + s += 's' + s += str(self.size) + return s + + def __eq__(self, other): + return self.type == other.type and self.norm == other.norm and self.pure == other.pure and self.size == other.size and self.scaled == other.scaled + + def max(self): + '''Maximum representable number.''' + if self.type == FLOAT: + return VERY_LARGE + if self.type == FIXED: + return (1 << (self.size/2)) - 1 + if self.norm: + return 1 + if self.type == UNSIGNED: + return (1 << self.size) - 1 + if self.type == SIGNED: + return (1 << (self.size - 1)) - 1 + assert False + + def min(self): + '''Minimum representable number.''' + if self.type == FLOAT: + return -VERY_LARGE + if self.type == FIXED: + return -(1 << (self.size/2)) + if self.type == UNSIGNED: + return 0 + if self.norm: + return -1 + if self.type == SIGNED: + return -(1 << (self.size - 1)) + assert False + + +class Format: + '''Describe a pixel format.''' + + def __init__(self, name, layout, block_width, block_height, le_channels, le_swizzles, be_channels, be_swizzles, colorspace): + self.name = name + self.layout = layout + self.block_width = block_width + self.block_height = block_height + self.le_channels = le_channels + self.le_swizzles = le_swizzles + self.be_channels = be_channels + self.be_swizzles = be_swizzles + self.name = name + self.colorspace = colorspace + + def __str__(self): + return self.name + + def short_name(self): + '''Make up a short norm for a format, suitable to be used as suffix in + function names.''' + + name = self.name + if name.startswith('VK_FORMAT_'): + name = name[len('VK_FORMAT_'):] + name = name.lower() + return name + + def block_size(self): + size = 0 + for channel in self.le_channels: + size += channel.size + return size + + def nr_channels(self): + nr_channels = 0 + for channel in self.le_channels: + if channel.size: + nr_channels += 1 + return nr_channels + + def array_element(self): + if self.layout != PLAIN: + return None + ref_channel = self.le_channels[0] + if ref_channel.type == VOID: + ref_channel = self.le_channels[1] + for channel in self.le_channels: + if channel.size and (channel.size != ref_channel.size or channel.size % 8): + return None + if channel.type != VOID: + if channel.type != ref_channel.type: + return None + if channel.norm != ref_channel.norm: + return None + if channel.pure != ref_channel.pure: + return None + if channel.scaled != ref_channel.scaled: + return None + return ref_channel + + def is_array(self): + return self.array_element() != None + + def is_mixed(self): + if self.layout != PLAIN: + return False + ref_channel = self.le_channels[0] + if ref_channel.type == VOID: + ref_channel = self.le_channels[1] + for channel in self.le_channels[1:]: + if channel.type != VOID: + if 
channel.type != ref_channel.type: + return True + if channel.norm != ref_channel.norm: + return True + if channel.pure != ref_channel.pure: + return True + if channel.scaled != ref_channel.scaled: + return True + return False + + def is_pot(self): + return is_pot(self.block_size()) + + def is_int(self): + if self.layout != PLAIN: + return False + for channel in self.le_channels: + if channel.type not in (VOID, UNSIGNED, SIGNED): + return False + return True + + def is_float(self): + if self.layout != PLAIN: + return False + for channel in self.le_channels: + if channel.type not in (VOID, FLOAT): + return False + return True + + def is_bitmask(self): + if self.layout != PLAIN: + return False + if self.block_size() not in (8, 16, 32): + return False + for channel in self.le_channels: + if channel.type not in (VOID, UNSIGNED, SIGNED): + return False + return True + + def is_pure_color(self): + if self.layout != PLAIN or self.colorspace == ZS: + return False + pures = [channel.pure + for channel in self.le_channels + if channel.type != VOID] + for x in pures: + assert x == pures[0] + return pures[0] + + def channel_type(self): + types = [channel.type + for channel in self.le_channels + if channel.type != VOID] + for x in types: + assert x == types[0] + return types[0] + + def is_pure_signed(self): + return self.is_pure_color() and self.channel_type() == SIGNED + + def is_pure_unsigned(self): + return self.is_pure_color() and self.channel_type() == UNSIGNED + + def has_channel(self, id): + return self.le_swizzles[id] != SWIZZLE_NONE + + def has_depth(self): + return self.colorspace == ZS and self.has_channel(0) + + def has_stencil(self): + return self.colorspace == ZS and self.has_channel(1) + + def stride(self): + return self.block_size()/8 + + +_type_parse_map = { + '': VOID, + 'x': VOID, + 'u': UNSIGNED, + 's': SIGNED, + 'h': FIXED, + 'f': FLOAT, +} + +_swizzle_parse_map = { + 'x': SWIZZLE_X, + 'y': SWIZZLE_Y, + 'z': SWIZZLE_Z, + 'w': SWIZZLE_W, + '0': SWIZZLE_0, + '1': SWIZZLE_1, + '_': SWIZZLE_NONE, +} + +def _parse_channels(fields, layout, colorspace, swizzles): + if layout == PLAIN: + names = ['']*4 + if colorspace in (RGB, SRGB): + for i in range(4): + swizzle = swizzles[i] + if swizzle < 4: + names[swizzle] += 'rgba'[i] + elif colorspace == ZS: + for i in range(4): + swizzle = swizzles[i] + if swizzle < 4: + names[swizzle] += 'zs'[i] + else: + assert False + for i in range(4): + if names[i] == '': + names[i] = 'x' + else: + names = ['x', 'y', 'z', 'w'] + + channels = [] + for i in range(0, 4): + field = fields[i] + if field: + type = _type_parse_map[field[0]] + if field[1] == 'n': + norm = True + pure = False + scaled = False + size = int(field[2:]) + elif field[1] == 'p': + pure = True + norm = False + scaled = False + size = int(field[2:]) + elif field[1] == 's': + pure = False + norm = False + scaled = True + size = int(field[2:]) + else: + norm = False + pure = False + scaled = False + size = int(field[1:]) + else: + type = VOID + norm = False + pure = False + scaled = False + size = 0 + channel = Channel(type, norm, pure, scaled, size, names[i]) + channels.append(channel) + + return channels + +def parse(filename): + '''Parse the format description in CSV format in terms of the + Channel and Format classes above.''' + + stream = open(filename) + formats = [] + for line in stream: + try: + comment = line.index('#') + except ValueError: + pass + else: + line = line[:comment] + line = line.strip() + if not line: + continue + + fields = [field.strip() for field in line.split(',')] + 
if len (fields) < 10: + continue + if len (fields) == 10: + fields += fields[4:9] + assert len (fields) == 15 + + name = fields[0] + layout = fields[1] + block_width, block_height = map(int, fields[2:4]) + colorspace = fields[9] + + le_swizzles = [_swizzle_parse_map[swizzle] for swizzle in fields[8]] + le_channels = _parse_channels(fields[4:8], layout, colorspace, le_swizzles) + + be_swizzles = [_swizzle_parse_map[swizzle] for swizzle in fields[14]] + be_channels = _parse_channels(fields[10:14], layout, colorspace, be_swizzles) + + le_shift = 0 + for channel in le_channels: + channel.shift = le_shift + le_shift += channel.size + + be_shift = 0 + for channel in be_channels[3::-1]: + channel.shift = be_shift + be_shift += channel.size + + assert le_shift == be_shift + for i in range(4): + assert (le_swizzles[i] != SWIZZLE_NONE) == (be_swizzles[i] != SWIZZLE_NONE) + + format = Format(name, layout, block_width, block_height, le_channels, le_swizzles, be_channels, be_swizzles, colorspace) + formats.append(format) + return formats + diff --git a/src/amd/vulkan/vk_format_table.py b/src/amd/vulkan/vk_format_table.py new file mode 100755 index 00000000000..06b98e568b4 --- /dev/null +++ b/src/amd/vulkan/vk_format_table.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python + +CopyRight = ''' +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ +''' + + +import sys + +from vk_format_parse import * + +def layout_map(layout): + return 'VK_FORMAT_LAYOUT_' + str(layout).upper() + + +def colorspace_map(colorspace): + return 'VK_FORMAT_COLORSPACE_' + str(colorspace).upper() + + +colorspace_channels_map = { + 'rgb': ['r', 'g', 'b', 'a'], + 'srgb': ['sr', 'sg', 'sb', 'a'], + 'zs': ['z', 's'], + 'yuv': ['y', 'u', 'v'], +} + + +type_map = { + VOID: "VK_FORMAT_TYPE_VOID", + UNSIGNED: "VK_FORMAT_TYPE_UNSIGNED", + SIGNED: "VK_FORMAT_TYPE_SIGNED", + FIXED: "VK_FORMAT_TYPE_FIXED", + FLOAT: "VK_FORMAT_TYPE_FLOAT", +} + + +def bool_map(value): + if value: + return "true" + else: + return "false" + + +swizzle_map = { + SWIZZLE_X: "VK_SWIZZLE_X", + SWIZZLE_Y: "VK_SWIZZLE_Y", + SWIZZLE_Z: "VK_SWIZZLE_Z", + SWIZZLE_W: "VK_SWIZZLE_W", + SWIZZLE_0: "VK_SWIZZLE_0", + SWIZZLE_1: "VK_SWIZZLE_1", + SWIZZLE_NONE: "VK_SWIZZLE_NONE", +} + +def print_channels(format, func): + if format.nr_channels() <= 1: + func(format.le_channels, format.le_swizzles) + else: + print '#ifdef PIPE_ARCH_BIG_ENDIAN' + func(format.be_channels, format.be_swizzles) + print '#else' + func(format.le_channels, format.le_swizzles) + print '#endif' + +def write_format_table(formats): + print '/* This file is autogenerated by u_format_table.py from u_format.csv. Do not edit directly. */' + print + # This will print the copyright message on the top of this file + print CopyRight.strip() + print + print '#include "stdbool.h"' + print '#include "vk_format.h"' + print + + def do_channel_array(channels, swizzles): + print " {" + for i in range(4): + channel = channels[i] + if i < 3: + sep = "," + else: + sep = "" + if channel.size: + print " {%s, %s, %s, %s, %u, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), bool_map(channel.pure), bool_map(channel.scaled), channel.size, channel.shift, sep, "xyzw"[i], channel.name) + else: + print " {0, 0, 0, 0, 0}%s" % (sep,) + print " }," + + def do_swizzle_array(channels, swizzles): + print " {" + for i in range(4): + swizzle = swizzles[i] + if i < 3: + sep = "," + else: + sep = "" + try: + comment = colorspace_channels_map[format.colorspace][i] + except (KeyError, IndexError): + comment = 'ignored' + print " %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment) + print " }," + + for format in formats: + print 'const struct vk_format_description' + print 'vk_format_%s_description = {' % (format.short_name(),) + print " %s," % (format.name,) + print " \"%s\"," % (format.name,) + print " \"%s\"," % (format.short_name(),) + print " {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size()) + print " %s," % (layout_map(format.layout),) + print " %u,\t/* nr_channels */" % (format.nr_channels(),) + print " %s,\t/* is_array */" % (bool_map(format.is_array()),) + print " %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),) + print " %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),) + print_channels(format, do_channel_array) + print_channels(format, do_swizzle_array) + print " %s," % (colorspace_map(format.colorspace),) + print "};" + print + + print "const struct vk_format_description *" + print "vk_format_description(VkFormat format)" + print "{" + print " if (format > VK_FORMAT_END_RANGE) {" + print " return NULL;" + print " }" + print + print " switch (format) {" + for format in formats: + print " case %s:" % format.name + print " return &vk_format_%s_description;" % (format.short_name(),) + print " default:" + 
print " return NULL;" + print " }" + print "}" + print + + +def main(): + + formats = [] + for arg in sys.argv[1:]: + formats.extend(parse(arg)) + write_format_table(formats) + + +if __name__ == '__main__': + main() diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c new file mode 100644 index 00000000000..7319a988872 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c @@ -0,0 +1,297 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based on amdgpu winsys. + * Copyright © 2011 Marek Olšák + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "radv_amdgpu_bo.h" + +#include +#include +#include + +static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo) +{ + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + + if (bo->ws->debug_all_bos) { + pthread_mutex_lock(&bo->ws->global_bo_list_lock); + LIST_DEL(&bo->global_list_item); + bo->ws->num_buffers--; + pthread_mutex_unlock(&bo->ws->global_bo_list_lock); + } + amdgpu_bo_va_op(bo->bo, 0, bo->size, bo->va, 0, AMDGPU_VA_OP_UNMAP); + amdgpu_va_range_free(bo->va_handle); + amdgpu_bo_free(bo->bo); + FREE(bo); +} + +static void radv_amdgpu_add_buffer_to_global_list(struct radv_amdgpu_winsys_bo *bo) +{ + struct radv_amdgpu_winsys *ws = bo->ws; + + if (bo->ws->debug_all_bos) { + pthread_mutex_lock(&ws->global_bo_list_lock); + LIST_ADDTAIL(&bo->global_list_item, &ws->global_bo_list); + ws->num_buffers++; + pthread_mutex_unlock(&ws->global_bo_list_lock); + } +} + +static struct radeon_winsys_bo * +radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, + uint64_t size, + unsigned alignment, + enum radeon_bo_domain initial_domain, + unsigned flags) +{ + struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws); + struct radv_amdgpu_winsys_bo *bo; + struct amdgpu_bo_alloc_request request = {0}; + amdgpu_bo_handle buf_handle; + uint64_t va = 0; + amdgpu_va_handle va_handle; + int r; + bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo); + if (!bo) { + return NULL; + } + + request.alloc_size = size; + request.phys_alignment = alignment; + + if (initial_domain & RADEON_DOMAIN_VRAM) + request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM; + if (initial_domain & RADEON_DOMAIN_GTT) + request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; + + if (flags & RADEON_FLAG_CPU_ACCESS) + request.flags |= 
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + if (flags & RADEON_FLAG_NO_CPU_ACCESS) + request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; + if (flags & RADEON_FLAG_GTT_WC) + request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; + + r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle); + if (r) { + fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n"); + fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size); + fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment); + fprintf(stderr, "amdgpu: domains : %u\n", initial_domain); + goto error_bo_alloc; + } + + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + size, alignment, 0, &va, &va_handle, 0); + if (r) + goto error_va_alloc; + + r = amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP); + if (r) + goto error_va_map; + + bo->bo = buf_handle; + bo->va = va; + bo->va_handle = va_handle; + bo->initial_domain = initial_domain; + bo->size = size; + bo->is_shared = false; + bo->ws = ws; + radv_amdgpu_add_buffer_to_global_list(bo); + return (struct radeon_winsys_bo *)bo; +error_va_map: + amdgpu_va_range_free(va_handle); + +error_va_alloc: + amdgpu_bo_free(buf_handle); + +error_bo_alloc: + FREE(bo); + return NULL; +} + +static uint64_t radv_amdgpu_winsys_bo_get_va(struct radeon_winsys_bo *_bo) +{ + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + return bo->va; +} + +static void * +radv_amdgpu_winsys_bo_map(struct radeon_winsys_bo *_bo) +{ + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + int ret; + void *data; + ret = amdgpu_bo_cpu_map(bo->bo, &data); + if (ret) + return NULL; + return data; +} + +static void +radv_amdgpu_winsys_bo_unmap(struct radeon_winsys_bo *_bo) +{ + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + amdgpu_bo_cpu_unmap(bo->bo); +} + +static struct radeon_winsys_bo * +radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws, + int fd, unsigned *stride, + unsigned *offset) +{ + struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws); + struct radv_amdgpu_winsys_bo *bo; + uint64_t va; + amdgpu_va_handle va_handle; + enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd; + struct amdgpu_bo_import_result result = {0}; + struct amdgpu_bo_info info = {0}; + enum radeon_bo_domain initial = 0; + int r; + bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo); + if (!bo) + return NULL; + + r = amdgpu_bo_import(ws->dev, type, fd, &result); + if (r) + goto error; + + r = amdgpu_bo_query_info(result.buf_handle, &info); + if (r) + goto error_query; + + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + result.alloc_size, 1 << 20, 0, &va, &va_handle, 0); + if (r) + goto error_query; + + r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP); + if (r) + goto error_va_map; + + if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM) + initial |= RADEON_DOMAIN_VRAM; + if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT) + initial |= RADEON_DOMAIN_GTT; + + bo->bo = result.buf_handle; + bo->va = va; + bo->va_handle = va_handle; + bo->initial_domain = initial; + bo->size = result.alloc_size; + bo->is_shared = true; + return (struct radeon_winsys_bo *)bo; +error_va_map: + amdgpu_va_range_free(va_handle); + +error_query: + amdgpu_bo_free(result.buf_handle); + +error: + FREE(bo); + return NULL; +} + +static bool +radv_amdgpu_winsys_get_fd(struct radeon_winsys *_ws, + struct radeon_winsys_bo *_bo, + int *fd) +{ + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd; 
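The buffer-creation path above acquires three kernel resources in order (the GEM allocation, a VA range, and the VA mapping) and unwinds whatever already succeeded in reverse order through its error_* labels. Below is a minimal standalone sketch of that goto-unwind shape; acquire()/release() are hypothetical stand-ins for the libdrm amdgpu calls, and "va map" is made to fail so the unwind actually runs.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins for the three acquisition steps in
 * radv_amdgpu_winsys_bo_create(): amdgpu_bo_alloc(), amdgpu_va_range_alloc()
 * and amdgpu_bo_va_op(). */
static bool acquire(const char *what)
{
	printf("acquire %s\n", what);
	return strcmp(what, "va map") != 0;   /* simulate a mapping failure */
}

static void release(const char *what)
{
	printf("release %s\n", what);
}

static void *create_object(void)
{
	void *obj = calloc(1, 64);
	if (!obj)
		return NULL;

	if (!acquire("gem buffer"))
		goto fail_alloc;
	if (!acquire("va range"))
		goto fail_va_alloc;
	if (!acquire("va map"))
		goto fail_va_map;

	return obj;                   /* fully constructed object */

fail_va_map:
	release("va range");          /* undo in reverse order of acquisition */
fail_va_alloc:
	release("gem buffer");
fail_alloc:
	free(obj);
	return NULL;
}

int main(void)
{
	void *obj = create_object();
	printf(obj ? "created\n" : "creation failed, everything released\n");
	free(obj);
	return 0;
}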
+ int r; + unsigned handle; + r = amdgpu_bo_export(bo->bo, type, &handle); + if (r) + return false; + + *fd = (int)handle; + bo->is_shared = true; + return true; +} + +static unsigned radv_eg_tile_split_rev(unsigned eg_tile_split) +{ + switch (eg_tile_split) { + case 64: return 0; + case 128: return 1; + case 256: return 2; + case 512: return 3; + default: + case 1024: return 4; + case 2048: return 5; + case 4096: return 6; + } +} + +static void +radv_amdgpu_winsys_bo_set_metadata(struct radeon_winsys_bo *_bo, + struct radeon_bo_metadata *md) +{ + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + struct amdgpu_bo_metadata metadata = {0}; + uint32_t tiling_flags = 0; + + if (md->macrotile == RADEON_LAYOUT_TILED) + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ + else if (md->microtile == RADEON_LAYOUT_TILED) + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ + else + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ + + tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config); + tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw)); + tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh)); + if (md->tile_split) + tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, radv_eg_tile_split_rev(md->tile_split)); + tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea)); + tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1); + + if (md->scanout) + tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ + else + tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ + + metadata.tiling_info = tiling_flags; + metadata.size_metadata = md->size_metadata; + memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata)); + + amdgpu_bo_set_metadata(bo->bo, &metadata); +} + +void radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws) +{ + ws->base.buffer_create = radv_amdgpu_winsys_bo_create; + ws->base.buffer_destroy = radv_amdgpu_winsys_bo_destroy; + ws->base.buffer_get_va = radv_amdgpu_winsys_bo_get_va; + ws->base.buffer_map = radv_amdgpu_winsys_bo_map; + ws->base.buffer_unmap = radv_amdgpu_winsys_bo_unmap; + ws->base.buffer_from_fd = radv_amdgpu_winsys_bo_from_fd; + ws->base.buffer_get_fd = radv_amdgpu_winsys_get_fd; + ws->base.buffer_set_metadata = radv_amdgpu_winsys_bo_set_metadata; +} diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h new file mode 100644 index 00000000000..59a1bb76502 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h @@ -0,0 +1,50 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based on amdgpu winsys. + * Copyright © 2011 Marek Olšák + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
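The metadata setter above packs the tiling parameters into a 32-bit word: the tile-split byte count becomes a small index (64 maps to 0, 128 to 1, up to 4096 mapping to 6), and bank width, height, and count are stored as base-2 logarithms. A self-contained check of that arithmetic, with a local helper standing in for util_logbase2():

#include <assert.h>
#include <stdio.h>

/* Local stand-in for util_logbase2(): floor(log2(x)) for x > 0. */
static unsigned log2u(unsigned x)
{
	unsigned r = 0;
	while (x >>= 1)
		r++;
	return r;
}

/* Same mapping as radv_eg_tile_split_rev() above. */
static unsigned tile_split_rev(unsigned bytes)
{
	switch (bytes) {
	case 64:   return 0;
	case 128:  return 1;
	case 256:  return 2;
	case 512:  return 3;
	default:
	case 1024: return 4;
	case 2048: return 5;
	case 4096: return 6;
	}
}

int main(void)
{
	assert(tile_split_rev(256) == 2);   /* 256-byte tile split -> index 2 */
	assert(log2u(4) == 2);              /* bank width 4 -> field value 2 */
	assert(log2u(16) - 1 == 3);         /* 16 banks -> NUM_BANKS field 3 */
	printf("tiling field encodings check out\n");
	return 0;
}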
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once +#include "radv_amdgpu_winsys.h" +struct radv_amdgpu_winsys_bo { + amdgpu_bo_handle bo; + amdgpu_va_handle va_handle; + + uint64_t va; + enum radeon_bo_domain initial_domain; + uint64_t size; + bool is_shared; + + struct radv_amdgpu_winsys *ws; + struct list_head global_list_item; +}; + +static inline +struct radv_amdgpu_winsys_bo *radv_amdgpu_winsys_bo(struct radeon_winsys_bo *bo) +{ + return (struct radv_amdgpu_winsys_bo *)bo; +} + +void radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws); + diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c new file mode 100644 index 00000000000..dedc778f1cd --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -0,0 +1,778 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
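The header above hands buffers to the rest of the driver only as opaque struct radeon_winsys_bo pointers; the winsys casts back to its private struct radv_amdgpu_winsys_bo whenever it needs the amdgpu handles. A minimal sketch of that opaque-handle idiom, using hypothetical type names rather than the real winsys structs:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct public_bo;                      /* opaque to callers, only ever a pointer */

struct private_bo {                    /* what the backend actually allocates */
	uint64_t va;
	uint64_t size;
};

static inline struct private_bo *to_private(struct public_bo *bo)
{
	return (struct private_bo *)bo;    /* same object, wider view for the backend */
}

static struct public_bo *bo_create(uint64_t size)
{
	struct private_bo *bo = calloc(1, sizeof(*bo));
	if (!bo)
		return NULL;
	bo->va = 0x100000;                 /* made-up GPU address for the example */
	bo->size = size;
	return (struct public_bo *)bo;
}

int main(void)
{
	struct public_bo *bo = bo_create(4096);
	printf("va=0x%" PRIx64 " size=%" PRIu64 "\n",
	       to_private(bo)->va, to_private(bo)->size);
	free(to_private(bo));
	return 0;
}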
+ */ + +#include +#include +#include +#include + +#include "amdgpu_id.h" +#include "radv_radeon_winsys.h" +#include "radv_amdgpu_cs.h" +#include "radv_amdgpu_bo.h" +#include "sid.h" + +struct radv_amdgpu_cs { + struct radeon_winsys_cs base; + struct radv_amdgpu_winsys *ws; + + struct amdgpu_cs_ib_info ib; + + struct radeon_winsys_bo *ib_buffer; + uint8_t *ib_mapped; + unsigned max_num_buffers; + unsigned num_buffers; + amdgpu_bo_handle *handles; + uint8_t *priorities; + + struct radeon_winsys_bo **old_ib_buffers; + unsigned num_old_ib_buffers; + unsigned max_num_old_ib_buffers; + unsigned *ib_size_ptr; + bool failed; + bool is_chained; + + int buffer_hash_table[1024]; +}; + +static inline struct radv_amdgpu_cs * +radv_amdgpu_cs(struct radeon_winsys_cs *base) +{ + return (struct radv_amdgpu_cs*)base; +} + + +static struct radeon_winsys_fence *radv_amdgpu_create_fence() +{ + struct radv_amdgpu_cs_fence *fence = calloc(1, sizeof(struct amdgpu_cs_fence)); + return (struct radeon_winsys_fence*)fence; +} + +static void radv_amdgpu_destroy_fence(struct radeon_winsys_fence *_fence) +{ + struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence; + free(fence); +} + +static bool radv_amdgpu_fence_wait(struct radeon_winsys *_ws, + struct radeon_winsys_fence *_fence, + bool absolute, + uint64_t timeout) +{ + struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence; + unsigned flags = absolute ? AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE : 0; + int r; + uint32_t expired = 0; + /* Now use the libdrm query. */ + r = amdgpu_cs_query_fence_status(fence, + timeout, + flags, + &expired); + + if (r) { + fprintf(stderr, "amdgpu: radv_amdgpu_cs_query_fence_status failed.\n"); + return false; + } + + if (expired) { + return true; + } + return false; + +} + +static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs) +{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs); + if (cs->ib_buffer) + cs->ws->base.buffer_destroy(cs->ib_buffer); + else + free(cs->base.buf); + + for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i) + cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]); + free(cs->old_ib_buffers); + free(cs->handles); + free(cs->priorities); + free(cs); +} + +static boolean radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs, + enum ring_type ring_type) +{ + for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i) { + cs->buffer_hash_table[i] = -1; + } + return true; +} + +static struct radeon_winsys_cs * +radv_amdgpu_cs_create(struct radeon_winsys *ws, + enum ring_type ring_type) +{ + struct radv_amdgpu_cs *cs; + uint32_t ib_size = 20 * 1024 * 4; + cs = calloc(1, sizeof(struct radv_amdgpu_cs)); + if (!cs) + return NULL; + + cs->ws = radv_amdgpu_winsys(ws); + radv_amdgpu_init_cs(cs, RING_GFX); + + if (cs->ws->use_ib_bos) { + cs->ib_buffer = ws->buffer_create(ws, ib_size, 0, + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS); + if (!cs->ib_buffer) { + free(cs); + return NULL; + } + + cs->ib_mapped = ws->buffer_map(cs->ib_buffer); + if (!cs->ib_mapped) { + ws->buffer_destroy(cs->ib_buffer); + free(cs); + return NULL; + } + + cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->va; + cs->base.buf = (uint32_t *)cs->ib_mapped; + cs->base.max_dw = ib_size / 4 - 4; + cs->ib_size_ptr = &cs->ib.size; + cs->ib.size = 0; + + ws->cs_add_buffer(&cs->base, cs->ib_buffer, 8); + } else { + cs->base.buf = malloc(16384); + cs->base.max_dw = 4096; + if (!cs->base.buf) { + free(cs); + return NULL; + } + } + + return &cs->base; +} + +static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size) 
+{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs); + uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2); + + /* max that fits in the chain size field. */ + ib_size = MIN2(ib_size, 0xfffff); + + if (cs->failed) { + cs->base.cdw = 0; + return; + } + + if (!cs->ws->use_ib_bos) { + uint32_t *new_buf = realloc(cs->base.buf, ib_size); + if (new_buf) { + cs->base.buf = new_buf; + cs->base.max_dw = ib_size / 4; + } else { + cs->failed = true; + cs->base.cdw = 0; + } + return; + } + + while (!cs->base.cdw || (cs->base.cdw & 7) != 4) + cs->base.buf[cs->base.cdw++] = 0xffff1000; + + *cs->ib_size_ptr |= cs->base.cdw + 4; + + if (cs->num_old_ib_buffers == cs->max_num_old_ib_buffers) { + cs->max_num_old_ib_buffers = MAX2(1, cs->max_num_old_ib_buffers * 2); + cs->old_ib_buffers = realloc(cs->old_ib_buffers, + cs->max_num_old_ib_buffers * sizeof(void*)); + } + + cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->ib_buffer; + + cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0, + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS); + + if (!cs->ib_buffer) { + cs->base.cdw = 0; + cs->failed = true; + cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers]; + } + + cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer); + if (!cs->ib_mapped) { + cs->ws->base.buffer_destroy(cs->ib_buffer); + cs->base.cdw = 0; + cs->failed = true; + cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers]; + } + + cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8); + + cs->base.buf[cs->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0); + cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->va; + cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->va >> 32; + cs->ib_size_ptr = cs->base.buf + cs->base.cdw; + cs->base.buf[cs->base.cdw++] = S_3F2_CHAIN(1) | S_3F2_VALID(1); + + cs->base.buf = (uint32_t *)cs->ib_mapped; + cs->base.cdw = 0; + cs->base.max_dw = ib_size / 4 - 4; + +} + +static bool radv_amdgpu_cs_finalize(struct radeon_winsys_cs *_cs) +{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs); + + if (cs->ws->use_ib_bos) { + while (!cs->base.cdw || (cs->base.cdw & 7) != 0) + cs->base.buf[cs->base.cdw++] = 0xffff1000; + + *cs->ib_size_ptr |= cs->base.cdw; + + cs->is_chained = false; + } + + return !cs->failed; +} + +static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs) +{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs); + cs->base.cdw = 0; + cs->failed = false; + + for (unsigned i = 0; i < cs->num_buffers; ++i) { + unsigned hash = ((uintptr_t)cs->handles[i] >> 6) & + (ARRAY_SIZE(cs->buffer_hash_table) - 1); + cs->buffer_hash_table[hash] = -1; + } + + cs->num_buffers = 0; + + if (cs->ws->use_ib_bos) { + cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8); + + for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i) + cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]); + + cs->num_old_ib_buffers = 0; + cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->va; + cs->ib_size_ptr = &cs->ib.size; + cs->ib.size = 0; + } +} + +static int radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs, + amdgpu_bo_handle bo) +{ + unsigned hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1); + int index = cs->buffer_hash_table[hash]; + + if (index == -1) + return -1; + + if(cs->handles[index] == bo) + return index; + + for (unsigned i = 0; i < cs->num_buffers; ++i) { + if (cs->handles[i] == bo) { + cs->buffer_hash_table[hash] = i; + return i; + } + } + return -1; +} + +static void radv_amdgpu_cs_add_buffer_internal(struct 
radv_amdgpu_cs *cs, + amdgpu_bo_handle bo, + uint8_t priority) +{ + unsigned hash; + int index = radv_amdgpu_cs_find_buffer(cs, bo); + + if (index != -1) { + cs->priorities[index] = MAX2(cs->priorities[index], priority); + return; + } + + if (cs->num_buffers == cs->max_num_buffers) { + unsigned new_count = MAX2(1, cs->max_num_buffers * 2); + cs->handles = realloc(cs->handles, new_count * sizeof(amdgpu_bo_handle)); + cs->priorities = realloc(cs->priorities, new_count * sizeof(uint8_t)); + cs->max_num_buffers = new_count; + } + + cs->handles[cs->num_buffers] = bo; + cs->priorities[cs->num_buffers] = priority; + + hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1); + cs->buffer_hash_table[hash] = cs->num_buffers; + + ++cs->num_buffers; +} + +static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs, + struct radeon_winsys_bo *_bo, + uint8_t priority) +{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs); + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + + radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority); +} + +static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent, + struct radeon_winsys_cs *_child) +{ + struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent); + struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child); + + for (unsigned i = 0; i < child->num_buffers; ++i) { + radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i], + child->priorities[i]); + } + + if (parent->ws->use_ib_bos) { + if (parent->base.cdw + 4 > parent->base.max_dw) + radv_amdgpu_cs_grow(&parent->base, 4); + + parent->base.buf[parent->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0); + parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address; + parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address >> 32; + parent->base.buf[parent->base.cdw++] = child->ib.size; + } else { + if (parent->base.cdw + child->base.cdw > parent->base.max_dw) + radv_amdgpu_cs_grow(&parent->base, child->base.cdw); + + memcpy(parent->base.buf + parent->base.cdw, child->base.buf, 4 * child->base.cdw); + parent->base.cdw += child->base.cdw; + } +} + +static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, + struct radeon_winsys_cs **cs_array, + unsigned count, + struct radv_amdgpu_winsys_bo *extra_bo, + amdgpu_bo_list_handle *bo_list) +{ + int r; + if (ws->debug_all_bos) { + struct radv_amdgpu_winsys_bo *bo; + amdgpu_bo_handle *handles; + unsigned num = 0; + + pthread_mutex_lock(&ws->global_bo_list_lock); + + handles = malloc(sizeof(handles[0]) * ws->num_buffers); + if (!handles) { + pthread_mutex_unlock(&ws->global_bo_list_lock); + return -ENOMEM; + } + + LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) { + assert(num < ws->num_buffers); + handles[num++] = bo->bo; + } + + r = amdgpu_bo_list_create(ws->dev, ws->num_buffers, + handles, NULL, + bo_list); + free(handles); + pthread_mutex_unlock(&ws->global_bo_list_lock); + } else if (count == 1 && !extra_bo) { + struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0]; + r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles, + cs->priorities, bo_list); + } else { + unsigned total_buffer_count = !!extra_bo; + unsigned unique_bo_count = !!extra_bo; + for (unsigned i = 0; i < count; ++i) { + struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i]; + total_buffer_count += cs->num_buffers; + } + + amdgpu_bo_handle *handles = malloc(sizeof(amdgpu_bo_handle) * total_buffer_count); + uint8_t *priorities = malloc(sizeof(uint8_t) * total_buffer_count); + if 
(!handles || !priorities) { + free(handles); + free(priorities); + return -ENOMEM; + } + + if (extra_bo) { + handles[0] = extra_bo->bo; + priorities[0] = 8; + } + + for (unsigned i = 0; i < count; ++i) { + struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i]; + for (unsigned j = 0; j < cs->num_buffers; ++j) { + bool found = false; + for (unsigned k = 0; k < unique_bo_count; ++k) { + if (handles[k] == cs->handles[j]) { + found = true; + priorities[k] = MAX2(priorities[k], + cs->priorities[j]); + break; + } + } + if (!found) { + handles[unique_bo_count] = cs->handles[j]; + priorities[unique_bo_count] = cs->priorities[j]; + ++unique_bo_count; + } + } + } + r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles, + priorities, bo_list); + + free(handles); + free(priorities); + } + return r; +} + +static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx, + struct radeon_winsys_cs **cs_array, + unsigned cs_count, + struct radeon_winsys_fence *_fence) +{ + int r; + struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx); + struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence; + struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]); + amdgpu_bo_list_handle bo_list; + struct amdgpu_cs_request request = {0}; + + for (unsigned i = cs_count; i--;) { + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]); + + if (cs->is_chained) { + *cs->ib_size_ptr -= 4; + cs->is_chained = false; + } + + if (i + 1 < cs_count) { + struct radv_amdgpu_cs *next = radv_amdgpu_cs(cs_array[i + 1]); + assert(cs->base.cdw + 4 <= cs->base.max_dw); + + cs->is_chained = true; + *cs->ib_size_ptr += 4; + + cs->base.buf[cs->base.cdw + 0] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0); + cs->base.buf[cs->base.cdw + 1] = next->ib.ib_mc_address; + cs->base.buf[cs->base.cdw + 2] = next->ib.ib_mc_address >> 32; + cs->base.buf[cs->base.cdw + 3] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | next->ib.size; + } + } + + r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, &bo_list); + if (r) { + fprintf(stderr, "amdgpu: Failed to created the BO list for submission\n"); + return r; + } + + request.ip_type = AMDGPU_HW_IP_GFX; + request.number_of_ibs = 1; + request.ibs = &cs0->ib; + request.resources = bo_list; + + r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1); + if (r) { + if (r == -ENOMEM) + fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); + else + fprintf(stderr, "amdgpu: The CS has been rejected, " + "see dmesg for more information.\n"); + } + + amdgpu_bo_list_destroy(bo_list); + + if (fence) { + fence->context = ctx->ctx; + fence->ip_type = request.ip_type; + fence->ip_instance = request.ip_instance; + fence->ring = request.ring; + fence->fence = request.seq_no; + } + ctx->last_seq_no = request.seq_no; + + return r; +} + +static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx, + struct radeon_winsys_cs **cs_array, + unsigned cs_count, + struct radeon_winsys_fence *_fence) +{ + int r; + struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx); + struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence; + amdgpu_bo_list_handle bo_list; + struct amdgpu_cs_request request; + + assert(cs_count); + + for (unsigned i = 0; i < cs_count;) { + struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[i]); + struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT]; + unsigned cnt = MIN2(AMDGPU_CS_MAX_IBS_PER_SUBMIT, cs_count - i); + + memset(&request, 0, sizeof(request)); + + r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL, &bo_list); + if 
(r) { + fprintf(stderr, "amdgpu: Failed to created the BO list for submission\n"); + return r; + } + + request.ip_type = AMDGPU_HW_IP_GFX; + request.resources = bo_list; + request.number_of_ibs = cnt; + request.ibs = ibs; + + for (unsigned j = 0; j < cnt; ++j) { + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]); + ibs[j] = cs->ib; + + if (cs->is_chained) { + *cs->ib_size_ptr -= 4; + cs->is_chained = false; + } + } + + r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1); + if (r) { + if (r == -ENOMEM) + fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); + else + fprintf(stderr, "amdgpu: The CS has been rejected, " + "see dmesg for more information.\n"); + } + + amdgpu_bo_list_destroy(bo_list); + + if (r) + return r; + + i += cnt; + } + if (fence) { + fence->context = ctx->ctx; + fence->ip_type = request.ip_type; + fence->ip_instance = request.ip_instance; + fence->ring = request.ring; + fence->fence = request.seq_no; + } + ctx->last_seq_no = request.seq_no; + + return 0; +} + +static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx, + struct radeon_winsys_cs **cs_array, + unsigned cs_count, + struct radeon_winsys_fence *_fence) +{ + int r; + struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx); + struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence; + struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]); + struct radeon_winsys *ws = (struct radeon_winsys*)cs0->ws; + amdgpu_bo_list_handle bo_list; + struct amdgpu_cs_request request; + uint32_t pad_word = 0xffff1000U; + + if (radv_amdgpu_winsys(ws)->family == FAMILY_SI) + pad_word = 0x80000000; + + assert(cs_count); + + for (unsigned i = 0; i < cs_count;) { + struct amdgpu_cs_ib_info ib = {0}; + struct radeon_winsys_bo *bo = NULL; + uint32_t *ptr; + unsigned cnt = 0; + unsigned size = 0; + + while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) { + size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw; + ++cnt; + } + + assert(cnt); + + bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS); + ptr = ws->buffer_map(bo); + + for (unsigned j = 0; j < cnt; ++j) { + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]); + memcpy(ptr, cs->base.buf, 4 * cs->base.cdw); + ptr += cs->base.cdw; + + } + + while(!size || (size & 7)) { + *ptr++ = pad_word; + ++size; + } + + memset(&request, 0, sizeof(request)); + + + r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, + (struct radv_amdgpu_winsys_bo*)bo, &bo_list); + if (r) { + fprintf(stderr, "amdgpu: Failed to created the BO list for submission\n"); + return r; + } + + ib.size = size; + ib.ib_mc_address = ws->buffer_get_va(bo); + + request.ip_type = AMDGPU_HW_IP_GFX; + request.resources = bo_list; + request.number_of_ibs = 1; + request.ibs = &ib; + + r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1); + if (r) { + if (r == -ENOMEM) + fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); + else + fprintf(stderr, "amdgpu: The CS has been rejected, " + "see dmesg for more information.\n"); + } + + amdgpu_bo_list_destroy(bo_list); + + ws->buffer_destroy(bo); + if (r) + return r; + + i += cnt; + } + if (fence) { + fence->context = ctx->ctx; + fence->ip_type = request.ip_type; + fence->ip_instance = request.ip_instance; + fence->ring = request.ring; + fence->fence = request.seq_no; + } + ctx->last_seq_no = request.seq_no; + + return 0; +} + +static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx, + struct radeon_winsys_cs **cs_array, + 
unsigned cs_count, + bool can_patch, + struct radeon_winsys_fence *_fence) +{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[0]); + if (!cs->ws->use_ib_bos) { + return radv_amdgpu_winsys_cs_submit_sysmem(_ctx, cs_array, + cs_count, _fence); + } else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && false) { + return radv_amdgpu_winsys_cs_submit_chained(_ctx, cs_array, + cs_count, _fence); + } else { + return radv_amdgpu_winsys_cs_submit_fallback(_ctx, cs_array, + cs_count, _fence); + } +} + +static struct radeon_winsys_ctx *radv_amdgpu_ctx_create(struct radeon_winsys *_ws) +{ + struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws); + struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx); + int r; + + if (!ctx) + return NULL; + r = amdgpu_cs_ctx_create(ws->dev, &ctx->ctx); + if (r) { + fprintf(stderr, "amdgpu: radv_amdgpu_cs_ctx_create failed. (%i)\n", r); + goto error_create; + } + ctx->ws = ws; + return (struct radeon_winsys_ctx *)ctx; +error_create: + return NULL; +} + +static void radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx) +{ + struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx; + amdgpu_cs_ctx_free(ctx->ctx); + FREE(ctx); +} + +static bool radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx) +{ + struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx; + + if (ctx->last_seq_no) { + uint32_t expired; + struct amdgpu_cs_fence fence; + + fence.context = ctx->ctx; + fence.ip_type = RING_GFX; + fence.ip_instance = 0; + fence.ring = 0; + fence.fence = ctx->last_seq_no; + + int ret = amdgpu_cs_query_fence_status(&fence, 1000000000ull, 0, + &expired); + + if (ret || !expired) + return false; + } + + return true; +} + +void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws) +{ + ws->base.ctx_create = radv_amdgpu_ctx_create; + ws->base.ctx_destroy = radv_amdgpu_ctx_destroy; + ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle; + ws->base.cs_create = radv_amdgpu_cs_create; + ws->base.cs_destroy = radv_amdgpu_cs_destroy; + ws->base.cs_grow = radv_amdgpu_cs_grow; + ws->base.cs_finalize = radv_amdgpu_cs_finalize; + ws->base.cs_reset = radv_amdgpu_cs_reset; + ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer; + ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary; + ws->base.cs_submit = radv_amdgpu_winsys_cs_submit; + ws->base.create_fence = radv_amdgpu_create_fence; + ws->base.destroy_fence = radv_amdgpu_destroy_fence; + ws->base.fence_wait = radv_amdgpu_fence_wait; +} diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h new file mode 100644 index 00000000000..230639a2580 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h @@ -0,0 +1,51 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based on amdgpu winsys. + * Copyright © 2011 Marek Olšák + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
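cs_finalize and the sysmem submit path above pad the command stream with NOP dwords (0xffff1000, or 0x80000000 on SI) until the dword count is a non-zero multiple of 8, while cs_grow pads only up to 4 mod 8 so that the 4-dword INDIRECT_BUFFER chain packet it appends lands the total back on an 8-dword boundary. A standalone sketch of just that padding arithmetic, assuming the buffer has room for the fill:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Pad a command stream to a non-zero multiple of 8 dwords, the same loop
 * shape as the sysmem submit path above; pad_word is 0xffff1000 in the
 * driver (0x80000000 on SI). */
static unsigned pad_to_8_dwords(uint32_t *buf, unsigned cdw, uint32_t pad_word)
{
	while (!cdw || (cdw & 7))
		buf[cdw++] = pad_word;
	return cdw;
}

int main(void)
{
	uint32_t ib[32] = {0};
	assert(pad_to_8_dwords(ib, 13, 0xffff1000u) == 16);  /* 13 -> 16 dwords */
	assert(pad_to_8_dwords(ib, 0, 0xffff1000u) == 8);    /* empty stream -> 8 NOPs */
	assert(pad_to_8_dwords(ib, 24, 0xffff1000u) == 24);  /* already aligned */
	printf("padding math checks out\n");
	return 0;
}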
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#include +#include +#include +#include "r600d_common.h" +#include + +#include "radv_radeon_winsys.h" + +#include "radv_amdgpu_winsys.h" +struct radv_amdgpu_ctx { + struct radv_amdgpu_winsys *ws; + amdgpu_context_handle ctx; + uint64_t last_seq_no; +}; + +static inline struct radv_amdgpu_ctx * +radv_amdgpu_ctx(struct radeon_winsys_ctx *base) +{ + return (struct radv_amdgpu_ctx *)base; +} + +void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws); diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c new file mode 100644 index 00000000000..a3c24115a13 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c @@ -0,0 +1,523 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based on amdgpu winsys. + * Copyright © 2011 Marek Olšák + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include "radv_private.h" +#include "addrlib/addrinterface.h" +#include "util/bitset.h" +#include "radv_amdgpu_winsys.h" +#include "radv_amdgpu_surface.h" +#include "sid.h" +#ifndef NO_ENTRIES +#define NO_ENTRIES 32 +#endif + +#ifndef NO_MACRO_ENTRIES +#define NO_MACRO_ENTRIES 16 +#endif + +#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND +#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A +#endif + +static int radv_amdgpu_surface_sanity(const struct radeon_surf *surf) +{ + unsigned type = RADEON_SURF_GET(surf->flags, TYPE); + + if (!(surf->flags & RADEON_SURF_HAS_TILE_MODE_INDEX)) + return -EINVAL; + + /* all dimension must be at least 1 ! 
*/ + if (!surf->npix_x || !surf->npix_y || !surf->npix_z || + !surf->array_size) + return -EINVAL; + + if (!surf->blk_w || !surf->blk_h || !surf->blk_d) + return -EINVAL; + + switch (surf->nsamples) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return -EINVAL; + } + + switch (type) { + case RADEON_SURF_TYPE_1D: + if (surf->npix_y > 1) + return -EINVAL; + /* fall through */ + case RADEON_SURF_TYPE_2D: + case RADEON_SURF_TYPE_CUBEMAP: + if (surf->npix_z > 1 || surf->array_size > 1) + return -EINVAL; + break; + case RADEON_SURF_TYPE_3D: + if (surf->array_size > 1) + return -EINVAL; + break; + case RADEON_SURF_TYPE_1D_ARRAY: + if (surf->npix_y > 1) + return -EINVAL; + /* fall through */ + case RADEON_SURF_TYPE_2D_ARRAY: + if (surf->npix_z > 1) + return -EINVAL; + break; + default: + return -EINVAL; + } + return 0; +} + +static void *ADDR_API radv_allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput) +{ + return malloc(pInput->sizeInBytes); +} + +static ADDR_E_RETURNCODE ADDR_API radv_freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput) +{ + free(pInput->pVirtAddr); + return ADDR_OK; +} + +ADDR_HANDLE radv_amdgpu_addr_create(struct amdgpu_gpu_info *amdinfo, int family, int rev_id, + enum chip_class chip_class) +{ + ADDR_CREATE_INPUT addrCreateInput = {0}; + ADDR_CREATE_OUTPUT addrCreateOutput = {0}; + ADDR_REGISTER_VALUE regValue = {0}; + ADDR_CREATE_FLAGS createFlags = {{0}}; + ADDR_E_RETURNCODE addrRet; + + addrCreateInput.size = sizeof(ADDR_CREATE_INPUT); + addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT); + + regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3; + regValue.gbAddrConfig = amdinfo->gb_addr_cfg; + regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2; + + regValue.backendDisables = amdinfo->backend_disable[0]; + regValue.pTileConfig = amdinfo->gb_tile_mode; + regValue.noOfEntries = ARRAY_SIZE(amdinfo->gb_tile_mode); + if (chip_class == SI) { + regValue.pMacroTileConfig = NULL; + regValue.noOfMacroEntries = 0; + } else { + regValue.pMacroTileConfig = amdinfo->gb_macro_tile_mode; + regValue.noOfMacroEntries = ARRAY_SIZE(amdinfo->gb_macro_tile_mode); + } + + createFlags.value = 0; + createFlags.useTileIndex = 1; + createFlags.degradeBaseLevel = 1; + + addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND; + addrCreateInput.chipFamily = family; + addrCreateInput.chipRevision = rev_id; + addrCreateInput.createFlags = createFlags; + addrCreateInput.callbacks.allocSysMem = radv_allocSysMem; + addrCreateInput.callbacks.freeSysMem = radv_freeSysMem; + addrCreateInput.callbacks.debugPrint = 0; + addrCreateInput.regValue = regValue; + + addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput); + if (addrRet != ADDR_OK) + return NULL; + + return addrCreateOutput.hLib; +} + +static int radv_compute_level(ADDR_HANDLE addrlib, + struct radeon_surf *surf, bool is_stencil, + unsigned level, unsigned type, bool compressed, + ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn, + ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut, + ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn, + ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut) +{ + struct radeon_surf_level *surf_level; + ADDR_E_RETURNCODE ret; + + AddrSurfInfoIn->mipLevel = level; + AddrSurfInfoIn->width = u_minify(surf->npix_x, level); + AddrSurfInfoIn->height = u_minify(surf->npix_y, level); + + if (type == RADEON_SURF_TYPE_3D) + AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level); + else if (type == RADEON_SURF_TYPE_CUBEMAP) + AddrSurfInfoIn->numSlices = 6; + else + AddrSurfInfoIn->numSlices = surf->array_size; + + if (level > 0) { + 
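radv_compute_level derives each mip level's dimensions with u_minify, which clamps the right-shift at 1 so deep mip levels never collapse to zero. A quick standalone check of that behaviour, with a local helper in place of the Mesa util macro:

#include <assert.h>
#include <stdio.h>

/* Same behaviour as Mesa's u_minify(): level-n dimension, never below 1. */
static unsigned minify(unsigned dim, unsigned level)
{
	unsigned d = dim >> level;
	return d ? d : 1;
}

int main(void)
{
	assert(minify(1024, 0) == 1024);  /* level 0 keeps the base size */
	assert(minify(1024, 3) == 128);   /* 1024 >> 3 */
	assert(minify(13, 2) == 3);       /* non-power-of-two sizes just truncate */
	assert(minify(4, 6) == 1);        /* clamped at 1 for the smallest mips */
	printf("mip chain dimensions check out\n");
	return 0;
}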
/* Set the base level pitch. This is needed for calculation + * of non-zero levels. */ + if (is_stencil) + AddrSurfInfoIn->basePitch = surf->stencil_level[0].nblk_x; + else + AddrSurfInfoIn->basePitch = surf->level[0].nblk_x; + + /* Convert blocks to pixels for compressed formats. */ + if (compressed) + AddrSurfInfoIn->basePitch *= surf->blk_w; + } + + ret = AddrComputeSurfaceInfo(addrlib, + AddrSurfInfoIn, + AddrSurfInfoOut); + if (ret != ADDR_OK) { + return ret; + } + + surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level]; + surf_level->offset = align64(surf->bo_size, AddrSurfInfoOut->baseAlign); + surf_level->slice_size = AddrSurfInfoOut->sliceSize; + surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe); + surf_level->npix_x = u_minify(surf->npix_x, level); + surf_level->npix_y = u_minify(surf->npix_y, level); + surf_level->npix_z = u_minify(surf->npix_z, level); + surf_level->nblk_x = AddrSurfInfoOut->pitch; + surf_level->nblk_y = AddrSurfInfoOut->height; + if (type == RADEON_SURF_TYPE_3D) + surf_level->nblk_z = AddrSurfInfoOut->depth; + else + surf_level->nblk_z = 1; + + switch (AddrSurfInfoOut->tileMode) { + case ADDR_TM_LINEAR_ALIGNED: + surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + break; + case ADDR_TM_1D_TILED_THIN1: + surf_level->mode = RADEON_SURF_MODE_1D; + break; + case ADDR_TM_2D_TILED_THIN1: + surf_level->mode = RADEON_SURF_MODE_2D; + break; + default: + assert(0); + } + + if (is_stencil) + surf->stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex; + else + surf->tiling_index[level] = AddrSurfInfoOut->tileIndex; + + surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize; + + /* Clear DCC fields at the beginning. */ + surf_level->dcc_offset = 0; + surf_level->dcc_enabled = false; + + /* The previous level's flag tells us if we can use DCC for this level. 
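Each level's offset is the running bo_size rounded up to that level's base alignment via align64. Assuming power-of-two alignments, which is how the call is used here, that is the usual add-and-mask rounding; a minimal check with a local helper standing in for Mesa's align64():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round v up to the next multiple of a; a must be a power of two. */
static uint64_t align_u64(uint64_t v, uint64_t a)
{
	return (v + a - 1) & ~(a - 1);
}

int main(void)
{
	assert(align_u64(100, 64) == 128);      /* level offset snaps up to baseAlign */
	assert(align_u64(4096, 4096) == 4096);  /* already aligned: unchanged */
	assert(align_u64(0, 256) == 0);
	printf("offset alignment checks out\n");
	return 0;
}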
*/ + if (AddrSurfInfoIn->flags.dccCompatible && + (level == 0 || AddrDccOut->subLvlCompressible)) { + AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize; + AddrDccIn->tileMode = AddrSurfInfoOut->tileMode; + AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo; + AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex; + AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; + + ret = AddrComputeDccInfo(addrlib, + AddrDccIn, + AddrDccOut); + + if (ret == ADDR_OK) { + surf_level->dcc_offset = surf->dcc_size; + surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize; + surf_level->dcc_enabled = true; + surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize; + surf->dcc_alignment = MAX(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign); + } + } + + return 0; +} + +static void radv_set_micro_tile_mode(struct radeon_surf *surf, + struct radeon_info *info) +{ + uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]]; + + if (info->chip_class >= CIK) + surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode); + else + surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode); +} + + +static int radv_amdgpu_winsys_surface_init(struct radeon_winsys *_ws, + struct radeon_surf *surf) +{ + struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws); + unsigned level, mode, type; + bool compressed; + ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; + ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0}; + ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0}; + ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0}; + ADDR_TILEINFO AddrTileInfoIn = {0}; + ADDR_TILEINFO AddrTileInfoOut = {0}; + int r; + + r = radv_amdgpu_surface_sanity(surf); + if (r) + return r; + + AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT); + AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT); + AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT); + AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT); + AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut; + + type = RADEON_SURF_GET(surf->flags, TYPE); + mode = RADEON_SURF_GET(surf->flags, MODE); + compressed = surf->blk_w == 4 && surf->blk_h == 4; + + /* MSAA and FMASK require 2D tiling. */ + if (surf->nsamples > 1 || + (surf->flags & RADEON_SURF_FMASK)) + mode = RADEON_SURF_MODE_2D; + + /* DB doesn't support linear layouts. */ + if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) && + mode < RADEON_SURF_MODE_1D) + mode = RADEON_SURF_MODE_1D; + + /* Set the requested tiling mode. */ + switch (mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED; + break; + case RADEON_SURF_MODE_1D: + AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1; + break; + case RADEON_SURF_MODE_2D: + AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1; + break; + default: + assert(0); + } + + /* The format must be set correctly for the allocation of compressed + * textures to work. In other cases, setting the bpp is sufficient. */ + if (compressed) { + switch (surf->bpe) { + case 8: + AddrSurfInfoIn.format = ADDR_FMT_BC1; + break; + case 16: + AddrSurfInfoIn.format = ADDR_FMT_BC3; + break; + default: + assert(0); + } + } + else { + AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8; + } + + AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = surf->nsamples; + AddrSurfInfoIn.tileIndex = -1; + + /* Set the micro tile type. 
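surface_init treats a format as compressed when its block is 4x4 and then picks ADDR_FMT_BC1 for 8 bytes per element or ADDR_FMT_BC3 for 16. Storage for such formats is counted in 4x4 blocks rather than pixels; the sketch below shows that block math in isolation (addrlib's own computation additionally adds tiling padding):

#include <assert.h>
#include <stdio.h>

/* Unpadded size of one mip level of a block-compressed image:
 * width and height are rounded up to whole 4x4 blocks. */
static unsigned bc_level_bytes(unsigned w, unsigned h, unsigned bytes_per_block)
{
	unsigned blocks_x = (w + 3) / 4;
	unsigned blocks_y = (h + 3) / 4;
	return blocks_x * blocks_y * bytes_per_block;
}

int main(void)
{
	assert(bc_level_bytes(256, 256, 8)  == 32768);  /* BC1: half a byte per pixel */
	assert(bc_level_bytes(256, 256, 16) == 65536);  /* BC3: one byte per pixel */
	assert(bc_level_bytes(1, 1, 8) == 8);           /* tiny mips still take a full block */
	printf("block-compressed sizes check out\n");
	return 0;
}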
*/ + if (surf->flags & RADEON_SURF_SCANOUT) + AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE; + else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER) + AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER; + else + AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE; + + AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); + AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; + AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP; + AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0; + AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0; + AddrSurfInfoIn.flags.degrade4Space = 1; + + /* DCC notes: + * - If we add MSAA support, keep in mind that CB can't decompress 8bpp + * with samples >= 4. + * - Mipmapped array textures have low performance (discovered by a closed + * driver team). + */ + AddrSurfInfoIn.flags.dccCompatible = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && + !(surf->flags & RADEON_SURF_DISABLE_DCC) && + !compressed && AddrDccIn.numSamples <= 1 && + ((surf->array_size == 1 && surf->npix_z == 1) || + surf->last_level == 0); + + AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0; + AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth; + + /* noStencil = 0 can result in a depth part that is incompatible with + * mipmapped texturing. So set noStencil = 1 when mipmaps are requested (in + * this case, we may end up setting stencil_adjusted). + * + * TODO: update addrlib to a newer version, remove this, and + * use flags.matchStencilTileCfg = 1 as an alternative fix. + */ + if (surf->last_level > 0) + AddrSurfInfoIn.flags.noStencil = 1; + + /* Set preferred macrotile parameters. This is usually required + * for shared resources. This is for 2D tiling only. */ + if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 && + surf->bankw && surf->bankh && surf->mtilea && surf->tile_split) { + /* If any of these parameters are incorrect, the calculation + * will fail. */ + AddrTileInfoIn.banks = surf->num_banks; + AddrTileInfoIn.bankWidth = surf->bankw; + AddrTileInfoIn.bankHeight = surf->bankh; + AddrTileInfoIn.macroAspectRatio = surf->mtilea; + AddrTileInfoIn.tileSplitBytes = surf->tile_split; + AddrTileInfoIn.pipeConfig = surf->pipe_config + 1; /* +1 compared to GB_TILE_MODE */ + AddrSurfInfoIn.flags.degrade4Space = 0; + AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn; + + /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set + * the tile index, because we are expected to know it if + * we know the other parameters. + * + * This is something that can easily be fixed in Addrlib. + * For now, just figure it out here. + * Note that only 2D_TILE_THIN1 is handled here. 
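The flags.dccCompatible expression above enables DCC only for surfaces that are not depth/stencil, not explicitly opted out, not block-compressed, single-sampled, and either single-layer or without mip levels. A compact restatement of that predicate as a helper, handy for checking individual cases; the struct fields are simplified stand-ins for the radeon_surf flags:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct surf_desc {               /* simplified stand-in for struct radeon_surf */
	bool zs;                     /* RADEON_SURF_Z_OR_SBUFFER */
	bool dcc_disabled;           /* RADEON_SURF_DISABLE_DCC */
	bool compressed;             /* 4x4 block format */
	unsigned nsamples;
	unsigned array_size;
	unsigned npix_z;
	unsigned last_level;
};

/* Mirrors the dccCompatible condition in radv_amdgpu_winsys_surface_init(). */
static bool dcc_compatible(const struct surf_desc *s)
{
	return !s->zs && !s->dcc_disabled && !s->compressed && s->nsamples <= 1 &&
	       ((s->array_size == 1 && s->npix_z == 1) || s->last_level == 0);
}

int main(void)
{
	struct surf_desc tex2d = { false, false, false, 1, 1, 1, 5 };  /* mipped 2D */
	struct surf_desc array = { false, false, false, 1, 6, 1, 5 };  /* mipped array */
	struct surf_desc msaa  = { false, false, false, 4, 1, 1, 0 };  /* 4x MSAA */
	assert(dcc_compatible(&tex2d));   /* single layer: mips are allowed */
	assert(!dcc_compatible(&array));  /* mipped array texture: no DCC here */
	assert(!dcc_compatible(&msaa));   /* multisampled: no DCC in this code */
	printf("DCC eligibility cases check out\n");
	return 0;
}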
+ */ + assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); + assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1); + + if (ws->info.chip_class == SI) { + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) { + if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 11; /* 16bpp */ + else + AddrSurfInfoIn.tileIndex = 12; /* 32bpp */ + } else { + if (surf->bpe == 1) + AddrSurfInfoIn.tileIndex = 14; /* 8bpp */ + else if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 15; /* 16bpp */ + else if (surf->bpe == 4) + AddrSurfInfoIn.tileIndex = 16; /* 32bpp */ + else + AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */ + } + } else { + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) + AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */ + else + AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */ + } + } + + surf->bo_size = 0; + surf->dcc_size = 0; + surf->dcc_alignment = 1; + + /* Calculate texture layout information. */ + for (level = 0; level <= surf->last_level; level++) { + r = radv_compute_level(ws->addrlib, surf, false, level, type, compressed, + &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut); + if (r) + return r; + + if (level == 0) { + surf->bo_alignment = AddrSurfInfoOut.baseAlign; + surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1; + radv_set_micro_tile_mode(surf, &ws->info); + + /* For 2D modes only. */ + if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { + surf->bankw = AddrSurfInfoOut.pTileInfo->bankWidth; + surf->bankh = AddrSurfInfoOut.pTileInfo->bankHeight; + surf->mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio; + surf->tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes; + surf->num_banks = AddrSurfInfoOut.pTileInfo->banks; + surf->macro_tile_index = AddrSurfInfoOut.macroModeIndex; + } else { + surf->macro_tile_index = 0; + } + } + } + + /* Calculate texture layout information for stencil. */ + if (surf->flags & RADEON_SURF_SBUFFER) { + AddrSurfInfoIn.bpp = 8; + AddrSurfInfoIn.flags.depth = 0; + AddrSurfInfoIn.flags.stencil = 1; + /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */ + AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split; + + for (level = 0; level <= surf->last_level; level++) { + r = radv_compute_level(ws->addrlib, surf, true, level, type, compressed, + &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut); + if (r) + return r; + + /* DB uses the depth pitch for both stencil and depth. */ + if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x) + surf->stencil_adjusted = true; + + if (level == 0) { + /* For 2D modes only. */ + if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { + surf->stencil_tile_split = + AddrSurfInfoOut.pTileInfo->tileSplitBytes; + } + } + } + } + + /* Recalculate the whole DCC miptree size including disabled levels. + * This is what addrlib does, but calling addrlib would be a lot more + * complicated. 
+ */ +#if 0 + if (surf->dcc_size && surf->last_level > 0) { + surf->dcc_size = align64(surf->bo_size >> 8, + ws->info.pipe_interleave_bytes * + ws->info.num_tile_pipes); + } +#endif + return 0; +} + +static int radv_amdgpu_winsys_surface_best(struct radeon_winsys *rws, + struct radeon_surf *surf) +{ + return 0; +} + +void radv_amdgpu_surface_init_functions(struct radv_amdgpu_winsys *ws) +{ + ws->base.surface_init = radv_amdgpu_winsys_surface_init; + ws->base.surface_best = radv_amdgpu_winsys_surface_best; +} diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h new file mode 100644 index 00000000000..acc12af3d08 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.h @@ -0,0 +1,29 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#include + +void radv_amdgpu_surface_init_functions(struct radv_amdgpu_winsys *ws); +ADDR_HANDLE radv_amdgpu_addr_create(struct amdgpu_gpu_info *amdinfo, int family, int rev_id, enum chip_class chip_class); diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c new file mode 100644 index 00000000000..94505367e23 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c @@ -0,0 +1,359 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * based on amdgpu winsys. + * Copyright © 2011 Marek Olšák + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include "radv_amdgpu_winsys.h" +#include "radv_amdgpu_winsys_public.h" +#include "radv_amdgpu_surface.h" +#include "amdgpu_id.h" +#include "xf86drm.h" +#include +#include +#include +#include +#include +#include "radv_amdgpu_cs.h" +#include "radv_amdgpu_bo.h" +#include "radv_amdgpu_surface.h" +#define CIK_TILE_MODE_COLOR_2D 14 + +#define CIK__GB_TILE_MODE__PIPE_CONFIG(x) (((x) >> 6) & 0x1f) +#define CIK__PIPE_CONFIG__ADDR_SURF_P2 0 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16 4 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16 5 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32 6 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32 7 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16 8 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16 9 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16 10 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16 11 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16 12 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32 13 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32 14 +#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16 16 +#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17 + +static unsigned radv_cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) +{ + unsigned mode2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D]; + + switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) { + case CIK__PIPE_CONFIG__ADDR_SURF_P2: + return 2; + case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32: + case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32: + return 4; + case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32: + return 8; + case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16: + case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16: + return 16; + default: + fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n"); + assert(!"this should never occur"); + return 2; + } +} + +static const char * +get_chip_name(enum radeon_family family) +{ + switch (family) { + case CHIP_TAHITI: return "AMD RADV TAHITI"; + case CHIP_PITCAIRN: return "AMD RADV PITCAIRN"; + case CHIP_VERDE: return "AMD RADV CAPE VERDE"; + case CHIP_OLAND: return "AMD RADV OLAND"; + case CHIP_HAINAN: return "AMD RADV HAINAN"; + case CHIP_BONAIRE: return "AMD RADV BONAIRE"; + case CHIP_KAVERI: return "AMD RADV KAVERI"; + case CHIP_KABINI: return "AMD RADV KABINI"; + case CHIP_HAWAII: return "AMD RADV HAWAII"; + case CHIP_MULLINS: return "AMD RADV MULLINS"; + case CHIP_TONGA: return "AMD RADV TONGA"; + case CHIP_ICELAND: return "AMD RADV ICELAND"; + case CHIP_CARRIZO: return "AMD RADV CARRIZO"; + case CHIP_FIJI: return "AMD RADV FIJI"; + case CHIP_POLARIS10: return "AMD RADV POLARIS10"; + case CHIP_POLARIS11: return "AMD RADV POLARIS11"; + case CHIP_STONEY: return "AMD RADV STONEY"; + default: return "AMD RADV unknown"; + } +} + + +static bool +do_winsys_init(struct radv_amdgpu_winsys *ws, int fd) +{ + struct amdgpu_buffer_size_alignments alignment_info = {}; + struct amdgpu_heap_info 
vram, gtt; + struct drm_amdgpu_info_hw_ip dma = {}; + drmDevicePtr devinfo; + int r; + int i, j; + /* Get PCI info. */ + r = drmGetDevice(fd, &devinfo); + if (r) { + fprintf(stderr, "amdgpu: drmGetDevice failed.\n"); + goto fail; + } + ws->info.pci_domain = devinfo->businfo.pci->domain; + ws->info.pci_bus = devinfo->businfo.pci->bus; + ws->info.pci_dev = devinfo->businfo.pci->dev; + ws->info.pci_func = devinfo->businfo.pci->func; + drmFreeDevice(&devinfo); + + /* Query hardware and driver information. */ + r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n"); + goto fail; + } + + r = amdgpu_query_buffer_size_alignment(ws->dev, &alignment_info); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n"); + goto fail; + } + + r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n"); + goto fail; + } + + r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n"); + goto fail; + } + + r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_DMA, 0, &dma); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n"); + goto fail; + } + ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */ + ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config; + + switch (ws->info.pci_id) { +#define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break; +#include "pci_ids/radeonsi_pci_ids.h" +#undef CHIPSET + default: + fprintf(stderr, "amdgpu: Invalid PCI ID.\n"); + goto fail; + } + + if (ws->info.family >= CHIP_TONGA) + ws->info.chip_class = VI; + else if (ws->info.family >= CHIP_BONAIRE) + ws->info.chip_class = CIK; + else if (ws->info.family >= CHIP_TAHITI) + ws->info.chip_class = SI; + else { + fprintf(stderr, "amdgpu: Unknown family.\n"); + goto fail; + } + + /* family and rev_id are for addrlib */ + switch (ws->info.family) { + case CHIP_TAHITI: + ws->family = FAMILY_SI; + ws->rev_id = SI_TAHITI_P_A0; + break; + case CHIP_PITCAIRN: + ws->family = FAMILY_SI; + ws->rev_id = SI_PITCAIRN_PM_A0; + break; + case CHIP_VERDE: + ws->family = FAMILY_SI; + ws->rev_id = SI_CAPEVERDE_M_A0; + break; + case CHIP_OLAND: + ws->family = FAMILY_SI; + ws->rev_id = SI_OLAND_M_A0; + break; + case CHIP_HAINAN: + ws->family = FAMILY_SI; + ws->rev_id = SI_HAINAN_V_A0; + break; + case CHIP_BONAIRE: + ws->family = FAMILY_CI; + ws->rev_id = CI_BONAIRE_M_A0; + break; + case CHIP_KAVERI: + ws->family = FAMILY_KV; + ws->rev_id = KV_SPECTRE_A0; + break; + case CHIP_KABINI: + ws->family = FAMILY_KV; + ws->rev_id = KB_KALINDI_A0; + break; + case CHIP_HAWAII: + ws->family = FAMILY_CI; + ws->rev_id = CI_HAWAII_P_A0; + break; + case CHIP_MULLINS: + ws->family = FAMILY_KV; + ws->rev_id = ML_GODAVARI_A0; + break; + case CHIP_TONGA: + ws->family = FAMILY_VI; + ws->rev_id = VI_TONGA_P_A0; + break; + case CHIP_ICELAND: + ws->family = FAMILY_VI; + ws->rev_id = VI_ICELAND_M_A0; + break; + case CHIP_CARRIZO: + ws->family = FAMILY_CZ; + ws->rev_id = CARRIZO_A0; + break; + case CHIP_STONEY: + ws->family = FAMILY_CZ; + ws->rev_id = STONEY_A0; + break; + case CHIP_FIJI: + ws->family = FAMILY_VI; + ws->rev_id = VI_FIJI_P_A0; + break; + case CHIP_POLARIS10: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS10_P_A0; + break; + case CHIP_POLARIS11: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS11_M_A0; + break; + 
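+	/* Any ASIC not listed above is rejected outright: addrlib needs an
+	 * exact family/rev_id pair, so guessing one here would only lead to
+	 * wrong tiling computations later. */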
default: + fprintf(stderr, "amdgpu: Unknown family.\n"); + goto fail; + } + + ws->addrlib = radv_amdgpu_addr_create(&ws->amdinfo, ws->family, ws->rev_id, ws->info.chip_class); + if (!ws->addrlib) { + fprintf(stderr, "amdgpu: Cannot create addrlib.\n"); + goto fail; + } + /* Set hardware information. */ + ws->info.name = get_chip_name(ws->info.family); + ws->info.gart_size = gtt.heap_size; + ws->info.vram_size = vram.heap_size; + /* convert the shader clock from KHz to MHz */ + ws->info.max_shader_clock = ws->amdinfo.max_engine_clk / 1000; + ws->info.max_se = ws->amdinfo.num_shader_engines; + ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine; + ws->info.has_uvd = 0; + ws->info.vce_fw_version = 0; + ws->info.has_userptr = TRUE; + ws->info.num_render_backends = ws->amdinfo.rb_pipes; + ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq; + ws->info.num_tile_pipes = radv_cik_get_num_tile_pipes(&ws->amdinfo); + ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7); + ws->info.has_virtual_memory = TRUE; + ws->info.has_sdma = dma.available_rings != 0; + + /* Get the number of good compute units. */ + ws->info.num_good_compute_units = 0; + for (i = 0; i < ws->info.max_se; i++) + for (j = 0; j < ws->info.max_sh_per_se; j++) + ws->info.num_good_compute_units += + util_bitcount(ws->amdinfo.cu_bitmap[i][j]); + + memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode, + sizeof(ws->amdinfo.gb_tile_mode)); + ws->info.enabled_rb_mask = ws->amdinfo.enabled_rb_pipes_mask; + + memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode, + sizeof(ws->amdinfo.gb_macro_tile_mode)); + + ws->info.gart_page_size = alignment_info.size_remote; + + if (ws->info.chip_class == SI) + ws->info.gfx_ib_pad_with_type2 = TRUE; + + ws->use_ib_bos = ws->family >= FAMILY_CI; + return true; +fail: + return false; +} + +static void radv_amdgpu_winsys_query_info(struct radeon_winsys *rws, + struct radeon_info *info) +{ + *info = ((struct radv_amdgpu_winsys *)rws)->info; +} + +static void radv_amdgpu_winsys_destroy(struct radeon_winsys *rws) +{ + struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys*)rws; + + AddrDestroy(ws->addrlib); + amdgpu_device_deinitialize(ws->dev); + FREE(rws); +} + +struct radeon_winsys * +radv_amdgpu_winsys_create(int fd) +{ + uint32_t drm_major, drm_minor, r; + amdgpu_device_handle dev; + struct radv_amdgpu_winsys *ws; + + r = amdgpu_device_initialize(fd, &drm_major, &drm_minor, &dev); + if (r) + return NULL; + + ws = calloc(1, sizeof(struct radv_amdgpu_winsys)); + if (!ws) + return NULL; + + + ws->dev = dev; + ws->info.drm_major = drm_major; + ws->info.drm_minor = drm_minor; + if (!do_winsys_init(ws, fd)) + goto fail; + + ws->debug_all_bos = getenv("RADV_DEBUG_ALL_BOS") ? true : false; + LIST_INITHEAD(&ws->global_bo_list); + pthread_mutex_init(&ws->global_bo_list_lock, NULL); + ws->base.query_info = radv_amdgpu_winsys_query_info; + ws->base.destroy = radv_amdgpu_winsys_destroy; + radv_amdgpu_bo_init_functions(ws); + radv_amdgpu_cs_init_functions(ws); + radv_amdgpu_surface_init_functions(ws); + return &ws->base; +fail: + return NULL; +} diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h new file mode 100644 index 00000000000..b79495d9952 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h @@ -0,0 +1,57 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * based on amdgpu winsys. 
+ * Copyright © 2011 Marek Olšák + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#include "radv_radeon_winsys.h" +#include "addrlib/addrinterface.h" +#include +#include "util/list.h" + +struct radv_amdgpu_winsys { + struct radeon_winsys base; + amdgpu_device_handle dev; + + struct radeon_info info; + struct amdgpu_gpu_info amdinfo; + ADDR_HANDLE addrlib; + + uint32_t rev_id; + unsigned family; + + bool debug_all_bos; + pthread_mutex_t global_bo_list_lock; + struct list_head global_bo_list; + unsigned num_buffers; + + bool use_ib_bos; +}; + +static inline struct radv_amdgpu_winsys * +radv_amdgpu_winsys(struct radeon_winsys *base) +{ + return (struct radv_amdgpu_winsys*)base; +} diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h new file mode 100644 index 00000000000..cf066011c26 --- /dev/null +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h @@ -0,0 +1,30 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based on amdgpu winsys. + * Copyright © 2011 Marek Olšák + * Copyright © 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +struct radeon_winsys *radv_amdgpu_winsys_create(int fd); -- 2.30.2
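For reference, a minimal usage sketch of the winsys entry points added above, assuming a consumer that opens its own amdgpu render node; the device path, the include paths and the error handling below are illustrative assumptions only, not part of the patch:

/* Illustrative only: create the radv amdgpu winsys on an open DRM fd,
 * print a few fields from radeon_info and tear the winsys down again. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#include "radv_radeon_winsys.h"
#include "winsys/amdgpu/radv_amdgpu_winsys_public.h"

int main(void)
{
	int fd = open("/dev/dri/renderD128", O_RDWR); /* hypothetical render node */
	if (fd < 0)
		return 1;

	struct radeon_winsys *ws = radv_amdgpu_winsys_create(fd);
	if (!ws) {
		close(fd);
		return 1;
	}

	struct radeon_info info;
	ws->query_info(ws, &info);
	printf("%s: chip_class %d, %u tile pipes, %u RBs\n",
	       info.name, info.chip_class, info.num_tile_pipes,
	       info.num_render_backends);

	ws->destroy(ws);
	close(fd);
	return 0;
}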