From 4985e380dd776ac65c4ae5627138211f9d9f03ce Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 29 Apr 2020 17:14:58 -0500 Subject: [PATCH] intel/eu: Use non-coherent mode (BTI=253) for stateless A64 messages We don't care about full IA coherency since we always have the opportunity in GL or Vulkan to flush the data cache. Using IA-coherent mode is likely just making A64 access slower than it needs to be. Reviewed-by: Caio Marcelo de Oliveira Filho Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_eu.h | 12 ++++++---- src/intel/compiler/brw_eu_defines.h | 37 ++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index 7ae17dbdd37..262c527b2e9 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -744,7 +744,8 @@ brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo, SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | SET_BITS(simd_mode, 5, 4); - return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control); + return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); } /** @@ -782,7 +783,8 @@ brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo, SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) | SET_BITS(exec_size == 16, 4, 4); - return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control); + return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); } static inline uint32_t @@ -803,7 +805,8 @@ brw_dp_a64_untyped_atomic_desc(const struct gen_device_info *devinfo, SET_BITS(bit_size == 64, 4, 4) | SET_BITS(response_expected, 5, 5); - return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control); + return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); } static inline uint32_t @@ -822,7 +825,8 @@ brw_dp_a64_untyped_atomic_float_desc(const struct gen_device_info *devinfo, 
SET_BITS(atomic_op, 1, 0) | SET_BITS(response_expected, 5, 5); - return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control); + return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); } static inline uint32_t diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 742e12c2830..d63360222ec 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -1419,12 +1419,37 @@ enum brw_message_target { /* Dataport special binding table indices: */ #define BRW_BTI_STATELESS 255 #define GEN7_BTI_SLM 254 -/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the - * hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, - * CHV and at least some pre-production steppings of SKL due to - * WaForceEnableNonCoherent, HDC memory access may have been overridden by the - * kernel to be non-coherent (matching the behavior of the same BTI on - * pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253. + +#define HSW_BTI_STATELESS_LOCALLY_COHERENT 255 +#define HSW_BTI_STATELESS_NON_COHERENT 253 +#define HSW_BTI_STATELESS_GLOBALLY_COHERENT 252 +#define HSW_BTI_STATELESS_LLC_COHERENT 251 +#define HSW_BTI_STATELESS_L3_UNCACHED 250 + +/* The hardware docs are a bit contradictory here. On Haswell, where they + * first added cacheability control, there were 5 different cache modes (see + * HSW_BTI_STATELESS_* above). On Broadwell, they reduced them to two: + * + * - IA-Coherent (BTI=255): Coherent within Gen and coherent within the + * entire IA cache memory hierarchy. + * + * - Non-Coherent (BTI=253): Coherent within Gen, same cache type. + * + * Information about stateless cache coherency can be found in the "A32 + * Stateless" section of the "3D Media GPGPU" volume of the PRM for each + * hardware generation.
+ * + * Unfortunately, the docs for MDC_STATELESS appear to have been copied and + * pasted from Haswell and give the Haswell definitions for the BTI values of + * 255 and 253 including a warning about accessing 253 surfaces from multiple + * threads. This seems to be a copy+paste error and the definitions from the + * "A32 Stateless" section should be trusted instead. + * + * Note that because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, CHV and at + * least some pre-production steppings of SKL due to WaForceEnableNonCoherent, + * HDC memory access may have been overridden by the kernel to be non-coherent + * (matching the behavior of the same BTI on pre-Gen8 hardware) and BTI 255 + * may actually be an alias for BTI 253. */ #define GEN8_BTI_STATELESS_IA_COHERENT 255 #define GEN8_BTI_STATELESS_NON_COHERENT 253 -- 2.30.2