From 37125b7cc220fd9b77e9882268892ca4e79a0627 Mon Sep 17 00:00:00 2001
From: Gert Wollny
Date: Fri, 27 Dec 2019 17:49:27 +0100
Subject: [PATCH] r600/sfn: Add lowering UBO access to r600 specific codes

r600 reads a vec4 from the UBO, but the offsets in nir are evaluated down to
the component. If the offsets are not literal, then all non-vec4 reads must
resolve the component after reading the full vec4 (TODO: figure out whether
there is a consistent way to deduce the component that is actually read).

Signed-off-by: Gert Wollny
Part-of:
---
 src/compiler/nir/nir_intrinsics.py         |   8 ++
 src/compiler/nir/nir_validate.c            |   2 +
 src/gallium/drivers/r600/sfn/sfn_nir.cpp   | 105 +++++++++++++++++-
 .../drivers/r600/sfn/sfn_shader_base.cpp   |   2 +-
 4 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 026f715edca..33012d4bb01 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -866,6 +866,14 @@ load("output_u8_as_fp16_pan", 0, [], [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { sampler_index }
 load("sampler_lod_parameters_pan", 1, [CAN_ELIMINATE, CAN_REORDER])
 
+# R600-specific intrinsics
+#
+# R600 can only fetch 16 byte aligned data from a UBO, and the actual offset
+# is given in vec4 units, so we have to fetch a vec4 and get the component
+# later.
+# src[] = { buffer_index, offset }.
+load("ubo_r600", 2, [ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # V3D-specific instrinc for tile buffer color reads.
 #
 # The hardware requires that we read the samples and components of a pixel
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 38e32e4ce3d..e0da743144a 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -531,6 +531,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
 
    case nir_intrinsic_load_deref: {
       nir_deref_instr *src = nir_src_as_deref(instr->src[0]);
+      assert(src);
       validate_assert(state, glsl_type_is_vector_or_scalar(src->type) ||
                       (src->mode == nir_var_uniform &&
                        glsl_get_base_type(src->type) == GLSL_TYPE_SUBROUTINE));
@@ -545,6 +546,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
 
    case nir_intrinsic_store_deref: {
       nir_deref_instr *dst = nir_src_as_deref(instr->src[0]);
+      assert(dst);
       validate_assert(state, glsl_type_is_vector_or_scalar(dst->type));
       validate_assert(state, instr->num_components ==
                       glsl_get_vector_elements(dst->type));
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index 206435528a6..ff7bee26b4f 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -386,12 +386,112 @@ bool r600_lower_scratch_addresses(nir_shader *shader)
    return progress;
 }
 
+static nir_ssa_def *
+r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options)
+{
+   b->cursor = nir_before_instr(instr);
+
+   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+   assert(op->intrinsic == nir_intrinsic_load_ubo);
+
+   bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0]));
+
+   nir_ssa_def *offset = op->src[1].ssa;
+
+   /* This is ugly: With const addressing we can actually set a proper fetch target mask,
+    * but for this we need the component encoded; we don't shift and do the decoding in the
+    * backend.
+    * Otherwise we shift by four and resolve the component here
+    * (TODO: encode the start component in the intrinsic when the offset base is non-constant
+    * but a multiple of 16). */
+
+   nir_ssa_def *new_offset = offset;
+   if (!const_address)
+      new_offset = nir_ishr(b, offset, nir_imm_int(b, 4));
+
+   nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600);
+   load->num_components = const_address ? op->num_components : 4;
+   load->src[0] = op->src[0];
+   load->src[1] = nir_src_for_ssa(new_offset);
+   nir_intrinsic_set_align(load, nir_intrinsic_align(op), nir_intrinsic_align_offset(op));
+
+   nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+
+   /* When four components are loaded or both the offset and the location
+    * are constant, then the backend can deal with it better. */
+   if (op->num_components == 4 || const_address)
+      return &load->dest.ssa;
+
+   /* What comes below is a performance disaster when the offset is not constant,
+    * because then we have to assume that any component can be the first one and we
+    * have to pick the result manually. */
+   nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset, nir_imm_int(b, 2)),
+                                      nir_imm_int(b, 3));
+
+   const unsigned swz_000[4] = {0, 0, 0, 0};
+   nir_ssa_def *component_select = nir_ieq(b, nir_imm_ivec4(b, 0, 1, 2, 3),
+                                           nir_swizzle(b, first_comp, swz_000, 4));
+
+   const unsigned szw_0[1] = {0};
+   const unsigned szw_1[1] = {1};
+   const unsigned szw_2[1] = {2};
+
+   if (op->num_components == 1) {
+      const unsigned szw_3[1] = {3};
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_3, 1));
+      nir_ssa_def *check1 = nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_1, 1),
+                                      check0);
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_2, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_2, 1),
+                       check1);
+   } else if (op->num_components == 2) {
+      const unsigned szw_01[2] = {0, 1};
+      const unsigned szw_12[2] = {1, 2};
+      const unsigned szw_23[2] = {2, 3};
+
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_01, 2),
+                                      nir_swizzle(b, &load->dest.ssa, szw_23, 2));
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_12, 2),
+                       check0);
+   } else {
+      const unsigned szw_012[3] = {0, 1, 2};
+      const unsigned szw_123[3] = {1, 2, 3};
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_012, 3),
+                       nir_swizzle(b, &load->dest.ssa, szw_123, 3));
+   }
+}
+
+bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+   return op->intrinsic == nir_intrinsic_load_ubo;
+}
+
+
+bool r600_lower_ubo_to_align16(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(shader,
+                                        r600_lower_ubo_to_align16_filter,
+                                        r600_lower_ubo_to_align16_impl,
+                                        nullptr);
+}
+
 }
 
 using r600::r600_nir_lower_int_tg4;
 using r600::r600_nir_lower_pack_unpack_2x16;
 using r600::r600_lower_scratch_addresses;
 using r600::r600_lower_fs_out_to_vector;
+using r600::r600_lower_ubo_to_align16;
 
 int r600_glsl_type_size(const struct glsl_type *type, bool is_bindless)
@@ -512,7 +512,10 @@ int r600_shader_from_nir(struct r600_context *rctx,
    const nir_function *func = reinterpret_cast<const nir_function *>(exec_list_get_head_const(&sel->nir->functions));
    bool optimize = func->impl->registers.length() == 0 && !has_saturate(func);
 
-
+   if (optimize) {
+      optimize_once(sel->nir);
+      NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16);
+   }
    /* It seems the output of this optimization is cached somewhere, and
     * when there are registers, then we can no longer copy propagate, so
     * skip the optimization then. (There is probably a better way, but yeah)
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
index 96c8e804e2b..0ee6bbd6313 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
@@ -447,7 +447,7 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins
    case nir_intrinsic_discard:
    case nir_intrinsic_discard_if:
       return emit_discard_if(instr);
-   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ubo_r600:
       return emit_load_ubo(instr);
    case nir_intrinsic_copy_deref:
    case nir_intrinsic_load_constant:
-- 
2.30.2
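
A worked illustration of the address split performed by the lowering above for a
non-constant offset: r600_lower_ubo_to_align16_impl shifts the byte offset right
by four to obtain the vec4 slot passed to load_ubo_r600, and derives the first
component from bits 2..3 of the offset. The stand-alone sketch below reproduces
that arithmetic for one offset; split_ubo_offset is a hypothetical helper written
only for this example, not part of the patch.

/* Sketch only: mirrors the offset arithmetic in r600_lower_ubo_to_align16_impl.
 * A byte offset into a UBO is split into the 16-byte (vec4) slot the hardware
 * can fetch and the dword component inside that slot. Hypothetical helper,
 * not part of the patch. */
#include <assert.h>
#include <stdio.h>

static void split_ubo_offset(unsigned byte_offset,
                             unsigned *vec4_slot, unsigned *first_comp)
{
   *vec4_slot  = byte_offset >> 4;        /* same shift as nir_ishr(b, offset, 4) */
   *first_comp = (byte_offset >> 2) & 3;  /* same as nir_iand(nir_ishr(offset, 2), 3) */
}

int main(void)
{
   unsigned slot, comp;
   split_ubo_offset(28, &slot, &comp);  /* byte 28 -> vec4 slot 1, component 3 (.w) */
   printf("slot %u, first component %u\n", slot, comp);
   assert(slot == 1 && comp == 3);
   return 0;
}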