From b46867cd378e5fb135fd060d50c8028d3dac622a Mon Sep 17 00:00:00 2001
From: Francisco Jerez
Date: Thu, 19 May 2016 12:50:01 +0200
Subject: [PATCH] i965/fs: do not depend on std140 alignment rules for UBO loads

The previous implementation relied on the std140 alignment rules to
avoid handling misalignment in the case where we are loading more than
2 double components from a vector, which requires emitting a second
load message. This alternative implementation deals with misalignment
and is more flexible going forward.

Reviewed-by: Iago Toral Quiroga
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 59 ++++++------------------
 1 file changed, 13 insertions(+), 46 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index d5ef569536a..b7df28663d5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -3585,9 +3585,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                                nir->info.num_ubos - 1);
       }
 
-      /* Number of 32-bit slots in the type */
-      unsigned type_slots = MAX2(1, type_sz(dest.type) / 4);
-
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset == NULL) {
          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
@@ -3605,55 +3602,25 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           * we let CSE deal with duplicate loads. Here we see a vector access
           * and we have to split it if necessary.
           */
-         fs_reg packed_consts = vgrf(glsl_type::float_type);
-         packed_consts.type = dest.type;
+         const unsigned type_size = type_sz(dest.type);
+         const fs_reg packed_consts = bld.vgrf(BRW_REGISTER_TYPE_F);
+         for (unsigned c = 0; c < instr->num_components;) {
+            const unsigned base = const_offset->u32[0] + c * type_size;
 
-         unsigned const_offset_aligned = const_offset->u32[0] & ~15;
+            /* Number of usable components in the next 16B-aligned load */
+            const unsigned count = MIN2(instr->num_components - c,
+                                        (16 - base % 16) / type_size);
 
-         /* A vec4 only contains half of a dvec4, if we need more than 2
-          * components of a dvec4 we will have to issue another load for
-          * components z and w.
-          */
-         int num_components;
-         if (type_slots == 1)
-            num_components = instr->num_components;
-         else
-            num_components = MIN2(2, instr->num_components);
-
-         /* The computation of num_components doesn't take into account
-          * misalignment, which should be okay according to std140 vector
-          * alignment rules.
-          */
-         assert(const_offset->u32[0] % 16 +
-                type_sz(dest.type) * num_components <= 16);
-
-         int remaining_components = instr->num_components;
-         while (remaining_components > 0) {
-            /* Read the vec4 from a 16-byte aligned offset */
-            struct brw_reg const_offset_reg = brw_imm_ud(const_offset_aligned);
             bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                     retype(packed_consts, BRW_REGISTER_TYPE_F),
-                     surf_index, const_offset_reg);
-
-            const fs_reg consts = byte_offset(packed_consts, (const_offset->u32[0] % 16));
-            unsigned dest_offset = instr->num_components - remaining_components;
+                     packed_consts, surf_index, brw_imm_ud(base & ~15));
 
-            /* XXX: This doesn't update the sub-16B offset across iterations of
-             * the loop, which should work for std140 vector alignment rules.
-             */
-            assert(dest_offset == 0 || const_offset->u32[0] % 16 == 0);
+            const fs_reg consts =
+               retype(byte_offset(packed_consts, base & 15), dest.type);
 
-            for (int i = 0; i < num_components; i++)
-               bld.MOV(offset(dest, bld, i + dest_offset), component(consts, i));
+            for (unsigned d = 0; d < count; d++)
+               bld.MOV(offset(dest, bld, c + d), component(consts, d));
 
-            /* If this is a large enough 64-bit load, we will need to emit
-             * another message
-             */
-            remaining_components -= num_components;
-            assert(remaining_components == 0 ||
-                   (remaining_components <= 2 && type_slots == 2));
-            num_components = remaining_components;
-            const_offset_aligned += 16;
+            c += count;
          }
       }
       break;
-- 
2.30.2
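
For readers less familiar with the i965 pull-constant path, the following
standalone C++ sketch reproduces only the splitting arithmetic the new loop
performs: a constant-offset UBO load is broken into 16-byte-aligned blocks,
and each block contributes however many whole components fall inside it.
Everything below is illustrative; emit_ubo_load and the printf standing in
for FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD are invented for the example, so
this is a sketch of the addressing math, not driver code.

// Standalone illustration only: names (emit_ubo_load, the printf output)
// are invented for this sketch and do not exist in Mesa.
#include <algorithm>
#include <cstdio>

static void emit_ubo_load(unsigned const_offset, unsigned num_components,
                          unsigned type_size)
{
   for (unsigned c = 0; c < num_components;) {
      /* Byte offset of the next component still to be copied. */
      const unsigned base = const_offset + c * type_size;

      /* Number of usable components in the next 16B-aligned load. */
      const unsigned count = std::min(num_components - c,
                                      (16 - base % 16) / type_size);

      /* The real code emits FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD at the
       * aligned offset and then MOVs `count` components starting at the
       * sub-16B byte offset; here we just print what would be loaded. */
      std::printf("load 16B block at byte %u, use %u component(s) from byte %u\n",
                  base & ~15u, count, base & 15u);

      c += count;
   }
}

int main()
{
   /* dvec4 (4 components of 8 bytes) starting at byte 8. */
   emit_ubo_load(8, 4, 8);
   return 0;
}

Running it for a dvec4 at byte offset 8 (a layout the removed assert on
const_offset % 16 ruled out) produces three loads: one component from the
block at byte 0, two from the block at 16 and one from the block at 32.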