i965: Introduce a MOV_INDIRECT opcode.
authorKenneth Graunke <kenneth@whitecape.org>
Sun, 8 Nov 2015 02:58:34 +0000 (18:58 -0800)
committerKenneth Graunke <kenneth@whitecape.org>
Sun, 15 Nov 2015 00:41:37 +0000 (16:41 -0800)
The geometry and tessellation control shader stages both read from
multiple URB entries (one per vertex).  The thread payload contains
several URB handles which reference these separate memory segments.

In GLSL, these inputs are represented as per-vertex arrays; the
outermost array index selects which vertex's inputs to read.  This
array index does not necessarily need to be constant.

To handle that, we need to use indirect addressing on GRFs to select
which of the thread payload registers has the appropriate URB handle.
(This is before we can even think about applying the pull model!)

This patch introduces a new opcode which performs a MOV from a
source using VxH indirect addressing (which allows each of the 8
SIMD channels to select distinct data.)

Based on a patch by Jason Ekstrand.

v2: Rename from INDIRECT_THREAD_PAYLOAD_MOV to MOV_INDIRECT; make it
    a bit more generic.  Use regs_read() instead of hacking up the
    register allocator.  (Suggested by Jason Ekstrand.)

v3: Fix regs_read() to be more accurate for small unaligned regions.
    Also rebase on Matt's work.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com> [v3]
Reviewed-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com> [v1]
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs.h
src/mesa/drivers/dri/i965/brw_fs_cse.cpp
src/mesa/drivers/dri/i965/brw_fs_generator.cpp
src/mesa/drivers/dri/i965/brw_shader.cpp

index 6484484ed3469a8019d1f5be3b9db7f17db3ec4f..0b8de63df42ab245c1d8572679f63e63b79ccf08 100644 (file)
@@ -1289,6 +1289,16 @@ enum opcode {
     * Calculate the high 32-bits of a 32x32 multiply.
     */
    SHADER_OPCODE_MULH,
+
+   /**
+    * A MOV that uses VxH indirect addressing.
+    *
+    * Source 0: A register to start from (HW_REG).
+    * Source 1: An indirect offset (in bytes, UD GRF).
+    * Source 2: The length of the region that could be accessed (in bytes,
+    *           UD immediate).
+    */
+   SHADER_OPCODE_MOV_INDIRECT,
 };
 
 enum brw_urb_write_flags {
index 80b8c8e120754877e6e15070ccd63efe60788622..84b5920d4f5bfb8e854c7dc2fef4b64bf2081829 100644 (file)
@@ -840,6 +840,34 @@ fs_inst::regs_read(int arg) const
    case SHADER_OPCODE_BARRIER:
       return 1;
 
+   case SHADER_OPCODE_MOV_INDIRECT:
+      if (arg == 0) {
+         assert(src[2].file == IMM);
+         unsigned region_length = src[2].ud;
+
+         if (src[0].file == FIXED_GRF) {
+            /* If the start of the region is not register aligned, then
+             * there's some portion of the register that's technically
+             * unread at the beginning.
+             *
+             * However, the register allocator works in terms of whole
+             * registers, and does not use subnr.  It assumes that the
+             * read starts at the beginning of the register, and extends
+             * regs_read() whole registers beyond that.
+             *
+             * To compensate, we extend the region length to include this
+             * unread portion at the beginning.
+             */
+            if (src[0].subnr)
+               region_length += src[0].subnr * type_sz(src[0].type);
+
+            return DIV_ROUND_UP(region_length, REG_SIZE);
+         } else {
+            assert(!"Invalid register file");
+         }
+      }
+      break;
+
    default:
       if (is_tex() && arg == 0 && src[0].file == VGRF)
          return mlen;
index f40e58b8ca0f2b88b5dcac16903159226b07ba00..cbfc07f68bce974ac4a8b4d46866d9ba4ca6c5bf 100644 (file)
@@ -527,6 +527,11 @@ private:
                                  struct brw_reg offset,
                                  struct brw_reg value);
 
+   void generate_mov_indirect(fs_inst *inst,
+                              struct brw_reg dst,
+                              struct brw_reg reg,
+                              struct brw_reg indirect_byte_offset);
+
    bool patch_discard_jumps_to_fb_writes();
 
    const struct brw_compiler *compiler;
index 8c67caff6e09bc88020349b0d31e99f48a682796..3c40fcd4fd29678e6dad4fb1015286e23622c126 100644 (file)
@@ -78,6 +78,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case FS_OPCODE_LINTERP:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_MOV_INDIRECT:
       return true;
    case SHADER_OPCODE_RCP:
    case SHADER_OPCODE_RSQ:
index 139cda3ca5937110a7c94a780aca074eb2520f67..e5a286a763b7e36111ece288673f899c8de8eb68 100644 (file)
@@ -371,6 +371,36 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
    }
 }
 
+void
+fs_generator::generate_mov_indirect(fs_inst *inst,
+                                    struct brw_reg dst,
+                                    struct brw_reg reg,
+                                    struct brw_reg indirect_byte_offset)
+{
+   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
+   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
+
+   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
+
+   /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+   struct brw_reg addr = vec8(brw_address_reg(0));
+
+   /* The destination stride of an instruction (in bytes) must be greater
+    * than or equal to the size of the rest of the instruction.  Since the
+    * address register is of type UW, we can't use a D-type instruction.
+    * In order to get around this, re re-type to UW and use a stride.
+    */
+   indirect_byte_offset =
+      retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+
+   /* Prior to Broadwell, there are only 8 address registers. */
+   assert(inst->exec_size == 8 || devinfo->gen >= 8);
+
+   brw_MOV(p, addr, indirect_byte_offset);
+   brw_inst_set_mask_control(devinfo, brw_last_inst, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type));
+}
+
 void
 fs_generator::generate_urb_read(fs_inst *inst,
                                 struct brw_reg dst,
@@ -2079,6 +2109,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          fill_count++;
         break;
 
+      case SHADER_OPCODE_MOV_INDIRECT:
+         generate_mov_indirect(inst, dst, src[0], src[1]);
+         break;
+
       case SHADER_OPCODE_URB_READ_SIMD8:
       case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
          generate_urb_read(inst, dst, src[0]);
index a438e1881d523b9b8b3ed2307d30f22e365dbbc5..50b288b5c51e0e941b842fee3d89f55eb8154f40 100644 (file)
@@ -554,6 +554,8 @@ brw_instruction_name(enum opcode op)
       return "barrier";
    case SHADER_OPCODE_MULH:
       return "mulh";
+   case SHADER_OPCODE_MOV_INDIRECT:
+      return "mov_indirect";
    }
 
    unreachable("not reached");