i965: Introduce the BROADCAST pseudo-opcode.
author: Francisco Jerez <currojerez@riseup.net>
Fri, 20 Feb 2015 18:14:24 +0000 (20:14 +0200)
committer: Francisco Jerez <currojerez@riseup.net>
Mon, 4 May 2015 14:44:17 +0000 (17:44 +0300)
The BROADCAST instruction picks the channel of its first source that
is selected by the index passed in as its second source.  This will be
used in situations where all channels from the same SIMD thread have to
agree on the value of something, e.g. a surface binding table index.

This is in particular the case for UBO, sampler and image arrays,
which can be indexed dynamically with the restriction that all active
SIMD channels access the same index, provided to the shared unit as
part of a single scalar field of the message descriptor.  Simply
taking the index value from the first channel as we were doing until
now is incorrect, because it might contain an uninitialized value if
the channel had previously been disabled by non-uniform control flow.

v2: Minor style fixes.  Improve commit message.

Reviewed-by: Matt Turner <mattst88@gmail.com>
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_eu.h
src/mesa/drivers/dri/i965/brw_eu_emit.c
src/mesa/drivers/dri/i965/brw_fs_generator.cpp
src/mesa/drivers/dri/i965/brw_shader.cpp
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp

index 634885b0d47df6992e09ecdf5dbaa4afb221c78e..5072e680db1af54772248c3b857de0c739e956a9 100644 (file)
@@ -918,6 +918,12 @@ enum opcode {
 
    SHADER_OPCODE_URB_WRITE_SIMD8,
 
+   /**
+    * Pick the channel from its first source register given by the index
+    * specified as second source.  Useful for variable indexing of surfaces.
+    */
+   SHADER_OPCODE_BROADCAST,
+
    VEC4_OPCODE_MOV_BYTES,
    VEC4_OPCODE_PACK_BYTES,
    VEC4_OPCODE_UNPACK_UNIFORM,
index 84c1e57b53634f635c6c98a11462081c7f8537e1..a0c938a142bde250781f89052d6c513ce9309d31 100644 (file)
@@ -461,6 +461,12 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
                              unsigned msg_length,
                              unsigned response_length);
 
+void
+brw_broadcast(struct brw_codegen *p,  /* code generator to emit into */
+              struct brw_reg dst,     /* receives the selected channel */
+              struct brw_reg src,     /* region a channel is picked from */
+              struct brw_reg idx);    /* index of the channel of src to pick */
+
 /***********************************************************************
  * brw_eu_util.c:
  */
index 30396623612b55ab36371bad4e1e39b9c916fe00..73bed49ecee201ef8e768a1ed47bd59e50b4375f 100644 (file)
@@ -3212,6 +3212,81 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
    brw_inst_set_pi_message_data(devinfo, insn, data);
 }
 
+void
+brw_broadcast(struct brw_codegen *p,
+              struct brw_reg dst,
+              struct brw_reg src,
+              struct brw_reg idx)
+{  /* Emit code that copies the channel of src selected by idx into dst.
+    *
+    * p    code generator the instructions are emitted into.
+    * dst  destination of the selected channel.
+    * src  region to pick a channel from; asserted below to be a directly
+    *      addressed general register.
+    * idx  channel index, either an immediate or a register.
+    *
+    * Three strategies are used: a plain MOV when the source is already
+    * uniform or the index is an immediate, register-indirect addressing
+    * through the address register in align1 mode, and a flag-predicated
+    * SEL otherwise (SIMD4x2).
+    */
+   const struct brw_device_info *devinfo = p->devinfo;
+   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   brw_inst *inst;
+
+   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
+          src.address_mode == BRW_ADDRESS_DIRECT);
+
+   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
+       idx.file == BRW_IMMEDIATE_VALUE) {
+      /* Trivial, the source is already uniform or the index is a constant.
+       * We will typically not get here if the optimizer is doing its job, but
+       * asserting would be mean.
+       *
+       * A uniform source is read at offset zero, a constant index is folded
+       * into the sub-register offset of the source.  Outside align1 mode the
+       * offset is scaled by four -- presumably because each SIMD4x2 channel
+       * is a four-component vector, TODO confirm.
+       */
+      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0;
+      brw_MOV(p, dst,
+              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
+               stride(suboffset(src, 4 * i), 0, 4, 1)));
+   } else {
+      if (align1) {
+         const struct brw_reg addr =
+            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+         const unsigned offset = src.nr * REG_SIZE + src.subnr;
+         /* Limit in bytes of the signed indirect addressing immediate. */
+         const unsigned limit = 512;
+
+         brw_push_insn_state(p);
+         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+         /* Take into account the component size and horizontal stride.
+          *
+          * The address computation (the SHL and the optional ADD) is emitted
+          * with the default mask control set to BRW_MASK_DISABLE and with
+          * predication off; the previous state is restored by the pop below.
+          *
+          * NOTE(review): the shift count looks like it assumes that hstride
+          * holds an encoded log2(stride) + 1 value and is non-zero on this
+          * path -- confirm against the callers.
+          */
+         assert(src.vstride == src.hstride + src.width);
+         brw_SHL(p, addr, vec1(idx),
+                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
+                            src.hstride - 1));
+
+         /* We can only address up to limit bytes using the indirect
+          * addressing immediate, account for the difference if the source
+          * register is above this limit.
+          *
+          * The multiple-of-limit part of the offset is added to the address
+          * register here, the remainder is passed as immediate below.
+          */
+         if (offset >= limit)
+            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
+
+         brw_pop_insn_state(p);
+
+         /* Use indirect addressing to fetch the specified component. */
+         brw_MOV(p, dst,
+                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
+                        src.type));
+      } else {
+         /* In SIMD4x2 mode the index can be either zero or one, replicate it
+          * to all bits of a flag register,
+          *
+          * (done with an unpredicated MOV of the first component of idx to
+          * the null register, with the NZ conditional modifier writing flag
+          * register 1)
+          */
+         inst = brw_MOV(p,
+                        brw_null_reg(),
+                        stride(brw_swizzle1(idx, 0), 0, 4, 1));
+         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
+         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
+         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+
+         /* and use predicated SEL to pick the right channel.
+          *
+          * The SEL is predicated on flag register 1 and chooses between src
+          * at sub-register offset 4 (first operand) and src at offset zero
+          * (second operand).
+          */
+         inst = brw_SEL(p, dst,
+                        stride(suboffset(src, 4), 0, 4, 1),
+                        stride(src, 0, 4, 1));
+         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
+         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+      }
+   }
+}
+
 /**
  * This instruction is generated as a single-channel align1 instruction by
  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
index b81978c87751c206a59dbd509969497512e088de..2c6a12e4becac93bb1f007e47257e7902d4fe645 100644 (file)
@@ -2061,6 +2061,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          generate_set_simd4x2_offset(inst, dst, src[0]);
          break;
 
+      case SHADER_OPCODE_BROADCAST:
+         brw_broadcast(p, dst, src[0], src[1]);
+         break;
+
       case FS_OPCODE_SET_OMASK:
          generate_set_omask(inst, dst, src[0]);
          break;
index 20588e9a0fe48162a660279e2d3493ab1fcf8de3..1944c2648c878b69ad66c926aa30889d79e9c8dc 100644 (file)
@@ -517,6 +517,9 @@ brw_instruction_name(enum opcode op)
    case SHADER_OPCODE_URB_WRITE_SIMD8:
       return "gen8_urb_write_simd8";
 
+   case SHADER_OPCODE_BROADCAST:
+      return "broadcast";
+
    case VEC4_OPCODE_MOV_BYTES:
       return "mov_bytes";
    case VEC4_OPCODE_PACK_BYTES:
index c15fa165ec855ae8540fa04f0c20dcfb3e478e09..3dc808c85eeca5b423e1b7b30cec8c8ca1b22fb0 100644 (file)
@@ -1512,6 +1512,10 @@ vec4_generator::generate_code(const cfg_t *cfg)
          brw_memory_fence(p, dst);
          break;
 
+      case SHADER_OPCODE_BROADCAST:
+         brw_broadcast(p, dst, src[0], src[1]);
+         break;
+
       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
          generate_unpack_flags(dst);
          break;