}
}
+void
+fs_generator::generate_shuffle(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg idx)
+{
+ /* Ivy Bridge has some strange behavior that makes this a real pain to
+ * implement for 64-bit values, so we just don't bother.
+ */
+ assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
+
+ /* Because we're using the address register, we're limited to 8-wide
+ * execution on gen7. On gen8, we're limited to 16-wide by the address
+ * register file and 8-wide for 64-bit types. We could try to make this
+ * instruction splittable higher up in the compiler, but that gets weird
+ * because it reads all of the channels regardless of execution size. It's
+ * easier just to split it here.
+ */
+ const unsigned lower_width =
+ (devinfo->gen <= 7 || type_sz(src.type) > 4) ?
+ 8 : MIN2(16, inst->exec_size);
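+ /* For example, a SIMD16 shuffle of 64-bit data gets lower_width == 8
+ * and is emitted as two SIMD8 groups by the loop below.
+ */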
+
+ brw_set_default_exec_size(p, cvt(lower_width) - 1);
+ for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
+ brw_set_default_group(p, group);
+
+ if ((src.vstride == 0 && src.hstride == 0) ||
+ idx.file == BRW_IMMEDIATE_VALUE) {
+ /* Trivial, the source is already uniform or the index is a constant.
+ * We will typically not get here if the optimizer is doing its job,
+ * but asserting would be mean.
+ */
+ const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
+ brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
+ } else {
+ /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+ struct brw_reg addr = vec8(brw_address_reg(0));
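+ /* With VxH addressing, each enabled channel takes its byte offset from
+ * its own a0 subregister, which is why the execution width here is
+ * bounded by the size of the address register file.
+ */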
+
+ struct brw_reg group_idx = suboffset(idx, group);
+
+ if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
+ /* Things get grumpy if the register is too wide. */
+ group_idx.width--;
+ group_idx.vstride--;
+ }
+
+ assert(type_sz(group_idx.type) <= 4);
+ if (type_sz(group_idx.type) == 4) {
+ /* The destination stride of an instruction (in bytes) must be
+ * greater than or equal to the size of the execution data type.
+ * Since the address register is of type UW, we can't use a
+ * D-type instruction. In order to get around this, we retype the
+ * index to W and use a stride of 2.
+ */
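+ /* The W view reads only the low 16 bits of each 32-bit index, which
+ * is fine since valid channel indices easily fit in 16 bits.
+ */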
+ group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
+ }
+
+ /* Take into account the component size and horizontal stride. */
+ assert(src.vstride == src.hstride + src.width);
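+ /* src.hstride is encoded as log2(stride) + 1, so shifting the index
+ * left by log2(type_sz) + hstride - 1 multiplies it by the element
+ * size times the actual horizontal stride, yielding a byte offset.
+ */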
+ brw_SHL(p, addr, group_idx,
+ brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
+ src.hstride - 1));
+
+ /* Add on the register start offset */
+ brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
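+ /* At this point each enabled a0 subregister holds the absolute byte
+ * offset of its channel's source element within the GRF file.
+ */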
+
+ if (type_sz(src.type) > 4 &&
+ ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+ devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ /* IVB has an issue (which we found empirically) where it reads
+ * two address register components per channel for indirectly
+ * addressed 64-bit sources.
+ *
+ * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+ *
+ * "When source or destination datatype is 64b or operation is
+ * integer DWord multiply, indirect addressing must not be
+ * used."
+ *
+ * To work around both of these, we do two integer MOVs instead of
+ * one 64-bit MOV. Because no double value should ever cross a
+ * register boundary, it's safe to use the immediate offset in the
+ * indirect here to handle adding 4 bytes to the offset and avoid
+ * the extra ADD to the register file.
+ */
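+ /* spread(gdst, 2) doubles the stride so the D-typed view hits the low
+ * dword of each 64-bit channel; the second MOV writes the high dwords
+ * via the 4-byte immediate offset on the indirect source.
+ */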
+ struct brw_reg gdst = suboffset(dst, group);
+ struct brw_reg dst_d = retype(spread(gdst, 2),
+ BRW_REGISTER_TYPE_D);
+ brw_MOV(p, dst_d,
+ retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+ brw_MOV(p, byte_offset(dst_d, 4),
+ retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
+ } else {
+ brw_MOV(p, suboffset(dst, group),
+ retype(brw_VxH_indirect(0, 0), src.type));
+ }
+ }
+ }
+}
+
void
fs_generator::generate_urb_read(fs_inst *inst,
struct brw_reg dst,
brw_broadcast(p, dst, src[0], src[1]);
break;
+ case SHADER_OPCODE_SHUFFLE:
+ generate_shuffle(inst, dst, src[0], src[1]);
+ break;
+
case FS_OPCODE_SET_SAMPLE_ID:
generate_set_sample_id(inst, dst, src[0], src[1]);
break;