* scheduling full flexibility, while the conversion to native instructions
* allows the post-register-allocation scheduler the best information
* possible.
+ *
+ * Note that execution masking for setting up pull constant loads is special:
+ * the channels that need to be written are unrelated to the current execution
+ * mask, since a later instruction will use one of the result channels as a
+ * source operand for all 8 or 16 of its channels.
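+ *
+ * That is why the setup instruction below runs with force_writemask_all
+ * (and the generator emits it with execution masking disabled): the offset
+ * has to be written regardless of which channels are currently enabled.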
*/
void
fs_visitor::lower_uniform_pull_constant_loads()
const_offset_reg.type == BRW_REGISTER_TYPE_UD);
const_offset_reg.imm.u /= 16;
fs_reg payload = fs_reg(this, glsl_type::uint_type);
- struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
- BRW_REGISTER_TYPE_UD);
-
- fs_inst *setup1 = MOV(payload, fs_reg(g0));
- setup1->force_writemask_all = true;
- /* We don't need the second half of this vgrf to be filled with g1
- * in the 16-wide case, but if we use force_uncompressed then live
- * variable analysis won't consider this a def!
+
+ /* This is actually going to be a MOV, but since only the first dword
+ * is accessed, we have a special opcode to do just that one. Note
+ * that this needs to be an operation that will be considered a def
+ * by live variable analysis, or register allocation will explode.
*/
+ fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
+ payload, const_offset_reg);
+ setup->force_writemask_all = true;
- fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
- payload, payload,
- const_offset_reg);
+ setup->ir = inst->ir;
+ setup->annotation = inst->annotation;
+ inst->insert_before(setup);
- setup1->ir = inst->ir;
- setup1->annotation = inst->annotation;
- inst->insert_before(setup1);
- setup2->ir = inst->ir;
- setup2->annotation = inst->annotation;
- inst->insert_before(setup2);
+ /* Similarly, this will only populate the first 4 channels of the
+ * result register (since we only use smear values from 0-3), but we
+ * don't tell the optimizer.
+ */
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
inst->src[1] = payload;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
printf("uniform_pull_const_gen7");
break;
- case FS_OPCODE_SET_GLOBAL_OFFSET:
+ case FS_OPCODE_SET_SIMD4X2_OFFSET:
printf("set_global_offset");
break;
default:
uint32_t surf_index = index.dw1.ud;
assert(offset.file == BRW_GENERAL_REGISTER_FILE);
+ /* Reference just the dword we need, to avoid angering validate_reg(). */
+ offset = brw_vec1_grf(offset.nr, 0);
brw_push_insn_state(p);
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
brw_pop_insn_state(p);
+ /* We use the SIMD4x2 mode because we want to end up with 4 components in
+ * the destination loaded consecutively from the same offset (which is taken
+ * from the first component of the message payload; the rest are ignored).
+ */
+ dst.width = BRW_WIDTH_4;
brw_set_dest(p, send, dst);
brw_set_src0(p, send, offset);
-
- uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
- uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
- bool header_present = true;
- brw_set_dp_read_message(p, send,
+ brw_set_sampler_message(p, send,
surf_index,
- msg_control,
- msg_type,
- BRW_DATAPORT_READ_TARGET_DATA_CACHE,
- 1,
- header_present,
- 1);
+ 0, /* LD message ignores sampler unit */
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ 1, /* rlen */
+ 1, /* mlen */
+ false, /* no header */
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ 0);
}
void
}
/**
- * Sets the second dword of a vgrf for gen7+ message setup.
+ * Sets the first dword of a vgrf for gen7+ simd4x2 uniform pull constant
+ * sampler LD messages.
*
- * For setting up gen7 messages in VGRFs, we need to be able to set the second
- * dword for some payloads where in the MRF world we'd have just used
- * brw_message_reg(). We don't want to bake it into the send message's code
- * generation because that means we don't get a chance to schedule the
- * instructions.
+ * We don't want to bake it into the send message's code generation because
+ * that means we don't get a chance to schedule the instructions.
*/
void
-fs_generator::generate_set_global_offset(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src,
- struct brw_reg value)
+fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg value)
{
- /* We use a matching src and dst to get the information on how this
- * instruction works exposed to various optimization passes that would
- * otherwise treat it as completely overwriting the dst.
- */
- assert(src.file == dst.file && src.nr == dst.nr);
assert(value.file == BRW_IMMEDIATE_VALUE);
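+ /* Execution masking is disabled here; see the note about pull constant
+ * load setup in lower_uniform_pull_constant_loads().
+ */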
brw_push_insn_state(p);
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
brw_set_mask_control(p, BRW_MASK_DISABLE);
- brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value);
+ brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
brw_pop_insn_state(p);
}
brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
break;
- case FS_OPCODE_SET_GLOBAL_OFFSET:
- generate_set_global_offset(inst, dst, src[0], src[1]);
+ case FS_OPCODE_SET_SIMD4X2_OFFSET:
+ generate_set_simd4x2_offset(inst, dst, src[0]);
break;
case FS_OPCODE_PACK_HALF_2x16_SPLIT: