i965/fs: Reset the register file to VGRF in lower_integer_multiplication

[mesa.git] / src / intel / compiler / brw_fs.cpp
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp

index fd51cace752ae27a3167d7108bdc046374be5468..09adcbc2df16d2dbf6cbedc38ab6abea1de4b632 100644 (file)
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -945,7 +945,7 @@ fs_inst::flags_written() const
   * instruction -- the FS opcodes often generate MOVs in addition.
   */
  int
-fs_visitor::implied_mrf_writes(fs_inst *inst)
+fs_visitor::implied_mrf_writes(fs_inst *inst) const
  {
     if (inst->mlen == 0)
        return 0;
@@ -1219,7 +1219,7 @@ fs_visitor::emit_sampleid_setup()
     assert(devinfo->gen >= 6);
  
     const fs_builder abld = bld.annotate("compute sample id");
-   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));
  
     if (!key->multisample_fbo) {
        /* As per GL_ARB_sample_shading specification:
@@ -1256,16 +1256,16 @@ fs_visitor::emit_sampleid_setup()
         * TODO: These payload bits exist on Gen7 too, but they appear to always
         *       be zero, so this code fails to work.  We should find out why.
         */
-      fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+      fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
  
        abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
-                                         BRW_REGISTER_TYPE_B), 1, 8, 0)),
+                                         BRW_REGISTER_TYPE_UB), 1, 8, 0)),
                      brw_imm_v(0x44440000));
        abld.AND(*reg, tmp, brw_imm_w(0xf));
     } else {
        const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
-                                         BRW_REGISTER_TYPE_D), 0);
-      const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+                                         BRW_REGISTER_TYPE_UD), 0);
+      const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
  
        /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
         * 8x multisampling, subspan 0 will represent sample N (where N
@@ -1291,7 +1291,7 @@ fs_visitor::emit_sampleid_setup()
         * accommodate 16x MSAA.
         */
        abld.exec_all().group(1, 0)
-          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
+          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                 brw_imm_ud(0xc0));
        abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
  
@@ -2096,6 +2096,15 @@ fs_visitor::assign_constant_locations()
     if (subgroup_id_index >= 0)
        max_push_components--; /* Save a slot for the thread ID */
  
+   /* FIXME: We currently have some GPU hangs that happen apparently when using
+    * push constants. Since we have no solution for such hangs yet, just
+    * go ahead and use pull constants for now.
+    */
+   if (devinfo->gen == 10 && compiler->supports_pull_constants) {
+      compiler->shader_perf_log(log_data, "Disabling push constants.");
+      max_push_components = 0;
+   }
+
     /* We push small arrays, but no bigger than 16 floats.  This is big enough
      * for a vec4 but hopefully not large enough to push out other stuff.  We
      * should probably use a better heuristic at some point.
@@ -2155,7 +2164,7 @@ fs_visitor::assign_constant_locations()
  
        unsigned push_start_align = cplx_align_apply(align, num_push_constants);
        unsigned chunk_size = u - chunk_start + 1;
-      if (!compiler->supports_pull_constants ||
+      if ((!compiler->supports_pull_constants && u < UBO_START) ||
            (chunk_size < max_chunk_size &&
             push_start_align + chunk_size <= max_push_components)) {
           /* Align up the number of push constants */
@@ -3631,13 +3640,18 @@ fs_visitor::lower_integer_multiplication()
                  regions_overlap(inst->dst, inst->size_written,
                                  inst->src[1], inst->size_read(1))) {
                 needs_mov = true;
-               low.nr = alloc.allocate(regs_written(inst));
-               low.offset = low.offset % REG_SIZE;
+               /* Get a new VGRF but keep the same stride as inst->dst */
+               low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
+                            inst->dst.type);
+               low.stride = inst->dst.stride;
+               low.offset = inst->dst.offset % REG_SIZE;
              }
  
-            fs_reg high = inst->dst;
-            high.nr = alloc.allocate(regs_written(inst));
-            high.offset = high.offset % REG_SIZE;
+            /* Get a new VGRF but keep the same stride as inst->dst */
+            fs_reg high(VGRF, alloc.allocate(regs_written(inst)),
+                        inst->dst.type);
+            high.stride = inst->dst.stride;
+            high.offset = inst->dst.offset % REG_SIZE;
  
              if (devinfo->gen >= 7) {
                 if (inst->src[1].file == IMM) {
@@ -4998,7 +5012,7 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
        return MIN2(8, inst->exec_size);
  
     case FS_OPCODE_LINTERP:
-   case FS_OPCODE_GET_BUFFER_SIZE:
+   case SHADER_OPCODE_GET_BUFFER_SIZE:
     case FS_OPCODE_DDX_COARSE:
     case FS_OPCODE_DDX_FINE:
     case FS_OPCODE_DDY_COARSE: