intel/eu/gen12: Don't set thread control, it's gone.

[mesa.git] / src / intel / compiler / brw_fs_nir.cpp
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp

index abce29c92f5a54b4ae6aa6abd0cf2574624e8fab..6a9a2548b6ce5754cb41da47adf35447ae41249b 100644 (file)
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1258,6 +1258,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
           }
        }
  
+      /* Emit the rounding mode after the fsign optimization above, since that
+       * optimization does not result in a MUL; it negates the value by other
+       * means instead.
+       */
        if (nir_has_any_rounding_mode_enabled(execution_mode)) {
           brw_rnd_mode rnd =
              brw_rnd_mode_from_execution_mode(execution_mode);
@@ -1813,6 +1817,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
        break;
  
     case nir_op_flrp:
+      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
+         brw_rnd_mode rnd =
+            brw_rnd_mode_from_execution_mode(execution_mode);
+         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
+                  brw_imm_d(rnd));
+      }
+
        inst = bld.LRP(result, op[0], op[1], op[2]);
        inst->saturate = instr->dest.saturate;
        break;
@@ -4925,10 +4936,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        opcode brw_op = brw_op_for_nir_reduction_op(redop);
        brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
  
+      /* There are a couple of register region issues that make things
+       * complicated for 8-bit types:
+       *
+       *    1. Only raw moves are allowed to write to a packed 8-bit
+       *       destination.
+       *    2. If we use a strided destination, the efficient way to do scan
+       *       operations ends up using strides that are too big to encode in
+       *       an instruction.
+       *
+       * To get around these issues, we just do all 8-bit scan operations in
+       * 16 bits.  It's actually fewer instructions than what we'd have to do
+       * if we were trying to do it in native 8-bit types and the results are
+       * the same once we truncate to 8 bits at the end.
+       */
+      brw_reg_type scan_type = src.type;
+      if (type_sz(scan_type) == 1)
+         scan_type = brw_reg_type_from_bit_size(16, src.type);
+
        /* Set up a register for all of our scratching around and initialize it
         * to reduction operation's identity value.
         */
-      fs_reg scan = bld.vgrf(src.type);
+      fs_reg scan = bld.vgrf(scan_type);
        bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
  
        bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
@@ -4971,10 +5000,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        opcode brw_op = brw_op_for_nir_reduction_op(redop);
        brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
  
+      /* There are a couple of register region issues that make things
+       * complicated for 8-bit types:
+       *
+       *    1. Only raw moves are allowed to write to a packed 8-bit
+       *       destination.
+       *    2. If we use a strided destination, the efficient way to do scan
+       *       operations ends up using strides that are too big to encode in
+       *       an instruction.
+       *
+       * To get around these issues, we just do all 8-bit scan operations in
+       * 16 bits.  It's actually fewer instructions than what we'd have to do
+       * if we were trying to do it in native 8-bit types and the results are
+       * the same once we truncate to 8 bits at the end.
+       */
+      brw_reg_type scan_type = src.type;
+      if (type_sz(scan_type) == 1)
+         scan_type = brw_reg_type_from_bit_size(16, src.type);
+
        /* Set up a register for all of our scratching around and initialize it
         * to reduction operation's identity value.
         */
-      fs_reg scan = bld.vgrf(src.type);
+      fs_reg scan = bld.vgrf(scan_type);
        const fs_builder allbld = bld.exec_all();
        allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
  
@@ -4983,7 +5030,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
            * shift of the contents before we can begin.  To make things worse,
            * we can't do this with a normal stride; we have to use indirects.
            */
-         fs_reg shifted = bld.vgrf(src.type);
+         fs_reg shifted = bld.vgrf(scan_type);
           fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
           allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                           brw_imm_w(-1));