i965: Define method to check whether a backend_reg is inside a given range.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
index 2f264255c0799844332b60c8fd7bb8c90849c5d0..a57f501a37e2b1ed17a62ed37e96ba7e9482ef16 100644 (file)
@@ -28,8 +28,6 @@
  * from the LIR.
  */
 
-extern "C" {
-
 #include <sys/types.h>
 
 #include "util/hash_table.h"
@@ -43,7 +41,6 @@ extern "C" {
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
-}
 #include "brw_fs.h"
 #include "brw_cfg.h"
 #include "brw_dead_control_flow.h"
@@ -126,7 +123,8 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case HW_REG:
    case MRF:
    case ATTR:
-      this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
+      this->regs_written =
+         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
       break;
    case BAD_FILE:
       this->regs_written = 0;
@@ -489,10 +487,7 @@ fs_inst::equals(fs_inst *inst) const
 bool
 fs_inst::overwrites_reg(const fs_reg &reg) const
 {
-   return (reg.file == dst.file &&
-           reg.reg == dst.reg &&
-           reg.reg_offset >= dst.reg_offset  &&
-           reg.reg_offset < dst.reg_offset + regs_written);
+   return reg.in_range(dst, regs_written);
 }
 
 bool
@@ -672,14 +667,21 @@ fs_visitor::type_size(const struct glsl_type *type)
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_DOUBLE:
       unreachable("not reached");
    }
 
    return 0;
 }
 
+/**
+ * Create a MOV to read the timestamp register.
+ *
+ * The MOV is not emitted here: it is handed back through out_mov and the
+ * caller must insert it.  Returns the MOV's destination with set_smear(0).
+ */
 fs_reg
-fs_visitor::get_timestamp()
+fs_visitor::get_timestamp(fs_inst **out_mov)
 {
    assert(brw->gen >= 7);
 
@@ -690,7 +692,7 @@ fs_visitor::get_timestamp()
 
    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 
-   fs_inst *mov = emit(MOV(dst, ts));
+   fs_inst *mov = MOV(dst, ts);
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
     */
@@ -708,6 +710,7 @@ fs_visitor::get_timestamp()
     */
    dst.set_smear(0);
 
+   *out_mov = mov;
    return dst;
 }
 
@@ -715,7 +718,9 @@ void
 fs_visitor::emit_shader_time_begin()
 {
    current_annotation = "shader time start";
-   shader_start_time = get_timestamp();
+   fs_inst *mov;
+   shader_start_time = get_timestamp(&mov);
+   emit(mov);
 }
 
 void
@@ -751,38 +756,50 @@ fs_visitor::emit_shader_time_end()
       unreachable("fs_visitor::emit_shader_time_end missing code");
    }
 
-   fs_reg shader_end_time = get_timestamp();
+   /* Insert our code just before the final SEND with EOT. */
+   exec_node *end = this->instructions.get_tail();
+   assert(end && ((fs_inst *) end)->eot);
+
+   fs_inst *tm_read;
+   fs_reg shader_end_time = get_timestamp(&tm_read);
+   end->insert_before(tm_read);
 
    /* Check that there weren't any timestamp reset events (assuming these
     * were the only two timestamp reads that happened).
     */
    fs_reg reset = shader_end_time;
    reset.set_smear(2);
-   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
+   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
    test->conditional_mod = BRW_CONDITIONAL_Z;
-   emit(IF(BRW_PREDICATE_NORMAL));
+   test->force_writemask_all = true;
+   end->insert_before(test);
+   end->insert_before(IF(BRW_PREDICATE_NORMAL));
 
    fs_reg start = shader_start_time;
    start.negate = true;
    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
-   emit(ADD(diff, start, shader_end_time));
+   diff.set_smear(0);
+   fs_inst *add = ADD(diff, start, shader_end_time);
+   add->force_writemask_all = true;
+   end->insert_before(add);
 
    /* If there were no instructions between the two timestamp gets, the diff
     * is 2 cycles.  Remove that overhead, so I can forget about that when
     * trying to determine the time taken for single instructions.
     */
-   emit(ADD(diff, diff, fs_reg(-2u)));
-
-   emit_shader_time_write(type, diff);
-   emit_shader_time_write(written_type, fs_reg(1u));
-   emit(BRW_OPCODE_ELSE);
-   emit_shader_time_write(reset_type, fs_reg(1u));
-   emit(BRW_OPCODE_ENDIF);
+   add = ADD(diff, diff, fs_reg(-2u));
+   add->force_writemask_all = true;
+   end->insert_before(add);
+
+   end->insert_before(SHADER_TIME_ADD(type, diff));
+   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
+   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
+   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
+   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
 }
 
-void
-fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
-                                   fs_reg value)
+fs_inst *
+fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
 {
    int shader_time_index =
       brw_get_shader_time_index(brw, shader_prog, prog, type);
@@ -794,8 +811,8 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
    else
       payload = vgrf(glsl_type::uint_type);
 
-   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
-                             fs_reg(), payload, offset, value));
+   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
+                               fs_reg(), payload, offset, value);
 }
 
 void
@@ -809,11 +826,11 @@ fs_visitor::vfail(const char *format, va_list va)
    failed = true;
 
    msg = ralloc_vasprintf(mem_ctx, format, va);
-   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
+   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 
    this->fail_msg = msg;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (debug_enabled) {
       fprintf(stderr, "%s",  msg);
    }
 }
@@ -1562,6 +1579,17 @@ fs_visitor::emit_sampleid_setup()
    return reg;
 }
 
+void
+fs_visitor::resolve_source_modifiers(fs_reg *src)
+{
+   if (!src->abs && !src->negate)
+      return;
+
+   fs_reg temp = retype(vgrf(1), src->type);
+   emit(MOV(temp, *src));
+   *src = temp;
+}
+
 fs_reg
 fs_visitor::fix_math_operand(fs_reg src)
 {
@@ -1662,6 +1690,21 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    return inst;
 }
 
+void
+fs_visitor::emit_discard_jump()
+{
+   /* For performance, after a discard, jump to the end of the
+    * shader if all relevant channels have been discarded.
+    */
+   fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
+   discard_jump->flag_subreg = 1;
+
+   discard_jump->predicate = (dispatch_width == 8)
+                             ? BRW_PREDICATE_ALIGN1_ANY8H
+                             : BRW_PREDICATE_ALIGN1_ANY16H;
+   discard_jump->predicate_inverse = true;
+}
+
 void
 fs_visitor::assign_curb_setup()
 {
@@ -2250,8 +2293,13 @@ fs_visitor::demote_pull_constants()
         if (inst->src[i].file != UNIFORM)
            continue;
 
-         int pull_index = pull_constant_loc[inst->src[i].reg +
-                                            inst->src[i].reg_offset];
+         int pull_index;
+         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
+         if (location >= uniforms) /* Out of bounds access */
+            pull_index = -1;
+         else
+            pull_index = pull_constant_loc[location];
+
          if (pull_index == -1)
            continue;
 
@@ -2324,6 +2372,15 @@ fs_visitor::opt_algebraic()
            break;
         }
 
+         /* a * -1.0 = -a */
+         if (inst->src[1].is_negative_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].negate = !inst->src[0].negate;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
          /* a * 0.0 = 0.0 */
          if (inst->src[1].is_zero()) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -2333,6 +2390,14 @@ fs_visitor::opt_algebraic()
             break;
          }
 
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
         break;
       case BRW_OPCODE_ADD:
          if (inst->src[1].file != IMM)
@@ -2345,6 +2410,15 @@ fs_visitor::opt_algebraic()
             progress = true;
             break;
          }
+
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
          break;
       case BRW_OPCODE_OR:
          if (inst->src[0].equals(inst->src[1])) {
@@ -2392,6 +2466,7 @@ fs_visitor::opt_algebraic()
                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->src[1] = reg_undef;
+                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                      progress = true;
                   }
                   break;
@@ -2418,6 +2493,33 @@ fs_visitor::opt_algebraic()
             }
          }
          break;
+      case BRW_OPCODE_MAD:
+         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[0].is_zero()) {
+            inst->opcode = BRW_OPCODE_MUL;
+            inst->src[0] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[2].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
+            inst->src[2] = reg_undef;
+            progress = true;
+         }
+         break;
       case SHADER_OPCODE_RCP: {
          fs_inst *prev = (fs_inst *)inst->prev;
          if (prev->opcode == SHADER_OPCODE_SQRT) {
@@ -2432,8 +2534,16 @@ fs_visitor::opt_algebraic()
       default:
         break;
       }
-   }
 
+      /* Swap if src[0] is immediate. */
+      if (progress && inst->is_commutative()) {
+         if (inst->src[0].file == IMM) {
+            fs_reg tmp = inst->src[1];
+            inst->src[1] = inst->src[0];
+            inst->src[0] = tmp;
+         }
+      }
+   }
    return progress;
 }
 
@@ -2503,6 +2613,47 @@ fs_visitor::opt_register_renaming()
    return progress;
 }
 
+/**
+ * Remove redundant or useless discard jumps.
+ *
+ * For example, we can eliminate jumps in the following sequence:
+ *
+ * discard-jump       (redundant with the next jump)
+ * discard-jump       (useless; jumps to the next instruction)
+ * placeholder-halt
+ */
+bool
+fs_visitor::opt_redundant_discard_jumps()
+{
+   bool progress = false;
+
+   bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
+
+   fs_inst *placeholder_halt = NULL;
+   foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
+      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
+         placeholder_halt = inst;
+         break;
+      }
+   }
+
+   if (!placeholder_halt)
+      return false;
+
+   /* Delete any discard jumps immediately before the placeholder halt. */
+   for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
+        !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
+        prev = (fs_inst *) placeholder_halt->prev) {
+      prev->remove(last_bblock);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 bool
 fs_visitor::compute_to_mrf()
 {
@@ -2756,7 +2907,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       /* Clear out any MRF move records whose sources got overwritten. */
       if (inst->dst.file == GRF) {
-        for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
+        for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
@@ -2779,8 +2930,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 }
 
 static void
-clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
-                        int first_grf, int grf_len)
+clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
 {
    /* Clear the flag for registers that actually got read (as expected). */
    for (int i = 0; i < inst->sources; i++) {
@@ -2831,8 +2981,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
    memset(needs_dep, false, sizeof(needs_dep));
    memset(needs_dep, true, write_len);
 
-   clear_deps_for_inst_src(inst, dispatch_width,
-                           needs_dep, first_write_grf, write_len);
+   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
 
    /* Walk backwards looking for writes to registers we're writing which
     * aren't read since being written.  If we hit the start of the program,
@@ -2872,8 +3021,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
       }
 
       /* Clear the flag for registers that actually got read (as expected). */
-      clear_deps_for_inst_src(scan_inst, dispatch_width,
-                              needs_dep, first_write_grf, write_len);
+      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
 
       /* Continue the loop only if we haven't resolved all the dependencies */
       int i;
@@ -2918,8 +3066,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       }
 
       /* Clear the flag for registers that actually got read (as expected). */
-      clear_deps_for_inst_src(scan_inst, dispatch_width,
-                              needs_dep, first_write_grf, write_len);
+      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
 
       /* We insert our reads as late as possible since they're reading the
        * result of a SEND, which has massive latency.
@@ -2941,16 +3088,6 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (i == write_len)
          return;
    }
-
-   /* If we hit the end of the program, resolve all remaining dependencies out
-    * of paranoia.
-    */
-   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
-   assert(last_inst->eot);
-   for (int i = 0; i < write_len; i++) {
-      if (needs_dep[i])
-         last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
-   }
 }
 
 void
@@ -3008,7 +3145,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
          assert(const_offset_reg.file == IMM &&
                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
-         fs_reg payload = vgrf(glsl_type::uint_type);
+         fs_reg payload = fs_reg(GRF, alloc.allocate(1));
 
          /* We have to use a message header on Skylake to get SIMD4x2 mode.
           * Reserve space for the register.
@@ -3057,7 +3194,7 @@ fs_visitor::lower_load_payload()
    bool progress = false;
 
    int vgrf_to_reg[alloc.count];
-   int reg_count = 16; /* Leave room for MRF */
+   int reg_count = 0;
    for (unsigned i = 0; i < alloc.count; ++i) {
       vgrf_to_reg[i] = reg_count;
       reg_count += alloc.sizes[i];
@@ -3071,18 +3208,13 @@ fs_visitor::lower_load_payload()
    memset(metadata, 0, sizeof(metadata));
 
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
-      int dst_reg;
       if (inst->dst.file == GRF) {
-         dst_reg = vgrf_to_reg[inst->dst.reg];
-      } else {
-         /* MRF */
-         dst_reg = inst->dst.reg;
-      }
-
-      if (inst->dst.file == MRF || inst->dst.file == GRF) {
-         bool force_sechalf = inst->force_sechalf;
+         const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
+         bool force_sechalf = inst->force_sechalf &&
+                              !inst->force_writemask_all;
          bool toggle_sechalf = inst->dst.width == 16 &&
-                               type_sz(inst->dst.type) == 4;
+                               type_sz(inst->dst.type) == 4 &&
+                               !inst->force_writemask_all;
          for (int i = 0; i < inst->regs_written; ++i) {
             metadata[dst_reg + i].written = true;
             metadata[dst_reg + i].force_sechalf = force_sechalf;
@@ -3123,22 +3255,28 @@ fs_visitor::lower_load_payload()
                                 inst->src[i].reg_offset;
                   mov->force_sechalf = metadata[src_reg].force_sechalf;
                   mov->force_writemask_all = metadata[src_reg].force_writemask_all;
-                  metadata[dst_reg] = metadata[src_reg];
-                  if (dst.width * type_sz(dst.type) > 32) {
-                     assert((!metadata[src_reg].written ||
-                             !metadata[src_reg].force_sechalf) &&
-                            (!metadata[src_reg + 1].written ||
-                             metadata[src_reg + 1].force_sechalf));
-                     metadata[dst_reg + 1] = metadata[src_reg + 1];
-                  }
                } else {
-                  metadata[dst_reg].force_writemask_all = false;
-                  metadata[dst_reg].force_sechalf = false;
-                  if (dst.width == 16) {
-                     metadata[dst_reg + 1].force_writemask_all = false;
-                     metadata[dst_reg + 1].force_sechalf = true;
+                  /* We don't have any useful metadata for immediates or
+                   * uniforms.  Assume that any of the channels of the
+                   * destination may be used.
+                   */
+                  assert(inst->src[i].file == IMM ||
+                         inst->src[i].file == UNIFORM);
+                  mov->force_writemask_all = true;
+               }
+
+               if (dst.file == GRF) {
+                  const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
+                  const bool force_writemask = mov->force_writemask_all;
+                  metadata[dst_reg].force_writemask_all = force_writemask;
+                  metadata[dst_reg].force_sechalf = mov->force_sechalf;
+                  if (dst.width * type_sz(dst.type) > 32) {
+                     assert(!mov->force_sechalf);
+                     metadata[dst_reg + 1].force_writemask_all = force_writemask;
+                     metadata[dst_reg + 1].force_sechalf = !force_writemask;
                   }
                }
+
                inst->insert_before(block, mov);
             }
 
@@ -3165,7 +3303,6 @@ fs_visitor::dump_instructions()
 void
 fs_visitor::dump_instructions(const char *name)
 {
-   calculate_register_pressure();
    FILE *file = stderr;
    if (name && geteuid() != 0) {
       file = fopen(name, "w");
@@ -3173,14 +3310,23 @@ fs_visitor::dump_instructions(const char *name)
          file = stderr;
    }
 
-   int ip = 0, max_pressure = 0;
-   foreach_block_and_inst(block, backend_instruction, inst, cfg) {
-      max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
-      fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
-      dump_instruction(inst, file);
-      ++ip;
+   if (cfg) {
+      calculate_register_pressure();
+      int ip = 0, max_pressure = 0;
+      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
+         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
+         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
+         dump_instruction(inst, file);
+         ip++;
+      }
+      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
+   } else {
+      int ip = 0;
+      foreach_in_list(backend_instruction, inst, &instructions) {
+         fprintf(file, "%4d: ", ip++);
+         dump_instruction(inst, file);
+      }
    }
-   fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
 
    if (file != stderr) {
       fclose(file);
@@ -3312,9 +3458,11 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          case BRW_REGISTER_TYPE_F:
             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
             break;
+         case BRW_REGISTER_TYPE_W:
          case BRW_REGISTER_TYPE_D:
             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
             break;
+         case BRW_REGISTER_TYPE_UW:
          case BRW_REGISTER_TYPE_UD:
             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
             break;
@@ -3546,8 +3694,6 @@ fs_visitor::optimize()
 {
    const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
 
-   calculate_cfg();
-
    split_virtual_grfs();
 
    move_uniform_array_access_to_pull_constants();
@@ -3597,6 +3743,7 @@ fs_visitor::optimize()
       OPT(opt_peephole_sel);
       OPT(dead_control_flow_eliminate, this);
       OPT(opt_register_renaming);
+      OPT(opt_redundant_discard_jumps);
       OPT(opt_saturate_propagation);
       OPT(register_coalesce);
       OPT(compute_to_mrf);
@@ -3613,6 +3760,8 @@ fs_visitor::optimize()
       OPT(dead_code_eliminate);
    }
 
+   OPT(opt_combine_constants);
+
    lower_uniform_pull_constant_loads();
 }
 
@@ -3701,6 +3850,26 @@ fs_visitor::allocate_registers()
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
 }
 
+static bool
+env_var_as_boolean(const char *var_name, bool default_value)
+{
+   const char *str = getenv(var_name);
+   if (str == NULL)
+      return default_value;
+
+   if (strcmp(str, "1") == 0 ||
+       strcasecmp(str, "true") == 0 ||
+       strcasecmp(str, "yes") == 0) {
+      return true;
+   } else if (strcmp(str, "0") == 0 ||
+              strcasecmp(str, "false") == 0 ||
+              strcasecmp(str, "no") == 0) {
+      return false;
+   } else {
+      return default_value;
+   }
+}
+
 bool
 fs_visitor::run_vs()
 {
@@ -3712,17 +3881,24 @@ fs_visitor::run_vs()
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
       emit_shader_time_begin();
 
-   foreach_in_list(ir_instruction, ir, shader->base.ir) {
-      base_ir = ir;
-      this->result = reg_undef;
-      ir->accept(this);
+   if (env_var_as_boolean("INTEL_USE_NIR", false)) {
+      emit_nir_code();
+   } else {
+      foreach_in_list(ir_instruction, ir, shader->base.ir) {
+         base_ir = ir;
+         this->result = reg_undef;
+         ir->accept(this);
+      }
+      base_ir = NULL;
    }
-   base_ir = NULL;
+
    if (failed)
       return false;
 
    emit_urb_writes();
 
+   calculate_cfg();
+
    optimize();
 
    assign_curb_setup();
@@ -3779,7 +3955,7 @@ fs_visitor::run_fs()
        * functions called "main").
        */
       if (shader) {
-         if (getenv("INTEL_USE_NIR") != NULL) {
+         if (env_var_as_boolean("INTEL_USE_NIR", false)) {
             emit_nir_code();
          } else {
             foreach_in_list(ir_instruction, ir, shader->base.ir) {
@@ -3802,6 +3978,11 @@ fs_visitor::run_fs()
 
       emit_fb_writes();
 
+      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+         emit_shader_time_end();
+
+      calculate_cfg();
+
       optimize();
 
       assign_curb_setup();
@@ -3899,7 +4080,7 @@ brw_wm_fs_emit(struct brw_context *brw,
    }
 
    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
-                  &fp->Base, v.runtime_check_aads_emit, "FS");
+                  &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       char *name;