intel/fs,vec4: Stuff the constant data from NIR in the end of the program
[mesa.git] / src / intel / compiler / brw_vec4_generator.cpp
index 15d6d290fc2faf9add988a97b8c824c47bda5b87..e9142c2c65c0270ae46dbbec9bd794fe8dbfd6bb 100644 (file)
@@ -23,7 +23,8 @@
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 #include "brw_eu.h"
-#include "common/gen_debug.h"
+#include "dev/gen_debug.h"
+#include "util/mesa-sha1.h"
 
 using namespace brw;
 
@@ -269,6 +270,17 @@ generate_tex(struct brw_codegen *p,
       break;
    }
 
+   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
+    * is set as part of the message descriptor.  On gen4, the PRM seems to
+    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
+    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
+    * gone from the message descriptor entirely and you just get UINT32 all
+    * the time regasrdless.  Since we can really only do non-UINT32 on gen4,
+    * just stomp it to UINT32 all the time.
+    */
+   if (inst->opcode == SHADER_OPCODE_TXS)
+      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+
    uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
          inst->opcode == SHADER_OPCODE_TG4_OFFSET)
          ? prog_data->base.binding_table.gather_texture_start
@@ -291,8 +303,6 @@ generate_tex(struct brw_codegen *p,
                  inst->header_size != 0,
                  BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                  return_format);
-
-      brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
    } else {
       /* Non-constant sampler index. */
 
@@ -324,17 +334,16 @@ generate_tex(struct brw_codegen *p,
          gen6_resolve_implied_move(p, &src, inst->base_mrf);
 
       /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, src, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              msg_type,
-                              1 /* rlen */,
-                              inst->mlen /* mlen */,
-                              inst->header_size != 0 /* header */,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              return_format);
+      brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, dst, src, addr,
+         brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
+         brw_sampler_desc(devinfo,
+                          0 /* surface */,
+                          0 /* sampler */,
+                          msg_type,
+                          BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                          return_format),
+         false /* EOT */);
 
       /* visitor knows more than we do about the surface limit required,
        * so has already done marking.
@@ -446,6 +455,9 @@ generate_gs_set_write_offset(struct brw_codegen *p,
       brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
               brw_imm_ud(src0.ud * src1.ud));
    } else {
+      if (src1.file == BRW_IMMEDIATE_VALUE) {
+         src1 = brw_imm_uw(src1.ud);
+      }
       brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
               retype(src1, BRW_REGISTER_TYPE_UW));
    }
@@ -777,10 +789,9 @@ generate_tcs_urb_write(struct brw_codegen *p,
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_set_dest(p, send, brw_null_reg());
    brw_set_src0(p, send, urb_header);
+   brw_set_desc(p, send, brw_message_desc(devinfo, inst->mlen, 0, true));
 
-   brw_set_message_descriptor(p, send, BRW_SFID_URB,
-                              inst->mlen /* mlen */, 0 /* rlen */,
-                              true /* header */, false /* eot */);
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
    brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
    if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
@@ -932,8 +943,21 @@ generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 
    brw_MOV(p, dst, header);
+
+   /* Uniforms will have a stride <0;4,1>, and we need to convert to <0;1,0>.
+    * Other values get <4;1,0>.
+    */
+   struct brw_reg restrided_offset;
+   if (offset.vstride == BRW_VERTICAL_STRIDE_0 &&
+       offset.width == BRW_WIDTH_4 &&
+       offset.hstride == BRW_HORIZONTAL_STRIDE_1) {
+      restrided_offset = stride(offset, 0, 1, 0);
+   } else {
+      restrided_offset = stride(offset, 4, 1, 0);
+   }
+
    /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
-   brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+   brw_MOV(p, vec2(get_element_ud(dst, 3)), restrided_offset);
 
    brw_pop_insn_state(p);
 }
@@ -953,9 +977,9 @@ generate_vec4_urb_read(struct brw_codegen *p,
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
 
-   brw_set_message_descriptor(p, send, BRW_SFID_URB,
-                              1 /* mlen */, 1 /* rlen */,
-                              true /* header */, false /* eot */);
+   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 1, true));
+
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
    brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
    brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
@@ -989,9 +1013,9 @@ generate_tcs_release_input(struct brw_codegen *p,
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_set_dest(p, send, brw_null_reg());
    brw_set_src0(p, send, header);
-   brw_set_message_descriptor(p, send, BRW_SFID_URB,
-                              1 /* mlen */, 0 /* rlen */,
-                              true /* header */, false /* eot */);
+   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 0, true));
+
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
    brw_inst_set_urb_complete(devinfo, send, 1);
    brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
@@ -1158,23 +1182,23 @@ generate_scratch_read(struct brw_codegen *p,
    const unsigned target_cache =
       devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+      BRW_SFID_DATAPORT_READ;
 
    /* Each of the 8 channel enables is considered for whether each
     * dword is written.
     */
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(devinfo, send, target_cache);
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
       brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
-   brw_set_dp_read_message(p, send,
-                           brw_scratch_surface_idx(p),
-                          BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-                          msg_type, target_cache,
-                          2, /* mlen */
-                           true, /* header_present */
-                          1 /* rlen */);
+   brw_set_desc(p, send,
+                brw_message_desc(devinfo, 2, 1, true) |
+                brw_dp_read_desc(devinfo,
+                                 brw_scratch_surface_idx(p),
+                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                                 msg_type, BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
 }
 
 static void
@@ -1188,14 +1212,14 @@ generate_scratch_write(struct brw_codegen *p,
    const unsigned target_cache =
       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+       BRW_SFID_DATAPORT_WRITE);
    struct brw_reg header = brw_vec8_grf(0, 0);
    bool write_commit;
 
    /* If the instruction is predicated, we'll predicate the send, not
     * the header setup.
     */
-   brw_set_default_predicate_control(p, false);
+   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 
    gen6_resolve_implied_move(p, &header, inst->base_mrf);
 
@@ -1240,21 +1264,19 @@ generate_scratch_write(struct brw_codegen *p,
     * dword is written.
     */
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(p->devinfo, send, target_cache);
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
       brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
-   brw_set_dp_write_message(p, send,
-                            brw_scratch_surface_idx(p),
-                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-                           msg_type,
-                            target_cache,
-                           3, /* mlen */
-                           true, /* header present */
-                           false, /* not a render target write */
-                           write_commit, /* rlen */
-                           false, /* eot */
-                           write_commit);
+   brw_set_desc(p, send,
+                brw_message_desc(devinfo, 3, write_commit, true) |
+                brw_dp_write_desc(devinfo,
+                                  brw_scratch_surface_idx(p),
+                                  BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                                  msg_type,
+                                  false, /* not a render target write */
+                                  write_commit));
 }
 
 static void
@@ -1268,7 +1290,7 @@ generate_pull_constant_load(struct brw_codegen *p,
    const struct gen_device_info *devinfo = p->devinfo;
    const unsigned target_cache =
       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
-       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
+       BRW_SFID_DATAPORT_READ);
    assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
    uint32_t surf_index = index.ud;
@@ -1306,18 +1328,17 @@ generate_pull_constant_load(struct brw_codegen *p,
     * dword is written.
     */
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(devinfo, send, target_cache);
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
       brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
-   brw_set_dp_read_message(p, send,
-                          surf_index,
-                          BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-                          msg_type,
-                           target_cache,
-                          2, /* mlen */
-                           true, /* header_present */
-                          1 /* rlen */);
+   brw_set_desc(p, send,
+                brw_message_desc(devinfo, 2, 1, true) |
+                brw_dp_read_desc(devinfo, surf_index,
+                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                                 msg_type,
+                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
 }
 
 static void
@@ -1344,8 +1365,6 @@ generate_get_buffer_size(struct brw_codegen *p,
               inst->header_size > 0,
               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
               BRW_SAMPLER_RETURN_FORMAT_SINT32);
-
-   brw_mark_surface_used(&prog_data->base, surf_index.ud);
 }
 
 static void
@@ -1356,25 +1375,21 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
                                  struct brw_reg surf_index,
                                  struct brw_reg offset)
 {
+   const struct gen_device_info *devinfo = p->devinfo;
    assert(surf_index.type == BRW_REGISTER_TYPE_UD);
 
    if (surf_index.file == BRW_IMMEDIATE_VALUE) {
 
       brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
       brw_set_dest(p, insn, dst);
       brw_set_src0(p, insn, offset);
-      brw_set_sampler_message(p, insn,
-                              surf_index.ud,
-                              0, /* LD message ignores sampler unit */
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              1, /* rlen */
-                              inst->mlen,
-                              inst->header_size != 0,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              0);
-
-      brw_mark_surface_used(&prog_data->base, surf_index.ud);
-
+      brw_set_desc(p, insn,
+                   brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
+                   brw_sampler_desc(devinfo, surf_index.ud,
+                                    0, /* LD message ignores sampler unit */
+                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                                    BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0));
    } else {
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
@@ -1385,7 +1400,7 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
 
       /* a0.0 = surf_index & 0xff */
       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
-      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
+      brw_inst_set_exec_size(devinfo, insn_and, BRW_EXECUTE_1);
       brw_set_dest(p, insn_and, addr);
       brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
@@ -1393,23 +1408,22 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
       brw_pop_insn_state(p);
 
       /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, offset, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              1 /* rlen */,
-                              inst->mlen,
-                              inst->header_size != 0,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              0);
+      brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, dst, offset, addr,
+         brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
+         brw_sampler_desc(devinfo,
+                          0 /* surface */,
+                          0 /* sampler */,
+                          GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                          BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                          0),
+         false /* EOT */);
    }
 }
 
 static void
 generate_set_simd4x2_header_gen9(struct brw_codegen *p,
-                                 vec4_instruction *inst,
+                                 vec4_instruction *,
                                  struct brw_reg dst)
 {
    brw_push_insn_state(p);
@@ -1427,9 +1441,9 @@ generate_set_simd4x2_header_gen9(struct brw_codegen *p,
 
 static void
 generate_mov_indirect(struct brw_codegen *p,
-                      vec4_instruction *inst,
+                      vec4_instruction *,
                       struct brw_reg dst, struct brw_reg reg,
-                      struct brw_reg indirect, struct brw_reg length)
+                      struct brw_reg indirect)
 {
    assert(indirect.type == BRW_REGISTER_TYPE_UD);
    assert(p->devinfo->gen >= 6);
@@ -1494,22 +1508,30 @@ generate_code(struct brw_codegen *p,
               void *log_data,
               const nir_shader *nir,
               struct brw_vue_prog_data *prog_data,
-              const struct cfg_t *cfg)
+              const struct cfg_t *cfg,
+              const performance &perf,
+              struct brw_compile_stats *stats)
 {
    const struct gen_device_info *devinfo = p->devinfo;
-   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
+   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
    bool debug_flag = INTEL_DEBUG &
-      intel_debug_flag_for_shader_stage(nir->stage);
-   struct annotation_info annotation;
-   memset(&annotation, 0, sizeof(annotation));
+      intel_debug_flag_for_shader_stage(nir->info.stage);
+   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
+
+   /* `send_count` explicitly does not include spills or fills, as we'd
+    * like to use it as a metric for intentional memory access or other
+    * shared function use.  Otherwise, subtle changes to scheduling or
+    * register allocation could cause it to fluctuate wildly - and that
+    * effect is already counted in spill/fill counts.
+    */
    int spill_count = 0, fill_count = 0;
-   int loop_count = 0;
+   int loop_count = 0, send_count = 0;
 
    foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
       struct brw_reg src[3], dst;
 
       if (unlikely(debug_flag))
-         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
+         disasm_annotate(disasm_info, inst, p->next_insn_offset);
 
       for (unsigned int i = 0; i < 3; i++) {
          src[i] = inst->src[i].as_brw_reg();
@@ -1518,23 +1540,27 @@ generate_code(struct brw_codegen *p,
 
       brw_set_default_predicate_control(p, inst->predicate);
       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
-      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
+      brw_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2);
       brw_set_default_saturate(p, inst->saturate);
       brw_set_default_mask_control(p, inst->force_writemask_all);
       brw_set_default_acc_write_control(p, inst->writes_accumulator);
 
       assert(inst->group % inst->exec_size == 0);
-      assert(inst->group % 8 == 0 ||
-             inst->dst.type == BRW_REGISTER_TYPE_DF ||
-             inst->src[0].type == BRW_REGISTER_TYPE_DF ||
-             inst->src[1].type == BRW_REGISTER_TYPE_DF ||
-             inst->src[2].type == BRW_REGISTER_TYPE_DF);
+      assert(inst->group % 4 == 0);
+
+      /* There are some instructions where the destination is 64-bit
+       * but we retype it to a smaller type. In that case, we cannot
+       * double the exec_size.
+       */
+      const bool is_df = (get_exec_type_size(inst) == 8 ||
+                          inst->dst.type == BRW_REGISTER_TYPE_DF) &&
+                         inst->opcode != VEC4_OPCODE_PICK_LOW_32BIT &&
+                         inst->opcode != VEC4_OPCODE_PICK_HIGH_32BIT &&
+                         inst->opcode != VEC4_OPCODE_SET_LOW_32BIT &&
+                         inst->opcode != VEC4_OPCODE_SET_HIGH_32BIT;
 
       unsigned exec_size = inst->exec_size;
-      if (devinfo->gen == 7 &&
-          !devinfo->is_haswell &&
-          (get_exec_type_size(inst) == 8 ||
-           inst->dst.type == BRW_REGISTER_TYPE_DF))
+      if (devinfo->gen == 7 && !devinfo->is_haswell && is_df)
          exec_size *= 2;
 
       brw_set_default_exec_size(p, cvt(exec_size) - 1);
@@ -1642,27 +1668,25 @@ generate_code(struct brw_codegen *p,
 
       case BRW_OPCODE_BFREV:
          assert(devinfo->gen >= 7);
-         /* BFREV only supports UD type for src and dst. */
          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                    retype(src[0], BRW_REGISTER_TYPE_UD));
          break;
       case BRW_OPCODE_FBH:
          assert(devinfo->gen >= 7);
-         /* FBH only supports UD type for dst. */
-         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         brw_FBH(p, retype(dst, src[0].type), src[0]);
          break;
       case BRW_OPCODE_FBL:
          assert(devinfo->gen >= 7);
-         /* FBL only supports UD type for dst. */
-         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
+                 retype(src[0], BRW_REGISTER_TYPE_UD));
          break;
       case BRW_OPCODE_LZD:
          brw_LZD(p, dst, src[0]);
          break;
       case BRW_OPCODE_CBIT:
          assert(devinfo->gen >= 7);
-         /* CBIT only supports UD type for dst. */
-         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
+                  retype(src[0], BRW_REGISTER_TYPE_UD));
          break;
       case BRW_OPCODE_ADDC:
          assert(devinfo->gen >= 7);
@@ -1741,6 +1765,7 @@ generate_code(struct brw_codegen *p,
             generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
          } else {
             generate_math1_gen4(p, inst, dst, src[0]);
+            send_count++;
          }
          break;
 
@@ -1754,6 +1779,7 @@ generate_code(struct brw_codegen *p,
             generate_math_gen6(p, inst, dst, src[0], src[1]);
          } else {
             generate_math2_gen4(p, inst, dst, src[0], src[1]);
+            send_count++;
          }
          break;
 
@@ -1768,12 +1794,19 @@ generate_code(struct brw_codegen *p,
       case SHADER_OPCODE_TG4:
       case SHADER_OPCODE_TG4_OFFSET:
       case SHADER_OPCODE_SAMPLEINFO:
-         generate_tex(p, prog_data, nir->stage,
+         generate_tex(p, prog_data, nir->info.stage,
                       inst, dst, src[0], src[1], src[2]);
+         send_count++;
+         break;
+
+      case SHADER_OPCODE_GET_BUFFER_SIZE:
+         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case VS_OPCODE_URB_WRITE:
          generate_vs_urb_write(p, inst);
+         send_count++;
          break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
@@ -1788,31 +1821,31 @@ generate_code(struct brw_codegen *p,
 
       case VS_OPCODE_PULL_CONSTANT_LOAD:
          generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
          generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
          generate_set_simd4x2_header_gen9(p, inst, dst);
          break;
 
-
-      case VS_OPCODE_GET_BUFFER_SIZE:
-         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
-         break;
-
       case GS_OPCODE_URB_WRITE:
          generate_gs_urb_write(p, inst);
+         send_count++;
          break;
 
       case GS_OPCODE_URB_WRITE_ALLOCATE:
          generate_gs_urb_write_allocate(p, inst);
+         send_count++;
          break;
 
       case GS_OPCODE_SVB_WRITE:
          generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case GS_OPCODE_SVB_SET_DST_INDEX:
@@ -1821,6 +1854,7 @@ generate_code(struct brw_codegen *p,
 
       case GS_OPCODE_THREAD_END:
          generate_gs_thread_end(p, inst);
+         send_count++;
          break;
 
       case GS_OPCODE_SET_WRITE_OFFSET:
@@ -1833,6 +1867,7 @@ generate_code(struct brw_codegen *p,
 
       case GS_OPCODE_FF_SYNC:
          generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
@@ -1862,53 +1897,42 @@ generate_code(struct brw_codegen *p,
       case SHADER_OPCODE_SHADER_TIME_ADD:
          brw_shader_time_add(p, src[0],
                              prog_data->base.binding_table.shader_time_start);
-         brw_mark_surface_used(&prog_data->base,
-                               prog_data->base.binding_table.shader_time_start);
+         send_count++;
          break;
 
-      case SHADER_OPCODE_UNTYPED_ATOMIC:
+      case VEC4_OPCODE_UNTYPED_ATOMIC:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
-                            !inst->dst.is_null());
+                            !inst->dst.is_null(), inst->header_size);
+         send_count++;
          break;
 
-      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+      case VEC4_OPCODE_UNTYPED_SURFACE_READ:
+         assert(!inst->header_size);
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                   src[2].ud);
+         send_count++;
          break;
 
-      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+      case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
-                                   src[2].ud);
-         break;
-
-      case SHADER_OPCODE_TYPED_ATOMIC:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
-                          !inst->dst.is_null());
-         break;
-
-      case SHADER_OPCODE_TYPED_SURFACE_READ:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
-                                src[2].ud);
-         break;
-
-      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_surface_write(p, src[0], src[1], inst->mlen,
-                                 src[2].ud);
+                                   src[2].ud, inst->header_size);
+         send_count++;
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ false,
+                          /* bti */ 0);
+         send_count++;
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
          const struct brw_reg mask =
-            brw_stage_has_packed_dispatch(devinfo, nir->stage,
+            brw_stage_has_packed_dispatch(devinfo, nir->info.stage,
                                           &prog_data->base) ? brw_imm_ud(~0u) :
             brw_dmask_reg();
          brw_find_live_channel(p, dst, mask);
@@ -1940,22 +1964,43 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
-      case VEC4_OPCODE_FROM_DOUBLE: {
+      case VEC4_OPCODE_DOUBLE_TO_F32:
+      case VEC4_OPCODE_DOUBLE_TO_D32:
+      case VEC4_OPCODE_DOUBLE_TO_U32: {
          assert(type_sz(src[0].type) == 8);
-         assert(type_sz(dst.type) == 4);
+         assert(type_sz(dst.type) == 8);
+
+         brw_reg_type dst_type;
+
+         switch (inst->opcode) {
+         case VEC4_OPCODE_DOUBLE_TO_F32:
+            dst_type = BRW_REGISTER_TYPE_F;
+            break;
+         case VEC4_OPCODE_DOUBLE_TO_D32:
+            dst_type = BRW_REGISTER_TYPE_D;
+            break;
+         case VEC4_OPCODE_DOUBLE_TO_U32:
+            dst_type = BRW_REGISTER_TYPE_UD;
+            break;
+         default:
+            unreachable("Not supported conversion");
+         }
+         dst = retype(dst, dst_type);
 
          brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
-         dst.width = BRW_WIDTH_4;
-         src[0].vstride = BRW_VERTICAL_STRIDE_4;
-         src[0].width = BRW_WIDTH_4;
-         brw_MOV(p, dst, src[0]);
+         /* When converting from DF->F, we set destination's stride as 2 as an
+          * aligment requirement. But in IVB/BYT, each DF implicitly writes
+          * two floats, being the first one the converted value. So we don't
+          * need to explicitly set stride 2, but 1.
+          */
+         struct brw_reg spread_dst;
+         if (devinfo->gen == 7 && !devinfo->is_haswell)
+            spread_dst = stride(dst, 8, 4, 1);
+         else
+            spread_dst = stride(dst, 8, 4, 2);
 
-         struct brw_reg dst_as_src = dst;
-         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
-         dst.width = BRW_WIDTH_8;
-         brw_MOV(p, dst, dst_as_src);
+         brw_MOV(p, spread_dst, src[0]);
 
          brw_set_default_access_mode(p, BRW_ALIGN_16);
          break;
@@ -1989,9 +2034,7 @@ generate_code(struct brw_codegen *p,
          src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
          if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
             src[0] = suboffset(src[0], 1);
-         src[0].vstride = BRW_VERTICAL_STRIDE_8;
-         src[0].width = BRW_WIDTH_4;
-         src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
+         src[0] = spread(src[0], 2);
          brw_MOV(p, dst, src[0]);
 
          brw_set_default_access_mode(p, BRW_ALIGN_16);
@@ -2014,9 +2057,6 @@ generate_code(struct brw_codegen *p,
          dst.hstride = BRW_HORIZONTAL_STRIDE_2;
 
          src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
-         src[0].vstride = BRW_VERTICAL_STRIDE_4;
-         src[0].width = BRW_WIDTH_4;
-         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
          brw_MOV(p, dst, src[0]);
 
          brw_set_default_access_mode(p, BRW_ALIGN_16);
@@ -2036,8 +2076,7 @@ generate_code(struct brw_codegen *p,
           *
           * where they pack the four bytes from the low and high four DW.
           */
-         assert(_mesa_is_pow_two(dst.writemask) &&
-                dst.writemask != 0);
+         assert(util_is_power_of_two_nonzero(dst.writemask));
          unsigned offset = __builtin_ctz(dst.writemask);
 
          dst.type = BRW_REGISTER_TYPE_UB;
@@ -2067,10 +2106,12 @@ generate_code(struct brw_codegen *p,
 
       case TCS_OPCODE_URB_WRITE:
          generate_tcs_urb_write(p, inst, src[0]);
+         send_count++;
          break;
 
       case VEC4_OPCODE_URB_READ:
          generate_vec4_urb_read(p, inst, dst, src[0]);
+         send_count++;
          break;
 
       case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
@@ -2112,19 +2153,22 @@ generate_code(struct brw_codegen *p,
 
       case TCS_OPCODE_RELEASE_INPUT:
          generate_tcs_release_input(p, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case TCS_OPCODE_THREAD_END:
          generate_tcs_thread_end(p, inst);
+         send_count++;
          break;
 
       case SHADER_OPCODE_BARRIER:
          brw_barrier(p, src[0]);
          brw_WAIT(p);
+         send_count++;
          break;
 
       case SHADER_OPCODE_MOV_INDIRECT:
-         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
+         generate_mov_indirect(p, inst, dst, src[0], src[1]);
          break;
 
       case BRW_OPCODE_DIM:
@@ -2157,43 +2201,68 @@ generate_code(struct brw_codegen *p,
    }
 
    brw_set_uip_jip(p, 0);
-   annotation_finalize(&annotation, p->next_insn_offset);
+
+   /* end of program sentinel */
+   disasm_new_inst_group(disasm_info, p->next_insn_offset);
 
 #ifndef NDEBUG
-   bool validated = brw_validate_instructions(p, 0, &annotation);
+   bool validated =
 #else
    if (unlikely(debug_flag))
-      brw_validate_instructions(p, 0, &annotation);
 #endif
+      brw_validate_instructions(devinfo, p->store,
+                                0, p->next_insn_offset,
+                                disasm_info);
 
    int before_size = p->next_insn_offset;
-   brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
+   brw_compact_instructions(p, 0, disasm_info);
    int after_size = p->next_insn_offset;
 
    if (unlikely(debug_flag)) {
-      fprintf(stderr, "Native code for %s %s shader %s:\n",
-              nir->info->label ? nir->info->label : "unnamed",
-              _mesa_shader_stage_to_string(nir->stage), nir->info->name);
+      unsigned char sha1[21];
+      char sha1buf[41];
+
+      _mesa_sha1_compute(p->store, p->next_insn_offset, sha1);
+      _mesa_sha1_format(sha1buf, sha1);
+
+      fprintf(stderr, "Native code for %s %s shader %s (sha1 %s):\n",
+            nir->info.label ? nir->info.label : "unnamed",
+            _mesa_shader_stage_to_string(nir->info.stage), nir->info.name,
+            sha1buf);
 
       fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
-                      "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
-              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
-              spill_count, fill_count, before_size, after_size,
-              100.0f * (before_size - after_size) / before_size);
-
-      dump_assembly(p->store, annotation.ann_count, annotation.ann,
-                    p->devinfo);
-      ralloc_free(annotation.mem_ctx);
+                     "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n",
+            stage_abbrev, before_size / 16, loop_count, perf.latency,
+            spill_count, fill_count, send_count, before_size, after_size,
+            100.0f * (before_size - after_size) / before_size);
+
+      /* overriding the shader makes disasm_info invalid */
+      if (!brw_try_override_assembly(p, 0, sha1buf)) {
+         dump_assembly(p->store, 0, p->next_insn_offset,
+                       disasm_info, perf.block_latency);
+      } else {
+         fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
+      }
    }
+   ralloc_free(disasm_info);
    assert(validated);
 
    compiler->shader_debug_log(log_data,
                               "%s vec4 shader: %d inst, %d loops, %u cycles, "
-                              "%d:%d spills:fills, compacted %d to %d bytes.",
+                              "%d:%d spills:fills, %u sends, "
+                              "compacted %d to %d bytes.",
                               stage_abbrev, before_size / 16,
-                              loop_count, cfg->cycle_count, spill_count,
-                              fill_count, before_size, after_size);
-
+                              loop_count, perf.latency, spill_count,
+                              fill_count, send_count, before_size, after_size);
+   if (stats) {
+      stats->dispatch_width = 0;
+      stats->instructions = before_size / 16;
+      stats->sends = send_count;
+      stats->loops = loop_count;
+      stats->cycles = perf.latency;
+      stats->spills = spill_count;
+      stats->fills = fill_count;
+   }
 }
 
 extern "C" const unsigned *
@@ -2203,13 +2272,21 @@ brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                            const nir_shader *nir,
                            struct brw_vue_prog_data *prog_data,
                            const struct cfg_t *cfg,
-                           unsigned *out_assembly_size)
+                           const performance &perf,
+                           struct brw_compile_stats *stats)
 {
    struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
    brw_init_codegen(compiler->devinfo, p, mem_ctx);
    brw_set_default_access_mode(p, BRW_ALIGN_16);
 
-   generate_code(p, compiler, log_data, nir, prog_data, cfg);
+   generate_code(p, compiler, log_data, nir, prog_data, cfg, perf, stats);
+
+   assert(prog_data->base.const_data_size == 0);
+   if (nir->constant_data_size > 0) {
+      prog_data->base.const_data_size = nir->constant_data_size;
+      prog_data->base.const_data_offset =
+         brw_append_data(p, nir->constant_data, nir->constant_data_size, 32);
+   }
 
-   return brw_get_program(p, out_assembly_size);
+   return brw_get_program(p, &prog_data->base.program_size);
 }