intel/fs,vec4: Stuff the constant data from NIR in the end of the program

[mesa.git] / src / intel / compiler / brw_vec4_generator.cpp
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp

index c247c988181fa17d140a05d21a9b110804f2c35d..e9142c2c65c0270ae46dbbec9bd794fe8dbfd6bb 100644 (file)
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -270,6 +270,17 @@ generate_tex(struct brw_codegen *p,
        break;
     }
  
+   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
+    * is set as part of the message descriptor.  On gen4, the PRM seems to
+    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
+    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
+    * gone from the message descriptor entirely and you just get UINT32 all
+    * the time regardless.  Since we can really only do non-UINT32 on gen4,
+    * just stomp it to UINT32 all the time.
+    */
+   if (inst->opcode == SHADER_OPCODE_TXS)
+      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+
     uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
           inst->opcode == SHADER_OPCODE_TG4_OFFSET)
           ? prog_data->base.binding_table.gather_texture_start
@@ -1498,6 +1509,7 @@ generate_code(struct brw_codegen *p,
                const nir_shader *nir,
                struct brw_vue_prog_data *prog_data,
                const struct cfg_t *cfg,
+              const performance &perf,
                struct brw_compile_stats *stats)
  {
     const struct gen_device_info *devinfo = p->devinfo;
@@ -1911,7 +1923,10 @@ generate_code(struct brw_codegen *p,
           break;
  
        case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ false,
+                          /* bti */ 0);
           send_count++;
           break;
  
@@ -2061,8 +2076,7 @@ generate_code(struct brw_codegen *p,
            *
            * where they pack the four bytes from the low and high four DW.
            */
-         assert(_mesa_is_pow_two(dst.writemask) &&
-                dst.writemask != 0);
+         assert(util_is_power_of_two_nonzero(dst.writemask));
           unsigned offset = __builtin_ctz(dst.writemask);
  
           dst.type = BRW_REGISTER_TYPE_UB;
@@ -2218,13 +2232,14 @@ generate_code(struct brw_codegen *p,
  
        fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
                       "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n",
-            stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
+            stage_abbrev, before_size / 16, loop_count, perf.latency,
              spill_count, fill_count, send_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);
  
        /* overriding the shader makes disasm_info invalid */
        if (!brw_try_override_assembly(p, 0, sha1buf)) {
-         dump_assembly(p->store, disasm_info);
+         dump_assembly(p->store, 0, p->next_insn_offset,
+                       disasm_info, perf.block_latency);
        } else {
           fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
        }
@@ -2237,14 +2252,14 @@ generate_code(struct brw_codegen *p,
                                "%d:%d spills:fills, %u sends, "
                                "compacted %d to %d bytes.",
                                stage_abbrev, before_size / 16,
-                              loop_count, cfg->cycle_count, spill_count,
+                              loop_count, perf.latency, spill_count,
                                fill_count, send_count, before_size, after_size);
     if (stats) {
        stats->dispatch_width = 0;
        stats->instructions = before_size / 16;
        stats->sends = send_count;
        stats->loops = loop_count;
-      stats->cycles = cfg->cycle_count;
+      stats->cycles = perf.latency;
        stats->spills = spill_count;
        stats->fills = fill_count;
     }
@@ -2257,13 +2272,21 @@ brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                             const nir_shader *nir,
                             struct brw_vue_prog_data *prog_data,
                             const struct cfg_t *cfg,
+                           const performance &perf,
                             struct brw_compile_stats *stats)
  {
     struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
     brw_init_codegen(compiler->devinfo, p, mem_ctx);
     brw_set_default_access_mode(p, BRW_ALIGN_16);
  
-   generate_code(p, compiler, log_data, nir, prog_data, cfg, stats);
+   generate_code(p, compiler, log_data, nir, prog_data, cfg, perf, stats);
+
+   assert(prog_data->base.const_data_size == 0);
+   if (nir->constant_data_size > 0) {
+      prog_data->base.const_data_size = nir->constant_data_size;
+      prog_data->base.const_data_offset =
+         brw_append_data(p, nir->constant_data, nir->constant_data_size, 32);
+   }
  
     return brw_get_program(p, &prog_data->base.program_size);
  }