intel/fs,vec4: Stuff the constant data from NIR in the end of the program
[mesa.git] / src / intel / compiler / brw_fs_generator.cpp
index d7464c952971c7484772b0b47add1a811df1bdd2..74c6cd3047411a7f9a113622e1acb0063e16f954 100644 (file)
@@ -224,25 +224,27 @@ public:
 bool
 fs_generator::patch_discard_jumps_to_fb_writes()
 {
-   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+   if (this->discard_halt_patches.is_empty())
       return false;
 
    int scale = brw_jump_scale(p->devinfo);
 
-   /* There is a somewhat strange undocumented requirement of using
-    * HALT, according to the simulator.  If some channel has HALTed to
-    * a particular UIP, then by the end of the program, every channel
-    * must have HALTed to that UIP.  Furthermore, the tracking is a
-    * stack, so you can't do the final halt of a UIP after starting
-    * halting to a new UIP.
-    *
-    * Symptoms of not emitting this instruction on actual hardware
-    * included GPU hangs and sparkly rendering on the piglit discard
-    * tests.
-    */
-   brw_inst *last_halt = gen6_HALT(p);
-   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
-   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   if (devinfo->gen >= 6) {
+      /* There is a somewhat strange undocumented requirement of using
+       * HALT, according to the simulator.  If some channel has HALTed to
+       * a particular UIP, then by the end of the program, every channel
+       * must have HALTed to that UIP.  Furthermore, the tracking is a
+       * stack, so you can't do the final halt of a UIP after starting
+       * halting to a new UIP.
+       *
+       * Symptoms of not emitting this instruction on actual hardware
+       * included GPU hangs and sparkly rendering on the piglit discard
+       * tests.
+       */
+      brw_inst *last_halt = brw_HALT(p);
+      brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+      brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   }
 
    int ip = p->nr_insn;
 
@@ -250,11 +252,67 @@ fs_generator::patch_discard_jumps_to_fb_writes()
       brw_inst *patch = &p->store[patch_ip->ip];
 
       assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
-      /* HALT takes a half-instruction distance from the pre-incremented IP. */
-      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      if (devinfo->gen >= 6) {
+         /* HALT takes a half-instruction distance from the pre-incremented IP. */
+         brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      } else {
+         brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
+      }
    }
 
    this->discard_halt_patches.make_empty();
+
+   if (devinfo->gen < 6) {
+      /* From the g965 PRM:
+       *
+       *    "As DMask is not automatically reloaded into AMask upon completion
+       *    of this instruction, software has to manually restore AMask upon
+       *    completion."
+       *
+       * DMask lives in the bottom 16 bits of sr0.1.
+       */
+      brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
+                                   retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
+      brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
+      brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
+      brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
+      brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
+   }
+
+   if (devinfo->gen == 4 && !devinfo->is_g4x) {
+      /* From the g965 PRM:
+       *
+       *    "[DevBW, DevCL] Erratum: The subfields in mask stack register are
+       *    reset to zero during graphics reset, however, they are not
+       *    initialized at thread dispatch. These subfields will retain the
+       *    values from the previous thread. Software should make sure the
+       *    mask stack is empty (reset to zero) before terminating the thread.
+       *    In case that this is not practical, software may have to reset the
+       *    mask stack at the beginning of each kernel, which will impact the
+       *    performance."
+       *
+       * Luckily we can rely on:
+       *
+       *    "[DevBW, DevCL] This register access restriction is not
+       *    applicable, hardware does ensure execution pipeline coherency,
+       *    when a mask stack register is used as an explicit source and/or
+       *    destination."
+       */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_2);
+      brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_16);
+      /* Reset the if stack. */
+      brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
+              brw_imm_uw(0));
+
+      brw_pop_insn_state(p);
+   }
+
    return true;
 }
 
@@ -410,7 +468,15 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
 
       reg.nr = imm_byte_offset / REG_SIZE;
       reg.subnr = imm_byte_offset % REG_SIZE;
-      brw_MOV(p, dst, reg);
+      if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) {
+         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
+                    subscript(reg, BRW_REGISTER_TYPE_D, 0));
+         brw_set_default_swsb(p, tgl_swsb_null());
+         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
+                    subscript(reg, BRW_REGISTER_TYPE_D, 1));
+      } else {
+         brw_MOV(p, dst, reg);
+      }
    } else {
       /* Prior to Broadwell, there are only 8 address registers. */
       assert(inst->exec_size <= 8 || devinfo->gen >= 8);
@@ -1362,14 +1428,12 @@ fs_generator::generate_ddy(const fs_inst *inst,
 void
 fs_generator::generate_discard_jump(fs_inst *)
 {
-   assert(devinfo->gen >= 6);
-
    /* This HALT will be patched up at FB write time to point UIP at the end of
     * the program, and at brw_uip_jip() JIP will be set to the end of the
     * current block (or the program).
     */
    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
-   gen6_HALT(p);
+   brw_HALT(p);
 }
 
 void
@@ -1715,11 +1779,11 @@ fs_generator::enable_debug(const char *shader_name)
 int
 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                             struct shader_stats shader_stats,
+                            const brw::performance &perf,
                             struct brw_compile_stats *stats)
 {
    /* align to 64 byte boundary. */
-   while (p->next_insn_offset % 64)
-      brw_NOP(p);
+   brw_realign(p, 64);
 
    this->dispatch_width = dispatch_width;
 
@@ -2216,25 +2280,50 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
          generate_shader_time_add(inst, src[0], src[1], src[2]);
          break;
 
+      case SHADER_OPCODE_INTERLOCK:
       case SHADER_OPCODE_MEMORY_FENCE: {
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         const unsigned sends =
-            brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud,
-                             src[2].ud);
-         send_count += sends;
+
+         const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+            BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
+
+         brw_memory_fence(p, dst, src[0], send_op,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ src[1].ud,
+                          /* bti */ src[2].ud);
+         send_count++;
          break;
       }
 
       case FS_OPCODE_SCHEDULING_FENCE:
-         if (unlikely(debug_flag))
-            disasm_info->use_tail = true;
-         break;
+         if (inst->sources == 0 && inst->sched.regdist == 0 &&
+                                   inst->sched.mode == TGL_SBID_NULL) {
+            if (unlikely(debug_flag))
+               disasm_info->use_tail = true;
+            break;
+         }
+
+         if (devinfo->gen >= 12) {
+            /* Use the available SWSB information to stall.  A single SYNC is
+             * sufficient since if there were multiple dependencies, the
+             * scoreboard algorithm already injected other SYNCs before this
+             * instruction.
+             */
+            brw_SYNC(p, TGL_SYNC_NOP);
+         } else {
+            for (unsigned i = 0; i < inst->sources; i++) {
+               /* Emit a MOV to force a stall until the instruction producing the
+                * registers finishes.
+                */
+               brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
+                       retype(src[i], BRW_REGISTER_TYPE_UW));
+            }
+
+            if (inst->sources > 1)
+               multiple_instructions_emitted = true;
+         }
 
-      case SHADER_OPCODE_INTERLOCK:
-         assert(devinfo->gen >= 9);
-         /* The interlock is basically a memory fence issued via sendc */
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
@@ -2462,7 +2551,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
               "Compacted %d to %d bytes (%.0f%%)\n",
               shader_name, sha1buf,
               dispatch_width, before_size / 16,
-              loop_count, cfg->cycle_count,
+              loop_count, perf.latency,
               spill_count, fill_count, send_count,
               shader_stats.scheduler_mode,
               shader_stats.promoted_constants,
@@ -2471,7 +2560,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
 
       /* overriding the shader makes disasm_info invalid */
       if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
-         dump_assembly(p->store, disasm_info);
+         dump_assembly(p->store, start_offset, p->next_insn_offset,
+                       disasm_info, perf.block_latency);
       } else {
          fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
       }
@@ -2487,7 +2577,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                               "compacted %d to %d bytes.",
                               _mesa_shader_stage_to_abbrev(stage),
                               dispatch_width, before_size / 16 - nop_count,
-                              loop_count, cfg->cycle_count,
+                              loop_count, perf.latency,
                               spill_count, fill_count, send_count,
                               shader_stats.scheduler_mode,
                               shader_stats.promoted_constants,
@@ -2497,7 +2587,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
       stats->instructions = before_size / 16 - nop_count;
       stats->sends = send_count;
       stats->loops = loop_count;
-      stats->cycles = cfg->cycle_count;
+      stats->cycles = perf.latency;
       stats->spills = spill_count;
       stats->fills = fill_count;
    }
@@ -2505,6 +2595,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
    return start_offset;
 }
 
+void
+fs_generator::add_const_data(void *data, unsigned size)
+{
+   assert(prog_data->const_data_size == 0);
+   if (size > 0) {
+      prog_data->const_data_size = size;
+      prog_data->const_data_offset = brw_append_data(p, data, size, 32);
+   }
+}
+
 const unsigned *
 fs_generator::get_assembly()
 {