intel/fs: Add a generic SEND opcode
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
index b3f7e877c80ae79627e76a889d8aebd118256ec0..bc4c2dc5cdbda9b9f7fad6c5f8b2019d00d1e618 100644
--- a/src/intel/compiler/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -94,8 +94,6 @@ public:
     * successors is an exit node.
     */
    schedule_node *exit;
-
-   bool is_barrier;
 };
 
 /**
@@ -371,6 +369,7 @@ schedule_node::set_latency_gen7(bool is_haswell)
       break;
 
    case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
    case SHADER_OPCODE_TYPED_ATOMIC:
       /* Test code:
        *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
@@ -415,6 +414,13 @@ schedule_node::set_latency_gen7(bool is_haswell)
       latency = is_haswell ? 300 : 600;
       break;
 
+   case SHADER_OPCODE_SEND:
+      switch (inst->sfid) {
+      default:
+         unreachable("Unknown SFID");
+      }
+      break;
+
    default:
       /* 2 cycles:
        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
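The new SHADER_OPCODE_SEND case keys latency on inst->sfid rather than on a
message-specific opcode, so each shared function the generic SEND grows to
handle will need its own entry; until then the empty switch makes any SFID
hit the unreachable().  A sketch of a filled-in case, assuming the sampler
is wired up first (BRW_SFID_SAMPLER is the real shared-function ID; the
latency value is a placeholder, not a measurement):

   case SHADER_OPCODE_SEND:
      switch (inst->sfid) {
      case BRW_SFID_SAMPLER:
         /* Placeholder figure only; a real entry would be derived from
          * test code like the timing blocks earlier in this function.
          */
         latency = 200;
         break;
      default:
         unreachable("Unknown SFID");
      }
      break;
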
@@ -431,7 +437,7 @@ schedule_node::set_latency_gen7(bool is_haswell)
 class instruction_scheduler {
 public:
    instruction_scheduler(backend_shader *s, int grf_count,
-                         int hw_reg_count, int block_count,
+                         unsigned hw_reg_count, int block_count,
                          instruction_scheduler_mode mode)
    {
       this->bs = s;
@@ -512,7 +518,7 @@ public:
    bool post_reg_alloc;
    int instructions_to_schedule;
    int grf_count;
-   int hw_reg_count;
+   unsigned hw_reg_count;
    int reg_pressure;
    int block_idx;
    exec_list instructions;
@@ -666,7 +672,7 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
    int payload_last_use_ip[hw_reg_count];
    v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
 
-   for (int i = 0; i < hw_reg_count; i++) {
+   for (unsigned i = 0; i < hw_reg_count; i++) {
       if (payload_last_use_ip[i] == -1)
          continue;
 
@@ -765,22 +771,22 @@ vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
 }
 
 void
-vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
 {
 }
 
 void
-vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
+vec4_instruction_scheduler::setup_liveness(cfg_t *)
 {
 }
 
 void
-vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
+vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
 {
 }
 
 int
-vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
 {
    return 0;
 }
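Leaving the parameters unnamed in these empty vec4 stubs is the idiomatic
C++ way to mark an argument as intentionally unused: an unnamed parameter
generates no -Wunused-parameter warning and needs no (void) cast.  A
minimal standalone illustration (hypothetical type, not from this file):

   struct scheduler_hooks {
      /* The parameter is unnamed because this stub ignores it by
       * design; the compiler then emits no unused-parameter warning.
       */
      virtual void update_register_pressure(int /* ip */) {}
   };
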
@@ -800,7 +806,6 @@ schedule_node::schedule_node(backend_instruction *inst,
    this->cand_generation = 0;
    this->delay = 0;
    this->exit = NULL;
-   this->is_barrier = false;
 
    /* We can't measure Gen6 timings directly but expect them to be much
     * closer to Gen7 than Gen4.
@@ -921,6 +926,14 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
    add_dep(before, after, before->latency);
 }
 
+static bool
+is_scheduling_barrier(const backend_instruction *inst)
+{
+   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+          inst->is_control_flow() ||
+          inst->has_side_effects();
+}
+
 /**
  * Sometimes we really want this node to execute after everything that
  * was before it and before everything that followed it.  This adds
@@ -932,12 +945,10 @@ instruction_scheduler::add_barrier_deps(schedule_node *n)
    schedule_node *prev = (schedule_node *)n->prev;
    schedule_node *next = (schedule_node *)n->next;
 
-   n->is_barrier = true;
-
    if (prev) {
       while (!prev->is_head_sentinel()) {
          add_dep(prev, n, 0);
-         if (prev->is_barrier)
+         if (is_scheduling_barrier(prev->inst))
             break;
          prev = (schedule_node *)prev->prev;
       }
@@ -946,7 +957,7 @@ instruction_scheduler::add_barrier_deps(schedule_node *n)
    if (next) {
       while (!next->is_tail_sentinel()) {
          add_dep(n, next, 0);
-         if (next->is_barrier)
+         if (is_scheduling_barrier(next->inst))
             break;
          next = (schedule_node *)next->next;
       }
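With the cached is_barrier flag gone, barrier status is recomputed from the
instruction itself on every visit, and both walks keep their early exit:
once a scheduling barrier is reached in either direction, that barrier
already carries dependency edges to everything beyond it, so the single
edge added to the barrier orders n against the rest transitively.  The
backward walk restated with that invariant spelled out (same names as the
hunk above):

   while (!prev->is_head_sentinel()) {
      add_dep(prev, n, 0);
      /* A barrier node already has incoming edges from every node
       * before it, so ordering against anything past this point is
       * implied transitively and the walk can stop.
       */
      if (is_scheduling_barrier(prev->inst))
         break;
      prev = (schedule_node *)prev->prev;
   }
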
@@ -962,14 +973,6 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
    return inst->exec_size == 16;
 }
 
-static bool
-is_scheduling_barrier(const fs_inst *inst)
-{
-   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
-          inst->is_control_flow() ||
-          inst->has_side_effects();
-}
-
 void
 fs_instruction_scheduler::calculate_deps()
 {
@@ -977,9 +980,9 @@ fs_instruction_scheduler::calculate_deps()
     * After register allocation, reg_offsets are gone and we track individual
     * GRF registers.
     */
-   schedule_node *last_grf_write[grf_count * 16];
+   schedule_node **last_grf_write;
    schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
-   schedule_node *last_conditional_mod[4] = {};
+   schedule_node *last_conditional_mod[8] = {};
    schedule_node *last_accumulator_write = NULL;
    /* Fixed HW registers are assumed to be separate from the virtual
     * GRFs, so they can be tracked separately.  We don't really write
@@ -988,7 +991,7 @@ fs_instruction_scheduler::calculate_deps()
     */
    schedule_node *last_fixed_grf_write = NULL;
 
-   memset(last_grf_write, 0, sizeof(last_grf_write));
+   last_grf_write = (schedule_node **)calloc(grf_count * 16, sizeof(schedule_node *));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 
    /* top-to-bottom dependencies: RAW and WAW. */
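last_grf_write moves off the stack because it holds grf_count * 16
pointers, and before register allocation grf_count is the virtual GRF
count, which can be large enough on big shaders to make a stack VLA risky.
The table is calloc'ed once, re-zeroed for the bottom-to-top WAR pass, and
released by the free(last_grf_write) hunk below.  An equivalent allocation
with automatic cleanup, noted as a design alternative rather than what the
patch does (mesa's compiler core sticks to C-style allocation):

   /* Zero-initialized like the calloc, and released automatically when
    * calculate_deps() returns.
    */
   std::vector<schedule_node *> last_grf_write(grf_count * 16, nullptr);
   /* The WAR pass reuses the table; this replaces the second memset. */
   std::fill(last_grf_write.begin(), last_grf_write.end(), nullptr);
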
@@ -1115,7 +1118,7 @@ fs_instruction_scheduler::calculate_deps()
    }
 
    /* bottom-to-top dependencies: WAR */
-   memset(last_grf_write, 0, sizeof(last_grf_write));
+   memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
    last_accumulator_write = NULL;
@@ -1231,13 +1234,8 @@ fs_instruction_scheduler::calculate_deps()
          last_accumulator_write = n;
       }
    }
-}
 
-static bool
-is_scheduling_barrier(const vec4_instruction *inst)
-{
-   return inst->is_control_flow() ||
-          inst->has_side_effects();
+   free(last_grf_write);
 }
 
 void
@@ -1279,6 +1277,9 @@ vec4_instruction_scheduler::calculate_deps()
          }
       }
 
+      if (inst->reads_g0_implicitly())
+         add_dep(last_fixed_grf_write, n);
+
       if (!inst->is_send_from_grf()) {
          for (int i = 0; i < inst->mlen; i++) {
             /* It looks like the MRF regs are released in the send
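The new g0 check above closes a scheduling hole: a vec4 instruction whose
message header is implicitly sourced from g0 must not be hoisted above an
earlier write to the fixed GRF file.  The same two lines with the intent
spelled out (assuming add_dep() ignores a NULL 'before', as the other
last_fixed_grf_write callers in this file rely on):

   /* If the message header is built from g0 behind our back, order the
    * instruction after the last write to the fixed GRFs.  When no such
    * write has been seen, last_fixed_grf_write is NULL and add_dep()
    * does nothing.
    */
   if (inst->reads_g0_implicitly())
      add_dep(last_fixed_grf_write, n);
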
@@ -1555,14 +1556,15 @@ vec4_instruction_scheduler::choose_instruction_to_schedule()
 int
 fs_instruction_scheduler::issue_time(backend_instruction *inst)
 {
+   const unsigned overhead = v->bank_conflict_cycles((fs_inst *)inst);
    if (is_compressed((fs_inst *)inst))
-      return 4;
+      return 4 + overhead;
    else
-      return 2;
+      return 2 + overhead;
 }
 
 int
-vec4_instruction_scheduler::issue_time(backend_instruction *inst)
+vec4_instruction_scheduler::issue_time(backend_instruction *)
 {
    /* We always execute as two vec4s in parallel. */
    return 2;
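
Folding bank_conflict_cycles() into issue_time() makes the scheduler's
clock charge each fs instruction for the register-bank-conflict stall
computed for it, giving the scheduler a reason to keep conflicting
instructions apart.  A standalone restatement with the constants named
(hypothetical helper; the real code casts to fs_inst and asks the visitor):

   unsigned issue_cycles(bool compressed, unsigned conflict_cycles)
   {
      /* Mirrors fs_instruction_scheduler::issue_time(): compressed
       * (SIMD16) instructions take 4 issue cycles, others 2, plus the
       * modeled bank-conflict stall for this instruction.
       */
      return (compressed ? 4 : 2) + conflict_cycles;
   }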