intel/fs: Add a generic SEND opcode
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
index b3f7e877c80ae79627e76a889d8aebd118256ec0..bc4c2dc5cdbda9b9f7fad6c5f8b2019d00d1e618 100644
--- a/src/intel/compiler/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -94,8 +94,6 @@ public:
     * successors is an exit node.
     */
    schedule_node *exit;
-
-   bool is_barrier;
 };
 
 /**
@@ -371,6 +369,7 @@ schedule_node::set_latency_gen7(bool is_haswell)
       break;
 
    case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
    case SHADER_OPCODE_TYPED_ATOMIC:
       /* Test code:
        *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
@@ -415,6 +414,13 @@ schedule_node::set_latency_gen7(bool is_haswell)
       latency = is_haswell ? 300 : 600;
       break;
 
+   case SHADER_OPCODE_SEND:
+      switch (inst->sfid) {
+      default:
+         unreachable("Unknown SFID");
+      }
+      break;
+
    default:
       /* 2 cycles:
        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
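The new SHADER_OPCODE_SEND case keys latency on inst->sfid rather than on a
message-specific opcode, so each shared function the generic SEND grows to
handle will need its own entry; until then the empty switch makes any SFID
hit the unreachable().  A sketch of a filled-in case, assuming the sampler
is wired up first (BRW_SFID_SAMPLER is the real shared-function ID; the
latency value is a placeholder, not a measurement):

   case SHADER_OPCODE_SEND:
      switch (inst->sfid) {
      case BRW_SFID_SAMPLER:
         /* Placeholder figure only; a real entry would be derived from
          * test code like the timing blocks earlier in this function.
          */
         latency = 200;
         break;
      default:
         unreachable("Unknown SFID");
      }
      break;
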
@@ -431,7 +437,7 @@ schedule_node::set_latency_gen7(bool is_haswell)
 class instruction_scheduler {
 public:
    instruction_scheduler(backend_shader *s, int grf_count,
-                         int hw_reg_count, int block_count,
+                         unsigned hw_reg_count, int block_count,
                          instruction_scheduler_mode mode)
    {
       this->bs = s;
@@ -512,7 +518,7 @@ public:
    bool post_reg_alloc;
    int instructions_to_schedule;
    int grf_count;
-   int hw_reg_count;
+   unsigned hw_reg_count;
    int reg_pressure;
    int block_idx;
    exec_list instructions;
@@ -666,7 +672,7 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
    int payload_last_use_ip[hw_reg_count];
    v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
 
-   for (int i = 0; i < hw_reg_count; i++) {
+   for (unsigned i = 0; i < hw_reg_count; i++) {
       if (payload_last_use_ip[i] == -1)
          continue;
 
@@ -765,22 +771,22 @@ vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
 }
 
 void
-vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
 {
 }
 
 void
-vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
+vec4_instruction_scheduler::setup_liveness(cfg_t *)
 {
 }
 
 void
-vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
+vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
 {
 }
 
 int
-vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
 {
    return 0;
 }
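Leaving the parameters unnamed in these empty vec4 stubs is the idiomatic
C++ way to mark an argument as intentionally unused: an unnamed parameter
generates no -Wunused-parameter warning and needs no (void) cast.  A
minimal standalone illustration (hypothetical type, not from this file):

   struct scheduler_hooks {
      /* The parameter is unnamed because this stub ignores it by
       * design; the compiler then emits no unused-parameter warning.
       */
      virtual void update_register_pressure(int /* ip */) {}
   };
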
@@ -800,7 +806,6 @@ schedule_node::schedule_node(backend_instruction *inst,
    this->cand_generation = 0;
    this->delay = 0;
    this->exit = NULL;
-   this->is_barrier = false;
 
    /* We can't measure Gen6 timings directly but expect them to be much
     * closer to Gen7 than Gen4.
@@ -921,6 +926,14 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
    add_dep(before, after, before->latency);
 }
 
+static bool
+is_scheduling_barrier(const backend_instruction *inst)
+{
+   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+          inst->is_control_flow() ||
+          inst->has_side_effects();
+}
+
 /**
  * Sometimes we really want this node to execute after everything that
  * was before it and before everything that followed it.  This adds
@@ -932,12 +945,10 @@ instruction_scheduler::add_barrier_deps(schedule_node *n)
    schedule_node *prev = (schedule_node *)n->prev;
    schedule_node *next = (schedule_node *)n->next;
 
-   n->is_barrier = true;
-
    if (prev) {
       while (!prev->is_head_sentinel()) {
          add_dep(prev, n, 0);
-         if (prev->is_barrier)
+         if (is_scheduling_barrier(prev->inst))
             break;
          prev = (schedule_node *)prev->prev;
       }
@@ -946,7 +957,7 @@ instruction_scheduler::add_barrier_deps(schedule_node *n)
    if (next) {
       while (!next->is_tail_sentinel()) {
          add_dep(n, next, 0);
-         if (next->is_barrier)
+         if (is_scheduling_barrier(next->inst))
             break;
          next = (schedule_node *)next->next;
       }
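With the cached is_barrier flag gone, barrier status is recomputed from the
instruction itself on every visit, and both walks keep their early exit:
once a scheduling barrier is reached in either direction, that barrier
already carries dependency edges to everything beyond it, so the single
edge added to the barrier orders n against the rest transitively.  The
backward walk restated with that invariant spelled out (same names as the
hunk above):

   while (!prev->is_head_sentinel()) {
      add_dep(prev, n, 0);
      /* A barrier node already has incoming edges from every node
       * before it, so ordering against anything past this point is
       * implied transitively and the walk can stop.
       */
      if (is_scheduling_barrier(prev->inst))
         break;
      prev = (schedule_node *)prev->prev;
   }
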
@@ -962,14 +973,6 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
    return inst->exec_size == 16;
 }
 
-static bool
-is_scheduling_barrier(const fs_inst *inst)
-{
-   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
-          inst->is_control_flow() ||
-          inst->has_side_effects();
-}
-
 void
 fs_instruction_scheduler::calculate_deps()
 {
@@ -977,9 +980,9 @@ fs_instruction_scheduler::calculate_deps()
     * After register allocation, reg_offsets are gone and we track individual
     * GRF registers.
     */
-   schedule_node *last_grf_write[grf_count * 16];
+   schedule_node **last_grf_write;
    schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
-   schedule_node *last_conditional_mod[4] = {};
+   schedule_node *last_conditional_mod[8] = {};
    schedule_node *last_accumulator_write = NULL;
    /* Fixed HW registers are assumed to be separate from the virtual
     * GRFs, so they can be tracked separately.  We don't really write
@@ -988,7 +991,7 @@ fs_instruction_scheduler::calculate_deps()
     */
    schedule_node *last_fixed_grf_write = NULL;
 
-   memset(last_grf_write, 0, sizeof(last_grf_write));
+   last_grf_write = (schedule_node **)calloc(grf_count * 16, sizeof(schedule_node *));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 
    /* top-to-bottom dependencies: RAW and WAW. */
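last_grf_write moves off the stack because it holds grf_count * 16
pointers, and before register allocation grf_count is the virtual GRF
count, which can be large enough on big shaders to make a stack VLA risky.
The table is calloc'ed once, re-zeroed for the bottom-to-top WAR pass, and
released by the free(last_grf_write) hunk below.  An equivalent allocation
with automatic cleanup, noted as a design alternative rather than what the
patch does (mesa's compiler core sticks to C-style allocation):

   /* Zero-initialized like the calloc, and released automatically when
    * calculate_deps() returns.
    */
   std::vector<schedule_node *> last_grf_write(grf_count * 16, nullptr);
   /* The WAR pass reuses the table; this replaces the second memset. */
   std::fill(last_grf_write.begin(), last_grf_write.end(), nullptr);
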
@@ -1115,7 +1118,7 @@ fs_instruction_scheduler::calculate_deps()
    }
 
    /* bottom-to-top dependencies: WAR */
-   memset(last_grf_write, 0, sizeof(last_grf_write));
+   memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
    last_accumulator_write = NULL;
@@ -1231,13 +1234,8 @@ fs_instruction_scheduler::calculate_deps()
          last_accumulator_write = n;
       }
    }
-}
 
-static bool
-is_scheduling_barrier(const vec4_instruction *inst)
-{
-   return inst->is_control_flow() ||
-          inst->has_side_effects();
+   free(last_grf_write);
 }
 
 void
@@ -1279,6 +1277,9 @@ vec4_instruction_scheduler::calculate_deps()
          }
       }
 
+      if (inst->reads_g0_implicitly())
+         add_dep(last_fixed_grf_write, n);
+
       if (!inst->is_send_from_grf()) {
          for (int i = 0; i < inst->mlen; i++) {
             /* It looks like the MRF regs are released in the send
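The new g0 check above closes a scheduling hole: a vec4 instruction whose
message header is implicitly sourced from g0 must not be hoisted above an
earlier write to the fixed GRF file.  The same two lines with the intent
spelled out (assuming add_dep() ignores a NULL 'before', as the other
last_fixed_grf_write callers in this file rely on):

   /* If the message header is built from g0 behind our back, order the
    * instruction after the last write to the fixed GRFs.  When no such
    * write has been seen, last_fixed_grf_write is NULL and add_dep()
    * does nothing.
    */
   if (inst->reads_g0_implicitly())
      add_dep(last_fixed_grf_write, n);
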
@@ -1555,14 +1556,15 @@ vec4_instruction_scheduler::choose_instruction_to_schedule()
 int
 fs_instruction_scheduler::issue_time(backend_instruction *inst)
 {
+   const unsigned overhead = v->bank_conflict_cycles((fs_inst *)inst);
    if (is_compressed((fs_inst *)inst))
-      return 4;
+      return 4 + overhead;
    else
-      return 2;
+      return 2 + overhead;
 }
 
 int
-vec4_instruction_scheduler::issue_time(backend_instruction *inst)
+vec4_instruction_scheduler::issue_time(backend_instruction *)
 {
    /* We always execute as two vec4s in parallel. */
    return 2;
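
Folding bank_conflict_cycles() into issue_time() makes the scheduler's
clock charge each fs instruction for the register-bank-conflict stall
computed for it, giving the scheduler a reason to keep conflicting
instructions apart.  A standalone restatement with the constants named
(hypothetical helper; the real code casts to fs_inst and asks the visitor):

   unsigned issue_cycles(bool compressed, unsigned conflict_cycles)
   {
      /* Mirrors fs_instruction_scheduler::issue_time(): compressed
       * (SIMD16) instructions take 4 issue cycles, others 2, plus the
       * modeled bank-conflict stall for this instruction.
       */
      return (compressed ? 4 : 2) + conflict_cycles;
   }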