mesa: add/update comments in _mesa_copy_buffer_subdata()

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_schedule_instructions.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp

index b5202d4d2eacb68aa54b95e326885e353f904c92..910f3297d27d14a34f16280356478319cc2e9858 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
@@ -25,25 +25,10 @@
   *
   */
  
-extern "C" {
-
-#include <sys/types.h>
-
-#include "main/macros.h"
-#include "main/shaderobj.h"
-#include "main/uniforms.h"
-#include "program/prog_optimize.h"
-#include "program/register_allocate.h"
-#include "program/sampler.h"
-#include "program/hash_table.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-}
  #include "brw_fs.h"
-#include "../glsl/glsl_types.h"
-#include "../glsl/ir_optimization.h"
-#include "../glsl/ir_print_visitor.h"
+#include "glsl/glsl_types.h"
+#include "glsl/ir_optimization.h"
+#include "glsl/ir_print_visitor.h"
  
  /** @file brw_fs_schedule_instructions.cpp
   *
@@ -84,26 +69,28 @@ public:
        int math_latency = 22;
  
        switch (inst->opcode) {
-      case FS_OPCODE_RCP:
+      case SHADER_OPCODE_RCP:
          this->latency = 1 * chans * math_latency;
          break;
-      case FS_OPCODE_RSQ:
+      case SHADER_OPCODE_RSQ:
          this->latency = 2 * chans * math_latency;
          break;
-      case FS_OPCODE_SQRT:
-      case FS_OPCODE_LOG2:
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_LOG2:
          /* full precision log.  partial is 2. */
          this->latency = 3 * chans * math_latency;
          break;
-      case FS_OPCODE_EXP2:
+      case SHADER_OPCODE_INT_REMAINDER:
+      case SHADER_OPCODE_EXP2:
          /* full precision.  partial is 3, same throughput. */
          this->latency = 4 * chans * math_latency;
          break;
-      case FS_OPCODE_POW:
+      case SHADER_OPCODE_POW:
          this->latency = 8 * chans * math_latency;
          break;
-      case FS_OPCODE_SIN:
-      case FS_OPCODE_COS:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
          /* minimum latency, max is 12 rounds. */
          this->latency = 5 * chans * math_latency;
          break;
@@ -128,7 +115,7 @@ public:
     instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
     {
        this->v = v;
-      this->mem_ctx = talloc_new(mem_ctx);
+      this->mem_ctx = ralloc_context(mem_ctx);
        this->virtual_grf_count = virtual_grf_count;
        this->instructions.make_empty();
        this->instructions_to_schedule = 0;
@@ -136,15 +123,18 @@ public:
  
     ~instruction_scheduler()
     {
-      talloc_free(this->mem_ctx);
+      ralloc_free(this->mem_ctx);
     }
     void add_barrier_deps(schedule_node *n);
     void add_dep(schedule_node *before, schedule_node *after, int latency);
+   void add_dep(schedule_node *before, schedule_node *after);
  
     void add_inst(fs_inst *inst);
     void calculate_deps();
     void schedule_instructions(fs_inst *next_block_header);
  
+   bool is_compressed(fs_inst *inst);
+
     void *mem_ctx;
  
     int instructions_to_schedule;
@@ -195,11 +185,11 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
        else
          before->child_array_size *= 2;
  
-      before->children = talloc_realloc(mem_ctx, before->children,
-                                       schedule_node *,
-                                       before->child_array_size);
-      before->child_latency = talloc_realloc(mem_ctx, before->child_latency,
-                                            int, before->child_array_size);
+      before->children = reralloc(mem_ctx, before->children,
+                                 schedule_node *,
+                                 before->child_array_size);
+      before->child_latency = reralloc(mem_ctx, before->child_latency,
+                                      int, before->child_array_size);
     }
  
     before->children[before->child_count] = after;
@@ -208,6 +198,15 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
     after->parent_count++;
  }
  
+/**
+ * Adds a dependency edge from \p before to \p after, using \p before's
+ * own latency as the latency of the edge.
+ *
+ * Either node may be NULL, in which case no dependency is added.  Callers
+ * pass unchecked entries of the "last write" tracking tables in both
+ * positions: as \p before in the top-to-bottom pass and as \p after in
+ * the write-after-read pass.  \p before is dereferenced right here for
+ * its latency, and the three-argument overload goes on to update
+ * after->parent_count, so both pointers are tested before forwarding.
+ */
+void
+instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
+{
+   if (!before || !after)
+      return;
+
+   add_dep(before, after, before->latency);
+}
+
  /**
   * Sometimes we really want this node to execute after everything that
   * was before it and before everything that followed it.  This adds
@@ -234,12 +233,29 @@ instruction_scheduler::add_barrier_deps(schedule_node *n)
     }
  }
  
+/**
+ * Reports whether \p inst is treated as a compressed instruction.
+ *
+ * Instruction scheduling needs to be aware of when an MRF write
+ * actually writes 2 MRFs.  An instruction counts as compressed when the
+ * dispatch width is 16 and the instruction is neither forced
+ * uncompressed nor restricted to the second half.
+ */
+bool
+instruction_scheduler::is_compressed(fs_inst *inst)
+{
+   if (v->c->dispatch_width != 16)
+      return false;
+
+   return !(inst->force_uncompressed || inst->force_sechalf);
+}
+
  void
  instruction_scheduler::calculate_deps()
  {
     schedule_node *last_grf_write[virtual_grf_count];
     schedule_node *last_mrf_write[BRW_MAX_MRF];
     schedule_node *last_conditional_mod = NULL;
+   /* Fixed HW registers are assumed to be separate from the virtual
+    * GRFs, so they can be tracked separately.  We don't really write
+    * to fixed GRFs much, so don't bother tracking them on a more
+    * granular level.
+    */
+   schedule_node *last_fixed_grf_write = NULL;
  
     /* The last instruction always needs to still be the last
      * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
@@ -254,17 +270,18 @@ instruction_scheduler::calculate_deps()
     memset(last_mrf_write, 0, sizeof(last_mrf_write));
  
     /* top-to-bottom dependencies: RAW and WAW. */
-   foreach_iter(exec_list_iterator, iter, instructions) {
-      schedule_node *n = (schedule_node *)iter.get();
+   foreach_list(node, &instructions) {
+      schedule_node *n = (schedule_node *)node;
        fs_inst *inst = n->inst;
  
        /* read-after-write deps. */
        for (int i = 0; i < 3; i++) {
          if (inst->src[i].file == GRF) {
-           if (last_grf_write[inst->src[i].reg]) {
-              add_dep(last_grf_write[inst->src[i].reg], n,
-                      last_grf_write[inst->src[i].reg]->latency);
-           }
+           add_dep(last_grf_write[inst->src[i].reg], n);
+        } else if (inst->src[i].file == FIXED_HW_REG &&
+                   (inst->src[i].fixed_hw_reg.file ==
+                    BRW_GENERAL_REGISTER_FILE)) {
+           add_dep(last_fixed_grf_write, n);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM) {
@@ -278,40 +295,41 @@ instruction_scheduler::calculate_deps()
           * instruction once it's sent, not when the result comes
           * back.
           */
-        if (last_mrf_write[inst->base_mrf + i]) {
-           add_dep(last_mrf_write[inst->base_mrf + i], n,
-                   last_mrf_write[inst->base_mrf + i]->latency);
-        }
+        add_dep(last_mrf_write[inst->base_mrf + i], n);
        }
  
        if (inst->predicated) {
          assert(last_conditional_mod);
-        add_dep(last_conditional_mod, n, last_conditional_mod->latency);
+        add_dep(last_conditional_mod, n);
        }
  
        /* write-after-write deps. */
        if (inst->dst.file == GRF) {
-        if (last_grf_write[inst->dst.reg]) {
-           add_dep(last_grf_write[inst->dst.reg], n,
-                   last_grf_write[inst->dst.reg]->latency);
-        }
+        add_dep(last_grf_write[inst->dst.reg], n);
          last_grf_write[inst->dst.reg] = n;
        } else if (inst->dst.file == MRF) {
-        if (last_mrf_write[inst->dst.hw_reg]) {
-           add_dep(last_mrf_write[inst->dst.hw_reg], n,
-                   last_mrf_write[inst->dst.hw_reg]->latency);
+        int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+
+        add_dep(last_mrf_write[reg], n);
+        last_mrf_write[reg] = n;
+        if (is_compressed(inst)) {
+           if (inst->dst.reg & BRW_MRF_COMPR4)
+              reg += 4;
+           else
+              reg++;
+           add_dep(last_mrf_write[reg], n);
+           last_mrf_write[reg] = n;
          }
-        last_mrf_write[inst->dst.hw_reg] = n;
+      } else if (inst->dst.file == FIXED_HW_REG &&
+                inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+        last_fixed_grf_write = n;
        } else if (inst->dst.file != BAD_FILE) {
          add_barrier_deps(n);
        }
  
        if (inst->mlen > 0) {
          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
-           if (last_mrf_write[inst->base_mrf + i]) {
-              add_dep(last_mrf_write[inst->base_mrf + i], n,
-                      last_mrf_write[inst->base_mrf + i]->latency);
-           }
+           add_dep(last_mrf_write[inst->base_mrf + i], n);
             last_mrf_write[inst->base_mrf + i] = n;
          }
        }
@@ -326,6 +344,7 @@ instruction_scheduler::calculate_deps()
     memset(last_grf_write, 0, sizeof(last_grf_write));
     memset(last_mrf_write, 0, sizeof(last_mrf_write));
     last_conditional_mod = NULL;
+   last_fixed_grf_write = NULL;
  
     exec_node *node;
     exec_node *prev;
@@ -338,9 +357,11 @@ instruction_scheduler::calculate_deps()
        /* write-after-read deps. */
        for (int i = 0; i < 3; i++) {
          if (inst->src[i].file == GRF) {
-           if (last_grf_write[inst->src[i].reg]) {
-              add_dep(n, last_grf_write[inst->src[i].reg], n->latency);
-           }
+           add_dep(n, last_grf_write[inst->src[i].reg]);
+        } else if (inst->src[i].file == FIXED_HW_REG &&
+                   (inst->src[i].fixed_hw_reg.file ==
+                    BRW_GENERAL_REGISTER_FILE)) {
+           add_dep(n, last_fixed_grf_write);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM) {
@@ -358,9 +379,7 @@ instruction_scheduler::calculate_deps()
        }
  
        if (inst->predicated) {
-        if (last_conditional_mod) {
-           add_dep(n, last_conditional_mod, n->latency);
-        }
+        add_dep(n, last_conditional_mod);
        }
  
        /* Update the things this instruction wrote, so earlier reads
@@ -369,7 +388,21 @@ instruction_scheduler::calculate_deps()
        if (inst->dst.file == GRF) {
          last_grf_write[inst->dst.reg] = n;
        } else if (inst->dst.file == MRF) {
-        last_mrf_write[inst->dst.hw_reg] = n;
+        int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+
+        last_mrf_write[reg] = n;
+
+        if (is_compressed(inst)) {
+           if (inst->dst.reg & BRW_MRF_COMPR4)
+              reg += 4;
+           else
+              reg++;
+
+           last_mrf_write[reg] = n;
+        }
+      } else if (inst->dst.file == FIXED_HW_REG &&
+                inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+        last_fixed_grf_write = n;
        } else if (inst->dst.file != BAD_FILE) {
          add_barrier_deps(n);
        }
@@ -391,8 +424,8 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
     int time = 0;
  
     /* Remove non-DAG heads from the list. */
-   foreach_iter(exec_list_iterator, iter, instructions) {
-      schedule_node *n = (schedule_node *)iter.get();
+   foreach_list_safe(node, &instructions) {
+      schedule_node *n = (schedule_node *)node;
        if (n->parent_count != 0)
          n->remove();
     }
@@ -401,8 +434,8 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
        schedule_node *chosen = NULL;
        int chosen_time = 0;
  
-      foreach_iter(exec_list_iterator, iter, instructions) {
-        schedule_node *n = (schedule_node *)iter.get();
+      foreach_list(node, &instructions) {
+        schedule_node *n = (schedule_node *)node;
  
          if (!chosen || n->unblocked_time < chosen_time) {
             chosen = n;
@@ -444,8 +477,8 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
         * progress until the first is done.
         */
        if (chosen->inst->is_math()) {
-        foreach_iter(exec_list_iterator, iter, instructions) {
-           schedule_node *n = (schedule_node *)iter.get();
+        foreach_list(node, &instructions) {
+           schedule_node *n = (schedule_node *)node;
  
             if (n->inst->is_math())
                n->unblocked_time = MAX2(n->unblocked_time,