Merge remote-tracking branch 'jekstrand/wip/i965-uniforms' into vulkan

author Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>

Thu, 21 Jan 2016 19:07:09 +0000 (11:07 -0800)

committer Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>

Thu, 21 Jan 2016 19:09:58 +0000 (11:09 -0800)
author Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>
Thu, 21 Jan 2016 19:07:09 +0000 (11:07 -0800)
committer Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>
Thu, 21 Jan 2016 19:09:58 +0000 (11:09 -0800)
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h

index b6996ee3f4746727bdac809f0cd497a047f5eca7..3e7cf735a1b5a565d6797b9dcc1e780517a2d5ef 100644 (file)
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -311,6 +311,9 @@ SYSTEM_VALUE(helper_invocation, 1, 0)
   * of the start of the variable being loaded and and the offset source is a
   * offset into that variable.
   *
+ * Uniform load operations have a second index that specifies the size of the
+ * variable being loaded.  If const_index[1] == 0, then the size is unknown.
+ *
   * Some load operations such as UBO/SSBO load and per_vertex loads take an
   * additional source to specify which UBO/SSBO/vertex to load from.
   *
@@ -323,8 +326,8 @@ SYSTEM_VALUE(helper_invocation, 1, 0)
  #define LOAD(name, srcs, indices, flags) \
     INTRINSIC(load_##name, srcs, ARR(1, 1, 1, 1), true, 0, 0, indices, flags)
  
-/* src[] = { offset }. const_index[] = { base } */
-LOAD(uniform, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+/* src[] = { offset }. const_index[] = { base, size } */
+LOAD(uniform, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
  /* src[] = { buffer_index, offset }. No const_index */
  LOAD(ubo, 2, 0, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
  /* src[] = { offset }. const_index[] = { base } */
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c

index 90cf2e3acfd9e30dad9ef5d1a1fa91b20ce74428..2c5fa16af5e29aa7644057ca9124b66846053a01 100644 (file)
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -277,6 +277,11 @@ nir_lower_io_block(nir_block *block, void *void_state)
           load->const_index[0] =
              intrin->variables[0]->var->data.driver_location;
  
+         if (load->intrinsic == nir_intrinsic_load_uniform) {
+            load->const_index[1] =
+               state->type_size(intrin->variables[0]->var->type);
+         }
+
           if (per_vertex)
              load->src[0] = nir_src_for_ssa(vertex_index);
  
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 4230822c2cddc3f2cf4014b1a78e46056188281c..fc883f458fa7414377075286d22722640998320b 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -174,7 +174,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
      * CSE can later notice that those loads are all the same and eliminate
      * the redundant ones.
      */
-   fs_reg vec4_offset = vgrf(glsl_type::int_type);
+   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
     bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
  
     int scale = 1;
@@ -433,7 +433,6 @@ fs_reg::fs_reg(struct ::brw_reg reg) :
  {
     this->reg_offset = 0;
     this->subreg_offset = 0;
-   this->reladdr = NULL;
     this->stride = 1;
     if (this->file == IMM &&
         (this->type != BRW_REGISTER_TYPE_V &&
@@ -448,7 +447,6 @@ fs_reg::equals(const fs_reg &r) const
  {
     return (this->backend_reg::equals(r) &&
             subreg_offset == r.subreg_offset &&
-           !reladdr && !r.reladdr &&
             stride == r.stride);
  }
  
@@ -851,7 +849,10 @@ fs_inst::regs_read(int arg) const
           assert(src[2].file == IMM);
           unsigned region_length = src[2].ud;
  
-         if (src[0].file == FIXED_GRF) {
+         if (src[0].file == UNIFORM) {
+            assert(region_length % 4 == 0);
+            return region_length / 4;
+         } else if (src[0].file == FIXED_GRF) {
              /* If the start of the region is not register aligned, then
               * there's some portion of the register that's technically
               * unread at the beginning.
@@ -865,7 +866,7 @@ fs_inst::regs_read(int arg) const
               * unread portion at the beginning.
               */
              if (src[0].subnr)
-               region_length += src[0].subnr * type_sz(src[0].type);
+               region_length += src[0].subnr;
  
              return DIV_ROUND_UP(region_length, REG_SIZE);
           } else {
@@ -1021,7 +1022,6 @@ fs_visitor::import_uniforms(fs_visitor *v)
     this->push_constant_loc = v->push_constant_loc;
     this->pull_constant_loc = v->pull_constant_loc;
     this->uniforms = v->uniforms;
-   this->param_size = v->param_size;
  }
  
  fs_reg *
@@ -1930,9 +1930,7 @@ fs_visitor::compact_virtual_grfs()
   * maximum number of fragment shader uniform components (64).  If
   * there are too many of these, they'd fill up all of register space.
   * So, this will push some of them out to the pull constant buffer and
- * update the program to load them.  We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
   */
  void
  fs_visitor::assign_constant_locations()
@@ -1941,20 +1939,21 @@ fs_visitor::assign_constant_locations()
     if (dispatch_width != 8)
        return;
  
-   unsigned int num_pull_constants = 0;
-
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
     bool is_live[uniforms];
     memset(is_live, 0, sizeof(is_live));
  
+   /* For each uniform slot, a value of true indicates that the given slot and
+    * the next slot must remain contiguous.  This is used to keep us from
+    * splitting arrays apart.
+    */
+   bool contiguous[uniforms];
+   memset(contiguous, 0, sizeof(contiguous));
+
     /* First, we walk through the instructions and do two things:
      *
      *  1) Figure out which uniforms are live.
      *
-    *  2) Find all indirect access of uniform arrays and flag them as needing
-    *     to go into the pull constant buffer.
+    *  2) Mark any indirectly used ranges of registers as contiguous.
      *
      * Note that we don't move constant-indexed accesses to arrays.  No
      * testing has been done of the performance impact of this choice.
@@ -1964,20 +1963,19 @@ fs_visitor::assign_constant_locations()
           if (inst->src[i].file != UNIFORM)
              continue;
  
-         if (inst->src[i].reladdr) {
-            int uniform = inst->src[i].nr;
+         int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
  
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               assert(param_size[uniform]);
-               for (int j = 0; j < param_size[uniform]; j++)
-                  pull_constant_loc[uniform + j] = num_pull_constants++;
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+            assert(inst->src[2].ud % 4 == 0);
+            unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+            assert(last < uniforms);
+
+            for (unsigned j = constant_nr; j < last; j++) {
+               is_live[j] = true;
+               contiguous[j] = true;
              }
+            is_live[last] = true;
           } else {
-            /* Mark the the one accessed uniform as live */
-            int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
              if (constant_nr >= 0 && constant_nr < (int) uniforms)
                 is_live[constant_nr] = true;
           }
@@ -1992,29 +1990,49 @@ fs_visitor::assign_constant_locations()
      * If changing this value, note the limitation about total_regs in
      * brw_curbe.c.
      */
-   unsigned int max_push_components = 16 * 8;
+   const unsigned int max_push_components = 16 * 8;
+
+   /* We push small arrays, but no bigger than 16 floats.  This is big enough
+    * for a vec4 but hopefully not large enough to push out other stuff.  We
+    * should probably use a better heuristic at some point.
+    */
+   const unsigned int max_chunk_size = 16;
+
     unsigned int num_push_constants = 0;
+   unsigned int num_pull_constants = 0;
  
     push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
  
-   for (unsigned int i = 0; i < uniforms; i++) {
-      if (!is_live[i] || pull_constant_loc[i] != -1) {
-         /* This UNIFORM register is either dead, or has already been demoted
-          * to a pull const.  Mark it as no longer living in the param[] array.
-          */
-         push_constant_loc[i] = -1;
+   int chunk_start = -1;
+   for (unsigned u = 0; u < uniforms; u++) {
+      push_constant_loc[u] = -1;
+      pull_constant_loc[u] = -1;
+
+      if (!is_live[u])
           continue;
-      }
  
-      if (num_push_constants < max_push_components) {
-         /* Retain as a push constant.  Record the location in the params[]
-          * array.
-          */
-         push_constant_loc[i] = num_push_constants++;
-      } else {
-         /* Demote to a pull constant. */
-         push_constant_loc[i] = -1;
-         pull_constant_loc[i] = num_pull_constants++;
+      /* This is the first live uniform in the chunk */
+      if (chunk_start < 0)
+         chunk_start = u;
+
+      /* If this element does not need to be contiguous with the next, we
+       * split at this point and everthing between chunk_start and u forms a
+       * single chunk.
+       */
+      if (!contiguous[u]) {
+         unsigned chunk_size = u - chunk_start + 1;
+
+         if (num_push_constants + chunk_size <= max_push_components &&
+             chunk_size <= max_chunk_size) {
+            for (unsigned j = chunk_start; j <= u; j++)
+               push_constant_loc[j] = num_push_constants++;
+         } else {
+            for (unsigned j = chunk_start; j <= u; j++)
+               pull_constant_loc[j] = num_pull_constants++;
+         }
+
+         chunk_start = -1;
        }
     }
  
@@ -2045,51 +2063,67 @@ fs_visitor::assign_constant_locations()
   * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
   */
  void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
  {
-   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Set up the annotation tracking for new generated instructions. */
+      const fs_builder ibld(this, block, inst);
+
        for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file != UNIFORM)
             continue;
  
-         int pull_index;
+         /* We'll handle this case later */
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+            continue;
+
           unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
-         if (location >= uniforms) /* Out of bounds access */
-            pull_index = -1;
-         else
-            pull_index = pull_constant_loc[location];
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
  
           if (pull_index == -1)
             continue;
  
-         /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld(this, block, inst);
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst = vgrf(glsl_type::float_type);
-
           assert(inst->src[i].stride == 0);
  
-         /* Generate a pull load into dst. */
-         if (inst->src[i].reladdr) {
-            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
-                                       brw_imm_ud(index),
-                                       *inst->src[i].reladdr,
-                                       pull_index * 4);
-            inst->src[i].reladdr = NULL;
-            inst->src[i].stride = 1;
-         } else {
-            const fs_builder ubld = ibld.exec_all().group(8, 0);
-            struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
-            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      dst, brw_imm_ud(index), offset);
-            inst->src[i].set_smear(pull_index & 3);
-         }
-         brw_mark_surface_used(prog_data, index);
+         fs_reg dst = vgrf(glsl_type::float_type);
+         const fs_builder ubld = ibld.exec_all().group(8, 0);
+         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                   dst, brw_imm_ud(index), offset);
  
           /* Rewrite the instruction to use the temporary VGRF. */
           inst->src[i].file = VGRF;
           inst->src[i].nr = dst.nr;
           inst->src[i].reg_offset = 0;
+         inst->src[i].set_smear(pull_index & 3);
+
+         brw_mark_surface_used(prog_data, index);
+      }
+
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+
+         unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+           continue;
+
+         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+                                    brw_imm_ud(index),
+                                    inst->src[1],
+                                    pull_index * 4);
+         inst->remove(block);
+
+         brw_mark_surface_used(prog_data, index);
        }
     }
     invalidate_live_intervals();
@@ -4462,6 +4496,10 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
     case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
        return 8;
  
+   case SHADER_OPCODE_MOV_INDIRECT:
+      /* Prior to Broadwell, we only have 8 address subregisters */
+      return devinfo->gen < 8 ? 8 : inst->exec_size;
+
     default:
        return inst->exec_size;
     }
@@ -4744,9 +4782,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
           break;
        case UNIFORM:
           fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
-         if (inst->src[i].reladdr) {
-            fprintf(file, "+reladdr");
-         } else if (inst->src[i].subreg_offset) {
+         if (inst->src[i].subreg_offset) {
              fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                      inst->src[i].subreg_offset);
           }
@@ -4857,7 +4893,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
  {
     if (end == start ||
         end->is_partial_write() ||
-       reg.reladdr ||
         !reg.equals(end->dst)) {
        return NULL;
     } else {
@@ -5070,7 +5105,7 @@ fs_visitor::optimize()
     bld = fs_builder(this, 64);
  
     assign_constant_locations();
-   demote_pull_constants();
+   lower_constant_loads();
  
     validate();
  
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h

index 2dddd7c0b4ff147f42da8e88224a063376d2c655..5a7a0eb5f63a3c6b596d60a5783c6c2b4ddee069 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -139,7 +139,7 @@ public:
     void split_virtual_grfs();
     bool compact_virtual_grfs();
     void assign_constant_locations();
-   void demote_pull_constants();
+   void lower_constant_loads();
     void invalidate_live_intervals();
     void calculate_live_intervals();
     void calculate_register_pressure();
@@ -323,8 +323,6 @@ public:
  
     const struct brw_vue_map *input_vue_map;
  
-   int *param_size;
-
     int *virtual_grf_start;
     int *virtual_grf_end;
     brw::fs_live_variables *live_intervals;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp

index 34d40e921cd5427de8ceff62a874bfbb6c53b9fa..b1134cff3c86f0a788dfd8006c19d10190e42fc3 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -351,23 +351,47 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
  
     unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
  
-   /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
-   struct brw_reg addr = vec8(brw_address_reg(0));
+   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
+      imm_byte_offset += indirect_byte_offset.ud;
  
-   /* The destination stride of an instruction (in bytes) must be greater
-    * than or equal to the size of the rest of the instruction.  Since the
-    * address register is of type UW, we can't use a D-type instruction.
-    * In order to get around this, re re-type to UW and use a stride.
-    */
-   indirect_byte_offset =
-      retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+      reg.nr = imm_byte_offset / REG_SIZE;
+      reg.subnr = imm_byte_offset % REG_SIZE;
+      brw_MOV(p, dst, reg);
+   } else {
+      /* Prior to Broadwell, there are only 8 address registers. */
+      assert(inst->exec_size == 8 || devinfo->gen >= 8);
  
-   /* Prior to Broadwell, there are only 8 address registers. */
-   assert(inst->exec_size == 8 || devinfo->gen >= 8);
+      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+      struct brw_reg addr = vec8(brw_address_reg(0));
  
-   brw_MOV(p, addr, indirect_byte_offset);
-   brw_inst_set_mask_control(devinfo, brw_last_inst, BRW_MASK_DISABLE);
-   brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type));
+      /* The destination stride of an instruction (in bytes) must be greater
+       * than or equal to the size of the rest of the instruction.  Since the
+       * address register is of type UW, we can't use a D-type instruction.
+       * In order to get around this, re re-type to UW and use a stride.
+       */
+      indirect_byte_offset =
+         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+
+      if (devinfo->gen < 8) {
+         /* Prior to broadwell, we have a restriction that the bottom 5 bits
+          * of the base offset and the bottom 5 bits of the indirect must add
+          * to less than 32.  In other words, the hardware needs to be able to
+          * add the bottom five bits of the two to get the subnumber and add
+          * the next 7 bits of each to get the actual register number.  Since
+          * the indirect may cause us to cross a register boundary, this makes
+          * it almost useless.  We could try and do something clever where we
+          * use a actual base offset if base_offset % 32 == 0 but that would
+          * mean we were generating different code depending on the base
+          * offset.  Instead, for the sake of consistency, we'll just do the
+          * add ourselves.
+          */
+         brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+         brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), dst.type));
+      } else {
+         brw_MOV(p, addr, indirect_byte_offset);
+         brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type));
+      }
+   }
  }
  
  void
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp

index 62d801f726484af61a2dc59ae31f912600ee4469..eded5a90f7d69679297d0cd99dca587ca5ceee12 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -175,15 +175,6 @@ fs_visitor::nir_setup_uniforms()
        return;
  
     uniforms = nir->num_uniforms / 4;
-
-   nir_foreach_variable(var, &nir->uniforms) {
-      /* UBO's and atomics don't take up space in the uniform file */
-      if (var->interface_type != NULL || var->type->contains_atomic())
-         continue;
-
-      if (type_size_scalar(var->type) > 0)
-         param_size[var->data.driver_location / 4] = type_size_scalar(var->type);
-   }
  }
  
  static bool
@@ -1195,6 +1186,8 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
  {
     fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
                  BRW_REGISTER_TYPE_UD);
+   fs_reg indirect;
+   unsigned indirect_max = 0;
  
     for (const nir_deref *tail = &deref->deref; tail->child;
          tail = tail->child) {
@@ -1206,7 +1199,7 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
        image = offset(image, bld, base * element_size);
  
        if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         fs_reg tmp = vgrf(glsl_type::int_type);
+         fs_reg tmp = vgrf(glsl_type::uint_type);
  
           if (devinfo->gen == 7 && !devinfo->is_haswell) {
              /* IVB hangs when trying to access an invalid surface index with
@@ -1224,15 +1217,31 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
              bld.MOV(tmp, get_nir_src(deref_array->indirect));
           }
  
+         indirect_max += element_size * (tail->type->length - 1);
+
           bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
-         if (image.reladdr)
-            bld.ADD(*image.reladdr, *image.reladdr, tmp);
-         else
-            image.reladdr = new(mem_ctx) fs_reg(tmp);
+         if (indirect.file == BAD_FILE) {
+            indirect = tmp;
+         } else {
+            bld.ADD(indirect, indirect, tmp);
+         }
        }
     }
  
-   return image;
+   if (indirect.file == BAD_FILE) {
+      return image;
+   } else {
+      /* Emit a pile of MOVs to load the uniform into a temporary.  The
+       * dead-code elimination pass will get rid of what we don't use.
+       */
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
+      for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+                  offset(tmp, bld, j), offset(image, bld, j),
+                  indirect, brw_imm_ud((indirect_max + 1) * 4));
+      }
+      return tmp;
+   }
  }
  
  void
@@ -2609,12 +2618,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           /* Offsets are in bytes but they should always be multiples of 4 */
           assert(const_offset->u[0] % 4 == 0);
           src.reg_offset = const_offset->u[0] / 4;
+
+         for (unsigned j = 0; j < instr->num_components; j++) {
+            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+         }
        } else {
-         src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
-      }
+         fs_reg indirect = retype(get_nir_src(instr->src[0]),
+                                  BRW_REGISTER_TYPE_UD);
  
-      for (unsigned j = 0; j < instr->num_components; j++) {
-         bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
+          * go past the end of the uniform.  In order to keep the n'th
+          * component from running past, we subtract off the size of all but
+          * one component of the vector.
+          */
+         assert(instr->const_index[1] >= instr->num_components * 4);
+         unsigned read_size = instr->const_index[1] -
+                              (instr->num_components - 1) * 4;
+
+         for (unsigned j = 0; j < instr->num_components; j++) {
+            bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+                     offset(dest, bld, j), offset(src, bld, j),
+                     indirect, brw_imm_ud(read_size));
+         }
        }
        break;
     }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

index 89e1559a196f64ce95468854740735ba05169252..0ff5cd6de190deb9e263e081e1d66d310ea078c7 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1037,9 +1037,6 @@ fs_visitor::init()
  
     this->spilled_any_registers = false;
     this->do_dual_src = false;
-
-   if (dispatch_width == 8)
-      this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
  }
  
  fs_visitor::~fs_visitor()
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h

index c3eec2efb42fa9b43f676cefb840ee18bfc3aac6..e4f20f4ffc9d8abadd740a83f5d1c4c782fb7d5f 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -58,8 +58,6 @@ public:
      */
     int subreg_offset;
  
-   fs_reg *reladdr;
-
     /** Register region horizontal stride */
     uint8_t stride;
  };
@@ -136,8 +134,7 @@ component(fs_reg reg, unsigned idx)
  static inline bool
  is_uniform(const fs_reg &reg)
  {
-   return (reg.stride == 0 || reg.is_null()) &&
-          (!reg.reladdr || is_uniform(*reg.reladdr));
+   return (reg.stride == 0 || reg.is_null());
  }
  
  /**
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp

index 09db25f10da7a3e9ebcfbb34236f9c2d90d61151..358a71041fcd5159be311569254520361dabc140 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -477,11 +477,6 @@ vec4_visitor::split_uniform_registers()
          inst->src[i].reg_offset = 0;
        }
     }
-
-   /* Update that everything is now vector-sized. */
-   for (int i = 0; i < this->uniforms; i++) {
-      this->uniform_size[i] = 1;
-   }
  }
  
  void
@@ -539,7 +534,6 @@ vec4_visitor::pack_uniform_registers()
      * push constants.
      */
     for (int src = 0; src < uniforms; src++) {
-      assert(src < uniform_array_size);
        int size = chans_used[src];
  
        if (size == 0)
@@ -786,7 +780,7 @@ vec4_visitor::move_push_constants_to_pull_constants()
          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
  
          emit_pull_constant_load(block, inst, temp, inst->src[i],
-                                pull_constant_loc[uniform]);
+                                pull_constant_loc[uniform], src_reg());
  
          inst->src[i].file = temp.file;
           inst->src[i].nr = temp.nr;
@@ -1606,8 +1600,6 @@ vec4_visitor::setup_uniforms(int reg)
      * matter what, or the GPU would hang.
      */
     if (devinfo->gen < 6 && this->uniforms == 0) {
-      assert(this->uniforms < this->uniform_array_size);
-
        stage_prog_data->param =
           reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
        for (unsigned int i = 0; i < 4; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h

index 531eb17041900120d6f2cbf4e73f21614b367dfd..83d9edae8ca085d3a2dc4ec49542df859e0d0f28 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -115,8 +115,6 @@ public:
      */
     dst_reg output_reg[BRW_VARYING_SLOT_COUNT];
     const char *output_reg_annotation[BRW_VARYING_SLOT_COUNT];
-   int *uniform_size;
-   int uniform_array_size; /*< Size of the uniform_size array */
     int uniforms;
  
     src_reg shader_start_time;
@@ -285,8 +283,6 @@ public:
  
     src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                               src_reg *reladdr, int reg_offset);
-   src_reg get_pull_constant_offset(bblock_t *block, vec4_instruction *inst,
-                                   src_reg *reladdr, int reg_offset);
     void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                           dst_reg dst,
                           src_reg orig_src,
@@ -296,7 +292,8 @@ public:
     void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                 dst_reg dst,
                                 src_reg orig_src,
-                               int base_offset);
+                               int base_offset,
+                                src_reg indirect);
     void emit_pull_constant_load_reg(dst_reg dst,
                                      src_reg surf_index,
                                      src_reg offset,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp

index 19c4024e061d707d24471dd353e5f253bdd444d7..237534decbac08edea5b7204f3c05a78d51b798e 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1390,6 +1390,48 @@ generate_set_simd4x2_header_gen9(struct brw_codegen *p,
     brw_pop_insn_state(p);
  }
  
+static void
+generate_mov_indirect(struct brw_codegen *p,
+                      vec4_instruction *inst,
+                      struct brw_reg dst, struct brw_reg reg,
+                      struct brw_reg indirect, struct brw_reg length)
+{
+   assert(indirect.type == BRW_REGISTER_TYPE_UD);
+
+   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
+
+   /* This instruction acts in align1 mode */
+   assert(inst->force_writemask_all || reg.writemask == 0xf);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   struct brw_reg addr = vec2(brw_address_reg(0));
+
+   /* We need to move the indirect value into the address register.  In order
+    * to make things make some sense, we want to respect at least the X
+    * component of the swizzle.  In order to do that, we need to convert the
+    * subnr (probably 0) to an align1 subnr and add in the swizzle.  We then
+    * use a region of <8,4,0>:uw to pick off the first 2 bytes of the indirect
+    * and splat it out to all four channels of the given half of a0.
+    */
+   assert(brw_is_single_value_swizzle(indirect.swizzle));
+   indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)) * 2;
+   indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
+
+   brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));
+
+   /* Use a <4,1> region Vx1 region*/
+   struct brw_reg src = brw_VxH_indirect(0, 0);
+   src.width = BRW_WIDTH_4;
+   src.hstride = BRW_HORIZONTAL_STRIDE_1;
+
+   brw_MOV(p, dst, retype(src, reg.type));
+
+   brw_pop_insn_state(p);
+}
+
  static void
  generate_code(struct brw_codegen *p,
                const struct brw_compiler *compiler,
@@ -1936,6 +1978,9 @@ generate_code(struct brw_codegen *p,
           brw_WAIT(p);
           break;
  
+      case SHADER_OPCODE_MOV_INDIRECT:
+         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
+
        default:
           unreachable("Unsupported opcode");
        }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp

index b2335bdb8df8c755167b624db7eebdc579b95bfe..46cbbfaa590211297f149cc0e25efa604038b6d9 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -132,15 +132,6 @@ void
  vec4_visitor::nir_setup_uniforms()
  {
     uniforms = nir->num_uniforms / 16;
-
-   nir_foreach_variable(var, &nir->uniforms) {
-      /* UBO's and atomics don't take up space in the uniform file */
-      if (var->interface_type != NULL || var->type->contains_atomic())
-         continue;
-
-      if (type_size_vec4(var->type) > 0)
-         uniform_size[var->data.driver_location / 16] = type_size_vec4(var->type);
-   }
  }
  
  void
@@ -710,12 +701,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           /* Offsets are in bytes but they should always be multiples of 16 */
           assert(const_offset->u[0] % 16 == 0);
           src.reg_offset = const_offset->u[0] / 16;
+
+         emit(MOV(dest, src));
        } else {
-         src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1);
-         src.reladdr = new(mem_ctx) src_reg(tmp);
-      }
+         src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
  
-      emit(MOV(dest, src));
+         emit(SHADER_OPCODE_MOV_INDIRECT, dest, src,
+              indirect, brw_imm_ud(instr->const_index[1]));
+      }
        break;
     }
  
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp

index fb6ca8ee5f9f2718eefbc5f1195a8efd6307aaba..3d83152d365bde78fd9f2a017d7f41dfd0339d9e 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -59,8 +59,6 @@ vec4_tcs_visitor::emit_nir_code()
         * copies VS outputs to TES inputs.
         */
        uniforms = 2;
-      uniform_size[0] = 1;
-      uniform_size[1] = 1;
  
        uint64_t varyings = key->outputs_written;
  
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

index 529498be77a0a8112e35455b47bfc4a5a1edb1fb..0c5bfb8579cb1e79cf0be2987b40e346bb60162a 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1468,27 +1468,6 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
     }
  }
  
-src_reg
-vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
-                                      src_reg *reladdr, int reg_offset)
-{
-   if (reladdr) {
-      src_reg index = src_reg(this, glsl_type::int_type);
-
-      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
-                                   brw_imm_d(reg_offset * 16)));
-
-      return index;
-   } else if (devinfo->gen >= 8) {
-      /* Store the offset in a GRF so we can send-from-GRF. */
-      src_reg offset = src_reg(this, glsl_type::int_type);
-      emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
-      return offset;
-   } else {
-      return brw_imm_d(reg_offset * 16);
-   }
-}
-
  /**
   * Emits an instruction before @inst to load the value named by @orig_src
   * from scratch space at @base_offset to @temp.
@@ -1666,12 +1645,24 @@ vec4_visitor::move_grf_array_access_to_scratch()
  void
  vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                       dst_reg temp, src_reg orig_src,
-                                     int base_offset)
+                                     int base_offset, src_reg indirect)
  {
     int reg_offset = base_offset + orig_src.reg_offset;
     const unsigned index = prog_data->base.binding_table.pull_constants_start;
-   src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
-                                             reg_offset);
+
+   src_reg offset;
+   if (indirect.file != BAD_FILE) {
+      offset = src_reg(this, glsl_type::int_type);
+
+      emit_before(block, inst, ADD(dst_reg(offset), indirect,
+                                   brw_imm_d(reg_offset * 16)));
+   } else if (devinfo->gen >= 8) {
+      /* Store the offset in a GRF so we can send-from-GRF. */
+      offset = src_reg(this, glsl_type::int_type);
+      emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
+   } else {
+      offset = brw_imm_d(reg_offset * 16);
+   }
  
     emit_pull_constant_load_reg(temp,
                                 brw_imm_ud(index),
@@ -1698,59 +1689,55 @@ vec4_visitor::move_uniform_array_access_to_pull_constants()
  {
     int pull_constant_loc[this->uniforms];
     memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
-   bool nested_reladdr;
  
-   /* Walk through and find array access of uniforms.  Put a copy of that
-    * uniform in the pull constant buffer.
-    *
-    * Note that we don't move constant-indexed accesses to arrays.  No
-    * testing has been done of the performance impact of this choice.
+   /* First, walk through the instructions and determine which things need to
+    * be pulled.  We mark something as needing to be pulled by setting
+    * pull_constant_loc to 0.
      */
-   do {
-      nested_reladdr = false;
-
-      foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-         for (int i = 0 ; i < 3; i++) {
-            if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
-               continue;
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
  
-            int uniform = inst->src[i].nr;
+      int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
  
-            if (inst->src[i].reladdr->reladdr)
-               nested_reladdr = true;  /* will need another pass */
+      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
+         pull_constant_loc[uniform_nr + j] = 0;
+   }
  
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               const gl_constant_value **values =
-                  &stage_prog_data->param[uniform * 4];
+   /* Next, we walk the list of uniforms and assign real pull constant
+    * locations and set their corresponding entries in pull_param.
+    */
+   for (int j = 0; j < this->uniforms; j++) {
+      if (pull_constant_loc[j] < 0)
+         continue;
  
-               pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
+      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
  
-               assert(uniform < uniform_array_size);
-               for (int j = 0; j < uniform_size[uniform] * 4; j++) {
-                  stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
-                     = values[j];
-               }
-            }
+      for (int i = 0; i < 4; i++) {
+         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
+            = stage_prog_data->param[j * 4 + i];
+      }
+   }
  
-            /* Set up the annotation tracking for new generated instructions. */
-            base_ir = inst->ir;
-            current_annotation = inst->annotation;
+   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
+    * instructions to actual uniform pulls.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
  
-            dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+      int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
  
-            emit_pull_constant_load(block, inst, temp, inst->src[i],
-                                    pull_constant_loc[uniform]);
+      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
  
-            inst->src[i].file = temp.file;
-            inst->src[i].nr = temp.nr;
-            inst->src[i].reg_offset = temp.reg_offset;
-            inst->src[i].reladdr = NULL;
-         }
-      }
-   } while (nested_reladdr);
+      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
+                              pull_constant_loc[uniform_nr], inst->src[1]);
+      inst->remove(block);
+   }
  
     /* Now there are no accesses of the UNIFORM file with a reladdr, so
      * no need to track them as larger-than-vec4 objects.  This will be
@@ -1803,17 +1790,6 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
     this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
  
     this->uniforms = 0;
-
-   /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
-    * at least one. See setup_uniforms() in brw_vec4.cpp.
-    */
-   this->uniform_array_size = 1;
-   if (prog_data) {
-      this->uniform_array_size =
-         MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
-   }
-
-   this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
  }
  
  vec4_visitor::~vec4_visitor()
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp

index 1d6914902b391f21143cb3ff86ba90fc83a9b9f1..86701f3fbd82a5883c8bc7814ebf2e0f060ddaeb 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -270,7 +270,6 @@ void
  vec4_vs_visitor::setup_uniform_clipplane_values()
  {
     for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
-      assert(this->uniforms < uniform_array_size);
        this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
        this->userplane[i].type = BRW_REGISTER_TYPE_F;
        for (int j = 0; j < 4; ++j) {
author	Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>
	Thu, 21 Jan 2016 19:07:09 +0000 (11:07 -0800)
committer	Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>
	Thu, 21 Jan 2016 19:09:58 +0000 (11:09 -0800)
src/glsl/nir/nir_intrinsics.h		patch \| blob \| history
src/glsl/nir/nir_lower_io.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_generator.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_nir.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_ir_fs.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp		patch \| blob \| history