nir/i965: use two slots from inputs_read for dvec3/dvec4 vertex input attributes

author Juan A. Suarez Romero <jasuarez@igalia.com>

Fri, 16 Dec 2016 09:24:43 +0000 (10:24 +0100)

committer Juan A. Suarez Romero <jasuarez@igalia.com>

Mon, 9 Jan 2017 09:42:22 +0000 (10:42 +0100)
author Juan A. Suarez Romero <jasuarez@igalia.com>
Fri, 16 Dec 2016 09:24:43 +0000 (10:24 +0100)
committer Juan A. Suarez Romero <jasuarez@igalia.com>
Mon, 9 Jan 2017 09:42:22 +0000 (10:42 +0100)
diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp

index 69d4c2b20c694f1b5c48b246f87af567169cf012..33f71bf416a7cbdf0a6d37e38cdc087f24e6b75a 100644 (file)
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -129,6 +129,19 @@ private:
  
  } /* end of anonymous namespace */
  
+static void
+nir_remap_attributes(nir_shader *shader)
+{
+   nir_foreach_variable(var, &shader->inputs) {
+      var->data.location += _mesa_bitcount_64(shader->info->double_inputs_read &
+                                              BITFIELD64_MASK(var->data.location));
+   }
+
+   /* Once the remap is done, reset double_inputs_read so that it can later
+    * record which locations/slots hold doubles */
+   shader->info->double_inputs_read = 0;
+}
+
  nir_shader *
  glsl_to_nir(const struct gl_shader_program *shader_prog,
              gl_shader_stage stage,
@@ -146,6 +159,13 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
  
     nir_lower_constant_initializers(shader, (nir_variable_mode)~0);
  
+   /* Remap the locations to slots so those requiring two slots will occupy
+    * two locations. For instance, if we have in the IR code a dvec3 attr0 in
+    * location 0 and vec4 attr1 in location 1, in NIR attr0 will use
+    * locations/slots 0 and 1, and attr1 will use location/slot 2 */
+   if (shader->stage == MESA_SHADER_VERTEX)
+      nir_remap_attributes(shader);
+
     shader->info->name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
     if (shader_prog->Label)
        shader->info->label = ralloc_strdup(shader, shader_prog->Label);
@@ -322,6 +342,14 @@ nir_visitor::visit(ir_variable *ir)
              var->data.compact = ir->type->without_array()->is_scalar();
           }
        }
+
+      /* Mark all the locations that require two slots */
+      if (glsl_type_is_dual_slot(glsl_without_array(var->type))) {
+         for (uint i = 0; i < glsl_count_attribute_slots(var->type, true); i++) {
+            uint64_t bitfield = BITFIELD64_BIT(var->data.location + i);
+            shader->info->double_inputs_read |= bitfield;
+         }
+      }
        break;
  
     case ir_var_shader_out:
diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c

index 07c994971461b73c033efa41b2e654d5d2dfc315..35a1ce4dec6a3784acfa3469b810112a17e98049 100644 (file)
--- a/src/compiler/nir/nir_gather_info.c
+++ b/src/compiler/nir/nir_gather_info.c
@@ -53,11 +53,6 @@ set_io_mask(nir_shader *shader, nir_variable *var, int offset, int len)
           else
              shader->info->inputs_read |= bitfield;
  
-         /* double inputs read is only for vertex inputs */
-         if (shader->stage == MESA_SHADER_VERTEX &&
-             glsl_type_is_dual_slot(glsl_without_array(var->type)))
-            shader->info->double_inputs_read |= bitfield;
-
           if (shader->stage == MESA_SHADER_FRAGMENT) {
              shader->info->fs.uses_sample_qualifier |= var->data.sample;
           }
@@ -83,26 +78,21 @@ static void
  mark_whole_variable(nir_shader *shader, nir_variable *var)
  {
     const struct glsl_type *type = var->type;
-   bool is_vertex_input = false;
  
     if (nir_is_per_vertex_io(var, shader->stage)) {
        assert(glsl_type_is_array(type));
        type = glsl_get_array_element(type);
     }
  
-   if (shader->stage == MESA_SHADER_VERTEX &&
-       var->data.mode == nir_var_shader_in)
-      is_vertex_input = true;
-
     const unsigned slots =
        var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
-                        : glsl_count_attribute_slots(type, is_vertex_input);
+                        : glsl_count_attribute_slots(type, false);
  
     set_io_mask(shader, var, 0, slots);
  }
  
  static unsigned
-get_io_offset(nir_deref_var *deref, bool is_vertex_input)
+get_io_offset(nir_deref_var *deref)
  {
     unsigned offset = 0;
  
@@ -117,7 +107,7 @@ get_io_offset(nir_deref_var *deref, bool is_vertex_input)
              return -1;
           }
  
-         offset += glsl_count_attribute_slots(tail->type, is_vertex_input) *
+         offset += glsl_count_attribute_slots(tail->type, false) *
              deref_array->base_offset;
        }
        /* TODO: we can get the offset for structs here see nir_lower_io() */
@@ -163,12 +153,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref)
        return false;
     }
  
-   bool is_vertex_input = false;
-   if (shader->stage == MESA_SHADER_VERTEX &&
-       var->data.mode == nir_var_shader_in)
-      is_vertex_input = true;
-
-   unsigned offset = get_io_offset(deref, is_vertex_input);
+   unsigned offset = get_io_offset(deref);
     if (offset == -1)
        return false;
  
@@ -184,8 +169,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref)
     }
  
     /* double element width for double types that takes two slots */
-   if (!is_vertex_input &&
-       glsl_type_is_dual_slot(glsl_without_array(type))) {
+   if (glsl_type_is_dual_slot(glsl_without_array(type))) {
        elem_width *= 2;
     }
  
@@ -220,13 +204,27 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader)
     case nir_intrinsic_interp_var_at_sample:
     case nir_intrinsic_interp_var_at_offset:
     case nir_intrinsic_load_var:
-   case nir_intrinsic_store_var:
-      if (instr->variables[0]->var->data.mode == nir_var_shader_in ||
-          instr->variables[0]->var->data.mode == nir_var_shader_out) {
+   case nir_intrinsic_store_var: {
+      nir_variable *var = instr->variables[0]->var;
+
+      if (var->data.mode == nir_var_shader_in ||
+          var->data.mode == nir_var_shader_out) {
           if (!try_mask_partial_io(shader, instr->variables[0]))
-            mark_whole_variable(shader, instr->variables[0]->var);
+            mark_whole_variable(shader, var);
+
+         /* We need to track which inputs_read bits correspond to a
+          * dvec3/dvec4 input attribute */
+         if (shader->stage == MESA_SHADER_VERTEX &&
+             var->data.mode == nir_var_shader_in &&
+             glsl_type_is_dual_slot(glsl_without_array(var->type))) {
+            for (uint i = 0; i < glsl_count_attribute_slots(var->type, false); i++) {
+               int idx = var->data.location + i;
+               shader->info->double_inputs_read |= BITFIELD64_BIT(idx);
+            }
+         }
        }
        break;
+   }
  
     case nir_intrinsic_load_draw_id:
     case nir_intrinsic_load_front_face:
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c

index 9ff84cd2921257d19d36826fd6f01523c98671a2..c3feb115bb224f6f36c7171da5c38ea6aae0494c 100644 (file)
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -42,9 +42,35 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
     default: unreachable("Invalid component");
     }
  
+   /*
+    * Take into account hardware restrictions when dealing with 64-bit floats.
+    *
+    * From Broadwell spec, command reference structures, page 586:
+    *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
+    *   64-bit components are stored in the URB without any conversion. In
+    *   this case, vertex elements must be written as 128 or 256 bits, with
+    *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
+    *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
+    *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
+    *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
+    *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
+    *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
+    *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
+    *   256-bit vertex element."
+    */
     if (bits) {
        return VFCOMP_STORE_SRC;
-   } else if (comp < 3) {
+   } else if (comp >= 2 &&
+              !isl_format_layouts[format].channels.b.bits &&
+              isl_format_layouts[format].channels.r.type == ISL_RAW) {
+      /* When emitting 64-bit attributes, we need to write either 128 or 256
+       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
+       * VFCOMP_STORE_0 to pad the written chunk */
+      return VFCOMP_NOSTORE;
+   } else if (comp < 3 ||
+              isl_format_layouts[format].channels.r.type == ISL_RAW) {
+      /* Note we need to pad with value 0, not 1, due to hardware
+       * restrictions (see comment above) */
        return VFCOMP_STORE_0;
     } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
              isl_format_layouts[format].channels.r.type == ISL_SINT) {
@@ -64,8 +90,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
  
     /* Pull inputs_read out of the VS prog data */
     const uint64_t inputs_read = vs_prog_data->inputs_read;
+   const uint64_t double_inputs_read = vs_prog_data->double_inputs_read;
     assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
     const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
+   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
  
  #if GEN_GEN >= 8
     /* On BDW+, we only need to allocate space for base ids.  Setting up
@@ -83,13 +111,16 @@ emit_vertex_input(struct anv_pipeline *pipeline,
                                  vs_prog_data->uses_baseinstance;
  #endif
  
-   uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem;
-   if (elem_count == 0)
+   uint32_t elem_count = __builtin_popcount(elements) -
+      __builtin_popcount(elements_double) / 2;
+
+   uint32_t total_elems = elem_count + needs_svgs_elem;
+   if (total_elems == 0)
        return;
  
     uint32_t *p;
  
-   const uint32_t num_dwords = 1 + elem_count * 2;
+   const uint32_t num_dwords = 1 + total_elems * 2;
     p = anv_batch_emitn(&pipeline->batch, num_dwords,
                         GENX(3DSTATE_VERTEX_ELEMENTS));
     memset(p + 1, 0, (num_dwords - 1) * 4);
@@ -107,7 +138,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
        if ((elements & (1 << desc->location)) == 0)
           continue; /* Binding unused */
  
-      uint32_t slot = __builtin_popcount(elements & ((1 << desc->location) - 1));
+      uint32_t slot =
+         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
+         DIV_ROUND_UP(__builtin_popcount(elements_double &
+                                        ((1 << desc->location) -1)), 2);
  
        struct GENX(VERTEX_ELEMENT_STATE) element = {
           .VertexBufferIndex = desc->binding,
@@ -137,7 +171,7 @@ emit_vertex_input(struct anv_pipeline *pipeline,
  #endif
     }
  
-   const uint32_t id_slot = __builtin_popcount(elements);
+   const uint32_t id_slot = elem_count;
     if (needs_svgs_elem) {
        /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
         *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c

index 578156459240b7c7ff9ba2aabf8edee8659a402d..b7527f2cd9bda9edbd7efdcb3b867a8c7c4a4de4 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -481,11 +481,16 @@ brw_prepare_vertices(struct brw_context *brw)
     /* Accumulate the list of enabled arrays. */
     brw->vb.nr_enabled = 0;
     while (vs_inputs) {
-      GLuint index = ffsll(vs_inputs) - 1;
+      GLuint first = ffsll(vs_inputs) - 1;
+      GLuint index =
+         first - DIV_ROUND_UP(_mesa_bitcount_64(vs_prog_data->double_inputs_read &
+                                                BITFIELD64_MASK(first)), 2);
        struct brw_vertex_element *input = &brw->vb.inputs[index];
        input->is_dual_slot = brw->gen >= 8 &&
-         (vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) != 0;
-      vs_inputs &= ~BITFIELD64_BIT(index);
+         (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) != 0;
+      vs_inputs &= ~BITFIELD64_BIT(first);
+      if (input->is_dual_slot)
+         vs_inputs &= ~BITFIELD64_BIT(first + 1);
        brw->vb.enabled[brw->vb.nr_enabled++] = input;
     }
  
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index c8a069386dd1bafd581768d882378dbc3a259b09..03f9c24d151da52a6cb3bc720129988104d2420f 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -492,19 +492,6 @@ type_size_scalar(const struct glsl_type *type)
     return 0;
  }
  
-/* Attribute arrays are loaded as one vec4 per element (or matrix column),
- * except for double-precision types, which are loaded as one dvec4.
- */
-extern "C" int
-type_size_vs_input(const struct glsl_type *type)
-{
-   if (type->is_double()) {
-      return type_size_dvec4(type);
-   } else {
-      return type_size_vec4(type);
-   }
-}
-
  /**
   * Create a MOV to read the timestamp register.
   *
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

index 3775e6c4a094638c5a56c7d4ef0ddce29abc69ce..cea38d86237e61df5a38641332a2f239234dc091 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -36,8 +36,7 @@ fs_reg *
  fs_visitor::emit_vs_system_value(int location)
  {
     fs_reg *reg = new(this->mem_ctx)
-      fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
-                        _mesa_bitcount_64(nir->info->double_inputs_read)),
+      fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
               BRW_REGISTER_TYPE_D);
     struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
  
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c

index 3645f48777a9b678e242d6def5b02b764a9abd80..2d2fce28eefab22d74550e0d3745a266b58c0ac5 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -113,9 +113,7 @@ remap_vs_attrs(nir_block *block, shader_info *nir_info)
           int attr = intrin->const_index[0];
           int slot = _mesa_bitcount_64(nir_info->inputs_read &
                                        BITFIELD64_MASK(attr));
-         int dslot = _mesa_bitcount_64(nir_info->double_inputs_read &
-                                       BITFIELD64_MASK(attr));
-         intrin->const_index[0] = 4 * (slot + dslot);
+         intrin->const_index[0] = 4 * slot;
        }
     }
     return true;
@@ -268,7 +266,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
      * loaded as one vec4 or dvec4 per element (or matrix column), depending on
      * whether it is a double-precision type or not.
      */
-   nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0);
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
  
     /* This pass needs actual constants */
     nir_opt_constant_folding(nir);
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h

index f713d47b40eb7fabdcf8c59caf6d407331d08c9f..ecb4118980605f37e257ae95b90be983b3405c61 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -34,7 +34,6 @@ extern "C" {
  int type_size_scalar(const struct glsl_type *type);
  int type_size_vec4(const struct glsl_type *type);
  int type_size_dvec4(const struct glsl_type *type);
-int type_size_vs_input(const struct glsl_type *type);
  
  static inline int
  type_size_scalar_bytes(const struct glsl_type *type)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp

index b5e846d7cf56e6df43009670168349ff6f503c74..5ddbe580d5ae2e9375773c27d0b3b2163ce86fca 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -2737,7 +2737,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
        ((1 << shader->info->cull_distance_array_size) - 1) <<
        shader->info->clip_distance_array_size;
  
-   unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
+   unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read);
  
     /* gl_VertexID and gl_InstanceID are system values, but arrive via an
      * incoming vertex attribute.  So, add an extra slot.
@@ -2747,18 +2747,17 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
          BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
          BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
          BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
-      nr_attributes++;
+      nr_attribute_slots++;
     }
  
     /* gl_DrawID has its very own vec4 */
     if (shader->info->system_values_read &
         BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
-      nr_attributes++;
+      nr_attribute_slots++;
     }
  
-   unsigned nr_attribute_slots =
-      nr_attributes +
-      _mesa_bitcount_64(shader->info->double_inputs_read);
+   unsigned nr_attributes = nr_attribute_slots -
+      DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2);
  
     /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
      * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
author	Juan A. Suarez Romero <jasuarez@igalia.com>
	Fri, 16 Dec 2016 09:24:43 +0000 (10:24 +0100)
committer	Juan A. Suarez Romero <jasuarez@igalia.com>
	Mon, 9 Jan 2017 09:42:22 +0000 (10:42 +0100)
src/compiler/glsl/glsl_to_nir.cpp		patch \| blob \| history
src/compiler/nir/nir_gather_info.c		patch \| blob \| history
src/intel/vulkan/genX_pipeline.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_draw_upload.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_nir.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_nir.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_vec4.cpp		patch \| blob \| history