i965/fs: detect different bit-size accesses to uniforms to push them in proper locations
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
index 977fd8c35f592ea32591355bce06d7fd410daeff..68e73cc5cd801a672bbb73e884444c5f9e6556d6 100644 (file)
@@ -462,6 +462,8 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_BOOL:
       return type->components();
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
       return type->components() * 2;
    case GLSL_TYPE_ARRAY:
       return type_size_scalar(type->fields.array) * type->length;
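For context, type_size_scalar() measures types in 32-bit slots, so the two new cases make 64-bit integers count two slots per component, just like doubles. A minimal sketch of the counting rule (the helper name is made up for illustration; this is not the driver's code):

   /* Illustrative only: slot counting in the scalar backend, where one
    * slot is 4 bytes and 64-bit components occupy two slots each.
    */
   static unsigned
   scalar_uniform_slots(unsigned components, unsigned bit_size)
   {
      return components * (bit_size == 64 ? 2 : 1);
   }

   /* e.g. a dvec3 or i64vec3 takes scalar_uniform_slots(3, 64) == 6 slots,
    * while a vec3 takes scalar_uniform_slots(3, 32) == 3.
    */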
@@ -492,19 +494,6 @@ type_size_scalar(const struct glsl_type *type)
    return 0;
 }
 
-/* Attribute arrays are loaded as one vec4 per element (or matrix column),
- * except for double-precision types, which are loaded as one dvec4.
- */
-extern "C" int
-type_size_vs_input(const struct glsl_type *type)
-{
-   if (type->is_double()) {
-      return type_size_dvec4(type);
-   } else {
-      return type_size_vec4(type);
-   }
-}
-
 /**
  * Create a MOV to read the timestamp register.
  *
@@ -1864,7 +1853,10 @@ fs_visitor::compact_virtual_grfs()
 }
 
 static void
-set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
+                           unsigned *max_chunk_bitsize,
+                           bool contiguous, unsigned bitsize,
+                           const unsigned target_bitsize,
                            int *push_constant_loc, int *pull_constant_loc,
                            unsigned *num_push_constants,
                            unsigned *num_pull_constants,
@@ -1876,11 +1868,23 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
    if (*chunk_start < 0)
       *chunk_start = uniform;
 
+   /* Track the widest access in this chunk of contiguous uniforms */
+   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
+
    /* If this element does not need to be contiguous with the next, we
     * split at this point and everything between chunk_start and u forms a
     * single chunk.
     */
    if (!contiguous) {
+      /* If the chunk's bit size doesn't match the target, skip it */
+      if (*max_chunk_bitsize != target_bitsize) {
+         /* FIXME: right now we only support 32- and 64-bit accesses */
+         assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
+         *max_chunk_bitsize = 0;
+         *chunk_start = -1;
+         return;
+      }
+
       unsigned chunk_size = uniform - *chunk_start + 1;
 
       /* Decide whether we should push or pull this parameter.  In the
@@ -1898,6 +1902,7 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
             pull_constant_loc[j] = (*num_pull_constants)++;
       }
 
+      *max_chunk_bitsize = 0;
       *chunk_start = -1;
    }
 }
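The effect of the new max_chunk_bitsize tracking is easiest to see on a small trace. A worked example under assumed inputs, four live uniform slots where the first two hold a double:

   /* Assumed state for illustration:
    *
    *    bitsize_access[] = { 8, 8, 4, 4 }   (slots 0-1: double, 2-3: floats)
    *    contiguous[]     = { 1, 0, 0, 0 }
    *
    * Pass 1 (target_bitsize == 8): chunk {0,1} ends with max_chunk_bitsize
    * == 8, matches the target and gets push/pull locations. Chunks {2} and
    * {3} end with max_chunk_bitsize == 4, hit the early return above and
    * are left for the next pass.
    *
    * Pass 2 (target_bitsize == 4): chunk {0,1} now mismatches (8 != 4) and
    * is skipped; chunks {2} and {3} get their locations. Net result: every
    * 64-bit chunk is laid out before any 32-bit chunk.
    */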
@@ -1920,8 +1925,8 @@ fs_visitor::assign_constant_locations()
 
    bool is_live[uniforms];
    memset(is_live, 0, sizeof(is_live));
-   bool is_live_64bit[uniforms];
-   memset(is_live_64bit, 0, sizeof(is_live_64bit));
+   unsigned bitsize_access[uniforms];
+   memset(bitsize_access, 0, sizeof(bitsize_access));
 
    /* For each uniform slot, a value of true indicates that the given slot and
     * the next slot must remain contiguous.  This is used to keep us from
@@ -1958,20 +1963,18 @@ fs_visitor::assign_constant_locations()
             for (unsigned j = constant_nr; j < last; j++) {
                is_live[j] = true;
                contiguous[j] = true;
-               if (type_sz(inst->src[i].type) == 8) {
-                  is_live_64bit[j] = true;
-               }
+               bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type));
             }
             is_live[last] = true;
+            bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type));
          } else {
             if (constant_nr >= 0 && constant_nr < (int) uniforms) {
                int regs_read = inst->components_read(i) *
                   type_sz(inst->src[i].type) / 4;
                for (int j = 0; j < regs_read; j++) {
                   is_live[constant_nr + j] = true;
-                  if (type_sz(inst->src[i].type) == 8) {
-                     is_live_64bit[constant_nr + j] = true;
-                  }
+                  bitsize_access[constant_nr + j] =
+                     MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type));
                }
             }
          }
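bitsize_access[] generalizes the old is_live_64bit[] flag: rather than a boolean, each slot records the widest access that touches it, taking MAX2 over every source that reads the slot. The slot-count arithmetic in the direct-access branch, with assumed numbers:

   /* Assumed numbers for illustration: a DF-typed source (type_sz == 8)
    * with components_read(i) == 2 spans
    *
    *    regs_read = 2 * 8 / 4 = 4
    *
    * 32-bit slots, and each of those four slots has bitsize_access[] raised
    * to 8. A later 4-byte read of the same slots leaves them at 8, since
    * MAX2 only ever widens the recorded access.
    */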
@@ -2010,13 +2013,17 @@ fs_visitor::assign_constant_locations()
    memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
 
    int chunk_start = -1;
+   unsigned max_chunk_bitsize = 0;
 
    /* First push 64-bit uniforms to ensure they are properly aligned */
+   const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || !is_live_64bit[u])
+      if (!is_live[u])
          continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_64_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
@@ -2025,15 +2032,18 @@ fs_visitor::assign_constant_locations()
    }
 
    /* Then push the rest of the uniforms */
+   const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || is_live_64bit[u])
+      if (!is_live[u])
          continue;
 
       /* Skip thread_local_id_index to put it in the last push register. */
       if (thread_local_id_index == (int)u)
          continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_32_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
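The two loops differ only in their target bit size: type_sz(BRW_REGISTER_TYPE_DF) is 8 and type_sz(BRW_REGISTER_TYPE_F) is 4. Running the 64-bit pass first keeps doubles naturally aligned in the push buffer; an illustrative layout:

   /* Illustrative push layout after both passes (indices are 32-bit slots):
    *
    *    pass 1: dvec2 uniform -> push_constant_loc = { 0, 1, 2, 3 }
    *    pass 2: float uniform -> push_constant_loc = { 4 }
    *
    * Assigning the float in between would leave the double's halves
    * straddling a misaligned offset; handling all 8-byte chunks first
    * rules that out.
    */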
@@ -2111,25 +2121,22 @@ fs_visitor::lower_constant_loads()
          if (pull_index == -1)
            continue;
 
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst;
-
-         if (type_sz(inst->src[i].type) <= 4)
-            dst = vgrf(glsl_type::float_type);
-         else
-            dst = vgrf(glsl_type::double_type);
-
          assert(inst->src[i].stride == 0);
 
-         const fs_builder ubld = ibld.exec_all().group(4, 0);
-         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         const unsigned base = pull_index * 4;
+
          ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                   dst, brw_imm_ud(index), offset);
+                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
 
          /* Rewrite the instruction to use the temporary VGRF. */
          inst->src[i].file = VGRF;
          inst->src[i].nr = dst.nr;
-         inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4;
+         inst->src[i].offset = (base & (block_sz - 1)) +
+                               inst->src[i].offset % 4;
 
          brw_mark_surface_used(prog_data, index);
       }
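The rewritten load fetches a whole cacheline per pull: the builder runs block_sz / 4 == 16 UD channels, the send takes a block-aligned byte offset, and the residue moves into the source's register offset. The address arithmetic with assumed numbers:

   /* Assumed numbers for illustration: pull_index == 5, i.e. bytes 20..23
    * of the constant buffer.
    *
    *    base                = 5 * 4    = 20
    *    send offset         = 20 & ~63 = 0     (fetch bytes 0..63)
    *    inst->src[i].offset = (20 & 63) + old_offset % 4
    *                        = 20 + sub-dword residue
    *
    * The instruction then reads its value 20 bytes into the fetched VGRF,
    * wherever in the cacheline the constant happens to live.
    */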
@@ -5941,15 +5948,15 @@ fs_visitor::run_tcs_single_patch()
    }
 
    /* Fix the dispatch mask */
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
       bld.CMP(bld.null_reg_ud(), invocation_id,
-              brw_imm_ud(nir->info->tcs.vertices_out), BRW_CONDITIONAL_L);
+              brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
       bld.IF(BRW_PREDICATE_NORMAL);
    }
 
    emit_nir_code();
 
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
       bld.emit(BRW_OPCODE_ENDIF);
    }
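Only the info field moved (tcs.vertices_out becomes tess.tcs_vertices_out); the masking itself is unchanged. For reference, with an assumed tcs_vertices_out of 3, a SIMD8 single-patch dispatch still enables eight channels, so the CMP/IF pair predicates the extra ones off around the body (pseudo-assembly, illustrative only):

   /* tcs_vertices_out == 3:
    *
    *    cmp.l  null  invocation_id  3    (flag set for channels 0..2)
    *    if                               (body runs on channels 0..2 only)
    *       ...emit_nir_code()...
    *    endif
    */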
 
@@ -6398,15 +6405,21 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str)
 {
+   const struct gen_device_info *devinfo = compiler->devinfo;
+
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
-   brw_nir_lower_fs_inputs(shader, vue_map, prog, compiler->devinfo, key);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_fs_inputs(shader, devinfo, key);
    brw_nir_lower_fs_outputs(shader);
+
+   if (devinfo->gen < 6) {
+      brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
+   }
+
    if (!key->multisample_fbo)
       NIR_PASS_V(shader, demote_sample_qualifiers);
    NIR_PASS_V(shader, move_interpolation_to_top);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    /* key->alpha_test_func means simulating alpha testing via discards,
     * so the shader definitely kills pixels.
@@ -6631,8 +6644,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
    brw_nir_lower_cs_shared(shader);
    prog_data->base.total_shared += shader->num_shared;
 
@@ -6645,7 +6657,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
            (unsigned)4 * (prog_data->thread_local_id_index + 1));
 
    brw_nir_lower_intrinsics(shader, &prog_data->base);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    prog_data->local_size[0] = shader->info->cs.local_size[0];
    prog_data->local_size[1] = shader->info->cs.local_size[1];