i965/fs: detect different bit-size accesses to uniforms to push them in proper locations
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
index 977fd8c35f592ea32591355bce06d7fd410daeff..68e73cc5cd801a672bbb73e884444c5f9e6556d6 100644 (file)
@@ -462,6 +462,8 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_BOOL:
       return type->components();
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
       return type->components() * 2;
    case GLSL_TYPE_ARRAY:
       return type_size_scalar(type->fields.array) * type->length;
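For context, type_size_scalar() measures types in 32-bit slots, so the two new cases make 64-bit integers count two slots per component, just like doubles. A minimal sketch of the counting rule (the helper name is made up for illustration; this is not the driver's code):

   /* Illustrative only: slot counting in the scalar backend, where one
    * slot is 4 bytes and 64-bit components occupy two slots each.
    */
   static unsigned
   scalar_uniform_slots(unsigned components, unsigned bit_size)
   {
      return components * (bit_size == 64 ? 2 : 1);
   }

   /* e.g. a dvec3 or i64vec3 takes scalar_uniform_slots(3, 64) == 6 slots,
    * while a vec3 takes scalar_uniform_slots(3, 32) == 3.
    */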
@@ -492,19 +494,6 @@ type_size_scalar(const struct glsl_type *type)
    return 0;
 }
 
-/* Attribute arrays are loaded as one vec4 per element (or matrix column),
- * except for double-precision types, which are loaded as one dvec4.
- */
-extern "C" int
-type_size_vs_input(const struct glsl_type *type)
-{
-   if (type->is_double()) {
-      return type_size_dvec4(type);
-   } else {
-      return type_size_vec4(type);
-   }
-}
-
 /**
  * Create a MOV to read the timestamp register.
  *
@@ -1864,7 +1853,10 @@ fs_visitor::compact_virtual_grfs()
 }
 
 static void
-set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
+                           unsigned *max_chunk_bitsize,
+                           bool contiguous, unsigned bitsize,
+                           const unsigned target_bitsize,
                            int *push_constant_loc, int *pull_constant_loc,
                            unsigned *num_push_constants,
                            unsigned *num_pull_constants,
@@ -1876,11 +1868,23 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
    if (*chunk_start < 0)
       *chunk_start = uniform;
 
+   /* Track the widest access in this chunk of contiguous uniforms */
+   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
+
    /* If this element does not need to be contiguous with the next, we
     * split at this point and everything between chunk_start and u forms a
     * single chunk.
     */
    if (!contiguous) {
+      /* If the chunk's bit size doesn't match the target, skip it */
+      if (*max_chunk_bitsize != target_bitsize) {
+         /* FIXME: right now we only support 32- and 64-bit accesses */
+         assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
+         *max_chunk_bitsize = 0;
+         *chunk_start = -1;
+         return;
+      }
+
       unsigned chunk_size = uniform - *chunk_start + 1;
 
       /* Decide whether we should push or pull this parameter.  In the
@@ -1898,6 +1902,7 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
             pull_constant_loc[j] = (*num_pull_constants)++;
       }
 
+      *max_chunk_bitsize = 0;
       *chunk_start = -1;
    }
 }
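The effect of the new max_chunk_bitsize tracking is easiest to see on a small trace. A worked example under assumed inputs, four live uniform slots where the first two hold a double:

   /* Assumed state for illustration:
    *
    *    bitsize_access[] = { 8, 8, 4, 4 }   (slots 0-1: double, 2-3: floats)
    *    contiguous[]     = { 1, 0, 0, 0 }
    *
    * Pass 1 (target_bitsize == 8): chunk {0,1} ends with max_chunk_bitsize
    * == 8, matches the target and gets push/pull locations. Chunks {2} and
    * {3} end with max_chunk_bitsize == 4, hit the early return above and
    * are left for the next pass.
    *
    * Pass 2 (target_bitsize == 4): chunk {0,1} now mismatches (8 != 4) and
    * is skipped; chunks {2} and {3} get their locations. Net result: every
    * 64-bit chunk is laid out before any 32-bit chunk.
    */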
@@ -1920,8 +1925,8 @@ fs_visitor::assign_constant_locations()
 
    bool is_live[uniforms];
    memset(is_live, 0, sizeof(is_live));
-   bool is_live_64bit[uniforms];
-   memset(is_live_64bit, 0, sizeof(is_live_64bit));
+   unsigned bitsize_access[uniforms];
+   memset(bitsize_access, 0, sizeof(bitsize_access));
 
    /* For each uniform slot, a value of true indicates that the given slot and
     * the next slot must remain contiguous.  This is used to keep us from
@@ -1958,20 +1963,18 @@ fs_visitor::assign_constant_locations()
             for (unsigned j = constant_nr; j < last; j++) {
                is_live[j] = true;
                contiguous[j] = true;
-               if (type_sz(inst->src[i].type) == 8) {
-                  is_live_64bit[j] = true;
-               }
+               bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type));
             }
             is_live[last] = true;
+            bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type));
          } else {
             if (constant_nr >= 0 && constant_nr < (int) uniforms) {
                int regs_read = inst->components_read(i) *
                   type_sz(inst->src[i].type) / 4;
                for (int j = 0; j < regs_read; j++) {
                   is_live[constant_nr + j] = true;
-                  if (type_sz(inst->src[i].type) == 8) {
-                     is_live_64bit[constant_nr + j] = true;
-                  }
+                  bitsize_access[constant_nr + j] =
+                     MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type));
                }
             }
          }
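bitsize_access[] generalizes the old is_live_64bit[] flag: rather than a boolean, each slot records the widest access that touches it, taking MAX2 over every source that reads the slot. The slot-count arithmetic in the direct-access branch, with assumed numbers:

   /* Assumed numbers for illustration: a DF-typed source (type_sz == 8)
    * with components_read(i) == 2 spans
    *
    *    regs_read = 2 * 8 / 4 = 4
    *
    * 32-bit slots, and each of those four slots has bitsize_access[] raised
    * to 8. A later 4-byte read of the same slots leaves them at 8, since
    * MAX2 only ever widens the recorded access.
    */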
@@ -2010,13 +2013,17 @@ fs_visitor::assign_constant_locations()
    memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
 
    int chunk_start = -1;
+   unsigned max_chunk_bitsize = 0;
 
    /* First push 64-bit uniforms to ensure they are properly aligned */
+   const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || !is_live_64bit[u])
+      if (!is_live[u])
          continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_64_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
@@ -2025,15 +2032,18 @@ fs_visitor::assign_constant_locations()
    }
 
    /* Then push the rest of the uniforms */
+   const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || is_live_64bit[u])
+      if (!is_live[u])
          continue;
 
       /* Skip thread_local_id_index to put it in the last push register. */
       if (thread_local_id_index == (int)u)
          continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_32_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
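The two loops differ only in their target bit size: type_sz(BRW_REGISTER_TYPE_DF) is 8 and type_sz(BRW_REGISTER_TYPE_F) is 4. Running the 64-bit pass first keeps doubles naturally aligned in the push buffer; an illustrative layout:

   /* Illustrative push layout after both passes (indices are 32-bit slots):
    *
    *    pass 1: dvec2 uniform -> push_constant_loc = { 0, 1, 2, 3 }
    *    pass 2: float uniform -> push_constant_loc = { 4 }
    *
    * Assigning the float in between would leave the double's halves
    * straddling a misaligned offset; handling all 8-byte chunks first
    * rules that out.
    */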
@@ -2111,25 +2121,22 @@ fs_visitor::lower_constant_loads()
          if (pull_index == -1)
            continue;
 
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst;
-
-         if (type_sz(inst->src[i].type) <= 4)
-            dst = vgrf(glsl_type::float_type);
-         else
-            dst = vgrf(glsl_type::double_type);
-
          assert(inst->src[i].stride == 0);
 
-         const fs_builder ubld = ibld.exec_all().group(4, 0);
-         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         const unsigned base = pull_index * 4;
+
          ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                   dst, brw_imm_ud(index), offset);
+                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
 
          /* Rewrite the instruction to use the temporary VGRF. */
          inst->src[i].file = VGRF;
          inst->src[i].nr = dst.nr;
-         inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4;
+         inst->src[i].offset = (base & (block_sz - 1)) +
+                               inst->src[i].offset % 4;
 
          brw_mark_surface_used(prog_data, index);
       }
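The rewritten load fetches a whole cacheline per pull: the builder runs block_sz / 4 == 16 UD channels, the send takes a block-aligned byte offset, and the residue moves into the source's register offset. The address arithmetic with assumed numbers:

   /* Assumed numbers for illustration: pull_index == 5, i.e. bytes 20..23
    * of the constant buffer.
    *
    *    base                = 5 * 4    = 20
    *    send offset         = 20 & ~63 = 0     (fetch bytes 0..63)
    *    inst->src[i].offset = (20 & 63) + old_offset % 4
    *                        = 20 + sub-dword residue
    *
    * The instruction then reads its value 20 bytes into the fetched VGRF,
    * wherever in the cacheline the constant happens to live.
    */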
@@ -5941,15 +5948,15 @@ fs_visitor::run_tcs_single_patch()
    }
 
    /* Fix the dispatch mask */
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
       bld.CMP(bld.null_reg_ud(), invocation_id,
-              brw_imm_ud(nir->info->tcs.vertices_out), BRW_CONDITIONAL_L);
+              brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
       bld.IF(BRW_PREDICATE_NORMAL);
    }
 
    emit_nir_code();
 
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
       bld.emit(BRW_OPCODE_ENDIF);
    }
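Only the info field moved (tcs.vertices_out becomes tess.tcs_vertices_out); the masking itself is unchanged. For reference, with an assumed tcs_vertices_out of 3, a SIMD8 single-patch dispatch still enables eight channels, so the CMP/IF pair predicates the extra ones off around the body (pseudo-assembly, illustrative only):

   /* tcs_vertices_out == 3:
    *
    *    cmp.l  null  invocation_id  3    (flag set for channels 0..2)
    *    if                               (body runs on channels 0..2 only)
    *       ...emit_nir_code()...
    *    endif
    */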
 
@@ -6398,15 +6405,21 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str)
 {
+   const struct gen_device_info *devinfo = compiler->devinfo;
+
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
-   brw_nir_lower_fs_inputs(shader, vue_map, prog, compiler->devinfo, key);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_fs_inputs(shader, devinfo, key);
    brw_nir_lower_fs_outputs(shader);
+
+   if (devinfo->gen < 6) {
+      brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
+   }
+
    if (!key->multisample_fbo)
       NIR_PASS_V(shader, demote_sample_qualifiers);
    NIR_PASS_V(shader, move_interpolation_to_top);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    /* key->alpha_test_func means simulating alpha testing via discards,
     * so the shader definitely kills pixels.
@@ -6631,8 +6644,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
    brw_nir_lower_cs_shared(shader);
    prog_data->base.total_shared += shader->num_shared;
 
@@ -6645,7 +6657,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
            (unsigned)4 * (prog_data->thread_local_id_index + 1));
 
    brw_nir_lower_intrinsics(shader, &prog_data->base);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    prog_data->local_size[0] = shader->info->cs.local_size[0];
    prog_data->local_size[1] = shader->info->cs.local_size[1];