i965/fs: detect different bit size accesses to uniforms to push them in proper locations
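
Previously, assign_constant_locations() only flagged whether a uniform
slot was ever read as a 64-bit value (is_live_64bit), so a chunk of
contiguous slots accessed with mixed bit sizes could end up split
between the 64-bit and 32-bit layout passes. Instead, track the largest
access size seen for each slot (bitsize_access) and for the chunk being
accumulated (max_chunk_bitsize), and hand each pass's target size to
set_push_pull_constant_loc(): a finished chunk is only assigned in the
pass whose target matches its maximum access size, and is otherwise
left for the other pass. 64-bit chunks are still laid out first so they
land on properly aligned push constant locations. Note that type_sz()
returns bytes, so the tracked values are 4 and 8 despite the "bitsize"
naming.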
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f839d368f3c8a0185566a6d3928d0cdd2cb9a222..68e73cc5cd801a672bbb73e884444c5f9e6556d6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -462,6 +462,8 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_BOOL:
       return type->components();
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
       return type->components() * 2;
    case GLSL_TYPE_ARRAY:
       return type_size_scalar(type->fields.array) * type->length;
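
A quick standalone illustration of the slot accounting above (a sketch
only, not driver code): scalar uniform slots are 32 bits wide, so the
newly handled 64-bit integer types take two slots per component, just
like doubles.

    #include <cassert>

    /* Sketch of the type_size_scalar() rule: 64-bit base types (double,
     * int64_t, uint64_t) occupy two 32-bit uniform slots per component.
     */
    enum class BaseType { Float, Double, Int64, Uint64 };

    static unsigned type_size_scalar_sketch(BaseType t, unsigned components)
    {
       switch (t) {
       case BaseType::Float:
          return components;     /* one slot per component */
       case BaseType::Double:
       case BaseType::Int64:
       case BaseType::Uint64:
          return components * 2; /* two slots per component */
       }
       return 0;
    }

    int main()
    {
       assert(type_size_scalar_sketch(BaseType::Double, 3) == 6); /* dvec3 */
       assert(type_size_scalar_sketch(BaseType::Int64, 2) == 4);  /* i64vec2 */
    }
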
@@ -492,19 +494,6 @@ type_size_scalar(const struct glsl_type *type)
    return 0;
 }
 
-/* Attribute arrays are loaded as one vec4 per element (or matrix column),
- * except for double-precision types, which are loaded as one dvec4.
- */
-extern "C" int
-type_size_vs_input(const struct glsl_type *type)
-{
-   if (type->is_double()) {
-      return type_size_dvec4(type);
-   } else {
-      return type_size_vec4(type);
-   }
-}
-
 /**
  * Create a MOV to read the timestamp register.
  *
@@ -730,7 +719,7 @@ fs_inst::components_read(unsigned i) const
                opcode == SHADER_OPCODE_TXD_LOGICAL)
          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
       /* Texture offset. */
-      else if (i == TEX_LOGICAL_SRC_OFFSET_VALUE)
+      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
          return 2;
       /* MCS */
       else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
@@ -1864,7 +1853,10 @@ fs_visitor::compact_virtual_grfs()
 }
 
 static void
-set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
+                           unsigned *max_chunk_bitsize,
+                           bool contiguous, unsigned bitsize,
+                           const unsigned target_bitsize,
                            int *push_constant_loc, int *pull_constant_loc,
                            unsigned *num_push_constants,
                            unsigned *num_pull_constants,
@@ -1876,11 +1868,23 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
    if (*chunk_start < 0)
       *chunk_start = uniform;
 
+   /* Keep track of the maximum bit size access in contiguous uniforms */
+   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
+
    /* If this element does not need to be contiguous with the next, we
     * split at this point and everything between chunk_start and u forms a
     * single chunk.
     */
    if (!contiguous) {
+      /* If the chunk's max bitsize doesn't match the target one, skip it */
+      if (*max_chunk_bitsize != target_bitsize) {
+         /* FIXME: right now we only support 32 and 64-bit accesses */
+         assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
+         *max_chunk_bitsize = 0;
+         *chunk_start = -1;
+         return;
+      }
+
       unsigned chunk_size = uniform - *chunk_start + 1;
 
       /* Decide whether we should push or pull this parameter.  In the
@@ -1898,6 +1902,7 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
             pull_constant_loc[j] = (*num_pull_constants)++;
       }
 
+      *max_chunk_bitsize = 0;
       *chunk_start = -1;
    }
 }
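
A standalone sketch of how the two layout passes interact with the new
chunk tracking (slot access sizes and contiguity flags below are made
up, and MAX2 becomes std::max):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
       /* Largest access size in bytes seen per uniform slot, as recorded
        * by bitsize_access[], and whether each slot must stay contiguous
        * with the next one. All values are illustrative.
        */
       const unsigned access[6] = {8, 4, 4, 4, 8, 4};
       const bool contiguous[6] = {true, false, true, false, false, false};
       const unsigned targets[] = {8, 4}; /* 64-bit pass, then 32-bit pass */

       for (unsigned target : targets) {
          int chunk_start = -1;
          unsigned max_chunk_bitsize = 0;
          for (unsigned u = 0; u < 6; u++) {
             if (chunk_start < 0)
                chunk_start = u;
             max_chunk_bitsize = std::max(max_chunk_bitsize, access[u]);
             if (!contiguous[u]) {
                /* Chunk ends here: assign it only in the matching pass. */
                if (max_chunk_bitsize == target)
                   printf("pass %u: slots %d..%u\n", target, chunk_start, u);
                max_chunk_bitsize = 0;
                chunk_start = -1;
             }
          }
       }
       /* Prints: pass 8: slots 0..1 and 4..4, then pass 4: slots 2..3 and
        * 5..5 -- the mixed chunk 0..1 is pushed whole in the 64-bit pass
        * rather than being split between the two passes.
        */
    }
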
@@ -1920,8 +1925,8 @@ fs_visitor::assign_constant_locations()
 
    bool is_live[uniforms];
    memset(is_live, 0, sizeof(is_live));
-   bool is_live_64bit[uniforms];
-   memset(is_live_64bit, 0, sizeof(is_live_64bit));
+   unsigned bitsize_access[uniforms];
+   memset(bitsize_access, 0, sizeof(bitsize_access));
 
    /* For each uniform slot, a value of true indicates that the given slot and
     * the next slot must remain contiguous.  This is used to keep us from
@@ -1958,20 +1963,18 @@ fs_visitor::assign_constant_locations()
             for (unsigned j = constant_nr; j < last; j++) {
                is_live[j] = true;
                contiguous[j] = true;
-               if (type_sz(inst->src[i].type) == 8) {
-                  is_live_64bit[j] = true;
-               }
+               bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type));
             }
             is_live[last] = true;
+            bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type));
          } else {
             if (constant_nr >= 0 && constant_nr < (int) uniforms) {
                int regs_read = inst->components_read(i) *
                   type_sz(inst->src[i].type) / 4;
                for (int j = 0; j < regs_read; j++) {
                   is_live[constant_nr + j] = true;
-                  if (type_sz(inst->src[i].type) == 8) {
-                     is_live_64bit[constant_nr + j] = true;
-                  }
+                  bitsize_access[constant_nr + j] =
+                     MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type));
                }
             }
          }
@@ -2010,13 +2013,17 @@ fs_visitor::assign_constant_locations()
    memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
 
    int chunk_start = -1;
+   unsigned max_chunk_bitsize = 0;
 
    /* First push 64-bit uniforms to ensure they are properly aligned */
+   const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || !is_live_64bit[u])
+      if (!is_live[u])
          continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_64_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
@@ -2025,15 +2032,18 @@ fs_visitor::assign_constant_locations()
    }
 
    /* Then push the rest of the uniforms */
+   const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || is_live_64bit[u])
+      if (!is_live[u])
          continue;
 
       /* Skip thread_local_id_index to put it in the last push register. */
       if (thread_local_id_index == (int)u)
          continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_32_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
@@ -2111,25 +2121,22 @@ fs_visitor::lower_constant_loads()
          if (pull_index == -1)
            continue;
 
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst;
-
-         if (type_sz(inst->src[i].type) <= 4)
-            dst = vgrf(glsl_type::float_type);
-         else
-            dst = vgrf(glsl_type::double_type);
-
          assert(inst->src[i].stride == 0);
 
-         const fs_builder ubld = ibld.exec_all().group(8, 0);
-         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         const unsigned base = pull_index * 4;
+
          ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                   dst, brw_imm_ud(index), offset);
+                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
 
          /* Rewrite the instruction to use the temporary VGRF. */
          inst->src[i].file = VGRF;
          inst->src[i].nr = dst.nr;
-         inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4;
+         inst->src[i].offset = (base & (block_sz - 1)) +
+                               inst->src[i].offset % 4;
 
          brw_mark_surface_used(prog_data, index);
       }
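
The arithmetic behind the rewritten load, worked through with a
hypothetical pull_index: one 64-byte cacheline is fetched per load, and
the instruction is repointed at the residual offset inside the
temporary VGRF.

    #include <cassert>

    int main()
    {
       const unsigned block_sz = 64;         /* bytes fetched per load */
       const unsigned pull_index = 19;       /* hypothetical slot */
       const unsigned base = pull_index * 4; /* byte address: 76 */

       /* Offset sent with the message: aligned down to the cacheline. */
       assert((base & ~(block_sz - 1)) == 64);
       /* Offset at which the rewritten source reads the temporary. */
       assert((base & (block_sz - 1)) == 12);
    }
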
@@ -3202,44 +3209,18 @@ fs_visitor::lower_uniform_pull_constant_loads()
          continue;
 
       if (devinfo->gen >= 7) {
-         /* The offset arg is a vec4-aligned immediate byte offset. */
-         fs_reg const_offset_reg = inst->src[1];
-         assert(const_offset_reg.file == IMM &&
-                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
-         assert(const_offset_reg.ud % 16 == 0);
-
-         fs_reg payload, offset;
-         if (devinfo->gen >= 9) {
-            /* We have to use a message header on Skylake to get SIMD4x2
-             * mode.  Reserve space for the register.
-            */
-            offset = payload = fs_reg(VGRF, alloc.allocate(2));
-            offset.offset += REG_SIZE;
-            inst->mlen = 2;
-         } else {
-            offset = payload = fs_reg(VGRF, alloc.allocate(1));
-            inst->mlen = 1;
-         }
-
-         /* This is actually going to be a MOV, but since only the first dword
-          * is accessed, we have a special opcode to do just that one.  Note
-          * that this needs to be an operation that will be considered a def
-          * by live variable analysis, or register allocation will explode.
-          */
-         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
-                                               8, offset, const_offset_reg);
-         setup->force_writemask_all = true;
+         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
+         const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
 
-         setup->ir = inst->ir;
-         setup->annotation = inst->annotation;
-         inst->insert_before(block, setup);
+         ubld.group(8, 0).MOV(payload,
+                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         ubld.group(1, 0).MOV(component(payload, 2),
+                              brw_imm_ud(inst->src[1].ud / 16));
 
-         /* Similarly, this will only populate the first 4 channels of the
-          * result register (since we only use smear values from 0-3), but we
-          * don't tell the optimizer.
-          */
          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
          inst->src[1] = payload;
+         inst->header_size = 1;
+         inst->mlen = 1;
 
          invalidate_live_intervals();
       } else {
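
In place of FS_OPCODE_SET_SIMD4X2_OFFSET, the gen7+ send above now
carries a real one-register message header: g0 is copied in, dword 2 is
overwritten with the constant offset in 16-byte units, and header_size
and mlen are set to 1. A minimal sketch of just that encoding (the byte
offset is illustrative):

    #include <cassert>
    #include <cstdint>

    int main()
    {
       const uint32_t byte_offset = 48; /* vec4-aligned, illustrative */
       assert(byte_offset % 16 == 0);

       /* What the group(1, 0) MOV writes into component 2 of the header. */
       const uint32_t header_dw2 = byte_offset / 16;
       assert(header_dw2 == 3);
    }
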
@@ -3673,6 +3654,12 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
        */
       setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
       length++;
+   } else if (key->replicate_alpha && inst->target != 0) {
+      /* Handle the case when the fragment shader doesn't write to draw
+       * buffer zero. No need to call setup_color_payload() for src0_alpha
+       * because the alpha value will be undefined.
+       */
+      length++;
    }
 
    setup_color_payload(bld, key, &sources[length], color0, components);
@@ -3827,8 +3814,8 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
    }
 
    if (has_lod) {
-      /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
-       * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
+       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
        */
       assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
              bld.dispatch_width() == 16);
@@ -3871,7 +3858,6 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &sample_index,
                                 const fs_reg &surface,
                                 const fs_reg &sampler,
-                                const fs_reg &offset_value,
                                 unsigned coord_components,
                                 unsigned grad_components)
 {
@@ -3879,7 +3865,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
    fs_reg msg_coords = message;
    unsigned header_size = 0;
 
-   if (offset_value.file != BAD_FILE) {
+   if (inst->offset != 0) {
       /* The offsets set up by the visitor are in the m1 header, so we can't
        * go headerless.
        */
@@ -3979,7 +3965,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &mcs,
                                 const fs_reg &surface,
                                 const fs_reg &sampler,
-                                const fs_reg &offset_value,
+                                const fs_reg &tg4_offset,
                                 unsigned coord_components,
                                 unsigned grad_components)
 {
@@ -3991,7 +3977,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
       sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
 
    if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
-       offset_value.file != BAD_FILE || inst->eot ||
+       inst->offset != 0 || inst->eot ||
        op == SHADER_OPCODE_SAMPLEINFO ||
        is_high_sampler(devinfo, sampler)) {
       /* For general texture offsets (no txf workaround), we need a header to
@@ -4136,7 +4122,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
 
       for (unsigned i = 0; i < 2; i++) /* offu, offv */
          bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
-                 offset(offset_value, bld, i));
+                 offset(tg4_offset, bld, i));
 
       if (coord_components == 3) /* r if present */
          bld.MOV(sources[length++], offset(coordinate, bld, 2));
@@ -4188,7 +4174,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
    const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
    const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
-   const fs_reg &offset_value = inst->src[TEX_LOGICAL_SRC_OFFSET_VALUE];
+   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
    assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
@@ -4197,12 +4183,12 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    if (devinfo->gen >= 7) {
       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      mcs, surface, sampler, offset_value,
+                                      mcs, surface, sampler, tg4_offset,
                                       coord_components, grad_components);
    } else if (devinfo->gen >= 5) {
       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      surface, sampler, offset_value,
+                                      surface, sampler,
                                       coord_components, grad_components);
    } else {
       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
@@ -4671,7 +4657,7 @@ get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
       inst->components_read(TEX_LOGICAL_SRC_LOD2) +
       inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
       (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
-       inst->components_read(TEX_LOGICAL_SRC_OFFSET_VALUE) : 0) +
+       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
       inst->components_read(TEX_LOGICAL_SRC_MCS);
 
    /* SIMD16 messages with more than five arguments exceed the maximum message
@@ -5686,7 +5672,7 @@ fs_visitor::optimize()
 
       OPT(opt_algebraic);
       OPT(opt_cse);
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       OPT(opt_predicated_break, this);
       OPT(opt_cmod_propagation);
       OPT(dead_code_eliminate);
@@ -5710,7 +5696,7 @@ fs_visitor::optimize()
    }
 
    if (OPT(lower_d2x)) {
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       OPT(dead_code_eliminate);
    }
 
@@ -5722,12 +5708,12 @@ fs_visitor::optimize()
    OPT(lower_logical_sends);
 
    if (progress) {
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       /* Only run after logical send lowering because it's easier to implement
        * in terms of physical sends.
        */
       if (OPT(opt_zero_samples))
-         OPT(opt_copy_propagate);
+         OPT(opt_copy_propagation);
       /* Run after logical send lowering to give it a chance to CSE the
        * LOAD_PAYLOAD instructions created to construct the payloads of
        * e.g. texturing messages in cases where it wasn't possible to CSE the
@@ -5756,7 +5742,7 @@ fs_visitor::optimize()
    if (devinfo->gen <= 5 && OPT(lower_minmax)) {
       OPT(opt_cmod_propagation);
       OPT(opt_cse);
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       OPT(dead_code_eliminate);
    }
 
@@ -5962,15 +5948,15 @@ fs_visitor::run_tcs_single_patch()
    }
 
    /* Fix the dispatch mask */
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
       bld.CMP(bld.null_reg_ud(), invocation_id,
-              brw_imm_ud(nir->info->tcs.vertices_out), BRW_CONDITIONAL_L);
+              brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
       bld.IF(BRW_PREDICATE_NORMAL);
    }
 
    emit_nir_code();
 
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
       bld.emit(BRW_OPCODE_ENDIF);
    }
 
@@ -6419,15 +6405,21 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str)
 {
+   const struct gen_device_info *devinfo = compiler->devinfo;
+
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
-   brw_nir_lower_fs_inputs(shader, vue_map, prog, compiler->devinfo, key);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_fs_inputs(shader, devinfo, key);
    brw_nir_lower_fs_outputs(shader);
+
+   if (devinfo->gen < 6) {
+      brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
+   }
+
    if (!key->multisample_fbo)
       NIR_PASS_V(shader, demote_sample_qualifiers);
    NIR_PASS_V(shader, move_interpolation_to_top);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    /* key->alpha_test_func means simulating alpha testing via discards,
     * so the shader definitely kills pixels.
@@ -6449,6 +6441,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
        shader->info->outputs_read);
 
    prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info->fs.inner_coverage;
 
    prog_data->barycentric_interp_modes =
       brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
@@ -6650,8 +6644,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
    brw_nir_lower_cs_shared(shader);
    prog_data->base.total_shared += shader->num_shared;
 
@@ -6664,7 +6657,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
            (unsigned)4 * (prog_data->thread_local_id_index + 1));
 
    brw_nir_lower_intrinsics(shader, &prog_data->base);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    prog_data->local_size[0] = shader->info->cs.local_size[0];
    prog_data->local_size[1] = shader->info->cs.local_size[1];