i965/miptree: Add real support for HiZ

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 4aae57422783dded87c71d949e8b19b135b5ed3e..b89c6721ea00232e8cde40a1faf19bad16295530 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -253,7 +253,6 @@ fs_inst::is_send_from_grf() const
     switch (opcode) {
     case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
     case SHADER_OPCODE_SHADER_TIME_ADD:
-   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
     case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
     case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
     case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
@@ -510,6 +509,19 @@ type_size_scalar(const struct glsl_type *type)
     return 0;
  }
  
+/**
+ * Component count of \p type when every vector is padded out to a full vec4:
+ * type_size_vec4() packing rules, type_size_scalar()-style component units.
+ */
+extern "C" int
+type_size_vec4_times_4(const struct glsl_type *type)
+{
+   const int components_per_vec4 = 4;
+   const int vec4_slots = type_size_vec4(type);
+
+   return vec4_slots * components_per_vec4;
+}
+
  /* Attribute arrays are loaded as one vec4 per element (or matrix column),
   * except for double-precision types, which are loaded as one dvec4.
   */
@@ -703,6 +715,10 @@ fs_inst::is_partial_write() const
  unsigned
  fs_inst::components_read(unsigned i) const
  {
+   /* Return zero if the source is not present. */
+   if (src[i].file == BAD_FILE)
+      return 0;
+
     switch (opcode) {
     case FS_OPCODE_LINTERP:
        if (i == 0)
@@ -883,11 +899,10 @@ fs_inst::regs_read(int arg) const
     }
  
     switch (src[arg].file) {
-   case BAD_FILE:
-      return 0;
     case UNIFORM:
     case IMM:
        return 1;
+   case BAD_FILE:
     case ARF:
     case FIXED_GRF:
     case VGRF:
@@ -4079,16 +4094,6 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
  
     bool coordinate_done = false;
  
-   /* The sampler can only meaningfully compute LOD for fragment shader
-    * messages. For all other stages, we change the opcode to TXL and
-    * hardcode the LOD to 0.
-    */
-   if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
-       op == SHADER_OPCODE_TEX) {
-      op = SHADER_OPCODE_TXL;
-      lod = brw_imm_f(0.0f);
-   }
-
     /* Set up the LOD info */
     switch (op) {
     case FS_OPCODE_TXB:
@@ -4193,9 +4198,6 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
        coordinate_done = true;
        break;
     case SHADER_OPCODE_TG4_OFFSET:
-      /* gather4_po_c should have been lowered in SIMD16 mode. */
-      assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
-
        /* More crazy intermixing */
        for (unsigned i = 0; i < 2; i++) /* u, v */
           bld.MOV(sources[length++], offset(coordinate, bld, i));
@@ -4681,6 +4683,67 @@ get_fpu_lowered_simd_width(const struct brw_device_info *devinfo,
     return 1 << _mesa_logbase2(max_width);
  }
  
+/**
+ * Get the maximum allowed SIMD width for the logical sampler instruction
+ * \p inst, accounting for the payload size restrictions that apply to
+ * sampler messages on the hardware described by \p devinfo.
+ *
+ * This is only a theoretical upper bound derived from the number of argument
+ * components, which in most cases decides whether the SIMD8 or SIMD16
+ * variant of the message can be used.  Some messages have additional
+ * restrictions not accounted for here (e.g. pre-ILK hardware uses the
+ * message length to determine the exact SIMD width and argument count, which
+ * makes a number of sampler message combinations impossible to represent).
+ *
+ * \return the smaller of inst->exec_size and the 8- or 16-wide payload limit.
+ */
+static unsigned
+get_sampler_lowered_simd_width(const struct brw_device_info *devinfo,
+                               const fs_inst *inst)
+{
+   /* Calculate the number of coordinate components that have to be present
+    * assuming that additional arguments follow the texel coordinates in the
+    * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
+    * need to pad to four or three components depending on the message,
+    * pre-ILK we need to pad to at most three components.
+    */
+   const unsigned req_coord_components =
+      (devinfo->gen >= 7 ||
+       !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
+      (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
+                            inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
+      3;
+
+   /* On Gen9+ the LOD argument is for free if we're able to use the LZ
+    * variant of the TXL or TXF message.  The callers dispatch on the
+    * *_LOGICAL opcodes, so those are the ones that have to be matched here;
+    * comparing against the lowered TXL/TXF opcodes would never succeed.
+    */
+   const bool implicit_lod = devinfo->gen >= 9 &&
+                             (inst->opcode == SHADER_OPCODE_TXL_LOGICAL ||
+                              inst->opcode == SHADER_OPCODE_TXF_LOGICAL) &&
+                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
+
+   /* Total number of argument components passed to the sampler unit. */
+   const unsigned num_payload_components =
+      MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
+           req_coord_components) +
+      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
+      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
+      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
+      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
+      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
+       inst->components_read(TEX_LOGICAL_SRC_OFFSET_VALUE) : 0) +
+      inst->components_read(TEX_LOGICAL_SRC_MCS);
+
+   /* SIMD16 messages with more than five arguments exceed the maximum message
+    * size supported by the sampler, regardless of whether a header is
+    * provided or not.
+    */
+   return MIN2(inst->exec_size,
+               num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
+}
+
  /**
   * Get the closest native SIMD width supported by the hardware for instruction
   * \p inst.  The instruction will be left untouched by
@@ -4788,7 +4851,6 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
     case FS_OPCODE_PACK_HALF_2x16_SPLIT:
     case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
     case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
-   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
     case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
     case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
     case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
@@ -4857,31 +4919,24 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
     case SHADER_OPCODE_LOD_LOGICAL:
     case SHADER_OPCODE_TG4_LOGICAL:
     case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
-      return MIN2(16, inst->exec_size);
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      return get_sampler_lowered_simd_width(devinfo, inst);
  
     case SHADER_OPCODE_TXD_LOGICAL:
        /* TXD is unsupported in SIMD16 mode. */
        return 8;
  
-   case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
-      /* gather4_po_c is unsupported in SIMD16 mode. */
-      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
-      return (shadow_c.file != BAD_FILE ? 8 : MIN2(16, inst->exec_size));
-   }
     case SHADER_OPCODE_TXL_LOGICAL:
-   case FS_OPCODE_TXB_LOGICAL: {
-      /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
-       * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
-       * mode because the message exceeds the maximum length of 11.
+   case FS_OPCODE_TXB_LOGICAL:
+      /* Only one execution size is representable pre-ILK depending on whether
+       * the shadow reference argument is present.
         */
-      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
-      if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
-         return 16;
-      else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
-         return 8;
+      if (devinfo->gen == 4)
+         return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
        else
-         return MIN2(16, inst->exec_size);
-   }
+         return get_sampler_lowered_simd_width(devinfo, inst);
+
     case SHADER_OPCODE_TXF_LOGICAL:
     case SHADER_OPCODE_TXS_LOGICAL:
        /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
@@ -4890,23 +4945,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
        if (devinfo->gen == 4)
           return 16;
        else
-         return MIN2(16, inst->exec_size);
-
-   case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
-      /* This opcode can take up to 6 arguments which means that in some
-       * circumstances it can end up with a message that is too long in SIMD16
-       * mode.
-       */
-      const unsigned coord_components =
-         inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
-      /* First three arguments are the sample index and the two arguments for
-       * the MCS data.
-       */
-      if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
-         return 8;
-      else
-         return MIN2(16, inst->exec_size);
-   }
+         return get_sampler_lowered_simd_width(devinfo, inst);
  
     case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
     case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
@@ -5570,7 +5609,7 @@ fs_visitor::setup_gs_payload()
      * have to multiply by VerticesIn to obtain the total storage requirement.
      */
     if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
-       max_push_components) {
+       max_push_components || gs_prog_data->invocations > 1) {
        gs_prog_data->base.include_vue_handles = true;
  
        /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
@@ -6342,38 +6381,46 @@ move_interpolation_to_top(nir_shader *nir)
           continue;
  
        nir_block *top = nir_start_block(f->impl);
+      exec_node *cursor_node = NULL;
  
        nir_foreach_block(block, f->impl) {
           if (block == top)
              continue;
  
-         nir_foreach_instr_reverse_safe(instr, block) {
+         nir_foreach_instr_safe(instr, block) {
              if (instr->type != nir_instr_type_intrinsic)
                 continue;
  
              nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-            switch (intrin->intrinsic) {
-            case nir_intrinsic_load_barycentric_pixel:
-            case nir_intrinsic_load_barycentric_centroid:
-            case nir_intrinsic_load_barycentric_sample:
-               break;
-            case nir_intrinsic_load_interpolated_input: {
-               nir_intrinsic_instr *bary_intrinsic =
-                  nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
-               nir_intrinsic_op op = bary_intrinsic->intrinsic;
-
-               /* Leave interpolateAtSample/Offset() where it is. */
-               if (op == nir_intrinsic_load_barycentric_at_sample ||
-                   op == nir_intrinsic_load_barycentric_at_offset)
-                  continue;
-            }
-            default:
+            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
+               continue;
+            nir_intrinsic_instr *bary_intrinsic =
+               nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
+            nir_intrinsic_op op = bary_intrinsic->intrinsic;
+
+            /* Leave interpolateAtSample/Offset() where they are. */
+            if (op == nir_intrinsic_load_barycentric_at_sample ||
+                op == nir_intrinsic_load_barycentric_at_offset)
                 continue;
-            }
  
-            exec_node_remove(&instr->node);
-            exec_list_push_head(&top->instr_list, &instr->node);
-            instr->block = top;
+            nir_instr *move[3] = {
+               &bary_intrinsic->instr,
+               intrin->src[1].ssa->parent_instr,
+               instr
+            };
+
+            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
+               if (move[i]->block != top) {
+                  move[i]->block = top;
+                  exec_node_remove(&move[i]->node);
+                  if (cursor_node) {
+                     exec_node_insert_after(cursor_node, &move[i]->node);
+                  } else {
+                     exec_list_push_head(&top->instr_list, &move[i]->node);
+                  }
+                  cursor_node = &move[i]->node;
+               }
+            }
           }
        }
        nir_metadata_preserve(f->impl, (nir_metadata)