i965/miptree: Add real support for HiZ

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index fc91bbcfa46a078745c685cfed86204a9fbe5d9b..b89c6721ea00232e8cde40a1faf19bad16295530 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -253,7 +253,6 @@ fs_inst::is_send_from_grf() const
     switch (opcode) {
     case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
     case SHADER_OPCODE_SHADER_TIME_ADD:
-   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
     case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
     case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
     case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
@@ -510,6 +509,19 @@ type_size_scalar(const struct glsl_type *type)
     return 0;
  }
  
+/**
+ * Returns the number of scalar components needed to store \p type, assuming
+ * that vectors are padded out to vec4.
+ *
+ * This has the packing rules of type_size_vec4(), but counts components
+ * similar to type_size_scalar(): the result is 4 * type_size_vec4(type).
+ */
+extern "C" int
+type_size_vec4_times_4(const struct glsl_type *type)
+{
+   return 4 * type_size_vec4(type);
+}
+
  /* Attribute arrays are loaded as one vec4 per element (or matrix column),
   * except for double-precision types, which are loaded as one dvec4.
   */
@@ -703,6 +715,10 @@ fs_inst::is_partial_write() const
  unsigned
  fs_inst::components_read(unsigned i) const
  {
+   /* Return zero if the source is not present. */
+   if (src[i].file == BAD_FILE)
+      return 0;
+
     switch (opcode) {
     case FS_OPCODE_LINTERP:
        if (i == 0)
@@ -883,11 +899,10 @@ fs_inst::regs_read(int arg) const
     }
  
     switch (src[arg].file) {
-   case BAD_FILE:
-      return 0;
     case UNIFORM:
     case IMM:
        return 1;
+   case BAD_FILE:
     case ARF:
     case FIXED_GRF:
     case VGRF:
@@ -1070,21 +1085,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
     bld.MOV(wpos, this->wpos_w);
  }
  
-static enum brw_barycentric_mode
-barycentric_mode(enum glsl_interp_mode mode,
-                 bool is_centroid, bool is_sample)
+enum brw_barycentric_mode
+brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
  {
-   unsigned bary;
-
     /* Barycentric modes don't make sense for flat inputs. */
     assert(mode != INTERP_MODE_FLAT);
  
-   if (is_sample) {
-      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
-   } else if (is_centroid) {
-      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
-   } else {
+   unsigned bary;
+   switch (op) {
+   case nir_intrinsic_load_barycentric_pixel:
+   case nir_intrinsic_load_barycentric_at_offset:
        bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
+      break;
+   case nir_intrinsic_load_barycentric_centroid:
+      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
+      break;
+   case nir_intrinsic_load_barycentric_sample:
+   case nir_intrinsic_load_barycentric_at_sample:
+      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
+      break;
+   default:
+      unreachable("invalid intrinsic");
     }
  
     if (mode == INTERP_MODE_NOPERSPECTIVE)
@@ -1104,107 +1125,6 @@ centroid_to_pixel(enum brw_barycentric_mode bary)
     return (enum brw_barycentric_mode) ((unsigned) bary - 1);
  }
  
-void
-fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
-                                       const glsl_type *type,
-                                       glsl_interp_mode interpolation_mode,
-                                       int *location, bool mod_centroid,
-                                       bool mod_sample)
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-
-   if (type->is_array() || type->is_matrix()) {
-      const glsl_type *elem_type = glsl_get_array_element(type);
-      const unsigned length = glsl_get_length(type);
-
-      for (unsigned i = 0; i < length; i++) {
-         emit_general_interpolation(attr, name, elem_type, interpolation_mode,
-                                    location, mod_centroid, mod_sample);
-      }
-   } else if (type->is_record()) {
-      for (unsigned i = 0; i < type->length; i++) {
-         const glsl_type *field_type = type->fields.structure[i].type;
-         emit_general_interpolation(attr, name, field_type, interpolation_mode,
-                                    location, mod_centroid, mod_sample);
-      }
-   } else {
-      assert(type->is_scalar() || type->is_vector());
-
-      if (prog_data->urb_setup[*location] == -1) {
-         /* If there's no incoming setup data for this slot, don't
-          * emit interpolation for it.
-          */
-         *attr = offset(*attr, bld, type->vector_elements);
-         (*location)++;
-         return;
-      }
-
-      attr->type = brw_type_for_base_type(type->get_scalar_type());
-
-      if (interpolation_mode == INTERP_MODE_FLAT) {
-         /* Constant interpolation (flat shading) case. The SF has
-          * handed us defined values in only the constant offset
-          * field of the setup reg.
-          */
-         unsigned vector_elements = type->vector_elements;
-
-         /* Data starts at suboffet 3 in 32-bit units (12 bytes), so it is not
-          * 64-bit aligned and the current implementation fails to read the
-          * data properly. Instead, when there is a double input varying,
-          * read it as vector of floats with twice the number of components.
-          */
-         if (attr->type == BRW_REGISTER_TYPE_DF) {
-            vector_elements *= 2;
-            attr->type = BRW_REGISTER_TYPE_F;
-         }
-         for (unsigned int i = 0; i < vector_elements; i++) {
-            struct brw_reg interp = interp_reg(*location, i);
-            interp = suboffset(interp, 3);
-            interp.type = attr->type;
-            bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
-            *attr = offset(*attr, bld, 1);
-         }
-      } else {
-         /* Smooth/noperspective interpolation case. */
-         enum brw_barycentric_mode bary =
-            barycentric_mode(interpolation_mode, mod_centroid, mod_sample);
-
-         for (unsigned int i = 0; i < type->vector_elements; i++) {
-            fs_reg interp(interp_reg(*location, i));
-            if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
-               /* Get the pixel/sample mask into f0 so that we know
-                * which pixels are lit.  Then, for each channel that is
-                * unlit, replace the centroid data with non-centroid
-                * data.
-                */
-               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
-
-               fs_inst *inst;
-               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
-                               delta_xy[centroid_to_pixel(bary)], interp);
-               inst->predicate = BRW_PREDICATE_NORMAL;
-               inst->predicate_inverse = true;
-               inst->no_dd_clear = true;
-
-               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
-                               delta_xy[bary], interp);
-               inst->predicate = BRW_PREDICATE_NORMAL;
-               inst->predicate_inverse = false;
-               inst->no_dd_check = true;
-            } else {
-               bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], interp);
-            }
-            if (devinfo->gen < 6 && interpolation_mode == INTERP_MODE_SMOOTH) {
-               bld.MUL(*attr, *attr, this->pixel_w);
-            }
-            *attr = offset(*attr, bld, 1);
-         }
-      }
-      (*location)++;
-   }
-}
-
  fs_reg *
  fs_visitor::emit_frontfacing_interpolation()
  {
@@ -4174,16 +4094,6 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
  
     bool coordinate_done = false;
  
-   /* The sampler can only meaningfully compute LOD for fragment shader
-    * messages. For all other stages, we change the opcode to TXL and
-    * hardcode the LOD to 0.
-    */
-   if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
-       op == SHADER_OPCODE_TEX) {
-      op = SHADER_OPCODE_TXL;
-      lod = brw_imm_f(0.0f);
-   }
-
     /* Set up the LOD info */
     switch (op) {
     case FS_OPCODE_TXB:
@@ -4288,9 +4198,6 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
        coordinate_done = true;
        break;
     case SHADER_OPCODE_TG4_OFFSET:
-      /* gather4_po_c should have been lowered in SIMD16 mode. */
-      assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
-
        /* More crazy intermixing */
        for (unsigned i = 0; i < 2; i++) /* u, v */
           bld.MOV(sources[length++], offset(coordinate, bld, i));
@@ -4776,6 +4683,67 @@ get_fpu_lowered_simd_width(const struct brw_device_info *devinfo,
     return 1 << _mesa_logbase2(max_width);
  }
  
+/**
+ * Get the maximum allowed SIMD width for instruction \p inst accounting for
+ * various payload size restrictions that apply to sampler message
+ * instructions.
+ *
+ * This is only intended to provide a maximum theoretical bound for the
+ * execution size of the message based on the number of argument components
+ * alone, which in most cases will determine whether the SIMD8 or SIMD16
+ * variant of the message can be used, though some messages may have
+ * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
+ * the message length to determine the exact SIMD width and argument count,
+ * which makes a number of sampler message combinations impossible to
+ * represent).
+ */
+static unsigned
+get_sampler_lowered_simd_width(const struct brw_device_info *devinfo,
+                               const fs_inst *inst)
+{
+   /* Calculate the number of coordinate components that have to be present
+    * assuming that additional arguments follow the texel coordinates in the
+    * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
+    * need to pad to four or three components depending on the message,
+    * pre-ILK we need to pad to at most three components.
+    */
+   const unsigned req_coord_components =
+      (devinfo->gen >= 7 ||
+       !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
+      (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
+                            inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
+      3;
+
+   /* On Gen9+ the LOD argument is for free if we can use the LZ variant of
+    * the TXL or TXF message.  inst is still logical here, hence *_LOGICAL.
+    */
+   const bool implicit_lod = devinfo->gen >= 9 &&
+                             (inst->opcode == SHADER_OPCODE_TXL_LOGICAL ||
+                              inst->opcode == SHADER_OPCODE_TXF_LOGICAL) &&
+                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
+
+   /* Calculate the total number of argument components that need to be passed
+    * to the sampler unit.
+    */
+   const unsigned num_payload_components =
+      MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
+           req_coord_components) +
+      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
+      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
+      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
+      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
+      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
+       inst->components_read(TEX_LOGICAL_SRC_OFFSET_VALUE) : 0) +
+      inst->components_read(TEX_LOGICAL_SRC_MCS);
+
+   /* SIMD16 messages with more than five arguments exceed the maximum message
+    * size supported by the sampler, regardless of whether a header is
+    * provided or not.
+    */
+   return MIN2(inst->exec_size,
+               num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
+}
+
  /**
   * Get the closest native SIMD width supported by the hardware for instruction
   * \p inst.  The instruction will be left untouched by
@@ -4883,7 +4851,6 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
     case FS_OPCODE_PACK_HALF_2x16_SPLIT:
     case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
     case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
-   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
     case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
     case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
     case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
@@ -4952,31 +4919,24 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
     case SHADER_OPCODE_LOD_LOGICAL:
     case SHADER_OPCODE_TG4_LOGICAL:
     case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
-      return MIN2(16, inst->exec_size);
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      return get_sampler_lowered_simd_width(devinfo, inst);
  
     case SHADER_OPCODE_TXD_LOGICAL:
        /* TXD is unsupported in SIMD16 mode. */
        return 8;
  
-   case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
-      /* gather4_po_c is unsupported in SIMD16 mode. */
-      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
-      return (shadow_c.file != BAD_FILE ? 8 : MIN2(16, inst->exec_size));
-   }
     case SHADER_OPCODE_TXL_LOGICAL:
-   case FS_OPCODE_TXB_LOGICAL: {
-      /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
-       * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
-       * mode because the message exceeds the maximum length of 11.
+   case FS_OPCODE_TXB_LOGICAL:
+      /* Only one execution size is representable pre-ILK depending on whether
+       * the shadow reference argument is present.
         */
-      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
-      if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
-         return 16;
-      else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
-         return 8;
+      if (devinfo->gen == 4)
+         return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
        else
-         return MIN2(16, inst->exec_size);
-   }
+         return get_sampler_lowered_simd_width(devinfo, inst);
+
     case SHADER_OPCODE_TXF_LOGICAL:
     case SHADER_OPCODE_TXS_LOGICAL:
        /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
@@ -4985,23 +4945,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
        if (devinfo->gen == 4)
           return 16;
        else
-         return MIN2(16, inst->exec_size);
-
-   case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
-      /* This opcode can take up to 6 arguments which means that in some
-       * circumstances it can end up with a message that is too long in SIMD16
-       * mode.
-       */
-      const unsigned coord_components =
-         inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
-      /* First three arguments are the sample index and the two arguments for
-       * the MCS data.
-       */
-      if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
-         return 8;
-      else
-         return MIN2(16, inst->exec_size);
-   }
+         return get_sampler_lowered_simd_width(devinfo, inst);
  
     case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
     case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
@@ -5665,7 +5609,7 @@ fs_visitor::setup_gs_payload()
      * have to multiply by VerticesIn to obtain the total storage requirement.
      */
     if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
-       max_push_components) {
+       max_push_components || gs_prog_data->invocations > 1) {
        gs_prog_data->base.include_vue_handles = true;
  
        /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
@@ -6330,6 +6274,10 @@ fs_visitor::run_cs()
  /**
   * Return a bitfield where bit n is set if barycentric interpolation mode n
   * (see enum brw_barycentric_mode) is needed by the fragment shader.
+ *
+ * We examine the load_barycentric intrinsics rather than looking at input
+ * variables so that we catch interpolateAtCentroid() messages too, which
+ * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
   */
  static unsigned
  brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
@@ -6337,29 +6285,37 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
  {
     unsigned barycentric_interp_modes = 0;
  
-   nir_foreach_variable(var, &shader->inputs) {
-      /* Ignore WPOS; it doesn't require interpolation. */
-      if (var->data.location == VARYING_SLOT_POS)
+   nir_foreach_function(f, shader) {
+      if (!f->impl)
           continue;
  
-      /* Flat inputs don't need barycentric modes. */
-      if (var->data.interpolation == INTERP_MODE_FLAT)
-         continue;
+      nir_foreach_block(block, f->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
  
-      /* Determine the set (or sets) of barycentric coordinates needed to
-       * interpolate this variable.  Note that when
-       * brw->needs_unlit_centroid_workaround is set, centroid interpolation
-       * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
-       * for lit pixels, so we need both sets of barycentric coordinates.
-       */
-      enum brw_barycentric_mode bary_mode =
-         barycentric_mode((glsl_interp_mode) var->data.interpolation,
-                          var->data.centroid, var->data.sample);
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
+               continue;
+
+            /* Ignore WPOS; it doesn't require interpolation. */
+            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
+               continue;
  
-      barycentric_interp_modes |= 1 << bary_mode;
+            intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
+            enum glsl_interp_mode interp = (enum glsl_interp_mode)
+               nir_intrinsic_interp_mode(intrin);
+            nir_intrinsic_op bary_op = intrin->intrinsic;
+            enum brw_barycentric_mode bary =
+               brw_barycentric_mode(interp, bary_op);
  
-      if (var->data.centroid && devinfo->needs_unlit_centroid_workaround)
-         barycentric_interp_modes |= 1 << centroid_to_pixel(bary_mode);
+            barycentric_interp_modes |= 1 << bary;
+
+            if (devinfo->needs_unlit_centroid_workaround &&
+                bary_op == nir_intrinsic_load_barycentric_centroid)
+               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
+         }
+      }
     }
  
     return barycentric_interp_modes;
@@ -6425,38 +6381,46 @@ move_interpolation_to_top(nir_shader *nir)
           continue;
  
        nir_block *top = nir_start_block(f->impl);
+      exec_node *cursor_node = NULL;
  
        nir_foreach_block(block, f->impl) {
           if (block == top)
              continue;
  
-         nir_foreach_instr_reverse_safe(instr, block) {
+         nir_foreach_instr_safe(instr, block) {
              if (instr->type != nir_instr_type_intrinsic)
                 continue;
  
              nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-            switch (intrin->intrinsic) {
-            case nir_intrinsic_load_barycentric_pixel:
-            case nir_intrinsic_load_barycentric_centroid:
-            case nir_intrinsic_load_barycentric_sample:
-               break;
-            case nir_intrinsic_load_interpolated_input: {
-               nir_intrinsic_instr *bary_intrinsic =
-                  nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
-               nir_intrinsic_op op = bary_intrinsic->intrinsic;
-
-               /* Leave interpolateAtSample/Offset() where it is. */
-               if (op == nir_intrinsic_load_barycentric_at_sample ||
-                   op == nir_intrinsic_load_barycentric_at_offset)
-                  continue;
-            }
-            default:
+            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
+               continue;
+            nir_intrinsic_instr *bary_intrinsic =
+               nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
+            nir_intrinsic_op op = bary_intrinsic->intrinsic;
+
+            /* Leave interpolateAtSample/Offset() where they are. */
+            if (op == nir_intrinsic_load_barycentric_at_sample ||
+                op == nir_intrinsic_load_barycentric_at_offset)
                 continue;
-            }
  
-            exec_node_remove(&instr->node);
-            exec_list_push_head(&top->instr_list, &instr->node);
-            instr->block = top;
+            nir_instr *move[3] = {
+               &bary_intrinsic->instr,
+               intrin->src[1].ssa->parent_instr,
+               instr
+            };
+
+            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
+               if (move[i]->block != top) {
+                  move[i]->block = top;
+                  exec_node_remove(&move[i]->node);
+                  if (cursor_node) {
+                     exec_node_insert_after(cursor_node, &move[i]->node);
+                  } else {
+                     exec_list_push_head(&top->instr_list, &move[i]->node);
+                  }
+                  cursor_node = &move[i]->node;
+               }
+            }
           }
        }
        nir_metadata_preserve(f->impl, (nir_metadata)