v3d: Force sampling from base level for tg4.

[mesa.git] / src / broadcom / compiler / v3d40_tex.c
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c

index 0b41f37d1ea46648eea6164411436e315775dc8c..c12d14c6fe6aedf33fa9dd4034c8503fb8b94bfb 100644 (file)
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -31,9 +31,15 @@
  #include "cle/v3d_packet_v41_pack.h"
  
  static void
-vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
+vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
+              int *tmu_writes)
  {
+        /* MOV val into magic register waddr and count it in *tmu_writes.
+         * XXX perf: merge the ALU op producing val into this MOV if possible.
+         */
          vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+
+        (*tmu_writes)++;
  }
  
  static void
@@ -49,6 +55,10 @@ void
  v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
  {
          unsigned unit = instr->texture_index;
+        int tmu_writes = 0;
+        static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
+                .op = V3D_TMU_OP_REGULAR,
+        };
  
          struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
          };
@@ -68,6 +78,8 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                  .gather_component = instr->component,
  
                  .coefficient_mode = instr->op == nir_texop_txd,
+
+                .disable_autolod = instr->op == nir_texop_tg4
          };
  
          int non_array_components = instr->coord_components - instr->is_array;
@@ -82,39 +94,41 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                          if (non_array_components > 1) {
                                  vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT,
                                                ntq_get_src(c, instr->src[i].src,
-                                                          1));
+                                                          1), &tmu_writes);
                          }
                          if (non_array_components > 2) {
                                  vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR,
                                                ntq_get_src(c, instr->src[i].src,
-                                                          2));
+                                                          2), &tmu_writes);
                          }
  
                          if (instr->is_array) {
                                  vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI,
                                                ntq_get_src(c, instr->src[i].src,
-                                                          instr->coord_components - 1));
+                                                          instr->coord_components - 1),
+                                              &tmu_writes);
                          }
                          break;
  
                  case nir_tex_src_bias:
                          vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
                          break;
  
                  case nir_tex_src_lod:
                          vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
  
-                        if (instr->op != nir_texop_txf &&
-                            instr->op != nir_texop_tg4) {
+                        if (instr->op != nir_texop_txf)
                                  p2_unpacked.disable_autolod = true;
-                        }
                          break;
  
                  case nir_tex_src_comparator:
                          vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
                          break;
  
                  case nir_tex_src_offset: {
@@ -136,6 +150,10 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
  
          /* Limit the number of channels returned to both how many the NIR
           * instruction writes and how many the instruction could produce.
+         *
+         * XXX perf: Can we also limit to the number of channels that are
+         * actually read by the users of this NIR dest, so that we don't need
+         * to emit unused LDTMUs?
           */
          uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
          if (!p1_unpacked.output_type_32_bit)
@@ -145,6 +163,14 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                  (1 << MIN2(instr_return_channels,
                             c->key->tex[unit].return_channels)) - 1;
  
+        /* Word enables can't ask for more channels than the output type could
+         * provide (2 for f16, 4 for 32-bit).
+         */
+        assert(!p1_unpacked.output_type_32_bit ||
+               p0_unpacked.return_words_of_texture_data < (1 << 4));
+        assert(p1_unpacked.output_type_32_bit ||
+               p0_unpacked.return_words_of_texture_data < (1 << 2));
+
          uint32_t p0_packed;
          V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                            (uint8_t *)&p0_packed,
@@ -168,20 +194,28 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
          p1_packed |= unit << 24;
  
          vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
+        /* XXX perf: Can we skip p1 setup for txf ops? */
          vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
-        vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
+        if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0)
+                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
  
          if (instr->op == nir_texop_txf) {
                  assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, &tmu_writes);
          } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, &tmu_writes);
          } else {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, &tmu_writes);
          }
  
          vir_emit_thrsw(c);
  
+        /* The input FIFO has 16 slots across all threads, so make sure we
+         * don't overfill our allocation.
+         */
+        while (tmu_writes > 16 / c->threads)
+                c->threads /= 2;
+
          struct qreg return_values[4];
          for (int i = 0; i < 4; i++) {
                  /* Swizzling .zw of an RG texture should give undefined
@@ -200,6 +234,12 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                          STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
                          chan = return_values[i / 2];
  
+                        /* XXX perf: We should move this unpacking into NIR.
+                         * That would give us exposure of these types to NIR
+                         * optimization, so that (for example) a repacking of
+                         * half-float samples to the half-float render target
+                         * could be eliminated.
+                         */
                          if (nir_alu_type_get_base_type(instr->dest_type) ==
                              nir_type_float) {
                                  enum v3d_qpu_input_unpack unpack;