v3d: Force sampling from base level for tg4.
[mesa.git] / src / broadcom / compiler / v3d40_tex.c
index 0b41f37d1ea46648eea6164411436e315775dc8c..c12d14c6fe6aedf33fa9dd4034c8503fb8b94bfb 100644 (file)
 #include "cle/v3d_packet_v41_pack.h"
 
 static void
-vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
+vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
+              int *tmu_writes)
 {
+        /* XXX perf: We should figure out how to merge ALU operations
+         * producing the val with this MOV, when possible.
+         */
         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+
+        (*tmu_writes)++;
 }
 
 static void
@@ -49,6 +55,10 @@ void
 v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 {
         unsigned unit = instr->texture_index;
+        int tmu_writes = 0;
+        static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
+                .op = V3D_TMU_OP_REGULAR,
+        };
 
         struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
         };
@@ -68,6 +78,8 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 .gather_component = instr->component,
 
                 .coefficient_mode = instr->op == nir_texop_txd,
+
+                .disable_autolod = instr->op == nir_texop_tg4
         };
 
         int non_array_components = instr->coord_components - instr->is_array;
@@ -82,39 +94,41 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                         if (non_array_components > 1) {
                                 vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT,
                                               ntq_get_src(c, instr->src[i].src,
-                                                          1));
+                                                          1), &tmu_writes);
                         }
                         if (non_array_components > 2) {
                                 vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR,
                                               ntq_get_src(c, instr->src[i].src,
-                                                          2));
+                                                          2), &tmu_writes);
                         }
 
                         if (instr->is_array) {
                                 vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI,
                                               ntq_get_src(c, instr->src[i].src,
-                                                          instr->coord_components - 1));
+                                                          instr->coord_components - 1),
+                                              &tmu_writes);
                         }
                         break;
 
                 case nir_tex_src_bias:
                         vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
                         break;
 
                 case nir_tex_src_lod:
                         vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
 
-                        if (instr->op != nir_texop_txf &&
-                            instr->op != nir_texop_tg4) {
+                        if (instr->op != nir_texop_txf)
                                 p2_unpacked.disable_autolod = true;
-                        }
                         break;
 
                 case nir_tex_src_comparator:
                         vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
                         break;
 
                 case nir_tex_src_offset: {
@@ -136,6 +150,10 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 
         /* Limit the number of channels returned to both how many the NIR
          * instruction writes and how many the instruction could produce.
+         *
+         * XXX perf: Can we also limit to the number of channels that are
+         * actually read by the users of this NIR dest, so that we don't need
+         * to emit unused LDTMUs?
          */
         uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
         if (!p1_unpacked.output_type_32_bit)
@@ -145,6 +163,14 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 (1 << MIN2(instr_return_channels,
                            c->key->tex[unit].return_channels)) - 1;
 
+        /* Word enables can't ask for more channels than the output type could
+         * provide (2 for f16, 4 for 32-bit).
+         */
+        assert(!p1_unpacked.output_type_32_bit ||
+               p0_unpacked.return_words_of_texture_data < (1 << 4));
+        assert(p1_unpacked.output_type_32_bit ||
+               p0_unpacked.return_words_of_texture_data < (1 << 2));
+
         uint32_t p0_packed;
         V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                           (uint8_t *)&p0_packed,
@@ -168,20 +194,28 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         p1_packed |= unit << 24;
 
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
+        /* XXX perf: Can we skip p1 setup for txf ops? */
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
-        vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
+        if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0)
+                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
 
         if (instr->op == nir_texop_txf) {
                 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, &tmu_writes);
         } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, &tmu_writes);
         } else {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, &tmu_writes);
         }
 
         vir_emit_thrsw(c);
 
+        /* The input FIFO has 16 slots shared across all threads, so halve
+         * the thread count until this lookup's writes fit one thread's share.
+         */
+        while (tmu_writes > 16 / c->threads)
+                c->threads /= 2;
+
         struct qreg return_values[4];
         for (int i = 0; i < 4; i++) {
                 /* Swizzling .zw of an RG texture should give undefined
@@ -200,6 +234,12 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                         STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
                         chan = return_values[i / 2];
 
+                        /* XXX perf: We should move this unpacking into NIR.
+                         * That would give us exposure of these types to NIR
+                         * optimization, so that (for example) a repacking of
+                         * half-float samples to the half-float render target
+                         * could be eliminated.
+                         */
                         if (nir_alu_type_get_base_type(instr->dest_type) ==
                             nir_type_float) {
                                 enum v3d_qpu_input_unpack unpack;