From 6b4dfd53ae9b4f86cda0377a4d67b79e9faf7cc8 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 23 Jun 2015 09:50:36 -0700
Subject: [PATCH] vc4: Add support for texel fetches from MSAA resources.

This is the core of ARB_texture_multisample.  Most of the piglit tests for
GL_ARB_texture_multisample require GL 3.0, but exposing support for this
lets us use the gallium blitter for multisample resolves.  We can
sometimes multisample resolve using just the RCL, but that requires that
the blit is 1:1, unflipped, and aligned to tile boundaries.
---
 src/gallium/drivers/vc4/Makefile.sources      |   1 +
 .../drivers/vc4/vc4_nir_lower_txf_ms.c        | 172 ++++++++++++++++++
 src/gallium/drivers/vc4/vc4_program.c         | 101 ++++++++--
 src/gallium/drivers/vc4/vc4_qir.h             |  18 +-
 src/gallium/drivers/vc4/vc4_uniforms.c        |  18 ++
 5 files changed, 295 insertions(+), 15 deletions(-)
 create mode 100644 src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c

diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 6fb40c20562..24b577ae9f3 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -21,6 +21,7 @@ C_SOURCES := \
 	vc4_job.c \
 	vc4_nir_lower_blend.c \
 	vc4_nir_lower_io.c \
+	vc4_nir_lower_txf_ms.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
 	vc4_opt_copy_propagation.c \
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
new file mode 100644
index 00000000000..54873e6186a
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "vc4_qir.h"
+#include "kernel/vc4_packet.h"
+#include "tgsi/tgsi_info.h"
+#include "glsl/nir/nir_builder.h"
+
+/** @file vc4_nir_lower_txf_ms.c
+ * Walks the NIR generated by TGSI-to-NIR and lowers each nir_texop_txf_ms:
+ * the coordinate math is done in the shader and a plain nir_texop_txf is used.
+ *
+ * MSAA textures are laid out as 32x32-aligned blocks of RGBA8888 or Z24S8.
+ * We can't load them through the normal sampler path because of the lack of
+ * linear support in the hardware.  So, we treat MSAA textures as a giant UBO
+ * and do the math in the shader.
+ */
+
+static void
+vc4_nir_lower_txf_ms_instr(struct vc4_compile *c, nir_builder *b,
+                           nir_tex_instr *txf_ms)
+{
+        if (txf_ms->op != nir_texop_txf_ms)
+                return;
+        /* Lower this txf_ms to a plain txf reading from a computed address. */
+        b->cursor = nir_before_instr(&txf_ms->instr);
+
+        nir_tex_instr *txf = nir_tex_instr_create(c->s, 1);
+        txf->op = nir_texop_txf;
+        txf->sampler = txf_ms->sampler;
+        txf->sampler_index = txf_ms->sampler_index;
+        txf->coord_components = txf_ms->coord_components;
+        txf->is_shadow = txf_ms->is_shadow;
+        txf->is_new_style_shadow = txf_ms->is_new_style_shadow;
+        /* Pick out the coordinate and sample index; other sources are bugs. */
+        nir_ssa_def *coord = NULL, *sample_index = NULL;
+        for (int i = 0; i < txf_ms->num_srcs; i++) {
+                assert(txf_ms->src[i].src.is_ssa);
+
+                switch (txf_ms->src[i].src_type) {
+                case nir_tex_src_coord:
+                        coord = txf_ms->src[i].src.ssa;
+                        break;
+                case nir_tex_src_ms_index:
+                        sample_index = txf_ms->src[i].src.ssa;
+                        break;
+                default:
+                        unreachable("Unknown txf_ms src\n");
+                }
+        }
+        assert(coord);
+        assert(sample_index);
+        /* addr = tile + subspan + (sample | pixel) offsets, computed below. */
+        nir_ssa_def *x = nir_channel(b, coord, 0);
+        nir_ssa_def *y = nir_channel(b, coord, 1);
+        /* A tile is 32x32 pixels of VC4_MAX_SAMPLES 32-bit samples each. */
+        uint32_t tile_w = 32;
+        uint32_t tile_h = 32;
+        uint32_t tile_w_shift = 5;
+        uint32_t tile_h_shift = 5;
+        uint32_t tile_size = (tile_h * tile_w *
+                              VC4_MAX_SAMPLES * sizeof(uint32_t));
+        unsigned unit = txf_ms->sampler_index;
+        uint32_t w = align(c->key->tex[unit].msaa_width, tile_w);
+        uint32_t w_tiles = w / tile_w;
+        /* Tiles are row-major; within one, even x/y locate the 2x2 subspan. */
+        nir_ssa_def *x_tile = nir_ushr(b, x, nir_imm_int(b, tile_w_shift));
+        nir_ssa_def *y_tile = nir_ushr(b, y, nir_imm_int(b, tile_h_shift));
+        nir_ssa_def *tile_addr = nir_iadd(b,
+                                          nir_imul(b, x_tile,
+                                                   nir_imm_int(b, tile_size)),
+                                          nir_imul(b, y_tile,
+                                                   nir_imm_int(b, (w_tiles *
+                                                                   tile_size))));
+        nir_ssa_def *x_subspan = nir_iand(b, x,
+                                          nir_imm_int(b, (tile_w - 1) & ~1));
+        nir_ssa_def *y_subspan = nir_iand(b, y,
+                                          nir_imm_int(b, (tile_h - 1) & ~1));
+        nir_ssa_def *subspan_addr = nir_iadd(b,
+                                             nir_imul(b, x_subspan,
+                                                      nir_imm_int(b, 2 * VC4_MAX_SAMPLES * sizeof(uint32_t))),
+                                             nir_imul(b, y_subspan,
+                                                      nir_imm_int(b,
+                                                                  tile_w *
+                                                                  VC4_MAX_SAMPLES *
+                                                                  sizeof(uint32_t))));
+        /* Within a 2x2 subspan, x's low bit selects bit 2 and y's bit 3. */
+        nir_ssa_def *pixel_addr = nir_ior(b,
+                                          nir_iand(b,
+                                                   nir_ishl(b, x,
+                                                            nir_imm_int(b, 2)),
+                                                   nir_imm_int(b, (1 << 2))),
+                                          nir_iand(b,
+                                                   nir_ishl(b, y,
+                                                            nir_imm_int(b, 3)),
+                                                   nir_imm_int(b, (1 << 3))));
+        /* Consecutive sample indices are 16 bytes apart. */
+        nir_ssa_def *sample_addr = nir_ishl(b, sample_index, nir_imm_int(b, 4));
+
+        nir_ssa_def *addr = nir_iadd(b,
+                                     nir_ior(b, sample_addr, pixel_addr),
+                                     nir_iadd(b, subspan_addr, tile_addr));
+        /* The txf takes (addr, 0) as its coordinate and replaces the txf_ms. */
+        txf->src[0].src_type = nir_tex_src_coord;
+        txf->src[0].src = nir_src_for_ssa(nir_vec2(b, addr, nir_imm_int(b, 0)));
+        nir_ssa_dest_init(&txf->instr, &txf->dest, 4, NULL);
+        nir_builder_instr_insert(b, &txf->instr);
+        nir_ssa_def_rewrite_uses(&txf_ms->dest.ssa,
+                                 nir_src_for_ssa(&txf->dest.ssa));
+        nir_instr_remove(&txf_ms->instr);
+}
+
+static bool
+vc4_nir_lower_txf_ms_block(nir_block *block, void *arg)
+{
+        /* Per-block callback: "arg" carries the vc4_compile being lowered. */
+        struct vc4_compile *compile = arg;
+        nir_builder builder;
+
+        nir_builder_init(&builder,
+                         nir_cf_node_get_function(&block->cf_node));
+
+        /* The _safe iterator is used since lowering removes instructions. */
+        nir_foreach_instr_safe(block, instr) {
+                if (instr->type != nir_instr_type_tex)
+                        continue;
+                vc4_nir_lower_txf_ms_instr(compile, &builder,
+                                           nir_instr_as_tex(instr));
+        }
+        return true;
+}
+
+static bool
+vc4_nir_lower_txf_ms_impl(struct vc4_compile *c, nir_function_impl *impl)
+{
+        /* Lower every block, handing the compile state to the callback. */
+        nir_foreach_block(impl, vc4_nir_lower_txf_ms_block, c);
+
+        /* Report block indices and dominance as metadata to keep. */
+        nir_metadata_preserve(impl, (nir_metadata_block_index |
+                                     nir_metadata_dominance));
+        return true;
+}
+
+void
+vc4_nir_lower_txf_ms(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload) {
+                if (overload->impl) /* nothing to lower without an impl */
+                        vc4_nir_lower_txf_ms_impl(c, overload->impl);
+        }
+}
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index dda2d84b5b3..31968bb5db9 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -294,6 +294,76 @@ ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
                                         qir_uniform_ui(c, 24)));
 }
 
+static struct qreg
+ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
+{
+        /* Shift src right by 8, then scale the integer result by 1/0xffffff. */
+        struct qreg depth = qir_ITOF(c, qir_SHR(c, src, qir_uniform_ui(c, 8)));
+        return qir_FMUL(c, depth, qir_uniform_f(c, 1.0f/0xffffff));
+}
+
+/**
+ * Emits a lowered TXF_MS from an MSAA texture.
+ *
+ * The addressing math has already been lowered in NIR, so the only source is
+ * an address; it is clamped here and then read directly, like a UBO.
+ */
+static void
+ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
+{
+        uint32_t tile_width = 32;
+        uint32_t tile_height = 32;
+        uint32_t tile_size = (tile_height * tile_width *
+                              VC4_MAX_SAMPLES * sizeof(uint32_t));
+        /* Size of the MSAA surface with both dimensions rounded up to tiles. */
+        unsigned unit = instr->sampler_index;
+        uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
+        uint32_t w_tiles = w / tile_width;
+        uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
+        uint32_t h_tiles = h / tile_height;
+        uint32_t size = w_tiles * h_tiles * tile_size;
+        /* The lowered instruction has one source; channel 0 is the address. */
+        struct qreg addr;
+        assert(instr->num_srcs == 1);
+        assert(instr->src[0].src_type == nir_tex_src_coord);
+        addr = ntq_get_src(c, instr->src[0].src, 0);
+
+        /* Perform the clamping required by kernel validation. */
+        addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
+        addr = qir_MIN(c, addr,  qir_uniform_ui(c, size - 4));
+        /* Direct fetch using addr and the MSAA buffer address uniform. */
+        qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
+
+        struct qreg tex = qir_TEX_RESULT(c);
+        c->num_texture_samples++;
+        /* Decode the raw result according to the texture's format. */
+        struct qreg texture_output[4];
+        enum pipe_format format = c->key->tex[unit].format;
+        if (util_format_is_depth_or_stencil(format)) {
+                struct qreg scaled = ntq_scale_depth_texture(c, tex);
+                for (int i = 0; i < 4; i++)
+                        texture_output[i] = scaled;
+        } else {
+                struct qreg tex_result_unpacked[4];
+                for (int i = 0; i < 4; i++)
+                        tex_result_unpacked[i] = qir_UNPACK_8_F(c, tex, i);
+
+                const uint8_t *format_swiz =
+                        vc4_get_format_swizzle(c->key->tex[unit].format);
+                for (int i = 0; i < 4; i++) {
+                        texture_output[i] =
+                                get_swizzled_channel(c, tex_result_unpacked,
+                                                     format_swiz[i]);
+                }
+        }
+        /* Apply the swizzle stored in the shader key when writing dest. */
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
+        for (int i = 0; i < 4; i++) {
+                dest[i] = get_swizzled_channel(c, texture_output,
+                                               c->key->tex[unit].swizzle[i]);
+        }
+}
+
 static void
 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 {
@@ -301,6 +371,11 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
         bool is_txb = false, is_txl = false, has_proj = false;
         unsigned unit = instr->sampler_index;
 
+        if (instr->op == nir_texop_txf) {
+                ntq_emit_txf(c, instr);
+                return;
+        }
+
         for (unsigned i = 0; i < instr->num_srcs; i++) {
                 switch (instr->src[i].src_type) {
                 case nir_tex_src_coord:
@@ -396,11 +471,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 
         struct qreg unpacked[4];
         if (util_format_is_depth_or_stencil(format)) {
-                struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex,
-                                                         qir_uniform_ui(c, 8)));
-                struct qreg normalized = qir_FMUL(c, depthf,
-                                                  qir_uniform_f(c, 1.0f/0xffffff));
-
+                struct qreg normalized = ntq_scale_depth_texture(c, tex);
                 struct qreg depth_output;
 
                 struct qreg one = qir_uniform_f(c, 1.0f);
@@ -1712,6 +1783,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
                 nir_lower_clip_vs(c->s, c->key->ucp_enables);
 
         vc4_nir_lower_io(c);
+        vc4_nir_lower_txf_ms(c);
         nir_lower_idiv(c->s);
         nir_lower_load_const_to_scalar(c->s);
 
@@ -1947,12 +2019,19 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
                 struct pipe_sampler_state *sampler_state =
                         texstate->samplers[i];
 
-                if (sampler) {
-                        key->tex[i].format = sampler->format;
-                        key->tex[i].swizzle[0] = sampler->swizzle_r;
-                        key->tex[i].swizzle[1] = sampler->swizzle_g;
-                        key->tex[i].swizzle[2] = sampler->swizzle_b;
-                        key->tex[i].swizzle[3] = sampler->swizzle_a;
+                if (!sampler)
+                        continue;
+
+                key->tex[i].format = sampler->format;
+                key->tex[i].swizzle[0] = sampler->swizzle_r;
+                key->tex[i].swizzle[1] = sampler->swizzle_g;
+                key->tex[i].swizzle[2] = sampler->swizzle_b;
+                key->tex[i].swizzle[3] = sampler->swizzle_a;
+
+                if (sampler->texture->nr_samples) {
+                        key->tex[i].msaa_width = sampler->texture->width0;
+                        key->tex[i].msaa_height = sampler->texture->height0;
+                } else if (sampler){
                         key->tex[i].compare_mode = sampler_state->compare_mode;
                         key->tex[i].compare_func = sampler_state->compare_func;
                         key->tex[i].wrap_s = sampler_state->wrap_s;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 4e406d60d72..d53095ed222 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -233,6 +233,8 @@ enum quniform_contents {
         /** A reference to a texture config parameter 2 cubemap stride uniform */
         QUNIFORM_TEXTURE_CONFIG_P2,
 
+        QUNIFORM_TEXTURE_MSAA_ADDR,
+
         QUNIFORM_UBO_ADDR,
 
         QUNIFORM_TEXRECT_SCALE_X,
@@ -287,11 +289,18 @@ struct vc4_key {
         struct vc4_uncompiled_shader *shader_state;
         struct {
                 enum pipe_format format;
-                unsigned compare_mode:1;
-                unsigned compare_func:3;
-                unsigned wrap_s:3;
-                unsigned wrap_t:3;
                 uint8_t swizzle[4];
+                union {
+                        struct {
+                                unsigned compare_mode:1;
+                                unsigned compare_func:3;
+                                unsigned wrap_s:3;
+                                unsigned wrap_t:3;
+                        };
+                        struct {
+                                uint16_t msaa_width, msaa_height;
+                        };
+                };
         } tex[VC4_MAX_TEXTURE_SAMPLERS];
         uint8_t ucp_enables;
 };
@@ -490,6 +499,7 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
                                        enum quniform_contents contents);
 nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
                                           nir_ssa_def **srcs, int swiz);
+void vc4_nir_lower_txf_ms(struct vc4_compile *c);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 5dfdd73f7bd..262531f1bd7 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -71,6 +71,18 @@ write_texture_p2(struct vc4_context *vc4,
                VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
 }
 
+static void
+write_texture_msaa_addr(struct vc4_context *vc4,
+                        struct vc4_cl_out **uniforms,
+                        struct vc4_texture_stateobj *texstate,
+                        uint32_t unit)
+{
+        /* Reloc against the BO of the resource bound at texture "unit". */
+        struct pipe_sampler_view *view = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(view->texture);
+        cl_aligned_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, 0);
+}
+
 
 #define SWIZ(x,y,z,w) {          \
         UTIL_FORMAT_SWIZZLE_##x, \
@@ -244,6 +256,11 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                         cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
                         break;
 
+                case QUNIFORM_TEXTURE_MSAA_ADDR:
+                        write_texture_msaa_addr(vc4, &uniforms,
+                                                texstate, uinfo->data[i]);
+                        break;
+
                 case QUNIFORM_TEXTURE_BORDER_COLOR:
                         write_texture_border_color(vc4, &uniforms,
                                                    texstate, uinfo->data[i]);
@@ -349,6 +366,7 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                 case QUNIFORM_TEXTURE_CONFIG_P1:
                 case QUNIFORM_TEXTURE_CONFIG_P2:
                 case QUNIFORM_TEXTURE_BORDER_COLOR:
+                case QUNIFORM_TEXTURE_MSAA_ADDR:
                 case QUNIFORM_TEXRECT_SCALE_X:
                 case QUNIFORM_TEXRECT_SCALE_Y:
                         dirty |= VC4_DIRTY_TEXSTATE;
-- 
2.30.2