From a97b40dca4949b5b8b3320e76768e54f430c9e78 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 23 Jun 2015 13:11:55 -0700
Subject: [PATCH] vc4: Add support for multisample framebuffer operations.

This includes GL_SAMPLE_COVERAGE, GL_SAMPLE_ALPHA_TO_ONE, and
GL_SAMPLE_ALPHA_TO_COVERAGE.

I haven't implemented a dithering function yet, and gallium doesn't give
me a good chance to do so for GL_SAMPLE_COVERAGE.
---
 src/gallium/drivers/vc4/vc4_nir_lower_blend.c | 132 +++++++++++++++---
 src/gallium/drivers/vc4/vc4_nir_lower_io.c    |   4 +-
 src/gallium/drivers/vc4/vc4_program.c         |  48 ++++++-
 src/gallium/drivers/vc4/vc4_qir.c             |   1 +
 src/gallium/drivers/vc4/vc4_qir.h             |  12 ++
 src/gallium/drivers/vc4/vc4_qpu.h             |  11 ++
 src/gallium/drivers/vc4/vc4_qpu_emit.c        |   7 +
 7 files changed, 191 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 0672a92226f..38676cff6b7 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -29,6 +29,10 @@
  * from the tile buffer after having waited for the scoreboard (which is
  * handled by vc4_qpu_emit.c), then do math using your output color and that
  * destination value, and update the output color appropriately.
+ *
+ * Once this pass is done, the color write will either have one component (for
+ * single sample) with packed argb8888, or 4 components with the per-sample
+ * argb8888 result.
  */
 
 /**
@@ -40,15 +44,23 @@
 #include "glsl/nir/nir_builder.h"
 #include "vc4_context.h"
 
+static bool
+blend_depends_on_dst_color(struct vc4_compile *c)
+{
+        const struct vc4_fs_key *key = c->fs_key; /* plain COPY needs no dst */
+        return !(!key->blend.blend_enable && key->blend.colormask == 0xf &&
+                 key->logicop_func == PIPE_LOGICOP_COPY);
+}
+
 /** Emits a load of the previous fragment color from the tile buffer. */
 static nir_ssa_def *
-vc4_nir_get_dst_color(nir_builder *b)
+vc4_nir_get_dst_color(nir_builder *b, int sample)
 {
         nir_intrinsic_instr *load =
                 nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_load_input);
         load->num_components = 1;
-        load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT;
+        load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT + sample;
         nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
         nir_builder_instr_insert(b, &load->instr);
         return &load->dest.ssa;
@@ -496,23 +508,26 @@ vc4_nir_swizzle_and_pack(struct vc4_compile *c, nir_builder *b,
 
 }
 
-static void
-vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
-                          nir_intrinsic_instr *intr)
+static nir_ssa_def *
+vc4_nir_blend_pipeline(struct vc4_compile *c, nir_builder *b, nir_ssa_def *src,
+                       int sample)
 {
         enum pipe_format color_format = c->fs_key->color_format;
         const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
         bool srgb = util_format_is_srgb(color_format);
 
         /* Pull out the float src/dst color components. */
-        nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b);
+        nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b, sample);
         nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color);
         nir_ssa_def *src_color[4], *unpacked_dst_color[4];
         for (unsigned i = 0; i < 4; i++) {
-                src_color[i] = nir_channel(b, intr->src[0].ssa, i);
+                src_color[i] = nir_channel(b, src, i);
                 unpacked_dst_color[i] = nir_channel(b, dst_vec4, i);
         }
 
+        if (c->fs_key->sample_alpha_to_one && c->fs_key->msaa)
+                src_color[3] = nir_imm_float(b, 1.0);
+
         vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);
 
         nir_ssa_def *packed_color;
@@ -560,16 +575,101 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                         colormask &= ~(0xff << (i * 8));
                 }
         }
-        packed_color = nir_ior(b,
-                               nir_iand(b, packed_color,
-                                        nir_imm_int(b, colormask)),
-                               nir_iand(b, packed_dst_color,
-                                        nir_imm_int(b, ~colormask)));
 
-        /* Turn the old vec4 output into a store of the packed color. */
-        nir_instr_rewrite_src(&intr->instr, &intr->src[0],
-                              nir_src_for_ssa(packed_color));
+        return nir_ior(b,
+                       nir_iand(b, packed_color,
+                                nir_imm_int(b, colormask)),
+                       nir_iand(b, packed_dst_color,
+                                nir_imm_int(b, ~colormask)));
+}
+
+static int
+vc4_nir_next_output_driver_location(nir_shader *s)
+{
+        int maxloc = -1;
+        /* Find the highest driver_location already used by an output. */
+        nir_foreach_variable(var, &s->outputs)
+                maxloc = MAX2(maxloc, var->data.driver_location);
+        /* One past the highest in use; 0 when the shader has no outputs. */
+        return maxloc + 1;
+}
+
+static void
+vc4_nir_store_sample_mask(struct vc4_compile *c, nir_builder *b,
+                          nir_ssa_def *val)
+{
+        nir_variable *sample_mask = nir_variable_create(c->s, nir_var_shader_out,
+                                                        glsl_uint_type(),
+                                                        "sample_mask");
+        sample_mask->data.driver_location =
+                vc4_nir_next_output_driver_location(c->s);
+        sample_mask->data.location = FRAG_RESULT_SAMPLE_MASK;
+        exec_list_push_tail(&c->s->outputs, &sample_mask->node);
+        /* Build the one-component store_output carrying val. */
+        nir_intrinsic_instr *intr =
+                nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output);
         intr->num_components = 1;
+        intr->const_index[0] = sample_mask->data.location;
+        /* NOTE(review): uses data.location, not driver_location -- confirm. */
+        intr->src[0] = nir_src_for_ssa(val);
+        nir_builder_instr_insert(b, &intr->instr);
+}
+
+static void
+vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
+{
+        nir_ssa_def *frag_color = intr->src[0].ssa;
+        /* Lower one color store: sample mask first, then the blending. */
+        if (c->fs_key->sample_coverage) {
+                nir_intrinsic_instr *load =
+                        nir_intrinsic_instr_create(b->shader,
+                                                   nir_intrinsic_load_sample_mask_in);
+                load->num_components = 1;
+                nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+                nir_builder_instr_insert(b, &load->instr);
+                /* The loaded sample_mask_in value is stored out unmodified. */
+                nir_ssa_def *bitmask = &load->dest.ssa;
+                /* NOTE: alpha-to-coverage (below) is not applied here. */
+                vc4_nir_store_sample_mask(c, b, bitmask);
+        } else if (c->fs_key->sample_alpha_to_coverage) {
+                nir_ssa_def *a = nir_channel(b, frag_color, 3);
+                /* Low bits set: mask = (1 << f2i(a * VC4_MAX_SAMPLES)) - 1.
+                 * XXX: We should do a nice dither based on the fragment
+                 * coordinate, instead.
+                 */
+                nir_ssa_def *num_samples = nir_imm_float(b, VC4_MAX_SAMPLES);
+                nir_ssa_def *num_bits = nir_f2i(b, nir_fmul(b, a, num_samples));
+                nir_ssa_def *bitmask = nir_isub(b,
+                                                nir_ishl(b,
+                                                         nir_imm_int(b, 1),
+                                                         num_bits),
+                                                nir_imm_int(b, 1));
+                vc4_nir_store_sample_mask(c, b, bitmask);
+        }
+
+        /* The TLB color read returns each sample in turn, so if our blending
+         * depends on the destination color, we're going to have to run the
+         * blending function separately for each destination sample value, and
+         * then output the per-sample color using TLB_COLOR_MS.
+         */
+        nir_ssa_def *blend_output;
+        if (c->fs_key->msaa && blend_depends_on_dst_color(c)) {
+                c->msaa_per_sample_output = true;
+                /* One blend pipeline per sample index, packed as a vec4. */
+                nir_ssa_def *samples[4];
+                for (int i = 0; i < VC4_MAX_SAMPLES; i++)
+                        samples[i] = vc4_nir_blend_pipeline(c, b, frag_color, i);
+                blend_output = nir_vec4(b,
+                                        samples[0], samples[1],
+                                        samples[2], samples[3]);
+        } else {
+                blend_output = vc4_nir_blend_pipeline(c, b, frag_color, 0);
+        }
+        /* Store the blended value: 1 component, or 4 when per-sample. */
+        nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+                              nir_src_for_ssa(blend_output));
+        intr->num_components = blend_output->num_components;
 }
 
 static bool
@@ -577,7 +677,7 @@ vc4_nir_lower_blend_block(nir_block *block, void *state)
 {
         struct vc4_compile *c = state;
 
-        nir_foreach_instr(block, instr) {
+        nir_foreach_instr_safe(block, instr) {
                 if (instr->type != nir_instr_type_intrinsic)
                         continue;
                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 1afe52a63f4..72a514756fd 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -226,7 +226,9 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        if (intr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
+        if (intr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT &&
+            intr->const_index[0] < (VC4_NIR_TLB_COLOR_READ_INPUT +
+                                    VC4_MAX_SAMPLES)) {
                 /* This doesn't need any lowering. */
                 return;
         }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 081adfd185c..dda2d84b5b3 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1124,7 +1124,12 @@ emit_frag_end(struct vc4_compile *c)
                 qir_TLB_Z_WRITE(c, z);
         }
 
-        qir_TLB_COLOR_WRITE(c, color);
+        if (!c->msaa_per_sample_output) {
+                qir_TLB_COLOR_WRITE(c, color);
+        } else {
+                for (int i = 0; i < VC4_MAX_SAMPLES; i++)
+                        qir_TLB_COLOR_WRITE_MS(c, c->sample_colors[i]);
+        }
 }
 
 static void
@@ -1475,18 +1480,42 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 
         case nir_intrinsic_load_input:
                 assert(instr->num_components == 1);
-                if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
-                        *dest = qir_TLB_COLOR_READ(c);
+                if (instr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT) {
+                        /* Reads of the per-sample color need to be done in
+                         * order.
+                         */
+                        int sample_index = (instr->const_index[0] -
+                                           VC4_NIR_TLB_COLOR_READ_INPUT);
+                        for (int i = 0; i <= sample_index; i++) {
+                                if (c->color_reads[i].file == QFILE_NULL) {
+                                        c->color_reads[i] =
+                                                qir_TLB_COLOR_READ(c);
+                                }
+                        }
+                        *dest = c->color_reads[sample_index];
                 } else {
                         *dest = c->inputs[instr->const_index[0]];
                 }
                 break;
 
         case nir_intrinsic_store_output:
-                assert(instr->num_components == 1);
-                c->outputs[instr->const_index[0]] =
-                        qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
-                c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1);
+                /* MSAA color outputs are the only case where we have an
+                 * output that's not lowered to being a store of a single 32
+                 * bit value.
+                 */
+                if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
+                        assert(instr->const_index[0] == c->output_color_index);
+                        for (int i = 0; i < 4; i++) {
+                                c->sample_colors[i] =
+                                        qir_MOV(c, ntq_get_src(c, instr->src[0],
+                                                               i));
+                        }
+                } else {
+                        assert(instr->num_components == 1);
+                        c->outputs[instr->const_index[0]] =
+                                qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
+                        c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1);
+                }
                 break;
 
         case nir_intrinsic_discard:
@@ -1963,6 +1992,11 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
         } else {
                 key->logicop_func = PIPE_LOGICOP_COPY;
         }
+        key->msaa = vc4->rasterizer->base.multisample;
+        key->sample_coverage = (vc4->rasterizer->base.multisample &&
+                                vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
+        key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
+        key->sample_alpha_to_one = vc4->blend->alpha_to_one;
         if (vc4->framebuffer.cbufs[0])
                 key->color_format = vc4->framebuffer.cbufs[0]->format;
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 4c6667a9d9f..4ec25310b67 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -86,6 +86,7 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true },
         [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true },
         [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1, true },
+        [QOP_TLB_COLOR_WRITE_MS] = { "tlb_color_ms", 0, 1, true },
         [QOP_TLB_COLOR_READ] = { "tlb_color_read", 1, 0 },
         [QOP_MS_MASK] = { "ms_mask", 0, 1, true },
         [QOP_VARY_ADD_C] = { "vary_add_c", 1, 1 },
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 97a23df10c6..4e406d60d72 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -38,6 +38,7 @@
 
 #include "vc4_screen.h"
 #include "vc4_qpu_defines.h"
+#include "kernel/vc4_packet.h"
 #include "pipe/p_state.h"
 
 struct nir_builder;
@@ -121,6 +122,7 @@ enum qop {
         QOP_TLB_STENCIL_SETUP,
         QOP_TLB_Z_WRITE,
         QOP_TLB_COLOR_WRITE,
+        QOP_TLB_COLOR_WRITE_MS,
         QOP_TLB_COLOR_READ,
         QOP_MS_MASK,
         QOP_VARY_ADD_C,
@@ -306,6 +308,10 @@ struct vc4_fs_key {
         bool alpha_test;
         bool point_coord_upper_left;
         bool light_twoside;
+        bool msaa;
+        bool sample_coverage;
+        bool sample_alpha_to_coverage;
+        bool sample_alpha_to_one;
         uint8_t alpha_test_func;
         uint8_t logicop_func;
         uint32_t point_sprite_mask;
@@ -350,6 +356,9 @@ struct vc4_compile {
          */
         struct qreg *inputs;
         struct qreg *outputs;
+        bool msaa_per_sample_output;
+        struct qreg color_reads[VC4_MAX_SAMPLES];
+        struct qreg sample_colors[VC4_MAX_SAMPLES];
         uint32_t inputs_array_size;
         uint32_t outputs_array_size;
         uint32_t uniforms_array_size;
@@ -421,6 +430,8 @@ struct vc4_compile {
  */
 #define VC4_NIR_TLB_COLOR_READ_INPUT		2000000000
 
+#define VC4_NIR_MS_MASK_OUTPUT			2000000000
+
 /* Special offset for nir_load_uniform values to get a QUNIFORM_*
  * state-dependent value.
  */
@@ -619,6 +630,7 @@ QIR_ALU0(FRAG_REV_FLAG)
 QIR_ALU0(TEX_RESULT)
 QIR_ALU0(TLB_COLOR_READ)
 QIR_NODST_1(TLB_COLOR_WRITE)
+QIR_NODST_1(TLB_COLOR_WRITE_MS)
 QIR_NODST_1(TLB_Z_WRITE)
 QIR_NODST_1(TLB_DISCARD_SETUP)
 QIR_NODST_1(TLB_STENCIL_SETUP)
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index 866ca5c1300..7c4ff1701ca 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -116,6 +116,17 @@ qpu_tlbc()
         return r;
 }
 
+static inline struct qpu_reg
+qpu_tlbc_ms(void)
+{
+        /* Write destination for QOP_TLB_COLOR_WRITE_MS (QPU_W_TLB_COLOR_MS). */
+        struct qpu_reg r = {
+                QPU_MUX_A,
+                QPU_W_TLB_COLOR_MS,
+        };
+        return r;
+}
+
 static inline struct qpu_reg qpu_r0(void) { return qpu_rn(0); }
 static inline struct qpu_reg qpu_r1(void) { return qpu_rn(1); }
 static inline struct qpu_reg qpu_r2(void) { return qpu_rn(2); }
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index a3d1627156f..5800e520068 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -438,6 +438,13 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         }
                         break;
 
+                case QOP_TLB_COLOR_WRITE_MS:
+                        queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
+                        if (discard) {
+                                set_last_cond_add(c, QPU_COND_ZS);
+                        }
+                        break;
+
                 case QOP_VARY_ADD_C:
                         queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                         break;
-- 
2.30.2