From a97b40dca4949b5b8b3320e76768e54f430c9e78 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 23 Jun 2015 13:11:55 -0700 Subject: [PATCH] vc4: Add support for multisample framebuffer operations. This includes GL_SAMPLE_COVERAGE, GL_SAMPLE_ALPHA_TO_ONE, and GL_SAMPLE_ALPHA_TO_COVERAGE. I haven't implemented a dithering function yet, and gallium doesn't give me a good chance to do so for GL_SAMPLE_COVERAGE. --- src/gallium/drivers/vc4/vc4_nir_lower_blend.c | 132 +++++++++++++++--- src/gallium/drivers/vc4/vc4_nir_lower_io.c | 4 +- src/gallium/drivers/vc4/vc4_program.c | 48 ++++++- src/gallium/drivers/vc4/vc4_qir.c | 1 + src/gallium/drivers/vc4/vc4_qir.h | 12 ++ src/gallium/drivers/vc4/vc4_qpu.h | 11 ++ src/gallium/drivers/vc4/vc4_qpu_emit.c | 7 + 7 files changed, 191 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c index 0672a92226f..38676cff6b7 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -29,6 +29,10 @@ * from the tile buffer after having waited for the scoreboard (which is * handled by vc4_qpu_emit.c), then do math using your output color and that * destination value, and update the output color appropriately. + * + * Once this pass is done, the color write will either have one component (for + * single sample) with packed argb8888, or 4 components with the per-sample + * argb8888 result. */ /** @@ -40,15 +44,23 @@ #include "glsl/nir/nir_builder.h" #include "vc4_context.h" +static bool +blend_depends_on_dst_color(struct vc4_compile *c) +{ + return (c->fs_key->blend.blend_enable || + c->fs_key->blend.colormask != 0xf || + c->fs_key->logicop_func != PIPE_LOGICOP_COPY); +} + /** Emits a load of the previous fragment color from the tile buffer. 
*/ static nir_ssa_def * -vc4_nir_get_dst_color(nir_builder *b) +vc4_nir_get_dst_color(nir_builder *b, int sample) { nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_input); load->num_components = 1; - load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT; + load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT + sample; nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); nir_builder_instr_insert(b, &load->instr); return &load->dest.ssa; @@ -496,23 +508,26 @@ vc4_nir_swizzle_and_pack(struct vc4_compile *c, nir_builder *b, } -static void -vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, - nir_intrinsic_instr *intr) +static nir_ssa_def * +vc4_nir_blend_pipeline(struct vc4_compile *c, nir_builder *b, nir_ssa_def *src, + int sample) { enum pipe_format color_format = c->fs_key->color_format; const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); bool srgb = util_format_is_srgb(color_format); /* Pull out the float src/dst color components. */ - nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b); + nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b, sample); nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color); nir_ssa_def *src_color[4], *unpacked_dst_color[4]; for (unsigned i = 0; i < 4; i++) { - src_color[i] = nir_channel(b, intr->src[0].ssa, i); + src_color[i] = nir_channel(b, src, i); unpacked_dst_color[i] = nir_channel(b, dst_vec4, i); } + if (c->fs_key->sample_alpha_to_one && c->fs_key->msaa) + src_color[3] = nir_imm_float(b, 1.0); + vc4_nir_emit_alpha_test_discard(c, b, src_color[3]); nir_ssa_def *packed_color; @@ -560,16 +575,101 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, colormask &= ~(0xff << (i * 8)); } } - packed_color = nir_ior(b, - nir_iand(b, packed_color, - nir_imm_int(b, colormask)), - nir_iand(b, packed_dst_color, - nir_imm_int(b, ~colormask))); - /* Turn the old vec4 output into a store of the packed color. 
*/ - nir_instr_rewrite_src(&intr->instr, &intr->src[0], - nir_src_for_ssa(packed_color)); + return nir_ior(b, + nir_iand(b, packed_color, + nir_imm_int(b, colormask)), + nir_iand(b, packed_dst_color, + nir_imm_int(b, ~colormask))); +} + +static int +vc4_nir_next_output_driver_location(nir_shader *s) +{ + int maxloc = -1; /* -1 so that a shader with no outputs yields slot 0 */ + + nir_foreach_variable(var, &s->outputs) /* scan the outputs: the new slot must not clash with one of them */ + maxloc = MAX2(maxloc, (int)var->data.driver_location); /* cast keeps the comparison signed should driver_location be unsigned */ + + return maxloc + 1; /* first slot past the highest one in use */ +} + +static void +vc4_nir_store_sample_mask(struct vc4_compile *c, nir_builder *b, + nir_ssa_def *val) +{ + nir_variable *sample_mask = nir_variable_create(c->s, nir_var_shader_out, + glsl_uint_type(), + "sample_mask"); + sample_mask->data.driver_location = + vc4_nir_next_output_driver_location(c->s); + sample_mask->data.location = FRAG_RESULT_SAMPLE_MASK; + exec_list_push_tail(&c->s->outputs, &sample_mask->node); + + nir_intrinsic_instr *intr = + nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output); intr->num_components = 1; + intr->const_index[0] = sample_mask->data.location; + + intr->src[0] = nir_src_for_ssa(val); + nir_builder_instr_insert(b, &intr->instr); +} + +static void +vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_ssa_def *frag_color = intr->src[0].ssa; + + if (c->fs_key->sample_coverage) { + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_sample_mask_in); + load->num_components = 1; + nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_builder_instr_insert(b, &load->instr); + + nir_ssa_def *bitmask = &load->dest.ssa; + + vc4_nir_store_sample_mask(c, b, bitmask); + } else if (c->fs_key->sample_alpha_to_coverage) { + nir_ssa_def *a = nir_channel(b, frag_color, 3); + + /* XXX: We should do a nice dither based on the fragment + * coordinate, instead. 
+ */ + nir_ssa_def *num_samples = nir_imm_float(b, VC4_MAX_SAMPLES); + nir_ssa_def *num_bits = nir_f2i(b, nir_fmul(b, a, num_samples)); + nir_ssa_def *bitmask = nir_isub(b, + nir_ishl(b, + nir_imm_int(b, 1), + num_bits), + nir_imm_int(b, 1)); + vc4_nir_store_sample_mask(c, b, bitmask); + } + + /* The TLB color read returns each sample in turn, so if our blending + * depends on the destination color, we're going to have to run the + * blending function separately for each destination sample value, and + * then output the per-sample color using TLB_COLOR_MS. + */ + nir_ssa_def *blend_output; + if (c->fs_key->msaa && blend_depends_on_dst_color(c)) { + c->msaa_per_sample_output = true; + + nir_ssa_def *samples[4]; + for (int i = 0; i < VC4_MAX_SAMPLES; i++) + samples[i] = vc4_nir_blend_pipeline(c, b, frag_color, i); + blend_output = nir_vec4(b, + samples[0], samples[1], + samples[2], samples[3]); + } else { + blend_output = vc4_nir_blend_pipeline(c, b, frag_color, 0); + } + + nir_instr_rewrite_src(&intr->instr, &intr->src[0], + nir_src_for_ssa(blend_output)); + intr->num_components = blend_output->num_components; } static bool @@ -577,7 +677,7 @@ vc4_nir_lower_blend_block(nir_block *block, void *state) { struct vc4_compile *c = state; - nir_foreach_instr(block, instr) { + nir_foreach_instr_safe(block, instr) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c index 1afe52a63f4..72a514756fd 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -226,7 +226,9 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b, { b->cursor = nir_before_instr(&intr->instr); - if (intr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) { + if (intr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT && + intr->const_index[0] < (VC4_NIR_TLB_COLOR_READ_INPUT + + 
VC4_MAX_SAMPLES)) { /* This doesn't need any lowering. */ return; } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 081adfd185c..dda2d84b5b3 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1124,7 +1124,12 @@ emit_frag_end(struct vc4_compile *c) qir_TLB_Z_WRITE(c, z); } - qir_TLB_COLOR_WRITE(c, color); + if (!c->msaa_per_sample_output) { + qir_TLB_COLOR_WRITE(c, color); + } else { + for (int i = 0; i < VC4_MAX_SAMPLES; i++) + qir_TLB_COLOR_WRITE_MS(c, c->sample_colors[i]); + } } static void @@ -1475,18 +1480,42 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) case nir_intrinsic_load_input: assert(instr->num_components == 1); - if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) { - *dest = qir_TLB_COLOR_READ(c); + if (instr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT) { + /* Reads of the per-sample color need to be done in + * order. + */ + int sample_index = (instr->const_index[0] - + VC4_NIR_TLB_COLOR_READ_INPUT); + for (int i = 0; i <= sample_index; i++) { + if (c->color_reads[i].file == QFILE_NULL) { + c->color_reads[i] = + qir_TLB_COLOR_READ(c); + } + } + *dest = c->color_reads[sample_index]; } else { *dest = c->inputs[instr->const_index[0]]; } break; case nir_intrinsic_store_output: - assert(instr->num_components == 1); - c->outputs[instr->const_index[0]] = - qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); - c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1); + /* MSAA color outputs are the only case where we have an + * output that's not lowered to being a store of a single 32 + * bit value. 
+ */ + if (c->stage == QSTAGE_FRAG && instr->num_components == 4) { + assert(instr->const_index[0] == c->output_color_index); + for (int i = 0; i < 4; i++) { + c->sample_colors[i] = + qir_MOV(c, ntq_get_src(c, instr->src[0], + i)); + } + } else { + assert(instr->num_components == 1); + c->outputs[instr->const_index[0]] = + qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); + c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1); + } break; case nir_intrinsic_discard: @@ -1963,6 +1992,11 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) } else { key->logicop_func = PIPE_LOGICOP_COPY; } + key->msaa = vc4->rasterizer->base.multisample; + key->sample_coverage = (vc4->rasterizer->base.multisample && + vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); + key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage; + key->sample_alpha_to_one = vc4->blend->alpha_to_one; if (vc4->framebuffer.cbufs[0]) key->color_format = vc4->framebuffer.cbufs[0]->format; diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 4c6667a9d9f..4ec25310b67 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -86,6 +86,7 @@ static const struct qir_op_info qir_op_info[] = { [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true }, [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true }, [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1, true }, + [QOP_TLB_COLOR_WRITE_MS] = { "tlb_color_ms", 0, 1, true }, [QOP_TLB_COLOR_READ] = { "tlb_color_read", 1, 0 }, [QOP_MS_MASK] = { "ms_mask", 0, 1, true }, [QOP_VARY_ADD_C] = { "vary_add_c", 1, 1 }, diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 97a23df10c6..4e406d60d72 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -38,6 +38,7 @@ #include "vc4_screen.h" #include "vc4_qpu_defines.h" +#include "kernel/vc4_packet.h" #include "pipe/p_state.h" struct nir_builder; @@ -121,6 +122,7 @@ enum qop { 
QOP_TLB_STENCIL_SETUP, QOP_TLB_Z_WRITE, QOP_TLB_COLOR_WRITE, + QOP_TLB_COLOR_WRITE_MS, QOP_TLB_COLOR_READ, QOP_MS_MASK, QOP_VARY_ADD_C, @@ -306,6 +308,10 @@ struct vc4_fs_key { bool alpha_test; bool point_coord_upper_left; bool light_twoside; + bool msaa; + bool sample_coverage; + bool sample_alpha_to_coverage; + bool sample_alpha_to_one; uint8_t alpha_test_func; uint8_t logicop_func; uint32_t point_sprite_mask; @@ -350,6 +356,9 @@ struct vc4_compile { */ struct qreg *inputs; struct qreg *outputs; + bool msaa_per_sample_output; + struct qreg color_reads[VC4_MAX_SAMPLES]; + struct qreg sample_colors[VC4_MAX_SAMPLES]; uint32_t inputs_array_size; uint32_t outputs_array_size; uint32_t uniforms_array_size; @@ -421,6 +430,8 @@ struct vc4_compile { */ #define VC4_NIR_TLB_COLOR_READ_INPUT 2000000000 +#define VC4_NIR_MS_MASK_OUTPUT 2000000000 + /* Special offset for nir_load_uniform values to get a QUNIFORM_* * state-dependent value. */ @@ -619,6 +630,7 @@ QIR_ALU0(FRAG_REV_FLAG) QIR_ALU0(TEX_RESULT) QIR_ALU0(TLB_COLOR_READ) QIR_NODST_1(TLB_COLOR_WRITE) +QIR_NODST_1(TLB_COLOR_WRITE_MS) QIR_NODST_1(TLB_Z_WRITE) QIR_NODST_1(TLB_DISCARD_SETUP) QIR_NODST_1(TLB_STENCIL_SETUP) diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index 866ca5c1300..7c4ff1701ca 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -116,6 +116,17 @@ qpu_tlbc() return r; } +static inline struct qpu_reg +qpu_tlbc_ms() +{ + struct qpu_reg r = { + QPU_MUX_A, + QPU_W_TLB_COLOR_MS, + }; + + return r; +} + static inline struct qpu_reg qpu_r0(void) { return qpu_rn(0); } static inline struct qpu_reg qpu_r1(void) { return qpu_rn(1); } static inline struct qpu_reg qpu_r2(void) { return qpu_rn(2); } diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index a3d1627156f..5800e520068 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -438,6 +438,13 @@ 
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) } break; + case QOP_TLB_COLOR_WRITE_MS: + queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0])); + if (discard) { + set_last_cond_add(c, QPU_COND_ZS); + } + break; + case QOP_VARY_ADD_C: queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack); break; -- 2.30.2