From bf3c50fba221f216e38d3f60f89161ced4c684c0 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 13 Apr 2015 21:36:24 -0700 Subject: [PATCH] vc4: Move all of our fixed function fragment color handling to NIR. This massively reduces our dependency on VC4-specific optimization passes. shader-db: total uniforms in shared programs: 32077 -> 32067 (-0.03%) uniforms in affected programs: 149 -> 139 (-6.71%) total instructions in shared programs: 98208 -> 98182 (-0.03%) instructions in affected programs: 2154 -> 2128 (-1.21%) --- src/gallium/drivers/vc4/Makefile.sources | 1 + src/gallium/drivers/vc4/vc4_nir_lower_blend.c | 431 +++++++++++++++++ src/gallium/drivers/vc4/vc4_nir_lower_io.c | 24 +- src/gallium/drivers/vc4/vc4_program.c | 440 +++--------------- src/gallium/drivers/vc4/vc4_qir.h | 16 +- src/gallium/drivers/vc4/vc4_uniforms.c | 14 +- 6 files changed, 538 insertions(+), 388 deletions(-) create mode 100644 src/gallium/drivers/vc4/vc4_nir_lower_blend.c diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index b09ffa60c39..6fb40c20562 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -19,6 +19,7 @@ C_SOURCES := \ vc4_fence.c \ vc4_formats.c \ vc4_job.c \ + vc4_nir_lower_blend.c \ vc4_nir_lower_io.c \ vc4_opt_algebraic.c \ vc4_opt_constant_folding.c \ diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c new file mode 100644 index 00000000000..a372a6c0cdc --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -0,0 +1,431 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the 
Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * Implements most of the fixed function fragment pipeline in shader code. + * + * VC4 doesn't have any hardware support for blending, alpha test, logic ops, + * or color mask. Instead, you read the current contents of the destination + * from the tile buffer after having waited for the scoreboard (which is + * handled by vc4_qpu_emit.c), then do math using your output color and that + * destination value, and update the output color appropriately. + */ + +/** + * Lowers fixed-function blending to a load of the destination color and a + * series of ALU operations before the store of the output. + */ +#include "util/u_format.h" +#include "vc4_qir.h" +#include "glsl/nir/nir_builder.h" +#include "vc4_context.h" + +/** Emits a load of the previous fragment color from the tile buffer. 
*/ +static nir_ssa_def * +vc4_nir_get_dst_color(nir_builder *b) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_input); + load->num_components = 1; + load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT; + nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} +/** Emits the sRGB-to-linear conversion for one channel: srgb / 12.92 when srgb is below 0.04045, otherwise ((srgb + 0.055) / 1.055) raised to the power 2.4. */ +static nir_ssa_def * +vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb) +{ + nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045)); + nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92)); + nir_ssa_def *high = nir_fpow(b, + nir_fmul(b, + nir_fadd(b, srgb, + nir_imm_float(b, 0.055)), + nir_imm_float(b, 1.0 / 1.055)), + nir_imm_float(b, 2.4)); + + return nir_bcsel(b, is_low, low, high); +} +/** Emits the linear-to-sRGB conversion for one channel: linear * 12.92 when linear is below 0.0031308, otherwise 1.055 * pow(linear, 0.41666) - 0.055. */ +static nir_ssa_def * +vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear) +{ + nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308)); + nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92)); + nir_ssa_def *high = nir_fsub(b, + nir_fmul(b, + nir_imm_float(b, 1.055), + nir_fpow(b, + linear, + nir_imm_float(b, 0.41666))), + nir_imm_float(b, 0.055)); + + return nir_bcsel(b, is_low, low, high); +} +/** Returns the blend factor selected by a PIPE_BLENDFACTOR_* value for one channel. src and dst are indexed by channel, with index 3 read as alpha. Only the factor is returned; vc4_do_blending() multiplies the color by it. Unsupported or unknown factors are reported on stderr and treated as 1.0. */ +static nir_ssa_def * +vc4_blend_channel(nir_builder *b, + nir_ssa_def **src, + nir_ssa_def **dst, + unsigned factor, + int channel) +{ + switch(factor) { + case PIPE_BLENDFACTOR_ONE: + return nir_imm_float(b, 1.0); + case PIPE_BLENDFACTOR_SRC_COLOR: + return src[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return src[3]; + case PIPE_BLENDFACTOR_DST_ALPHA: + return dst[3]; + case PIPE_BLENDFACTOR_DST_COLOR: + return dst[channel]; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + if (channel != 3) { + return nir_fmin(b, + src[3], + nir_fsub(b, + nir_imm_float(b, 1.0), + dst[3])); + } else { + return nir_imm_float(b, 1.0); + } + case PIPE_BLENDFACTOR_CONST_COLOR: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel); + case 
PIPE_BLENDFACTOR_CONST_ALPHA: + return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W); + case PIPE_BLENDFACTOR_ZERO: + return nir_imm_float(b, 0.0); + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]); + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), src[3]); + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]); + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]); + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel)); + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return nir_fsub(b, nir_imm_float(b, 1.0), + vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W)); + + default: + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + /* Unsupported. */ + fprintf(stderr, "Unknown blend factor %d\n", factor); + return nir_imm_float(b, 1.0); + } +} +/** Combines the src and dst terms using the PIPE_BLEND_* equation given in func. An unknown func is reported on stderr and yields src unchanged. */ +static nir_ssa_def * +vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst, + unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + return nir_fadd(b, src, dst); + case PIPE_BLEND_SUBTRACT: + return nir_fsub(b, src, dst); + case PIPE_BLEND_REVERSE_SUBTRACT: + return nir_fsub(b, dst, src); + case PIPE_BLEND_MIN: + return nir_fmin(b, src, dst); + case PIPE_BLEND_MAX: + return nir_fmax(b, src, dst); + + default: + /* Unsupported. 
*/ + fprintf(stderr, "Unknown blend func %d\n", func); + return src; + + } +} +/** Computes the blended color into result[0..3] from the state in c->fs_key->blend. With blend_enable unset, result is a copy of src_color. Otherwise src_color[] is overwritten in place with its saturated values, then channels 0-2 use the rgb factors and func while channel 3 uses the alpha ones. */ +static void +vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result, + nir_ssa_def **src_color, nir_ssa_def **dst_color) +{ + struct pipe_rt_blend_state *blend = &c->fs_key->blend; + + if (!blend->blend_enable) { + for (int i = 0; i < 4; i++) + result[i] = src_color[i]; + return; + } + + /* Clamp the src color to [0, 1]. Dest is already clamped. */ + for (int i = 0; i < 4; i++) + src_color[i] = nir_fsat(b, src_color[i]); + + nir_ssa_def *src_blend[4], *dst_blend[4]; + for (int i = 0; i < 4; i++) { + int src_factor = ((i != 3) ? blend->rgb_src_factor : + blend->alpha_src_factor); + int dst_factor = ((i != 3) ? blend->rgb_dst_factor : + blend->alpha_dst_factor); + src_blend[i] = nir_fmul(b, src_color[i], + vc4_blend_channel(b, + src_color, dst_color, + src_factor, i)); + dst_blend[i] = nir_fmul(b, dst_color[i], + vc4_blend_channel(b, + src_color, dst_color, + dst_factor, i)); + } + + for (int i = 0; i < 4; i++) { + result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i], + ((i != 3) ? 
blend->rgb_func : + blend->alpha_func)); + } +} +/** Emits the bitwise PIPE_LOGICOP_* combination of src and dst. An unknown op is reported on stderr and then handled like PIPE_LOGICOP_COPY, returning src. */ +static nir_ssa_def * +vc4_logicop(nir_builder *b, int logicop_func, + nir_ssa_def *src, nir_ssa_def *dst) +{ + switch (logicop_func) { + case PIPE_LOGICOP_CLEAR: + return nir_imm_int(b, 0); + case PIPE_LOGICOP_NOR: + return nir_inot(b, nir_ior(b, src, dst)); + case PIPE_LOGICOP_AND_INVERTED: + return nir_iand(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_COPY_INVERTED: + return nir_inot(b, src); + case PIPE_LOGICOP_AND_REVERSE: + return nir_iand(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_INVERT: + return nir_inot(b, dst); + case PIPE_LOGICOP_XOR: + return nir_ixor(b, src, dst); + case PIPE_LOGICOP_NAND: + return nir_inot(b, nir_iand(b, src, dst)); + case PIPE_LOGICOP_AND: + return nir_iand(b, src, dst); + case PIPE_LOGICOP_EQUIV: + return nir_inot(b, nir_ixor(b, src, dst)); + case PIPE_LOGICOP_NOOP: + return dst; + case PIPE_LOGICOP_OR_INVERTED: + return nir_ior(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_OR_REVERSE: + return nir_ior(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_OR: + return nir_ior(b, src, dst); + case PIPE_LOGICOP_SET: + return nir_imm_int(b, ~0); + default: + fprintf(stderr, "Unknown logic op %d\n", logicop_func); + /* FALLTHROUGH */ + case PIPE_LOGICOP_COPY: + return src; + } +} +/** Emits the float comparison of src0 against src1 selected by a PIPE_FUNC_* value. NEVER yields the constant 0 and ALWAYS the constant ~0; an unknown func is reported on stderr and then handled like NEVER. */ +static nir_ssa_def * +vc4_nir_pipe_compare_func(nir_builder *b, int func, + nir_ssa_def *src0, nir_ssa_def *src1) +{ + switch (func) { + default: + fprintf(stderr, "Unknown compare func %d\n", func); + /* FALLTHROUGH */ + case PIPE_FUNC_NEVER: + return nir_imm_int(b, 0); + case PIPE_FUNC_ALWAYS: + return nir_imm_int(b, ~0); + case PIPE_FUNC_EQUAL: + return nir_feq(b, src0, src1); + case PIPE_FUNC_NOTEQUAL: + return nir_fne(b, src0, src1); + case PIPE_FUNC_GREATER: + return nir_flt(b, src1, src0); + case PIPE_FUNC_GEQUAL: + return nir_fge(b, src0, src1); + case PIPE_FUNC_LESS: + return nir_flt(b, src0, src1); + case PIPE_FUNC_LEQUAL: + return nir_fge(b, src1, src0); + } +} +/** When c->fs_key->alpha_test is set, emits a discard_if on the inverse of comparing alpha against the QUNIFORM_ALPHA_REF uniform with alpha_test_func; otherwise emits nothing. */ +static void 
+vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b, + nir_ssa_def *alpha) +{ + if (!c->fs_key->alpha_test) + return; + + nir_ssa_def *alpha_ref = + vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF); + nir_ssa_def *condition = + vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func, + alpha, alpha_ref); + + nir_intrinsic_instr *discard = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_discard_if); + discard->num_components = 1; + discard->src[0] = nir_src_for_ssa(nir_inot(b, condition)); + nir_builder_instr_insert(b, &discard->instr); +} +/** Rewrites one color store_output in place: loads and unpacks the current TLB color, emits the alpha test, blends (with sRGB decode of dst and encode of the result for sRGB formats), applies the format swizzle, packs to unorm 4x8, applies the logic op and the color mask, and finally makes intr store the single packed value. */ +static void +vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + enum pipe_format color_format = c->fs_key->color_format; + const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); + + /* Pull out the float src/dst color components. */ + nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b); + nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color); + nir_ssa_def *src_color[4], *unpacked_dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false); + unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false); + } + + /* Unswizzle the destination color. */ + nir_ssa_def *dst_color[4]; + for (unsigned i = 0; i < 4; i++) { + dst_color[i] = vc4_nir_get_swizzled_channel(b, + unpacked_dst_color, + format_swiz[i]); + } + + vc4_nir_emit_alpha_test_discard(c, b, src_color[3]); + + /* Turn dst color to linear. 
*/ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]); + } + + nir_ssa_def *blend_color[4]; + vc4_do_blending(c, b, blend_color, src_color, dst_color); + + /* sRGB encode the output color */ + if (util_format_is_srgb(color_format)) { + for (int i = 0; i < 3; i++) + blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]); + } + + nir_ssa_def *swizzled_outputs[4]; + for (int i = 0; i < 4; i++) { + swizzled_outputs[i] = + vc4_nir_get_swizzled_channel(b, blend_color, + format_swiz[i]); + } + + nir_ssa_def *packed_color = + nir_pack_unorm_4x8(b, + nir_vec4(b, + swizzled_outputs[0], + swizzled_outputs[1], + swizzled_outputs[2], + swizzled_outputs[3])); + + packed_color = vc4_logicop(b, c->fs_key->logicop_func, + packed_color, packed_dst_color); + + /* If the bit isn't set in the color mask, then just return the + * original dst color, instead. + */ + uint32_t colormask = 0xffffffff; + for (int i = 0; i < 4; i++) { + if (format_swiz[i] < 4 && + !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { +/* Unsigned constant: 0xff << 24 is not representable in a signed int, so the shift must be done on an unsigned value. */ colormask &= ~(0xffu << (i * 8)); + } + } + packed_color = nir_ior(b, + nir_iand(b, packed_color, + nir_imm_int(b, colormask)), + nir_iand(b, packed_dst_color, + nir_imm_int(b, ~colormask))); + + /* Turn the old vec4 output into a store of the packed color. 
*/ + nir_instr_rewrite_src(&intr->instr, &intr->src[0], + nir_src_for_ssa(packed_color)); + intr->num_components = 1; +} +/** Block callback: for every store_output whose matching output variable has location TGSI_SEMANTIC_COLOR, positions a builder before the store and lowers it with vc4_nir_lower_blend_instr(). Other instructions are left alone. Always returns true. */ +static bool +vc4_nir_lower_blend_block(nir_block *block, void *state) +{ + struct vc4_compile *c = state; + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_variable *output_var = NULL; + foreach_list_typed(nir_variable, var, node, &c->s->outputs) { + if (var->data.driver_location == intr->const_index[0]) { + output_var = var; + break; + } + } + assert(output_var); + unsigned semantic_name = output_var->data.location; + + if (semantic_name != TGSI_SEMANTIC_COLOR) + continue; + + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + nir_builder b; + nir_builder_init(&b, impl); + nir_builder_insert_before_instr(&b, &intr->instr); + vc4_nir_lower_blend_instr(c, &b, intr); + } + return true; +} +/** Runs the blend lowering over every block of every overload in c->s that has an implementation, then marks block-index and dominance metadata as preserved. */ +void +vc4_nir_lower_blend(struct vc4_compile *c) +{ + nir_foreach_overload(c->s, overload) { + if (overload->impl) { + nir_foreach_block(overload->impl, + vc4_nir_lower_blend_block, c); + + nir_metadata_preserve(overload->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } + } +} diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c index ffc120e8865..229d41147d8 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -56,11 +56,14 @@ static void vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, nir_intrinsic_instr *intr) { - /* All TGSI-to-NIR inputs are vec4. */ - assert(intr->num_components == 4); - nir_builder_insert_before_instr(b, &intr->instr); + if (c->stage == QSTAGE_FRAG && intr->const_index[0] == + VC4_NIR_TLB_COLOR_READ_INPUT) { + /* This doesn't need any lowering. 
*/ + return; + } + nir_variable *input_var = NULL; foreach_list_typed(nir_variable, var, node, &c->s->inputs) { if (var->data.driver_location == intr->const_index[0]) { @@ -72,6 +75,9 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, int semantic_name = input_var->data.location; int semantic_index = input_var->data.index; + /* All TGSI-to-NIR inputs are vec4. */ + assert(intr->num_components == 4); + /* Generate scalar loads equivalent to the original VEC4. */ nir_ssa_def *dests[4]; for (unsigned i = 0; i < intr->num_components; i++) { @@ -145,6 +151,12 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, return; } + /* Color output is lowered by vc4_nir_lower_blend(). */ + if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) { + intr->const_index[0] *= 4; + return; + } + /* All TGSI-to-NIR outputs are VEC4. */ assert(intr->num_components == 4); @@ -170,7 +182,11 @@ static void vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, nir_intrinsic_instr *intr) { - /* All TGSI-to-NIR uniform loads are vec4. */ + /* All TGSI-to-NIR uniform loads are vec4, but we may create dword + * loads in our lowering passes. 
+ */ + if (intr->num_components == 1) + return; assert(intr->num_components == 4); nir_builder_insert_before_instr(b, &intr->instr); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index fb1726c0d1e..13c472152d8 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -123,6 +123,26 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, return &intr->dest.ssa; } +/** Picks the value for a UTIL_FORMAT_SWIZZLE_* selector: X, Y, Z and W return srcs[swiz], SWIZZLE_1 returns the constant 1.0, SWIZZLE_0 returns the constant 0.0, and NONE or any other value prints a warning to stderr and returns 0.0. */nir_ssa_def * +vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +{ + switch (swiz) { + default: + case UTIL_FORMAT_SWIZZLE_NONE: + fprintf(stderr, "warning: unknown swizzle\n"); + /* FALLTHROUGH */ + case UTIL_FORMAT_SWIZZLE_0: + return nir_imm_float(b, 0.0); + case UTIL_FORMAT_SWIZZLE_1: + return nir_imm_float(b, 1.0); + case UTIL_FORMAT_SWIZZLE_X: + case UTIL_FORMAT_SWIZZLE_Y: + case UTIL_FORMAT_SWIZZLE_Z: + case UTIL_FORMAT_SWIZZLE_W: + return srcs[swiz]; + } +} + static struct qreg * ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) { @@ -258,22 +278,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb) return qir_SEL_X_Y_NS(c, low, high); } -static struct qreg -qir_srgb_encode(struct vc4_compile *c, struct qreg linear) -{ - struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92)); - struct qreg high = qir_FSUB(c, - qir_FMUL(c, - qir_uniform_f(c, 1.055), - qir_POW(c, - linear, - qir_uniform_f(c, 0.41666))), - qir_uniform_f(c, 0.055)); - - qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308))); - return qir_SEL_X_Y_NS(c, low, high); -} - static struct qreg ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) { @@ -834,6 +838,32 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) return; } + if (instr->op == nir_op_pack_unorm_4x8) { + struct qreg result; + for (int i = 0; i < 4; i++) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[i]); + if (i == 0) + result = qir_PACK_8888_F(c, src); + else + result = 
qir_PACK_8_F(c, result, src, i); + } + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + *dest = result; + return; + } + + if (instr->op == nir_op_unpack_unorm_4x8) { + struct qreg src = ntq_get_src(c, instr->src[0].src, + instr->src[0].swizzle[0]); + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + for (int i = 0; i < 4; i++) { + if (instr->dest.write_mask & (1 << i)) + dest[i] = qir_UNPACK_8_F(c, src, i); + } + return; + } + /* General case: We can just grab the one used channel per src. */ struct qreg src[nir_op_infos[instr->op].num_inputs]; for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { @@ -1036,161 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) } } -static struct qreg -vc4_blend_channel(struct vc4_compile *c, - struct qreg *dst, - struct qreg *src, - struct qreg val, - unsigned factor, - int channel) -{ - switch(factor) { - case PIPE_BLENDFACTOR_ONE: - return val; - case PIPE_BLENDFACTOR_SRC_COLOR: - return qir_FMUL(c, val, src[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA: - return qir_FMUL(c, val, src[3]); - case PIPE_BLENDFACTOR_DST_ALPHA: - return qir_FMUL(c, val, dst[3]); - case PIPE_BLENDFACTOR_DST_COLOR: - return qir_FMUL(c, val, dst[channel]); - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - if (channel != 3) { - return qir_FMUL(c, - val, - qir_FMIN(c, - src[3], - qir_FSUB(c, - qir_uniform_f(c, 1.0), - dst[3]))); - } else { - return val; - } - case PIPE_BLENDFACTOR_CONST_COLOR: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, - channel)); - case PIPE_BLENDFACTOR_CONST_ALPHA: - return qir_FMUL(c, val, - qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3)); - case PIPE_BLENDFACTOR_ZERO: - return qir_uniform_f(c, 0.0); - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[channel])); - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - src[3])); - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - 
return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[3])); - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0), - dst[channel])); - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - channel))); - case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return qir_FMUL(c, val, - qir_FSUB(c, qir_uniform_f(c, 1.0), - qir_uniform(c, - QUNIFORM_BLEND_CONST_COLOR, - 3))); - - default: - case PIPE_BLENDFACTOR_SRC1_COLOR: - case PIPE_BLENDFACTOR_SRC1_ALPHA: - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - /* Unsupported. */ - fprintf(stderr, "Unknown blend factor %d\n", factor); - return val; - } -} - -static struct qreg -vc4_blend_func(struct vc4_compile *c, - struct qreg src, struct qreg dst, - unsigned func) -{ - switch (func) { - case PIPE_BLEND_ADD: - return qir_FADD(c, src, dst); - case PIPE_BLEND_SUBTRACT: - return qir_FSUB(c, src, dst); - case PIPE_BLEND_REVERSE_SUBTRACT: - return qir_FSUB(c, dst, src); - case PIPE_BLEND_MIN: - return qir_FMIN(c, src, dst); - case PIPE_BLEND_MAX: - return qir_FMAX(c, src, dst); - - default: - /* Unsupported. */ - fprintf(stderr, "Unknown blend func %d\n", func); - return src; - - } -} - -/** - * Implements fixed function blending in shader code. - * - * VC4 doesn't have any hardware support for blending. Instead, you read the - * current contents of the destination from the tile buffer after having - * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do - * math using your output color and that destination value, and update the - * output color appropriately. 
- */ -static void -vc4_blend(struct vc4_compile *c, struct qreg *result, - struct qreg *dst_color, struct qreg *src_color) -{ - struct pipe_rt_blend_state *blend = &c->fs_key->blend; - - if (!blend->blend_enable) { - for (int i = 0; i < 4; i++) - result[i] = src_color[i]; - return; - } - - for (int i = 0; i < 4; i++) - src_color[i] = qir_SAT(c, src_color[i]); - - struct qreg src_blend[4], dst_blend[4]; - for (int i = 0; i < 3; i++) { - src_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - src_color[i], - blend->rgb_src_factor, i); - dst_blend[i] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[i], - blend->rgb_dst_factor, i); - } - src_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - src_color[3], - blend->alpha_src_factor, 3); - dst_blend[3] = vc4_blend_channel(c, - dst_color, src_color, - dst_color[3], - blend->alpha_dst_factor, 3); - - for (int i = 0; i < 3; i++) { - result[i] = vc4_blend_func(c, - src_blend[i], dst_blend[i], - blend->rgb_func); - } - result[3] = vc4_blend_func(c, - src_blend[3], dst_blend[3], - blend->alpha_func); -} - static void clip_distance_discard(struct vc4_compile *c) { @@ -1213,217 +1088,17 @@ clip_distance_discard(struct vc4_compile *c) } } -static void -alpha_test_discard(struct vc4_compile *c) -{ - struct qreg src_alpha; - struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0); - - if (!c->fs_key->alpha_test) - return; - - if (c->output_color_index != -1) - src_alpha = c->outputs[c->output_color_index + 3]; - else - src_alpha = qir_uniform_f(c, 1.0); - - if (c->discard.file == QFILE_NULL) - c->discard = qir_uniform_ui(c, 0); - - switch (c->fs_key->alpha_test_func) { - case PIPE_FUNC_NEVER: - c->discard = qir_uniform_ui(c, ~0); - break; - case PIPE_FUNC_ALWAYS: - break; - case PIPE_FUNC_EQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_ZS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_NOTEQUAL: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - 
c->discard = qir_SEL_X_Y_ZC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GREATER: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_GEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LESS: - qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref)); - c->discard = qir_SEL_X_Y_NS(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - case PIPE_FUNC_LEQUAL: - qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha)); - c->discard = qir_SEL_X_Y_NC(c, c->discard, - qir_uniform_ui(c, ~0)); - break; - } -} - -static struct qreg -vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst) -{ - switch (c->fs_key->logicop_func) { - case PIPE_LOGICOP_CLEAR: - return qir_uniform_f(c, 0.0); - case PIPE_LOGICOP_NOR: - return qir_NOT(c, qir_OR(c, src, dst)); - case PIPE_LOGICOP_AND_INVERTED: - return qir_AND(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_COPY_INVERTED: - return qir_NOT(c, src); - case PIPE_LOGICOP_AND_REVERSE: - return qir_AND(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_INVERT: - return qir_NOT(c, dst); - case PIPE_LOGICOP_XOR: - return qir_XOR(c, src, dst); - case PIPE_LOGICOP_NAND: - return qir_NOT(c, qir_AND(c, src, dst)); - case PIPE_LOGICOP_AND: - return qir_AND(c, src, dst); - case PIPE_LOGICOP_EQUIV: - return qir_NOT(c, qir_XOR(c, src, dst)); - case PIPE_LOGICOP_NOOP: - return dst; - case PIPE_LOGICOP_OR_INVERTED: - return qir_OR(c, qir_NOT(c, src), dst); - case PIPE_LOGICOP_OR_REVERSE: - return qir_OR(c, src, qir_NOT(c, dst)); - case PIPE_LOGICOP_OR: - return qir_OR(c, src, dst); - case PIPE_LOGICOP_SET: - return qir_uniform_ui(c, ~0); - case PIPE_LOGICOP_COPY: - default: - return src; - } -} - -/** - * Applies the GL blending pipeline and returns the packed (8888) output - * color. 
- */ -static struct qreg -blend_pipeline(struct vc4_compile *c) -{ - enum pipe_format color_format = c->fs_key->color_format; - const uint8_t *format_swiz = vc4_get_format_swizzle(color_format); - struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef }; - struct qreg packed_dst_color = c->undef; - - if (c->fs_key->blend.blend_enable || - c->fs_key->blend.colormask != 0xf || - c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - packed_dst_color = qir_TLB_COLOR_READ(c); - for (int i = 0; i < 4; i++) - tlb_read_color[i] = qir_UNPACK_8_F(c, - packed_dst_color, i); - for (int i = 0; i < 4; i++) { - dst_color[i] = get_swizzled_channel(c, - tlb_read_color, - format_swiz[i]); - if (util_format_is_srgb(color_format) && i != 3) { - linear_dst_color[i] = - qir_srgb_decode(c, dst_color[i]); - } else { - linear_dst_color[i] = dst_color[i]; - } - } - } - - struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef }; - const struct qreg *output_colors = (c->output_color_index != -1 ? - c->outputs + c->output_color_index : - undef_array); - struct qreg blend_src_color[4]; - for (int i = 0; i < 4; i++) - blend_src_color[i] = output_colors[i]; - - struct qreg blend_color[4]; - vc4_blend(c, blend_color, linear_dst_color, blend_src_color); - - if (util_format_is_srgb(color_format)) { - for (int i = 0; i < 3; i++) - blend_color[i] = qir_srgb_encode(c, blend_color[i]); - } - - /* Debug: Sometimes you're getting a black output and just want to see - * if the FS is getting executed at all. Spam magenta into the color - * output. 
- */ - if (0) { - blend_color[0] = qir_uniform_f(c, 1.0); - blend_color[1] = qir_uniform_f(c, 0.0); - blend_color[2] = qir_uniform_f(c, 1.0); - blend_color[3] = qir_uniform_f(c, 0.5); - } - - struct qreg swizzled_outputs[4]; - for (int i = 0; i < 4; i++) { - swizzled_outputs[i] = get_swizzled_channel(c, blend_color, - format_swiz[i]); - } - - struct qreg packed_color = c->undef; - for (int i = 0; i < 4; i++) { - if (swizzled_outputs[i].file == QFILE_NULL) - continue; - if (packed_color.file == QFILE_NULL) { - packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]); - } else { - packed_color = qir_PACK_8_F(c, - packed_color, - swizzled_outputs[i], - i); - } - } - - if (packed_color.file == QFILE_NULL) - packed_color = qir_uniform_ui(c, 0); - - if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { - packed_color = vc4_logicop(c, packed_color, packed_dst_color); - } - - /* If the bit isn't set in the color mask, then just return the - * original dst color, instead. - */ - uint32_t colormask = 0xffffffff; - for (int i = 0; i < 4; i++) { - if (format_swiz[i] < 4 && - !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) { - colormask &= ~(0xff << (i * 8)); - } - } - if (colormask != 0xffffffff) { - packed_color = qir_OR(c, - qir_AND(c, packed_color, - qir_uniform_ui(c, colormask)), - qir_AND(c, packed_dst_color, - qir_uniform_ui(c, ~colormask))); - } - - return packed_color; -} - static void emit_frag_end(struct vc4_compile *c) { clip_distance_discard(c); - alpha_test_discard(c); - struct qreg color = blend_pipeline(c); + + struct qreg color; + if (c->output_color_index != -1) { + color = c->outputs[c->output_color_index]; + } else { + color = qir_uniform_ui(c, 0); + } if (c->discard.file != QFILE_NULL) qir_TLB_DISCARD_SETUP(c, c->discard); @@ -1839,8 +1514,11 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) case nir_intrinsic_load_input: assert(instr->num_components == 1); - *dest = c->inputs[instr->const_index[0]]; - + if (instr->const_index[0] 
== VC4_NIR_TLB_COLOR_READ_INPUT) { + *dest = qir_TLB_COLOR_READ(c); + } else { + *dest = c->inputs[instr->const_index[0]]; + } break; case nir_intrinsic_store_output: @@ -2052,6 +1730,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, c->s = tgsi_to_nir(tokens, &nir_options); nir_opt_global_to_local(c->s); nir_convert_to_ssa(c->s); + if (stage == QSTAGE_FRAG) + vc4_nir_lower_blend(c); vc4_nir_lower_io(c); nir_lower_idiv(c->s); nir_lower_load_const_to_scalar(c->s); diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 57e25de1b94..cade795c12a 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -39,6 +39,8 @@ #include "vc4_screen.h" #include "pipe/p_state.h" +struct nir_builder; + enum qfile { QFILE_NULL, QFILE_TEMP, @@ -242,7 +244,11 @@ enum quniform_contents { QUNIFORM_TEXTURE_BORDER_COLOR, - QUNIFORM_BLEND_CONST_COLOR, + QUNIFORM_BLEND_CONST_COLOR_X, + QUNIFORM_BLEND_CONST_COLOR_Y, + QUNIFORM_BLEND_CONST_COLOR_Z, + QUNIFORM_BLEND_CONST_COLOR_W, + QUNIFORM_STENCIL, QUNIFORM_ALPHA_REF, @@ -414,6 +420,11 @@ struct vc4_compile { uint32_t variant_id; }; +/* Special nir_load_input intrinsic index for loading the current TLB + * destination color. + */ +#define VC4_NIR_TLB_COLOR_READ_INPUT 2000000000 + /* Special offset for nir_load_uniform values to get a QUNIFORM_* * state-dependent value. 
*/ @@ -458,9 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); bool qir_opt_vpm_writes(struct vc4_compile *c); +void vc4_nir_lower_blend(struct vc4_compile *c); void vc4_nir_lower_io(struct vc4_compile *c); nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, enum quniform_contents contents); +nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b, + nir_ssa_def **srcs, int swiz); void qir_lower_uniforms(struct vc4_compile *c); void qpu_schedule_instructions(struct vc4_compile *c); diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c index 3bf6672a88a..85d6998205e 100644 --- a/src/gallium/drivers/vc4/vc4_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -257,9 +257,14 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, uinfo->data[i])); break; - case QUNIFORM_BLEND_CONST_COLOR: + case QUNIFORM_BLEND_CONST_COLOR_X: + case QUNIFORM_BLEND_CONST_COLOR_Y: + case QUNIFORM_BLEND_CONST_COLOR_Z: + case QUNIFORM_BLEND_CONST_COLOR_W: cl_aligned_f(&uniforms, - CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1)); + CLAMP(vc4->blend_color.color[uinfo->contents[i] - + QUNIFORM_BLEND_CONST_COLOR_X], + 0, 1)); break; case QUNIFORM_STENCIL: @@ -321,7 +326,10 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader) dirty |= VC4_DIRTY_TEXSTATE; break; - case QUNIFORM_BLEND_CONST_COLOR: + case QUNIFORM_BLEND_CONST_COLOR_X: + case QUNIFORM_BLEND_CONST_COLOR_Y: + case QUNIFORM_BLEND_CONST_COLOR_Z: + case QUNIFORM_BLEND_CONST_COLOR_W: dirty |= VC4_DIRTY_BLEND_COLOR; break; -- 2.30.2