--- /dev/null
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Generate code to perform all per-fragment operations.
+ *
+ * Code generated by these functions performs alpha, depth, and stencil
+ * testing as well as alpha blending.
+ *
+ * \note
+ * Occlusion query is not supported, but this is the right place to add that
+ * support.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "cell_context.h"
+
+#include "rtasm/rtasm_ppc_spe.h"
+
+
+/**
+ * Generate code to perform alpha testing.
+ *
+ * The code generated by this function uses the register specified by
+ * \c mask as both an input and an output.
+ *
+ * \param dsa Current alpha-test state
+ * \param f Function to which code should be appended
+ * \param mask Index of register containing active fragment mask
+ * \param alphas Index of register containing per-fragment alpha values
+ *
+ * \note Emits a maximum of 6 instructions.
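+ *
+ * For example, with \c PIPE_FUNC_GREATER the emitted code effectively
+ * computes, per fragment:  mask &= (alpha > ref) ? ~0 : 0.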
+ */
+static void
+emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
+ struct spe_function *f, int mask, int alphas)
+{
+ /* If the alpha function is either NEVER or ALWAYS, there is no need to
+ * load the reference value into a register. ALWAYS is a fairly common
+ * case, and this optimization saves 2 instructions.
+ */
+ if (dsa->alpha.enabled
+ && (dsa->alpha.func != PIPE_FUNC_NEVER)
+ && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+ int ref = spe_allocate_available_register(f);
+ int tmp_a = spe_allocate_available_register(f);
+ int tmp_b = spe_allocate_available_register(f);
+ union {
+ float f;
+ unsigned u;
+ } ref_val;
+ boolean complement = FALSE;
+
+ ref_val.f = dsa->alpha.ref;
+
+      /* Load the 32-bit constant: ilhu sets the upper halfword of each
+       * word (zeroing the lower halfword), then iohl ORs in the lower
+       * halfword.
+       */
+      spe_ilhu(f, ref, ref_val.u >> 16);
+      spe_iohl(f, ref, ref_val.u & 0x0000ffff);
+
+ switch (dsa->alpha.func) {
+ case PIPE_FUNC_NOTEQUAL:
+ complement = TRUE;
+ /* FALLTHROUGH */
+
+ case PIPE_FUNC_EQUAL:
+ spe_fceq(f, tmp_a, ref, alphas);
+ break;
+
+ case PIPE_FUNC_LEQUAL:
+ complement = TRUE;
+ /* FALLTHROUGH */
+
+ case PIPE_FUNC_GREATER:
+      spe_fcgt(f, tmp_a, alphas, ref);
+ break;
+
+ case PIPE_FUNC_LESS:
+ complement = TRUE;
+ /* FALLTHROUGH */
+
+ case PIPE_FUNC_GEQUAL:
+      spe_fcgt(f, tmp_a, alphas, ref);
+ spe_fceq(f, tmp_b, ref, alphas);
+ spe_or(f, tmp_a, tmp_b, tmp_a);
+ break;
+
+ case PIPE_FUNC_ALWAYS:
+ case PIPE_FUNC_NEVER:
+ default:
+ assert(0);
+ break;
+ }
+
+ if (complement) {
+ spe_andc(f, mask, mask, tmp_a);
+ } else {
+ spe_and(f, mask, mask, tmp_a);
+ }
+
+ spe_release_register(f, ref);
+ spe_release_register(f, tmp_a);
+ spe_release_register(f, tmp_b);
+ } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) {
+ spe_il(f, mask, 0);
+ }
+}
+
+
+/**
+ * \param dsa Current depth-test state
+ * \param f Function to which code should be appended
+ * \param mask Index of register to contain depth-pass mask
+ * \param stored Index of register containing values from depth buffer
+ * \param calculated Index of register containing per-fragment depth values
+ *
+ * \return
+ * If the calculated depth comparison mask is the actual mask, \c FALSE is
+ * returned.  If the calculated depth comparison mask is the complement of
+ * the actual mask, \c TRUE is returned.
+ *
+ * \note Emits a maximum of 3 instructions.
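+ *
+ * \note
+ * Only greater-than and equality comparisons are emitted.  The remaining
+ * comparison functions are derived by complementing: for example,
+ * \c PIPE_FUNC_LESS emits the \c PIPE_FUNC_GEQUAL sequence and returns
+ * \c TRUE so that the caller combines the mask with \c andc instead of
+ * \c and.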
+ */
+static boolean
+emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
+ struct spe_function *f, int mask, int stored, int calculated)
+{
+ unsigned func = (dsa->depth.enabled)
+ ? dsa->depth.func : PIPE_FUNC_ALWAYS;
+ int tmp = spe_allocate_available_register(f);
+   boolean complement = FALSE;
+
+ switch (func) {
+ case PIPE_FUNC_NEVER:
+ spe_il(f, mask, 0);
+ break;
+
+ case PIPE_FUNC_NOTEQUAL:
+      complement = TRUE;
+ /* FALLTHROUGH */
+ case PIPE_FUNC_EQUAL:
+ spe_ceq(f, mask, calculated, stored);
+ break;
+
+ case PIPE_FUNC_LEQUAL:
+      complement = TRUE;
+ /* FALLTHROUGH */
+ case PIPE_FUNC_GREATER:
+ spe_clgt(f, mask, calculated, stored);
+ break;
+
+ case PIPE_FUNC_LESS:
+      complement = TRUE;
+ /* FALLTHROUGH */
+ case PIPE_FUNC_GEQUAL:
+ spe_clgt(f, mask, calculated, stored);
+ spe_ceq(f, tmp, calculated, stored);
+ spe_or(f, mask, mask, tmp);
+ break;
+
+ case PIPE_FUNC_ALWAYS:
+ spe_il(f, mask, ~0);
+ break;
+
+ default:
+ assert(0);
+ break;
+ }
+
+ spe_release_register(f, tmp);
+   return complement;
+}
+
+
+/**
+ * \note Emits a maximum of 5 instructions.
+ */
+static void
+emit_stencil_op(struct spe_function *f,
+ int out, int in, int mask, unsigned op, unsigned ref)
+{
+ const int clamp = spe_allocate_available_register(f);
+ const int tmp = spe_allocate_available_register(f);
+
+ switch(op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* Callers never emit code for KEEP, so this should be unreachable;
+       * fall through to ZERO in case assertions are disabled.
+       */
+      assert(0);
+      /* FALLTHROUGH */
+ case PIPE_STENCIL_OP_ZERO:
+ spe_il(f, out, 0);
+ break;
+ case PIPE_STENCIL_OP_REPLACE:
+ spe_il(f, out, ref);
+ break;
+ case PIPE_STENCIL_OP_INCR:
+ spe_il(f, clamp, 0x0ff);
+ spe_ai(f, out, in, 1);
+      spe_cgt(f, tmp, out, clamp);        /* tmp = (in + 1) > 0xff */
+      spe_selb(f, out, out, clamp, tmp);  /* clamp the result to 0xff */
+ break;
+ case PIPE_STENCIL_OP_DECR:
+ spe_il(f, clamp, 0);
+ spe_ai(f, out, in, -1);
+      spe_cgt(f, tmp, out, clamp);        /* tmp = (in - 1) > 0 */
+      spe_selb(f, out, clamp, out, tmp);  /* clamp the result to 0 */
+ break;
+ case PIPE_STENCIL_OP_INCR_WRAP:
+ spe_ai(f, out, in, 1);
+ break;
+ case PIPE_STENCIL_OP_DECR_WRAP:
+ spe_ai(f, out, in, -1);
+ break;
+ case PIPE_STENCIL_OP_INVERT:
+ spe_nor(f, out, in, in);
+ break;
+ default:
+ assert(0);
+ }
+
+ spe_release_register(f, tmp);
+ spe_release_register(f, clamp);
+
+   spe_selb(f, out, in, out, mask);  /* update only fragments selected by mask */
+}
+
+
+/**
+ * \param dsa Depth / stencil test state
+ * \param face 0 for front face, 1 for back face
+ * \param f Function to append instructions to
+ * \param mask Register containing mask of fragments passing the
+ * alpha test
+ * \param depth_mask Register containing mask of fragments passing the
+ * depth test
+ * \param depth_complement Is \c depth_mask the complement of the actual mask?
+ * \param stencil Register containing values from stencil buffer
+ * \param depth_pass Register to store mask of fragments passing stencil test
+ * and depth test
+ *
+ * \note
+ * Emits a maximum of 10 + (3 * 5) = 25 instructions.
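+ *
+ * \return
+ * Index of the register containing the stencil values to be written back.
+ * If the stencil values were not modified, this is the \c stencil input
+ * register.
+ *
+ * In pseudocode, the fragment masks computed below are:
+ *
+ *    stencil_pass = mask & stencil_test(stencil)
+ *    depth_pass   = stencil_pass & depth_mask
+ *    stencil_fail = mask & ~stencil_test(stencil)
+ *    depth_fail   = stencil_pass & ~depth_mask
+ *
+ * (with \c depth_mask complemented when \c depth_complement is set).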
+ */
+static int
+emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
+ unsigned face,
+ struct spe_function *f,
+ int mask,
+ int depth_mask,
+ boolean depth_complement,
+ int stencil,
+ int depth_pass)
+{
+ int stencil_fail = spe_allocate_available_register(f);
+ int depth_fail = spe_allocate_available_register(f);
+ int stencil_mask = spe_allocate_available_register(f);
+ int stencil_pass = spe_allocate_available_register(f);
+ int face_stencil = spe_allocate_available_register(f);
+ int stencil_src = stencil;
+ const unsigned ref = (dsa->stencil[face].ref_value
+ & dsa->stencil[face].value_mask);
+ boolean complement = FALSE;
+   int stored = stencil;
+ int tmp = spe_allocate_available_register(f);
+
+
+   /* If the stored stencil value is ANDed with a mask before the
+    * comparison, mask it into a freshly allocated register.  Otherwise
+    * "stored" aliases the stencil input and is compared directly.
+    */
+   if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+       && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+       && (dsa->stencil[face].value_mask != 0x0ff)) {
+      stored = spe_allocate_available_register(f);
+      spe_andi(f, stored, stencil, dsa->stencil[face].value_mask);
+   }
+
+
+ switch (dsa->stencil[face].func) {
+ case PIPE_FUNC_NEVER:
+ spe_il(f, stencil_mask, 0);
+ break;
+
+ case PIPE_FUNC_NOTEQUAL:
+ complement = TRUE;
+ /* FALLTHROUGH */
+ case PIPE_FUNC_EQUAL:
+ spe_ceqi(f, stencil_mask, stored, ref);
+ break;
+
+ case PIPE_FUNC_LEQUAL:
+ complement = TRUE;
+ /* FALLTHROUGH */
+ case PIPE_FUNC_GREATER:
+ spe_clgti(f, stencil_mask, stored, ref);
+ break;
+
+ case PIPE_FUNC_LESS:
+ complement = TRUE;
+ /* FALLTHROUGH */
+ case PIPE_FUNC_GEQUAL:
+ spe_clgti(f, stencil_mask, stored, ref);
+ spe_ceqi(f, tmp, stored, ref);
+ spe_or(f, stencil_mask, stencil_mask, tmp);
+ break;
+
+ case PIPE_FUNC_ALWAYS:
+ /* See comment below. */
+ break;
+
+ default:
+ assert(0);
+ break;
+ }
+
+   if (stored != stencil) {
+      spe_release_register(f, stored);
+   }
+   spe_release_register(f, tmp);
+
+
+ /* ALWAYS is a very common stencil-test, so some effort is applied to
+ * optimize that case. The stencil-pass mask is the same as the input
+ * fragment mask. This makes the stencil-test (above) a no-op, and the
+    * input fragment mask can be "renamed" as the stencil-pass mask.
+ */
+ if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) {
+ spe_release_register(f, stencil_pass);
+ stencil_pass = mask;
+ } else {
+ if (complement) {
+ spe_andc(f, stencil_pass, mask, stencil_mask);
+ } else {
+ spe_and(f, stencil_pass, mask, stencil_mask);
+ }
+ }
+
+ if (depth_complement) {
+ spe_andc(f, depth_pass, stencil_pass, depth_mask);
+ } else {
+ spe_and(f, depth_pass, stencil_pass, depth_mask);
+ }
+
+
+ /* Conditionally emit code to update the stencil value under various
+    * conditions. Note that there is no need to generate code under the
+ * following circumstances:
+ *
+ * - Stencil write mask is zero.
+ * - For stencil-fail if the stencil test is ALWAYS
+ * - For depth-fail if the stencil test is NEVER
+ * - For depth-pass if the stencil test is NEVER
+ * - Any of the 3 conditions if the operation is KEEP
+ */
+ if (dsa->stencil[face].write_mask != 0) {
+ if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+ && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) {
+ if (complement) {
+ spe_and(f, stencil_fail, mask, stencil_mask);
+ } else {
+ spe_andc(f, stencil_fail, mask, stencil_mask);
+ }
+
+ emit_stencil_op(f, face_stencil, stencil_src, stencil_fail,
+ dsa->stencil[face].fail_op,
+ dsa->stencil[face].ref_value);
+
+ stencil_src = face_stencil;
+ }
+
+ if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+ && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) {
+ if (depth_complement) {
+ spe_and(f, depth_fail, stencil_pass, depth_mask);
+ } else {
+ spe_andc(f, depth_fail, stencil_pass, depth_mask);
+ }
+
+ emit_stencil_op(f, face_stencil, stencil_src, depth_fail,
+ dsa->stencil[face].zfail_op,
+ dsa->stencil[face].ref_value);
+ stencil_src = face_stencil;
+ }
+
+ if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+ && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) {
+ emit_stencil_op(f, face_stencil, stencil_src, depth_pass,
+ dsa->stencil[face].zpass_op,
+ dsa->stencil[face].ref_value);
+ stencil_src = face_stencil;
+ }
+ }
+
+ spe_release_register(f, stencil_fail);
+ spe_release_register(f, depth_fail);
+ spe_release_register(f, stencil_mask);
+ if (stencil_pass != mask) {
+ spe_release_register(f, stencil_pass);
+ }
+
+ /* If all of the stencil operations were KEEP or the stencil write mask was
+ * zero, "stencil_src" will still be set to "stencil". In this case
+ * release the "face_stencil" register. Otherwise apply the stencil write
+ * mask to select bits from the calculated stencil value and the previous
+ * stencil value.
+ */
+ if (stencil_src == stencil) {
+ spe_release_register(f, face_stencil);
+ } else if (dsa->stencil[face].write_mask != 0x0ff) {
+ int tmp = spe_allocate_available_register(f);
+
+ spe_il(f, tmp, dsa->stencil[face].write_mask);
+ spe_selb(f, stencil_src, stencil, stencil_src, tmp);
+
+ spe_release_register(f, tmp);
+ }
+
+ return stencil_src;
+}
+
+
+void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
+{
+ struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base;
+ struct spe_function *const f = &cdsa->code;
+
+ /* This code generates a maximum of 6 (alpha test) + 3 (depth test)
+ * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions. Round
+ * up to 64 to make it a happy power-of-two.
+ */
+ spe_init_func(f, 4 * 64);
+
+
+ /* Allocate registers for the function's input parameters. Cleverly (and
+ * clever code is usually dangerous, but I couldn't resist) the generated
+ * function returns a structure. Returned structures start with register
+ * 3, and the structure fields are ordered to match up exactly with the
+ * input parameters.
+ */
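+   /* A sketch of the generated function's signature as seen from C (the
+    * struct and all names here are illustrative only, not part of the
+    * driver):
+    *
+    *    struct ds_result {
+    *       qword mask, depth, stencil, zvals, frag_a, facing;
+    *    };
+    *    struct ds_result func(qword mask, qword depth, qword stencil,
+    *                          qword zvals, qword frag_a, qword facing);
+    *
+    * Both the returned structure and the parameter list begin at register
+    * 3, so each result field comes back in the same register its matching
+    * parameter arrived in.
+    */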
+ int mask = spe_allocate_register(f, 3);
+ int depth = spe_allocate_register(f, 4);
+ int stencil = spe_allocate_register(f, 5);
+ int zvals = spe_allocate_register(f, 6);
+ int frag_a = spe_allocate_register(f, 7);
+ int facing = spe_allocate_register(f, 8);
+
+ int depth_mask = spe_allocate_available_register(f);
+
+ boolean depth_complement;
+
+
+ emit_alpha_test(dsa, f, mask, frag_a);
+
+ depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals);
+
+ if (dsa->stencil[0].enabled) {
+ const int front_depth_pass = spe_allocate_available_register(f);
+ int front_stencil = emit_stencil_test(dsa, 0, f, mask,
+ depth_mask, depth_complement,
+ stencil, front_depth_pass);
+
+ if (dsa->stencil[1].enabled) {
+ const int back_depth_pass = spe_allocate_available_register(f);
+ int back_stencil = emit_stencil_test(dsa, 1, f, mask,
+ depth_mask, depth_complement,
+ stencil, back_depth_pass);
+
+ /* If the front facing stencil value and the back facing stencil
+ * value are stored in the same register, there is no need to select
+ * a value based on the facing. This can happen if the stencil value
+ * was not modified due to the write masks being zero, the stencil
+ * operations being KEEP, etc.
+ */
+ if (front_stencil != back_stencil) {
+ spe_selb(f, stencil, back_stencil, front_stencil, facing);
+ }
+
+ if (back_stencil != stencil) {
+ spe_release_register(f, back_stencil);
+ }
+
+ if (front_stencil != stencil) {
+ spe_release_register(f, front_stencil);
+ }
+
+ spe_selb(f, mask, back_depth_pass, front_depth_pass, facing);
+
+ spe_release_register(f, back_depth_pass);
+   } else {
+      if (front_stencil != stencil) {
+         /* Register-to-register move: stencil = front_stencil. */
+         spe_or(f, stencil, front_stencil, front_stencil);
+         spe_release_register(f, front_stencil);
+      }
+
+      /* With only one stencil side enabled, the front depth-pass mask is
+       * the final fragment mask.
+       */
+      spe_or(f, mask, front_depth_pass, front_depth_pass);
+   }
+
+ spe_release_register(f, front_depth_pass);
+ } else if (dsa->depth.enabled) {
+ if (depth_complement) {
+ spe_andc(f, mask, mask, depth_mask);
+ } else {
+ spe_and(f, mask, mask, depth_mask);
+ }
+ }
+
+ if (dsa->depth.writemask) {
+ spe_selb(f, depth, depth, zvals, mask);
+ }
+
+ spe_bi(f, 0, 0, 0);
+}
+
+
+/**
+ * \note Emits a maximum of 3 instructions
+ */
+static int
+emit_alpha_factor_calculation(struct spe_function *f,
+ unsigned factor, float const_alpha,
+ int src_alpha, int dst_alpha)
+{
+ union {
+ float f;
+ unsigned u;
+ } alpha;
+ int factor_reg;
+ int tmp;
+
+
+ alpha.f = const_alpha;
+
+ switch (factor) {
+ case PIPE_BLENDFACTOR_ONE:
+ factor_reg = -1;
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA:
+ factor_reg = spe_allocate_available_register(f);
+
+ spe_or(f, factor_reg, src_alpha, src_alpha);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ factor_reg = dst_alpha;
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ factor_reg = -1;
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      alpha.f = 1.0 - const_alpha;
+ /* FALLTHROUGH */
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ factor_reg = spe_allocate_available_register(f);
+
+      spe_ilhu(f, factor_reg, alpha.u >> 16);
+      spe_iohl(f, factor_reg, alpha.u & 0x0ffff);
+ break;
+
+ case PIPE_BLENDFACTOR_ZERO:
+ factor_reg = -1;
+ break;
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ tmp = spe_allocate_available_register(f);
+ factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);               /* tmp = 1.0f */
+      spe_fs(f, factor_reg, tmp, src_alpha);   /* factor = 1.0 - As */
+
+ spe_release_register(f, tmp);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ tmp = spe_allocate_available_register(f);
+ factor_reg = spe_allocate_available_register(f);
+
+ spe_il(f, tmp, 1);
+ spe_cuflt(f, tmp, tmp, 0);
+ spe_fs(f, factor_reg, tmp, dst_alpha);
+
+ spe_release_register(f, tmp);
+ break;
+
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+ default:
+ assert(0);
+ factor_reg = -1;
+ break;
+ }
+
+ return factor_reg;
+}
+
+
+/**
+ * \note Emits a maximum of 5 instructions
+ */
+static void
+emit_color_factor_calculation(struct spe_function *f,
+ unsigned sF, unsigned mask,
+ const struct pipe_blend_color *blend_color,
+ const int *src,
+ const int *dst,
+ int *factor)
+{
+ union {
+ float f[4];
+ unsigned u[4];
+ } color;
+ int tmp;
+ unsigned i;
+
+
+ color.f[0] = blend_color->color[0];
+ color.f[1] = blend_color->color[1];
+ color.f[2] = blend_color->color[2];
+ color.f[3] = blend_color->color[3];
+
+ factor[0] = -1;
+ factor[1] = -1;
+ factor[2] = -1;
+ factor[3] = -1;
+
+ switch (sF) {
+ case PIPE_BLENDFACTOR_ONE:
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_COLOR:
+ for (i = 0; i < 3; ++i) {
+ if ((mask & (1U << i)) != 0) {
+ factor[i] = spe_allocate_available_register(f);
+ spe_or(f, factor[i], src[i], src[i]);
+ }
+ }
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA:
+ factor[0] = spe_allocate_available_register(f);
+ factor[1] = factor[0];
+ factor[2] = factor[0];
+
+ spe_or(f, factor[0], src[3], src[3]);
+ break;
+
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ factor[0] = dst[3];
+ factor[1] = dst[3];
+ factor[2] = dst[3];
+ break;
+
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ factor[0] = dst[0];
+ factor[1] = dst[1];
+ factor[2] = dst[2];
+ break;
+
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ tmp = spe_allocate_available_register(f);
+ factor[0] = spe_allocate_available_register(f);
+ factor[1] = factor[0];
+ factor[2] = factor[0];
+
+ /* Alpha saturate means min(As, 1-Ad).
+ */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);                       /* tmp = 1.0f */
+      spe_fs(f, tmp, tmp, dst[3]);                     /* tmp = 1.0 - Ad */
+      spe_fcgt(f, factor[0], tmp, src[3]);             /* (1.0 - Ad) > As ? */
+      spe_selb(f, factor[0], tmp, src[3], factor[0]);  /* min(As, 1.0 - Ad) */
+
+ spe_release_register(f, tmp);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ color.f[0] = 1.0 - color.f[0];
+ color.f[1] = 1.0 - color.f[1];
+ color.f[2] = 1.0 - color.f[2];
+ /* FALLTHROUGH */
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ for (i = 0; i < 3; i++) {
+ factor[i] = spe_allocate_available_register(f);
+
+         spe_ilhu(f, factor[i], color.u[i] >> 16);
+         spe_iohl(f, factor[i], color.u[i] & 0x0ffff);
+ }
+ break;
+
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ color.f[3] = 1.0 - color.f[3];
+ /* FALLTHROUGH */
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ factor[0] = spe_allocate_available_register(f);
+ factor[1] = factor[0];
+ factor[2] = factor[0];
+
+      spe_ilhu(f, factor[0], color.u[3] >> 16);
+      spe_iohl(f, factor[0], color.u[3] & 0x0ffff);
+ break;
+
+ case PIPE_BLENDFACTOR_ZERO:
+ break;
+
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ tmp = spe_allocate_available_register(f);
+
+ spe_il(f, tmp, 1);
+ spe_cuflt(f, tmp, tmp, 0);
+
+ for (i = 0; i < 3; ++i) {
+ if ((mask & (1U << i)) != 0) {
+ factor[i] = spe_allocate_available_register(f);
+ spe_fs(f, factor[i], tmp, src[i]);
+ }
+ }
+
+ spe_release_register(f, tmp);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ tmp = spe_allocate_available_register(f);
+ factor[0] = spe_allocate_available_register(f);
+ factor[1] = factor[0];
+ factor[2] = factor[0];
+
+ spe_il(f, tmp, 1);
+ spe_cuflt(f, tmp, tmp, 0);
+ spe_fs(f, factor[0], tmp, src[3]);
+
+ spe_release_register(f, tmp);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ tmp = spe_allocate_available_register(f);
+ factor[0] = spe_allocate_available_register(f);
+ factor[1] = factor[0];
+ factor[2] = factor[0];
+
+ spe_il(f, tmp, 1);
+ spe_cuflt(f, tmp, tmp, 0);
+ spe_fs(f, factor[0], tmp, dst[3]);
+
+ spe_release_register(f, tmp);
+ break;
+
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ tmp = spe_allocate_available_register(f);
+
+ spe_il(f, tmp, 1);
+ spe_cuflt(f, tmp, tmp, 0);
+
+ for (i = 0; i < 3; ++i) {
+ if ((mask & (1U << i)) != 0) {
+ factor[i] = spe_allocate_available_register(f);
+ spe_fs(f, factor[i], tmp, dst[i]);
+ }
+ }
+
+ spe_release_register(f, tmp);
+ break;
+
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+ default:
+ assert(0);
+ }
+}
+
+
+static void
+emit_blend_calculation(struct spe_function *f,
+ unsigned func, unsigned sF, unsigned dF,
+ int src, int src_factor, int dst, int dst_factor)
+{
+ int tmp = spe_allocate_available_register(f);
+
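+   /* Each case below evaluates src * src_factor <op> dst * dst_factor.
+    * Combinations of the ONE and ZERO factors are special-cased because
+    * some or all of the multiplies can be elided.
+    */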
+ switch (func) {
+ case PIPE_BLEND_ADD:
+ if (sF == PIPE_BLENDFACTOR_ONE) {
+ if (dF == PIPE_BLENDFACTOR_ZERO) {
+ /* Do nothing. */
+ } else if (dF == PIPE_BLENDFACTOR_ONE) {
+ spe_fa(f, src, src, dst);
+ }
+ } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+ if (dF == PIPE_BLENDFACTOR_ZERO) {
+ spe_il(f, src, 0);
+ } else if (dF == PIPE_BLENDFACTOR_ONE) {
+ spe_or(f, src, dst, dst);
+ }
+ } else {
+ spe_fm(f, tmp, dst, dst_factor);
+ spe_fma(f, src, src, src_factor, tmp);
+ }
+ break;
+
+ case PIPE_BLEND_SUBTRACT:
+ if (sF == PIPE_BLENDFACTOR_ONE) {
+ if (dF == PIPE_BLENDFACTOR_ZERO) {
+ /* Do nothing. */
+ } else if (dF == PIPE_BLENDFACTOR_ONE) {
+ spe_fs(f, src, src, dst);
+ }
+ } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+ if (dF == PIPE_BLENDFACTOR_ZERO) {
+ spe_il(f, src, 0);
+ } else if (dF == PIPE_BLENDFACTOR_ONE) {
+ spe_il(f, tmp, 0);
+ spe_fs(f, src, tmp, dst);
+ }
+ } else {
+ spe_fm(f, tmp, dst, dst_factor);
+ spe_fms(f, src, src, src_factor, tmp);
+ }
+ break;
+
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ if (sF == PIPE_BLENDFACTOR_ONE) {
+ if (dF == PIPE_BLENDFACTOR_ZERO) {
+ spe_il(f, tmp, 0);
+ spe_fs(f, src, tmp, src);
+ } else if (dF == PIPE_BLENDFACTOR_ONE) {
+ spe_fs(f, src, dst, src);
+ }
+ } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+ if (dF == PIPE_BLENDFACTOR_ZERO) {
+ spe_il(f, src, 0);
+ } else if (dF == PIPE_BLENDFACTOR_ONE) {
+ spe_or(f, src, dst, dst);
+ }
+ } else {
+ spe_fm(f, tmp, src, src_factor);
+         spe_fms(f, src, dst, dst_factor, tmp);
+ }
+ break;
+
+   case PIPE_BLEND_MIN:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, src, dst, tmp);   /* src > dst ? dst : src */
+      break;
+
+   case PIPE_BLEND_MAX:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, dst, src, tmp);   /* src > dst ? src : dst */
+ break;
+
+ default:
+ assert(0);
+ }
+
+ spe_release_register(f, tmp);
+}
+
+
+/**
+ * Generate code to perform alpha blending on the SPE
+ */
+void
+cell_generate_alpha_blend(struct cell_blend_state *cb,
+ const struct pipe_blend_color *blend_color)
+{
+ struct pipe_blend_state *const b = &cb->base;
+ struct spe_function *const f = &cb->code;
+
+ /* This code generates a maximum of 3 (source alpha factor)
+ * + 3 (destination alpha factor) + (3 * 5) (source color factor)
+ * + (3 * 5) (destination color factor) + (4 * 2) (blend equation)
+    * + 4 (fragment mask) + 1 (return) = 49 instructions. Round up to 64 to
+ * make it a happy power-of-two.
+ */
+ spe_init_func(f, 4 * 64);
+
+
+ const int frag[4] = {
+ spe_allocate_register(f, 3),
+ spe_allocate_register(f, 4),
+ spe_allocate_register(f, 5),
+ spe_allocate_register(f, 6),
+ };
+ const int pixel[4] = {
+ spe_allocate_register(f, 7),
+ spe_allocate_register(f, 8),
+ spe_allocate_register(f, 9),
+ spe_allocate_register(f, 10),
+ };
+ const int mask = spe_allocate_register(f, 11);
+ unsigned func[4];
+ unsigned sF[4];
+ unsigned dF[4];
+ unsigned i;
+ int src_factor[4];
+ int dst_factor[4];
+
+
+ /* Does the selected blend mode make use of the source / destination
+ * color (RGB) blend factors?
+ */
+ boolean need_color_factor = b->blend_enable
+ && (b->rgb_func != PIPE_BLEND_MIN)
+ && (b->rgb_func != PIPE_BLEND_MAX);
+
+ /* Does the selected blend mode make use of the source / destination
+ * alpha blend factors?
+ */
+ boolean need_alpha_factor = b->blend_enable
+ && (b->alpha_func != PIPE_BLEND_MIN)
+ && (b->alpha_func != PIPE_BLEND_MAX);
+
+
+ sF[0] = b->rgb_src_factor;
+ sF[1] = sF[0];
+ sF[2] = sF[0];
+ sF[3] = (b->alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)
+ ? PIPE_BLENDFACTOR_ONE : b->alpha_src_factor;
+
+ dF[0] = b->rgb_dst_factor;
+ dF[1] = dF[0];
+ dF[2] = dF[0];
+   dF[3] = b->alpha_dst_factor;
+
+
+ /* If alpha writing is enabled and the alpha blend mode requires use of
+ * the alpha factor, calculate the alpha factor.
+ */
+ if (((b->colormask & 8) != 0) && need_alpha_factor) {
+ src_factor[3] = emit_alpha_factor_calculation(f, sF[3],
+ blend_color->color[3],
+ frag[3], pixel[3]);
+
+ /* If the alpha destination blend factor is the same as the alpha source
+ * blend factor, re-use the previously calculated value.
+ */
+ dst_factor[3] = (dF[3] == sF[3])
+ ? src_factor[3]
+ : emit_alpha_factor_calculation(f, dF[3],
+ blend_color->color[3],
+ frag[3], pixel[3]);
+ }
+
+
+ if (sF[0] == sF[3]) {
+ src_factor[0] = src_factor[3];
+ src_factor[1] = src_factor[3];
+ src_factor[2] = src_factor[3];
+ } else if (sF[0] == dF[3]) {
+ src_factor[0] = dst_factor[3];
+ src_factor[1] = dst_factor[3];
+ src_factor[2] = dst_factor[3];
+ } else if (need_color_factor) {
+ emit_color_factor_calculation(f,
+ b->rgb_src_factor,
+ b->colormask,
+ blend_color,
+ frag, pixel, src_factor);
+ }
+
+
+ if (dF[0] == sF[3]) {
+ dst_factor[0] = src_factor[3];
+ dst_factor[1] = src_factor[3];
+ dst_factor[2] = src_factor[3];
+ } else if (dF[0] == dF[3]) {
+ dst_factor[0] = dst_factor[3];
+ dst_factor[1] = dst_factor[3];
+ dst_factor[2] = dst_factor[3];
+ } else if (dF[0] == sF[0]) {
+ dst_factor[0] = src_factor[0];
+ dst_factor[1] = src_factor[1];
+ dst_factor[2] = src_factor[2];
+ } else if (need_color_factor) {
+ emit_color_factor_calculation(f,
+ b->rgb_dst_factor,
+ b->colormask,
+ blend_color,
+ frag, pixel, dst_factor);
+ }
+
+
+
+ func[0] = b->rgb_func;
+ func[1] = func[0];
+ func[2] = func[0];
+ func[3] = b->alpha_func;
+
+ for (i = 0; i < 4; ++i) {
+ if ((b->colormask & (1U << i)) != 0) {
+ emit_blend_calculation(f,
+ func[i], sF[i], dF[i],
+ frag[i], src_factor[i],
+ pixel[i], dst_factor[i]);
+ spe_selb(f, frag[i], pixel[i], frag[i], mask);
+ } else {
+ spe_or(f, frag[i], pixel[i], pixel[i]);
+ }
+ }
+
+ spe_bi(f, 0, 0, 0);
+}
--- /dev/null
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file spu_per_fragment_op.c
+ * SPU implementation of various per-fragment operations.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_format.h"
+#include "spu_main.h"
+#include "spu_per_fragment_op.h"
+
+/* A shuffle-control byte of the form 0b10xxxxxx produces 0x00 in the
+ * result, so ZERO entries in a shuffle pattern insert zero bytes.
+ */
+#define ZERO 0x80
+
+static void
+read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+ enum pipe_format depth_format, qword *depth,
+ qword *stencil)
+{
+ const int ix = x / 2;
+ const int iy = y / 2;
+
+ switch (depth_format) {
+ case PIPE_FORMAT_Z16_UNORM: {
+ qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+
+ const qword shuf_vec = (qword) {
+ ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+ ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
+ };
+
+
+ /* At even X values we want the first 4 shorts, and at odd X values we
+ * want the second 4 shorts.
+ */
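+      /* bias is 8 for odd X and 0 for even X; bias_mask confines the bias
+       * to the byte-index entries of shuf_vec, so at odd X the shuffle
+       * selects shorts 4-7 instead of shorts 0-3.
+       */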
+ qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3));
+ qword bias_mask = si_fsmbi(0x3333);
+ qword sv = si_a(shuf_vec, si_and(bias_mask, bias));
+
+ *depth = si_shufb(*ptr, *ptr, sv);
+ *stencil = si_il(0);
+ break;
+ }
+
+
+ case PIPE_FORMAT_Z32_UNORM: {
+ qword *ptr = (qword *) &buffer->ui4[iy][ix];
+
+ *depth = *ptr;
+ *stencil = si_il(0);
+ break;
+ }
+
+
+ case PIPE_FORMAT_Z24S8_UNORM: {
+ qword *ptr = (qword *) &buffer->ui4[iy][ix];
+ qword mask = si_fsmbi(0x7777);
+
+ *depth = si_and(*ptr, mask);
+      /* The stencil value lives in the top byte of each word; shift it
+       * down with an unsigned shift so values with the high bit set are
+       * not sign extended.
+       */
+      *stencil = si_rotmi(si_andc(*ptr, mask), -24);
+ break;
+ }
+
+
+ default:
+ assert(0);
+ break;
+ }
+}
+
+
+static void
+write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+ enum pipe_format depth_format,
+ qword depth, qword stencil)
+{
+ const int ix = x / 2;
+ const int iy = y / 2;
+
+ (void) stencil;
+
+ switch (depth_format) {
+ case PIPE_FORMAT_Z16_UNORM: {
+ qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+
+ qword sv = ((ix & 0x01) == 0)
+ ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15,
+ 24, 25, 26, 27, 28, 29, 30, 31 }
+         : (qword) { 16, 17, 18, 19, 20, 21, 22, 23,
+ 2, 3, 6, 7, 10, 11, 14, 15 };
+ *ptr = si_shufb(depth, *ptr, sv);
+ break;
+ }
+
+
+ case PIPE_FORMAT_Z32_UNORM: {
+ qword *ptr = (qword *) &buffer->ui4[iy][ix];
+ *ptr = depth;
+ break;
+ }
+
+
+ case PIPE_FORMAT_Z24S8_UNORM: {
+ qword *ptr = (qword *) &buffer->ui4[iy][ix];
+ qword mask = si_fsmbi(0x7777);
+
+      stencil = si_shli(stencil, 24);   /* move stencil back to the top byte */
+ *ptr = si_selb(stencil, depth, mask);
+ break;
+ }
+
+
+ default:
+ assert(0);
+ break;
+ }
+}
+
+
+qword
+spu_do_depth_stencil(int x, int y,
+ qword frag_mask, qword frag_depth, qword frag_alpha,
+ qword facing)
+{
+ struct spu_frag_test_results result;
+ qword pixel_depth;
+ qword pixel_stencil;
+
+   /* All of this preamble code (everything before the call to frag_test)
+    * should be generated on the PPU and uploaded to the SPU.
+    */
+ if (spu.read_depth || spu.read_stencil) {
+ read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
+ &pixel_depth, &pixel_stencil);
+ }
+
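+   /* Convert the fragment depths from floats in [0, 1] to the unsigned
+    * integer range of the Z-buffer format (e.g., [0, 0xffff] for Z16).
+    */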
+ switch (spu.fb.depth_format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
+ frag_depth = si_cfltu(frag_depth, 0);
+ break;
+ case PIPE_FORMAT_Z32_UNORM:
+ frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
+ frag_depth = si_cfltu(frag_depth, 0);
+ break;
+ case PIPE_FORMAT_Z24S8_UNORM:
+ frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
+ frag_depth = si_cfltu(frag_depth, 0);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
+ frag_depth, frag_alpha, facing);
+
+
+   /* This code (everything after the call to frag_test) should also be
+    * generated on the PPU and uploaded to the SPU.
+    */
+ if (spu.read_depth || spu.read_stencil) {
+ write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
+ result.depth, result.stencil);
+ }
+
+ return result.mask;
+}
+++ /dev/null
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * Zbuffer/depth test code.
- */
-
-
-#ifndef SPU_ZTEST_H
-#define SPU_ZTEST_H
-
-
-#ifdef __SPU__
-#include <spu_intrinsics.h>
-#endif
-
-
-
-/**
- * Perform Z testing for a 16-bit/value Z buffer.
- *
- * \param zvals vector of four fragment zvalues as floats
- * \param zbuf ptr to vector of ushort[8] zbuffer values. Note that this
- * contains the Z values for 2 quads, 8 pixels.
- * \param x x coordinate of quad (only lsbit is significant)
- * \param inMask indicates which fragments in the quad are alive
- * \return new mask indicating which fragments are alive after ztest
- */
-static INLINE vector unsigned int
-spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
- uint x, vector unsigned int inMask)
-{
-#define ZERO 0x80
- vector unsigned int zvals_ui4, zbuf_ui4, mask;
-
- /* convert floats to uints in [0, 65535] */
- zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
- zvals_ui4 = spu_rlmask(zvals_ui4, -16); /* right shift 16 */
-
- /* XXX this conditional could be removed with a bit of work */
- if (x & 1) {
- /* convert zbuffer values from ushorts to uints */
- /* gather lower four ushorts */
- zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
- (vector unsigned int) *zbuf,
- ((vector unsigned char) {
- ZERO, ZERO, 8, 9, ZERO, ZERO, 10, 11,
- ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
- /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
- mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
- /* mask &= inMask */
- mask = spu_and(mask, inMask);
- /* zbuf = mask ? zval : zbuf */
- zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
- /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
- *zbuf = (vector unsigned short)
- spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
- ((vector unsigned char) {
- 16, 17, 18, 19, 20, 21, 22, 23,
- 2, 3, 6, 7, 10, 11, 14, 15}));
- }
- else {
- /* convert zbuffer values from ushorts to uints */
- /* gather upper four ushorts */
- zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
- (vector unsigned int) *zbuf,
- ((vector unsigned char) {
- ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
- ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
- /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
- mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
- /* mask &= inMask */
- mask = spu_and(mask, inMask);
- /* zbuf = mask ? zval : zbuf */
- zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
- /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
- *zbuf = (vector unsigned short)
- spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
- ((vector unsigned char) {
- 2, 3, 6, 7, 10, 11, 14, 15,
- 24, 25, 26, 27, 28, 29, 30, 31}));
- }
- return mask;
-#undef ZERO
-}
-
-
-/**
- * As above, but Zbuffer values as 32-bit uints
- */
-static INLINE vector unsigned int
-spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
- vector unsigned int inMask)
-{
- vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
-
- /* convert floats to uints in [0, 0xffffffff] */
- zvals_ui4 = spu_convtu(zvals, 32);
- /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */
- mask = spu_cmpgt(zbuf, zvals_ui4);
- /* mask &= inMask */
- mask = spu_and(mask, inMask);
- /* zbuf = mask ? zval : zbuf */
- *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
-
- return mask;
-}
-
-
-#endif /* SPU_ZTEST_H */