*
* Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
+ * Copyright 2009 VMware, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
*
**************************************************************************/
-
-
/**
* Generate SPU per-fragment code (actually per-quad code).
* \author Brian Paul
+ * \author Bob Ellison
*/
* \param ifragZ_reg register containing integer fragment Z values (in)
* \param ifbZ_reg register containing integer frame buffer Z values (in/out)
* \param zmask_reg register containing result of Z test/comparison (out)
+ *
+ * Returns TRUE if the Z-buffer needs to be updated.
*/
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
- struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
{
+ /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+ * quantities. This only makes a difference for 32-bit Z values though.
+ */
ASSERT(dsa->depth.enabled);
switch (dsa->depth.func) {
case PIPE_FUNC_GREATER:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LESS:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & zmask) */
spe_and(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_LEQUAL:
/* zmask = (ifragZ > ref) */
- spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+ spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
case PIPE_FUNC_GEQUAL:
/* zmask = (ref > ifragZ) */
- spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+ spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
/* mask = (mask & ~zmask) */
spe_andc(f, mask_reg, mask_reg, zmask_reg);
break;
* framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
*/
spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+ return TRUE;
}
+
+ return FALSE;
}
spe_release_register(f, amask_reg);
}
-/* This pair of functions is used inline to allocate and deallocate
+
+/**
+ * This pair of functions is used inline to allocate and deallocate
* optional constant registers. Once a constant is discovered to be
* needed, we will likely need it again, so we don't want to deallocate
* it and have to allocate and load it again unnecessarily.
*/
-static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+static INLINE void
+setup_optional_register(struct spe_function *f,
+ int *r)
+{
+ if (*r < 0)
+ *r = spe_allocate_available_register(f);
+}
+
+static INLINE void
+release_optional_register(struct spe_function *f,
+ int r)
+{
+ if (r >= 0)
+ spe_release_register(f, r);
+}
+
+static INLINE void
+setup_const_register(struct spe_function *f,
+ int *r,
+ float value)
{
- if (*is_already_set) return;
- *r = spe_allocate_available_register(f);
+ if (*r >= 0)
+ return;
+ setup_optional_register(f, r);
spe_load_float(f, *r, value);
- *is_already_set = true;
}
-static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+static INLINE void
+release_const_register(struct spe_function *f,
+ int r)
{
- if (!*is_already_set) return;
- spe_release_register(f, r);
- *is_already_set = false;
+ release_optional_register(f, r);
}
+
+
+/**
+ * Unpack/convert framebuffer colors from four 32-bit packed colors
+ * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
+ * Each 8-bit color component is expanded into a float in [0.0, 1.0].
+ */
+static void
+unpack_colors(struct spe_function *f,
+ enum pipe_format color_format,
+ int fbRGBA_reg,
+ int fbR_reg, int fbG_reg, int fbB_reg, int fbA_reg)
+{
+ int mask0_reg = spe_allocate_available_register(f);
+ int mask1_reg = spe_allocate_available_register(f);
+ int mask2_reg = spe_allocate_available_register(f);
+ int mask3_reg = spe_allocate_available_register(f);
+
+ spe_load_int(f, mask0_reg, 0xff);
+ spe_load_int(f, mask1_reg, 0xff00);
+ spe_load_int(f, mask2_reg, 0xff0000);
+ spe_load_int(f, mask3_reg, 0xff000000);
+
+ spe_comment(f, 0, "Unpack framebuffer colors, convert to floats");
+
+ switch (color_format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ /* fbB = fbRGBA & mask */
+ spe_and(f, fbB_reg, fbRGBA_reg, mask0_reg);
+
+ /* fbG = fbRGBA & mask */
+ spe_and(f, fbG_reg, fbRGBA_reg, mask1_reg);
+
+ /* fbR = fbRGBA & mask */
+ spe_and(f, fbR_reg, fbRGBA_reg, mask2_reg);
+
+ /* fbA = fbRGBA & mask */
+ spe_and(f, fbA_reg, fbRGBA_reg, mask3_reg);
+
+ /* fbG = fbG >> 8 */
+ spe_roti(f, fbG_reg, fbG_reg, -8);
+
+ /* fbR = fbR >> 16 */
+ spe_roti(f, fbR_reg, fbR_reg, -16);
+
+ /* fbA = fbA >> 24 */
+ spe_roti(f, fbA_reg, fbA_reg, -24);
+ break;
+
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ /* fbA = fbRGBA & mask */
+ spe_and(f, fbA_reg, fbRGBA_reg, mask0_reg);
+
+ /* fbR = fbRGBA & mask */
+ spe_and(f, fbR_reg, fbRGBA_reg, mask1_reg);
+
+ /* fbG = fbRGBA & mask */
+ spe_and(f, fbG_reg, fbRGBA_reg, mask2_reg);
+
+ /* fbB = fbRGBA & mask */
+ spe_and(f, fbB_reg, fbRGBA_reg, mask3_reg);
+
+ /* fbR = fbR >> 8 */
+ spe_roti(f, fbR_reg, fbR_reg, -8);
+
+ /* fbG = fbG >> 16 */
+ spe_roti(f, fbG_reg, fbG_reg, -16);
+
+ /* fbB = fbB >> 24 */
+ spe_roti(f, fbB_reg, fbB_reg, -24);
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+ spe_cuflt(f, fbR_reg, fbR_reg, 8);
+ spe_cuflt(f, fbG_reg, fbG_reg, 8);
+ spe_cuflt(f, fbB_reg, fbB_reg, 8);
+ spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+ spe_release_register(f, mask0_reg);
+ spe_release_register(f, mask1_reg);
+ spe_release_register(f, mask2_reg);
+ spe_release_register(f, mask3_reg);
+}
+
+
/**
* Generate SPE code to implement the given blend mode for a quad of pixels.
* \param f SPE function to append instruction onto.
* if we do use them, make sure we only allocate them once by
* keeping a flag on each one.
*/
- boolean one_reg_set = false;
- unsigned int one_reg;
- boolean constR_reg_set = false, constG_reg_set = false,
- constB_reg_set = false, constA_reg_set = false;
- unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
+ int one_reg = -1;
+ int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1;
ASSERT(blend->blend_enable);
- /* Unpack/convert framebuffer colors from four 32-bit packed colors
- * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
- * Each 8-bit color component is expanded into a float in [0.0, 1.0].
- */
- {
- int mask_reg = spe_allocate_available_register(f);
-
- /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
- spe_load_int(f, mask_reg, 0xff);
-
- /* XXX there may be more clever ways to implement the following code */
- switch (color_format) {
- case PIPE_FORMAT_A8R8G8B8_UNORM:
- /* fbB = fbB & mask */
- spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
- /* mask = mask << 8 */
- spe_roti(f, mask_reg, mask_reg, 8);
-
- /* fbG = fbRGBA & mask */
- spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
- /* fbG = fbG >> 8 */
- spe_roti(f, fbG_reg, fbG_reg, -8);
- /* mask = mask << 8 */
- spe_roti(f, mask_reg, mask_reg, 8);
-
- /* fbR = fbRGBA & mask */
- spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
- /* fbR = fbR >> 16 */
- spe_roti(f, fbR_reg, fbR_reg, -16);
- /* mask = mask << 8 */
- spe_roti(f, mask_reg, mask_reg, 8);
-
- /* fbA = fbRGBA & mask */
- spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
- /* fbA = fbA >> 24 */
- spe_roti(f, fbA_reg, fbA_reg, -24);
- break;
-
- case PIPE_FORMAT_B8G8R8A8_UNORM:
- /* fbA = fbA & mask */
- spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
- /* mask = mask << 8 */
- spe_roti(f, mask_reg, mask_reg, 8);
-
- /* fbR = fbRGBA & mask */
- spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
- /* fbR = fbR >> 8 */
- spe_roti(f, fbR_reg, fbR_reg, -8);
- /* mask = mask << 8 */
- spe_roti(f, mask_reg, mask_reg, 8);
-
- /* fbG = fbRGBA & mask */
- spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
- /* fbG = fbG >> 16 */
- spe_roti(f, fbG_reg, fbG_reg, -16);
- /* mask = mask << 8 */
- spe_roti(f, mask_reg, mask_reg, 8);
-
- /* fbB = fbRGBA & mask */
- spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
- /* fbB = fbB >> 24 */
- spe_roti(f, fbB_reg, fbB_reg, -24);
- break;
-
- default:
- ASSERT(0);
- }
-
- /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
- spe_cuflt(f, fbR_reg, fbR_reg, 8);
- spe_cuflt(f, fbG_reg, fbG_reg, 8);
- spe_cuflt(f, fbB_reg, fbB_reg, 8);
- spe_cuflt(f, fbA_reg, fbA_reg, 8);
-
- spe_release_register(f, mask_reg);
- }
+ /* packed RGBA -> float colors */
+ unpack_colors(f, color_format, fbRGBA_reg,
+ fbR_reg, fbG_reg, fbB_reg, fbA_reg);
/*
* Compute Src RGB terms. We're actually looking for the value
break;
case PIPE_BLENDFACTOR_CONST_COLOR:
/* We need the optional constant color registers */
- setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
- setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
- setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ setup_const_register(f, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg, blend_color->color[2]);
/* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
spe_fm(f, term1R_reg, fragR_reg, constR_reg);
spe_fm(f, term1G_reg, fragG_reg, constG_reg);
break;
case PIPE_BLENDFACTOR_CONST_ALPHA:
/* we'll need the optional constant alpha register */
- setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ setup_const_register(f, &constA_reg, blend_color->color[3]);
/* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
spe_fm(f, term1R_reg, fragR_reg, constA_reg);
spe_fm(f, term1G_reg, fragG_reg, constA_reg);
break;
case PIPE_BLENDFACTOR_INV_CONST_COLOR:
/* We need the optional constant color registers */
- setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
- setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
- setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ setup_const_register(f, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg, blend_color->color[2]);
/* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
* or term = (R-R*Rc, G-G*Gc, B-B*Bc)
* fnms(a,b,c,d) computes a = d - b*c
break;
case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
/* We need the optional constant color registers */
- setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
- setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
- setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ setup_const_register(f, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg, blend_color->color[2]);
/* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
* or term = (R-R*Ac,G-G*Ac,B-B*Ac)
* fnms(a,b,c,d) computes a = d - b*c
break;
case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
/* We'll need the optional {1,1,1,1} register */
- setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+ setup_const_register(f, &one_reg, 1.0f);
/* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
* term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
* We could expand the term (as a*min(b,c) == min(a*b,a*c)
case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
case PIPE_BLENDFACTOR_CONST_COLOR:
/* We need the optional constA_reg register */
- setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ setup_const_register(f, &constA_reg, blend_color->color[3]);
/* factor = Ac, so term = A*Ac */
spe_fm(f, term1A_reg, fragA_reg, constA_reg);
break;
case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
case PIPE_BLENDFACTOR_INV_CONST_COLOR:
/* We need the optional constA_reg register */
- setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ setup_const_register(f, &constA_reg, blend_color->color[3]);
/* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
/* fnms(a,b,c,d) computes a = d - b*c */
spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
break;
case PIPE_BLENDFACTOR_CONST_COLOR:
/* We need the optional constant color registers */
- setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
- setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
- setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ setup_const_register(f, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg, blend_color->color[2]);
/* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
spe_fm(f, term2R_reg, fbR_reg, constR_reg);
spe_fm(f, term2G_reg, fbG_reg, constG_reg);
break;
case PIPE_BLENDFACTOR_CONST_ALPHA:
/* we'll need the optional constant alpha register */
- setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ setup_const_register(f, &constA_reg, blend_color->color[3]);
/* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
spe_fm(f, term2R_reg, fbR_reg, constA_reg);
spe_fm(f, term2G_reg, fbG_reg, constA_reg);
break;
case PIPE_BLENDFACTOR_INV_CONST_COLOR:
/* We need the optional constant color registers */
- setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
- setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
- setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ setup_const_register(f, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg, blend_color->color[2]);
/* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
* or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
* fnms(a,b,c,d) computes a = d - b*c
break;
case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
/* We need the optional constant color registers */
- setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
- setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
- setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+ setup_const_register(f, &constR_reg, blend_color->color[0]);
+ setup_const_register(f, &constG_reg, blend_color->color[1]);
+ setup_const_register(f, &constB_reg, blend_color->color[2]);
/* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
* or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
* fnms(a,b,c,d) computes a = d - b*c
case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
case PIPE_BLENDFACTOR_CONST_COLOR:
/* We need the optional constA_reg register */
- setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ setup_const_register(f, &constA_reg, blend_color->color[3]);
/* factor = Ac, so term = Afb*Ac */
spe_fm(f, term2A_reg, fbA_reg, constA_reg);
break;
case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
case PIPE_BLENDFACTOR_INV_CONST_COLOR:
/* We need the optional constA_reg register */
- setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+ setup_const_register(f, &constA_reg, blend_color->color[3]);
/* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
/* fnms(a,b,c,d) computes a = d - b*c */
spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
spe_release_register(f, tmp_reg);
/* Free any optional registers that actually got used */
- release_const_register(f, &one_reg_set, one_reg);
- release_const_register(f, &constR_reg_set, constR_reg);
- release_const_register(f, &constG_reg_set, constG_reg);
- release_const_register(f, &constB_reg_set, constB_reg);
- release_const_register(f, &constA_reg_set, constA_reg);
+ release_const_register(f, one_reg);
+ release_const_register(f, constR_reg);
+ release_const_register(f, constG_reg);
+ release_const_register(f, constB_reg);
+ release_const_register(f, constA_reg);
}
spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
break;
case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
- spe_complement(f, fragRGBA_reg);
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
break;
case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
/* andc R, A, B computes R = A & ~B */
break;
case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
- spe_complement(f, fragRGBA_reg);
+ spe_complement(f, fragRGBA_reg, fragRGBA_reg);
break;
case PIPE_LOGICOP_NOOP: /* d */
spe_move(f, fragRGBA_reg, fbRGBA_reg);
}
-static void
-gen_colormask(uint colormask,
- struct spe_function *f,
- int fragRGBA_reg, int fbRGBA_reg)
-{
- /* We've got four 32-bit RGBA packed pixels in each of
- * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
- * reds, greens, blues, and alphas.
- * */
-
- /* The color mask operation can prevent any set of color
- * components in the incoming fragment from being written to the frame
- * buffer; we do this by replacing the masked components of the
- * fragment with the frame buffer values.
- *
- * There are only 16 possibilities, with a unique mask for
- * each of the possibilities. (Technically, there are only 15
- * possibilities, since we shouldn't be called for the one mask
- * that does nothing, but the complete implementation is here
- * anyway to avoid confusion.)
- *
- * We implement this via a constant static array which we'll index
- * into to get the correct mask.
- *
- * We're dependent on the mask values being low-order bits,
- * with particular values for each bit; so we start with a
- * few assertions, which will fail if any of the values were
- * to change.
- */
- ASSERT(PIPE_MASK_R == 0x1);
- ASSERT(PIPE_MASK_G == 0x2);
- ASSERT(PIPE_MASK_B == 0x4);
- ASSERT(PIPE_MASK_A == 0x8);
-
- /* Here's the list of all possible colormasks, indexed by the
- * value of the combined mask specifier.
- */
- static const unsigned int colormasks[16] = {
- 0x00000000, /* 0: all colors masked */
- 0xff000000, /* 1: PIPE_MASK_R */
- 0x00ff0000, /* 2: PIPE_MASK_G */
- 0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */
- 0x0000ff00, /* 4: PIPE_MASK_B */
- 0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */
- 0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */
- 0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */
- 0x000000ff, /* 8: PIPE_MASK_A */
- 0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */
- 0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */
- 0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */
- 0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */
- 0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */
- 0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
- 0xffffffff /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
- };
-
- /* Get a temporary register to hold the mask */
- int colormask_reg = spe_allocate_available_register(f);
-
- /* Look up the desired mask directly and load it into the mask register.
- * This will load the same mask into each of the four words in the
- * mask register.
- */
- spe_load_uint(f, colormask_reg, colormasks[colormask]);
-
- /* Use the mask register to select between the fragment color
- * values and the frame buffer color values. Wherever the
- * mask has a 0 bit, the current frame buffer color should override
- * the fragment color. Wherever the mask has a 1 bit, the
- * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
- * instruction will select bits from its first operand rA wherever the
- * the mask bits rM are 0, and from its second operand rB wherever the
- * mask bits rM are 1. That means that the frame buffer color is the
- * first operand, and the fragment color the second.
- */
- spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
-
- /* Release the temporary register and we're done */
- spe_release_register(f, colormask_reg);
-}
-
/**
- * Generate code to pack a quad of float colors into a four 32-bit integers.
+ * Generate code to pack a quad of float colors into four 32-bit integers.
*
* \param f SPE function to append instruction onto.
* \param color_format the dest color packing format
int r_reg, int g_reg, int b_reg, int a_reg,
int rgba_reg)
{
+ int rg_reg = spe_allocate_available_register(f);
+ int ba_reg = spe_allocate_available_register(f);
+
/* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
spe_cfltu(f, r_reg, r_reg, 32);
spe_cfltu(f, g_reg, g_reg, 32);
spe_cfltu(f, b_reg, b_reg, 32);
spe_cfltu(f, a_reg, a_reg, 32);
- /* Shift the most significant bytes to least the significant positions.
+ /* Shift the most significant bytes to the least significant positions.
* I.e.: reg = reg >> 24
*/
spe_rotmi(f, r_reg, r_reg, -24);
* OR-ing all those together gives us four packed colors:
* RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
*/
- spe_or(f, rgba_reg, r_reg, g_reg);
- spe_or(f, rgba_reg, rgba_reg, b_reg);
- spe_or(f, rgba_reg, rgba_reg, a_reg);
+ spe_or(f, rg_reg, r_reg, g_reg);
+ spe_or(f, ba_reg, a_reg, b_reg);
+ spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+ spe_release_register(f, rg_reg);
+ spe_release_register(f, ba_reg);
+}
+
+
+static void
+gen_colormask(struct spe_function *f,
+ uint colormask,
+ enum pipe_format color_format,
+ int fragRGBA_reg, int fbRGBA_reg)
+{
+ /* We've got four 32-bit RGBA packed pixels in each of
+ * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+ * reds, greens, blues, and alphas. Further, the pixels
+ * are packed according to the given color format, not
+ * necessarily RGBA...
+ */
+ uint r_mask;
+ uint g_mask;
+ uint b_mask;
+ uint a_mask;
+
+ /* Calculate exactly where the bits for any particular color
+ * end up, so we can mask them correctly.
+ */
+ switch(color_format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ /* ARGB */
+ a_mask = 0xff000000;
+ r_mask = 0x00ff0000;
+ g_mask = 0x0000ff00;
+ b_mask = 0x000000ff;
+ break;
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ /* BGRA */
+ b_mask = 0xff000000;
+ g_mask = 0x00ff0000;
+ r_mask = 0x0000ff00;
+ a_mask = 0x000000ff;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ /* For each R, G, B, and A component we're supposed to mask out,
+ * clear its bits. Then our mask operation later will work
+ * as expected.
+ */
+ if (!(colormask & PIPE_MASK_R)) {
+ r_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_G)) {
+ g_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_B)) {
+ b_mask = 0;
+ }
+ if (!(colormask & PIPE_MASK_A)) {
+ a_mask = 0;
+ }
+
+ /* Get a temporary register to hold the mask that will be applied
+ * to the fragment
+ */
+ int colormask_reg = spe_allocate_available_register(f);
+
+ /* The actual mask we're going to use is an OR of the remaining R, G, B,
+ * and A masks. Load the result value into our temporary register.
+ */
+ spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
+
+ /* Use the mask register to select between the fragment color
+ * values and the frame buffer color values. Wherever the
+ * mask has a 0 bit, the current frame buffer color should override
+ * the fragment color. Wherever the mask has a 1 bit, the
+ * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
+ * instruction will select bits from its first operand rA wherever the
+ * the mask bits rM are 0, and from its second operand rB wherever the
+ * mask bits rM are 1. That means that the frame buffer color is the
+ * first operand, and the fragment color the second.
+ */
+ spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+
+ /* Release the temporary register and we're done */
+ spe_release_register(f, colormask_reg);
}
+/**
+ * This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value. As such, we have
+ * access to the Compare Immediate instructions where we don't in
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * There's some added complexity if there's a non-trivial state->mask
+ * value; then stencil and reference both must be masked
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test. The bitmask of valid
+ * fragments that failed would be found in
+ * (fragment_mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f,
+ const struct pipe_stencil_state *state,
+ uint stencil_max_value,
+ int fragment_mask_reg,
+ int fbS_reg,
+ int stencil_pass_reg)
+{
+ /* Generate code that puts the set of passing fragments into the
+ * stencil_pass_reg register, taking into account whether each fragment
+ * was active to begin with.
+ */
+ switch (state->func) {
+ case PIPE_FUNC_EQUAL:
+ if (state->valuemask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+ uint tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
+ state->valuemask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_NOTEQUAL:
+ if (state->valuemask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & ~(s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+ int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
+ state->valuemask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_LESS:
+ if (state->valuemask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference < s) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
+ int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
+ state->valuemask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_GREATER:
+ if (state->valuemask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference > s) */
+ /* There's no convenient Compare Less Than Immediate instruction, so
+ * we'll have to do this one the harder way, by loading a register and
+ * comparing directly. Compare Logical Greater Than Word (clgt)
+ * treats its operands as unsigned - no sign extension.
+ */
+ int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+ int tmp_reg = spe_allocate_available_register(f);
+ int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->valuemask & state->ref_value);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_GEQUAL:
+ if (state->valuemask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference >= s)
+ * = fragment_mask & ~(s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg,
+ state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+ int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
+ state->valuemask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_LEQUAL:
+ if (state->valuemask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference <= s) ]
+ * = fragment_mask & ~(reference > s) */
+ /* As above, we have to do this by loading a register */
+ int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+ int tmp_reg = spe_allocate_available_register(f);
+ int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value & state->valuemask);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
+ break;
+
+ case PIPE_FUNC_NEVER:
+ /* stencil_pass = fragment_mask & 0 = 0 */
+ spe_load_uint(f, stencil_pass_reg, 0);
+ break;
+
+ case PIPE_FUNC_ALWAYS:
+ /* stencil_pass = fragment_mask & 1 = fragment_mask */
+ spe_move(f, stencil_pass_reg, fragment_mask_reg);
+ break;
+ }
+
+ /* The fragments that passed the stencil test are now in stencil_pass_reg.
+ * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
+ */
+}
+
+
+/**
+ * This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply. It does not
+ * apply any tests. It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f,
+ uint stencil_op,
+ uint stencil_ref_value,
+ uint stencil_max_value,
+ int fbS_reg,
+ int newS_reg)
+{
+ /* The code below assumes that newS_reg and fbS_reg are not the same
+ * register; if they can be, the calculations below will have to use
+ * an additional temporary register. For now, mark the assumption
+ * with an assertion that will fail if they are the same.
+ */
+ ASSERT(fbS_reg != newS_reg);
+
+ /* The code also assumes the the stencil_max_value is of the form
+ * 2^n-1 and can therefore be used as a mask for the valid bits in
+ * addition to a maximum. Make sure this is the case as well.
+ * The clever math below exploits the fact that incrementing a
+ * binary number serves to flip all the bits of a number starting at
+ * the LSB and continuing to (and including) the first zero bit
+ * found. That means that a number and its increment will always
+ * have at least one bit in common (the high order bit, if nothing
+ * else) *unless* the number is zero, *or* the number is of a form
+ * consisting of some number of 1s in the low-order bits followed
+ * by nothing but 0s in the high-order bits. The latter case
+ * implies it's of the form 2^n-1.
+ */
+ ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+ switch(stencil_op) {
+ case PIPE_STENCIL_OP_KEEP:
+ /* newS = S */
+ spe_move(f, newS_reg, fbS_reg);
+ break;
+
+ case PIPE_STENCIL_OP_ZERO:
+ /* newS = 0 */
+ spe_zero(f, newS_reg);
+ break;
+
+ case PIPE_STENCIL_OP_REPLACE:
+ /* newS = stencil reference value */
+ spe_load_uint(f, newS_reg, stencil_ref_value);
+ break;
+
+ case PIPE_STENCIL_OP_INCR: {
+ /* newS = (s == max ? max : s + 1) */
+ int equals_reg = spe_allocate_available_register(f);
+
+ spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+ /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+ spe_ai(f, newS_reg, fbS_reg, 1);
+ /* Select from the current value or the new value based on the equality test */
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+ spe_release_register(f, equals_reg);
+ break;
+ }
+ case PIPE_STENCIL_OP_DECR: {
+ /* newS = (s == 0 ? 0 : s - 1) */
+ int equals_reg = spe_allocate_available_register(f);
+
+ spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+ /* Add Word Immediate with a (-1) value works */
+ spe_ai(f, newS_reg, fbS_reg, -1);
+ /* Select from the current value or the new value based on the equality test */
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+ spe_release_register(f, equals_reg);
+ break;
+ }
+ case PIPE_STENCIL_OP_INCR_WRAP:
+ /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+ * do a normal add and mask off the correct bits
+ */
+ spe_ai(f, newS_reg, fbS_reg, 1);
+ spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+ break;
+
+ case PIPE_STENCIL_OP_DECR_WRAP:
+ /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+ spe_ai(f, newS_reg, fbS_reg, -1);
+ spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+ break;
+
+ case PIPE_STENCIL_OP_INVERT:
+ /* newS = ~s. We take advantage of the mask/max value to invert only
+ * the valid bits for the field so we don't have to do an extra "and".
+ */
+ spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+ break;
+
+ default:
+ ASSERT(0);
+ }
+}
+
+
+/**
+ * This function generates code to get all the necessary possible
+ * stencil values. For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f,
+ const struct pipe_stencil_state *stencil,
+ const uint depth_enabled,
+ int fbS_reg,
+ int *fail_reg,
+ int *zfail_reg,
+ int *zpass_reg)
+{
+ uint zfail_op;
+
+ /* Stenciling had better be enabled here */
+ ASSERT(stencil->enabled);
+
+ /* If the depth test is not enabled, it is treated as though it always
+ * passes, which means that the zfail_op is not considered - a
+ * failing stencil test triggers the fail_op, and a passing one
+ * triggers the zpass_op
+ *
+ * As an optimization, override calculation of the zfail_op values
+ * if they aren't going to be used. By setting the value of
+ * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+ * to match the incoming stencil values, and no calculation will
+ * be done.
+ */
+ if (depth_enabled) {
+ zfail_op = stencil->zfail_op;
+ }
+ else {
+ zfail_op = PIPE_STENCIL_OP_KEEP;
+ }
+
+ /* One-sided or front-facing stencil */
+ if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
+ *fail_reg = fbS_reg;
+ }
+ else {
+ *fail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, stencil->fail_op, stencil->ref_value,
+ 0xff, fbS_reg, *fail_reg);
+ }
+
+ /* Check the possibly overridden value, not the structure value */
+ if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+ *zfail_reg = fbS_reg;
+ }
+ else if (zfail_op == stencil->fail_op) {
+ *zfail_reg = *fail_reg;
+ }
+ else {
+ *zfail_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, stencil->zfail_op, stencil->ref_value,
+ 0xff, fbS_reg, *zfail_reg);
+ }
+
+ if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+ *zpass_reg = fbS_reg;
+ }
+ else if (stencil->zpass_op == stencil->fail_op) {
+ *zpass_reg = *fail_reg;
+ }
+ else if (stencil->zpass_op == zfail_op) {
+ *zpass_reg = *zfail_reg;
+ }
+ else {
+ *zpass_reg = spe_allocate_available_register(f);
+ gen_stencil_values(f, stencil->zpass_op, stencil->ref_value,
+ 0xff, fbS_reg, *zpass_reg);
+ }
+}
+
+/**
+ * Note that fbZ_reg may *not* be set on entry, if in fact
+ * the depth test is not enabled. This function must not use
+ * the register if depth is not enabled.
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
+ const uint facing,
+ const int mask_reg, const int fragZ_reg,
+ const int fbZ_reg, const int fbS_reg)
+{
+ /* True if we've generated code that could require writeback to the
+ * depth and/or stencil buffers
+ */
+ boolean modified_buffers = FALSE;
+
+ boolean need_to_calculate_stencil_values;
+ boolean need_to_writemask_stencil_values;
+
+ struct pipe_stencil_state *stencil;
+
+ /* Registers. We may or may not actually allocate these, depending
+ * on whether the state values indicate that we need them.
+ */
+ int stencil_pass_reg, stencil_fail_reg;
+ int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+ int stencil_writemask_reg;
+ int zmask_reg;
+ int newS_reg;
+
+ /* Stenciling is quite complex: up to six different configurable stencil
+ * operations/calculations can be required (three each for front-facing
+ * and back-facing fragments). Many of those operations will likely
+ * be identical, so there's good reason to try to avoid calculating
+ * the same values more than once (which unfortunately makes the code less
+ * straightforward).
+ *
+ * To make register management easier, we start a new
+ * register set; we can release all the registers in the set at
+ * once, and avoid having to keep track of exactly which registers
+ * we allocate. We can still allocate and free registers as
+ * desired (if we know we no longer need a register), but we don't
+ * have to spend the complexity to track the more difficult variant
+ * register usage scenarios.
+ */
+ spe_comment(f, 0, "Allocating stencil register set");
+ spe_allocate_register_set(f);
+
+ /* The facing we're given is the fragment facing; it doesn't
+ * exactly match the stencil facing. If stencil is enabled,
+ * but two-sided stencil is *not* enabled, we use the same
+ * stencil settings for both front- and back-facing fragments.
+ * We only use the "back-facing" stencil for backfacing fragments
+ * if two-sided stenciling is enabled.
+ */
+ if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+ stencil = &dsa->stencil[1];
+ }
+ else {
+ stencil = &dsa->stencil[0];
+ }
+
+ /* Calculate the writemask. If the writemask is trivial (either
+ * all 0s, meaning that we don't need to calculate any stencil values
+ * because they're not going to change the stencil anyway, or all 1s,
+ * meaning that we have to calculate the stencil values but do not
+ * need to mask them), we can avoid generating code. Don't forget
+ * that we need to consider backfacing stencil, if enabled.
+ *
+ * Note that if the backface stencil is *not* enabled, the backface
+ * stencil will have the same values as the frontface stencil.
+ */
+ if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+ stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+ stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+ need_to_calculate_stencil_values = FALSE;
+ need_to_writemask_stencil_values = FALSE;
+ }
+ else if (stencil->writemask == 0x0) {
+ /* All changes are writemasked out, so no need to calculate
+ * what those changes might be, and no need to write anything back.
+ */
+ need_to_calculate_stencil_values = FALSE;
+ need_to_writemask_stencil_values = FALSE;
+ }
+ else if (stencil->writemask == 0xff) {
+ /* Still trivial, but a little less so. We need to write the stencil
+ * values, but we don't need to mask them.
+ */
+ need_to_calculate_stencil_values = TRUE;
+ need_to_writemask_stencil_values = FALSE;
+ }
+ else {
+ /* The general case: calculate, mask, and write */
+ need_to_calculate_stencil_values = TRUE;
+ need_to_writemask_stencil_values = TRUE;
+
+ /* While we're here, generate code that calculates what the
+ * writemask should be. If backface stenciling is enabled,
+ * and the backface writemask is not the same as the frontface
+ * writemask, we'll have to generate code that merges the
+ * two masks into a single effective mask based on fragment facing.
+ */
+ spe_comment(f, 0, "Computing stencil writemask");
+ stencil_writemask_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].writemask);
+ }
+
+ /* At least one-sided stenciling must be on. Generate code that
+ * runs the stencil test on the basic/front-facing stencil, leaving
+ * the mask of passing stencil bits in stencil_pass_reg. This mask will
+ * be used both to mask the set of active pixels, and also to
+ * determine how the stencil buffer changes.
+ *
+ * This test will *not* change the value in mask_reg (because we don't
+ * yet know whether to apply the two-sided stencil or one-sided stencil).
+ */
+ spe_comment(f, 0, "Running basic stencil test");
+ stencil_pass_reg = spe_allocate_available_register(f);
+ gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
+
+ /* Generate code that, given the mask of valid fragments and the
+ * mask of valid fragments that passed the stencil test, computes
+ * the mask of valid fragments that failed the stencil test. We
+ * have to do this before we run a depth test (because the
+ * depth test should not be performed on fragments that failed the
+ * stencil test, and because the depth test will update the
+ * mask of valid fragments based on the results of the depth test).
+ */
+ spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
+ stencil_fail_reg = spe_allocate_available_register(f);
+ spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+ /* Now remove the stenciled-out pixels from the valid fragment mask,
+ * so we can later use the valid fragment mask in the depth test.
+ */
+ spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+ /* We may not need to calculate stencil values, if the writemask is off */
+ if (need_to_calculate_stencil_values) {
+ /* Generate code that calculates exactly which stencil values we need,
+ * without calculating the same value twice (say, if two different
+ * stencil ops have the same value). This code will work for one-sided
+ * and two-sided stenciling (so that we take into account that operations
+ * may match between front and back stencils), and will also take into
+ * account whether the depth test is enabled (if the depth test is off,
+ * we don't need any of the zfail results, because the depth test always
+ * is considered to pass if it is disabled). Any register value that
+ * does not need to be calculated will come back with the same value
+ * that's in fbS_reg.
+ *
+ * This function will allocate a variant number of registers that
+ * will be released as part of the register set.
+ */
+ spe_comment(f, 0, facing == CELL_FACING_FRONT
+ ? "Computing front-facing stencil values"
+ : "Computing back-facing stencil values");
+ gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg,
+ &stencil_fail_values, &stencil_pass_depth_fail_values,
+ &stencil_pass_depth_pass_values);
+ }
+
+ /* We now have all the stencil values we need. We also need
+ * the results of the depth test to figure out which
+ * stencil values will become the new stencil values. (Even if
+ * we aren't actually calculating stencil values, we need to apply
+ * the depth test if it's enabled.)
+ *
+ * The code generated by gen_depth_test() returns the results of the
+ * test in the given register, but also alters the mask_reg based
+ * on the results of the test.
+ */
+ if (dsa->depth.enabled) {
+ spe_comment(f, 0, "Running stencil depth test");
+ zmask_reg = spe_allocate_available_register(f);
+ modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg,
+ fbZ_reg, zmask_reg);
+ }
+
+ if (need_to_calculate_stencil_values) {
+
+ /* If we need to writemask the stencil values before going into
+ * the stencil buffer, we'll have to use a new register to
+ * hold the new values. If not, we can just keep using the
+ * current register.
+ */
+ if (need_to_writemask_stencil_values) {
+ newS_reg = spe_allocate_available_register(f);
+ spe_comment(f, 0, "Saving current stencil values for writemasking");
+ spe_move(f, newS_reg, fbS_reg);
+ }
+ else {
+ newS_reg = fbS_reg;
+ }
+
+ /* Merge in the selected stencil fail values */
+ if (stencil_fail_values != fbS_reg) {
+ spe_comment(f, 0, "Loading stencil fail values");
+ spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+ modified_buffers = TRUE;
+ }
+
+ /* Same for the stencil pass/depth fail values. If this calculation
+ * is not needed (say, if depth test is off), then the
+ * stencil_pass_depth_fail_values register will be equal to fbS_reg
+ * and we'll skip the calculation.
+ */
+ if (stencil_pass_depth_fail_values != fbS_reg) {
+ /* We don't actually have a stencil pass/depth fail mask yet.
+ * Calculate it here from the stencil passing mask and the
+ * depth passing mask. Note that zmask_reg *must* have been
+ * set above if we're here.
+ */
+ uint stencil_pass_depth_fail_mask =
+ spe_allocate_available_register(f);
+
+ spe_comment(f, 0, "Loading stencil pass/depth fail values");
+ spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values,
+ stencil_pass_depth_fail_mask);
+
+ spe_release_register(f, stencil_pass_depth_fail_mask);
+ modified_buffers = TRUE;
+ }
+
+ /* Same for the stencil pass/depth pass mask. Note that we
+ * *can* get here with zmask_reg being unset (if the depth
+ * test is off but the stencil test is on). In this case,
+ * we assume the depth test passes, and don't need to mask
+ * the stencil pass mask with the Z mask.
+ */
+ if (stencil_pass_depth_pass_values != fbS_reg) {
+ if (dsa->depth.enabled) {
+ uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+ /* We'll need a separate register */
+ spe_comment(f, 0, "Loading stencil pass/depth pass values");
+ spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+ spe_release_register(f, stencil_pass_depth_pass_mask);
+ }
+ else {
+ /* We can use the same stencil-pass register */
+ spe_comment(f, 0, "Loading stencil pass values");
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+ }
+ modified_buffers = TRUE;
+ }
+
+ /* Almost done. If we need to writemask, do it now, leaving the
+ * results in the fbS_reg register passed in. If we don't need
+ * to writemask, then the results are *already* in the fbS_reg,
+ * so there's nothing more to do.
+ */
+
+ if (need_to_writemask_stencil_values && modified_buffers) {
+ /* The Select Bytes command makes a fine writemask. Where
+ * the mask is 0, the first (original) values are retained,
+ * effectively masking out changes. Where the mask is 1, the
+ * second (new) values are retained, incorporating changes.
+ */
+ spe_comment(f, 0, "Writemasking new stencil values");
+ spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+ }
+
+ } /* done calculating stencil values */
+
+ /* The stencil and/or depth values have been applied, and the
+ * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+ * We're all done, except that we've allocated a fair number
+ * of registers that we didn't bother tracking. Release all
+ * those registers as part of the register set, and go home.
+ */
+ spe_comment(f, 0, "Releasing stencil register set");
+ spe_release_register_set(f);
+
+ /* Return TRUE if we could have modified the stencil and/or
+ * depth buffers.
+ */
+ return modified_buffers;
+}
+
+
+/**
+ * Generate depth and/or stencil test code.
+ * \param cell context
+ * \param dsa depth/stencil/alpha state
+ * \param f spe function to emit
+ * \param facing either CELL_FACING_FRONT or CELL_FACING_BACK
+ * \param mask_reg register containing the pixel alive/dead mask
+ * \param depth_tile_reg register containing address of z/stencil tile
+ * \param quad_offset_reg offset to quad from start of tile
+ * \param fragZ_reg register containg fragment Z values
+ */
+static void
+gen_depth_stencil(struct cell_context *cell,
+ const struct pipe_depth_stencil_alpha_state *dsa,
+ struct spe_function *f,
+ uint facing,
+ int mask_reg,
+ int depth_tile_reg,
+ int quad_offset_reg,
+ int fragZ_reg)
+
+{
+ const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+ boolean write_depth_stencil;
+
+ /* framebuffer's combined z/stencil values register */
+ int fbZS_reg = spe_allocate_available_register(f);
+
+ /* Framebufer Z values register */
+ int fbZ_reg = spe_allocate_available_register(f);
+
+ /* Framebuffer stencil values register (may not be used) */
+ int fbS_reg = spe_allocate_available_register(f);
+
+ /* 24-bit mask register (may not be used) */
+ int zmask_reg = spe_allocate_available_register(f);
+
+ /**
+ * The following code:
+ * 1. fetch quad of packed Z/S values from the framebuffer tile.
+ * 2. extract the separate the Z and S values from packed values
+ * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints
+ *
+ * The instructions for doing this are interleaved for better performance.
+ */
+ spe_comment(f, 0, "Fetch Z/stencil quad from tile");
+
+ switch(zs_format) {
+ case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+ case PIPE_FORMAT_X8Z24_UNORM:
+ /* prepare mask to extract Z vals from ZS vals */
+ spe_load_uint(f, zmask_reg, 0x00ffffff);
+
+ /* convert fragment Z from [0,1] to 32-bit ints */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+ /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+ /* right shift 32-bit fragment Z to 24 bits */
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* extract 24-bit Z values from ZS values by masking */
+ spe_and(f, fbZ_reg, fbZS_reg, zmask_reg);
+
+ /* extract 8-bit stencil values by shifting */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+ break;
+
+ case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+ case PIPE_FORMAT_Z24X8_UNORM:
+ /* convert fragment Z from [0,1] to 32-bit ints */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+ /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+ /* right shift 32-bit fragment Z to 24 bits */
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* extract 24-bit Z values from ZS values by shifting */
+ spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+ /* extract 8-bit stencil values by masking */
+ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+ break;
+
+ case PIPE_FORMAT_Z32_UNORM:
+ /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg);
+
+ /* convert fragment Z from [0,1] to 32-bit ints */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+ /* No stencil, so can't do anything there */
+ break;
+
+ case PIPE_FORMAT_Z16_UNORM:
+ /* XXX This code for 16bpp Z is broken! */
+
+ /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* convert Z from [0,1] to 16-bit ints */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+ /* No stencil */
+ break;
+
+ default:
+ ASSERT(0); /* invalid format */
+ }
+
+ /* If stencil is enabled, use the stencil-specific code
+ * generator to generate both the stencil and depth (if needed)
+ * tests. Otherwise, if only depth is enabled, generate
+ * a quick depth test. The test generators themselves will
+ * report back whether the depth/stencil buffer has to be
+ * written back.
+ */
+ if (dsa->stencil[0].enabled) {
+ /* This will perform the stencil and depth tests, and update
+ * the mask_reg, fbZ_reg, and fbS_reg as required by the
+ * tests.
+ */
+ ASSERT(fbS_reg >= 0);
+ spe_comment(f, 0, "Perform stencil test");
+
+ /* Note that fbZ_reg may not be set on entry, if stenciling
+ * is enabled but there's no Z-buffer. The
+ * gen_stencil_depth_test() function must ignore the
+ * fbZ_reg register if depth is not enabled.
+ */
+ write_depth_stencil = gen_stencil_depth_test(f, dsa, facing,
+ mask_reg, fragZ_reg,
+ fbZ_reg, fbS_reg);
+ }
+ else if (dsa->depth.enabled) {
+ int zmask_reg = spe_allocate_available_register(f);
+ ASSERT(fbZ_reg >= 0);
+ spe_comment(f, 0, "Perform depth test");
+ write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg,
+ fbZ_reg, zmask_reg);
+ spe_release_register(f, zmask_reg);
+ }
+ else {
+ write_depth_stencil = FALSE;
+ }
+
+ if (write_depth_stencil) {
+ /* Merge latest Z and Stencil values into fbZS_reg.
+ * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+ * fbS_reg has four 8-bit Z values in bits [7..0].
+ */
+ spe_comment(f, 0, "Store quad's depth/stencil values in tile");
+ if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+ zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+ spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
+ else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+ zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
+ else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+ }
+ else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+ }
+ else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+ ASSERT(0); /* XXX to do */
+ }
+ else {
+ ASSERT(0); /* bad zs_format */
+ }
+
+ /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+ spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+ }
+
+ /* Don't need these any more */
+ spe_release_register(f, fbZS_reg);
+ spe_release_register(f, fbZ_reg);
+ spe_release_register(f, fbS_reg);
+ spe_release_register(f, zmask_reg);
+}
+
/**
* should be much faster.
*
* \param cell the rendering context (in)
- * \param f the generated function (out)
+ * \param facing whether the generated code is for front-facing or
+ * back-facing fragments
+ * \param f the generated function (in/out); on input, the function
+ * must already have been initialized. On exit, whatever
+ * instructions within the generated function have had
+ * the fragment ops appended.
*/
void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell,
+ const uint facing,
+ struct spe_function *f)
{
- const struct pipe_depth_stencil_alpha_state *dsa =
- &cell->depth_stencil->base;
- const struct pipe_blend_state *blend = &cell->blend->base;
+ const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
+ const struct pipe_blend_state *blend = cell->blend;
const struct pipe_blend_color *blend_color = &cell->blend_color;
const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
const int fragA_reg = 11; /* vector float */
const int mask_reg = 12; /* vector uint */
+ ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
+
/* offset of quad from start of tile
* XXX assuming 4-byte pixels for color AND Z/stencil!!!!
*/
int quad_offset_reg;
int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
- int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
-
- spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
if (cell->debug_flags & CELL_DEBUG_ASM) {
- spe_print_code(f, true);
+ spe_print_code(f, TRUE);
spe_indent(f, 8);
- spe_comment(f, -4, "Begin per-fragment ops");
+ spe_comment(f, -4, facing == CELL_FACING_FRONT
+ ? "Begin front-facing per-fragment ops"
+ : "Begin back-facing per-fragment ops");
}
spe_allocate_register(f, x_reg);
quad_offset_reg = spe_allocate_available_register(f);
fbRGBA_reg = spe_allocate_available_register(f);
- fbZS_reg = spe_allocate_available_register(f);
/* compute offset of quad from start of tile, in bytes */
{
ASSERT(TILE_SIZE == 32);
- spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
+ spe_comment(f, 0, "Compute quad offset within tile");
spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
+ spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
spe_release_register(f, y2_reg);
}
-
+ /* Generate the alpha test, if needed. */
if (dsa->alpha.enabled) {
gen_alpha_test(dsa, f, mask_reg, fragA_reg);
}
+ /* generate depth and/or stencil test code */
if (dsa->depth.enabled || dsa->stencil[0].enabled) {
- const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
- boolean write_depth_stencil;
-
- int fbZ_reg = spe_allocate_available_register(f); /* Z values */
- int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
-
- /* fetch quad of depth/stencil values from tile at (x,y) */
- /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
- spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
-
- if (dsa->depth.enabled) {
- /* Extract Z bits from fbZS_reg into fbZ_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- int mask_reg = spe_allocate_available_register(f);
- spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */
- spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */
- spe_release_register(f, mask_reg);
- /* OK, fbZ_reg has four 24-bit Z values now */
- }
- else {
- /* XXX handle other z/stencil formats */
- ASSERT(0);
- }
-
- /* Convert fragZ values from float[4] to uint[4] */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM ||
- zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* 24-bit Z values */
- int scale_reg = spe_allocate_available_register(f);
-
- /* scale_reg[0,1,2,3] = float(2^24-1) */
- spe_load_float(f, scale_reg, (float) 0xffffff);
-
- /* XXX these two instructions might be combined */
- spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */
-
- spe_release_register(f, scale_reg);
- }
- else {
- /* XXX handle 16-bit Z format */
- ASSERT(0);
- }
- }
-
- if (dsa->stencil[0].enabled) {
- /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX extract with a shift */
- ASSERT(0);
- }
- else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* XXX extract with a mask */
- ASSERT(0);
- }
- }
-
-
- if (dsa->stencil[0].enabled) {
- /* XXX this may involve depth testing too */
- // gen_stencil_test(dsa, f, ... );
- ASSERT(0);
- }
- else if (dsa->depth.enabled) {
- int zmask_reg = spe_allocate_available_register(f);
- gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
- spe_release_register(f, zmask_reg);
- }
-
- /* do we need to write Z and/or Stencil back into framebuffer? */
- write_depth_stencil = (dsa->depth.writemask |
- dsa->stencil[0].write_mask |
- dsa->stencil[1].write_mask);
-
- if (write_depth_stencil) {
- /* Merge latest Z and Stencil values into fbZS_reg.
- * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
- * fbS_reg has four 8-bit Z values in bits [7..0].
- */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
- }
- else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX to do */
- ASSERT(0);
- }
- else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- /* XXX to do */
- ASSERT(0);
- }
- else if (zs_format == PIPE_FORMAT_S8_UNORM) {
- /* XXX to do */
- ASSERT(0);
- }
- else {
- /* bad zs_format */
- ASSERT(0);
- }
-
- /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
- spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
- }
-
- spe_release_register(f, fbZ_reg);
- spe_release_register(f, fbS_reg);
+ gen_depth_stencil(cell, dsa, f,
+ facing,
+ mask_reg,
+ depth_tile_reg,
+ quad_offset_reg,
+ fragZ_reg);
}
-
/* Get framebuffer quad/colors. We'll need these for blending,
* color masking, and to obey the quad/pixel mask.
* Load: fbRGBA_reg = memory[color_tile + quad_offset]
* Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
* we could skip this load.
*/
+ spe_comment(f, 0, "Fetch quad colors from tile");
spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
-
if (blend->blend_enable) {
+ spe_comment(f, 0, "Perform blending");
gen_blend(blend, blend_color, f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
}
int rgba_reg = spe_allocate_available_register(f);
/* Pack four float colors as four 32-bit int colors */
+ spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
gen_pack_colors(f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg,
rgba_reg);
if (blend->logicop_enable) {
+ spe_comment(f, 0, "Compute logic op");
gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
}
if (blend->colormask != PIPE_MASK_RGBA) {
- gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
+ spe_comment(f, 0, "Compute color mask");
+ gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
}
-
/* Mix fragment colors with framebuffer colors using the quad/pixel mask:
* if (mask[i])
* rgba[i] = rgba[i];
/* Store updated quad in tile:
* memory[color_tile + quad_offset] = rgba_reg;
*/
+ spe_comment(f, 0, "Store quad colors into color tile");
spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
spe_release_register(f, rgba_reg);
spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
-
spe_release_register(f, fbRGBA_reg);
- spe_release_register(f, fbZS_reg);
spe_release_register(f, quad_offset_reg);
if (cell->debug_flags & CELL_DEBUG_ASM) {
- spe_comment(f, -4, "End per-fragment ops");
+ char buffer[1024];
+ sprintf(buffer, "End %s-facing per-fragment ops: %d instructions",
+ facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst);
+ spe_comment(f, -4, buffer);
}
}