1 /**************************************************************************
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
31 * Generate SPU per-fragment code (actually per-quad code).
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
49 * Generate SPE code to perform Z/depth testing.
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
59 gen_depth_test(const struct pipe_depth_stencil_alpha_state
*dsa
,
60 struct spe_function
*f
,
61 int mask_reg
, int ifragZ_reg
, int ifbZ_reg
, int zmask_reg
)
63 ASSERT(dsa
->depth
.enabled
);
65 switch (dsa
->depth
.func
) {
67 /* zmask = (ifragZ == ref) */
68 spe_ceq(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
69 /* mask = (mask & zmask) */
70 spe_and(f
, mask_reg
, mask_reg
, zmask_reg
);
73 case PIPE_FUNC_NOTEQUAL
:
74 /* zmask = (ifragZ == ref) */
75 spe_ceq(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
76 /* mask = (mask & ~zmask) */
77 spe_andc(f
, mask_reg
, mask_reg
, zmask_reg
);
80 case PIPE_FUNC_GREATER
:
81 /* zmask = (ifragZ > ref) */
82 spe_cgt(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
83 /* mask = (mask & zmask) */
84 spe_and(f
, mask_reg
, mask_reg
, zmask_reg
);
88 /* zmask = (ref > ifragZ) */
89 spe_cgt(f
, zmask_reg
, ifbZ_reg
, ifragZ_reg
);
90 /* mask = (mask & zmask) */
91 spe_and(f
, mask_reg
, mask_reg
, zmask_reg
);
94 case PIPE_FUNC_LEQUAL
:
95 /* zmask = (ifragZ > ref) */
96 spe_cgt(f
, zmask_reg
, ifragZ_reg
, ifbZ_reg
);
97 /* mask = (mask & ~zmask) */
98 spe_andc(f
, mask_reg
, mask_reg
, zmask_reg
);
101 case PIPE_FUNC_GEQUAL
:
102 /* zmask = (ref > ifragZ) */
103 spe_cgt(f
, zmask_reg
, ifbZ_reg
, ifragZ_reg
);
104 /* mask = (mask & ~zmask) */
105 spe_andc(f
, mask_reg
, mask_reg
, zmask_reg
);
108 case PIPE_FUNC_NEVER
:
109 spe_il(f
, mask_reg
, 0); /* mask = {0,0,0,0} */
110 spe_move(f
, zmask_reg
, mask_reg
); /* zmask = mask */
113 case PIPE_FUNC_ALWAYS
:
115 spe_il(f
, zmask_reg
, ~0); /* zmask = {~0,~0,~0,~0} */
123 if (dsa
->depth
.writemask
) {
125 * If (ztest passed) {
126 * framebufferZ = fragmentZ;
129 * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
131 spe_selb(f
, ifbZ_reg
, ifbZ_reg
, ifragZ_reg
, mask_reg
);
137 * Generate SPE code to perform alpha testing.
139 * \param dsa Gallium depth/stencil/alpha state to gen code for
140 * \param f SPE function to append instruction onto.
141 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
142 * \param fragA_reg register containing four fragment alpha values (in)
145 gen_alpha_test(const struct pipe_depth_stencil_alpha_state
*dsa
,
146 struct spe_function
*f
, int mask_reg
, int fragA_reg
)
148 int ref_reg
= spe_allocate_available_register(f
);
149 int amask_reg
= spe_allocate_available_register(f
);
151 ASSERT(dsa
->alpha
.enabled
);
153 if ((dsa
->alpha
.func
!= PIPE_FUNC_NEVER
) &&
154 (dsa
->alpha
.func
!= PIPE_FUNC_ALWAYS
)) {
155 /* load/splat the alpha reference float value */
156 spe_load_float(f
, ref_reg
, dsa
->alpha
.ref
);
159 /* emit code to do the alpha comparison, updating 'mask' */
160 switch (dsa
->alpha
.func
) {
161 case PIPE_FUNC_EQUAL
:
162 /* amask = (fragA == ref) */
163 spe_fceq(f
, amask_reg
, fragA_reg
, ref_reg
);
164 /* mask = (mask & amask) */
165 spe_and(f
, mask_reg
, mask_reg
, amask_reg
);
168 case PIPE_FUNC_NOTEQUAL
:
169 /* amask = (fragA == ref) */
170 spe_fceq(f
, amask_reg
, fragA_reg
, ref_reg
);
171 /* mask = (mask & ~amask) */
172 spe_andc(f
, mask_reg
, mask_reg
, amask_reg
);
175 case PIPE_FUNC_GREATER
:
176 /* amask = (fragA > ref) */
177 spe_fcgt(f
, amask_reg
, fragA_reg
, ref_reg
);
178 /* mask = (mask & amask) */
179 spe_and(f
, mask_reg
, mask_reg
, amask_reg
);
183 /* amask = (ref > fragA) */
184 spe_fcgt(f
, amask_reg
, ref_reg
, fragA_reg
);
185 /* mask = (mask & amask) */
186 spe_and(f
, mask_reg
, mask_reg
, amask_reg
);
189 case PIPE_FUNC_LEQUAL
:
190 /* amask = (fragA > ref) */
191 spe_fcgt(f
, amask_reg
, fragA_reg
, ref_reg
);
192 /* mask = (mask & ~amask) */
193 spe_andc(f
, mask_reg
, mask_reg
, amask_reg
);
196 case PIPE_FUNC_GEQUAL
:
197 /* amask = (ref > fragA) */
198 spe_fcgt(f
, amask_reg
, ref_reg
, fragA_reg
);
199 /* mask = (mask & ~amask) */
200 spe_andc(f
, mask_reg
, mask_reg
, amask_reg
);
203 case PIPE_FUNC_NEVER
:
204 spe_il(f
, mask_reg
, 0); /* mask = [0,0,0,0] */
207 case PIPE_FUNC_ALWAYS
:
208 /* no-op, mask unchanged */
217 /* if mask == {0,0,0,0} we're all done, return */
219 /* re-use amask reg here */
220 int tmp_reg
= amask_reg
;
221 /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
222 spe_orx(f
, tmp_reg
, mask_reg
);
223 /* if tmp[0] == 0 then return from function call */
224 spe_biz(f
, tmp_reg
, SPE_REG_RA
, 0, 0);
228 spe_release_register(f
, ref_reg
);
229 spe_release_register(f
, amask_reg
);
235 * Generate SPE code to implement the fragment operations (alpha test,
236 * depth test, stencil test, blending, colormask, and final
237 * framebuffer write) as specified by the current context state.
239 * Logically, this code will be called after running the fragment
240 * shader. But under some circumstances we could run some of this
241 * code before the fragment shader to cull fragments/quads that are
242 * totally occluded/discarded.
244 * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
246 * See the spu_default_fragment_ops() function to see how the per-fragment
247 * operations would be done with ordinary C code.
248 * The code we generate here though has no branches, is SIMD, etc and
249 * should be much faster.
251 * \param cell the rendering context (in)
252 * \param f the generated function (out)
255 gen_fragment_function(struct cell_context
*cell
, struct spe_function
*f
)
257 const struct pipe_depth_stencil_alpha_state
*dsa
=
258 &cell
->depth_stencil
->base
;
259 const struct pipe_blend_state
*blend
= &cell
->blend
->base
;
261 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
262 const int x_reg
= 3; /* uint */
263 const int y_reg
= 4; /* uint */
264 const int color_tile_reg
= 5; /* tile_t * */
265 const int depth_tile_reg
= 6; /* tile_t * */
266 const int fragZ_reg
= 7; /* vector float */
267 const int fragR_reg
= 8; /* vector float */
268 const int fragG_reg
= 9; /* vector float */
269 const int fragB_reg
= 10; /* vector float */
270 const int fragA_reg
= 11; /* vector float */
271 const int mask_reg
= 12; /* vector uint */
273 /* offset of quad from start of tile
274 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
278 int fbRGBA_reg
; /**< framebuffer's RGBA colors for quad */
279 int fbZS_reg
; /**< framebuffer's combined z/stencil values for quad */
281 spe_init_func(f
, SPU_MAX_FRAGMENT_OPS_INSTS
* SPE_INST_SIZE
);
282 spe_allocate_register(f
, x_reg
);
283 spe_allocate_register(f
, y_reg
);
284 spe_allocate_register(f
, color_tile_reg
);
285 spe_allocate_register(f
, depth_tile_reg
);
286 spe_allocate_register(f
, fragZ_reg
);
287 spe_allocate_register(f
, fragR_reg
);
288 spe_allocate_register(f
, fragG_reg
);
289 spe_allocate_register(f
, fragB_reg
);
290 spe_allocate_register(f
, fragA_reg
);
291 spe_allocate_register(f
, mask_reg
);
293 quad_offset_reg
= spe_allocate_available_register(f
);
294 fbRGBA_reg
= spe_allocate_available_register(f
);
295 fbZS_reg
= spe_allocate_available_register(f
);
297 /* compute offset of quad from start of tile, in bytes */
299 int x2_reg
= spe_allocate_available_register(f
);
300 int y2_reg
= spe_allocate_available_register(f
);
302 ASSERT(TILE_SIZE
== 32);
304 spe_rotmi(f
, x2_reg
, x_reg
, -1); /* x2 = x / 2 */
305 spe_rotmi(f
, y2_reg
, y_reg
, -1); /* y2 = y / 2 */
306 spe_shli(f
, y2_reg
, y2_reg
, 4); /* y2 *= 16 */
307 spe_a(f
, quad_offset_reg
, y2_reg
, x2_reg
); /* offset = y2 + x2 */
308 spe_shli(f
, quad_offset_reg
, quad_offset_reg
, 4); /* offset *= 16 */
310 spe_release_register(f
, x2_reg
);
311 spe_release_register(f
, y2_reg
);
315 if (dsa
->alpha
.enabled
) {
316 gen_alpha_test(dsa
, f
, mask_reg
, fragA_reg
);
319 if (dsa
->depth
.enabled
|| dsa
->stencil
[0].enabled
) {
320 const enum pipe_format zs_format
= cell
->framebuffer
.zsbuf
->format
;
321 boolean write_depth_stencil
;
323 int fbZ_reg
= spe_allocate_available_register(f
); /* Z values */
324 int fbS_reg
= spe_allocate_available_register(f
); /* Stencil values */
326 /* fetch quad of depth/stencil values from tile at (x,y) */
327 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
328 spe_lqx(f
, fbZS_reg
, depth_tile_reg
, quad_offset_reg
);
330 if (dsa
->depth
.enabled
) {
331 /* Extract Z bits from fbZS_reg into fbZ_reg */
332 if (zs_format
== PIPE_FORMAT_S8Z24_UNORM
||
333 zs_format
== PIPE_FORMAT_X8Z24_UNORM
) {
334 int mask_reg
= spe_allocate_available_register(f
);
335 spe_fsmbi(f
, mask_reg
, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */
336 spe_and(f
, fbZ_reg
, fbZS_reg
, mask_reg
); /* fbZ = fbZS & mask */
337 spe_release_register(f
, mask_reg
);
338 /* OK, fbZ_reg has four 24-bit Z values now */
341 /* XXX handle other z/stencil formats */
345 /* Convert fragZ values from float[4] to uint[4] */
346 if (zs_format
== PIPE_FORMAT_S8Z24_UNORM
||
347 zs_format
== PIPE_FORMAT_X8Z24_UNORM
||
348 zs_format
== PIPE_FORMAT_Z24S8_UNORM
||
349 zs_format
== PIPE_FORMAT_Z24X8_UNORM
) {
350 /* 24-bit Z values */
351 int scale_reg
= spe_allocate_available_register(f
);
353 /* scale_reg[0,1,2,3] = float(2^24-1) */
354 spe_load_float(f
, scale_reg
, (float) 0xffffff);
356 /* XXX these two instructions might be combined */
357 spe_fm(f
, fragZ_reg
, fragZ_reg
, scale_reg
); /* fragZ *= scale */
358 spe_cfltu(f
, fragZ_reg
, fragZ_reg
, 0); /* fragZ = (int) fragZ */
360 spe_release_register(f
, scale_reg
);
363 /* XXX handle 16-bit Z format */
368 if (dsa
->stencil
[0].enabled
) {
369 /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
370 if (zs_format
== PIPE_FORMAT_S8Z24_UNORM
||
371 zs_format
== PIPE_FORMAT_X8Z24_UNORM
) {
372 /* XXX extract with a shift */
375 else if (zs_format
== PIPE_FORMAT_Z24S8_UNORM
||
376 zs_format
== PIPE_FORMAT_Z24X8_UNORM
) {
377 /* XXX extract with a mask */
383 if (dsa
->stencil
[0].enabled
) {
384 /* XXX this may involve depth testing too */
385 // gen_stencil_test(dsa, f, ... );
388 else if (dsa
->depth
.enabled
) {
389 int zmask_reg
= spe_allocate_available_register(f
);
390 gen_depth_test(dsa
, f
, mask_reg
, fragZ_reg
, fbZ_reg
, zmask_reg
);
391 spe_release_register(f
, zmask_reg
);
394 /* do we need to write Z and/or Stencil back into framebuffer? */
395 write_depth_stencil
= (dsa
->depth
.writemask
|
396 dsa
->stencil
[0].write_mask
|
397 dsa
->stencil
[1].write_mask
);
399 if (write_depth_stencil
) {
400 /* Merge latest Z and Stencil values into fbZS_reg.
401 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
402 * fbS_reg has four 8-bit Z values in bits [7..0].
404 if (zs_format
== PIPE_FORMAT_S8Z24_UNORM
||
405 zs_format
== PIPE_FORMAT_X8Z24_UNORM
) {
406 spe_shli(f
, fbS_reg
, fbS_reg
, 24); /* fbS = fbS << 24 */
407 spe_or(f
, fbZS_reg
, fbS_reg
, fbZ_reg
); /* fbZS = fbS | fbZ */
409 else if (zs_format
== PIPE_FORMAT_S8Z24_UNORM
||
410 zs_format
== PIPE_FORMAT_X8Z24_UNORM
) {
414 else if (zs_format
== PIPE_FORMAT_Z16_UNORM
) {
418 else if (zs_format
== PIPE_FORMAT_S8_UNORM
) {
427 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
428 spe_stqx(f
, fbZS_reg
, depth_tile_reg
, quad_offset_reg
);
431 spe_release_register(f
, fbZ_reg
);
432 spe_release_register(f
, fbS_reg
);
436 /* Get framebuffer quad/colors. We'll need these for blending,
437 * color masking, and to obey the quad/pixel mask.
438 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
439 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
440 * we could skip this load.
442 spe_lqx(f
, fbRGBA_reg
, color_tile_reg
, quad_offset_reg
);
445 if (blend
->blend_enable
) {
446 /* convert packed tile colors in fbRGBA_reg to float[4] vectors */
448 // gen_blend_code(blend, f, mask_reg, ... );
455 * Write fragment colors to framebuffer/tile.
456 * This involves converting the fragment colors from float[4] to the
457 * tile's specific format and obeying the quad/pixel mask.
460 const enum pipe_format color_format
= cell
->framebuffer
.cbufs
[0]->format
;
461 int rgba_reg
= spe_allocate_available_register(f
);
463 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
464 spe_cfltu(f
, fragR_reg
, fragR_reg
, 32);
465 spe_cfltu(f
, fragG_reg
, fragG_reg
, 32);
466 spe_cfltu(f
, fragB_reg
, fragB_reg
, 32);
467 spe_cfltu(f
, fragA_reg
, fragA_reg
, 32);
469 /* Shift most the significant bytes to least the significant positions.
470 * I.e.: reg = reg >> 24
472 spe_rotmi(f
, fragR_reg
, fragR_reg
, -24);
473 spe_rotmi(f
, fragG_reg
, fragG_reg
, -24);
474 spe_rotmi(f
, fragB_reg
, fragB_reg
, -24);
475 spe_rotmi(f
, fragA_reg
, fragA_reg
, -24);
477 /* Shift the color bytes according to the surface format */
478 if (color_format
== PIPE_FORMAT_A8R8G8B8_UNORM
) {
479 spe_roti(f
, fragG_reg
, fragG_reg
, 8); /* green <<= 8 */
480 spe_roti(f
, fragR_reg
, fragR_reg
, 16); /* red <<= 16 */
481 spe_roti(f
, fragA_reg
, fragA_reg
, 24); /* alpha <<= 24 */
483 else if (color_format
== PIPE_FORMAT_B8G8R8A8_UNORM
) {
484 spe_roti(f
, fragR_reg
, fragR_reg
, 8); /* red <<= 8 */
485 spe_roti(f
, fragG_reg
, fragG_reg
, 16); /* green <<= 16 */
486 spe_roti(f
, fragB_reg
, fragB_reg
, 24); /* blue <<= 24 */
492 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
493 * Eg: after shifting according to color_format we might have:
494 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
495 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
496 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
497 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
498 * OR-ing all those together gives us four packed colors:
499 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
501 spe_or(f
, rgba_reg
, fragR_reg
, fragG_reg
);
502 spe_or(f
, rgba_reg
, rgba_reg
, fragB_reg
);
503 spe_or(f
, rgba_reg
, rgba_reg
, fragA_reg
);
505 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
509 * rgba[i] = framebuffer[i];
511 spe_selb(f
, rgba_reg
, fbRGBA_reg
, rgba_reg
, mask_reg
);
513 /* Store updated quad in tile:
514 * memory[color_tile + quad_offset] = rgba_reg;
516 spe_stqx(f
, rgba_reg
, color_tile_reg
, quad_offset_reg
);
518 spe_release_register(f
, rgba_reg
);
521 printf("gen_fragment_ops nr instructions: %u\n", f
->num_inst
);
523 spe_bi(f
, SPE_REG_RA
, 0, 0); /* return from function call */
526 spe_release_register(f
, fbRGBA_reg
);
527 spe_release_register(f
, fbZS_reg
);
528 spe_release_register(f
, quad_offset_reg
);