cell: checkpoint commit of new per-fragment processing
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fragment.c
/**************************************************************************
 *
 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/



/**
 * Generate SPU per-fragment code (actually per-quad code).
 * \author Brian Paul
 */


#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "rtasm/rtasm_ppc_spe.h"
#include "cell_context.h"
#include "cell_gen_fragment.h"



/** Do extra optimizations? */
#define OPTIMIZATIONS 1

/**
 * Generate SPE code to perform Z/depth testing.
 *
 * \param dsa         Gallium depth/stencil/alpha state to gen code for
 * \param f           SPE function to append instructions onto
 * \param mask_reg    register containing quad/pixel "alive" mask (in/out)
 * \param ifragZ_reg  register containing integer fragment Z values (in)
 * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
 * \param zmask_reg   register containing result of Z test/comparison (out)
 */
static void
gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
               struct spe_function *f,
               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
{
   ASSERT(dsa->depth.enabled);

   switch (dsa->depth.func) {
   case PIPE_FUNC_EQUAL:
      /* zmask = (ifragZ == ifbZ) */
      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & zmask) */
      spe_and(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_NOTEQUAL:
      /* zmask = (ifragZ == ifbZ) */
      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & ~zmask) */
      spe_andc(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_GREATER:
      /* zmask = (ifragZ > ifbZ) */
      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & zmask) */
      spe_and(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_LESS:
      /* zmask = (ifbZ > ifragZ) */
      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
      /* mask = (mask & zmask) */
      spe_and(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_LEQUAL:
      /* zmask = (ifragZ > ifbZ) */
      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & ~zmask) */
      spe_andc(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_GEQUAL:
      /* zmask = (ifbZ > ifragZ) */
      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
      /* mask = (mask & ~zmask) */
      spe_andc(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_NEVER:
      spe_il(f, mask_reg, 0);            /* mask = {0,0,0,0} */
      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
      break;

   case PIPE_FUNC_ALWAYS:
      /* mask unchanged */
      spe_il(f, zmask_reg, ~0);          /* zmask = {~0,~0,~0,~0} */
      break;

   default:
      ASSERT(0);
      break;
   }

   if (dsa->depth.writemask) {
      /*
       * If (ztest passed) {
       *    framebufferZ = fragmentZ;
       * }
       * OR,
       * framebufferZ = ztest_passed ? fragmentZ : framebufferZ;
       */
      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
   }
}
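
/*
 * For reference, a scalar-C sketch of what the generated code above
 * computes for one pixel (illustrative only; 'z_compare' is a
 * hypothetical helper, and spu_default_fragment_ops() is the real
 * C fallback for these operations):
 *
 *    uint zmask = z_compare(dsa->depth.func, ifragZ, ifbZ) ? ~0u : 0u;
 *    mask &= zmask;
 *    if (dsa->depth.writemask)
 *       ifbZ = mask ? ifragZ : ifbZ;
 */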


/**
 * Generate SPE code to perform alpha testing.
 *
 * \param dsa        Gallium depth/stencil/alpha state to gen code for
 * \param f          SPE function to append instructions onto
 * \param mask_reg   register containing quad/pixel "alive" mask (in/out)
 * \param fragA_reg  register containing four fragment alpha values (in)
 */
static void
gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
               struct spe_function *f, int mask_reg, int fragA_reg)
{
   int ref_reg = spe_allocate_available_register(f);
   int amask_reg = spe_allocate_available_register(f);

   ASSERT(dsa->alpha.enabled);

   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
      /* load/splat the alpha reference float value */
      spe_load_float(f, ref_reg, dsa->alpha.ref);
   }

   /* emit code to do the alpha comparison, updating 'mask' */
   switch (dsa->alpha.func) {
   case PIPE_FUNC_EQUAL:
      /* amask = (fragA == ref) */
      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & amask) */
      spe_and(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_NOTEQUAL:
      /* amask = (fragA == ref) */
      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & ~amask) */
      spe_andc(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_GREATER:
      /* amask = (fragA > ref) */
      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & amask) */
      spe_and(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_LESS:
      /* amask = (ref > fragA) */
      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
      /* mask = (mask & amask) */
      spe_and(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_LEQUAL:
      /* amask = (fragA > ref) */
      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & ~amask) */
      spe_andc(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_GEQUAL:
      /* amask = (ref > fragA) */
      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
      /* mask = (mask & ~amask) */
      spe_andc(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_NEVER:
      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
      break;

   case PIPE_FUNC_ALWAYS:
      /* no-op, mask unchanged */
      break;

   default:
      ASSERT(0);
      break;
   }

#if OPTIMIZATIONS
   /* if mask == {0,0,0,0} we're all done, return */
   {
      /* re-use amask reg here */
      int tmp_reg = amask_reg;
      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
      spe_orx(f, tmp_reg, mask_reg);
      /* if tmp[0] == 0 then return from function call */
      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
   }
#endif

   spe_release_register(f, ref_reg);
   spe_release_register(f, amask_reg);
}
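
/*
 * Scalar-C sketch of the alpha test generated above, for one pixel
 * (illustrative only; 'alpha_compare' is a hypothetical helper):
 *
 *    uint amask = alpha_compare(dsa->alpha.func, fragA, dsa->alpha.ref)
 *                 ? ~0u : 0u;
 *    mask &= amask;
 *
 * With OPTIMIZATIONS on, the generated code then returns early when all
 * four pixels of the quad are dead (mask == {0,0,0,0}).
 */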


/**
 * Generate SPE code to implement the fragment operations (alpha test,
 * depth test, stencil test, blending, colormask, and final
 * framebuffer write) as specified by the current context state.
 *
 * Logically, this code will be called after running the fragment
 * shader.  But under some circumstances we could run some of this
 * code before the fragment shader to cull fragments/quads that are
 * totally occluded/discarded.
 *
 * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
 *
 * See the spu_default_fragment_ops() function to see how the per-fragment
 * operations would be done with ordinary C code.
 * The code we generate here, though, has no branches, is SIMD, etc. and
 * should be much faster.
 *
 * \param cell  the rendering context (in)
 * \param f     the generated function (out)
 */
void
gen_fragment_function(struct cell_context *cell, struct spe_function *f)
{
   const struct pipe_depth_stencil_alpha_state *dsa =
      &cell->depth_stencil->base;
   const struct pipe_blend_state *blend = &cell->blend->base;

   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
   const int x_reg = 3;           /* uint */
   const int y_reg = 4;           /* uint */
   const int color_tile_reg = 5;  /* tile_t * */
   const int depth_tile_reg = 6;  /* tile_t * */
   const int fragZ_reg = 7;       /* vector float */
   const int fragR_reg = 8;       /* vector float */
   const int fragG_reg = 9;       /* vector float */
   const int fragB_reg = 10;      /* vector float */
   const int fragA_reg = 11;      /* vector float */
   const int mask_reg = 12;       /* vector uint */

   /* offset of quad from start of tile
    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
    */
   int quad_offset_reg;

   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */

   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
   spe_allocate_register(f, x_reg);
   spe_allocate_register(f, y_reg);
   spe_allocate_register(f, color_tile_reg);
   spe_allocate_register(f, depth_tile_reg);
   spe_allocate_register(f, fragZ_reg);
   spe_allocate_register(f, fragR_reg);
   spe_allocate_register(f, fragG_reg);
   spe_allocate_register(f, fragB_reg);
   spe_allocate_register(f, fragA_reg);
   spe_allocate_register(f, mask_reg);

   quad_offset_reg = spe_allocate_available_register(f);
   fbRGBA_reg = spe_allocate_available_register(f);
   fbZS_reg = spe_allocate_available_register(f);

   /* compute offset of quad from start of tile, in bytes */
   {
      int x2_reg = spe_allocate_available_register(f);
      int y2_reg = spe_allocate_available_register(f);

      ASSERT(TILE_SIZE == 32);

      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
      spe_a(f, quad_offset_reg, y2_reg, x2_reg);         /* offset = y2 + x2 */
      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);  /* offset *= 16 */

      spe_release_register(f, x2_reg);
      spe_release_register(f, y2_reg);
   }
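
   /*
    * Worked example of the offset math above (assuming the layout implied
    * by TILE_SIZE == 32: the 32x32-pixel tile is stored as a 16x16 grid of
    * 2x2-pixel quads, 4 pixels * 4 bytes = 16 bytes per quad):
    *
    *    (x,y) = (10,20)  ->  x2 = 5, y2 = 10
    *    quad index  = y2 * 16 + x2 = 165
    *    byte offset = 165 * 16 = 2640
    */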


   if (dsa->alpha.enabled) {
      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
   }

   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
      boolean write_depth_stencil;

      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */

      /* fetch quad of depth/stencil values from tile at (x,y) */
      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);

      if (dsa->depth.enabled) {
         /* Extract Z bits from fbZS_reg into fbZ_reg */
         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
            /* note: don't shadow the function-level mask_reg here */
            int zmask24_reg = spe_allocate_available_register(f);
            spe_fsmbi(f, zmask24_reg, 0x7777);  /* zmask24[0,1,2,3] = 0x00ffffff */
            spe_and(f, fbZ_reg, fbZS_reg, zmask24_reg);  /* fbZ = fbZS & zmask24 */
            spe_release_register(f, zmask24_reg);
            /* OK, fbZ_reg has four 24-bit Z values now */
         }
         else {
            /* XXX handle other z/stencil formats */
            ASSERT(0);
         }

         /* Convert fragZ values from float[4] to uint[4] */
         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
            /* 24-bit Z values */
            int scale_reg = spe_allocate_available_register(f);

            /* scale_reg[0,1,2,3] = float(2^24-1) */
            spe_load_float(f, scale_reg, (float) 0xffffff);

            /* XXX these two instructions might be combined */
            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg);  /* fragZ *= scale */
            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);       /* fragZ = (int) fragZ */

            spe_release_register(f, scale_reg);
         }
         else {
            /* XXX handle 16-bit Z format */
            ASSERT(0);
         }
      }
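
      /*
       * Scalar-C sketch of the extraction and conversion above, for the
       * S8Z24/X8Z24 case (illustrative only):
       *
       *    uint fbZ   = fbZS & 0x00ffffff;                   -- low 24 Z bits
       *    uint fragZ = (uint) (fragZf * (float) 0xffffff);  -- [0,1] -> 24-bit
       */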

      if (dsa->stencil[0].enabled) {
         /* Extract stencil bits from fbZS_reg into fbS_reg */
         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
            /* XXX extract with a shift */
            ASSERT(0);
         }
         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
            /* XXX extract with a mask */
            ASSERT(0);
         }
      }


      if (dsa->stencil[0].enabled) {
         /* XXX this may involve depth testing too */
         // gen_stencil_test(dsa, f, ... );
         ASSERT(0);
      }
      else if (dsa->depth.enabled) {
         int zmask_reg = spe_allocate_available_register(f);
         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
         spe_release_register(f, zmask_reg);
      }

      /* do we need to write Z and/or Stencil back into framebuffer? */
      write_depth_stencil = (dsa->depth.writemask |
                             dsa->stencil[0].write_mask |
                             dsa->stencil[1].write_mask);

      if (write_depth_stencil) {
         /* Merge latest Z and Stencil values into fbZS_reg.
          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
          * fbS_reg has four 8-bit stencil values in bits [7..0].
          */
         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
            spe_shli(f, fbS_reg, fbS_reg, 24);      /* fbS = fbS << 24 */
            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg);  /* fbZS = fbS | fbZ */
         }
         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
            /* XXX to do */
            ASSERT(0);
         }
         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
            /* XXX to do */
            ASSERT(0);
         }
         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
            /* XXX to do */
            ASSERT(0);
         }
         else {
            /* bad zs_format */
            ASSERT(0);
         }

         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
      }
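
      /*
       * Merge example for the S8Z24 path above (illustrative): with
       * stencil S = 0x42 and Z = 0x123456, the packed value is
       *
       *    fbZS = (S << 24) | Z = 0x42123456
       */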

      spe_release_register(f, fbZ_reg);
      spe_release_register(f, fbS_reg);
   }


   /* Get framebuffer quad/colors.  We'll need these for blending,
    * color masking, and to obey the quad/pixel mask.
    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
    * we could skip this load.
    */
   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);


   if (blend->blend_enable) {
      /* convert packed tile colors in fbRGBA_reg to float[4] vectors */

      // gen_blend_code(blend, f, mask_reg, ... );

   }
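
   /*
    * Sketch of what gen_blend_code() will eventually have to compute, in
    * scalar C (hypothetical; standard source/dest factor blending):
    *
    *    float src = src_factor(blend->rgb_src_factor, ...);
    *    float dst = dst_factor(blend->rgb_dst_factor, ...);
    *    fragColor = fragColor * src + fbColor * dst;
    *
    * where fbColor comes from unpacking fbRGBA_reg to float[4] vectors,
    * as noted above.
    */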


   /*
    * Write fragment colors to framebuffer/tile.
    * This involves converting the fragment colors from float[4] to the
    * tile's specific format and obeying the quad/pixel mask.
    */
   {
      const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
      int rgba_reg = spe_allocate_available_register(f);

      /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
      spe_cfltu(f, fragR_reg, fragR_reg, 32);
      spe_cfltu(f, fragG_reg, fragG_reg, 32);
      spe_cfltu(f, fragB_reg, fragB_reg, 32);
      spe_cfltu(f, fragA_reg, fragA_reg, 32);

      /* Shift the most significant bytes to the least significant positions.
       * I.e.: reg = reg >> 24
       */
      spe_rotmi(f, fragR_reg, fragR_reg, -24);
      spe_rotmi(f, fragG_reg, fragG_reg, -24);
      spe_rotmi(f, fragB_reg, fragB_reg, -24);
      spe_rotmi(f, fragA_reg, fragA_reg, -24);

      /* Shift the color bytes according to the surface format */
      if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
         spe_roti(f, fragG_reg, fragG_reg, 8);   /* green <<= 8 */
         spe_roti(f, fragR_reg, fragR_reg, 16);  /* red <<= 16 */
         spe_roti(f, fragA_reg, fragA_reg, 24);  /* alpha <<= 24 */
      }
      else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
         spe_roti(f, fragR_reg, fragR_reg, 8);   /* red <<= 8 */
         spe_roti(f, fragG_reg, fragG_reg, 16);  /* green <<= 16 */
         spe_roti(f, fragB_reg, fragB_reg, 24);  /* blue <<= 24 */
      }
      else {
         ASSERT(0);
      }

      /* Merge red, green, blue, alpha registers to make packed RGBA colors.
       * Eg: after shifting according to color_format we might have:
       *    R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
       *    G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
       *    B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
       *    A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
       * OR-ing all those together gives us four packed colors:
       *    RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
       */
      spe_or(f, rgba_reg, fragR_reg, fragG_reg);
      spe_or(f, rgba_reg, rgba_reg, fragB_reg);
      spe_or(f, rgba_reg, rgba_reg, fragA_reg);

      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
       * if (mask[i])
       *    rgba[i] = rgba[i];
       * else
       *    rgba[i] = framebuffer[i];
       */
      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);

      /* Store updated quad in tile:
       * memory[color_tile + quad_offset] = rgba_reg;
       */
      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);

      spe_release_register(f, rgba_reg);
   }

   printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);

   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */


   spe_release_register(f, fbRGBA_reg);
   spe_release_register(f, fbZS_reg);
   spe_release_register(f, quad_offset_reg);
}
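

/*
 * Typical usage sketch (illustrative; the exact call site lives elsewhere
 * in the CELL driver, and spe_release_func() from rtasm is assumed):
 *
 *    struct spe_function f;
 *    gen_fragment_function(cell, &f);
 *    // ... DMA f's generated code to the SPUs ...
 *    spe_release_func(&f);
 */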