cell: re-order the z/stencil fetch/extract/convert instructions for better perf
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fragment.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009 VMware, Inc. All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * Generate SPU per-fragment code (actually per-quad code).
31 * \author Brian Paul
32 * \author Bob Ellison
33 */
34
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
41
42
43
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
46
47
48 /**
49 * Generate SPE code to perform Z/depth testing.
50 *
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
57 *
58 * Returns TRUE if the Z-buffer needs to be updated.
59 */
60 static boolean
61 gen_depth_test(struct spe_function *f,
62 const struct pipe_depth_stencil_alpha_state *dsa,
63 int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
64 {
65 /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
66 * quantities. This only makes a difference for 32-bit Z values though.
67 */
68 ASSERT(dsa->depth.enabled);
69
70 switch (dsa->depth.func) {
71 case PIPE_FUNC_EQUAL:
72 /* zmask = (ifragZ == ref) */
73 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
74 /* mask = (mask & zmask) */
75 spe_and(f, mask_reg, mask_reg, zmask_reg);
76 break;
77
78 case PIPE_FUNC_NOTEQUAL:
79 /* zmask = (ifragZ == ref) */
80 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
81 /* mask = (mask & ~zmask) */
82 spe_andc(f, mask_reg, mask_reg, zmask_reg);
83 break;
84
85 case PIPE_FUNC_GREATER:
86 /* zmask = (ifragZ > ref) */
87 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
88 /* mask = (mask & zmask) */
89 spe_and(f, mask_reg, mask_reg, zmask_reg);
90 break;
91
92 case PIPE_FUNC_LESS:
93 /* zmask = (ref > ifragZ) */
94 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
95 /* mask = (mask & zmask) */
96 spe_and(f, mask_reg, mask_reg, zmask_reg);
97 break;
98
99 case PIPE_FUNC_LEQUAL:
100 /* zmask = (ifragZ > ref) */
101 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
102 /* mask = (mask & ~zmask) */
103 spe_andc(f, mask_reg, mask_reg, zmask_reg);
104 break;
105
106 case PIPE_FUNC_GEQUAL:
107 /* zmask = (ref > ifragZ) */
108 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
109 /* mask = (mask & ~zmask) */
110 spe_andc(f, mask_reg, mask_reg, zmask_reg);
111 break;
112
113 case PIPE_FUNC_NEVER:
114 spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */
115 spe_move(f, zmask_reg, mask_reg); /* zmask = mask */
116 break;
117
118 case PIPE_FUNC_ALWAYS:
119 /* mask unchanged */
120 spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */
121 break;
122
123 default:
124 ASSERT(0);
125 break;
126 }
127
128 if (dsa->depth.writemask) {
129 /*
130 * If (ztest passed) {
131 * framebufferZ = fragmentZ;
132 * }
133 * OR,
134 * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
135 */
136 spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
137 return TRUE;
138 }
139
140 return FALSE;
141 }
142
143
144 /**
145 * Generate SPE code to perform alpha testing.
146 *
147 * \param dsa Gallium depth/stencil/alpha state to gen code for
148 * \param f SPE function to append instruction onto.
149 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
150 * \param fragA_reg register containing four fragment alpha values (in)
151 */
152 static void
153 gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
154 struct spe_function *f, int mask_reg, int fragA_reg)
155 {
156 int ref_reg = spe_allocate_available_register(f);
157 int amask_reg = spe_allocate_available_register(f);
158
159 ASSERT(dsa->alpha.enabled);
160
161 if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
162 (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
163 /* load/splat the alpha reference float value */
164 spe_load_float(f, ref_reg, dsa->alpha.ref);
165 }
166
167 /* emit code to do the alpha comparison, updating 'mask' */
168 switch (dsa->alpha.func) {
169 case PIPE_FUNC_EQUAL:
170 /* amask = (fragA == ref) */
171 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
172 /* mask = (mask & amask) */
173 spe_and(f, mask_reg, mask_reg, amask_reg);
174 break;
175
176 case PIPE_FUNC_NOTEQUAL:
177 /* amask = (fragA == ref) */
178 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
179 /* mask = (mask & ~amask) */
180 spe_andc(f, mask_reg, mask_reg, amask_reg);
181 break;
182
183 case PIPE_FUNC_GREATER:
184 /* amask = (fragA > ref) */
185 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
186 /* mask = (mask & amask) */
187 spe_and(f, mask_reg, mask_reg, amask_reg);
188 break;
189
190 case PIPE_FUNC_LESS:
191 /* amask = (ref > fragA) */
192 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
193 /* mask = (mask & amask) */
194 spe_and(f, mask_reg, mask_reg, amask_reg);
195 break;
196
197 case PIPE_FUNC_LEQUAL:
198 /* amask = (fragA > ref) */
199 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
200 /* mask = (mask & ~amask) */
201 spe_andc(f, mask_reg, mask_reg, amask_reg);
202 break;
203
204 case PIPE_FUNC_GEQUAL:
205 /* amask = (ref > fragA) */
206 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
207 /* mask = (mask & ~amask) */
208 spe_andc(f, mask_reg, mask_reg, amask_reg);
209 break;
210
211 case PIPE_FUNC_NEVER:
212 spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */
213 break;
214
215 case PIPE_FUNC_ALWAYS:
216 /* no-op, mask unchanged */
217 break;
218
219 default:
220 ASSERT(0);
221 break;
222 }
223
224 #if OPTIMIZATIONS
225 /* if mask == {0,0,0,0} we're all done, return */
226 {
227 /* re-use amask reg here */
228 int tmp_reg = amask_reg;
229 /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
230 spe_orx(f, tmp_reg, mask_reg);
231 /* if tmp[0] == 0 then return from function call */
232 spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
233 }
234 #endif
235
236 spe_release_register(f, ref_reg);
237 spe_release_register(f, amask_reg);
238 }
239
240
241 /**
242 * This pair of functions is used inline to allocate and deallocate
243 * optional constant registers. Once a constant is discovered to be
244 * needed, we will likely need it again, so we don't want to deallocate
245 * it and have to allocate and load it again unnecessarily.
246 */
247 static INLINE void
248 setup_optional_register(struct spe_function *f,
249 int *r)
250 {
251 if (*r < 0)
252 *r = spe_allocate_available_register(f);
253 }
254
255 static INLINE void
256 release_optional_register(struct spe_function *f,
257 int r)
258 {
259 if (r >= 0)
260 spe_release_register(f, r);
261 }
262
263 static INLINE void
264 setup_const_register(struct spe_function *f,
265 int *r,
266 float value)
267 {
268 if (*r >= 0)
269 return;
270 setup_optional_register(f, r);
271 spe_load_float(f, *r, value);
272 }
273
274 static INLINE void
275 release_const_register(struct spe_function *f,
276 int r)
277 {
278 release_optional_register(f, r);
279 }
280
281 /**
282 * Generate SPE code to implement the given blend mode for a quad of pixels.
283 * \param f SPE function to append instruction onto.
284 * \param fragR_reg register with fragment red values (float) (in/out)
285 * \param fragG_reg register with fragment green values (float) (in/out)
286 * \param fragB_reg register with fragment blue values (float) (in/out)
287 * \param fragA_reg register with fragment alpha values (float) (in/out)
288 * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
289 */
290 static void
291 gen_blend(const struct pipe_blend_state *blend,
292 const struct pipe_blend_color *blend_color,
293 struct spe_function *f,
294 enum pipe_format color_format,
295 int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
296 int fbRGBA_reg)
297 {
298 int term1R_reg = spe_allocate_available_register(f);
299 int term1G_reg = spe_allocate_available_register(f);
300 int term1B_reg = spe_allocate_available_register(f);
301 int term1A_reg = spe_allocate_available_register(f);
302
303 int term2R_reg = spe_allocate_available_register(f);
304 int term2G_reg = spe_allocate_available_register(f);
305 int term2B_reg = spe_allocate_available_register(f);
306 int term2A_reg = spe_allocate_available_register(f);
307
308 int fbR_reg = spe_allocate_available_register(f);
309 int fbG_reg = spe_allocate_available_register(f);
310 int fbB_reg = spe_allocate_available_register(f);
311 int fbA_reg = spe_allocate_available_register(f);
312
313 int tmp_reg = spe_allocate_available_register(f);
314
315 /* Optional constant registers we might or might not end up using;
316 * if we do use them, make sure we only allocate them once by
317 * keeping a flag on each one.
318 */
319 int one_reg = -1;
320 int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1;
321
322 ASSERT(blend->blend_enable);
323
324 /* Unpack/convert framebuffer colors from four 32-bit packed colors
325 * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
326 * Each 8-bit color component is expanded into a float in [0.0, 1.0].
327 */
328 {
329 int mask_reg = spe_allocate_available_register(f);
330
331 /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
332 spe_load_int(f, mask_reg, 0xff);
333
334 /* XXX there may be more clever ways to implement the following code */
335 switch (color_format) {
336 case PIPE_FORMAT_A8R8G8B8_UNORM:
337 /* fbB = fbB & mask */
338 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
339 /* mask = mask << 8 */
340 spe_roti(f, mask_reg, mask_reg, 8);
341
342 /* fbG = fbRGBA & mask */
343 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
344 /* fbG = fbG >> 8 */
345 spe_roti(f, fbG_reg, fbG_reg, -8);
346 /* mask = mask << 8 */
347 spe_roti(f, mask_reg, mask_reg, 8);
348
349 /* fbR = fbRGBA & mask */
350 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
351 /* fbR = fbR >> 16 */
352 spe_roti(f, fbR_reg, fbR_reg, -16);
353 /* mask = mask << 8 */
354 spe_roti(f, mask_reg, mask_reg, 8);
355
356 /* fbA = fbRGBA & mask */
357 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
358 /* fbA = fbA >> 24 */
359 spe_roti(f, fbA_reg, fbA_reg, -24);
360 break;
361
362 case PIPE_FORMAT_B8G8R8A8_UNORM:
363 /* fbA = fbA & mask */
364 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
365 /* mask = mask << 8 */
366 spe_roti(f, mask_reg, mask_reg, 8);
367
368 /* fbR = fbRGBA & mask */
369 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
370 /* fbR = fbR >> 8 */
371 spe_roti(f, fbR_reg, fbR_reg, -8);
372 /* mask = mask << 8 */
373 spe_roti(f, mask_reg, mask_reg, 8);
374
375 /* fbG = fbRGBA & mask */
376 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
377 /* fbG = fbG >> 16 */
378 spe_roti(f, fbG_reg, fbG_reg, -16);
379 /* mask = mask << 8 */
380 spe_roti(f, mask_reg, mask_reg, 8);
381
382 /* fbB = fbRGBA & mask */
383 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
384 /* fbB = fbB >> 24 */
385 spe_roti(f, fbB_reg, fbB_reg, -24);
386 break;
387
388 default:
389 ASSERT(0);
390 }
391
392 /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
393 spe_cuflt(f, fbR_reg, fbR_reg, 8);
394 spe_cuflt(f, fbG_reg, fbG_reg, 8);
395 spe_cuflt(f, fbB_reg, fbB_reg, 8);
396 spe_cuflt(f, fbA_reg, fbA_reg, 8);
397
398 spe_release_register(f, mask_reg);
399 }
400
401 /*
402 * Compute Src RGB terms. We're actually looking for the value
403 * of (the appropriate RGB factors) * (the incoming source RGB color),
404 * because in some cases (like PIPE_BLENDFACTOR_ONE and
405 * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
406 */
407 switch (blend->rgb_src_factor) {
408 case PIPE_BLENDFACTOR_ONE:
409 /* factors = (1,1,1), so term = (R,G,B) */
410 spe_move(f, term1R_reg, fragR_reg);
411 spe_move(f, term1G_reg, fragG_reg);
412 spe_move(f, term1B_reg, fragB_reg);
413 break;
414 case PIPE_BLENDFACTOR_ZERO:
415 /* factors = (0,0,0), so term = (0,0,0) */
416 spe_load_float(f, term1R_reg, 0.0f);
417 spe_load_float(f, term1G_reg, 0.0f);
418 spe_load_float(f, term1B_reg, 0.0f);
419 break;
420 case PIPE_BLENDFACTOR_SRC_COLOR:
421 /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
422 spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
423 spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
424 spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
425 break;
426 case PIPE_BLENDFACTOR_SRC_ALPHA:
427 /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
428 spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
429 spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
430 spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
431 break;
432 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
433 /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
434 * or in other words term = (R-R*R, G-G*G, B-B*B)
435 * fnms(a,b,c,d) computes a = d - b*c
436 */
437 spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
438 spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
439 spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
440 break;
441 case PIPE_BLENDFACTOR_DST_COLOR:
442 /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
443 spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
444 spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
445 spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
446 break;
447 case PIPE_BLENDFACTOR_INV_DST_COLOR:
448 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
449 * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
450 * fnms(a,b,c,d) computes a = d - b*c
451 */
452 spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
453 spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
454 spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
455 break;
456 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
457 /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
458 * or term = (R-R*A,G-G*A,B-B*A)
459 * fnms(a,b,c,d) computes a = d - b*c
460 */
461 spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
462 spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
463 spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
464 break;
465 case PIPE_BLENDFACTOR_DST_ALPHA:
466 /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
467 spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
468 spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
469 spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
470 break;
471 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
472 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
473 * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
474 * fnms(a,b,c,d) computes a = d - b*c
475 */
476 spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
477 spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
478 spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
479 break;
480 case PIPE_BLENDFACTOR_CONST_COLOR:
481 /* We need the optional constant color registers */
482 setup_const_register(f, &constR_reg, blend_color->color[0]);
483 setup_const_register(f, &constG_reg, blend_color->color[1]);
484 setup_const_register(f, &constB_reg, blend_color->color[2]);
485 /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
486 spe_fm(f, term1R_reg, fragR_reg, constR_reg);
487 spe_fm(f, term1G_reg, fragG_reg, constG_reg);
488 spe_fm(f, term1B_reg, fragB_reg, constB_reg);
489 break;
490 case PIPE_BLENDFACTOR_CONST_ALPHA:
491 /* we'll need the optional constant alpha register */
492 setup_const_register(f, &constA_reg, blend_color->color[3]);
493 /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
494 spe_fm(f, term1R_reg, fragR_reg, constA_reg);
495 spe_fm(f, term1G_reg, fragG_reg, constA_reg);
496 spe_fm(f, term1B_reg, fragB_reg, constA_reg);
497 break;
498 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
499 /* We need the optional constant color registers */
500 setup_const_register(f, &constR_reg, blend_color->color[0]);
501 setup_const_register(f, &constG_reg, blend_color->color[1]);
502 setup_const_register(f, &constB_reg, blend_color->color[2]);
503 /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
504 * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
505 * fnms(a,b,c,d) computes a = d - b*c
506 */
507 spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
508 spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
509 spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
510 break;
511 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
512 /* We need the optional constant color registers */
513 setup_const_register(f, &constR_reg, blend_color->color[0]);
514 setup_const_register(f, &constG_reg, blend_color->color[1]);
515 setup_const_register(f, &constB_reg, blend_color->color[2]);
516 /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
517 * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
518 * fnms(a,b,c,d) computes a = d - b*c
519 */
520 spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
521 spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
522 spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
523 break;
524 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
525 /* We'll need the optional {1,1,1,1} register */
526 setup_const_register(f, &one_reg, 1.0f);
527 /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
528 * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
529 * We could expand the term (as a*min(b,c) == min(a*b,a*c)
530 * as long as a is positive), but then we'd have to do three
531 * spe_float_min() functions instead of one, so this is simpler.
532 */
533 /* tmp = 1 - Afb */
534 spe_fs(f, tmp_reg, one_reg, fbA_reg);
535 /* tmp = min(A,tmp) */
536 spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
537 /* term = R*tmp */
538 spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
539 spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
540 spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
541 break;
542
543 /* These are special D3D cases involving a second color output
544 * from the fragment shader. I'm not sure we can support them
545 * yet... XXX
546 */
547 case PIPE_BLENDFACTOR_SRC1_COLOR:
548 case PIPE_BLENDFACTOR_SRC1_ALPHA:
549 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
550 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
551
552 default:
553 ASSERT(0);
554 }
555
556 /*
557 * Compute Src Alpha term. Like the above, we're looking for
558 * the full term A*factor, not just the factor itself, because
559 * in many cases we can avoid doing unnecessary multiplies.
560 */
561 switch (blend->alpha_src_factor) {
562 case PIPE_BLENDFACTOR_ZERO:
563 /* factor = 0, so term = 0 */
564 spe_load_float(f, term1A_reg, 0.0f);
565 break;
566
567 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
568 case PIPE_BLENDFACTOR_ONE:
569 /* factor = 1, so term = A */
570 spe_move(f, term1A_reg, fragA_reg);
571 break;
572
573 case PIPE_BLENDFACTOR_SRC_COLOR:
574 /* factor = A, so term = A*A */
575 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
576 break;
577 case PIPE_BLENDFACTOR_SRC_ALPHA:
578 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
579 break;
580
581 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
582 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
583 /* factor = 1-A, so term = A*(1-A) = A-A*A */
584 /* fnms(a,b,c,d) computes a = d - b*c */
585 spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
586 break;
587
588 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
589 case PIPE_BLENDFACTOR_DST_COLOR:
590 /* factor = Afb, so term = A*Afb */
591 spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
592 break;
593
594 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
595 case PIPE_BLENDFACTOR_INV_DST_COLOR:
596 /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
597 /* fnms(a,b,c,d) computes a = d - b*c */
598 spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
599 break;
600
601 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
602 case PIPE_BLENDFACTOR_CONST_COLOR:
603 /* We need the optional constA_reg register */
604 setup_const_register(f, &constA_reg, blend_color->color[3]);
605 /* factor = Ac, so term = A*Ac */
606 spe_fm(f, term1A_reg, fragA_reg, constA_reg);
607 break;
608
609 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
610 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
611 /* We need the optional constA_reg register */
612 setup_const_register(f, &constA_reg, blend_color->color[3]);
613 /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
614 /* fnms(a,b,c,d) computes a = d - b*c */
615 spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
616 break;
617
618 /* These are special D3D cases involving a second color output
619 * from the fragment shader. I'm not sure we can support them
620 * yet... XXX
621 */
622 case PIPE_BLENDFACTOR_SRC1_COLOR:
623 case PIPE_BLENDFACTOR_SRC1_ALPHA:
624 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
625 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
626 default:
627 ASSERT(0);
628 }
629
630 /*
631 * Compute Dest RGB term. Like the above, we're looking for
632 * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
633 * in many cases we can avoid doing unnecessary multiplies.
634 */
635 switch (blend->rgb_dst_factor) {
636 case PIPE_BLENDFACTOR_ONE:
637 /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
638 spe_move(f, term2R_reg, fbR_reg);
639 spe_move(f, term2G_reg, fbG_reg);
640 spe_move(f, term2B_reg, fbB_reg);
641 break;
642 case PIPE_BLENDFACTOR_ZERO:
643 /* factor s= (0,0,0), so term = (0,0,0) */
644 spe_load_float(f, term2R_reg, 0.0f);
645 spe_load_float(f, term2G_reg, 0.0f);
646 spe_load_float(f, term2B_reg, 0.0f);
647 break;
648 case PIPE_BLENDFACTOR_SRC_COLOR:
649 /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
650 spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
651 spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
652 spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
653 break;
654 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
655 /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
656 * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
657 * fnms(a,b,c,d) computes a = d - b*c
658 */
659 spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
660 spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
661 spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
662 break;
663 case PIPE_BLENDFACTOR_SRC_ALPHA:
664 /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
665 spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
666 spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
667 spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
668 break;
669 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
670 /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
671 /* fnms(a,b,c,d) computes a = d - b*c */
672 spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
673 spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
674 spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
675 break;
676 case PIPE_BLENDFACTOR_DST_COLOR:
677 /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
678 spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
679 spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
680 spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
681 break;
682 case PIPE_BLENDFACTOR_INV_DST_COLOR:
683 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
684 * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
685 * fnms(a,b,c,d) computes a = d - b*c
686 */
687 spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
688 spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
689 spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
690 break;
691
692 case PIPE_BLENDFACTOR_DST_ALPHA:
693 /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
694 spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
695 spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
696 spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
697 break;
698 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
699 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
700 * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
701 * fnms(a,b,c,d) computes a = d - b*c
702 */
703 spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
704 spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
705 spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
706 break;
707 case PIPE_BLENDFACTOR_CONST_COLOR:
708 /* We need the optional constant color registers */
709 setup_const_register(f, &constR_reg, blend_color->color[0]);
710 setup_const_register(f, &constG_reg, blend_color->color[1]);
711 setup_const_register(f, &constB_reg, blend_color->color[2]);
712 /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
713 spe_fm(f, term2R_reg, fbR_reg, constR_reg);
714 spe_fm(f, term2G_reg, fbG_reg, constG_reg);
715 spe_fm(f, term2B_reg, fbB_reg, constB_reg);
716 break;
717 case PIPE_BLENDFACTOR_CONST_ALPHA:
718 /* we'll need the optional constant alpha register */
719 setup_const_register(f, &constA_reg, blend_color->color[3]);
720 /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
721 spe_fm(f, term2R_reg, fbR_reg, constA_reg);
722 spe_fm(f, term2G_reg, fbG_reg, constA_reg);
723 spe_fm(f, term2B_reg, fbB_reg, constA_reg);
724 break;
725 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
726 /* We need the optional constant color registers */
727 setup_const_register(f, &constR_reg, blend_color->color[0]);
728 setup_const_register(f, &constG_reg, blend_color->color[1]);
729 setup_const_register(f, &constB_reg, blend_color->color[2]);
730 /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
731 * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
732 * fnms(a,b,c,d) computes a = d - b*c
733 */
734 spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
735 spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
736 spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
737 break;
738 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
739 /* We need the optional constant color registers */
740 setup_const_register(f, &constR_reg, blend_color->color[0]);
741 setup_const_register(f, &constG_reg, blend_color->color[1]);
742 setup_const_register(f, &constB_reg, blend_color->color[2]);
743 /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
744 * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
745 * fnms(a,b,c,d) computes a = d - b*c
746 */
747 spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
748 spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
749 spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
750 break;
751 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
752 ASSERT(0);
753 break;
754
755 /* These are special D3D cases involving a second color output
756 * from the fragment shader. I'm not sure we can support them
757 * yet... XXX
758 */
759 case PIPE_BLENDFACTOR_SRC1_COLOR:
760 case PIPE_BLENDFACTOR_SRC1_ALPHA:
761 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
762 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
763
764 default:
765 ASSERT(0);
766 }
767
768 /*
769 * Compute Dest Alpha term. Like the above, we're looking for
770 * the full term Afb*factor, not just the factor itself, because
771 * in many cases we can avoid doing unnecessary multiplies.
772 */
773 switch (blend->alpha_dst_factor) {
774 case PIPE_BLENDFACTOR_ONE:
775 /* factor = 1, so term = Afb */
776 spe_move(f, term2A_reg, fbA_reg);
777 break;
778 case PIPE_BLENDFACTOR_ZERO:
779 /* factor = 0, so term = 0 */
780 spe_load_float(f, term2A_reg, 0.0f);
781 break;
782
783 case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
784 case PIPE_BLENDFACTOR_SRC_COLOR:
785 /* factor = A, so term = Afb*A */
786 spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
787 break;
788
789 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
790 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
791 /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
792 /* fnms(a,b,c,d) computes a = d - b*c */
793 spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
794 break;
795
796 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
797 case PIPE_BLENDFACTOR_DST_COLOR:
798 /* factor = Afb, so term = Afb*Afb */
799 spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
800 break;
801
802 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
803 case PIPE_BLENDFACTOR_INV_DST_COLOR:
804 /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
805 /* fnms(a,b,c,d) computes a = d - b*c */
806 spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
807 break;
808
809 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
810 case PIPE_BLENDFACTOR_CONST_COLOR:
811 /* We need the optional constA_reg register */
812 setup_const_register(f, &constA_reg, blend_color->color[3]);
813 /* factor = Ac, so term = Afb*Ac */
814 spe_fm(f, term2A_reg, fbA_reg, constA_reg);
815 break;
816
817 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
818 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
819 /* We need the optional constA_reg register */
820 setup_const_register(f, &constA_reg, blend_color->color[3]);
821 /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
822 /* fnms(a,b,c,d) computes a = d - b*c */
823 spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
824 break;
825
826 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
827 ASSERT(0);
828 break;
829
830 /* These are special D3D cases involving a second color output
831 * from the fragment shader. I'm not sure we can support them
832 * yet... XXX
833 */
834 case PIPE_BLENDFACTOR_SRC1_COLOR:
835 case PIPE_BLENDFACTOR_SRC1_ALPHA:
836 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
837 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
838 default:
839 ASSERT(0);
840 }
841
842 /*
843 * Combine Src/Dest RGB terms as per the blend equation.
844 */
845 switch (blend->rgb_func) {
846 case PIPE_BLEND_ADD:
847 spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
848 spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
849 spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
850 break;
851 case PIPE_BLEND_SUBTRACT:
852 spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
853 spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
854 spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
855 break;
856 case PIPE_BLEND_REVERSE_SUBTRACT:
857 spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
858 spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
859 spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
860 break;
861 case PIPE_BLEND_MIN:
862 spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
863 spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
864 spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
865 break;
866 case PIPE_BLEND_MAX:
867 spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
868 spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
869 spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
870 break;
871 default:
872 ASSERT(0);
873 }
874
875 /*
876 * Combine Src/Dest A term
877 */
878 switch (blend->alpha_func) {
879 case PIPE_BLEND_ADD:
880 spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
881 break;
882 case PIPE_BLEND_SUBTRACT:
883 spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
884 break;
885 case PIPE_BLEND_REVERSE_SUBTRACT:
886 spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
887 break;
888 case PIPE_BLEND_MIN:
889 spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
890 break;
891 case PIPE_BLEND_MAX:
892 spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
893 break;
894 default:
895 ASSERT(0);
896 }
897
898 spe_release_register(f, term1R_reg);
899 spe_release_register(f, term1G_reg);
900 spe_release_register(f, term1B_reg);
901 spe_release_register(f, term1A_reg);
902
903 spe_release_register(f, term2R_reg);
904 spe_release_register(f, term2G_reg);
905 spe_release_register(f, term2B_reg);
906 spe_release_register(f, term2A_reg);
907
908 spe_release_register(f, fbR_reg);
909 spe_release_register(f, fbG_reg);
910 spe_release_register(f, fbB_reg);
911 spe_release_register(f, fbA_reg);
912
913 spe_release_register(f, tmp_reg);
914
915 /* Free any optional registers that actually got used */
916 release_const_register(f, one_reg);
917 release_const_register(f, constR_reg);
918 release_const_register(f, constG_reg);
919 release_const_register(f, constB_reg);
920 release_const_register(f, constA_reg);
921 }
922
923
/**
 * Generate code to apply the current logic op to the packed fragment
 * and framebuffer colors, leaving the result in fragRGBA_reg.
 *
 * Unlike the blend path, both inputs here are already packed 32-bit
 * pixels (four per register), so all ops are plain bitwise instructions.
 *
 * \param blend        blend state; only logicop_func is used here
 * \param f            SPE function to append instructions onto
 * \param fragRGBA_reg register with four packed fragment pixels (in/out)
 * \param fbRGBA_reg   register with four packed framebuffer pixels (in)
 */
static void
gen_logicop(const struct pipe_blend_state *blend,
            struct spe_function *f,
            int fragRGBA_reg, int fbRGBA_reg)
{
   /* We've got four 32-bit RGBA packed pixels in each of
    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
    * reds, greens, blues, and alphas.
    * */
   ASSERT(blend->logicop_enable);

   switch(blend->logicop_func) {
   case PIPE_LOGICOP_CLEAR: /* 0 */
      spe_zero(f, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_NOR: /* ~(s | d) */
      spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
      /* andc R, A, B computes R = A & ~B */
      spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
      spe_complement(f, fragRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
      /* andc R, A, B computes R = A & ~B */
      spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_INVERT: /* ~d */
      /* Note that (A nor A) == ~(A|A) == ~A */
      spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_XOR: /* s ^ d */
      spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_NAND: /* ~(s & d) */
      spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_AND: /* s & d */
      spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
      /* no single equiv instruction: xor then complement */
      spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      spe_complement(f, fragRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_NOOP: /* d */
      spe_move(f, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
      /* orc R, A, B computes R = A | ~B */
      spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_COPY: /* s */
      /* fragment color already in fragRGBA_reg; nothing to emit */
      break;
   case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
      /* orc R, A, B computes R = A | ~B */
      spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_OR: /* s | d */
      spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_SET: /* 1 */
      spe_load_int(f, fragRGBA_reg, 0xffffffff);
      break;
   default:
      ASSERT(0);
   }
}
993
994
995 /**
996 * Generate code to pack a quad of float colors into four 32-bit integers.
997 *
998 * \param f SPE function to append instruction onto.
999 * \param color_format the dest color packing format
1000 * \param r_reg register containing four red values (in/clobbered)
1001 * \param g_reg register containing four green values (in/clobbered)
1002 * \param b_reg register containing four blue values (in/clobbered)
1003 * \param a_reg register containing four alpha values (in/clobbered)
1004 * \param rgba_reg register to store the packed RGBA colors (out)
1005 */
1006 static void
1007 gen_pack_colors(struct spe_function *f,
1008 enum pipe_format color_format,
1009 int r_reg, int g_reg, int b_reg, int a_reg,
1010 int rgba_reg)
1011 {
1012 int rg_reg = spe_allocate_available_register(f);
1013 int ba_reg = spe_allocate_available_register(f);
1014
1015 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
1016 spe_cfltu(f, r_reg, r_reg, 32);
1017 spe_cfltu(f, g_reg, g_reg, 32);
1018 spe_cfltu(f, b_reg, b_reg, 32);
1019 spe_cfltu(f, a_reg, a_reg, 32);
1020
1021 /* Shift the most significant bytes to the least significant positions.
1022 * I.e.: reg = reg >> 24
1023 */
1024 spe_rotmi(f, r_reg, r_reg, -24);
1025 spe_rotmi(f, g_reg, g_reg, -24);
1026 spe_rotmi(f, b_reg, b_reg, -24);
1027 spe_rotmi(f, a_reg, a_reg, -24);
1028
1029 /* Shift the color bytes according to the surface format */
1030 if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
1031 spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */
1032 spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */
1033 spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */
1034 }
1035 else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1036 spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */
1037 spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */
1038 spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */
1039 }
1040 else {
1041 ASSERT(0);
1042 }
1043
1044 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
1045 * Eg: after shifting according to color_format we might have:
1046 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
1047 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
1048 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
1049 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
1050 * OR-ing all those together gives us four packed colors:
1051 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
1052 */
1053 spe_or(f, rg_reg, r_reg, g_reg);
1054 spe_or(f, ba_reg, a_reg, b_reg);
1055 spe_or(f, rgba_reg, rg_reg, ba_reg);
1056
1057 spe_release_register(f, rg_reg);
1058 spe_release_register(f, ba_reg);
1059 }
1060
1061
1062 static void
1063 gen_colormask(struct spe_function *f,
1064 uint colormask,
1065 enum pipe_format color_format,
1066 int fragRGBA_reg, int fbRGBA_reg)
1067 {
1068 /* We've got four 32-bit RGBA packed pixels in each of
1069 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
1070 * reds, greens, blues, and alphas. Further, the pixels
1071 * are packed according to the given color format, not
1072 * necessarily RGBA...
1073 */
1074 uint r_mask;
1075 uint g_mask;
1076 uint b_mask;
1077 uint a_mask;
1078
1079 /* Calculate exactly where the bits for any particular color
1080 * end up, so we can mask them correctly.
1081 */
1082 switch(color_format) {
1083 case PIPE_FORMAT_A8R8G8B8_UNORM:
1084 /* ARGB */
1085 a_mask = 0xff000000;
1086 r_mask = 0x00ff0000;
1087 g_mask = 0x0000ff00;
1088 b_mask = 0x000000ff;
1089 break;
1090 case PIPE_FORMAT_B8G8R8A8_UNORM:
1091 /* BGRA */
1092 b_mask = 0xff000000;
1093 g_mask = 0x00ff0000;
1094 r_mask = 0x0000ff00;
1095 a_mask = 0x000000ff;
1096 break;
1097 default:
1098 ASSERT(0);
1099 }
1100
1101 /* For each R, G, B, and A component we're supposed to mask out,
1102 * clear its bits. Then our mask operation later will work
1103 * as expected.
1104 */
1105 if (!(colormask & PIPE_MASK_R)) {
1106 r_mask = 0;
1107 }
1108 if (!(colormask & PIPE_MASK_G)) {
1109 g_mask = 0;
1110 }
1111 if (!(colormask & PIPE_MASK_B)) {
1112 b_mask = 0;
1113 }
1114 if (!(colormask & PIPE_MASK_A)) {
1115 a_mask = 0;
1116 }
1117
1118 /* Get a temporary register to hold the mask that will be applied
1119 * to the fragment
1120 */
1121 int colormask_reg = spe_allocate_available_register(f);
1122
1123 /* The actual mask we're going to use is an OR of the remaining R, G, B,
1124 * and A masks. Load the result value into our temporary register.
1125 */
1126 spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
1127
1128 /* Use the mask register to select between the fragment color
1129 * values and the frame buffer color values. Wherever the
1130 * mask has a 0 bit, the current frame buffer color should override
1131 * the fragment color. Wherever the mask has a 1 bit, the
1132 * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
1133 * instruction will select bits from its first operand rA wherever the
1134 * the mask bits rM are 0, and from its second operand rB wherever the
1135 * mask bits rM are 1. That means that the frame buffer color is the
1136 * first operand, and the fragment color the second.
1137 */
1138 spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
1139
1140 /* Release the temporary register and we're done */
1141 spe_release_register(f, colormask_reg);
1142 }
1143
1144
1145 /**
1146 * This function is annoyingly similar to gen_depth_test(), above, except
1147 * that instead of comparing two varying values (i.e. fragment and buffer),
1148 * we're comparing a varying value with a static value. As such, we have
1149 * access to the Compare Immediate instructions where we don't in
1150 * gen_depth_test(), which is what makes us very different.
1151 *
1152 * There's some added complexity if there's a non-trivial state->mask
1153 * value; then stencil and reference both must be masked
1154 *
1155 * The return value in the stencil_pass_reg is a bitmask of valid
1156 * fragments that also passed the stencil test. The bitmask of valid
1157 * fragments that failed would be found in
1158 * (fragment_mask_reg & ~stencil_pass_reg).
1159 */
1160 static void
1161 gen_stencil_test(struct spe_function *f,
1162 const struct pipe_stencil_state *state,
1163 uint stencil_max_value,
1164 int fragment_mask_reg,
1165 int fbS_reg,
1166 int stencil_pass_reg)
1167 {
1168 /* Generate code that puts the set of passing fragments into the
1169 * stencil_pass_reg register, taking into account whether each fragment
1170 * was active to begin with.
1171 */
1172 switch (state->func) {
1173 case PIPE_FUNC_EQUAL:
1174 if (state->value_mask == stencil_max_value) {
1175 /* stencil_pass = fragment_mask & (s == reference) */
1176 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1177 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1178 }
1179 else {
1180 /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
1181 uint tmp_masked_stencil = spe_allocate_available_register(f);
1182 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1183 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
1184 state->value_mask & state->ref_value);
1185 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1186 spe_release_register(f, tmp_masked_stencil);
1187 }
1188 break;
1189
1190 case PIPE_FUNC_NOTEQUAL:
1191 if (state->value_mask == stencil_max_value) {
1192 /* stencil_pass = fragment_mask & ~(s == reference) */
1193 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1194 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1195 }
1196 else {
1197 /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
1198 int tmp_masked_stencil = spe_allocate_available_register(f);
1199 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1200 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
1201 state->value_mask & state->ref_value);
1202 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1203 spe_release_register(f, tmp_masked_stencil);
1204 }
1205 break;
1206
1207 case PIPE_FUNC_LESS:
1208 if (state->value_mask == stencil_max_value) {
1209 /* stencil_pass = fragment_mask & (reference < s) */
1210 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1211 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1212 }
1213 else {
1214 /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
1215 int tmp_masked_stencil = spe_allocate_available_register(f);
1216 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1217 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
1218 state->value_mask & state->ref_value);
1219 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1220 spe_release_register(f, tmp_masked_stencil);
1221 }
1222 break;
1223
1224 case PIPE_FUNC_GREATER:
1225 if (state->value_mask == stencil_max_value) {
1226 /* stencil_pass = fragment_mask & (reference > s) */
1227 /* There's no convenient Compare Less Than Immediate instruction, so
1228 * we'll have to do this one the harder way, by loading a register and
1229 * comparing directly. Compare Logical Greater Than Word (clgt)
1230 * treats its operands as unsigned - no sign extension.
1231 */
1232 int tmp_reg = spe_allocate_available_register(f);
1233 spe_load_uint(f, tmp_reg, state->ref_value);
1234 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1235 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1236 spe_release_register(f, tmp_reg);
1237 }
1238 else {
1239 /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
1240 int tmp_reg = spe_allocate_available_register(f);
1241 int tmp_masked_stencil = spe_allocate_available_register(f);
1242 spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
1243 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1244 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1245 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1246 spe_release_register(f, tmp_reg);
1247 spe_release_register(f, tmp_masked_stencil);
1248 }
1249 break;
1250
1251 case PIPE_FUNC_GEQUAL:
1252 if (state->value_mask == stencil_max_value) {
1253 /* stencil_pass = fragment_mask & (reference >= s)
1254 * = fragment_mask & ~(s > reference) */
1255 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg,
1256 state->ref_value);
1257 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1258 }
1259 else {
1260 /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
1261 int tmp_masked_stencil = spe_allocate_available_register(f);
1262 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1263 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
1264 state->value_mask & state->ref_value);
1265 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1266 spe_release_register(f, tmp_masked_stencil);
1267 }
1268 break;
1269
1270 case PIPE_FUNC_LEQUAL:
1271 if (state->value_mask == stencil_max_value) {
1272 /* stencil_pass = fragment_mask & (reference <= s) ]
1273 * = fragment_mask & ~(reference > s) */
1274 /* As above, we have to do this by loading a register */
1275 int tmp_reg = spe_allocate_available_register(f);
1276 spe_load_uint(f, tmp_reg, state->ref_value);
1277 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1278 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1279 spe_release_register(f, tmp_reg);
1280 }
1281 else {
1282 /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
1283 int tmp_reg = spe_allocate_available_register(f);
1284 int tmp_masked_stencil = spe_allocate_available_register(f);
1285 spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
1286 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1287 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1288 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1289 spe_release_register(f, tmp_reg);
1290 spe_release_register(f, tmp_masked_stencil);
1291 }
1292 break;
1293
1294 case PIPE_FUNC_NEVER:
1295 /* stencil_pass = fragment_mask & 0 = 0 */
1296 spe_load_uint(f, stencil_pass_reg, 0);
1297 break;
1298
1299 case PIPE_FUNC_ALWAYS:
1300 /* stencil_pass = fragment_mask & 1 = fragment_mask */
1301 spe_move(f, stencil_pass_reg, fragment_mask_reg);
1302 break;
1303 }
1304
1305 /* The fragments that passed the stencil test are now in stencil_pass_reg.
1306 * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
1307 */
1308 }
1309
1310
1311 /**
1312 * This function generates code that calculates a set of new stencil values
1313 * given the earlier values and the operation to apply. It does not
1314 * apply any tests. It is intended to be called up to 3 times
1315 * (for the stencil fail operation, for the stencil pass-z fail operation,
1316 * and for the stencil pass-z pass operation) to collect up to three
1317 * possible sets of values, and for the caller to combine them based
1318 * on the result of the tests.
1319 *
1320 * stencil_max_value should be (2^n - 1) where n is the number of bits
1321 * in the stencil buffer - in other words, it should be usable as a mask.
1322 */
1323 static void
1324 gen_stencil_values(struct spe_function *f,
1325 uint stencil_op,
1326 uint stencil_ref_value,
1327 uint stencil_max_value,
1328 int fbS_reg,
1329 int newS_reg)
1330 {
1331 /* The code below assumes that newS_reg and fbS_reg are not the same
1332 * register; if they can be, the calculations below will have to use
1333 * an additional temporary register. For now, mark the assumption
1334 * with an assertion that will fail if they are the same.
1335 */
1336 ASSERT(fbS_reg != newS_reg);
1337
1338 /* The code also assumes the the stencil_max_value is of the form
1339 * 2^n-1 and can therefore be used as a mask for the valid bits in
1340 * addition to a maximum. Make sure this is the case as well.
1341 * The clever math below exploits the fact that incrementing a
1342 * binary number serves to flip all the bits of a number starting at
1343 * the LSB and continuing to (and including) the first zero bit
1344 * found. That means that a number and its increment will always
1345 * have at least one bit in common (the high order bit, if nothing
1346 * else) *unless* the number is zero, *or* the number is of a form
1347 * consisting of some number of 1s in the low-order bits followed
1348 * by nothing but 0s in the high-order bits. The latter case
1349 * implies it's of the form 2^n-1.
1350 */
1351 ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
1352
1353 switch(stencil_op) {
1354 case PIPE_STENCIL_OP_KEEP:
1355 /* newS = S */
1356 spe_move(f, newS_reg, fbS_reg);
1357 break;
1358
1359 case PIPE_STENCIL_OP_ZERO:
1360 /* newS = 0 */
1361 spe_zero(f, newS_reg);
1362 break;
1363
1364 case PIPE_STENCIL_OP_REPLACE:
1365 /* newS = stencil reference value */
1366 spe_load_uint(f, newS_reg, stencil_ref_value);
1367 break;
1368
1369 case PIPE_STENCIL_OP_INCR: {
1370 /* newS = (s == max ? max : s + 1) */
1371 int equals_reg = spe_allocate_available_register(f);
1372
1373 spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
1374 /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
1375 spe_ai(f, newS_reg, fbS_reg, 1);
1376 /* Select from the current value or the new value based on the equality test */
1377 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1378
1379 spe_release_register(f, equals_reg);
1380 break;
1381 }
1382 case PIPE_STENCIL_OP_DECR: {
1383 /* newS = (s == 0 ? 0 : s - 1) */
1384 int equals_reg = spe_allocate_available_register(f);
1385
1386 spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
1387 /* Add Word Immediate with a (-1) value works */
1388 spe_ai(f, newS_reg, fbS_reg, -1);
1389 /* Select from the current value or the new value based on the equality test */
1390 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1391
1392 spe_release_register(f, equals_reg);
1393 break;
1394 }
1395 case PIPE_STENCIL_OP_INCR_WRAP:
1396 /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
1397 * do a normal add and mask off the correct bits
1398 */
1399 spe_ai(f, newS_reg, fbS_reg, 1);
1400 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1401 break;
1402
1403 case PIPE_STENCIL_OP_DECR_WRAP:
1404 /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
1405 spe_ai(f, newS_reg, fbS_reg, -1);
1406 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1407 break;
1408
1409 case PIPE_STENCIL_OP_INVERT:
1410 /* newS = ~s. We take advantage of the mask/max value to invert only
1411 * the valid bits for the field so we don't have to do an extra "and".
1412 */
1413 spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
1414 break;
1415
1416 default:
1417 ASSERT(0);
1418 }
1419 }
1420
1421
1422 /**
1423 * This function generates code to get all the necessary possible
1424 * stencil values. For each of the output registers (fail_reg,
1425 * zfail_reg, and zpass_reg), it either allocates a new register
1426 * and calculates a new set of values based on the stencil operation,
1427 * or it reuses a register allocation and calculation done for an
1428 * earlier (matching) operation, or it reuses the fbS_reg register
1429 * (if the stencil operation is KEEP, which doesn't change the
1430 * stencil buffer).
1431 *
1432 * Since this function allocates a variable number of registers,
1433 * to avoid incurring complex logic to free them, they should
1434 * be allocated after a spe_allocate_register_set() call
1435 * and released by the corresponding spe_release_register_set() call.
1436 */
1437 static void
1438 gen_get_stencil_values(struct spe_function *f,
1439 const struct pipe_stencil_state *stencil,
1440 const uint depth_enabled,
1441 int fbS_reg,
1442 int *fail_reg,
1443 int *zfail_reg,
1444 int *zpass_reg)
1445 {
1446 uint zfail_op;
1447
1448 /* Stenciling had better be enabled here */
1449 ASSERT(stencil->enabled);
1450
1451 /* If the depth test is not enabled, it is treated as though it always
1452 * passes, which means that the zfail_op is not considered - a
1453 * failing stencil test triggers the fail_op, and a passing one
1454 * triggers the zpass_op
1455 *
1456 * As an optimization, override calculation of the zfail_op values
1457 * if they aren't going to be used. By setting the value of
1458 * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
1459 * to match the incoming stencil values, and no calculation will
1460 * be done.
1461 */
1462 if (depth_enabled) {
1463 zfail_op = stencil->zfail_op;
1464 }
1465 else {
1466 zfail_op = PIPE_STENCIL_OP_KEEP;
1467 }
1468
1469 /* One-sided or front-facing stencil */
1470 if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
1471 *fail_reg = fbS_reg;
1472 }
1473 else {
1474 *fail_reg = spe_allocate_available_register(f);
1475 gen_stencil_values(f, stencil->fail_op, stencil->ref_value,
1476 0xff, fbS_reg, *fail_reg);
1477 }
1478
1479 /* Check the possibly overridden value, not the structure value */
1480 if (zfail_op == PIPE_STENCIL_OP_KEEP) {
1481 *zfail_reg = fbS_reg;
1482 }
1483 else if (zfail_op == stencil->fail_op) {
1484 *zfail_reg = *fail_reg;
1485 }
1486 else {
1487 *zfail_reg = spe_allocate_available_register(f);
1488 gen_stencil_values(f, stencil->zfail_op, stencil->ref_value,
1489 0xff, fbS_reg, *zfail_reg);
1490 }
1491
1492 if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
1493 *zpass_reg = fbS_reg;
1494 }
1495 else if (stencil->zpass_op == stencil->fail_op) {
1496 *zpass_reg = *fail_reg;
1497 }
1498 else if (stencil->zpass_op == zfail_op) {
1499 *zpass_reg = *zfail_reg;
1500 }
1501 else {
1502 *zpass_reg = spe_allocate_available_register(f);
1503 gen_stencil_values(f, stencil->zpass_op, stencil->ref_value,
1504 0xff, fbS_reg, *zpass_reg);
1505 }
1506 }
1507
1508 /**
1509 * Note that fbZ_reg may *not* be set on entry, if in fact
1510 * the depth test is not enabled. This function must not use
1511 * the register if depth is not enabled.
1512 */
1513 static boolean
1514 gen_stencil_depth_test(struct spe_function *f,
1515 const struct pipe_depth_stencil_alpha_state *dsa,
1516 const uint facing,
1517 const int mask_reg, const int fragZ_reg,
1518 const int fbZ_reg, const int fbS_reg)
1519 {
1520 /* True if we've generated code that could require writeback to the
1521 * depth and/or stencil buffers
1522 */
1523 boolean modified_buffers = FALSE;
1524
1525 boolean need_to_calculate_stencil_values;
1526 boolean need_to_writemask_stencil_values;
1527
1528 struct pipe_stencil_state *stencil;
1529
1530 /* Registers. We may or may not actually allocate these, depending
1531 * on whether the state values indicate that we need them.
1532 */
1533 int stencil_pass_reg, stencil_fail_reg;
1534 int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
1535 int stencil_writemask_reg;
1536 int zmask_reg;
1537 int newS_reg;
1538
1539 /* Stenciling is quite complex: up to six different configurable stencil
1540 * operations/calculations can be required (three each for front-facing
1541 * and back-facing fragments). Many of those operations will likely
1542 * be identical, so there's good reason to try to avoid calculating
1543 * the same values more than once (which unfortunately makes the code less
1544 * straightforward).
1545 *
1546 * To make register management easier, we start a new
1547 * register set; we can release all the registers in the set at
1548 * once, and avoid having to keep track of exactly which registers
1549 * we allocate. We can still allocate and free registers as
1550 * desired (if we know we no longer need a register), but we don't
1551 * have to spend the complexity to track the more difficult variant
1552 * register usage scenarios.
1553 */
1554 spe_comment(f, 0, "Allocating stencil register set");
1555 spe_allocate_register_set(f);
1556
1557 /* The facing we're given is the fragment facing; it doesn't
1558 * exactly match the stencil facing. If stencil is enabled,
1559 * but two-sided stencil is *not* enabled, we use the same
1560 * stencil settings for both front- and back-facing fragments.
1561 * We only use the "back-facing" stencil for backfacing fragments
1562 * if two-sided stenciling is enabled.
1563 */
1564 if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
1565 stencil = &dsa->stencil[1];
1566 }
1567 else {
1568 stencil = &dsa->stencil[0];
1569 }
1570
1571 /* Calculate the writemask. If the writemask is trivial (either
1572 * all 0s, meaning that we don't need to calculate any stencil values
1573 * because they're not going to change the stencil anyway, or all 1s,
1574 * meaning that we have to calculate the stencil values but do not
1575 * need to mask them), we can avoid generating code. Don't forget
1576 * that we need to consider backfacing stencil, if enabled.
1577 *
1578 * Note that if the backface stencil is *not* enabled, the backface
1579 * stencil will have the same values as the frontface stencil.
1580 */
1581 if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
1582 stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
1583 stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
1584 need_to_calculate_stencil_values = FALSE;
1585 need_to_writemask_stencil_values = FALSE;
1586 }
1587 else if (stencil->write_mask == 0x0) {
1588 /* All changes are writemasked out, so no need to calculate
1589 * what those changes might be, and no need to write anything back.
1590 */
1591 need_to_calculate_stencil_values = FALSE;
1592 need_to_writemask_stencil_values = FALSE;
1593 }
1594 else if (stencil->write_mask == 0xff) {
1595 /* Still trivial, but a little less so. We need to write the stencil
1596 * values, but we don't need to mask them.
1597 */
1598 need_to_calculate_stencil_values = TRUE;
1599 need_to_writemask_stencil_values = FALSE;
1600 }
1601 else {
1602 /* The general case: calculate, mask, and write */
1603 need_to_calculate_stencil_values = TRUE;
1604 need_to_writemask_stencil_values = TRUE;
1605
1606 /* While we're here, generate code that calculates what the
1607 * writemask should be. If backface stenciling is enabled,
1608 * and the backface writemask is not the same as the frontface
1609 * writemask, we'll have to generate code that merges the
1610 * two masks into a single effective mask based on fragment facing.
1611 */
1612 spe_comment(f, 0, "Computing stencil writemask");
1613 stencil_writemask_reg = spe_allocate_available_register(f);
1614 spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask);
1615 }
1616
1617 /* At least one-sided stenciling must be on. Generate code that
1618 * runs the stencil test on the basic/front-facing stencil, leaving
1619 * the mask of passing stencil bits in stencil_pass_reg. This mask will
1620 * be used both to mask the set of active pixels, and also to
1621 * determine how the stencil buffer changes.
1622 *
1623 * This test will *not* change the value in mask_reg (because we don't
1624 * yet know whether to apply the two-sided stencil or one-sided stencil).
1625 */
1626 spe_comment(f, 0, "Running basic stencil test");
1627 stencil_pass_reg = spe_allocate_available_register(f);
1628 gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
1629
1630 /* Generate code that, given the mask of valid fragments and the
1631 * mask of valid fragments that passed the stencil test, computes
1632 * the mask of valid fragments that failed the stencil test. We
1633 * have to do this before we run a depth test (because the
1634 * depth test should not be performed on fragments that failed the
1635 * stencil test, and because the depth test will update the
1636 * mask of valid fragments based on the results of the depth test).
1637 */
1638 spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
1639 stencil_fail_reg = spe_allocate_available_register(f);
1640 spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
1641 /* Now remove the stenciled-out pixels from the valid fragment mask,
1642 * so we can later use the valid fragment mask in the depth test.
1643 */
1644 spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
1645
1646 /* We may not need to calculate stencil values, if the writemask is off */
1647 if (need_to_calculate_stencil_values) {
1648 /* Generate code that calculates exactly which stencil values we need,
1649 * without calculating the same value twice (say, if two different
1650 * stencil ops have the same value). This code will work for one-sided
1651 * and two-sided stenciling (so that we take into account that operations
1652 * may match between front and back stencils), and will also take into
1653 * account whether the depth test is enabled (if the depth test is off,
1654 * we don't need any of the zfail results, because the depth test always
1655 * is considered to pass if it is disabled). Any register value that
1656 * does not need to be calculated will come back with the same value
1657 * that's in fbS_reg.
1658 *
1659 * This function will allocate a variant number of registers that
1660 * will be released as part of the register set.
1661 */
1662 spe_comment(f, 0, facing == CELL_FACING_FRONT
1663 ? "Computing front-facing stencil values"
1664 : "Computing back-facing stencil values");
1665 gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg,
1666 &stencil_fail_values, &stencil_pass_depth_fail_values,
1667 &stencil_pass_depth_pass_values);
1668 }
1669
1670 /* We now have all the stencil values we need. We also need
1671 * the results of the depth test to figure out which
1672 * stencil values will become the new stencil values. (Even if
1673 * we aren't actually calculating stencil values, we need to apply
1674 * the depth test if it's enabled.)
1675 *
1676 * The code generated by gen_depth_test() returns the results of the
1677 * test in the given register, but also alters the mask_reg based
1678 * on the results of the test.
1679 */
1680 if (dsa->depth.enabled) {
1681 spe_comment(f, 0, "Running stencil depth test");
1682 zmask_reg = spe_allocate_available_register(f);
1683 modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg,
1684 fbZ_reg, zmask_reg);
1685 }
1686
1687 if (need_to_calculate_stencil_values) {
1688
1689 /* If we need to writemask the stencil values before going into
1690 * the stencil buffer, we'll have to use a new register to
1691 * hold the new values. If not, we can just keep using the
1692 * current register.
1693 */
1694 if (need_to_writemask_stencil_values) {
1695 newS_reg = spe_allocate_available_register(f);
1696 spe_comment(f, 0, "Saving current stencil values for writemasking");
1697 spe_move(f, newS_reg, fbS_reg);
1698 }
1699 else {
1700 newS_reg = fbS_reg;
1701 }
1702
1703 /* Merge in the selected stencil fail values */
1704 if (stencil_fail_values != fbS_reg) {
1705 spe_comment(f, 0, "Loading stencil fail values");
1706 spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
1707 modified_buffers = TRUE;
1708 }
1709
1710 /* Same for the stencil pass/depth fail values. If this calculation
1711 * is not needed (say, if depth test is off), then the
1712 * stencil_pass_depth_fail_values register will be equal to fbS_reg
1713 * and we'll skip the calculation.
1714 */
1715 if (stencil_pass_depth_fail_values != fbS_reg) {
1716 /* We don't actually have a stencil pass/depth fail mask yet.
1717 * Calculate it here from the stencil passing mask and the
1718 * depth passing mask. Note that zmask_reg *must* have been
1719 * set above if we're here.
1720 */
1721 uint stencil_pass_depth_fail_mask =
1722 spe_allocate_available_register(f);
1723
1724 spe_comment(f, 0, "Loading stencil pass/depth fail values");
1725 spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
1726
1727 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values,
1728 stencil_pass_depth_fail_mask);
1729
1730 spe_release_register(f, stencil_pass_depth_fail_mask);
1731 modified_buffers = TRUE;
1732 }
1733
1734 /* Same for the stencil pass/depth pass mask. Note that we
1735 * *can* get here with zmask_reg being unset (if the depth
1736 * test is off but the stencil test is on). In this case,
1737 * we assume the depth test passes, and don't need to mask
1738 * the stencil pass mask with the Z mask.
1739 */
1740 if (stencil_pass_depth_pass_values != fbS_reg) {
1741 if (dsa->depth.enabled) {
1742 uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
1743 /* We'll need a separate register */
1744 spe_comment(f, 0, "Loading stencil pass/depth pass values");
1745 spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
1746 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
1747 spe_release_register(f, stencil_pass_depth_pass_mask);
1748 }
1749 else {
1750 /* We can use the same stencil-pass register */
1751 spe_comment(f, 0, "Loading stencil pass values");
1752 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
1753 }
1754 modified_buffers = TRUE;
1755 }
1756
1757 /* Almost done. If we need to writemask, do it now, leaving the
1758 * results in the fbS_reg register passed in. If we don't need
1759 * to writemask, then the results are *already* in the fbS_reg,
1760 * so there's nothing more to do.
1761 */
1762
1763 if (need_to_writemask_stencil_values && modified_buffers) {
1764 /* The Select Bytes command makes a fine writemask. Where
1765 * the mask is 0, the first (original) values are retained,
1766 * effectively masking out changes. Where the mask is 1, the
1767 * second (new) values are retained, incorporating changes.
1768 */
1769 spe_comment(f, 0, "Writemasking new stencil values");
1770 spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
1771 }
1772
1773 } /* done calculating stencil values */
1774
1775 /* The stencil and/or depth values have been applied, and the
1776 * mask_reg, fbS_reg, and fbZ_reg values have been updated.
1777 * We're all done, except that we've allocated a fair number
1778 * of registers that we didn't bother tracking. Release all
1779 * those registers as part of the register set, and go home.
1780 */
1781 spe_comment(f, 0, "Releasing stencil register set");
1782 spe_release_register_set(f);
1783
1784 /* Return TRUE if we could have modified the stencil and/or
1785 * depth buffers.
1786 */
1787 return modified_buffers;
1788 }
1789
1790
1791 /**
1792 * Generate depth and/or stencil test code.
1793 * \param cell context
1794 * \param dsa depth/stencil/alpha state
1795 * \param f spe function to emit
1796 * \param facing either CELL_FACING_FRONT or CELL_FACING_BACK
1797 * \param mask_reg register containing the pixel alive/dead mask
1798 * \param depth_tile_reg register containing address of z/stencil tile
1799 * \param quad_offset_reg offset to quad from start of tile
1800 * \param fragZ_reg register containg fragment Z values
1801 */
1802 static void
1803 gen_depth_stencil(struct cell_context *cell,
1804 const struct pipe_depth_stencil_alpha_state *dsa,
1805 struct spe_function *f,
1806 uint facing,
1807 int mask_reg,
1808 int depth_tile_reg,
1809 int quad_offset_reg,
1810 int fragZ_reg)
1811
1812 {
1813 const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
1814 boolean write_depth_stencil;
1815
1816 /* framebuffer's combined z/stencil values register */
1817 int fbZS_reg = spe_allocate_available_register(f);
1818
1819 /* Framebufer Z values register */
1820 int fbZ_reg = spe_allocate_available_register(f);
1821
1822 /* Framebuffer stencil values register (may not be used) */
1823 int fbS_reg = spe_allocate_available_register(f);
1824
1825 /* 24-bit mask register (may not be used) */
1826 int zmask_reg = spe_allocate_available_register(f);
1827
1828 /**
1829 * The following code:
1830 * 1. fetch quad of packed Z/S values from the framebuffer tile.
1831 * 2. extract the separate the Z and S values from packed values
1832 * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints
1833 *
1834 * The instructions for doing this are interleaved for better performance.
1835 */
1836 spe_comment(f, 0, "Fetch Z/stencil quad from tile");
1837
1838 switch(zs_format) {
1839 case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
1840 case PIPE_FORMAT_X8Z24_UNORM:
1841 /* prepare mask to extract Z vals from ZS vals */
1842 spe_load_uint(f, zmask_reg, 0x00ffffff);
1843
1844 /* convert fragment Z from [0,1] to 32-bit ints */
1845 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1846
1847 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1848 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1849
1850 /* right shift 32-bit fragment Z to 24 bits */
1851 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1852
1853 /* extract 24-bit Z values from ZS values by masking */
1854 spe_and(f, fbZ_reg, fbZS_reg, zmask_reg);
1855
1856 /* extract 8-bit stencil values by shifting */
1857 spe_rotmi(f, fbS_reg, fbZS_reg, -24);
1858 break;
1859
1860 case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
1861 case PIPE_FORMAT_Z24X8_UNORM:
1862 /* convert fragment Z from [0,1] to 32-bit ints */
1863 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1864
1865 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1866 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1867
1868 /* right shift 32-bit fragment Z to 24 bits */
1869 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1870
1871 /* extract 24-bit Z values from ZS values by shifting */
1872 spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
1873
1874 /* extract 8-bit stencil values by masking */
1875 spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
1876 break;
1877
1878 case PIPE_FORMAT_Z32_UNORM:
1879 /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */
1880 spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg);
1881
1882 /* convert fragment Z from [0,1] to 32-bit ints */
1883 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1884
1885 /* No stencil, so can't do anything there */
1886 break;
1887
1888 case PIPE_FORMAT_Z16_UNORM:
1889 /* XXX This code for 16bpp Z is broken! */
1890
1891 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1892 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1893
1894 /* Copy over 4 32-bit values */
1895 spe_move(f, fbZ_reg, fbZS_reg);
1896
1897 /* convert Z from [0,1] to 16-bit ints */
1898 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1899 spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
1900 /* No stencil */
1901 break;
1902
1903 default:
1904 ASSERT(0); /* invalid format */
1905 }
1906
1907 /* If stencil is enabled, use the stencil-specific code
1908 * generator to generate both the stencil and depth (if needed)
1909 * tests. Otherwise, if only depth is enabled, generate
1910 * a quick depth test. The test generators themselves will
1911 * report back whether the depth/stencil buffer has to be
1912 * written back.
1913 */
1914 if (dsa->stencil[0].enabled) {
1915 /* This will perform the stencil and depth tests, and update
1916 * the mask_reg, fbZ_reg, and fbS_reg as required by the
1917 * tests.
1918 */
1919 ASSERT(fbS_reg >= 0);
1920 spe_comment(f, 0, "Perform stencil test");
1921
1922 /* Note that fbZ_reg may not be set on entry, if stenciling
1923 * is enabled but there's no Z-buffer. The
1924 * gen_stencil_depth_test() function must ignore the
1925 * fbZ_reg register if depth is not enabled.
1926 */
1927 write_depth_stencil = gen_stencil_depth_test(f, dsa, facing,
1928 mask_reg, fragZ_reg,
1929 fbZ_reg, fbS_reg);
1930 }
1931 else if (dsa->depth.enabled) {
1932 int zmask_reg = spe_allocate_available_register(f);
1933 ASSERT(fbZ_reg >= 0);
1934 spe_comment(f, 0, "Perform depth test");
1935 write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg,
1936 fbZ_reg, zmask_reg);
1937 spe_release_register(f, zmask_reg);
1938 }
1939 else {
1940 write_depth_stencil = FALSE;
1941 }
1942
1943 if (write_depth_stencil) {
1944 /* Merge latest Z and Stencil values into fbZS_reg.
1945 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
1946 * fbS_reg has four 8-bit Z values in bits [7..0].
1947 */
1948 spe_comment(f, 0, "Store quad's depth/stencil values in tile");
1949 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
1950 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
1951 spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
1952 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
1953 }
1954 else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
1955 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
1956 spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
1957 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
1958 }
1959 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
1960 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
1961 }
1962 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
1963 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
1964 }
1965 else if (zs_format == PIPE_FORMAT_S8_UNORM) {
1966 ASSERT(0); /* XXX to do */
1967 }
1968 else {
1969 ASSERT(0); /* bad zs_format */
1970 }
1971
1972 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
1973 spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1974 }
1975
1976 /* Don't need these any more */
1977 spe_release_register(f, fbZS_reg);
1978 spe_release_register(f, fbZ_reg);
1979 spe_release_register(f, fbS_reg);
1980 spe_release_register(f, zmask_reg);
1981 }
1982
1983
1984
1985 /**
1986 * Generate SPE code to implement the fragment operations (alpha test,
1987 * depth test, stencil test, blending, colormask, and final
1988 * framebuffer write) as specified by the current context state.
1989 *
1990 * Logically, this code will be called after running the fragment
1991 * shader. But under some circumstances we could run some of this
1992 * code before the fragment shader to cull fragments/quads that are
1993 * totally occluded/discarded.
1994 *
1995 * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
1996 *
1997 * See the spu_default_fragment_ops() function to see how the per-fragment
1998 * operations would be done with ordinary C code.
1999 * The code we generate here though has no branches, is SIMD, etc and
2000 * should be much faster.
2001 *
2002 * \param cell the rendering context (in)
2003 * \param facing whether the generated code is for front-facing or
2004 * back-facing fragments
2005 * \param f the generated function (in/out); on input, the function
2006 * must already have been initialized. On exit, whatever
2007 * instructions within the generated function have had
2008 * the fragment ops appended.
2009 */
2010 void
2011 cell_gen_fragment_function(struct cell_context *cell,
2012 const uint facing,
2013 struct spe_function *f)
2014 {
2015 const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
2016 const struct pipe_blend_state *blend = cell->blend;
2017 const struct pipe_blend_color *blend_color = &cell->blend_color;
2018 const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
2019
2020 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
2021 const int x_reg = 3; /* uint */
2022 const int y_reg = 4; /* uint */
2023 const int color_tile_reg = 5; /* tile_t * */
2024 const int depth_tile_reg = 6; /* tile_t * */
2025 const int fragZ_reg = 7; /* vector float */
2026 const int fragR_reg = 8; /* vector float */
2027 const int fragG_reg = 9; /* vector float */
2028 const int fragB_reg = 10; /* vector float */
2029 const int fragA_reg = 11; /* vector float */
2030 const int mask_reg = 12; /* vector uint */
2031
2032 ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
2033
2034 /* offset of quad from start of tile
2035 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
2036 */
2037 int quad_offset_reg;
2038
2039 int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
2040
2041 if (cell->debug_flags & CELL_DEBUG_ASM) {
2042 spe_print_code(f, TRUE);
2043 spe_indent(f, 8);
2044 spe_comment(f, -4, facing == CELL_FACING_FRONT
2045 ? "Begin front-facing per-fragment ops"
2046 : "Begin back-facing per-fragment ops");
2047 }
2048
2049 spe_allocate_register(f, x_reg);
2050 spe_allocate_register(f, y_reg);
2051 spe_allocate_register(f, color_tile_reg);
2052 spe_allocate_register(f, depth_tile_reg);
2053 spe_allocate_register(f, fragZ_reg);
2054 spe_allocate_register(f, fragR_reg);
2055 spe_allocate_register(f, fragG_reg);
2056 spe_allocate_register(f, fragB_reg);
2057 spe_allocate_register(f, fragA_reg);
2058 spe_allocate_register(f, mask_reg);
2059
2060 quad_offset_reg = spe_allocate_available_register(f);
2061 fbRGBA_reg = spe_allocate_available_register(f);
2062
2063 /* compute offset of quad from start of tile, in bytes */
2064 {
2065 int x2_reg = spe_allocate_available_register(f);
2066 int y2_reg = spe_allocate_available_register(f);
2067
2068 ASSERT(TILE_SIZE == 32);
2069
2070 spe_comment(f, 0, "Compute quad offset within tile");
2071 spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
2072 spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
2073 spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
2074 spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
2075 spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
2076
2077 spe_release_register(f, x2_reg);
2078 spe_release_register(f, y2_reg);
2079 }
2080
2081 /* Generate the alpha test, if needed. */
2082 if (dsa->alpha.enabled) {
2083 gen_alpha_test(dsa, f, mask_reg, fragA_reg);
2084 }
2085
2086 /* generate depth and/or stencil test code */
2087 if (dsa->depth.enabled || dsa->stencil[0].enabled) {
2088 gen_depth_stencil(cell, dsa, f,
2089 facing,
2090 mask_reg,
2091 depth_tile_reg,
2092 quad_offset_reg,
2093 fragZ_reg);
2094 }
2095
2096 /* Get framebuffer quad/colors. We'll need these for blending,
2097 * color masking, and to obey the quad/pixel mask.
2098 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
2099 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
2100 * we could skip this load.
2101 */
2102 spe_comment(f, 0, "Fetch quad colors from tile");
2103 spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
2104
2105 if (blend->blend_enable) {
2106 spe_comment(f, 0, "Perform blending");
2107 gen_blend(blend, blend_color, f, color_format,
2108 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
2109 }
2110
2111 /*
2112 * Write fragment colors to framebuffer/tile.
2113 * This involves converting the fragment colors from float[4] to the
2114 * tile's specific format and obeying the quad/pixel mask.
2115 */
2116 {
2117 int rgba_reg = spe_allocate_available_register(f);
2118
2119 /* Pack four float colors as four 32-bit int colors */
2120 spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
2121 gen_pack_colors(f, color_format,
2122 fragR_reg, fragG_reg, fragB_reg, fragA_reg,
2123 rgba_reg);
2124
2125 if (blend->logicop_enable) {
2126 spe_comment(f, 0, "Compute logic op");
2127 gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
2128 }
2129
2130 if (blend->colormask != PIPE_MASK_RGBA) {
2131 spe_comment(f, 0, "Compute color mask");
2132 gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
2133 }
2134
2135 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
2136 * if (mask[i])
2137 * rgba[i] = rgba[i];
2138 * else
2139 * rgba[i] = framebuffer[i];
2140 */
2141 spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
2142
2143 /* Store updated quad in tile:
2144 * memory[color_tile + quad_offset] = rgba_reg;
2145 */
2146 spe_comment(f, 0, "Store quad colors into color tile");
2147 spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
2148
2149 spe_release_register(f, rgba_reg);
2150 }
2151
2152 //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
2153
2154 spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
2155
2156 spe_release_register(f, fbRGBA_reg);
2157 spe_release_register(f, quad_offset_reg);
2158
2159 if (cell->debug_flags & CELL_DEBUG_ASM) {
2160 char buffer[1024];
2161 sprintf(buffer, "End %s-facing per-fragment ops: %d instructions",
2162 facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst);
2163 spe_comment(f, -4, buffer);
2164 }
2165 }