Merge remote branch 'origin/7.8'
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fragment.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009 VMware, Inc. All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * Generate SPU per-fragment code (actually per-quad code).
31 * \author Brian Paul
32 * \author Bob Ellison
33 */
34
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
41
42
43
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
46
47
48 /**
49 * Generate SPE code to perform Z/depth testing.
50 *
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
57 *
58 * Returns TRUE if the Z-buffer needs to be updated.
59 */
60 static boolean
61 gen_depth_test(struct spe_function *f,
62 const struct pipe_depth_stencil_alpha_state *dsa,
63 int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
64 {
65 /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
66 * quantities. This only makes a difference for 32-bit Z values though.
67 */
68 ASSERT(dsa->depth.enabled);
69
70 switch (dsa->depth.func) {
71 case PIPE_FUNC_EQUAL:
72 /* zmask = (ifragZ == ref) */
73 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
74 /* mask = (mask & zmask) */
75 spe_and(f, mask_reg, mask_reg, zmask_reg);
76 break;
77
78 case PIPE_FUNC_NOTEQUAL:
79 /* zmask = (ifragZ == ref) */
80 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
81 /* mask = (mask & ~zmask) */
82 spe_andc(f, mask_reg, mask_reg, zmask_reg);
83 break;
84
85 case PIPE_FUNC_GREATER:
86 /* zmask = (ifragZ > ref) */
87 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
88 /* mask = (mask & zmask) */
89 spe_and(f, mask_reg, mask_reg, zmask_reg);
90 break;
91
92 case PIPE_FUNC_LESS:
93 /* zmask = (ref > ifragZ) */
94 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
95 /* mask = (mask & zmask) */
96 spe_and(f, mask_reg, mask_reg, zmask_reg);
97 break;
98
99 case PIPE_FUNC_LEQUAL:
100 /* zmask = (ifragZ > ref) */
101 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
102 /* mask = (mask & ~zmask) */
103 spe_andc(f, mask_reg, mask_reg, zmask_reg);
104 break;
105
106 case PIPE_FUNC_GEQUAL:
107 /* zmask = (ref > ifragZ) */
108 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
109 /* mask = (mask & ~zmask) */
110 spe_andc(f, mask_reg, mask_reg, zmask_reg);
111 break;
112
113 case PIPE_FUNC_NEVER:
114 spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */
115 spe_move(f, zmask_reg, mask_reg); /* zmask = mask */
116 break;
117
118 case PIPE_FUNC_ALWAYS:
119 /* mask unchanged */
120 spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */
121 break;
122
123 default:
124 ASSERT(0);
125 break;
126 }
127
128 if (dsa->depth.writemask) {
129 /*
130 * If (ztest passed) {
131 * framebufferZ = fragmentZ;
132 * }
133 * OR,
134 * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
135 */
136 spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
137 return TRUE;
138 }
139
140 return FALSE;
141 }
142
143
144 /**
145 * Generate SPE code to perform alpha testing.
146 *
147 * \param dsa Gallium depth/stencil/alpha state to gen code for
148 * \param f SPE function to append instruction onto.
149 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
150 * \param fragA_reg register containing four fragment alpha values (in)
151 */
152 static void
153 gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
154 struct spe_function *f, int mask_reg, int fragA_reg)
155 {
156 int ref_reg = spe_allocate_available_register(f);
157 int amask_reg = spe_allocate_available_register(f);
158
159 ASSERT(dsa->alpha.enabled);
160
161 if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
162 (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
163 /* load/splat the alpha reference float value */
164 spe_load_float(f, ref_reg, dsa->alpha.ref_value);
165 }
166
167 /* emit code to do the alpha comparison, updating 'mask' */
168 switch (dsa->alpha.func) {
169 case PIPE_FUNC_EQUAL:
170 /* amask = (fragA == ref) */
171 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
172 /* mask = (mask & amask) */
173 spe_and(f, mask_reg, mask_reg, amask_reg);
174 break;
175
176 case PIPE_FUNC_NOTEQUAL:
177 /* amask = (fragA == ref) */
178 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
179 /* mask = (mask & ~amask) */
180 spe_andc(f, mask_reg, mask_reg, amask_reg);
181 break;
182
183 case PIPE_FUNC_GREATER:
184 /* amask = (fragA > ref) */
185 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
186 /* mask = (mask & amask) */
187 spe_and(f, mask_reg, mask_reg, amask_reg);
188 break;
189
190 case PIPE_FUNC_LESS:
191 /* amask = (ref > fragA) */
192 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
193 /* mask = (mask & amask) */
194 spe_and(f, mask_reg, mask_reg, amask_reg);
195 break;
196
197 case PIPE_FUNC_LEQUAL:
198 /* amask = (fragA > ref) */
199 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
200 /* mask = (mask & ~amask) */
201 spe_andc(f, mask_reg, mask_reg, amask_reg);
202 break;
203
204 case PIPE_FUNC_GEQUAL:
205 /* amask = (ref > fragA) */
206 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
207 /* mask = (mask & ~amask) */
208 spe_andc(f, mask_reg, mask_reg, amask_reg);
209 break;
210
211 case PIPE_FUNC_NEVER:
212 spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */
213 break;
214
215 case PIPE_FUNC_ALWAYS:
216 /* no-op, mask unchanged */
217 break;
218
219 default:
220 ASSERT(0);
221 break;
222 }
223
224 #if OPTIMIZATIONS
225 /* if mask == {0,0,0,0} we're all done, return */
226 {
227 /* re-use amask reg here */
228 int tmp_reg = amask_reg;
229 /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
230 spe_orx(f, tmp_reg, mask_reg);
231 /* if tmp[0] == 0 then return from function call */
232 spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
233 }
234 #endif
235
236 spe_release_register(f, ref_reg);
237 spe_release_register(f, amask_reg);
238 }
239
240
241 /**
242 * This pair of functions is used inline to allocate and deallocate
243 * optional constant registers. Once a constant is discovered to be
244 * needed, we will likely need it again, so we don't want to deallocate
245 * it and have to allocate and load it again unnecessarily.
246 */
247 static INLINE void
248 setup_optional_register(struct spe_function *f,
249 int *r)
250 {
251 if (*r < 0)
252 *r = spe_allocate_available_register(f);
253 }
254
255 static INLINE void
256 release_optional_register(struct spe_function *f,
257 int r)
258 {
259 if (r >= 0)
260 spe_release_register(f, r);
261 }
262
263 static INLINE void
264 setup_const_register(struct spe_function *f,
265 int *r,
266 float value)
267 {
268 if (*r >= 0)
269 return;
270 setup_optional_register(f, r);
271 spe_load_float(f, *r, value);
272 }
273
274 static INLINE void
275 release_const_register(struct spe_function *f,
276 int r)
277 {
278 release_optional_register(f, r);
279 }
280
281
282
283 /**
284 * Unpack/convert framebuffer colors from four 32-bit packed colors
285 * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
286 * Each 8-bit color component is expanded into a float in [0.0, 1.0].
287 */
288 static void
289 unpack_colors(struct spe_function *f,
290 enum pipe_format color_format,
291 int fbRGBA_reg,
292 int fbR_reg, int fbG_reg, int fbB_reg, int fbA_reg)
293 {
294 int mask0_reg = spe_allocate_available_register(f);
295 int mask1_reg = spe_allocate_available_register(f);
296 int mask2_reg = spe_allocate_available_register(f);
297 int mask3_reg = spe_allocate_available_register(f);
298
299 spe_load_int(f, mask0_reg, 0xff);
300 spe_load_int(f, mask1_reg, 0xff00);
301 spe_load_int(f, mask2_reg, 0xff0000);
302 spe_load_int(f, mask3_reg, 0xff000000);
303
304 spe_comment(f, 0, "Unpack framebuffer colors, convert to floats");
305
306 switch (color_format) {
307 case PIPE_FORMAT_B8G8R8A8_UNORM:
308 /* fbB = fbRGBA & mask */
309 spe_and(f, fbB_reg, fbRGBA_reg, mask0_reg);
310
311 /* fbG = fbRGBA & mask */
312 spe_and(f, fbG_reg, fbRGBA_reg, mask1_reg);
313
314 /* fbR = fbRGBA & mask */
315 spe_and(f, fbR_reg, fbRGBA_reg, mask2_reg);
316
317 /* fbA = fbRGBA & mask */
318 spe_and(f, fbA_reg, fbRGBA_reg, mask3_reg);
319
320 /* fbG = fbG >> 8 */
321 spe_roti(f, fbG_reg, fbG_reg, -8);
322
323 /* fbR = fbR >> 16 */
324 spe_roti(f, fbR_reg, fbR_reg, -16);
325
326 /* fbA = fbA >> 24 */
327 spe_roti(f, fbA_reg, fbA_reg, -24);
328 break;
329
330 case PIPE_FORMAT_A8R8G8B8_UNORM:
331 /* fbA = fbRGBA & mask */
332 spe_and(f, fbA_reg, fbRGBA_reg, mask0_reg);
333
334 /* fbR = fbRGBA & mask */
335 spe_and(f, fbR_reg, fbRGBA_reg, mask1_reg);
336
337 /* fbG = fbRGBA & mask */
338 spe_and(f, fbG_reg, fbRGBA_reg, mask2_reg);
339
340 /* fbB = fbRGBA & mask */
341 spe_and(f, fbB_reg, fbRGBA_reg, mask3_reg);
342
343 /* fbR = fbR >> 8 */
344 spe_roti(f, fbR_reg, fbR_reg, -8);
345
346 /* fbG = fbG >> 16 */
347 spe_roti(f, fbG_reg, fbG_reg, -16);
348
349 /* fbB = fbB >> 24 */
350 spe_roti(f, fbB_reg, fbB_reg, -24);
351 break;
352
353 default:
354 ASSERT(0);
355 }
356
357 /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
358 spe_cuflt(f, fbR_reg, fbR_reg, 8);
359 spe_cuflt(f, fbG_reg, fbG_reg, 8);
360 spe_cuflt(f, fbB_reg, fbB_reg, 8);
361 spe_cuflt(f, fbA_reg, fbA_reg, 8);
362
363 spe_release_register(f, mask0_reg);
364 spe_release_register(f, mask1_reg);
365 spe_release_register(f, mask2_reg);
366 spe_release_register(f, mask3_reg);
367 }
368
369
370 /**
371 * Generate SPE code to implement the given blend mode for a quad of pixels.
372 * \param f SPE function to append instruction onto.
373 * \param fragR_reg register with fragment red values (float) (in/out)
374 * \param fragG_reg register with fragment green values (float) (in/out)
375 * \param fragB_reg register with fragment blue values (float) (in/out)
376 * \param fragA_reg register with fragment alpha values (float) (in/out)
377 * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
378 */
379 static void
380 gen_blend(const struct pipe_blend_state *blend,
381 const struct pipe_blend_color *blend_color,
382 struct spe_function *f,
383 enum pipe_format color_format,
384 int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
385 int fbRGBA_reg)
386 {
387 int term1R_reg = spe_allocate_available_register(f);
388 int term1G_reg = spe_allocate_available_register(f);
389 int term1B_reg = spe_allocate_available_register(f);
390 int term1A_reg = spe_allocate_available_register(f);
391
392 int term2R_reg = spe_allocate_available_register(f);
393 int term2G_reg = spe_allocate_available_register(f);
394 int term2B_reg = spe_allocate_available_register(f);
395 int term2A_reg = spe_allocate_available_register(f);
396
397 int fbR_reg = spe_allocate_available_register(f);
398 int fbG_reg = spe_allocate_available_register(f);
399 int fbB_reg = spe_allocate_available_register(f);
400 int fbA_reg = spe_allocate_available_register(f);
401
402 int tmp_reg = spe_allocate_available_register(f);
403
404 /* Optional constant registers we might or might not end up using;
405 * if we do use them, make sure we only allocate them once by
406 * keeping a flag on each one.
407 */
408 int one_reg = -1;
409 int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1;
410
411 ASSERT(blend->rt[0].blend_enable);
412
413 /* packed RGBA -> float colors */
414 unpack_colors(f, color_format, fbRGBA_reg,
415 fbR_reg, fbG_reg, fbB_reg, fbA_reg);
416
417 /*
418 * Compute Src RGB terms. We're actually looking for the value
419 * of (the appropriate RGB factors) * (the incoming source RGB color),
420 * because in some cases (like PIPE_BLENDFACTOR_ONE and
421 * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
422 */
423 switch (blend->rt[0].rgb_src_factor) {
424 case PIPE_BLENDFACTOR_ONE:
425 /* factors = (1,1,1), so term = (R,G,B) */
426 spe_move(f, term1R_reg, fragR_reg);
427 spe_move(f, term1G_reg, fragG_reg);
428 spe_move(f, term1B_reg, fragB_reg);
429 break;
430 case PIPE_BLENDFACTOR_ZERO:
431 /* factors = (0,0,0), so term = (0,0,0) */
432 spe_load_float(f, term1R_reg, 0.0f);
433 spe_load_float(f, term1G_reg, 0.0f);
434 spe_load_float(f, term1B_reg, 0.0f);
435 break;
436 case PIPE_BLENDFACTOR_SRC_COLOR:
437 /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
438 spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
439 spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
440 spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
441 break;
442 case PIPE_BLENDFACTOR_SRC_ALPHA:
443 /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
444 spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
445 spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
446 spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
447 break;
448 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
449 /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
450 * or in other words term = (R-R*R, G-G*G, B-B*B)
451 * fnms(a,b,c,d) computes a = d - b*c
452 */
453 spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
454 spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
455 spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
456 break;
457 case PIPE_BLENDFACTOR_DST_COLOR:
458 /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
459 spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
460 spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
461 spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
462 break;
463 case PIPE_BLENDFACTOR_INV_DST_COLOR:
464 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
465 * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
466 * fnms(a,b,c,d) computes a = d - b*c
467 */
468 spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
469 spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
470 spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
471 break;
472 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
473 /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
474 * or term = (R-R*A,G-G*A,B-B*A)
475 * fnms(a,b,c,d) computes a = d - b*c
476 */
477 spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
478 spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
479 spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
480 break;
481 case PIPE_BLENDFACTOR_DST_ALPHA:
482 /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
483 spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
484 spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
485 spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
486 break;
487 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
488 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
489 * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
490 * fnms(a,b,c,d) computes a = d - b*c
491 */
492 spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
493 spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
494 spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
495 break;
496 case PIPE_BLENDFACTOR_CONST_COLOR:
497 /* We need the optional constant color registers */
498 setup_const_register(f, &constR_reg, blend_color->color[0]);
499 setup_const_register(f, &constG_reg, blend_color->color[1]);
500 setup_const_register(f, &constB_reg, blend_color->color[2]);
501 /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
502 spe_fm(f, term1R_reg, fragR_reg, constR_reg);
503 spe_fm(f, term1G_reg, fragG_reg, constG_reg);
504 spe_fm(f, term1B_reg, fragB_reg, constB_reg);
505 break;
506 case PIPE_BLENDFACTOR_CONST_ALPHA:
507 /* we'll need the optional constant alpha register */
508 setup_const_register(f, &constA_reg, blend_color->color[3]);
509 /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
510 spe_fm(f, term1R_reg, fragR_reg, constA_reg);
511 spe_fm(f, term1G_reg, fragG_reg, constA_reg);
512 spe_fm(f, term1B_reg, fragB_reg, constA_reg);
513 break;
514 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
515 /* We need the optional constant color registers */
516 setup_const_register(f, &constR_reg, blend_color->color[0]);
517 setup_const_register(f, &constG_reg, blend_color->color[1]);
518 setup_const_register(f, &constB_reg, blend_color->color[2]);
519 /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
520 * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
521 * fnms(a,b,c,d) computes a = d - b*c
522 */
523 spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
524 spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
525 spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
526 break;
527 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
528 /* We need the optional constant color registers */
529 setup_const_register(f, &constR_reg, blend_color->color[0]);
530 setup_const_register(f, &constG_reg, blend_color->color[1]);
531 setup_const_register(f, &constB_reg, blend_color->color[2]);
532 /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
533 * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
534 * fnms(a,b,c,d) computes a = d - b*c
535 */
536 spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
537 spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
538 spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
539 break;
540 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
541 /* We'll need the optional {1,1,1,1} register */
542 setup_const_register(f, &one_reg, 1.0f);
543 /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
544 * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
545 * We could expand the term (as a*min(b,c) == min(a*b,a*c)
546 * as long as a is positive), but then we'd have to do three
547 * spe_float_min() functions instead of one, so this is simpler.
548 */
549 /* tmp = 1 - Afb */
550 spe_fs(f, tmp_reg, one_reg, fbA_reg);
551 /* tmp = min(A,tmp) */
552 spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
553 /* term = R*tmp */
554 spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
555 spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
556 spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
557 break;
558
559 /* These are special D3D cases involving a second color output
560 * from the fragment shader. I'm not sure we can support them
561 * yet... XXX
562 */
563 case PIPE_BLENDFACTOR_SRC1_COLOR:
564 case PIPE_BLENDFACTOR_SRC1_ALPHA:
565 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
566 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
567
568 default:
569 ASSERT(0);
570 }
571
572 /*
573 * Compute Src Alpha term. Like the above, we're looking for
574 * the full term A*factor, not just the factor itself, because
575 * in many cases we can avoid doing unnecessary multiplies.
576 */
577 switch (blend->rt[0].alpha_src_factor) {
578 case PIPE_BLENDFACTOR_ZERO:
579 /* factor = 0, so term = 0 */
580 spe_load_float(f, term1A_reg, 0.0f);
581 break;
582
583 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
584 case PIPE_BLENDFACTOR_ONE:
585 /* factor = 1, so term = A */
586 spe_move(f, term1A_reg, fragA_reg);
587 break;
588
589 case PIPE_BLENDFACTOR_SRC_COLOR:
590 /* factor = A, so term = A*A */
591 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
592 break;
593 case PIPE_BLENDFACTOR_SRC_ALPHA:
594 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
595 break;
596
597 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
598 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
599 /* factor = 1-A, so term = A*(1-A) = A-A*A */
600 /* fnms(a,b,c,d) computes a = d - b*c */
601 spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
602 break;
603
604 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
605 case PIPE_BLENDFACTOR_DST_COLOR:
606 /* factor = Afb, so term = A*Afb */
607 spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
608 break;
609
610 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
611 case PIPE_BLENDFACTOR_INV_DST_COLOR:
612 /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
613 /* fnms(a,b,c,d) computes a = d - b*c */
614 spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
615 break;
616
617 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
618 case PIPE_BLENDFACTOR_CONST_COLOR:
619 /* We need the optional constA_reg register */
620 setup_const_register(f, &constA_reg, blend_color->color[3]);
621 /* factor = Ac, so term = A*Ac */
622 spe_fm(f, term1A_reg, fragA_reg, constA_reg);
623 break;
624
625 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
626 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
627 /* We need the optional constA_reg register */
628 setup_const_register(f, &constA_reg, blend_color->color[3]);
629 /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
630 /* fnms(a,b,c,d) computes a = d - b*c */
631 spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
632 break;
633
634 /* These are special D3D cases involving a second color output
635 * from the fragment shader. I'm not sure we can support them
636 * yet... XXX
637 */
638 case PIPE_BLENDFACTOR_SRC1_COLOR:
639 case PIPE_BLENDFACTOR_SRC1_ALPHA:
640 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
641 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
642 default:
643 ASSERT(0);
644 }
645
646 /*
647 * Compute Dest RGB term. Like the above, we're looking for
648 * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
649 * in many cases we can avoid doing unnecessary multiplies.
650 */
651 switch (blend->rt[0].rgb_dst_factor) {
652 case PIPE_BLENDFACTOR_ONE:
653 /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
654 spe_move(f, term2R_reg, fbR_reg);
655 spe_move(f, term2G_reg, fbG_reg);
656 spe_move(f, term2B_reg, fbB_reg);
657 break;
658 case PIPE_BLENDFACTOR_ZERO:
659 /* factor s= (0,0,0), so term = (0,0,0) */
660 spe_load_float(f, term2R_reg, 0.0f);
661 spe_load_float(f, term2G_reg, 0.0f);
662 spe_load_float(f, term2B_reg, 0.0f);
663 break;
664 case PIPE_BLENDFACTOR_SRC_COLOR:
665 /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
666 spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
667 spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
668 spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
669 break;
670 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
671 /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
672 * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
673 * fnms(a,b,c,d) computes a = d - b*c
674 */
675 spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
676 spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
677 spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
678 break;
679 case PIPE_BLENDFACTOR_SRC_ALPHA:
680 /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
681 spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
682 spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
683 spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
684 break;
685 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
686 /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
687 /* fnms(a,b,c,d) computes a = d - b*c */
688 spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
689 spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
690 spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
691 break;
692 case PIPE_BLENDFACTOR_DST_COLOR:
693 /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
694 spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
695 spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
696 spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
697 break;
698 case PIPE_BLENDFACTOR_INV_DST_COLOR:
699 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
700 * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
701 * fnms(a,b,c,d) computes a = d - b*c
702 */
703 spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
704 spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
705 spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
706 break;
707
708 case PIPE_BLENDFACTOR_DST_ALPHA:
709 /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
710 spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
711 spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
712 spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
713 break;
714 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
715 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
716 * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
717 * fnms(a,b,c,d) computes a = d - b*c
718 */
719 spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
720 spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
721 spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
722 break;
723 case PIPE_BLENDFACTOR_CONST_COLOR:
724 /* We need the optional constant color registers */
725 setup_const_register(f, &constR_reg, blend_color->color[0]);
726 setup_const_register(f, &constG_reg, blend_color->color[1]);
727 setup_const_register(f, &constB_reg, blend_color->color[2]);
728 /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
729 spe_fm(f, term2R_reg, fbR_reg, constR_reg);
730 spe_fm(f, term2G_reg, fbG_reg, constG_reg);
731 spe_fm(f, term2B_reg, fbB_reg, constB_reg);
732 break;
733 case PIPE_BLENDFACTOR_CONST_ALPHA:
734 /* we'll need the optional constant alpha register */
735 setup_const_register(f, &constA_reg, blend_color->color[3]);
736 /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
737 spe_fm(f, term2R_reg, fbR_reg, constA_reg);
738 spe_fm(f, term2G_reg, fbG_reg, constA_reg);
739 spe_fm(f, term2B_reg, fbB_reg, constA_reg);
740 break;
741 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
742 /* We need the optional constant color registers */
743 setup_const_register(f, &constR_reg, blend_color->color[0]);
744 setup_const_register(f, &constG_reg, blend_color->color[1]);
745 setup_const_register(f, &constB_reg, blend_color->color[2]);
746 /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
747 * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
748 * fnms(a,b,c,d) computes a = d - b*c
749 */
750 spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
751 spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
752 spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
753 break;
754 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
755 /* We need the optional constant color registers */
756 setup_const_register(f, &constR_reg, blend_color->color[0]);
757 setup_const_register(f, &constG_reg, blend_color->color[1]);
758 setup_const_register(f, &constB_reg, blend_color->color[2]);
759 /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
760 * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
761 * fnms(a,b,c,d) computes a = d - b*c
762 */
763 spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
764 spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
765 spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
766 break;
767 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
768 ASSERT(0);
769 break;
770
771 /* These are special D3D cases involving a second color output
772 * from the fragment shader. I'm not sure we can support them
773 * yet... XXX
774 */
775 case PIPE_BLENDFACTOR_SRC1_COLOR:
776 case PIPE_BLENDFACTOR_SRC1_ALPHA:
777 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
778 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
779
780 default:
781 ASSERT(0);
782 }
783
784 /*
785 * Compute Dest Alpha term. Like the above, we're looking for
786 * the full term Afb*factor, not just the factor itself, because
787 * in many cases we can avoid doing unnecessary multiplies.
788 */
789 switch (blend->rt[0].alpha_dst_factor) {
790 case PIPE_BLENDFACTOR_ONE:
791 /* factor = 1, so term = Afb */
792 spe_move(f, term2A_reg, fbA_reg);
793 break;
794 case PIPE_BLENDFACTOR_ZERO:
795 /* factor = 0, so term = 0 */
796 spe_load_float(f, term2A_reg, 0.0f);
797 break;
798
799 case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
800 case PIPE_BLENDFACTOR_SRC_COLOR:
801 /* factor = A, so term = Afb*A */
802 spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
803 break;
804
805 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
806 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
807 /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
808 /* fnms(a,b,c,d) computes a = d - b*c */
809 spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
810 break;
811
812 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
813 case PIPE_BLENDFACTOR_DST_COLOR:
814 /* factor = Afb, so term = Afb*Afb */
815 spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
816 break;
817
818 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
819 case PIPE_BLENDFACTOR_INV_DST_COLOR:
820 /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
821 /* fnms(a,b,c,d) computes a = d - b*c */
822 spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
823 break;
824
825 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
826 case PIPE_BLENDFACTOR_CONST_COLOR:
827 /* We need the optional constA_reg register */
828 setup_const_register(f, &constA_reg, blend_color->color[3]);
829 /* factor = Ac, so term = Afb*Ac */
830 spe_fm(f, term2A_reg, fbA_reg, constA_reg);
831 break;
832
833 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
834 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
835 /* We need the optional constA_reg register */
836 setup_const_register(f, &constA_reg, blend_color->color[3]);
837 /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
838 /* fnms(a,b,c,d) computes a = d - b*c */
839 spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
840 break;
841
842 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
843 ASSERT(0);
844 break;
845
846 /* These are special D3D cases involving a second color output
847 * from the fragment shader. I'm not sure we can support them
848 * yet... XXX
849 */
850 case PIPE_BLENDFACTOR_SRC1_COLOR:
851 case PIPE_BLENDFACTOR_SRC1_ALPHA:
852 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
853 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
854 default:
855 ASSERT(0);
856 }
857
858 /*
859 * Combine Src/Dest RGB terms as per the blend equation.
860 */
861 switch (blend->rt[0].rgb_func) {
862 case PIPE_BLEND_ADD:
863 spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
864 spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
865 spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
866 break;
867 case PIPE_BLEND_SUBTRACT:
868 spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
869 spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
870 spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
871 break;
872 case PIPE_BLEND_REVERSE_SUBTRACT:
873 spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
874 spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
875 spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
876 break;
877 case PIPE_BLEND_MIN:
878 spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
879 spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
880 spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
881 break;
882 case PIPE_BLEND_MAX:
883 spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
884 spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
885 spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
886 break;
887 default:
888 ASSERT(0);
889 }
890
891 /*
892 * Combine Src/Dest A term
893 */
894 switch (blend->rt[0].alpha_func) {
895 case PIPE_BLEND_ADD:
896 spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
897 break;
898 case PIPE_BLEND_SUBTRACT:
899 spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
900 break;
901 case PIPE_BLEND_REVERSE_SUBTRACT:
902 spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
903 break;
904 case PIPE_BLEND_MIN:
905 spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
906 break;
907 case PIPE_BLEND_MAX:
908 spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
909 break;
910 default:
911 ASSERT(0);
912 }
913
914 spe_release_register(f, term1R_reg);
915 spe_release_register(f, term1G_reg);
916 spe_release_register(f, term1B_reg);
917 spe_release_register(f, term1A_reg);
918
919 spe_release_register(f, term2R_reg);
920 spe_release_register(f, term2G_reg);
921 spe_release_register(f, term2B_reg);
922 spe_release_register(f, term2A_reg);
923
924 spe_release_register(f, fbR_reg);
925 spe_release_register(f, fbG_reg);
926 spe_release_register(f, fbB_reg);
927 spe_release_register(f, fbA_reg);
928
929 spe_release_register(f, tmp_reg);
930
931 /* Free any optional registers that actually got used */
932 release_const_register(f, one_reg);
933 release_const_register(f, constR_reg);
934 release_const_register(f, constG_reg);
935 release_const_register(f, constB_reg);
936 release_const_register(f, constA_reg);
937 }
938
939
940 static void
941 gen_logicop(const struct pipe_blend_state *blend,
942 struct spe_function *f,
943 int fragRGBA_reg, int fbRGBA_reg)
944 {
945 /* We've got four 32-bit RGBA packed pixels in each of
946 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
947 * reds, greens, blues, and alphas.
948 * */
949 ASSERT(blend->logicop_enable);
950
951 switch(blend->logicop_func) {
952 case PIPE_LOGICOP_CLEAR: /* 0 */
953 spe_zero(f, fragRGBA_reg);
954 break;
955 case PIPE_LOGICOP_NOR: /* ~(s | d) */
956 spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
957 break;
958 case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
959 /* andc R, A, B computes R = A & ~B */
960 spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
961 break;
962 case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
963 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
964 break;
965 case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
966 /* andc R, A, B computes R = A & ~B */
967 spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
968 break;
969 case PIPE_LOGICOP_INVERT: /* ~d */
970 /* Note that (A nor A) == ~(A|A) == ~A */
971 spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
972 break;
973 case PIPE_LOGICOP_XOR: /* s ^ d */
974 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
975 break;
976 case PIPE_LOGICOP_NAND: /* ~(s & d) */
977 spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
978 break;
979 case PIPE_LOGICOP_AND: /* s & d */
980 spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
981 break;
982 case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
983 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
984 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
985 break;
986 case PIPE_LOGICOP_NOOP: /* d */
987 spe_move(f, fragRGBA_reg, fbRGBA_reg);
988 break;
989 case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
990 /* orc R, A, B computes R = A | ~B */
991 spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
992 break;
993 case PIPE_LOGICOP_COPY: /* s */
994 break;
995 case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
996 /* orc R, A, B computes R = A | ~B */
997 spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
998 break;
999 case PIPE_LOGICOP_OR: /* s | d */
1000 spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
1001 break;
1002 case PIPE_LOGICOP_SET: /* 1 */
1003 spe_load_int(f, fragRGBA_reg, 0xffffffff);
1004 break;
1005 default:
1006 ASSERT(0);
1007 }
1008 }
1009
1010
1011 /**
1012 * Generate code to pack a quad of float colors into four 32-bit integers.
1013 *
1014 * \param f SPE function to append instruction onto.
1015 * \param color_format the dest color packing format
1016 * \param r_reg register containing four red values (in/clobbered)
1017 * \param g_reg register containing four green values (in/clobbered)
1018 * \param b_reg register containing four blue values (in/clobbered)
1019 * \param a_reg register containing four alpha values (in/clobbered)
1020 * \param rgba_reg register to store the packed RGBA colors (out)
1021 */
1022 static void
1023 gen_pack_colors(struct spe_function *f,
1024 enum pipe_format color_format,
1025 int r_reg, int g_reg, int b_reg, int a_reg,
1026 int rgba_reg)
1027 {
1028 int rg_reg = spe_allocate_available_register(f);
1029 int ba_reg = spe_allocate_available_register(f);
1030
1031 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
1032 spe_cfltu(f, r_reg, r_reg, 32);
1033 spe_cfltu(f, g_reg, g_reg, 32);
1034 spe_cfltu(f, b_reg, b_reg, 32);
1035 spe_cfltu(f, a_reg, a_reg, 32);
1036
1037 /* Shift the most significant bytes to the least significant positions.
1038 * I.e.: reg = reg >> 24
1039 */
1040 spe_rotmi(f, r_reg, r_reg, -24);
1041 spe_rotmi(f, g_reg, g_reg, -24);
1042 spe_rotmi(f, b_reg, b_reg, -24);
1043 spe_rotmi(f, a_reg, a_reg, -24);
1044
1045 /* Shift the color bytes according to the surface format */
1046 if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1047 spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */
1048 spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */
1049 spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */
1050 }
1051 else if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
1052 spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */
1053 spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */
1054 spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */
1055 }
1056 else {
1057 ASSERT(0);
1058 }
1059
1060 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
1061 * Eg: after shifting according to color_format we might have:
1062 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
1063 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
1064 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
1065 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
1066 * OR-ing all those together gives us four packed colors:
1067 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
1068 */
1069 spe_or(f, rg_reg, r_reg, g_reg);
1070 spe_or(f, ba_reg, a_reg, b_reg);
1071 spe_or(f, rgba_reg, rg_reg, ba_reg);
1072
1073 spe_release_register(f, rg_reg);
1074 spe_release_register(f, ba_reg);
1075 }
1076
1077
1078 static void
1079 gen_colormask(struct spe_function *f,
1080 uint colormask,
1081 enum pipe_format color_format,
1082 int fragRGBA_reg, int fbRGBA_reg)
1083 {
1084 /* We've got four 32-bit RGBA packed pixels in each of
1085 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
1086 * reds, greens, blues, and alphas. Further, the pixels
1087 * are packed according to the given color format, not
1088 * necessarily RGBA...
1089 */
1090 uint r_mask;
1091 uint g_mask;
1092 uint b_mask;
1093 uint a_mask;
1094
1095 /* Calculate exactly where the bits for any particular color
1096 * end up, so we can mask them correctly.
1097 */
1098 switch(color_format) {
1099 case PIPE_FORMAT_B8G8R8A8_UNORM:
1100 /* ARGB */
1101 a_mask = 0xff000000;
1102 r_mask = 0x00ff0000;
1103 g_mask = 0x0000ff00;
1104 b_mask = 0x000000ff;
1105 break;
1106 case PIPE_FORMAT_A8R8G8B8_UNORM:
1107 /* BGRA */
1108 b_mask = 0xff000000;
1109 g_mask = 0x00ff0000;
1110 r_mask = 0x0000ff00;
1111 a_mask = 0x000000ff;
1112 break;
1113 default:
1114 ASSERT(0);
1115 }
1116
1117 /* For each R, G, B, and A component we're supposed to mask out,
1118 * clear its bits. Then our mask operation later will work
1119 * as expected.
1120 */
1121 if (!(colormask & PIPE_MASK_R)) {
1122 r_mask = 0;
1123 }
1124 if (!(colormask & PIPE_MASK_G)) {
1125 g_mask = 0;
1126 }
1127 if (!(colormask & PIPE_MASK_B)) {
1128 b_mask = 0;
1129 }
1130 if (!(colormask & PIPE_MASK_A)) {
1131 a_mask = 0;
1132 }
1133
1134 /* Get a temporary register to hold the mask that will be applied
1135 * to the fragment
1136 */
1137 int colormask_reg = spe_allocate_available_register(f);
1138
1139 /* The actual mask we're going to use is an OR of the remaining R, G, B,
1140 * and A masks. Load the result value into our temporary register.
1141 */
1142 spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
1143
1144 /* Use the mask register to select between the fragment color
1145 * values and the frame buffer color values. Wherever the
1146 * mask has a 0 bit, the current frame buffer color should override
1147 * the fragment color. Wherever the mask has a 1 bit, the
1148 * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
1149 * instruction will select bits from its first operand rA wherever the
1150 * the mask bits rM are 0, and from its second operand rB wherever the
1151 * mask bits rM are 1. That means that the frame buffer color is the
1152 * first operand, and the fragment color the second.
1153 */
1154 spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
1155
1156 /* Release the temporary register and we're done */
1157 spe_release_register(f, colormask_reg);
1158 }
1159
1160
1161 /**
1162 * This function is annoyingly similar to gen_depth_test(), above, except
1163 * that instead of comparing two varying values (i.e. fragment and buffer),
1164 * we're comparing a varying value with a static value. As such, we have
1165 * access to the Compare Immediate instructions where we don't in
1166 * gen_depth_test(), which is what makes us very different.
1167 *
1168 * There's some added complexity if there's a non-trivial state->mask
1169 * value; then stencil and reference both must be masked
1170 *
1171 * The return value in the stencil_pass_reg is a bitmask of valid
1172 * fragments that also passed the stencil test. The bitmask of valid
1173 * fragments that failed would be found in
1174 * (fragment_mask_reg & ~stencil_pass_reg).
1175 */
1176 static void
1177 gen_stencil_test(struct spe_function *f,
1178 const struct pipe_stencil_state *state,
1179 const unsigned ref_value,
1180 uint stencil_max_value,
1181 int fragment_mask_reg,
1182 int fbS_reg,
1183 int stencil_pass_reg)
1184 {
1185 /* Generate code that puts the set of passing fragments into the
1186 * stencil_pass_reg register, taking into account whether each fragment
1187 * was active to begin with.
1188 */
1189 switch (state->func) {
1190 case PIPE_FUNC_EQUAL:
1191 if (state->valuemask == stencil_max_value) {
1192 /* stencil_pass = fragment_mask & (s == reference) */
1193 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, ref_value);
1194 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1195 }
1196 else {
1197 /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
1198 uint tmp_masked_stencil = spe_allocate_available_register(f);
1199 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
1200 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
1201 state->valuemask & ref_value);
1202 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1203 spe_release_register(f, tmp_masked_stencil);
1204 }
1205 break;
1206
1207 case PIPE_FUNC_NOTEQUAL:
1208 if (state->valuemask == stencil_max_value) {
1209 /* stencil_pass = fragment_mask & ~(s == reference) */
1210 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, ref_value);
1211 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1212 }
1213 else {
1214 /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
1215 int tmp_masked_stencil = spe_allocate_available_register(f);
1216 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
1217 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
1218 state->valuemask & ref_value);
1219 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1220 spe_release_register(f, tmp_masked_stencil);
1221 }
1222 break;
1223
1224 case PIPE_FUNC_LESS:
1225 if (state->valuemask == stencil_max_value) {
1226 /* stencil_pass = fragment_mask & (reference < s) */
1227 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, ref_value);
1228 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1229 }
1230 else {
1231 /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
1232 int tmp_masked_stencil = spe_allocate_available_register(f);
1233 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
1234 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
1235 state->valuemask & ref_value);
1236 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1237 spe_release_register(f, tmp_masked_stencil);
1238 }
1239 break;
1240
1241 case PIPE_FUNC_GREATER:
1242 if (state->valuemask == stencil_max_value) {
1243 /* stencil_pass = fragment_mask & (reference > s) */
1244 /* There's no convenient Compare Less Than Immediate instruction, so
1245 * we'll have to do this one the harder way, by loading a register and
1246 * comparing directly. Compare Logical Greater Than Word (clgt)
1247 * treats its operands as unsigned - no sign extension.
1248 */
1249 int tmp_reg = spe_allocate_available_register(f);
1250 spe_load_uint(f, tmp_reg, ref_value);
1251 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1252 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1253 spe_release_register(f, tmp_reg);
1254 }
1255 else {
1256 /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
1257 int tmp_reg = spe_allocate_available_register(f);
1258 int tmp_masked_stencil = spe_allocate_available_register(f);
1259 spe_load_uint(f, tmp_reg, state->valuemask & ref_value);
1260 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
1261 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1262 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1263 spe_release_register(f, tmp_reg);
1264 spe_release_register(f, tmp_masked_stencil);
1265 }
1266 break;
1267
1268 case PIPE_FUNC_GEQUAL:
1269 if (state->valuemask == stencil_max_value) {
1270 /* stencil_pass = fragment_mask & (reference >= s)
1271 * = fragment_mask & ~(s > reference) */
1272 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg,
1273 ref_value);
1274 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1275 }
1276 else {
1277 /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
1278 int tmp_masked_stencil = spe_allocate_available_register(f);
1279 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
1280 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
1281 state->valuemask & ref_value);
1282 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1283 spe_release_register(f, tmp_masked_stencil);
1284 }
1285 break;
1286
1287 case PIPE_FUNC_LEQUAL:
1288 if (state->valuemask == stencil_max_value) {
1289 /* stencil_pass = fragment_mask & (reference <= s) ]
1290 * = fragment_mask & ~(reference > s) */
1291 /* As above, we have to do this by loading a register */
1292 int tmp_reg = spe_allocate_available_register(f);
1293 spe_load_uint(f, tmp_reg, ref_value);
1294 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1295 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1296 spe_release_register(f, tmp_reg);
1297 }
1298 else {
1299 /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
1300 int tmp_reg = spe_allocate_available_register(f);
1301 int tmp_masked_stencil = spe_allocate_available_register(f);
1302 spe_load_uint(f, tmp_reg, ref_value & state->valuemask);
1303 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
1304 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1305 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1306 spe_release_register(f, tmp_reg);
1307 spe_release_register(f, tmp_masked_stencil);
1308 }
1309 break;
1310
1311 case PIPE_FUNC_NEVER:
1312 /* stencil_pass = fragment_mask & 0 = 0 */
1313 spe_load_uint(f, stencil_pass_reg, 0);
1314 break;
1315
1316 case PIPE_FUNC_ALWAYS:
1317 /* stencil_pass = fragment_mask & 1 = fragment_mask */
1318 spe_move(f, stencil_pass_reg, fragment_mask_reg);
1319 break;
1320 }
1321
1322 /* The fragments that passed the stencil test are now in stencil_pass_reg.
1323 * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
1324 */
1325 }
1326
1327
1328 /**
1329 * This function generates code that calculates a set of new stencil values
1330 * given the earlier values and the operation to apply. It does not
1331 * apply any tests. It is intended to be called up to 3 times
1332 * (for the stencil fail operation, for the stencil pass-z fail operation,
1333 * and for the stencil pass-z pass operation) to collect up to three
1334 * possible sets of values, and for the caller to combine them based
1335 * on the result of the tests.
1336 *
1337 * stencil_max_value should be (2^n - 1) where n is the number of bits
1338 * in the stencil buffer - in other words, it should be usable as a mask.
1339 */
1340 static void
1341 gen_stencil_values(struct spe_function *f,
1342 uint stencil_op,
1343 uint stencil_ref_value,
1344 uint stencil_max_value,
1345 int fbS_reg,
1346 int newS_reg)
1347 {
1348 /* The code below assumes that newS_reg and fbS_reg are not the same
1349 * register; if they can be, the calculations below will have to use
1350 * an additional temporary register. For now, mark the assumption
1351 * with an assertion that will fail if they are the same.
1352 */
1353 ASSERT(fbS_reg != newS_reg);
1354
1355 /* The code also assumes that the stencil_max_value is of the form
1356 * 2^n-1 and can therefore be used as a mask for the valid bits in
1357 * addition to a maximum. Make sure this is the case as well.
1358 * The clever math below exploits the fact that incrementing a
1359 * binary number serves to flip all the bits of a number starting at
1360 * the LSB and continuing to (and including) the first zero bit
1361 * found. That means that a number and its increment will always
1362 * have at least one bit in common (the high order bit, if nothing
1363 * else) *unless* the number is zero, *or* the number is of a form
1364 * consisting of some number of 1s in the low-order bits followed
1365 * by nothing but 0s in the high-order bits. The latter case
1366 * implies it's of the form 2^n-1.
1367 */
1368 ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
1369
1370 switch(stencil_op) {
1371 case PIPE_STENCIL_OP_KEEP:
1372 /* newS = S */
1373 spe_move(f, newS_reg, fbS_reg);
1374 break;
1375
1376 case PIPE_STENCIL_OP_ZERO:
1377 /* newS = 0 */
1378 spe_zero(f, newS_reg);
1379 break;
1380
1381 case PIPE_STENCIL_OP_REPLACE:
1382 /* newS = stencil reference value */
1383 spe_load_uint(f, newS_reg, stencil_ref_value);
1384 break;
1385
1386 case PIPE_STENCIL_OP_INCR: {
1387 /* newS = (s == max ? max : s + 1) */
1388 int equals_reg = spe_allocate_available_register(f);
1389
1390 spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
1391 /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
1392 spe_ai(f, newS_reg, fbS_reg, 1);
1393 /* Select from the current value or the new value based on the equality test */
1394 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1395
1396 spe_release_register(f, equals_reg);
1397 break;
1398 }
1399 case PIPE_STENCIL_OP_DECR: {
1400 /* newS = (s == 0 ? 0 : s - 1) */
1401 int equals_reg = spe_allocate_available_register(f);
1402
1403 spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
1404 /* Add Word Immediate with a (-1) value works */
1405 spe_ai(f, newS_reg, fbS_reg, -1);
1406 /* Select from the current value or the new value based on the equality test */
1407 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1408
1409 spe_release_register(f, equals_reg);
1410 break;
1411 }
1412 case PIPE_STENCIL_OP_INCR_WRAP:
1413 /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
1414 * do a normal add and mask off the correct bits
1415 */
1416 spe_ai(f, newS_reg, fbS_reg, 1);
1417 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1418 break;
1419
1420 case PIPE_STENCIL_OP_DECR_WRAP:
1421 /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
1422 spe_ai(f, newS_reg, fbS_reg, -1);
1423 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1424 break;
1425
1426 case PIPE_STENCIL_OP_INVERT:
1427 /* newS = ~s. We take advantage of the mask/max value to invert only
1428 * the valid bits for the field so we don't have to do an extra "and".
1429 */
1430 spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
1431 break;
1432
1433 default:
1434 ASSERT(0);
1435 }
1436 }
1437
1438
1439 /**
1440 * This function generates code to get all the necessary possible
1441 * stencil values. For each of the output registers (fail_reg,
1442 * zfail_reg, and zpass_reg), it either allocates a new register
1443 * and calculates a new set of values based on the stencil operation,
1444 * or it reuses a register allocation and calculation done for an
1445 * earlier (matching) operation, or it reuses the fbS_reg register
1446 * (if the stencil operation is KEEP, which doesn't change the
1447 * stencil buffer).
1448 *
1449 * Since this function allocates a variable number of registers,
1450 * to avoid incurring complex logic to free them, they should
1451 * be allocated after a spe_allocate_register_set() call
1452 * and released by the corresponding spe_release_register_set() call.
1453 */
1454 static void
1455 gen_get_stencil_values(struct spe_function *f,
1456 const struct pipe_stencil_state *stencil,
1457 const unsigned ref_value,
1458 const uint depth_enabled,
1459 int fbS_reg,
1460 int *fail_reg,
1461 int *zfail_reg,
1462 int *zpass_reg)
1463 {
1464 uint zfail_op;
1465
1466 /* Stenciling had better be enabled here */
1467 ASSERT(stencil->enabled);
1468
1469 /* If the depth test is not enabled, it is treated as though it always
1470 * passes, which means that the zfail_op is not considered - a
1471 * failing stencil test triggers the fail_op, and a passing one
1472 * triggers the zpass_op
1473 *
1474 * As an optimization, override calculation of the zfail_op values
1475 * if they aren't going to be used. By setting the value of
1476 * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
1477 * to match the incoming stencil values, and no calculation will
1478 * be done.
1479 */
1480 if (depth_enabled) {
1481 zfail_op = stencil->zfail_op;
1482 }
1483 else {
1484 zfail_op = PIPE_STENCIL_OP_KEEP;
1485 }
1486
1487 /* One-sided or front-facing stencil */
1488 if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
1489 *fail_reg = fbS_reg;
1490 }
1491 else {
1492 *fail_reg = spe_allocate_available_register(f);
1493 gen_stencil_values(f, stencil->fail_op, ref_value,
1494 0xff, fbS_reg, *fail_reg);
1495 }
1496
1497 /* Check the possibly overridden value, not the structure value */
1498 if (zfail_op == PIPE_STENCIL_OP_KEEP) {
1499 *zfail_reg = fbS_reg;
1500 }
1501 else if (zfail_op == stencil->fail_op) {
1502 *zfail_reg = *fail_reg;
1503 }
1504 else {
1505 *zfail_reg = spe_allocate_available_register(f);
1506 gen_stencil_values(f, stencil->zfail_op, ref_value,
1507 0xff, fbS_reg, *zfail_reg);
1508 }
1509
1510 if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
1511 *zpass_reg = fbS_reg;
1512 }
1513 else if (stencil->zpass_op == stencil->fail_op) {
1514 *zpass_reg = *fail_reg;
1515 }
1516 else if (stencil->zpass_op == zfail_op) {
1517 *zpass_reg = *zfail_reg;
1518 }
1519 else {
1520 *zpass_reg = spe_allocate_available_register(f);
1521 gen_stencil_values(f, stencil->zpass_op, ref_value,
1522 0xff, fbS_reg, *zpass_reg);
1523 }
1524 }
1525
1526 /**
1527 * Note that fbZ_reg may *not* be set on entry, if in fact
1528 * the depth test is not enabled. This function must not use
1529 * the register if depth is not enabled.
1530 */
1531 static boolean
1532 gen_stencil_depth_test(struct spe_function *f,
1533 const struct pipe_depth_stencil_alpha_state *dsa,
1534 const struct pipe_stencil_ref *stencil_ref,
1535 const uint facing,
1536 const int mask_reg, const int fragZ_reg,
1537 const int fbZ_reg, const int fbS_reg)
1538 {
1539 /* True if we've generated code that could require writeback to the
1540 * depth and/or stencil buffers
1541 */
1542 boolean modified_buffers = FALSE;
1543
1544 boolean need_to_calculate_stencil_values;
1545 boolean need_to_writemask_stencil_values;
1546
1547 struct pipe_stencil_state *stencil;
1548
1549 /* Registers. We may or may not actually allocate these, depending
1550 * on whether the state values indicate that we need them.
1551 */
1552 int stencil_pass_reg, stencil_fail_reg;
1553 int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
1554 int stencil_writemask_reg;
1555 int zmask_reg;
1556 int newS_reg;
1557 unsigned ref_value;
1558
1559 /* Stenciling is quite complex: up to six different configurable stencil
1560 * operations/calculations can be required (three each for front-facing
1561 * and back-facing fragments). Many of those operations will likely
1562 * be identical, so there's good reason to try to avoid calculating
1563 * the same values more than once (which unfortunately makes the code less
1564 * straightforward).
1565 *
1566 * To make register management easier, we start a new
1567 * register set; we can release all the registers in the set at
1568 * once, and avoid having to keep track of exactly which registers
1569 * we allocate. We can still allocate and free registers as
1570 * desired (if we know we no longer need a register), but we don't
1571 * have to spend the complexity to track the more difficult variant
1572 * register usage scenarios.
1573 */
1574 spe_comment(f, 0, "Allocating stencil register set");
1575 spe_allocate_register_set(f);
1576
1577 /* The facing we're given is the fragment facing; it doesn't
1578 * exactly match the stencil facing. If stencil is enabled,
1579 * but two-sided stencil is *not* enabled, we use the same
1580 * stencil settings for both front- and back-facing fragments.
1581 * We only use the "back-facing" stencil for backfacing fragments
1582 * if two-sided stenciling is enabled.
1583 */
1584 if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
1585 stencil = &dsa->stencil[1];
1586 ref_value = stencil_ref->ref_value[1];
1587 }
1588 else {
1589 stencil = &dsa->stencil[0];
1590 ref_value = stencil_ref->ref_value[0];
1591 }
1592
1593 /* Calculate the writemask. If the writemask is trivial (either
1594 * all 0s, meaning that we don't need to calculate any stencil values
1595 * because they're not going to change the stencil anyway, or all 1s,
1596 * meaning that we have to calculate the stencil values but do not
1597 * need to mask them), we can avoid generating code. Don't forget
1598 * that we need to consider backfacing stencil, if enabled.
1599 *
1600 * Note that if the backface stencil is *not* enabled, the backface
1601 * stencil will have the same values as the frontface stencil.
1602 */
1603 if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
1604 stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
1605 stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
1606 need_to_calculate_stencil_values = FALSE;
1607 need_to_writemask_stencil_values = FALSE;
1608 }
1609 else if (stencil->writemask == 0x0) {
1610 /* All changes are writemasked out, so no need to calculate
1611 * what those changes might be, and no need to write anything back.
1612 */
1613 need_to_calculate_stencil_values = FALSE;
1614 need_to_writemask_stencil_values = FALSE;
1615 }
1616 else if (stencil->writemask == 0xff) {
1617 /* Still trivial, but a little less so. We need to write the stencil
1618 * values, but we don't need to mask them.
1619 */
1620 need_to_calculate_stencil_values = TRUE;
1621 need_to_writemask_stencil_values = FALSE;
1622 }
1623 else {
1624 /* The general case: calculate, mask, and write */
1625 need_to_calculate_stencil_values = TRUE;
1626 need_to_writemask_stencil_values = TRUE;
1627
1628 /* While we're here, generate code that calculates what the
1629 * writemask should be. If backface stenciling is enabled,
1630 * and the backface writemask is not the same as the frontface
1631 * writemask, we'll have to generate code that merges the
1632 * two masks into a single effective mask based on fragment facing.
1633 */
1634 spe_comment(f, 0, "Computing stencil writemask");
1635 stencil_writemask_reg = spe_allocate_available_register(f);
1636 spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].writemask);
1637 }
1638
1639 /* At least one-sided stenciling must be on. Generate code that
1640 * runs the stencil test on the basic/front-facing stencil, leaving
1641 * the mask of passing stencil bits in stencil_pass_reg. This mask will
1642 * be used both to mask the set of active pixels, and also to
1643 * determine how the stencil buffer changes.
1644 *
1645 * This test will *not* change the value in mask_reg (because we don't
1646 * yet know whether to apply the two-sided stencil or one-sided stencil).
1647 */
1648 spe_comment(f, 0, "Running basic stencil test");
1649 stencil_pass_reg = spe_allocate_available_register(f);
1650 gen_stencil_test(f, stencil, ref_value, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
1651
1652 /* Generate code that, given the mask of valid fragments and the
1653 * mask of valid fragments that passed the stencil test, computes
1654 * the mask of valid fragments that failed the stencil test. We
1655 * have to do this before we run a depth test (because the
1656 * depth test should not be performed on fragments that failed the
1657 * stencil test, and because the depth test will update the
1658 * mask of valid fragments based on the results of the depth test).
1659 */
1660 spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
1661 stencil_fail_reg = spe_allocate_available_register(f);
1662 spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
1663 /* Now remove the stenciled-out pixels from the valid fragment mask,
1664 * so we can later use the valid fragment mask in the depth test.
1665 */
1666 spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
1667
1668 /* We may not need to calculate stencil values, if the writemask is off */
1669 if (need_to_calculate_stencil_values) {
1670 /* Generate code that calculates exactly which stencil values we need,
1671 * without calculating the same value twice (say, if two different
1672 * stencil ops have the same value). This code will work for one-sided
1673 * and two-sided stenciling (so that we take into account that operations
1674 * may match between front and back stencils), and will also take into
1675 * account whether the depth test is enabled (if the depth test is off,
1676 * we don't need any of the zfail results, because the depth test always
1677 * is considered to pass if it is disabled). Any register value that
1678 * does not need to be calculated will come back with the same value
1679 * that's in fbS_reg.
1680 *
1681 * This function will allocate a variant number of registers that
1682 * will be released as part of the register set.
1683 */
1684 spe_comment(f, 0, facing == CELL_FACING_FRONT
1685 ? "Computing front-facing stencil values"
1686 : "Computing back-facing stencil values");
1687 gen_get_stencil_values(f, stencil, ref_value, dsa->depth.enabled, fbS_reg,
1688 &stencil_fail_values, &stencil_pass_depth_fail_values,
1689 &stencil_pass_depth_pass_values);
1690 }
1691
1692 /* We now have all the stencil values we need. We also need
1693 * the results of the depth test to figure out which
1694 * stencil values will become the new stencil values. (Even if
1695 * we aren't actually calculating stencil values, we need to apply
1696 * the depth test if it's enabled.)
1697 *
1698 * The code generated by gen_depth_test() returns the results of the
1699 * test in the given register, but also alters the mask_reg based
1700 * on the results of the test.
1701 */
1702 if (dsa->depth.enabled) {
1703 spe_comment(f, 0, "Running stencil depth test");
1704 zmask_reg = spe_allocate_available_register(f);
1705 modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg,
1706 fbZ_reg, zmask_reg);
1707 }
1708
1709 if (need_to_calculate_stencil_values) {
1710
1711 /* If we need to writemask the stencil values before going into
1712 * the stencil buffer, we'll have to use a new register to
1713 * hold the new values. If not, we can just keep using the
1714 * current register.
1715 */
1716 if (need_to_writemask_stencil_values) {
1717 newS_reg = spe_allocate_available_register(f);
1718 spe_comment(f, 0, "Saving current stencil values for writemasking");
1719 spe_move(f, newS_reg, fbS_reg);
1720 }
1721 else {
1722 newS_reg = fbS_reg;
1723 }
1724
1725 /* Merge in the selected stencil fail values */
1726 if (stencil_fail_values != fbS_reg) {
1727 spe_comment(f, 0, "Loading stencil fail values");
1728 spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
1729 modified_buffers = TRUE;
1730 }
1731
1732 /* Same for the stencil pass/depth fail values. If this calculation
1733 * is not needed (say, if depth test is off), then the
1734 * stencil_pass_depth_fail_values register will be equal to fbS_reg
1735 * and we'll skip the calculation.
1736 */
1737 if (stencil_pass_depth_fail_values != fbS_reg) {
1738 /* We don't actually have a stencil pass/depth fail mask yet.
1739 * Calculate it here from the stencil passing mask and the
1740 * depth passing mask. Note that zmask_reg *must* have been
1741 * set above if we're here.
1742 */
1743 uint stencil_pass_depth_fail_mask =
1744 spe_allocate_available_register(f);
1745
1746 spe_comment(f, 0, "Loading stencil pass/depth fail values");
1747 spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
1748
1749 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values,
1750 stencil_pass_depth_fail_mask);
1751
1752 spe_release_register(f, stencil_pass_depth_fail_mask);
1753 modified_buffers = TRUE;
1754 }
1755
1756 /* Same for the stencil pass/depth pass mask. Note that we
1757 * *can* get here with zmask_reg being unset (if the depth
1758 * test is off but the stencil test is on). In this case,
1759 * we assume the depth test passes, and don't need to mask
1760 * the stencil pass mask with the Z mask.
1761 */
1762 if (stencil_pass_depth_pass_values != fbS_reg) {
1763 if (dsa->depth.enabled) {
1764 uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
1765 /* We'll need a separate register */
1766 spe_comment(f, 0, "Loading stencil pass/depth pass values");
1767 spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
1768 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
1769 spe_release_register(f, stencil_pass_depth_pass_mask);
1770 }
1771 else {
1772 /* We can use the same stencil-pass register */
1773 spe_comment(f, 0, "Loading stencil pass values");
1774 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
1775 }
1776 modified_buffers = TRUE;
1777 }
1778
1779 /* Almost done. If we need to writemask, do it now, leaving the
1780 * results in the fbS_reg register passed in. If we don't need
1781 * to writemask, then the results are *already* in the fbS_reg,
1782 * so there's nothing more to do.
1783 */
1784
1785 if (need_to_writemask_stencil_values && modified_buffers) {
1786 /* The Select Bytes command makes a fine writemask. Where
1787 * the mask is 0, the first (original) values are retained,
1788 * effectively masking out changes. Where the mask is 1, the
1789 * second (new) values are retained, incorporating changes.
1790 */
1791 spe_comment(f, 0, "Writemasking new stencil values");
1792 spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
1793 }
1794
1795 } /* done calculating stencil values */
1796
1797 /* The stencil and/or depth values have been applied, and the
1798 * mask_reg, fbS_reg, and fbZ_reg values have been updated.
1799 * We're all done, except that we've allocated a fair number
1800 * of registers that we didn't bother tracking. Release all
1801 * those registers as part of the register set, and go home.
1802 */
1803 spe_comment(f, 0, "Releasing stencil register set");
1804 spe_release_register_set(f);
1805
1806 /* Return TRUE if we could have modified the stencil and/or
1807 * depth buffers.
1808 */
1809 return modified_buffers;
1810 }
1811
1812
1813 /**
1814 * Generate depth and/or stencil test code.
1815 * \param cell context
1816 * \param dsa depth/stencil/alpha state
1817 * \param f spe function to emit
1818 * \param facing either CELL_FACING_FRONT or CELL_FACING_BACK
1819 * \param mask_reg register containing the pixel alive/dead mask
1820 * \param depth_tile_reg register containing address of z/stencil tile
1821 * \param quad_offset_reg offset to quad from start of tile
 * \param fragZ_reg  register containing fragment Z values
1823 */
1824 static void
1825 gen_depth_stencil(struct cell_context *cell,
1826 const struct pipe_depth_stencil_alpha_state *dsa,
1827 const struct pipe_stencil_ref *stencil_ref,
1828 struct spe_function *f,
1829 uint facing,
1830 int mask_reg,
1831 int depth_tile_reg,
1832 int quad_offset_reg,
1833 int fragZ_reg)
1834
1835 {
1836 const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
1837 boolean write_depth_stencil;
1838
1839 /* framebuffer's combined z/stencil values register */
1840 int fbZS_reg = spe_allocate_available_register(f);
1841
1842 /* Framebufer Z values register */
1843 int fbZ_reg = spe_allocate_available_register(f);
1844
1845 /* Framebuffer stencil values register (may not be used) */
1846 int fbS_reg = spe_allocate_available_register(f);
1847
1848 /* 24-bit mask register (may not be used) */
1849 int zmask_reg = spe_allocate_available_register(f);
1850
1851 /**
1852 * The following code:
1853 * 1. fetch quad of packed Z/S values from the framebuffer tile.
1854 * 2. extract the separate the Z and S values from packed values
1855 * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints
1856 *
1857 * The instructions for doing this are interleaved for better performance.
1858 */
1859 spe_comment(f, 0, "Fetch Z/stencil quad from tile");
1860
1861 switch(zs_format) {
1862 case PIPE_FORMAT_Z24_UNORM_S8_USCALED: /* fall through */
1863 case PIPE_FORMAT_Z24X8_UNORM:
1864 /* prepare mask to extract Z vals from ZS vals */
1865 spe_load_uint(f, zmask_reg, 0x00ffffff);
1866
1867 /* convert fragment Z from [0,1] to 32-bit ints */
1868 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1869
1870 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1871 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1872
1873 /* right shift 32-bit fragment Z to 24 bits */
1874 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1875
1876 /* extract 24-bit Z values from ZS values by masking */
1877 spe_and(f, fbZ_reg, fbZS_reg, zmask_reg);
1878
1879 /* extract 8-bit stencil values by shifting */
1880 spe_rotmi(f, fbS_reg, fbZS_reg, -24);
1881 break;
1882
1883 case PIPE_FORMAT_S8_USCALED_Z24_UNORM: /* fall through */
1884 case PIPE_FORMAT_X8Z24_UNORM:
1885 /* convert fragment Z from [0,1] to 32-bit ints */
1886 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1887
1888 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1889 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1890
1891 /* right shift 32-bit fragment Z to 24 bits */
1892 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1893
1894 /* extract 24-bit Z values from ZS values by shifting */
1895 spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
1896
1897 /* extract 8-bit stencil values by masking */
1898 spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
1899 break;
1900
1901 case PIPE_FORMAT_Z32_UNORM:
1902 /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */
1903 spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg);
1904
1905 /* convert fragment Z from [0,1] to 32-bit ints */
1906 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1907
1908 /* No stencil, so can't do anything there */
1909 break;
1910
1911 case PIPE_FORMAT_Z16_UNORM:
1912 /* XXX This code for 16bpp Z is broken! */
1913
1914 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1915 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1916
1917 /* Copy over 4 32-bit values */
1918 spe_move(f, fbZ_reg, fbZS_reg);
1919
1920 /* convert Z from [0,1] to 16-bit ints */
1921 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1922 spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
1923 /* No stencil */
1924 break;
1925
1926 default:
1927 ASSERT(0); /* invalid format */
1928 }
1929
1930 /* If stencil is enabled, use the stencil-specific code
1931 * generator to generate both the stencil and depth (if needed)
1932 * tests. Otherwise, if only depth is enabled, generate
1933 * a quick depth test. The test generators themselves will
1934 * report back whether the depth/stencil buffer has to be
1935 * written back.
1936 */
1937 if (dsa->stencil[0].enabled) {
1938 /* This will perform the stencil and depth tests, and update
1939 * the mask_reg, fbZ_reg, and fbS_reg as required by the
1940 * tests.
1941 */
1942 ASSERT(fbS_reg >= 0);
1943 spe_comment(f, 0, "Perform stencil test");
1944
1945 /* Note that fbZ_reg may not be set on entry, if stenciling
1946 * is enabled but there's no Z-buffer. The
1947 * gen_stencil_depth_test() function must ignore the
1948 * fbZ_reg register if depth is not enabled.
1949 */
1950 write_depth_stencil = gen_stencil_depth_test(f, dsa, stencil_ref, facing,
1951 mask_reg, fragZ_reg,
1952 fbZ_reg, fbS_reg);
1953 }
1954 else if (dsa->depth.enabled) {
1955 int zmask_reg = spe_allocate_available_register(f);
1956 ASSERT(fbZ_reg >= 0);
1957 spe_comment(f, 0, "Perform depth test");
1958 write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg,
1959 fbZ_reg, zmask_reg);
1960 spe_release_register(f, zmask_reg);
1961 }
1962 else {
1963 write_depth_stencil = FALSE;
1964 }
1965
1966 if (write_depth_stencil) {
1967 /* Merge latest Z and Stencil values into fbZS_reg.
1968 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
1969 * fbS_reg has four 8-bit Z values in bits [7..0].
1970 */
1971 spe_comment(f, 0, "Store quad's depth/stencil values in tile");
1972 if (zs_format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
1973 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
1974 spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
1975 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
1976 }
1977 else if (zs_format == PIPE_FORMAT_S8_USCALED_Z24_UNORM ||
1978 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
1979 spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
1980 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
1981 }
1982 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
1983 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
1984 }
1985 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
1986 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
1987 }
1988 else if (zs_format == PIPE_FORMAT_S8_USCALED) {
1989 ASSERT(0); /* XXX to do */
1990 }
1991 else {
1992 ASSERT(0); /* bad zs_format */
1993 }
1994
1995 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
1996 spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1997 }
1998
1999 /* Don't need these any more */
2000 spe_release_register(f, fbZS_reg);
2001 spe_release_register(f, fbZ_reg);
2002 spe_release_register(f, fbS_reg);
2003 spe_release_register(f, zmask_reg);
2004 }
2005
2006
2007
2008 /**
2009 * Generate SPE code to implement the fragment operations (alpha test,
2010 * depth test, stencil test, blending, colormask, and final
2011 * framebuffer write) as specified by the current context state.
2012 *
2013 * Logically, this code will be called after running the fragment
2014 * shader. But under some circumstances we could run some of this
2015 * code before the fragment shader to cull fragments/quads that are
2016 * totally occluded/discarded.
2017 *
2018 * XXX we only support PIPE_FORMAT_S8_USCALED_Z24_UNORM z/stencil buffer right now.
2019 *
2020 * See the spu_default_fragment_ops() function to see how the per-fragment
2021 * operations would be done with ordinary C code.
2022 * The code we generate here though has no branches, is SIMD, etc and
2023 * should be much faster.
2024 *
2025 * \param cell the rendering context (in)
2026 * \param facing whether the generated code is for front-facing or
2027 * back-facing fragments
2028 * \param f the generated function (in/out); on input, the function
2029 * must already have been initialized. On exit, whatever
2030 * instructions within the generated function have had
2031 * the fragment ops appended.
2032 */
2033 void
2034 cell_gen_fragment_function(struct cell_context *cell,
2035 const uint facing,
2036 struct spe_function *f)
2037 {
2038 const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
2039 const struct pipe_stencil_ref *stencil_ref = &cell->stencil_ref;
2040 const struct pipe_blend_state *blend = cell->blend;
2041 const struct pipe_blend_color *blend_color = &cell->blend_color;
2042 const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
2043
2044 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
2045 const int x_reg = 3; /* uint */
2046 const int y_reg = 4; /* uint */
2047 const int color_tile_reg = 5; /* tile_t * */
2048 const int depth_tile_reg = 6; /* tile_t * */
2049 const int fragZ_reg = 7; /* vector float */
2050 const int fragR_reg = 8; /* vector float */
2051 const int fragG_reg = 9; /* vector float */
2052 const int fragB_reg = 10; /* vector float */
2053 const int fragA_reg = 11; /* vector float */
2054 const int mask_reg = 12; /* vector uint */
2055
2056 ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
2057
2058 /* offset of quad from start of tile
2059 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
2060 */
2061 int quad_offset_reg;
2062
2063 int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
2064
2065 if (cell->debug_flags & CELL_DEBUG_ASM) {
2066 spe_print_code(f, TRUE);
2067 spe_indent(f, 8);
2068 spe_comment(f, -4, facing == CELL_FACING_FRONT
2069 ? "Begin front-facing per-fragment ops"
2070 : "Begin back-facing per-fragment ops");
2071 }
2072
2073 spe_allocate_register(f, x_reg);
2074 spe_allocate_register(f, y_reg);
2075 spe_allocate_register(f, color_tile_reg);
2076 spe_allocate_register(f, depth_tile_reg);
2077 spe_allocate_register(f, fragZ_reg);
2078 spe_allocate_register(f, fragR_reg);
2079 spe_allocate_register(f, fragG_reg);
2080 spe_allocate_register(f, fragB_reg);
2081 spe_allocate_register(f, fragA_reg);
2082 spe_allocate_register(f, mask_reg);
2083
2084 quad_offset_reg = spe_allocate_available_register(f);
2085 fbRGBA_reg = spe_allocate_available_register(f);
2086
2087 /* compute offset of quad from start of tile, in bytes */
2088 {
2089 int x2_reg = spe_allocate_available_register(f);
2090 int y2_reg = spe_allocate_available_register(f);
2091
2092 ASSERT(TILE_SIZE == 32);
2093
2094 spe_comment(f, 0, "Compute quad offset within tile");
2095 spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
2096 spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
2097 spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
2098 spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
2099 spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
2100
2101 spe_release_register(f, x2_reg);
2102 spe_release_register(f, y2_reg);
2103 }
2104
2105 /* Generate the alpha test, if needed. */
2106 if (dsa->alpha.enabled) {
2107 gen_alpha_test(dsa, f, mask_reg, fragA_reg);
2108 }
2109
2110 /* generate depth and/or stencil test code */
2111 if (dsa->depth.enabled || dsa->stencil[0].enabled) {
2112 gen_depth_stencil(cell, dsa, stencil_ref, f,
2113 facing,
2114 mask_reg,
2115 depth_tile_reg,
2116 quad_offset_reg,
2117 fragZ_reg);
2118 }
2119
2120 /* Get framebuffer quad/colors. We'll need these for blending,
2121 * color masking, and to obey the quad/pixel mask.
2122 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
2123 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
2124 * we could skip this load.
2125 */
2126 spe_comment(f, 0, "Fetch quad colors from tile");
2127 spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
2128
2129 if (blend->rt[0].blend_enable) {
2130 spe_comment(f, 0, "Perform blending");
2131 gen_blend(blend, blend_color, f, color_format,
2132 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
2133 }
2134
2135 /*
2136 * Write fragment colors to framebuffer/tile.
2137 * This involves converting the fragment colors from float[4] to the
2138 * tile's specific format and obeying the quad/pixel mask.
2139 */
2140 {
2141 int rgba_reg = spe_allocate_available_register(f);
2142
2143 /* Pack four float colors as four 32-bit int colors */
2144 spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
2145 gen_pack_colors(f, color_format,
2146 fragR_reg, fragG_reg, fragB_reg, fragA_reg,
2147 rgba_reg);
2148
2149 if (blend->logicop_enable) {
2150 spe_comment(f, 0, "Compute logic op");
2151 gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
2152 }
2153
2154 if (blend->rt[0].colormask != PIPE_MASK_RGBA) {
2155 spe_comment(f, 0, "Compute color mask");
2156 gen_colormask(f, blend->rt[0].colormask, color_format, rgba_reg, fbRGBA_reg);
2157 }
2158
2159 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
2160 * if (mask[i])
2161 * rgba[i] = rgba[i];
2162 * else
2163 * rgba[i] = framebuffer[i];
2164 */
2165 spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
2166
2167 /* Store updated quad in tile:
2168 * memory[color_tile + quad_offset] = rgba_reg;
2169 */
2170 spe_comment(f, 0, "Store quad colors into color tile");
2171 spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
2172
2173 spe_release_register(f, rgba_reg);
2174 }
2175
2176 //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
2177
2178 spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
2179
2180 spe_release_register(f, fbRGBA_reg);
2181 spe_release_register(f, quad_offset_reg);
2182
2183 if (cell->debug_flags & CELL_DEBUG_ASM) {
2184 char buffer[1024];
2185 sprintf(buffer, "End %s-facing per-fragment ops: %d instructions",
2186 facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst);
2187 spe_comment(f, -4, buffer);
2188 }
2189 }