cell: clean-up, re-indent, comments
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fragment.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009 VMware, Inc. All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * Generate SPU per-fragment code (actually per-quad code).
31 * \author Brian Paul
32 * \author Bob Ellison
33 */
34
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
41
42
43
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
46
47
48 /**
49 * Generate SPE code to perform Z/depth testing.
50 *
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
57 *
58 * Returns true if the Z-buffer needs to be updated.
59 */
60 static boolean
61 gen_depth_test(struct spe_function *f,
62 const struct pipe_depth_stencil_alpha_state *dsa,
63 int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
64 {
65 /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
66 * quantities. This only makes a difference for 32-bit Z values though.
67 */
68 ASSERT(dsa->depth.enabled);
69
70 switch (dsa->depth.func) {
71 case PIPE_FUNC_EQUAL:
72 /* zmask = (ifragZ == ref) */
73 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
74 /* mask = (mask & zmask) */
75 spe_and(f, mask_reg, mask_reg, zmask_reg);
76 break;
77
78 case PIPE_FUNC_NOTEQUAL:
79 /* zmask = (ifragZ == ref) */
80 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
81 /* mask = (mask & ~zmask) */
82 spe_andc(f, mask_reg, mask_reg, zmask_reg);
83 break;
84
85 case PIPE_FUNC_GREATER:
86 /* zmask = (ifragZ > ref) */
87 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
88 /* mask = (mask & zmask) */
89 spe_and(f, mask_reg, mask_reg, zmask_reg);
90 break;
91
92 case PIPE_FUNC_LESS:
93 /* zmask = (ref > ifragZ) */
94 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
95 /* mask = (mask & zmask) */
96 spe_and(f, mask_reg, mask_reg, zmask_reg);
97 break;
98
99 case PIPE_FUNC_LEQUAL:
100 /* zmask = (ifragZ > ref) */
101 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
102 /* mask = (mask & ~zmask) */
103 spe_andc(f, mask_reg, mask_reg, zmask_reg);
104 break;
105
106 case PIPE_FUNC_GEQUAL:
107 /* zmask = (ref > ifragZ) */
108 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
109 /* mask = (mask & ~zmask) */
110 spe_andc(f, mask_reg, mask_reg, zmask_reg);
111 break;
112
113 case PIPE_FUNC_NEVER:
114 spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */
115 spe_move(f, zmask_reg, mask_reg); /* zmask = mask */
116 break;
117
118 case PIPE_FUNC_ALWAYS:
119 /* mask unchanged */
120 spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */
121 break;
122
123 default:
124 ASSERT(0);
125 break;
126 }
127
128 if (dsa->depth.writemask) {
129 /*
130 * If (ztest passed) {
131 * framebufferZ = fragmentZ;
132 * }
133 * OR,
134 * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
135 */
136 spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
137 return true;
138 }
139
140 return false;
141 }
142
143
144 /**
145 * Generate SPE code to perform alpha testing.
146 *
147 * \param dsa Gallium depth/stencil/alpha state to gen code for
148 * \param f SPE function to append instruction onto.
149 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
150 * \param fragA_reg register containing four fragment alpha values (in)
151 */
152 static void
153 gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
154 struct spe_function *f, int mask_reg, int fragA_reg)
155 {
156 int ref_reg = spe_allocate_available_register(f);
157 int amask_reg = spe_allocate_available_register(f);
158
159 ASSERT(dsa->alpha.enabled);
160
161 if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
162 (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
163 /* load/splat the alpha reference float value */
164 spe_load_float(f, ref_reg, dsa->alpha.ref);
165 }
166
167 /* emit code to do the alpha comparison, updating 'mask' */
168 switch (dsa->alpha.func) {
169 case PIPE_FUNC_EQUAL:
170 /* amask = (fragA == ref) */
171 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
172 /* mask = (mask & amask) */
173 spe_and(f, mask_reg, mask_reg, amask_reg);
174 break;
175
176 case PIPE_FUNC_NOTEQUAL:
177 /* amask = (fragA == ref) */
178 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
179 /* mask = (mask & ~amask) */
180 spe_andc(f, mask_reg, mask_reg, amask_reg);
181 break;
182
183 case PIPE_FUNC_GREATER:
184 /* amask = (fragA > ref) */
185 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
186 /* mask = (mask & amask) */
187 spe_and(f, mask_reg, mask_reg, amask_reg);
188 break;
189
190 case PIPE_FUNC_LESS:
191 /* amask = (ref > fragA) */
192 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
193 /* mask = (mask & amask) */
194 spe_and(f, mask_reg, mask_reg, amask_reg);
195 break;
196
197 case PIPE_FUNC_LEQUAL:
198 /* amask = (fragA > ref) */
199 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
200 /* mask = (mask & ~amask) */
201 spe_andc(f, mask_reg, mask_reg, amask_reg);
202 break;
203
204 case PIPE_FUNC_GEQUAL:
205 /* amask = (ref > fragA) */
206 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
207 /* mask = (mask & ~amask) */
208 spe_andc(f, mask_reg, mask_reg, amask_reg);
209 break;
210
211 case PIPE_FUNC_NEVER:
212 spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */
213 break;
214
215 case PIPE_FUNC_ALWAYS:
216 /* no-op, mask unchanged */
217 break;
218
219 default:
220 ASSERT(0);
221 break;
222 }
223
224 #if OPTIMIZATIONS
225 /* if mask == {0,0,0,0} we're all done, return */
226 {
227 /* re-use amask reg here */
228 int tmp_reg = amask_reg;
229 /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
230 spe_orx(f, tmp_reg, mask_reg);
231 /* if tmp[0] == 0 then return from function call */
232 spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
233 }
234 #endif
235
236 spe_release_register(f, ref_reg);
237 spe_release_register(f, amask_reg);
238 }
239
240
241 /**
242 * This pair of functions is used inline to allocate and deallocate
243 * optional constant registers. Once a constant is discovered to be
244 * needed, we will likely need it again, so we don't want to deallocate
245 * it and have to allocate and load it again unnecessarily.
246 */
247 static INLINE void
248 setup_optional_register(struct spe_function *f,
249 boolean *is_already_set,
250 uint *r)
251 {
252 if (*is_already_set)
253 return;
254 *r = spe_allocate_available_register(f);
255 *is_already_set = true;
256 }
257
258 static INLINE void
259 release_optional_register(struct spe_function *f,
260 boolean *is_already_set,
261 uint r)
262 {
263 if (!*is_already_set)
264 return;
265 spe_release_register(f, r);
266 *is_already_set = false;
267 }
268
269 static INLINE void
270 setup_const_register(struct spe_function *f,
271 boolean *is_already_set,
272 uint *r,
273 float value)
274 {
275 if (*is_already_set)
276 return;
277 setup_optional_register(f, is_already_set, r);
278 spe_load_float(f, *r, value);
279 }
280
281 static INLINE void
282 release_const_register(struct spe_function *f,
283 boolean *is_already_set,
284 uint r)
285 {
286 release_optional_register(f, is_already_set, r);
287 }
288
289 /**
290 * Generate SPE code to implement the given blend mode for a quad of pixels.
291 * \param f SPE function to append instruction onto.
292 * \param fragR_reg register with fragment red values (float) (in/out)
293 * \param fragG_reg register with fragment green values (float) (in/out)
294 * \param fragB_reg register with fragment blue values (float) (in/out)
295 * \param fragA_reg register with fragment alpha values (float) (in/out)
296 * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
297 */
298 static void
299 gen_blend(const struct pipe_blend_state *blend,
300 const struct pipe_blend_color *blend_color,
301 struct spe_function *f,
302 enum pipe_format color_format,
303 int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
304 int fbRGBA_reg)
305 {
306 int term1R_reg = spe_allocate_available_register(f);
307 int term1G_reg = spe_allocate_available_register(f);
308 int term1B_reg = spe_allocate_available_register(f);
309 int term1A_reg = spe_allocate_available_register(f);
310
311 int term2R_reg = spe_allocate_available_register(f);
312 int term2G_reg = spe_allocate_available_register(f);
313 int term2B_reg = spe_allocate_available_register(f);
314 int term2A_reg = spe_allocate_available_register(f);
315
316 int fbR_reg = spe_allocate_available_register(f);
317 int fbG_reg = spe_allocate_available_register(f);
318 int fbB_reg = spe_allocate_available_register(f);
319 int fbA_reg = spe_allocate_available_register(f);
320
321 int tmp_reg = spe_allocate_available_register(f);
322
323 /* Optional constant registers we might or might not end up using;
324 * if we do use them, make sure we only allocate them once by
325 * keeping a flag on each one.
326 */
327 boolean one_reg_set = false;
328 unsigned int one_reg;
329 boolean constR_reg_set = false, constG_reg_set = false,
330 constB_reg_set = false, constA_reg_set = false;
331 unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
332
333 ASSERT(blend->blend_enable);
334
335 /* Unpack/convert framebuffer colors from four 32-bit packed colors
336 * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
337 * Each 8-bit color component is expanded into a float in [0.0, 1.0].
338 */
339 {
340 int mask_reg = spe_allocate_available_register(f);
341
342 /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
343 spe_load_int(f, mask_reg, 0xff);
344
345 /* XXX there may be more clever ways to implement the following code */
346 switch (color_format) {
347 case PIPE_FORMAT_A8R8G8B8_UNORM:
348 /* fbB = fbB & mask */
349 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
350 /* mask = mask << 8 */
351 spe_roti(f, mask_reg, mask_reg, 8);
352
353 /* fbG = fbRGBA & mask */
354 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
355 /* fbG = fbG >> 8 */
356 spe_roti(f, fbG_reg, fbG_reg, -8);
357 /* mask = mask << 8 */
358 spe_roti(f, mask_reg, mask_reg, 8);
359
360 /* fbR = fbRGBA & mask */
361 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
362 /* fbR = fbR >> 16 */
363 spe_roti(f, fbR_reg, fbR_reg, -16);
364 /* mask = mask << 8 */
365 spe_roti(f, mask_reg, mask_reg, 8);
366
367 /* fbA = fbRGBA & mask */
368 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
369 /* fbA = fbA >> 24 */
370 spe_roti(f, fbA_reg, fbA_reg, -24);
371 break;
372
373 case PIPE_FORMAT_B8G8R8A8_UNORM:
374 /* fbA = fbA & mask */
375 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
376 /* mask = mask << 8 */
377 spe_roti(f, mask_reg, mask_reg, 8);
378
379 /* fbR = fbRGBA & mask */
380 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
381 /* fbR = fbR >> 8 */
382 spe_roti(f, fbR_reg, fbR_reg, -8);
383 /* mask = mask << 8 */
384 spe_roti(f, mask_reg, mask_reg, 8);
385
386 /* fbG = fbRGBA & mask */
387 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
388 /* fbG = fbG >> 16 */
389 spe_roti(f, fbG_reg, fbG_reg, -16);
390 /* mask = mask << 8 */
391 spe_roti(f, mask_reg, mask_reg, 8);
392
393 /* fbB = fbRGBA & mask */
394 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
395 /* fbB = fbB >> 24 */
396 spe_roti(f, fbB_reg, fbB_reg, -24);
397 break;
398
399 default:
400 ASSERT(0);
401 }
402
403 /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
404 spe_cuflt(f, fbR_reg, fbR_reg, 8);
405 spe_cuflt(f, fbG_reg, fbG_reg, 8);
406 spe_cuflt(f, fbB_reg, fbB_reg, 8);
407 spe_cuflt(f, fbA_reg, fbA_reg, 8);
408
409 spe_release_register(f, mask_reg);
410 }
411
412 /*
413 * Compute Src RGB terms. We're actually looking for the value
414 * of (the appropriate RGB factors) * (the incoming source RGB color),
415 * because in some cases (like PIPE_BLENDFACTOR_ONE and
416 * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
417 */
418 switch (blend->rgb_src_factor) {
419 case PIPE_BLENDFACTOR_ONE:
420 /* factors = (1,1,1), so term = (R,G,B) */
421 spe_move(f, term1R_reg, fragR_reg);
422 spe_move(f, term1G_reg, fragG_reg);
423 spe_move(f, term1B_reg, fragB_reg);
424 break;
425 case PIPE_BLENDFACTOR_ZERO:
426 /* factors = (0,0,0), so term = (0,0,0) */
427 spe_load_float(f, term1R_reg, 0.0f);
428 spe_load_float(f, term1G_reg, 0.0f);
429 spe_load_float(f, term1B_reg, 0.0f);
430 break;
431 case PIPE_BLENDFACTOR_SRC_COLOR:
432 /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
433 spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
434 spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
435 spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
436 break;
437 case PIPE_BLENDFACTOR_SRC_ALPHA:
438 /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
439 spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
440 spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
441 spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
442 break;
443 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
444 /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
445 * or in other words term = (R-R*R, G-G*G, B-B*B)
446 * fnms(a,b,c,d) computes a = d - b*c
447 */
448 spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
449 spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
450 spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
451 break;
452 case PIPE_BLENDFACTOR_DST_COLOR:
453 /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
454 spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
455 spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
456 spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
457 break;
458 case PIPE_BLENDFACTOR_INV_DST_COLOR:
459 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
460 * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
461 * fnms(a,b,c,d) computes a = d - b*c
462 */
463 spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
464 spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
465 spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
466 break;
467 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
468 /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
469 * or term = (R-R*A,G-G*A,B-B*A)
470 * fnms(a,b,c,d) computes a = d - b*c
471 */
472 spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
473 spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
474 spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
475 break;
476 case PIPE_BLENDFACTOR_DST_ALPHA:
477 /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
478 spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
479 spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
480 spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
481 break;
482 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
483 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
484 * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
485 * fnms(a,b,c,d) computes a = d - b*c
486 */
487 spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
488 spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
489 spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
490 break;
491 case PIPE_BLENDFACTOR_CONST_COLOR:
492 /* We need the optional constant color registers */
493 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
494 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
495 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
496 /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
497 spe_fm(f, term1R_reg, fragR_reg, constR_reg);
498 spe_fm(f, term1G_reg, fragG_reg, constG_reg);
499 spe_fm(f, term1B_reg, fragB_reg, constB_reg);
500 break;
501 case PIPE_BLENDFACTOR_CONST_ALPHA:
502 /* we'll need the optional constant alpha register */
503 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
504 /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
505 spe_fm(f, term1R_reg, fragR_reg, constA_reg);
506 spe_fm(f, term1G_reg, fragG_reg, constA_reg);
507 spe_fm(f, term1B_reg, fragB_reg, constA_reg);
508 break;
509 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
510 /* We need the optional constant color registers */
511 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
512 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
513 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
514 /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
515 * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
516 * fnms(a,b,c,d) computes a = d - b*c
517 */
518 spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
519 spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
520 spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
521 break;
522 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
523 /* We need the optional constant color registers */
524 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
525 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
526 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
527 /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
528 * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
529 * fnms(a,b,c,d) computes a = d - b*c
530 */
531 spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
532 spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
533 spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
534 break;
535 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
536 /* We'll need the optional {1,1,1,1} register */
537 setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
538 /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
539 * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
540 * We could expand the term (as a*min(b,c) == min(a*b,a*c)
541 * as long as a is positive), but then we'd have to do three
542 * spe_float_min() functions instead of one, so this is simpler.
543 */
544 /* tmp = 1 - Afb */
545 spe_fs(f, tmp_reg, one_reg, fbA_reg);
546 /* tmp = min(A,tmp) */
547 spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
548 /* term = R*tmp */
549 spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
550 spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
551 spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
552 break;
553
554 /* These are special D3D cases involving a second color output
555 * from the fragment shader. I'm not sure we can support them
556 * yet... XXX
557 */
558 case PIPE_BLENDFACTOR_SRC1_COLOR:
559 case PIPE_BLENDFACTOR_SRC1_ALPHA:
560 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
561 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
562
563 default:
564 ASSERT(0);
565 }
566
567 /*
568 * Compute Src Alpha term. Like the above, we're looking for
569 * the full term A*factor, not just the factor itself, because
570 * in many cases we can avoid doing unnecessary multiplies.
571 */
572 switch (blend->alpha_src_factor) {
573 case PIPE_BLENDFACTOR_ZERO:
574 /* factor = 0, so term = 0 */
575 spe_load_float(f, term1A_reg, 0.0f);
576 break;
577
578 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
579 case PIPE_BLENDFACTOR_ONE:
580 /* factor = 1, so term = A */
581 spe_move(f, term1A_reg, fragA_reg);
582 break;
583
584 case PIPE_BLENDFACTOR_SRC_COLOR:
585 /* factor = A, so term = A*A */
586 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
587 break;
588 case PIPE_BLENDFACTOR_SRC_ALPHA:
589 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
590 break;
591
592 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
593 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
594 /* factor = 1-A, so term = A*(1-A) = A-A*A */
595 /* fnms(a,b,c,d) computes a = d - b*c */
596 spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
597 break;
598
599 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
600 case PIPE_BLENDFACTOR_DST_COLOR:
601 /* factor = Afb, so term = A*Afb */
602 spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
603 break;
604
605 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
606 case PIPE_BLENDFACTOR_INV_DST_COLOR:
607 /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
608 /* fnms(a,b,c,d) computes a = d - b*c */
609 spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
610 break;
611
612 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
613 case PIPE_BLENDFACTOR_CONST_COLOR:
614 /* We need the optional constA_reg register */
615 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
616 /* factor = Ac, so term = A*Ac */
617 spe_fm(f, term1A_reg, fragA_reg, constA_reg);
618 break;
619
620 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
621 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
622 /* We need the optional constA_reg register */
623 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
624 /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
625 /* fnms(a,b,c,d) computes a = d - b*c */
626 spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
627 break;
628
629 /* These are special D3D cases involving a second color output
630 * from the fragment shader. I'm not sure we can support them
631 * yet... XXX
632 */
633 case PIPE_BLENDFACTOR_SRC1_COLOR:
634 case PIPE_BLENDFACTOR_SRC1_ALPHA:
635 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
636 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
637 default:
638 ASSERT(0);
639 }
640
641 /*
642 * Compute Dest RGB term. Like the above, we're looking for
643 * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
644 * in many cases we can avoid doing unnecessary multiplies.
645 */
646 switch (blend->rgb_dst_factor) {
647 case PIPE_BLENDFACTOR_ONE:
648 /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
649 spe_move(f, term2R_reg, fbR_reg);
650 spe_move(f, term2G_reg, fbG_reg);
651 spe_move(f, term2B_reg, fbB_reg);
652 break;
653 case PIPE_BLENDFACTOR_ZERO:
654 /* factor s= (0,0,0), so term = (0,0,0) */
655 spe_load_float(f, term2R_reg, 0.0f);
656 spe_load_float(f, term2G_reg, 0.0f);
657 spe_load_float(f, term2B_reg, 0.0f);
658 break;
659 case PIPE_BLENDFACTOR_SRC_COLOR:
660 /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
661 spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
662 spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
663 spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
664 break;
665 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
666 /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
667 * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
668 * fnms(a,b,c,d) computes a = d - b*c
669 */
670 spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
671 spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
672 spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
673 break;
674 case PIPE_BLENDFACTOR_SRC_ALPHA:
675 /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
676 spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
677 spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
678 spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
679 break;
680 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
681 /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
682 /* fnms(a,b,c,d) computes a = d - b*c */
683 spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
684 spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
685 spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
686 break;
687 case PIPE_BLENDFACTOR_DST_COLOR:
688 /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
689 spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
690 spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
691 spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
692 break;
693 case PIPE_BLENDFACTOR_INV_DST_COLOR:
694 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
695 * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
696 * fnms(a,b,c,d) computes a = d - b*c
697 */
698 spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
699 spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
700 spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
701 break;
702
703 case PIPE_BLENDFACTOR_DST_ALPHA:
704 /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
705 spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
706 spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
707 spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
708 break;
709 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
710 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
711 * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
712 * fnms(a,b,c,d) computes a = d - b*c
713 */
714 spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
715 spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
716 spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
717 break;
718 case PIPE_BLENDFACTOR_CONST_COLOR:
719 /* We need the optional constant color registers */
720 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
721 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
722 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
723 /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
724 spe_fm(f, term2R_reg, fbR_reg, constR_reg);
725 spe_fm(f, term2G_reg, fbG_reg, constG_reg);
726 spe_fm(f, term2B_reg, fbB_reg, constB_reg);
727 break;
728 case PIPE_BLENDFACTOR_CONST_ALPHA:
729 /* we'll need the optional constant alpha register */
730 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
731 /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
732 spe_fm(f, term2R_reg, fbR_reg, constA_reg);
733 spe_fm(f, term2G_reg, fbG_reg, constA_reg);
734 spe_fm(f, term2B_reg, fbB_reg, constA_reg);
735 break;
736 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
737 /* We need the optional constant color registers */
738 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
739 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
740 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
741 /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
742 * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
743 * fnms(a,b,c,d) computes a = d - b*c
744 */
745 spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
746 spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
747 spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
748 break;
749 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
750 /* We need the optional constant color registers */
751 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
752 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
753 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
754 /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
755 * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
756 * fnms(a,b,c,d) computes a = d - b*c
757 */
758 spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
759 spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
760 spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
761 break;
762 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
763 ASSERT(0);
764 break;
765
766 /* These are special D3D cases involving a second color output
767 * from the fragment shader. I'm not sure we can support them
768 * yet... XXX
769 */
770 case PIPE_BLENDFACTOR_SRC1_COLOR:
771 case PIPE_BLENDFACTOR_SRC1_ALPHA:
772 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
773 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
774
775 default:
776 ASSERT(0);
777 }
778
779 /*
780 * Compute Dest Alpha term. Like the above, we're looking for
781 * the full term Afb*factor, not just the factor itself, because
782 * in many cases we can avoid doing unnecessary multiplies.
783 */
784 switch (blend->alpha_dst_factor) {
785 case PIPE_BLENDFACTOR_ONE:
786 /* factor = 1, so term = Afb */
787 spe_move(f, term2A_reg, fbA_reg);
788 break;
789 case PIPE_BLENDFACTOR_ZERO:
790 /* factor = 0, so term = 0 */
791 spe_load_float(f, term2A_reg, 0.0f);
792 break;
793
794 case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
795 case PIPE_BLENDFACTOR_SRC_COLOR:
796 /* factor = A, so term = Afb*A */
797 spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
798 break;
799
800 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
801 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
802 /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
803 /* fnms(a,b,c,d) computes a = d - b*c */
804 spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
805 break;
806
807 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
808 case PIPE_BLENDFACTOR_DST_COLOR:
809 /* factor = Afb, so term = Afb*Afb */
810 spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
811 break;
812
813 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
814 case PIPE_BLENDFACTOR_INV_DST_COLOR:
815 /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
816 /* fnms(a,b,c,d) computes a = d - b*c */
817 spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
818 break;
819
820 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
821 case PIPE_BLENDFACTOR_CONST_COLOR:
822 /* We need the optional constA_reg register */
823 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
824 /* factor = Ac, so term = Afb*Ac */
825 spe_fm(f, term2A_reg, fbA_reg, constA_reg);
826 break;
827
828 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
829 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
830 /* We need the optional constA_reg register */
831 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
832 /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
833 /* fnms(a,b,c,d) computes a = d - b*c */
834 spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
835 break;
836
837 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
838 ASSERT(0);
839 break;
840
841 /* These are special D3D cases involving a second color output
842 * from the fragment shader. I'm not sure we can support them
843 * yet... XXX
844 */
845 case PIPE_BLENDFACTOR_SRC1_COLOR:
846 case PIPE_BLENDFACTOR_SRC1_ALPHA:
847 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
848 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
849 default:
850 ASSERT(0);
851 }
852
853 /*
854 * Combine Src/Dest RGB terms as per the blend equation.
855 */
856 switch (blend->rgb_func) {
857 case PIPE_BLEND_ADD:
858 spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
859 spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
860 spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
861 break;
862 case PIPE_BLEND_SUBTRACT:
863 spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
864 spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
865 spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
866 break;
867 case PIPE_BLEND_REVERSE_SUBTRACT:
868 spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
869 spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
870 spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
871 break;
872 case PIPE_BLEND_MIN:
873 spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
874 spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
875 spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
876 break;
877 case PIPE_BLEND_MAX:
878 spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
879 spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
880 spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
881 break;
882 default:
883 ASSERT(0);
884 }
885
886 /*
887 * Combine Src/Dest A term
888 */
889 switch (blend->alpha_func) {
890 case PIPE_BLEND_ADD:
891 spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
892 break;
893 case PIPE_BLEND_SUBTRACT:
894 spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
895 break;
896 case PIPE_BLEND_REVERSE_SUBTRACT:
897 spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
898 break;
899 case PIPE_BLEND_MIN:
900 spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
901 break;
902 case PIPE_BLEND_MAX:
903 spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
904 break;
905 default:
906 ASSERT(0);
907 }
908
909 spe_release_register(f, term1R_reg);
910 spe_release_register(f, term1G_reg);
911 spe_release_register(f, term1B_reg);
912 spe_release_register(f, term1A_reg);
913
914 spe_release_register(f, term2R_reg);
915 spe_release_register(f, term2G_reg);
916 spe_release_register(f, term2B_reg);
917 spe_release_register(f, term2A_reg);
918
919 spe_release_register(f, fbR_reg);
920 spe_release_register(f, fbG_reg);
921 spe_release_register(f, fbB_reg);
922 spe_release_register(f, fbA_reg);
923
924 spe_release_register(f, tmp_reg);
925
926 /* Free any optional registers that actually got used */
927 release_const_register(f, &one_reg_set, one_reg);
928 release_const_register(f, &constR_reg_set, constR_reg);
929 release_const_register(f, &constG_reg_set, constG_reg);
930 release_const_register(f, &constB_reg_set, constB_reg);
931 release_const_register(f, &constA_reg_set, constA_reg);
932 }
933
934
935 static void
936 gen_logicop(const struct pipe_blend_state *blend,
937 struct spe_function *f,
938 int fragRGBA_reg, int fbRGBA_reg)
939 {
940 /* We've got four 32-bit RGBA packed pixels in each of
941 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
942 * reds, greens, blues, and alphas.
943 * */
944 ASSERT(blend->logicop_enable);
945
946 switch(blend->logicop_func) {
947 case PIPE_LOGICOP_CLEAR: /* 0 */
948 spe_zero(f, fragRGBA_reg);
949 break;
950 case PIPE_LOGICOP_NOR: /* ~(s | d) */
951 spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
952 break;
953 case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
954 /* andc R, A, B computes R = A & ~B */
955 spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
956 break;
957 case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
958 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
959 break;
960 case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
961 /* andc R, A, B computes R = A & ~B */
962 spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
963 break;
964 case PIPE_LOGICOP_INVERT: /* ~d */
965 /* Note that (A nor A) == ~(A|A) == ~A */
966 spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
967 break;
968 case PIPE_LOGICOP_XOR: /* s ^ d */
969 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
970 break;
971 case PIPE_LOGICOP_NAND: /* ~(s & d) */
972 spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
973 break;
974 case PIPE_LOGICOP_AND: /* s & d */
975 spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
976 break;
977 case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
978 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
979 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
980 break;
981 case PIPE_LOGICOP_NOOP: /* d */
982 spe_move(f, fragRGBA_reg, fbRGBA_reg);
983 break;
984 case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
985 /* orc R, A, B computes R = A | ~B */
986 spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
987 break;
988 case PIPE_LOGICOP_COPY: /* s */
989 break;
990 case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
991 /* orc R, A, B computes R = A | ~B */
992 spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
993 break;
994 case PIPE_LOGICOP_OR: /* s | d */
995 spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
996 break;
997 case PIPE_LOGICOP_SET: /* 1 */
998 spe_load_int(f, fragRGBA_reg, 0xffffffff);
999 break;
1000 default:
1001 ASSERT(0);
1002 }
1003 }
1004
1005
1006 /**
1007 * Generate code to pack a quad of float colors into four 32-bit integers.
1008 *
1009 * \param f SPE function to append instruction onto.
1010 * \param color_format the dest color packing format
1011 * \param r_reg register containing four red values (in/clobbered)
1012 * \param g_reg register containing four green values (in/clobbered)
1013 * \param b_reg register containing four blue values (in/clobbered)
1014 * \param a_reg register containing four alpha values (in/clobbered)
1015 * \param rgba_reg register to store the packed RGBA colors (out)
1016 */
1017 static void
1018 gen_pack_colors(struct spe_function *f,
1019 enum pipe_format color_format,
1020 int r_reg, int g_reg, int b_reg, int a_reg,
1021 int rgba_reg)
1022 {
1023 int rg_reg = spe_allocate_available_register(f);
1024 int ba_reg = spe_allocate_available_register(f);
1025
1026 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
1027 spe_cfltu(f, r_reg, r_reg, 32);
1028 spe_cfltu(f, g_reg, g_reg, 32);
1029 spe_cfltu(f, b_reg, b_reg, 32);
1030 spe_cfltu(f, a_reg, a_reg, 32);
1031
1032 /* Shift the most significant bytes to the least significant positions.
1033 * I.e.: reg = reg >> 24
1034 */
1035 spe_rotmi(f, r_reg, r_reg, -24);
1036 spe_rotmi(f, g_reg, g_reg, -24);
1037 spe_rotmi(f, b_reg, b_reg, -24);
1038 spe_rotmi(f, a_reg, a_reg, -24);
1039
1040 /* Shift the color bytes according to the surface format */
1041 if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
1042 spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */
1043 spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */
1044 spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */
1045 }
1046 else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1047 spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */
1048 spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */
1049 spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */
1050 }
1051 else {
1052 ASSERT(0);
1053 }
1054
1055 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
1056 * Eg: after shifting according to color_format we might have:
1057 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
1058 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
1059 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
1060 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
1061 * OR-ing all those together gives us four packed colors:
1062 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
1063 */
1064 spe_or(f, rg_reg, r_reg, g_reg);
1065 spe_or(f, ba_reg, a_reg, b_reg);
1066 spe_or(f, rgba_reg, rg_reg, ba_reg);
1067
1068 spe_release_register(f, rg_reg);
1069 spe_release_register(f, ba_reg);
1070 }
1071
1072
1073 static void
1074 gen_colormask(struct spe_function *f,
1075 uint colormask,
1076 enum pipe_format color_format,
1077 int fragRGBA_reg, int fbRGBA_reg)
1078 {
1079 /* We've got four 32-bit RGBA packed pixels in each of
1080 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
1081 * reds, greens, blues, and alphas. Further, the pixels
1082 * are packed according to the given color format, not
1083 * necessarily RGBA...
1084 */
1085 unsigned int r_mask;
1086 unsigned int g_mask;
1087 unsigned int b_mask;
1088 unsigned int a_mask;
1089
1090 /* Calculate exactly where the bits for any particular color
1091 * end up, so we can mask them correctly.
1092 */
1093 switch(color_format) {
1094 case PIPE_FORMAT_A8R8G8B8_UNORM:
1095 /* ARGB */
1096 a_mask = 0xff000000;
1097 r_mask = 0x00ff0000;
1098 g_mask = 0x0000ff00;
1099 b_mask = 0x000000ff;
1100 break;
1101 case PIPE_FORMAT_B8G8R8A8_UNORM:
1102 /* BGRA */
1103 b_mask = 0xff000000;
1104 g_mask = 0x00ff0000;
1105 r_mask = 0x0000ff00;
1106 a_mask = 0x000000ff;
1107 break;
1108 default:
1109 ASSERT(0);
1110 }
1111
1112 /* For each R, G, B, and A component we're supposed to mask out,
1113 * clear its bits. Then our mask operation later will work
1114 * as expected.
1115 */
1116 if (!(colormask & PIPE_MASK_R)) {
1117 r_mask = 0;
1118 }
1119 if (!(colormask & PIPE_MASK_G)) {
1120 g_mask = 0;
1121 }
1122 if (!(colormask & PIPE_MASK_B)) {
1123 b_mask = 0;
1124 }
1125 if (!(colormask & PIPE_MASK_A)) {
1126 a_mask = 0;
1127 }
1128
1129 /* Get a temporary register to hold the mask that will be applied
1130 * to the fragment
1131 */
1132 int colormask_reg = spe_allocate_available_register(f);
1133
1134 /* The actual mask we're going to use is an OR of the remaining R, G, B,
1135 * and A masks. Load the result value into our temporary register.
1136 */
1137 spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
1138
1139 /* Use the mask register to select between the fragment color
1140 * values and the frame buffer color values. Wherever the
1141 * mask has a 0 bit, the current frame buffer color should override
1142 * the fragment color. Wherever the mask has a 1 bit, the
1143 * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
1144 * instruction will select bits from its first operand rA wherever the
1145 * the mask bits rM are 0, and from its second operand rB wherever the
1146 * mask bits rM are 1. That means that the frame buffer color is the
1147 * first operand, and the fragment color the second.
1148 */
1149 spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
1150
1151 /* Release the temporary register and we're done */
1152 spe_release_register(f, colormask_reg);
1153 }
1154
1155
1156 /**
1157 * This function is annoyingly similar to gen_depth_test(), above, except
1158 * that instead of comparing two varying values (i.e. fragment and buffer),
1159 * we're comparing a varying value with a static value. As such, we have
1160 * access to the Compare Immediate instructions where we don't in
1161 * gen_depth_test(), which is what makes us very different.
1162 *
1163  * There's some added complexity if there's a non-trivial state->value_mask
1164  * value; then stencil and reference both must be masked
1165 *
1166 * The return value in the stencil_pass_reg is a bitmask of valid
1167 * fragments that also passed the stencil test. The bitmask of valid
1168 * fragments that failed would be found in
1169 * (fragment_mask_reg & ~stencil_pass_reg).
1170 */
1171 static void
1172 gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
1173 unsigned int stencil_max_value,
1174 unsigned int fragment_mask_reg, unsigned int fbS_reg,
1175 unsigned int stencil_pass_reg)
1176 {
1177 /* Generate code that puts the set of passing fragments into the
1178 * stencil_pass_reg register, taking into account whether each fragment
1179 * was active to begin with.
1180 */
1181 switch (state->func) {
1182 case PIPE_FUNC_EQUAL:
1183 if (state->value_mask == stencil_max_value) {
1184 /* stencil_pass = fragment_mask & (s == reference) */
1185 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1186 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1187 }
1188 else {
1189 /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
1190 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1191 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1192 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
1193 state->value_mask & state->ref_value);
1194 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1195 spe_release_register(f, tmp_masked_stencil);
1196 }
1197 break;
1198
1199 case PIPE_FUNC_NOTEQUAL:
1200 if (state->value_mask == stencil_max_value) {
1201 /* stencil_pass = fragment_mask & ~(s == reference) */
1202 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1203 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1204 }
1205 else {
1206 /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
1207 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1208 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1209 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
1210 state->value_mask & state->ref_value);
1211 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1212 spe_release_register(f, tmp_masked_stencil);
1213 }
1214 break;
1215
1216 case PIPE_FUNC_LESS:
1217 if (state->value_mask == stencil_max_value) {
1218 /* stencil_pass = fragment_mask & (reference < s) */
1219 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1220 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1221 }
1222 else {
1223 /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
1224 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1225 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1226 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
1227 state->value_mask & state->ref_value);
1228 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1229 spe_release_register(f, tmp_masked_stencil);
1230 }
1231 break;
1232
1233 case PIPE_FUNC_GREATER:
1234 if (state->value_mask == stencil_max_value) {
1235 /* stencil_pass = fragment_mask & (reference > s) */
1236 /* There's no convenient Compare Less Than Immediate instruction, so
1237 * we'll have to do this one the harder way, by loading a register and
1238 * comparing directly. Compare Logical Greater Than Word (clgt)
1239 * treats its operands as unsigned - no sign extension.
1240 */
1241 unsigned int tmp_reg = spe_allocate_available_register(f);
1242 spe_load_uint(f, tmp_reg, state->ref_value);
1243 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1244 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1245 spe_release_register(f, tmp_reg);
1246 }
1247 else {
1248 /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
1249 unsigned int tmp_reg = spe_allocate_available_register(f);
1250 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1251 spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
1252 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1253 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1254 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1255 spe_release_register(f, tmp_reg);
1256 spe_release_register(f, tmp_masked_stencil);
1257 }
1258 break;
1259
1260 case PIPE_FUNC_GEQUAL:
1261 if (state->value_mask == stencil_max_value) {
1262 /* stencil_pass = fragment_mask & (reference >= s)
1263 * = fragment_mask & ~(s > reference) */
1264 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg,
1265 state->ref_value);
1266 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1267 }
1268 else {
1269 /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
1270 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1271 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1272 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
1273 state->value_mask & state->ref_value);
1274 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1275 spe_release_register(f, tmp_masked_stencil);
1276 }
1277 break;
1278
1279 case PIPE_FUNC_LEQUAL:
1280 if (state->value_mask == stencil_max_value) {
1281 /* stencil_pass = fragment_mask & (reference <= s) ]
1282 * = fragment_mask & ~(reference > s) */
1283 /* As above, we have to do this by loading a register */
1284 unsigned int tmp_reg = spe_allocate_available_register(f);
1285 spe_load_uint(f, tmp_reg, state->ref_value);
1286 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1287 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1288 spe_release_register(f, tmp_reg);
1289 }
1290 else {
1291 /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
1292 unsigned int tmp_reg = spe_allocate_available_register(f);
1293 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1294 spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
1295 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1296 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1297 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1298 spe_release_register(f, tmp_reg);
1299 spe_release_register(f, tmp_masked_stencil);
1300 }
1301 break;
1302
1303 case PIPE_FUNC_NEVER:
1304 /* stencil_pass = fragment_mask & 0 = 0 */
1305 spe_load_uint(f, stencil_pass_reg, 0);
1306 break;
1307
1308 case PIPE_FUNC_ALWAYS:
1309 /* stencil_pass = fragment_mask & 1 = fragment_mask */
1310 spe_move(f, stencil_pass_reg, fragment_mask_reg);
1311 break;
1312 }
1313
1314 /* The fragments that passed the stencil test are now in stencil_pass_reg.
1315 * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
1316 */
1317 }
1318
1319 /* This function generates code that calculates a set of new stencil values
1320 * given the earlier values and the operation to apply. It does not
1321 * apply any tests. It is intended to be called up to 3 times
1322 * (for the stencil fail operation, for the stencil pass-z fail operation,
1323 * and for the stencil pass-z pass operation) to collect up to three
1324 * possible sets of values, and for the caller to combine them based
1325 * on the result of the tests.
1326 *
1327 * stencil_max_value should be (2^n - 1) where n is the number of bits
1328 * in the stencil buffer - in other words, it should be usable as a mask.
1329 */
1330 static void
1331 gen_stencil_values(struct spe_function *f,
1332 unsigned int stencil_op,
1333 unsigned int stencil_ref_value,
1334 unsigned int stencil_max_value,
1335 unsigned int fbS_reg,
1336 unsigned int newS_reg)
1337 {
1338 /* The code below assumes that newS_reg and fbS_reg are not the same
1339 * register; if they can be, the calculations below will have to use
1340 * an additional temporary register. For now, mark the assumption
1341 * with an assertion that will fail if they are the same.
1342 */
1343 ASSERT(fbS_reg != newS_reg);
1344
1345 /* The code also assumes the the stencil_max_value is of the form
1346 * 2^n-1 and can therefore be used as a mask for the valid bits in
1347 * addition to a maximum. Make sure this is the case as well.
1348 * The clever math below exploits the fact that incrementing a
1349 * binary number serves to flip all the bits of a number starting at
1350 * the LSB and continuing to (and including) the first zero bit
1351 * found. That means that a number and its increment will always
1352 * have at least one bit in common (the high order bit, if nothing
1353 * else) *unless* the number is zero, *or* the number is of a form
1354 * consisting of some number of 1s in the low-order bits followed
1355 * by nothing but 0s in the high-order bits. The latter case
1356 * implies it's of the form 2^n-1.
1357 */
1358 ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
1359
1360 switch(stencil_op) {
1361 case PIPE_STENCIL_OP_KEEP:
1362 /* newS = S */
1363 spe_move(f, newS_reg, fbS_reg);
1364 break;
1365
1366 case PIPE_STENCIL_OP_ZERO:
1367 /* newS = 0 */
1368 spe_zero(f, newS_reg);
1369 break;
1370
1371 case PIPE_STENCIL_OP_REPLACE:
1372 /* newS = stencil reference value */
1373 spe_load_uint(f, newS_reg, stencil_ref_value);
1374 break;
1375
1376 case PIPE_STENCIL_OP_INCR: {
1377 /* newS = (s == max ? max : s + 1) */
1378 unsigned int equals_reg = spe_allocate_available_register(f);
1379
1380 spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
1381 /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
1382 spe_ai(f, newS_reg, fbS_reg, 1);
1383 /* Select from the current value or the new value based on the equality test */
1384 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1385
1386 spe_release_register(f, equals_reg);
1387 break;
1388 }
1389 case PIPE_STENCIL_OP_DECR: {
1390 /* newS = (s == 0 ? 0 : s - 1) */
1391 unsigned int equals_reg = spe_allocate_available_register(f);
1392
1393 spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
1394 /* Add Word Immediate with a (-1) value works */
1395 spe_ai(f, newS_reg, fbS_reg, -1);
1396 /* Select from the current value or the new value based on the equality test */
1397 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1398
1399 spe_release_register(f, equals_reg);
1400 break;
1401 }
1402 case PIPE_STENCIL_OP_INCR_WRAP:
1403 /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
1404 * do a normal add and mask off the correct bits
1405 */
1406 spe_ai(f, newS_reg, fbS_reg, 1);
1407 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1408 break;
1409
1410 case PIPE_STENCIL_OP_DECR_WRAP:
1411 /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
1412 spe_ai(f, newS_reg, fbS_reg, -1);
1413 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1414 break;
1415
1416 case PIPE_STENCIL_OP_INVERT:
1417 /* newS = ~s. We take advantage of the mask/max value to invert only
1418 * the valid bits for the field so we don't have to do an extra "and".
1419 */
1420 spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
1421 break;
1422
1423 default:
1424 ASSERT(0);
1425 }
1426 }
1427
1428
1429 /* This function generates code to get all the necessary possible
1430 * stencil values. For each of the output registers (fail_reg,
1431 * zfail_reg, and zpass_reg), it either allocates a new register
1432 * and calculates a new set of values based on the stencil operation,
1433 * or it reuses a register allocation and calculation done for an
1434 * earlier (matching) operation, or it reuses the fbS_reg register
1435 * (if the stencil operation is KEEP, which doesn't change the
1436 * stencil buffer).
1437 *
1438 * Since this function allocates a variable number of registers,
1439 * to avoid incurring complex logic to free them, they should
1440 * be allocated after a spe_allocate_register_set() call
1441 * and released by the corresponding spe_release_register_set() call.
1442 */
1443 static void
1444 gen_get_stencil_values(struct spe_function *f,
1445 const struct pipe_stencil_state *stencil,
1446 const unsigned int depth_enabled,
1447 unsigned int fbS_reg,
1448 unsigned int *fail_reg,
1449 unsigned int *zfail_reg,
1450 unsigned int *zpass_reg)
1451 {
1452 unsigned zfail_op;
1453
1454 /* Stenciling had better be enabled here */
1455 ASSERT(stencil->enabled);
1456
1457 /* If the depth test is not enabled, it is treated as though it always
1458 * passes, which means that the zfail_op is not considered - a
1459 * failing stencil test triggers the fail_op, and a passing one
1460 * triggers the zpass_op
1461 *
1462 * As an optimization, override calculation of the zfail_op values
1463 * if they aren't going to be used. By setting the value of
1464 * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
1465 * to match the incoming stencil values, and no calculation will
1466 * be done.
1467 */
1468 if (depth_enabled) {
1469 zfail_op = stencil->zfail_op;
1470 }
1471 else {
1472 zfail_op = PIPE_STENCIL_OP_KEEP;
1473 }
1474
1475 /* One-sided or front-facing stencil */
1476 if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
1477 *fail_reg = fbS_reg;
1478 }
1479 else {
1480 *fail_reg = spe_allocate_available_register(f);
1481 gen_stencil_values(f, stencil->fail_op, stencil->ref_value,
1482 0xff, fbS_reg, *fail_reg);
1483 }
1484
1485 /* Check the possibly overridden value, not the structure value */
1486 if (zfail_op == PIPE_STENCIL_OP_KEEP) {
1487 *zfail_reg = fbS_reg;
1488 }
1489 else if (zfail_op == stencil->fail_op) {
1490 *zfail_reg = *fail_reg;
1491 }
1492 else {
1493 *zfail_reg = spe_allocate_available_register(f);
1494 gen_stencil_values(f, stencil->zfail_op, stencil->ref_value,
1495 0xff, fbS_reg, *zfail_reg);
1496 }
1497
1498 if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
1499 *zpass_reg = fbS_reg;
1500 }
1501 else if (stencil->zpass_op == stencil->fail_op) {
1502 *zpass_reg = *fail_reg;
1503 }
1504 else if (stencil->zpass_op == zfail_op) {
1505 *zpass_reg = *zfail_reg;
1506 }
1507 else {
1508 *zpass_reg = spe_allocate_available_register(f);
1509 gen_stencil_values(f, stencil->zpass_op, stencil->ref_value,
1510 0xff, fbS_reg, *zpass_reg);
1511 }
1512 }
1513
1514 /* Note that fbZ_reg may *not* be set on entry, if in fact
1515 * the depth test is not enabled. This function must not use
1516 * the register if depth is not enabled.
1517 */
1518 static boolean
1519 gen_stencil_depth_test(struct spe_function *f,
1520 const struct pipe_depth_stencil_alpha_state *dsa,
1521 const uint facing,
1522 const int mask_reg, const int fragZ_reg,
1523 const int fbZ_reg, const int fbS_reg)
1524 {
1525 /* True if we've generated code that could require writeback to the
1526 * depth and/or stencil buffers
1527 */
1528 boolean modified_buffers = false;
1529
1530 boolean need_to_calculate_stencil_values;
1531 boolean need_to_writemask_stencil_values;
1532
1533 struct pipe_stencil_state *stencil;
1534
1535 /* Registers. We may or may not actually allocate these, depending
1536 * on whether the state values indicate that we need them.
1537 */
1538 unsigned int stencil_pass_reg, stencil_fail_reg;
1539 unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
1540 unsigned int stencil_writemask_reg;
1541 unsigned int zmask_reg;
1542 unsigned int newS_reg;
1543
1544 /* Stenciling is quite complex: up to six different configurable stencil
1545 * operations/calculations can be required (three each for front-facing
1546 * and back-facing fragments). Many of those operations will likely
1547 * be identical, so there's good reason to try to avoid calculating
1548 * the same values more than once (which unfortunately makes the code less
1549 * straightforward).
1550 *
1551 * To make register management easier, we start a new
1552 * register set; we can release all the registers in the set at
1553 * once, and avoid having to keep track of exactly which registers
1554 * we allocate. We can still allocate and free registers as
1555 * desired (if we know we no longer need a register), but we don't
1556 * have to spend the complexity to track the more difficult variant
1557 * register usage scenarios.
1558 */
1559 spe_comment(f, 0, "Allocating stencil register set");
1560 spe_allocate_register_set(f);
1561
1562 /* The facing we're given is the fragment facing; it doesn't
1563 * exactly match the stencil facing. If stencil is enabled,
1564 * but two-sided stencil is *not* enabled, we use the same
1565 * stencil settings for both front- and back-facing fragments.
1566 * We only use the "back-facing" stencil for backfacing fragments
1567 * if two-sided stenciling is enabled.
1568 */
1569 if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
1570 stencil = &dsa->stencil[1];
1571 }
1572 else {
1573 stencil = &dsa->stencil[0];
1574 }
1575
1576 /* Calculate the writemask. If the writemask is trivial (either
1577 * all 0s, meaning that we don't need to calculate any stencil values
1578 * because they're not going to change the stencil anyway, or all 1s,
1579 * meaning that we have to calculate the stencil values but do not
1580 * need to mask them), we can avoid generating code. Don't forget
1581 * that we need to consider backfacing stencil, if enabled.
1582 *
1583 * Note that if the backface stencil is *not* enabled, the backface
1584 * stencil will have the same values as the frontface stencil.
1585 */
1586 if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
1587 stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
1588 stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
1589 need_to_calculate_stencil_values = false;
1590 need_to_writemask_stencil_values = false;
1591 }
1592 else if (stencil->write_mask == 0x0) {
1593 /* All changes are writemasked out, so no need to calculate
1594 * what those changes might be, and no need to write anything back.
1595 */
1596 need_to_calculate_stencil_values = false;
1597 need_to_writemask_stencil_values = false;
1598 }
1599 else if (stencil->write_mask == 0xff) {
1600 /* Still trivial, but a little less so. We need to write the stencil
1601 * values, but we don't need to mask them.
1602 */
1603 need_to_calculate_stencil_values = true;
1604 need_to_writemask_stencil_values = false;
1605 }
1606 else {
1607 /* The general case: calculate, mask, and write */
1608 need_to_calculate_stencil_values = true;
1609 need_to_writemask_stencil_values = true;
1610
1611 /* While we're here, generate code that calculates what the
1612 * writemask should be. If backface stenciling is enabled,
1613 * and the backface writemask is not the same as the frontface
1614 * writemask, we'll have to generate code that merges the
1615 * two masks into a single effective mask based on fragment facing.
1616 */
1617 spe_comment(f, 0, "Computing stencil writemask");
1618 stencil_writemask_reg = spe_allocate_available_register(f);
1619 spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask);
1620 }
1621
1622 /* At least one-sided stenciling must be on. Generate code that
1623 * runs the stencil test on the basic/front-facing stencil, leaving
1624 * the mask of passing stencil bits in stencil_pass_reg. This mask will
1625 * be used both to mask the set of active pixels, and also to
1626 * determine how the stencil buffer changes.
1627 *
1628 * This test will *not* change the value in mask_reg (because we don't
1629 * yet know whether to apply the two-sided stencil or one-sided stencil).
1630 */
1631 spe_comment(f, 0, "Running basic stencil test");
1632 stencil_pass_reg = spe_allocate_available_register(f);
1633 gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
1634
1635 /* Generate code that, given the mask of valid fragments and the
1636 * mask of valid fragments that passed the stencil test, computes
1637 * the mask of valid fragments that failed the stencil test. We
1638 * have to do this before we run a depth test (because the
1639 * depth test should not be performed on fragments that failed the
1640 * stencil test, and because the depth test will update the
1641 * mask of valid fragments based on the results of the depth test).
1642 */
1643 spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
1644 stencil_fail_reg = spe_allocate_available_register(f);
1645 spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
1646 /* Now remove the stenciled-out pixels from the valid fragment mask,
1647 * so we can later use the valid fragment mask in the depth test.
1648 */
1649 spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
1650
1651 /* We may not need to calculate stencil values, if the writemask is off */
1652 if (need_to_calculate_stencil_values) {
1653 /* Generate code that calculates exactly which stencil values we need,
1654 * without calculating the same value twice (say, if two different
1655 * stencil ops have the same value). This code will work for one-sided
1656 * and two-sided stenciling (so that we take into account that operations
1657 * may match between front and back stencils), and will also take into
1658 * account whether the depth test is enabled (if the depth test is off,
1659 * we don't need any of the zfail results, because the depth test always
1660 * is considered to pass if it is disabled). Any register value that
1661 * does not need to be calculated will come back with the same value
1662 * that's in fbS_reg.
1663 *
1664 * This function will allocate a variant number of registers that
1665 * will be released as part of the register set.
1666 */
1667 spe_comment(f, 0, facing == CELL_FACING_FRONT
1668 ? "Computing front-facing stencil values"
1669 : "Computing back-facing stencil values");
1670 gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg,
1671 &stencil_fail_values, &stencil_pass_depth_fail_values,
1672 &stencil_pass_depth_pass_values);
1673 }
1674
1675 /* We now have all the stencil values we need. We also need
1676 * the results of the depth test to figure out which
1677 * stencil values will become the new stencil values. (Even if
1678 * we aren't actually calculating stencil values, we need to apply
1679 * the depth test if it's enabled.)
1680 *
1681 * The code generated by gen_depth_test() returns the results of the
1682 * test in the given register, but also alters the mask_reg based
1683 * on the results of the test.
1684 */
1685 if (dsa->depth.enabled) {
1686 spe_comment(f, 0, "Running stencil depth test");
1687 zmask_reg = spe_allocate_available_register(f);
1688 modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg,
1689 fbZ_reg, zmask_reg);
1690 }
1691
1692 if (need_to_calculate_stencil_values) {
1693
1694 /* If we need to writemask the stencil values before going into
1695 * the stencil buffer, we'll have to use a new register to
1696 * hold the new values. If not, we can just keep using the
1697 * current register.
1698 */
1699 if (need_to_writemask_stencil_values) {
1700 newS_reg = spe_allocate_available_register(f);
1701 spe_comment(f, 0, "Saving current stencil values for writemasking");
1702 spe_move(f, newS_reg, fbS_reg);
1703 }
1704 else {
1705 newS_reg = fbS_reg;
1706 }
1707
1708 /* Merge in the selected stencil fail values */
1709 if (stencil_fail_values != fbS_reg) {
1710 spe_comment(f, 0, "Loading stencil fail values");
1711 spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
1712 modified_buffers = true;
1713 }
1714
1715 /* Same for the stencil pass/depth fail values. If this calculation
1716 * is not needed (say, if depth test is off), then the
1717 * stencil_pass_depth_fail_values register will be equal to fbS_reg
1718 * and we'll skip the calculation.
1719 */
1720 if (stencil_pass_depth_fail_values != fbS_reg) {
1721 /* We don't actually have a stencil pass/depth fail mask yet.
1722 * Calculate it here from the stencil passing mask and the
1723 * depth passing mask. Note that zmask_reg *must* have been
1724 * set above if we're here.
1725 */
1726 unsigned int stencil_pass_depth_fail_mask =
1727 spe_allocate_available_register(f);
1728
1729 spe_comment(f, 0, "Loading stencil pass/depth fail values");
1730 spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
1731
1732 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values,
1733 stencil_pass_depth_fail_mask);
1734
1735 spe_release_register(f, stencil_pass_depth_fail_mask);
1736 modified_buffers = true;
1737 }
1738
1739 /* Same for the stencil pass/depth pass mask. Note that we
1740 * *can* get here with zmask_reg being unset (if the depth
1741 * test is off but the stencil test is on). In this case,
1742 * we assume the depth test passes, and don't need to mask
1743 * the stencil pass mask with the Z mask.
1744 */
1745 if (stencil_pass_depth_pass_values != fbS_reg) {
1746 if (dsa->depth.enabled) {
1747 unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
1748 /* We'll need a separate register */
1749 spe_comment(f, 0, "Loading stencil pass/depth pass values");
1750 spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
1751 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
1752 spe_release_register(f, stencil_pass_depth_pass_mask);
1753 }
1754 else {
1755 /* We can use the same stencil-pass register */
1756 spe_comment(f, 0, "Loading stencil pass values");
1757 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
1758 }
1759 modified_buffers = true;
1760 }
1761
1762 /* Almost done. If we need to writemask, do it now, leaving the
1763 * results in the fbS_reg register passed in. If we don't need
1764 * to writemask, then the results are *already* in the fbS_reg,
1765 * so there's nothing more to do.
1766 */
1767
1768 if (need_to_writemask_stencil_values && modified_buffers) {
1769 /* The Select Bytes command makes a fine writemask. Where
1770 * the mask is 0, the first (original) values are retained,
1771 * effectively masking out changes. Where the mask is 1, the
1772 * second (new) values are retained, incorporating changes.
1773 */
1774 spe_comment(f, 0, "Writemasking new stencil values");
1775 spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
1776 }
1777
1778 } /* done calculating stencil values */
1779
1780 /* The stencil and/or depth values have been applied, and the
1781 * mask_reg, fbS_reg, and fbZ_reg values have been updated.
1782 * We're all done, except that we've allocated a fair number
1783 * of registers that we didn't bother tracking. Release all
1784 * those registers as part of the register set, and go home.
1785 */
1786 spe_comment(f, 0, "Releasing stencil register set");
1787 spe_release_register_set(f);
1788
1789 /* Return true if we could have modified the stencil and/or
1790 * depth buffers.
1791 */
1792 return modified_buffers;
1793 }
1794
1795
1796 /**
1797 * Generate SPE code to implement the fragment operations (alpha test,
1798 * depth test, stencil test, blending, colormask, and final
1799 * framebuffer write) as specified by the current context state.
1800 *
1801 * Logically, this code will be called after running the fragment
1802 * shader. But under some circumstances we could run some of this
1803 * code before the fragment shader to cull fragments/quads that are
1804 * totally occluded/discarded.
1805 *
 * XXX only the Z/stencil formats handled by the zs_format switch below are
 * supported (S8Z24, X8Z24, Z24S8, Z24X8, Z32, Z16); S8-only is not yet done.
1807 *
1808 * See the spu_default_fragment_ops() function to see how the per-fragment
1809 * operations would be done with ordinary C code.
1810 * The code we generate here though has no branches, is SIMD, etc and
1811 * should be much faster.
1812 *
1813 * \param cell the rendering context (in)
1814 * \param facing whether the generated code is for front-facing or
1815 * back-facing fragments
1816 * \param f the generated function (in/out); on input, the function
1817 * must already have been initialized. On exit, whatever
1818 * instructions within the generated function have had
1819 * the fragment ops appended.
1820 */
1821 void
1822 cell_gen_fragment_function(struct cell_context *cell,
1823 const uint facing,
1824 struct spe_function *f)
1825 {
1826 const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
1827 const struct pipe_blend_state *blend = cell->blend;
1828 const struct pipe_blend_color *blend_color = &cell->blend_color;
1829 const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
1830
1831 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
1832 const int x_reg = 3; /* uint */
1833 const int y_reg = 4; /* uint */
1834 const int color_tile_reg = 5; /* tile_t * */
1835 const int depth_tile_reg = 6; /* tile_t * */
1836 const int fragZ_reg = 7; /* vector float */
1837 const int fragR_reg = 8; /* vector float */
1838 const int fragG_reg = 9; /* vector float */
1839 const int fragB_reg = 10; /* vector float */
1840 const int fragA_reg = 11; /* vector float */
1841 const int mask_reg = 12; /* vector uint */
1842
1843 ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
1844
1845 /* offset of quad from start of tile
1846 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
1847 */
1848 int quad_offset_reg;
1849
1850 int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
1851 int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
1852
1853 if (cell->debug_flags & CELL_DEBUG_ASM) {
1854 spe_print_code(f, true);
1855 spe_indent(f, 8);
1856 spe_comment(f, -4, facing == CELL_FACING_FRONT
1857 ? "Begin front-facing per-fragment ops"
1858 : "Begin back-facing per-fragment ops");
1859 }
1860
1861 spe_allocate_register(f, x_reg);
1862 spe_allocate_register(f, y_reg);
1863 spe_allocate_register(f, color_tile_reg);
1864 spe_allocate_register(f, depth_tile_reg);
1865 spe_allocate_register(f, fragZ_reg);
1866 spe_allocate_register(f, fragR_reg);
1867 spe_allocate_register(f, fragG_reg);
1868 spe_allocate_register(f, fragB_reg);
1869 spe_allocate_register(f, fragA_reg);
1870 spe_allocate_register(f, mask_reg);
1871
1872 quad_offset_reg = spe_allocate_available_register(f);
1873 fbRGBA_reg = spe_allocate_available_register(f);
1874 fbZS_reg = spe_allocate_available_register(f);
1875
1876 /* compute offset of quad from start of tile, in bytes */
1877 {
1878 int x2_reg = spe_allocate_available_register(f);
1879 int y2_reg = spe_allocate_available_register(f);
1880
1881 ASSERT(TILE_SIZE == 32);
1882
1883 spe_comment(f, 0, "Compute quad offset within tile");
1884 spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
1885 spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
1886 spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
1887 spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
1888 spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
1889
1890 spe_release_register(f, x2_reg);
1891 spe_release_register(f, y2_reg);
1892 }
1893
1894 /* Generate the alpha test, if needed. */
1895 if (dsa->alpha.enabled) {
1896 gen_alpha_test(dsa, f, mask_reg, fragA_reg);
1897 }
1898
1899 /* If we need the stencil buffers (because one- or two-sided stencil is
1900 * enabled) or the depth buffer (because the depth test is enabled),
1901 * go grab them. Note that if either one- or two-sided stencil is
1902 * enabled, dsa->stencil[0].enabled will be true.
1903 */
1904 if (dsa->depth.enabled || dsa->stencil[0].enabled) {
1905 const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
1906 boolean write_depth_stencil;
1907
1908 /* We may or may not need to allocate a register for Z or stencil values */
1909 boolean fbS_reg_set = false, fbZ_reg_set = false;
1910 unsigned int fbS_reg, fbZ_reg = 0;
1911
1912 spe_comment(f, 0, "Fetch Z/stencil quad from tile");
1913
1914 /* fetch quad of depth/stencil values from tile at (x,y) */
1915 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1916 /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
1917 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1918
1919 /* From the Z/stencil buffer format, pull out the bits we need for
1920 * Z and/or stencil. We'll also convert the incoming fragment Z
1921 * value in fragZ_reg from a floating point value in [0.0..1.0] to
1922 * an unsigned integer value with the appropriate resolution.
1923 * Note that even if depth or stencil is *not* enabled, if it's
1924 * present in the buffer, we pull it out and put it back later;
1925 * otherwise, we can inadvertently destroy the contents of
1926 * buffers we're not supposed to touch (e.g., if the user is
1927 * clearing the depth buffer but not the stencil buffer, a
1928 * quad of constant depth is drawn over the surface; the stencil
1929 * buffer must be maintained).
1930 */
1931 switch(zs_format) {
1932 case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
1933 case PIPE_FORMAT_X8Z24_UNORM:
1934 /* Pull out both Z and stencil */
1935 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1936 setup_optional_register(f, &fbS_reg_set, &fbS_reg);
1937
1938 /* four 24-bit Z values in the low-order bits */
1939 spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
1940
1941 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1942 * to a 24-bit unsigned integer
1943 */
1944 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1945 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1946
1947 /* four 8-bit stencil values in the high-order bits */
1948 spe_rotmi(f, fbS_reg, fbZS_reg, -24);
1949 break;
1950
1951 case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
1952 case PIPE_FORMAT_Z24X8_UNORM:
1953 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1954 setup_optional_register(f, &fbS_reg_set, &fbS_reg);
1955
1956 /* shift by 8 to get the upper 24-bit values */
1957 spe_rotmi(f, fbS_reg, fbZS_reg, -8);
1958
1959 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1960 * to a 24-bit unsigned integer
1961 */
1962 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1963 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1964
1965 /* 8-bit stencil in the low-order bits - mask them out */
1966 spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
1967 break;
1968
1969 case PIPE_FORMAT_Z32_UNORM:
1970 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1971 /* Copy over 4 32-bit values */
1972 spe_move(f, fbZ_reg, fbZS_reg);
1973
1974 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1975 * to a 32-bit unsigned integer
1976 */
1977 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1978 /* No stencil, so can't do anything there */
1979 break;
1980
1981 case PIPE_FORMAT_Z16_UNORM:
1982 /* XXX Not sure this is correct, but it was here before, so we're
1983 * going with it for now
1984 */
1985 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1986 /* Copy over 4 32-bit values */
1987 spe_move(f, fbZ_reg, fbZS_reg);
1988
1989 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1990 * to a 16-bit unsigned integer
1991 */
1992 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1993 spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
1994 /* No stencil */
1995 break;
1996
1997 default:
1998 ASSERT(0); /* invalid format */
1999 }
2000
2001 /* If stencil is enabled, use the stencil-specific code
2002 * generator to generate both the stencil and depth (if needed)
2003 * tests. Otherwise, if only depth is enabled, generate
2004 * a quick depth test. The test generators themselves will
2005 * report back whether the depth/stencil buffer has to be
2006 * written back.
2007 */
2008 if (dsa->stencil[0].enabled) {
2009 /* This will perform the stencil and depth tests, and update
2010 * the mask_reg, fbZ_reg, and fbS_reg as required by the
2011 * tests.
2012 */
2013 ASSERT(fbS_reg_set);
2014 spe_comment(f, 0, "Perform stencil test");
2015
2016 /* Note that fbZ_reg may not be set on entry, if stenciling
2017 * is enabled but there's no Z-buffer. The
2018 * gen_stencil_depth_test() function must ignore the
2019 * fbZ_reg register if depth is not enabled.
2020 */
2021 write_depth_stencil = gen_stencil_depth_test(f, dsa, facing,
2022 mask_reg, fragZ_reg,
2023 fbZ_reg, fbS_reg);
2024 }
2025 else if (dsa->depth.enabled) {
2026 int zmask_reg = spe_allocate_available_register(f);
2027 ASSERT(fbZ_reg_set);
2028 spe_comment(f, 0, "Perform depth test");
2029 write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg,
2030 fbZ_reg, zmask_reg);
2031 spe_release_register(f, zmask_reg);
2032 }
2033 else {
2034 write_depth_stencil = false;
2035 }
2036
2037 if (write_depth_stencil) {
2038 /* Merge latest Z and Stencil values into fbZS_reg.
2039 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
2040 * fbS_reg has four 8-bit Z values in bits [7..0].
2041 */
2042 spe_comment(f, 0, "Store quad's depth/stencil values in tile");
2043 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
2044 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
2045 spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
2046 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
2047 }
2048 else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
2049 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
2050 spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
2051 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
2052 }
2053 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
2054 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
2055 }
2056 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
2057 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
2058 }
2059 else if (zs_format == PIPE_FORMAT_S8_UNORM) {
2060 ASSERT(0); /* XXX to do */
2061 }
2062 else {
2063 ASSERT(0); /* bad zs_format */
2064 }
2065
2066 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
2067 spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
2068 }
2069
2070 /* Don't need these any more */
2071 release_optional_register(f, &fbZ_reg_set, fbZ_reg);
2072 release_optional_register(f, &fbS_reg_set, fbS_reg);
2073 }
2074
2075 /* Get framebuffer quad/colors. We'll need these for blending,
2076 * color masking, and to obey the quad/pixel mask.
2077 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
2078 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
2079 * we could skip this load.
2080 */
2081 spe_comment(f, 0, "Fetch quad colors from tile");
2082 spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
2083
2084 if (blend->blend_enable) {
2085 spe_comment(f, 0, "Perform blending");
2086 gen_blend(blend, blend_color, f, color_format,
2087 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
2088 }
2089
2090 /*
2091 * Write fragment colors to framebuffer/tile.
2092 * This involves converting the fragment colors from float[4] to the
2093 * tile's specific format and obeying the quad/pixel mask.
2094 */
2095 {
2096 int rgba_reg = spe_allocate_available_register(f);
2097
2098 /* Pack four float colors as four 32-bit int colors */
2099 spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
2100 gen_pack_colors(f, color_format,
2101 fragR_reg, fragG_reg, fragB_reg, fragA_reg,
2102 rgba_reg);
2103
2104 if (blend->logicop_enable) {
2105 spe_comment(f, 0, "Compute logic op");
2106 gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
2107 }
2108
2109 if (blend->colormask != PIPE_MASK_RGBA) {
2110 spe_comment(f, 0, "Compute color mask");
2111 gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
2112 }
2113
2114 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
2115 * if (mask[i])
2116 * rgba[i] = rgba[i];
2117 * else
2118 * rgba[i] = framebuffer[i];
2119 */
2120 spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
2121
2122 /* Store updated quad in tile:
2123 * memory[color_tile + quad_offset] = rgba_reg;
2124 */
2125 spe_comment(f, 0, "Store quad colors into color tile");
2126 spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
2127
2128 spe_release_register(f, rgba_reg);
2129 }
2130
2131 //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
2132
2133 spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
2134
2135 spe_release_register(f, fbRGBA_reg);
2136 spe_release_register(f, fbZS_reg);
2137 spe_release_register(f, quad_offset_reg);
2138
2139 if (cell->debug_flags & CELL_DEBUG_ASM) {
2140 char buffer[1024];
2141 sprintf(buffer, "End %s-facing per-fragment ops: %d instructions",
2142 facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst);
2143 spe_comment(f, -4, buffer);
2144 }
2145 }