Merge commit 'origin/gallium-0.1' into gallium-0.2
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fragment.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29
30 /**
31 * Generate SPU per-fragment code (actually per-quad code).
32 * \author Brian Paul
33 */
34
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
41
42
43
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
46
47
48 /**
49 * Generate SPE code to perform Z/depth testing.
50 *
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
57 *
58 * Returns true if the Z-buffer needs to be updated.
59 */
60 static boolean
61 gen_depth_test(struct spe_function *f,
62 const struct pipe_depth_stencil_alpha_state *dsa,
63 int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
64 {
65 /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
66 * quantities. This only makes a difference for 32-bit Z values though.
67 */
68 ASSERT(dsa->depth.enabled);
69
70 switch (dsa->depth.func) {
71 case PIPE_FUNC_EQUAL:
72 /* zmask = (ifragZ == ref) */
73 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
74 /* mask = (mask & zmask) */
75 spe_and(f, mask_reg, mask_reg, zmask_reg);
76 break;
77
78 case PIPE_FUNC_NOTEQUAL:
79 /* zmask = (ifragZ == ref) */
80 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
81 /* mask = (mask & ~zmask) */
82 spe_andc(f, mask_reg, mask_reg, zmask_reg);
83 break;
84
85 case PIPE_FUNC_GREATER:
86 /* zmask = (ifragZ > ref) */
87 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
88 /* mask = (mask & zmask) */
89 spe_and(f, mask_reg, mask_reg, zmask_reg);
90 break;
91
92 case PIPE_FUNC_LESS:
93 /* zmask = (ref > ifragZ) */
94 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
95 /* mask = (mask & zmask) */
96 spe_and(f, mask_reg, mask_reg, zmask_reg);
97 break;
98
99 case PIPE_FUNC_LEQUAL:
100 /* zmask = (ifragZ > ref) */
101 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
102 /* mask = (mask & ~zmask) */
103 spe_andc(f, mask_reg, mask_reg, zmask_reg);
104 break;
105
106 case PIPE_FUNC_GEQUAL:
107 /* zmask = (ref > ifragZ) */
108 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
109 /* mask = (mask & ~zmask) */
110 spe_andc(f, mask_reg, mask_reg, zmask_reg);
111 break;
112
113 case PIPE_FUNC_NEVER:
114 spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */
115 spe_move(f, zmask_reg, mask_reg); /* zmask = mask */
116 break;
117
118 case PIPE_FUNC_ALWAYS:
119 /* mask unchanged */
120 spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */
121 break;
122
123 default:
124 ASSERT(0);
125 break;
126 }
127
128 if (dsa->depth.writemask) {
129 /*
130 * If (ztest passed) {
131 * framebufferZ = fragmentZ;
132 * }
133 * OR,
134 * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
135 */
136 spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
137 return true;
138 }
139
140 return false;
141 }
142
143
144 /**
145 * Generate SPE code to perform alpha testing.
146 *
147 * \param dsa Gallium depth/stencil/alpha state to gen code for
148 * \param f SPE function to append instruction onto.
149 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
150 * \param fragA_reg register containing four fragment alpha values (in)
151 */
152 static void
153 gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
154 struct spe_function *f, int mask_reg, int fragA_reg)
155 {
156 int ref_reg = spe_allocate_available_register(f);
157 int amask_reg = spe_allocate_available_register(f);
158
159 ASSERT(dsa->alpha.enabled);
160
161 if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
162 (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
163 /* load/splat the alpha reference float value */
164 spe_load_float(f, ref_reg, dsa->alpha.ref);
165 }
166
167 /* emit code to do the alpha comparison, updating 'mask' */
168 switch (dsa->alpha.func) {
169 case PIPE_FUNC_EQUAL:
170 /* amask = (fragA == ref) */
171 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
172 /* mask = (mask & amask) */
173 spe_and(f, mask_reg, mask_reg, amask_reg);
174 break;
175
176 case PIPE_FUNC_NOTEQUAL:
177 /* amask = (fragA == ref) */
178 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
179 /* mask = (mask & ~amask) */
180 spe_andc(f, mask_reg, mask_reg, amask_reg);
181 break;
182
183 case PIPE_FUNC_GREATER:
184 /* amask = (fragA > ref) */
185 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
186 /* mask = (mask & amask) */
187 spe_and(f, mask_reg, mask_reg, amask_reg);
188 break;
189
190 case PIPE_FUNC_LESS:
191 /* amask = (ref > fragA) */
192 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
193 /* mask = (mask & amask) */
194 spe_and(f, mask_reg, mask_reg, amask_reg);
195 break;
196
197 case PIPE_FUNC_LEQUAL:
198 /* amask = (fragA > ref) */
199 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
200 /* mask = (mask & ~amask) */
201 spe_andc(f, mask_reg, mask_reg, amask_reg);
202 break;
203
204 case PIPE_FUNC_GEQUAL:
205 /* amask = (ref > fragA) */
206 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
207 /* mask = (mask & ~amask) */
208 spe_andc(f, mask_reg, mask_reg, amask_reg);
209 break;
210
211 case PIPE_FUNC_NEVER:
212 spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */
213 break;
214
215 case PIPE_FUNC_ALWAYS:
216 /* no-op, mask unchanged */
217 break;
218
219 default:
220 ASSERT(0);
221 break;
222 }
223
224 #if OPTIMIZATIONS
225 /* if mask == {0,0,0,0} we're all done, return */
226 {
227 /* re-use amask reg here */
228 int tmp_reg = amask_reg;
229 /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
230 spe_orx(f, tmp_reg, mask_reg);
231 /* if tmp[0] == 0 then return from function call */
232 spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
233 }
234 #endif
235
236 spe_release_register(f, ref_reg);
237 spe_release_register(f, amask_reg);
238 }
239
240 /* This pair of functions is used inline to allocate and deallocate
241 * optional constant registers. Once a constant is discovered to be
242 * needed, we will likely need it again, so we don't want to deallocate
243 * it and have to allocate and load it again unnecessarily.
244 */
245 static inline void
246 setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
247 {
248 if (*is_already_set) return;
249 *r = spe_allocate_available_register(f);
250 *is_already_set = true;
251 }
252
253 static inline void
254 release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
255 {
256 if (!*is_already_set) return;
257 spe_release_register(f, r);
258 *is_already_set = false;
259 }
260
261 static inline void
262 setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
263 {
264 if (*is_already_set) return;
265 setup_optional_register(f, is_already_set, r);
266 spe_load_float(f, *r, value);
267 }
268
269 static inline void
270 release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
271 {
272 release_optional_register(f, is_already_set, r);
273 }
274
275 /**
276 * Generate SPE code to implement the given blend mode for a quad of pixels.
277 * \param f SPE function to append instruction onto.
278 * \param fragR_reg register with fragment red values (float) (in/out)
279 * \param fragG_reg register with fragment green values (float) (in/out)
280 * \param fragB_reg register with fragment blue values (float) (in/out)
281 * \param fragA_reg register with fragment alpha values (float) (in/out)
282 * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
283 */
284 static void
285 gen_blend(const struct pipe_blend_state *blend,
286 const struct pipe_blend_color *blend_color,
287 struct spe_function *f,
288 enum pipe_format color_format,
289 int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
290 int fbRGBA_reg)
291 {
292 int term1R_reg = spe_allocate_available_register(f);
293 int term1G_reg = spe_allocate_available_register(f);
294 int term1B_reg = spe_allocate_available_register(f);
295 int term1A_reg = spe_allocate_available_register(f);
296
297 int term2R_reg = spe_allocate_available_register(f);
298 int term2G_reg = spe_allocate_available_register(f);
299 int term2B_reg = spe_allocate_available_register(f);
300 int term2A_reg = spe_allocate_available_register(f);
301
302 int fbR_reg = spe_allocate_available_register(f);
303 int fbG_reg = spe_allocate_available_register(f);
304 int fbB_reg = spe_allocate_available_register(f);
305 int fbA_reg = spe_allocate_available_register(f);
306
307 int tmp_reg = spe_allocate_available_register(f);
308
309 /* Optional constant registers we might or might not end up using;
310 * if we do use them, make sure we only allocate them once by
311 * keeping a flag on each one.
312 */
313 boolean one_reg_set = false;
314 unsigned int one_reg;
315 boolean constR_reg_set = false, constG_reg_set = false,
316 constB_reg_set = false, constA_reg_set = false;
317 unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
318
319 ASSERT(blend->blend_enable);
320
321 /* Unpack/convert framebuffer colors from four 32-bit packed colors
322 * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
323 * Each 8-bit color component is expanded into a float in [0.0, 1.0].
324 */
325 {
326 int mask_reg = spe_allocate_available_register(f);
327
328 /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
329 spe_load_int(f, mask_reg, 0xff);
330
331 /* XXX there may be more clever ways to implement the following code */
332 switch (color_format) {
333 case PIPE_FORMAT_A8R8G8B8_UNORM:
334 /* fbB = fbB & mask */
335 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
336 /* mask = mask << 8 */
337 spe_roti(f, mask_reg, mask_reg, 8);
338
339 /* fbG = fbRGBA & mask */
340 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
341 /* fbG = fbG >> 8 */
342 spe_roti(f, fbG_reg, fbG_reg, -8);
343 /* mask = mask << 8 */
344 spe_roti(f, mask_reg, mask_reg, 8);
345
346 /* fbR = fbRGBA & mask */
347 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
348 /* fbR = fbR >> 16 */
349 spe_roti(f, fbR_reg, fbR_reg, -16);
350 /* mask = mask << 8 */
351 spe_roti(f, mask_reg, mask_reg, 8);
352
353 /* fbA = fbRGBA & mask */
354 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
355 /* fbA = fbA >> 24 */
356 spe_roti(f, fbA_reg, fbA_reg, -24);
357 break;
358
359 case PIPE_FORMAT_B8G8R8A8_UNORM:
360 /* fbA = fbA & mask */
361 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
362 /* mask = mask << 8 */
363 spe_roti(f, mask_reg, mask_reg, 8);
364
365 /* fbR = fbRGBA & mask */
366 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
367 /* fbR = fbR >> 8 */
368 spe_roti(f, fbR_reg, fbR_reg, -8);
369 /* mask = mask << 8 */
370 spe_roti(f, mask_reg, mask_reg, 8);
371
372 /* fbG = fbRGBA & mask */
373 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
374 /* fbG = fbG >> 16 */
375 spe_roti(f, fbG_reg, fbG_reg, -16);
376 /* mask = mask << 8 */
377 spe_roti(f, mask_reg, mask_reg, 8);
378
379 /* fbB = fbRGBA & mask */
380 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
381 /* fbB = fbB >> 24 */
382 spe_roti(f, fbB_reg, fbB_reg, -24);
383 break;
384
385 default:
386 ASSERT(0);
387 }
388
389 /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
390 spe_cuflt(f, fbR_reg, fbR_reg, 8);
391 spe_cuflt(f, fbG_reg, fbG_reg, 8);
392 spe_cuflt(f, fbB_reg, fbB_reg, 8);
393 spe_cuflt(f, fbA_reg, fbA_reg, 8);
394
395 spe_release_register(f, mask_reg);
396 }
397
398 /*
399 * Compute Src RGB terms. We're actually looking for the value
400 * of (the appropriate RGB factors) * (the incoming source RGB color),
401 * because in some cases (like PIPE_BLENDFACTOR_ONE and
402 * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
403 */
404 switch (blend->rgb_src_factor) {
405 case PIPE_BLENDFACTOR_ONE:
406 /* factors = (1,1,1), so term = (R,G,B) */
407 spe_move(f, term1R_reg, fragR_reg);
408 spe_move(f, term1G_reg, fragG_reg);
409 spe_move(f, term1B_reg, fragB_reg);
410 break;
411 case PIPE_BLENDFACTOR_ZERO:
412 /* factors = (0,0,0), so term = (0,0,0) */
413 spe_load_float(f, term1R_reg, 0.0f);
414 spe_load_float(f, term1G_reg, 0.0f);
415 spe_load_float(f, term1B_reg, 0.0f);
416 break;
417 case PIPE_BLENDFACTOR_SRC_COLOR:
418 /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
419 spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
420 spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
421 spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
422 break;
423 case PIPE_BLENDFACTOR_SRC_ALPHA:
424 /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
425 spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
426 spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
427 spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
428 break;
429 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
430 /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
431 * or in other words term = (R-R*R, G-G*G, B-B*B)
432 * fnms(a,b,c,d) computes a = d - b*c
433 */
434 spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
435 spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
436 spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
437 break;
438 case PIPE_BLENDFACTOR_DST_COLOR:
439 /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
440 spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
441 spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
442 spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
443 break;
444 case PIPE_BLENDFACTOR_INV_DST_COLOR:
445 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
446 * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
447 * fnms(a,b,c,d) computes a = d - b*c
448 */
449 spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
450 spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
451 spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
452 break;
453 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
454 /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
455 * or term = (R-R*A,G-G*A,B-B*A)
456 * fnms(a,b,c,d) computes a = d - b*c
457 */
458 spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
459 spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
460 spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
461 break;
462 case PIPE_BLENDFACTOR_DST_ALPHA:
463 /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
464 spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
465 spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
466 spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
467 break;
468 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
469 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
470 * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
471 * fnms(a,b,c,d) computes a = d - b*c
472 */
473 spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
474 spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
475 spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
476 break;
477 case PIPE_BLENDFACTOR_CONST_COLOR:
478 /* We need the optional constant color registers */
479 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
480 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
481 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
482 /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
483 spe_fm(f, term1R_reg, fragR_reg, constR_reg);
484 spe_fm(f, term1G_reg, fragG_reg, constG_reg);
485 spe_fm(f, term1B_reg, fragB_reg, constB_reg);
486 break;
487 case PIPE_BLENDFACTOR_CONST_ALPHA:
488 /* we'll need the optional constant alpha register */
489 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
490 /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
491 spe_fm(f, term1R_reg, fragR_reg, constA_reg);
492 spe_fm(f, term1G_reg, fragG_reg, constA_reg);
493 spe_fm(f, term1B_reg, fragB_reg, constA_reg);
494 break;
495 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
496 /* We need the optional constant color registers */
497 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
498 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
499 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
500 /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
501 * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
502 * fnms(a,b,c,d) computes a = d - b*c
503 */
504 spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
505 spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
506 spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
507 break;
508 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
509 /* We need the optional constant color registers */
510 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
511 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
512 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
513 /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
514 * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
515 * fnms(a,b,c,d) computes a = d - b*c
516 */
517 spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
518 spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
519 spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
520 break;
521 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
522 /* We'll need the optional {1,1,1,1} register */
523 setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
524 /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
525 * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
526 * We could expand the term (as a*min(b,c) == min(a*b,a*c)
527 * as long as a is positive), but then we'd have to do three
528 * spe_float_min() functions instead of one, so this is simpler.
529 */
530 /* tmp = 1 - Afb */
531 spe_fs(f, tmp_reg, one_reg, fbA_reg);
532 /* tmp = min(A,tmp) */
533 spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
534 /* term = R*tmp */
535 spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
536 spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
537 spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
538 break;
539
540 /* These are special D3D cases involving a second color output
541 * from the fragment shader. I'm not sure we can support them
542 * yet... XXX
543 */
544 case PIPE_BLENDFACTOR_SRC1_COLOR:
545 case PIPE_BLENDFACTOR_SRC1_ALPHA:
546 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
547 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
548
549 default:
550 ASSERT(0);
551 }
552
553 /*
554 * Compute Src Alpha term. Like the above, we're looking for
555 * the full term A*factor, not just the factor itself, because
556 * in many cases we can avoid doing unnecessary multiplies.
557 */
558 switch (blend->alpha_src_factor) {
559 case PIPE_BLENDFACTOR_ZERO:
560 /* factor = 0, so term = 0 */
561 spe_load_float(f, term1A_reg, 0.0f);
562 break;
563
564 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
565 case PIPE_BLENDFACTOR_ONE:
566 /* factor = 1, so term = A */
567 spe_move(f, term1A_reg, fragA_reg);
568 break;
569
570 case PIPE_BLENDFACTOR_SRC_COLOR:
571 /* factor = A, so term = A*A */
572 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
573 break;
574 case PIPE_BLENDFACTOR_SRC_ALPHA:
575 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
576 break;
577
578 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
579 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
580 /* factor = 1-A, so term = A*(1-A) = A-A*A */
581 /* fnms(a,b,c,d) computes a = d - b*c */
582 spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
583 break;
584
585 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
586 case PIPE_BLENDFACTOR_DST_COLOR:
587 /* factor = Afb, so term = A*Afb */
588 spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
589 break;
590
591 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
592 case PIPE_BLENDFACTOR_INV_DST_COLOR:
593 /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
594 /* fnms(a,b,c,d) computes a = d - b*c */
595 spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
596 break;
597
598 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
599 case PIPE_BLENDFACTOR_CONST_COLOR:
600 /* We need the optional constA_reg register */
601 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
602 /* factor = Ac, so term = A*Ac */
603 spe_fm(f, term1A_reg, fragA_reg, constA_reg);
604 break;
605
606 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
607 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
608 /* We need the optional constA_reg register */
609 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
610 /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
611 /* fnms(a,b,c,d) computes a = d - b*c */
612 spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
613 break;
614
615 /* These are special D3D cases involving a second color output
616 * from the fragment shader. I'm not sure we can support them
617 * yet... XXX
618 */
619 case PIPE_BLENDFACTOR_SRC1_COLOR:
620 case PIPE_BLENDFACTOR_SRC1_ALPHA:
621 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
622 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
623 default:
624 ASSERT(0);
625 }
626
627 /*
628 * Compute Dest RGB term. Like the above, we're looking for
629 * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
630 * in many cases we can avoid doing unnecessary multiplies.
631 */
632 switch (blend->rgb_dst_factor) {
633 case PIPE_BLENDFACTOR_ONE:
634 /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
635 spe_move(f, term2R_reg, fbR_reg);
636 spe_move(f, term2G_reg, fbG_reg);
637 spe_move(f, term2B_reg, fbB_reg);
638 break;
639 case PIPE_BLENDFACTOR_ZERO:
640 /* factor s= (0,0,0), so term = (0,0,0) */
641 spe_load_float(f, term2R_reg, 0.0f);
642 spe_load_float(f, term2G_reg, 0.0f);
643 spe_load_float(f, term2B_reg, 0.0f);
644 break;
645 case PIPE_BLENDFACTOR_SRC_COLOR:
646 /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
647 spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
648 spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
649 spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
650 break;
651 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
652 /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
653 * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
654 * fnms(a,b,c,d) computes a = d - b*c
655 */
656 spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
657 spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
658 spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
659 break;
660 case PIPE_BLENDFACTOR_SRC_ALPHA:
661 /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
662 spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
663 spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
664 spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
665 break;
666 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
667 /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
668 /* fnms(a,b,c,d) computes a = d - b*c */
669 spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
670 spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
671 spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
672 break;
673 case PIPE_BLENDFACTOR_DST_COLOR:
674 /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
675 spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
676 spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
677 spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
678 break;
679 case PIPE_BLENDFACTOR_INV_DST_COLOR:
680 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
681 * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
682 * fnms(a,b,c,d) computes a = d - b*c
683 */
684 spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
685 spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
686 spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
687 break;
688
689 case PIPE_BLENDFACTOR_DST_ALPHA:
690 /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
691 spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
692 spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
693 spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
694 break;
695 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
696 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
697 * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
698 * fnms(a,b,c,d) computes a = d - b*c
699 */
700 spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
701 spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
702 spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
703 break;
704 case PIPE_BLENDFACTOR_CONST_COLOR:
705 /* We need the optional constant color registers */
706 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
707 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
708 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
709 /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
710 spe_fm(f, term2R_reg, fbR_reg, constR_reg);
711 spe_fm(f, term2G_reg, fbG_reg, constG_reg);
712 spe_fm(f, term2B_reg, fbB_reg, constB_reg);
713 break;
714 case PIPE_BLENDFACTOR_CONST_ALPHA:
715 /* we'll need the optional constant alpha register */
716 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
717 /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
718 spe_fm(f, term2R_reg, fbR_reg, constA_reg);
719 spe_fm(f, term2G_reg, fbG_reg, constA_reg);
720 spe_fm(f, term2B_reg, fbB_reg, constA_reg);
721 break;
722 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
723 /* We need the optional constant color registers */
724 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
725 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
726 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
727 /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
728 * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
729 * fnms(a,b,c,d) computes a = d - b*c
730 */
731 spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
732 spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
733 spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
734 break;
735 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
736 /* We need the optional constant color registers */
737 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
738 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
739 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
740 /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
741 * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
742 * fnms(a,b,c,d) computes a = d - b*c
743 */
744 spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
745 spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
746 spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
747 break;
748 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
749 ASSERT(0);
750 break;
751
752 /* These are special D3D cases involving a second color output
753 * from the fragment shader. I'm not sure we can support them
754 * yet... XXX
755 */
756 case PIPE_BLENDFACTOR_SRC1_COLOR:
757 case PIPE_BLENDFACTOR_SRC1_ALPHA:
758 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
759 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
760
761 default:
762 ASSERT(0);
763 }
764
765 /*
766 * Compute Dest Alpha term. Like the above, we're looking for
767 * the full term Afb*factor, not just the factor itself, because
768 * in many cases we can avoid doing unnecessary multiplies.
769 */
770 switch (blend->alpha_dst_factor) {
771 case PIPE_BLENDFACTOR_ONE:
772 /* factor = 1, so term = Afb */
773 spe_move(f, term2A_reg, fbA_reg);
774 break;
775 case PIPE_BLENDFACTOR_ZERO:
776 /* factor = 0, so term = 0 */
777 spe_load_float(f, term2A_reg, 0.0f);
778 break;
779
780 case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
781 case PIPE_BLENDFACTOR_SRC_COLOR:
782 /* factor = A, so term = Afb*A */
783 spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
784 break;
785
786 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
787 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
788 /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
789 /* fnms(a,b,c,d) computes a = d - b*c */
790 spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
791 break;
792
793 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
794 case PIPE_BLENDFACTOR_DST_COLOR:
795 /* factor = Afb, so term = Afb*Afb */
796 spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
797 break;
798
799 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
800 case PIPE_BLENDFACTOR_INV_DST_COLOR:
801 /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
802 /* fnms(a,b,c,d) computes a = d - b*c */
803 spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
804 break;
805
806 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
807 case PIPE_BLENDFACTOR_CONST_COLOR:
808 /* We need the optional constA_reg register */
809 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
810 /* factor = Ac, so term = Afb*Ac */
811 spe_fm(f, term2A_reg, fbA_reg, constA_reg);
812 break;
813
814 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
815 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
816 /* We need the optional constA_reg register */
817 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
818 /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
819 /* fnms(a,b,c,d) computes a = d - b*c */
820 spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
821 break;
822
823 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
824 ASSERT(0);
825 break;
826
827 /* These are special D3D cases involving a second color output
828 * from the fragment shader. I'm not sure we can support them
829 * yet... XXX
830 */
831 case PIPE_BLENDFACTOR_SRC1_COLOR:
832 case PIPE_BLENDFACTOR_SRC1_ALPHA:
833 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
834 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
835 default:
836 ASSERT(0);
837 }
838
839 /*
840 * Combine Src/Dest RGB terms as per the blend equation.
841 */
842 switch (blend->rgb_func) {
843 case PIPE_BLEND_ADD:
844 spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
845 spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
846 spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
847 break;
848 case PIPE_BLEND_SUBTRACT:
849 spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
850 spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
851 spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
852 break;
853 case PIPE_BLEND_REVERSE_SUBTRACT:
854 spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
855 spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
856 spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
857 break;
858 case PIPE_BLEND_MIN:
859 spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
860 spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
861 spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
862 break;
863 case PIPE_BLEND_MAX:
864 spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
865 spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
866 spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
867 break;
868 default:
869 ASSERT(0);
870 }
871
872 /*
873 * Combine Src/Dest A term
874 */
875 switch (blend->alpha_func) {
876 case PIPE_BLEND_ADD:
877 spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
878 break;
879 case PIPE_BLEND_SUBTRACT:
880 spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
881 break;
882 case PIPE_BLEND_REVERSE_SUBTRACT:
883 spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
884 break;
885 case PIPE_BLEND_MIN:
886 spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
887 break;
888 case PIPE_BLEND_MAX:
889 spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
890 break;
891 default:
892 ASSERT(0);
893 }
894
895 spe_release_register(f, term1R_reg);
896 spe_release_register(f, term1G_reg);
897 spe_release_register(f, term1B_reg);
898 spe_release_register(f, term1A_reg);
899
900 spe_release_register(f, term2R_reg);
901 spe_release_register(f, term2G_reg);
902 spe_release_register(f, term2B_reg);
903 spe_release_register(f, term2A_reg);
904
905 spe_release_register(f, fbR_reg);
906 spe_release_register(f, fbG_reg);
907 spe_release_register(f, fbB_reg);
908 spe_release_register(f, fbA_reg);
909
910 spe_release_register(f, tmp_reg);
911
912 /* Free any optional registers that actually got used */
913 release_const_register(f, &one_reg_set, one_reg);
914 release_const_register(f, &constR_reg_set, constR_reg);
915 release_const_register(f, &constG_reg_set, constG_reg);
916 release_const_register(f, &constB_reg_set, constB_reg);
917 release_const_register(f, &constA_reg_set, constA_reg);
918 }
919
920
/**
 * Generate SPE code to apply the post-blend logic operation.
 *
 * Both fragRGBA_reg and fbRGBA_reg hold four 32-bit packed RGBA pixels
 * (integer data), not the per-channel floating-point vectors used earlier
 * in the pipeline, so plain bitwise ops apply to all four pixels at once.
 *
 * \param blend  blend state; logicop_enable must be set and logicop_func
 *               selects one of the 16 standard logic ops
 * \param f  SPE function to append instructions onto
 * \param fragRGBA_reg  register with the packed fragment pixels
 *                      (in/out: receives the result)
 * \param fbRGBA_reg    register with the packed framebuffer pixels (in)
 */
static void
gen_logicop(const struct pipe_blend_state *blend,
            struct spe_function *f,
            int fragRGBA_reg, int fbRGBA_reg)
{
   ASSERT(blend->logicop_enable);

   switch(blend->logicop_func) {
   case PIPE_LOGICOP_CLEAR: /* 0 */
      spe_zero(f, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_NOR: /* ~(s | d) */
      spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
      /* andc R, A, B computes R = A & ~B */
      spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
      spe_complement(f, fragRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
      /* andc R, A, B computes R = A & ~B */
      spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_INVERT: /* ~d */
      /* Note that (A nor A) == ~(A|A) == ~A */
      spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_XOR: /* s ^ d */
      spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_NAND: /* ~(s & d) */
      spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_AND: /* s & d */
      spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
      /* no single equiv instruction: xor then invert */
      spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      spe_complement(f, fragRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_NOOP: /* d */
      spe_move(f, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
      /* orc R, A, B computes R = A | ~B */
      spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
      break;
   case PIPE_LOGICOP_COPY: /* s */
      /* source is already in fragRGBA_reg; nothing to emit */
      break;
   case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
      /* orc R, A, B computes R = A | ~B */
      spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_OR: /* s | d */
      spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
      break;
   case PIPE_LOGICOP_SET: /* 1 */
      spe_load_int(f, fragRGBA_reg, 0xffffffff);
      break;
   default:
      ASSERT(0);
   }
}
990
991
992 /**
993 * Generate code to pack a quad of float colors into four 32-bit integers.
994 *
995 * \param f SPE function to append instruction onto.
996 * \param color_format the dest color packing format
997 * \param r_reg register containing four red values (in/clobbered)
998 * \param g_reg register containing four green values (in/clobbered)
999 * \param b_reg register containing four blue values (in/clobbered)
1000 * \param a_reg register containing four alpha values (in/clobbered)
1001 * \param rgba_reg register to store the packed RGBA colors (out)
1002 */
1003 static void
1004 gen_pack_colors(struct spe_function *f,
1005 enum pipe_format color_format,
1006 int r_reg, int g_reg, int b_reg, int a_reg,
1007 int rgba_reg)
1008 {
1009 int rg_reg = spe_allocate_available_register(f);
1010 int ba_reg = spe_allocate_available_register(f);
1011
1012 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
1013 spe_cfltu(f, r_reg, r_reg, 32);
1014 spe_cfltu(f, g_reg, g_reg, 32);
1015 spe_cfltu(f, b_reg, b_reg, 32);
1016 spe_cfltu(f, a_reg, a_reg, 32);
1017
1018 /* Shift the most significant bytes to the least significant positions.
1019 * I.e.: reg = reg >> 24
1020 */
1021 spe_rotmi(f, r_reg, r_reg, -24);
1022 spe_rotmi(f, g_reg, g_reg, -24);
1023 spe_rotmi(f, b_reg, b_reg, -24);
1024 spe_rotmi(f, a_reg, a_reg, -24);
1025
1026 /* Shift the color bytes according to the surface format */
1027 if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
1028 spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */
1029 spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */
1030 spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */
1031 }
1032 else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1033 spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */
1034 spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */
1035 spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */
1036 }
1037 else {
1038 ASSERT(0);
1039 }
1040
1041 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
1042 * Eg: after shifting according to color_format we might have:
1043 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
1044 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
1045 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
1046 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
1047 * OR-ing all those together gives us four packed colors:
1048 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
1049 */
1050 spe_or(f, rg_reg, r_reg, g_reg);
1051 spe_or(f, ba_reg, a_reg, b_reg);
1052 spe_or(f, rgba_reg, rg_reg, ba_reg);
1053
1054 spe_release_register(f, rg_reg);
1055 spe_release_register(f, ba_reg);
1056 }
1057
1058 static void
1059 gen_colormask(struct spe_function *f,
1060 uint colormask,
1061 enum pipe_format color_format,
1062 int fragRGBA_reg, int fbRGBA_reg)
1063 {
1064 /* We've got four 32-bit RGBA packed pixels in each of
1065 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
1066 * reds, greens, blues, and alphas. Further, the pixels
1067 * are packed according to the given color format, not
1068 * necessarily RGBA...
1069 */
1070 unsigned int r_mask;
1071 unsigned int g_mask;
1072 unsigned int b_mask;
1073 unsigned int a_mask;
1074
1075 /* Calculate exactly where the bits for any particular color
1076 * end up, so we can mask them correctly.
1077 */
1078 switch(color_format) {
1079 case PIPE_FORMAT_A8R8G8B8_UNORM:
1080 /* ARGB */
1081 a_mask = 0xff000000;
1082 r_mask = 0x00ff0000;
1083 g_mask = 0x0000ff00;
1084 b_mask = 0x000000ff;
1085 break;
1086 case PIPE_FORMAT_B8G8R8A8_UNORM:
1087 /* BGRA */
1088 b_mask = 0xff000000;
1089 g_mask = 0x00ff0000;
1090 r_mask = 0x0000ff00;
1091 a_mask = 0x000000ff;
1092 break;
1093 default:
1094 ASSERT(0);
1095 }
1096
1097 /* For each R, G, B, and A component we're supposed to mask out,
1098 * clear its bits. Then our mask operation later will work
1099 * as expected.
1100 */
1101 if (!(colormask & PIPE_MASK_R)) {
1102 r_mask = 0;
1103 }
1104 if (!(colormask & PIPE_MASK_G)) {
1105 g_mask = 0;
1106 }
1107 if (!(colormask & PIPE_MASK_B)) {
1108 b_mask = 0;
1109 }
1110 if (!(colormask & PIPE_MASK_A)) {
1111 a_mask = 0;
1112 }
1113
1114 /* Get a temporary register to hold the mask that will be applied to the fragment */
1115 int colormask_reg = spe_allocate_available_register(f);
1116
1117 /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
1118 * masks. Load the result value into our temporary register.
1119 */
1120 spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
1121
1122 /* Use the mask register to select between the fragment color
1123 * values and the frame buffer color values. Wherever the
1124 * mask has a 0 bit, the current frame buffer color should override
1125 * the fragment color. Wherever the mask has a 1 bit, the
1126 * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
1127 * instruction will select bits from its first operand rA wherever the
1128 * the mask bits rM are 0, and from its second operand rB wherever the
1129 * mask bits rM are 1. That means that the frame buffer color is the
1130 * first operand, and the fragment color the second.
1131 */
1132 spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
1133
1134 /* Release the temporary register and we're done */
1135 spe_release_register(f, colormask_reg);
1136 }
1137
1138 /* This function is annoyingly similar to gen_depth_test(), above, except
1139 * that instead of comparing two varying values (i.e. fragment and buffer),
1140 * we're comparing a varying value with a static value. As such, we have
1141 * access to the Compare Immediate instructions where we don't in
1142 * gen_depth_test(), which is what makes us very different.
1143 *
1144 * There's some added complexity if there's a non-trivial state->mask
1145 * value; then stencil and reference both must be masked
1146 *
1147 * The return value in the stencil_pass_reg is a bitmask of valid
1148 * fragments that also passed the stencil test. The bitmask of valid
1149 * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
1150 */
1151 static void
1152 gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
1153 unsigned int stencil_max_value,
1154 unsigned int fragment_mask_reg, unsigned int fbS_reg,
1155 unsigned int stencil_pass_reg)
1156 {
1157 /* Generate code that puts the set of passing fragments into the stencil_pass_reg
1158 * register, taking into account whether each fragment was active to begin with.
1159 */
1160 switch (state->func) {
1161 case PIPE_FUNC_EQUAL:
1162 if (state->value_mask == stencil_max_value) {
1163 /* stencil_pass = fragment_mask & (s == reference) */
1164 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1165 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1166 }
1167 else {
1168 /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
1169 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1170 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1171 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
1172 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1173 spe_release_register(f, tmp_masked_stencil);
1174 }
1175 break;
1176
1177 case PIPE_FUNC_NOTEQUAL:
1178 if (state->value_mask == stencil_max_value) {
1179 /* stencil_pass = fragment_mask & ~(s == reference) */
1180 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1181 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1182 }
1183 else {
1184 /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
1185 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1186 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1187 spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
1188 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1189 spe_release_register(f, tmp_masked_stencil);
1190 }
1191 break;
1192
1193 case PIPE_FUNC_GREATER:
1194 if (state->value_mask == stencil_max_value) {
1195 /* stencil_pass = fragment_mask & (s > reference) */
1196 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1197 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1198 }
1199 else {
1200 /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */
1201 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1202 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1203 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
1204 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1205 spe_release_register(f, tmp_masked_stencil);
1206 }
1207 break;
1208
1209 case PIPE_FUNC_LESS:
1210 if (state->value_mask == stencil_max_value) {
1211 /* stencil_pass = fragment_mask & (reference > s) */
1212 /* There's no convenient Compare Less Than Immediate instruction, so
1213 * we'll have to do this one the harder way, by loading a register and
1214 * comparing directly. Compare Logical Greater Than Word (clgt)
1215 * treats its operands as unsigned - no sign extension.
1216 */
1217 unsigned int tmp_reg = spe_allocate_available_register(f);
1218 spe_load_uint(f, tmp_reg, state->ref_value);
1219 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1220 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1221 spe_release_register(f, tmp_reg);
1222 }
1223 else {
1224 /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
1225 unsigned int tmp_reg = spe_allocate_available_register(f);
1226 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1227 spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
1228 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1229 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1230 spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1231 spe_release_register(f, tmp_reg);
1232 spe_release_register(f, tmp_masked_stencil);
1233 }
1234 break;
1235
1236 case PIPE_FUNC_LEQUAL:
1237 if (state->value_mask == stencil_max_value) {
1238 /* stencil_pass = fragment_mask & (s <= reference)
1239 * = fragment_mask & ~(s > reference) */
1240 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1241 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1242 }
1243 else {
1244 /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
1245 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1246 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1247 spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
1248 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1249 spe_release_register(f, tmp_masked_stencil);
1250 }
1251 break;
1252
1253 case PIPE_FUNC_GEQUAL:
1254 if (state->value_mask == stencil_max_value) {
1255 /* stencil_pass = fragment_mask & (s >= reference) ]
1256 * = fragment_mask & ~(reference > s) */
1257 /* As above, we have to do this by loading a register */
1258 unsigned int tmp_reg = spe_allocate_available_register(f);
1259 spe_load_uint(f, tmp_reg, state->ref_value);
1260 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1261 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1262 spe_release_register(f, tmp_reg);
1263 }
1264 else {
1265 /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
1266 unsigned int tmp_reg = spe_allocate_available_register(f);
1267 unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
1268 spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
1269 spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
1270 spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
1271 spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
1272 spe_release_register(f, tmp_reg);
1273 spe_release_register(f, tmp_masked_stencil);
1274 }
1275 break;
1276
1277 case PIPE_FUNC_NEVER:
1278 /* stencil_pass = fragment_mask & 0 = 0 */
1279 spe_load_uint(f, stencil_pass_reg, 0);
1280 break;
1281
1282 case PIPE_FUNC_ALWAYS:
1283 /* stencil_pass = fragment_mask & 1 = fragment_mask */
1284 spe_move(f, stencil_pass_reg, fragment_mask_reg);
1285 break;
1286 }
1287
1288 /* The fragments that passed the stencil test are now in stencil_pass_reg.
1289 * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
1290 */
1291 }
1292
1293 /* This function generates code that calculates a set of new stencil values
1294 * given the earlier values and the operation to apply. It does not
1295 * apply any tests. It is intended to be called up to 3 times
1296 * (for the stencil fail operation, for the stencil pass-z fail operation,
1297 * and for the stencil pass-z pass operation) to collect up to three
1298 * possible sets of values, and for the caller to combine them based
1299 * on the result of the tests.
1300 *
1301 * stencil_max_value should be (2^n - 1) where n is the number of bits
1302 * in the stencil buffer - in other words, it should be usable as a mask.
1303 */
1304 static void
1305 gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
1306 unsigned int stencil_ref_value, unsigned int stencil_max_value,
1307 unsigned int fbS_reg, unsigned int newS_reg)
1308 {
1309 /* The code below assumes that newS_reg and fbS_reg are not the same
1310 * register; if they can be, the calculations below will have to use
1311 * an additional temporary register. For now, mark the assumption
1312 * with an assertion that will fail if they are the same.
1313 */
1314 ASSERT(fbS_reg != newS_reg);
1315
1316 /* The code also assumes the the stencil_max_value is of the form
1317 * 2^n-1 and can therefore be used as a mask for the valid bits in
1318 * addition to a maximum. Make sure this is the case as well.
1319 * The clever math below exploits the fact that incrementing a
1320 * binary number serves to flip all the bits of a number starting at
1321 * the LSB and continuing to (and including) the first zero bit
1322 * found. That means that a number and its increment will always
1323 * have at least one bit in common (the high order bit, if nothing
1324 * else) *unless* the number is zero, *or* the number is of a form
1325 * consisting of some number of 1s in the low-order bits followed
1326 * by nothing but 0s in the high-order bits. The latter case
1327 * implies it's of the form 2^n-1.
1328 */
1329 ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
1330
1331 switch(stencil_op) {
1332 case PIPE_STENCIL_OP_KEEP:
1333 /* newS = S */
1334 spe_move(f, newS_reg, fbS_reg);
1335 break;
1336
1337 case PIPE_STENCIL_OP_ZERO:
1338 /* newS = 0 */
1339 spe_zero(f, newS_reg);
1340 break;
1341
1342 case PIPE_STENCIL_OP_REPLACE:
1343 /* newS = stencil reference value */
1344 spe_load_uint(f, newS_reg, stencil_ref_value);
1345 break;
1346
1347 case PIPE_STENCIL_OP_INCR: {
1348 /* newS = (s == max ? max : s + 1) */
1349 unsigned int equals_reg = spe_allocate_available_register(f);
1350
1351 spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
1352 /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
1353 spe_ai(f, newS_reg, fbS_reg, 1);
1354 /* Select from the current value or the new value based on the equality test */
1355 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1356
1357 spe_release_register(f, equals_reg);
1358 break;
1359 }
1360 case PIPE_STENCIL_OP_DECR: {
1361 /* newS = (s == 0 ? 0 : s - 1) */
1362 unsigned int equals_reg = spe_allocate_available_register(f);
1363
1364 spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
1365 /* Add Word Immediate with a (-1) value works */
1366 spe_ai(f, newS_reg, fbS_reg, -1);
1367 /* Select from the current value or the new value based on the equality test */
1368 spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
1369
1370 spe_release_register(f, equals_reg);
1371 break;
1372 }
1373 case PIPE_STENCIL_OP_INCR_WRAP:
1374 /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
1375 * do a normal add and mask off the correct bits
1376 */
1377 spe_ai(f, newS_reg, fbS_reg, 1);
1378 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1379 break;
1380
1381 case PIPE_STENCIL_OP_DECR_WRAP:
1382 /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
1383 spe_ai(f, newS_reg, fbS_reg, -1);
1384 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1385 break;
1386
1387 case PIPE_STENCIL_OP_INVERT:
1388 /* newS = ~s. We take advantage of the mask/max value to invert only
1389 * the valid bits for the field so we don't have to do an extra "and".
1390 */
1391 spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
1392 break;
1393
1394 default:
1395 ASSERT(0);
1396 }
1397 }
1398
1399
1400 /* This function generates code to get all the necessary possible
1401 * stencil values. For each of the output registers (fail_reg,
1402 * zfail_reg, and zpass_reg), it either allocates a new register
1403 * and calculates a new set of values based on the stencil operation,
1404 * or it reuses a register allocation and calculation done for an
1405 * earlier (matching) operation, or it reuses the fbS_reg register
1406 * (if the stencil operation is KEEP, which doesn't change the
1407 * stencil buffer).
1408 *
1409 * Since this function allocates a variable number of registers,
1410 * to avoid incurring complex logic to free them, they should
1411 * be allocated after a spe_allocate_register_set() call
1412 * and released by the corresponding spe_release_register_set() call.
1413 */
1414 static void
1415 gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
1416 unsigned int fbS_reg,
1417 unsigned int *fail_reg, unsigned int *zfail_reg,
1418 unsigned int *zpass_reg, unsigned int *back_fail_reg,
1419 unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
1420 {
1421 unsigned zfail_op, back_zfail_op;
1422
1423 /* Stenciling had better be enabled here */
1424 ASSERT(dsa->stencil[0].enabled);
1425
1426 /* If the depth test is not enabled, it is treated as though it always
1427 * passes. In particular, that means that the "zfail_op" (and the backfacing
1428 * counterpart, if active) are not considered - a failing stencil test will
1429 * trigger the "fail_op", and a passing stencil test will trigger the
1430 * "zpass_op".
1431 *
1432 * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
1433 * we keep them from being calculated.
1434 */
1435 if (dsa->depth.enabled) {
1436 zfail_op = dsa->stencil[0].zfail_op;
1437 back_zfail_op = dsa->stencil[1].zfail_op;
1438 }
1439 else {
1440 zfail_op = PIPE_STENCIL_OP_KEEP;
1441 back_zfail_op = PIPE_STENCIL_OP_KEEP;
1442 }
1443
1444 /* One-sided or front-facing stencil */
1445 if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
1446 *fail_reg = fbS_reg;
1447 }
1448 else {
1449 *fail_reg = spe_allocate_available_register(f);
1450 gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value,
1451 0xff, fbS_reg, *fail_reg);
1452 }
1453
1454 if (zfail_op == PIPE_STENCIL_OP_KEEP) {
1455 *zfail_reg = fbS_reg;
1456 }
1457 else if (zfail_op == dsa->stencil[0].fail_op) {
1458 *zfail_reg = *fail_reg;
1459 }
1460 else {
1461 *zfail_reg = spe_allocate_available_register(f);
1462 gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value,
1463 0xff, fbS_reg, *zfail_reg);
1464 }
1465
1466 if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
1467 *zpass_reg = fbS_reg;
1468 }
1469 else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
1470 *zpass_reg = *fail_reg;
1471 }
1472 else if (dsa->stencil[0].zpass_op == zfail_op) {
1473 *zpass_reg = *zfail_reg;
1474 }
1475 else {
1476 *zpass_reg = spe_allocate_available_register(f);
1477 gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value,
1478 0xff, fbS_reg, *zpass_reg);
1479 }
1480
1481 /* If two-sided stencil is enabled, we have more work to do. */
1482 if (!dsa->stencil[1].enabled) {
1483 /* This just flags that the registers need not be deallocated later */
1484 *back_fail_reg = fbS_reg;
1485 *back_zfail_reg = fbS_reg;
1486 *back_zpass_reg = fbS_reg;
1487 }
1488 else {
1489 /* Same calculations as above, but for the back stencil */
1490 if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
1491 *back_fail_reg = fbS_reg;
1492 }
1493 else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
1494 *back_fail_reg = *fail_reg;
1495 }
1496 else if (dsa->stencil[1].fail_op == zfail_op) {
1497 *back_fail_reg = *zfail_reg;
1498 }
1499 else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
1500 *back_fail_reg = *zpass_reg;
1501 }
1502 else {
1503 *back_fail_reg = spe_allocate_available_register(f);
1504 gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value,
1505 0xff, fbS_reg, *back_fail_reg);
1506 }
1507
1508 if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
1509 *back_zfail_reg = fbS_reg;
1510 }
1511 else if (back_zfail_op == dsa->stencil[0].fail_op) {
1512 *back_zfail_reg = *fail_reg;
1513 }
1514 else if (back_zfail_op == zfail_op) {
1515 *back_zfail_reg = *zfail_reg;
1516 }
1517 else if (back_zfail_op == dsa->stencil[0].zpass_op) {
1518 *back_zfail_reg = *zpass_reg;
1519 }
1520 else if (back_zfail_op == dsa->stencil[1].fail_op) {
1521 *back_zfail_reg = *back_fail_reg;
1522 }
1523 else {
1524 *back_zfail_reg = spe_allocate_available_register(f);
1525 gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value,
1526 0xff, fbS_reg, *back_zfail_reg);
1527 }
1528
1529 if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
1530 *back_zpass_reg = fbS_reg;
1531 }
1532 else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
1533 *back_zpass_reg = *fail_reg;
1534 }
1535 else if (dsa->stencil[1].zpass_op == zfail_op) {
1536 *back_zpass_reg = *zfail_reg;
1537 }
1538 else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
1539 *back_zpass_reg = *zpass_reg;
1540 }
1541 else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
1542 *back_zpass_reg = *back_fail_reg;
1543 }
1544 else if (dsa->stencil[1].zpass_op == back_zfail_op) {
1545 *back_zpass_reg = *back_zfail_reg;
1546 }
1547 else {
1548 *back_zfail_reg = spe_allocate_available_register(f);
1549 gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value,
1550 0xff, fbS_reg, *back_zpass_reg);
1551 }
1552 } /* End of calculations for back-facing stencil */
1553 }
1554
1555 /* Note that fbZ_reg may *not* be set on entry, if in fact
1556 * the depth test is not enabled. This function must not use
1557 * the register if depth is not enabled.
1558 */
1559 static boolean
1560 gen_stencil_depth_test(struct spe_function *f,
1561 const struct pipe_depth_stencil_alpha_state *dsa,
1562 const int const facing_reg,
1563 const int mask_reg, const int fragZ_reg,
1564 const int fbZ_reg, const int fbS_reg)
1565 {
1566 /* True if we've generated code that could require writeback to the
1567 * depth and/or stencil buffers
1568 */
1569 boolean modified_buffers = false;
1570
1571 boolean need_to_calculate_stencil_values;
1572 boolean need_to_writemask_stencil_values;
1573
1574 /* Registers. We may or may not actually allocate these, depending
1575 * on whether the state values indicate that we need them.
1576 */
1577 unsigned int stencil_pass_reg, stencil_fail_reg;
1578 unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
1579 unsigned int stencil_writemask_reg;
1580 unsigned int zmask_reg;
1581 unsigned int newS_reg;
1582
1583 /* Stenciling is quite complex: up to six different configurable stencil
1584 * operations/calculations can be required (three each for front-facing
1585 * and back-facing fragments). Many of those operations will likely
1586 * be identical, so there's good reason to try to avoid calculating
1587 * the same values more than once (which unfortunately makes the code less
1588 * straightforward).
1589 *
1590 * To make register management easier, we start a new
1591 * register set; we can release all the registers in the set at
1592 * once, and avoid having to keep track of exactly which registers
1593 * we allocate. We can still allocate and free registers as
1594 * desired (if we know we no longer need a register), but we don't
1595 * have to spend the complexity to track the more difficult variant
1596 * register usage scenarios.
1597 */
1598 spe_comment(f, 0, "Allocating stencil register set");
1599 spe_allocate_register_set(f);
1600
1601 /* Calculate the writemask. If the writemask is trivial (either
1602 * all 0s, meaning that we don't need to calculate any stencil values
1603 * because they're not going to change the stencil anyway, or all 1s,
1604 * meaning that we have to calculate the stencil values but do not
1605 * need to mask them), we can avoid generating code. Don't forget
1606 * that we need to consider backfacing stencil, if enabled.
1607 *
1608 * Note that if the backface stencil is *not* enabled, the backface
1609 * stencil will have the same values as the frontface stencil.
1610 */
1611 if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1612 dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
1613 dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
1614 dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
1615 dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
1616 dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
1617 /* No changes to any stencil values */
1618 need_to_calculate_stencil_values = false;
1619 need_to_writemask_stencil_values = false;
1620 }
1621 else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
1622 /* All changes are writemasked out, so no need to calculate
1623 * what those changes might be, and no need to write anything back.
1624 */
1625 need_to_calculate_stencil_values = false;
1626 need_to_writemask_stencil_values = false;
1627 }
1628 else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
1629 /* Still trivial, but a little less so. We need to write the stencil
1630 * values, but we don't need to mask them.
1631 */
1632 need_to_calculate_stencil_values = true;
1633 need_to_writemask_stencil_values = false;
1634 }
1635 else {
1636 /* The general case: calculate, mask, and write */
1637 need_to_calculate_stencil_values = true;
1638 need_to_writemask_stencil_values = true;
1639
1640 /* While we're here, generate code that calculates what the
1641 * writemask should be. If backface stenciling is enabled,
1642 * and the backface writemask is not the same as the frontface
1643 * writemask, we'll have to generate code that merges the
1644 * two masks into a single effective mask based on fragment facing.
1645 */
1646 spe_comment(f, 0, "Computing stencil writemask");
1647 stencil_writemask_reg = spe_allocate_available_register(f);
1648 spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
1649 if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
1650 unsigned int back_write_mask_reg = spe_allocate_available_register(f);
1651 spe_comment(f, 0, "Resolving two-sided stencil writemask");
1652 spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
1653 spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
1654 spe_release_register(f, back_write_mask_reg);
1655 }
1656 }
1657
1658 /* At least one-sided stenciling must be on. Generate code that
1659 * runs the stencil test on the basic/front-facing stencil, leaving
1660 * the mask of passing stencil bits in stencil_pass_reg. This mask will
1661 * be used both to mask the set of active pixels, and also to
1662 * determine how the stencil buffer changes.
1663 *
1664 * This test will *not* change the value in mask_reg (because we don't
1665 * yet know whether to apply the two-sided stencil or one-sided stencil).
1666 */
1667 spe_comment(f, 0, "Running basic stencil test");
1668 stencil_pass_reg = spe_allocate_available_register(f);
1669 gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
1670
1671 /* If two-sided stenciling is on, generate code to run the stencil
1672 * test on the backfacing stencil as well, and combine the two results
1673 * into the one correct result based on facing.
1674 */
1675 if (dsa->stencil[1].enabled) {
1676 unsigned int temp_reg = spe_allocate_available_register(f);
1677 spe_comment(f, 0, "Running backface stencil test");
1678 gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
1679 spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
1680 spe_release_register(f, temp_reg);
1681 }
1682
1683 /* Generate code that, given the mask of valid fragments and the
1684 * mask of valid fragments that passed the stencil test, computes
1685 * the mask of valid fragments that failed the stencil test. We
1686 * have to do this before we run a depth test (because the
1687 * depth test should not be performed on fragments that failed the
1688 * stencil test, and because the depth test will update the
1689 * mask of valid fragments based on the results of the depth test).
1690 */
1691 spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
1692 stencil_fail_reg = spe_allocate_available_register(f);
1693 spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
1694 /* Now remove the stenciled-out pixels from the valid fragment mask,
1695 * so we can later use the valid fragment mask in the depth test.
1696 */
1697 spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
1698
1699 /* We may not need to calculate stencil values, if the writemask is off */
1700 if (need_to_calculate_stencil_values) {
1701 unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
1702 unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
1703
1704 /* Generate code that calculates exactly which stencil values we need,
1705 * without calculating the same value twice (say, if two different
1706 * stencil ops have the same value). This code will work for one-sided
1707 * and two-sided stenciling (so that we take into account that operations
1708 * may match between front and back stencils), and will also take into
1709 * account whether the depth test is enabled (if the depth test is off,
1710 * we don't need any of the zfail results, because the depth test always
1711 * is considered to pass if it is disabled). Any register value that
1712 * does not need to be calculated will come back with the same value
1713 * that's in fbS_reg.
1714 *
1715 * This function will allocate a variant number of registers that
1716 * will be released as part of the register set.
1717 */
1718 spe_comment(f, 0, "Computing stencil values");
1719 gen_get_stencil_values(f, dsa, fbS_reg,
1720 &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
1721 &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
1722 &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
1723
1724 /* Tricky, tricky, tricky - the things we do to create optimal
1725 * code...
1726 *
1727 * The various stencil values registers may overlap with each other
1728 * and with fbS_reg arbitrarily (as any particular operation is
1729 * only calculated once and stored in one register, no matter
1730 * how many times it is used). So we can't change the values
1731 * within those registers directly - if we change a value in a
1732 * register that's being referenced by two different calculations,
1733 * we've just unwittingly changed the second value as well...
1734 *
1735 * Avoid this by allocating new registers to hold the results
1736 * (there may be 2, if the depth test is off, or 3, if it is on).
1737 * These will be released as part of the register set.
1738 */
1739 if (!dsa->stencil[1].enabled) {
1740 /* The easy case: if two-sided stenciling is *not* enabled, we
1741 * just use the front-sided values.
1742 */
1743 stencil_fail_values = front_stencil_fail_values;
1744 stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
1745 stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
1746 }
1747 else { /* two-sided stencil enabled */
1748 spe_comment(f, 0, "Resolving backface stencil values");
1749 /* Allocate new registers for the needed merged values */
1750 stencil_fail_values = spe_allocate_available_register(f);
1751 spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
1752 if (dsa->depth.enabled) {
1753 stencil_pass_depth_fail_values = spe_allocate_available_register(f);
1754 spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
1755 }
1756 else {
1757 stencil_pass_depth_fail_values = fbS_reg;
1758 }
1759 stencil_pass_depth_pass_values = spe_allocate_available_register(f);
1760 spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
1761 }
1762 }
1763
1764 /* We now have all the stencil values we need. We also need
1765 * the results of the depth test to figure out which
1766 * stencil values will become the new stencil values. (Even if
1767 * we aren't actually calculating stencil values, we need to apply
1768 * the depth test if it's enabled.)
1769 *
1770 * The code generated by gen_depth_test() returns the results of the
1771 * test in the given register, but also alters the mask_reg based
1772 * on the results of the test.
1773 */
1774 if (dsa->depth.enabled) {
1775 spe_comment(f, 0, "Running stencil depth test");
1776 zmask_reg = spe_allocate_available_register(f);
1777 modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
1778 }
1779
1780 if (need_to_calculate_stencil_values) {
1781
1782 /* If we need to writemask the stencil values before going into
1783 * the stencil buffer, we'll have to use a new register to
1784 * hold the new values. If not, we can just keep using the
1785 * current register.
1786 */
1787 if (need_to_writemask_stencil_values) {
1788 newS_reg = spe_allocate_available_register(f);
1789 spe_comment(f, 0, "Saving current stencil values for writemasking");
1790 spe_move(f, newS_reg, fbS_reg);
1791 }
1792 else {
1793 newS_reg = fbS_reg;
1794 }
1795
1796 /* Merge in the selected stencil fail values */
1797 if (stencil_fail_values != fbS_reg) {
1798 spe_comment(f, 0, "Loading stencil fail values");
1799 spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
1800 modified_buffers = true;
1801 }
1802
1803 /* Same for the stencil pass/depth fail values. If this calculation
1804 * is not needed (say, if depth test is off), then the
1805 * stencil_pass_depth_fail_values register will be equal to fbS_reg
1806 * and we'll skip the calculation.
1807 */
1808 if (stencil_pass_depth_fail_values != fbS_reg) {
1809 /* We don't actually have a stencil pass/depth fail mask yet.
1810 * Calculate it here from the stencil passing mask and the
1811 * depth passing mask. Note that zmask_reg *must* have been
1812 * set above if we're here.
1813 */
1814 unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
1815 spe_comment(f, 0, "Loading stencil pass/depth fail values");
1816 spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
1817
1818 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
1819
1820 spe_release_register(f, stencil_pass_depth_fail_mask);
1821 modified_buffers = true;
1822 }
1823
1824 /* Same for the stencil pass/depth pass mask. Note that we
1825 * *can* get here with zmask_reg being unset (if the depth
1826 * test is off but the stencil test is on). In this case,
1827 * we assume the depth test passes, and don't need to mask
1828 * the stencil pass mask with the Z mask.
1829 */
1830 if (stencil_pass_depth_pass_values != fbS_reg) {
1831 if (dsa->depth.enabled) {
1832 unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
1833 /* We'll need a separate register */
1834 spe_comment(f, 0, "Loading stencil pass/depth pass values");
1835 spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
1836 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
1837 spe_release_register(f, stencil_pass_depth_pass_mask);
1838 }
1839 else {
1840 /* We can use the same stencil-pass register */
1841 spe_comment(f, 0, "Loading stencil pass values");
1842 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
1843 }
1844 modified_buffers = true;
1845 }
1846
1847 /* Almost done. If we need to writemask, do it now, leaving the
1848 * results in the fbS_reg register passed in. If we don't need
1849 * to writemask, then the results are *already* in the fbS_reg,
1850 * so there's nothing more to do.
1851 */
1852
1853 if (need_to_writemask_stencil_values && modified_buffers) {
1854 /* The Select Bytes command makes a fine writemask. Where
1855 * the mask is 0, the first (original) values are retained,
1856 * effectively masking out changes. Where the mask is 1, the
1857 * second (new) values are retained, incorporating changes.
1858 */
1859 spe_comment(f, 0, "Writemasking new stencil values");
1860 spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
1861 }
1862
1863 } /* done calculating stencil values */
1864
1865 /* The stencil and/or depth values have been applied, and the
1866 * mask_reg, fbS_reg, and fbZ_reg values have been updated.
1867 * We're all done, except that we've allocated a fair number
1868 * of registers that we didn't bother tracking. Release all
1869 * those registers as part of the register set, and go home.
1870 */
1871 spe_comment(f, 0, "Releasing stencil register set");
1872 spe_release_register_set(f);
1873
1874 /* Return true if we could have modified the stencil and/or
1875 * depth buffers.
1876 */
1877 return modified_buffers;
1878 }
1879
1880
1881 /**
1882 * Generate SPE code to implement the fragment operations (alpha test,
1883 * depth test, stencil test, blending, colormask, and final
1884 * framebuffer write) as specified by the current context state.
1885 *
1886 * Logically, this code will be called after running the fragment
1887 * shader. But under some circumstances we could run some of this
1888 * code before the fragment shader to cull fragments/quads that are
1889 * totally occluded/discarded.
1890 *
1891 * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
1892 *
1893 * See the spu_default_fragment_ops() function to see how the per-fragment
1894 * operations would be done with ordinary C code.
1895 * The code we generate here though has no branches, is SIMD, etc and
1896 * should be much faster.
1897 *
1898 * \param cell the rendering context (in)
1899 * \param f the generated function (out)
1900 */
1901 void
1902 cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
1903 {
1904 const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
1905 const struct pipe_blend_state *blend = cell->blend;
1906 const struct pipe_blend_color *blend_color = &cell->blend_color;
1907 const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
1908
1909 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
1910 const int x_reg = 3; /* uint */
1911 const int y_reg = 4; /* uint */
1912 const int color_tile_reg = 5; /* tile_t * */
1913 const int depth_tile_reg = 6; /* tile_t * */
1914 const int fragZ_reg = 7; /* vector float */
1915 const int fragR_reg = 8; /* vector float */
1916 const int fragG_reg = 9; /* vector float */
1917 const int fragB_reg = 10; /* vector float */
1918 const int fragA_reg = 11; /* vector float */
1919 const int mask_reg = 12; /* vector uint */
1920 const int facing_reg = 13; /* uint */
1921
1922 /* offset of quad from start of tile
1923 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
1924 */
1925 int quad_offset_reg;
1926
1927 int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
1928 int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
1929
1930 spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
1931
1932 if (cell->debug_flags & CELL_DEBUG_ASM) {
1933 spe_print_code(f, true);
1934 spe_indent(f, 8);
1935 spe_comment(f, -4, "Begin per-fragment ops");
1936 }
1937
1938 spe_allocate_register(f, x_reg);
1939 spe_allocate_register(f, y_reg);
1940 spe_allocate_register(f, color_tile_reg);
1941 spe_allocate_register(f, depth_tile_reg);
1942 spe_allocate_register(f, fragZ_reg);
1943 spe_allocate_register(f, fragR_reg);
1944 spe_allocate_register(f, fragG_reg);
1945 spe_allocate_register(f, fragB_reg);
1946 spe_allocate_register(f, fragA_reg);
1947 spe_allocate_register(f, mask_reg);
1948 spe_allocate_register(f, facing_reg);
1949
1950 quad_offset_reg = spe_allocate_available_register(f);
1951 fbRGBA_reg = spe_allocate_available_register(f);
1952 fbZS_reg = spe_allocate_available_register(f);
1953
1954 /* compute offset of quad from start of tile, in bytes */
1955 {
1956 int x2_reg = spe_allocate_available_register(f);
1957 int y2_reg = spe_allocate_available_register(f);
1958
1959 ASSERT(TILE_SIZE == 32);
1960
1961 spe_comment(f, 0, "Compute quad offset within tile");
1962 spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
1963 spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
1964 spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
1965 spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
1966 spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
1967
1968 spe_release_register(f, x2_reg);
1969 spe_release_register(f, y2_reg);
1970 }
1971
1972 if (dsa->alpha.enabled) {
1973 gen_alpha_test(dsa, f, mask_reg, fragA_reg);
1974 }
1975
1976 /* If we need the stencil buffers (because one- or two-sided stencil is
1977 * enabled) or the depth buffer (because the depth test is enabled),
1978 * go grab them. Note that if either one- or two-sided stencil is
1979 * enabled, dsa->stencil[0].enabled will be true.
1980 */
1981 if (dsa->depth.enabled || dsa->stencil[0].enabled) {
1982 const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
1983 boolean write_depth_stencil;
1984
1985 /* We may or may not need to allocate a register for Z or stencil values */
1986 boolean fbS_reg_set = false, fbZ_reg_set = false;
1987 unsigned int fbS_reg, fbZ_reg = 0;
1988
1989 spe_comment(f, 0, "Fetching Z/stencil quad from tile");
1990
1991 /* fetch quad of depth/stencil values from tile at (x,y) */
1992 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1993 /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
1994 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1995
1996 /* From the Z/stencil buffer format, pull out the bits we need for
1997 * Z and/or stencil. We'll also convert the incoming fragment Z
1998 * value in fragZ_reg from a floating point value in [0.0..1.0] to
1999 * an unsigned integer value with the appropriate resolution.
2000 * Note that even if depth or stencil is *not* enabled, if it's
2001 * present in the buffer, we pull it out and put it back later;
2002 * otherwise, we can inadvertently destroy the contents of
2003 * buffers we're not supposed to touch (e.g., if the user is
2004 * clearing the depth buffer but not the stencil buffer, a
2005 * quad of constant depth is drawn over the surface; the stencil
2006 * buffer must be maintained).
2007 */
2008 switch(zs_format) {
2009
2010 case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
2011 case PIPE_FORMAT_X8Z24_UNORM:
2012 /* Pull out both Z and stencil */
2013 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
2014 setup_optional_register(f, &fbS_reg_set, &fbS_reg);
2015
2016 /* four 24-bit Z values in the low-order bits */
2017 spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
2018
2019 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
2020 * to a 24-bit unsigned integer
2021 */
2022 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
2023 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
2024
2025 /* four 8-bit stencil values in the high-order bits */
2026 spe_rotmi(f, fbS_reg, fbZS_reg, -24);
2027 break;
2028
2029 case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
2030 case PIPE_FORMAT_Z24X8_UNORM:
2031 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
2032 setup_optional_register(f, &fbS_reg_set, &fbS_reg);
2033
2034 /* shift by 8 to get the upper 24-bit values */
2035 spe_rotmi(f, fbS_reg, fbZS_reg, -8);
2036
2037 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
2038 * to a 24-bit unsigned integer
2039 */
2040 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
2041 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
2042
2043 /* 8-bit stencil in the low-order bits - mask them out */
2044 spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
2045 break;
2046
2047 case PIPE_FORMAT_Z32_UNORM:
2048 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
2049 /* Copy over 4 32-bit values */
2050 spe_move(f, fbZ_reg, fbZS_reg);
2051
2052 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
2053 * to a 32-bit unsigned integer
2054 */
2055 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
2056 /* No stencil, so can't do anything there */
2057 break;
2058
2059 case PIPE_FORMAT_Z16_UNORM:
2060 /* XXX Not sure this is correct, but it was here before, so we're
2061 * going with it for now
2062 */
2063 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
2064 /* Copy over 4 32-bit values */
2065 spe_move(f, fbZ_reg, fbZS_reg);
2066
2067 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
2068 * to a 16-bit unsigned integer
2069 */
2070 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
2071 spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
2072 /* No stencil */
2073
2074 default:
2075 ASSERT(0); /* invalid format */
2076 }
2077
2078 /* If stencil is enabled, use the stencil-specific code
2079 * generator to generate both the stencil and depth (if needed)
2080 * tests. Otherwise, if only depth is enabled, generate
2081 * a quick depth test. The test generators themselves will
2082 * report back whether the depth/stencil buffer has to be
2083 * written back.
2084 */
2085 if (dsa->stencil[0].enabled) {
2086 /* This will perform the stencil and depth tests, and update
2087 * the mask_reg, fbZ_reg, and fbS_reg as required by the
2088 * tests.
2089 */
2090 ASSERT(fbS_reg_set);
2091 spe_comment(f, 0, "Perform stencil test");
2092
2093 /* Note that fbZ_reg may not be set on entry, if stenciling
2094 * is enabled but there's no Z-buffer. The
2095 * gen_stencil_depth_test() function must ignore the
2096 * fbZ_reg register if depth is not enabled.
2097 */
2098 write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
2099 }
2100 else if (dsa->depth.enabled) {
2101 int zmask_reg = spe_allocate_available_register(f);
2102 ASSERT(fbZ_reg_set);
2103 spe_comment(f, 0, "Perform depth test");
2104 write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
2105 spe_release_register(f, zmask_reg);
2106 }
2107 else {
2108 write_depth_stencil = false;
2109 }
2110
2111 if (write_depth_stencil) {
2112 /* Merge latest Z and Stencil values into fbZS_reg.
2113 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
2114 * fbS_reg has four 8-bit Z values in bits [7..0].
2115 */
2116 spe_comment(f, 0, "Store quad's depth/stencil values in tile");
2117 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
2118 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
2119 spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
2120 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
2121 }
2122 else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
2123 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
2124 spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
2125 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
2126 }
2127 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
2128 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
2129 }
2130 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
2131 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
2132 }
2133 else if (zs_format == PIPE_FORMAT_S8_UNORM) {
2134 ASSERT(0); /* XXX to do */
2135 }
2136 else {
2137 ASSERT(0); /* bad zs_format */
2138 }
2139
2140 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
2141 spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
2142 }
2143
2144 /* Don't need these any more */
2145 release_optional_register(f, &fbZ_reg_set, fbZ_reg);
2146 release_optional_register(f, &fbS_reg_set, fbS_reg);
2147 }
2148
2149 /* Get framebuffer quad/colors. We'll need these for blending,
2150 * color masking, and to obey the quad/pixel mask.
2151 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
2152 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
2153 * we could skip this load.
2154 */
2155 spe_comment(f, 0, "Fetch quad colors from tile");
2156 spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
2157
2158 if (blend->blend_enable) {
2159 spe_comment(f, 0, "Perform blending");
2160 gen_blend(blend, blend_color, f, color_format,
2161 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
2162 }
2163
2164 /*
2165 * Write fragment colors to framebuffer/tile.
2166 * This involves converting the fragment colors from float[4] to the
2167 * tile's specific format and obeying the quad/pixel mask.
2168 */
2169 {
2170 int rgba_reg = spe_allocate_available_register(f);
2171
2172 /* Pack four float colors as four 32-bit int colors */
2173 spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
2174 gen_pack_colors(f, color_format,
2175 fragR_reg, fragG_reg, fragB_reg, fragA_reg,
2176 rgba_reg);
2177
2178 if (blend->logicop_enable) {
2179 spe_comment(f, 0, "Compute logic op");
2180 gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
2181 }
2182
2183 if (blend->colormask != PIPE_MASK_RGBA) {
2184 spe_comment(f, 0, "Compute color mask");
2185 gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
2186 }
2187
2188 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
2189 * if (mask[i])
2190 * rgba[i] = rgba[i];
2191 * else
2192 * rgba[i] = framebuffer[i];
2193 */
2194 spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
2195
2196 /* Store updated quad in tile:
2197 * memory[color_tile + quad_offset] = rgba_reg;
2198 */
2199 spe_comment(f, 0, "Store quad colors into color tile");
2200 spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
2201
2202 spe_release_register(f, rgba_reg);
2203 }
2204
2205 //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
2206
2207 spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
2208
2209 spe_release_register(f, fbRGBA_reg);
2210 spe_release_register(f, fbZS_reg);
2211 spe_release_register(f, quad_offset_reg);
2212
2213 if (cell->debug_flags & CELL_DEBUG_ASM) {
2214 spe_comment(f, -4, "End per-fragment ops");
2215 }
2216 }