de170d1036c10f464dd05b79aaaf500d451d7fa3
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fragment.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29
30 /**
31 * Generate SPU per-fragment code (actually per-quad code).
32 * \author Brian Paul
33 */
34
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
41
42
43
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
46
47
48 /**
49 * Generate SPE code to perform Z/depth testing.
50 *
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
57 *
58 * Returns true if the Z-buffer needs to be updated.
59 */
static boolean
gen_depth_test(struct spe_function *f,
               const struct pipe_depth_stencil_alpha_state *dsa,
               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
{
   /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
    * quantities.  This only makes a difference for 32-bit Z values though.
    */
   ASSERT(dsa->depth.enabled);

   /* NOTE(review): for the EQUAL/GREATER/LESS cases zmask_reg holds the
    * "test passed" mask, but for NOTEQUAL/LEQUAL/GEQUAL it holds the
    * *complement* (the failed-compare mask, which is then removed from
    * mask_reg with andc).  Callers that consume zmask_reg directly should
    * confirm they expect this polarity.
    */
   switch (dsa->depth.func) {
   case PIPE_FUNC_EQUAL:
      /* zmask = (ifragZ == ifbZ) */
      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & zmask) */
      spe_and(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_NOTEQUAL:
      /* zmask = (ifragZ == ifbZ) */
      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & ~zmask) */
      spe_andc(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_GREATER:
      /* zmask = (ifragZ > ifbZ) */
      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & zmask) */
      spe_and(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_LESS:
      /* zmask = (ifbZ > ifragZ) */
      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
      /* mask = (mask & zmask) */
      spe_and(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_LEQUAL:
      /* zmask = (ifragZ > ifbZ) -- the FAIL condition */
      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
      /* mask = (mask & ~zmask) */
      spe_andc(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_GEQUAL:
      /* zmask = (ifbZ > ifragZ) -- the FAIL condition */
      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
      /* mask = (mask & ~zmask) */
      spe_andc(f, mask_reg, mask_reg, zmask_reg);
      break;

   case PIPE_FUNC_NEVER:
      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
      break;

   case PIPE_FUNC_ALWAYS:
      /* mask unchanged */
      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */
      break;

   default:
      ASSERT(0);
      break;
   }

   if (dsa->depth.writemask) {
      /*
       * If (ztest passed) {
       *    framebufferZ = fragmentZ;
       * }
       * OR,
       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
       *
       * Note: mask_reg already has the depth result folded in, so pixels
       * that were killed earlier (e.g. by the alpha test) don't write Z.
       */
      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
      return true;
   }

   return false;
}
142
143
144 /**
145 * Generate SPE code to perform alpha testing.
146 *
147 * \param dsa Gallium depth/stencil/alpha state to gen code for
148 * \param f SPE function to append instruction onto.
149 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
150 * \param fragA_reg register containing four fragment alpha values (in)
151 */
static void
gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
               struct spe_function *f, int mask_reg, int fragA_reg)
{
   int ref_reg = spe_allocate_available_register(f);
   int amask_reg = spe_allocate_available_register(f);

   ASSERT(dsa->alpha.enabled);

   /* NEVER and ALWAYS don't compare against the reference value, so skip
    * loading it (ref_reg stays unused in those cases).
    */
   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
      /* load/splat the alpha reference float value */
      spe_load_float(f, ref_reg, dsa->alpha.ref);
   }

   /* emit code to do the alpha comparison, updating 'mask' */
   switch (dsa->alpha.func) {
   case PIPE_FUNC_EQUAL:
      /* amask = (fragA == ref) */
      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & amask) */
      spe_and(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_NOTEQUAL:
      /* amask = (fragA == ref), removed below with andc */
      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & ~amask) */
      spe_andc(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_GREATER:
      /* amask = (fragA > ref) */
      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & amask) */
      spe_and(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_LESS:
      /* amask = (ref > fragA) */
      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
      /* mask = (mask & amask) */
      spe_and(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_LEQUAL:
      /* amask = (fragA > ref) -- the FAIL condition */
      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
      /* mask = (mask & ~amask) */
      spe_andc(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_GEQUAL:
      /* amask = (ref > fragA) -- the FAIL condition */
      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
      /* mask = (mask & ~amask) */
      spe_andc(f, mask_reg, mask_reg, amask_reg);
      break;

   case PIPE_FUNC_NEVER:
      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */
      break;

   case PIPE_FUNC_ALWAYS:
      /* no-op, mask unchanged */
      break;

   default:
      ASSERT(0);
      break;
   }

#if OPTIMIZATIONS
   /* if mask == {0,0,0,0} we're all done, return */
   {
      /* re-use amask reg here */
      int tmp_reg = amask_reg;
      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
      spe_orx(f, tmp_reg, mask_reg);
      /* if tmp[0] == 0 then return from function call */
      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
   }
#endif

   spe_release_register(f, ref_reg);
   spe_release_register(f, amask_reg);
}
239
240 /* This pair of functions is used inline to allocate and deallocate
241 * optional constant registers. Once a constant is discovered to be
242 * needed, we will likely need it again, so we don't want to deallocate
243 * it and have to allocate and load it again unnecessarily.
244 */
245 static inline void
246 setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
247 {
248 if (*is_already_set) return;
249 *r = spe_allocate_available_register(f);
250 }
251
252 static inline void
253 release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
254 {
255 if (!*is_already_set) return;
256 spe_release_register(f, r);
257 *is_already_set = false;
258 }
259
260 static inline void
261 setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
262 {
263 if (*is_already_set) return;
264 setup_optional_register(f, is_already_set, r);
265 spe_load_float(f, *r, value);
266 }
267
268 static inline void
269 release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
270 {
271 release_optional_register(f, is_already_set, r);
272 }
273
274 /**
275 * Generate SPE code to implement the given blend mode for a quad of pixels.
276 * \param f SPE function to append instruction onto.
277 * \param fragR_reg register with fragment red values (float) (in/out)
278 * \param fragG_reg register with fragment green values (float) (in/out)
279 * \param fragB_reg register with fragment blue values (float) (in/out)
280 * \param fragA_reg register with fragment alpha values (float) (in/out)
281 * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
282 */
283 static void
284 gen_blend(const struct pipe_blend_state *blend,
285 const struct pipe_blend_color *blend_color,
286 struct spe_function *f,
287 enum pipe_format color_format,
288 int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
289 int fbRGBA_reg)
290 {
291 int term1R_reg = spe_allocate_available_register(f);
292 int term1G_reg = spe_allocate_available_register(f);
293 int term1B_reg = spe_allocate_available_register(f);
294 int term1A_reg = spe_allocate_available_register(f);
295
296 int term2R_reg = spe_allocate_available_register(f);
297 int term2G_reg = spe_allocate_available_register(f);
298 int term2B_reg = spe_allocate_available_register(f);
299 int term2A_reg = spe_allocate_available_register(f);
300
301 int fbR_reg = spe_allocate_available_register(f);
302 int fbG_reg = spe_allocate_available_register(f);
303 int fbB_reg = spe_allocate_available_register(f);
304 int fbA_reg = spe_allocate_available_register(f);
305
306 int tmp_reg = spe_allocate_available_register(f);
307
308 /* Optional constant registers we might or might not end up using;
309 * if we do use them, make sure we only allocate them once by
310 * keeping a flag on each one.
311 */
312 boolean one_reg_set = false;
313 unsigned int one_reg;
314 boolean constR_reg_set = false, constG_reg_set = false,
315 constB_reg_set = false, constA_reg_set = false;
316 unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
317
318 ASSERT(blend->blend_enable);
319
320 /* Unpack/convert framebuffer colors from four 32-bit packed colors
321 * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
322 * Each 8-bit color component is expanded into a float in [0.0, 1.0].
323 */
324 {
325 int mask_reg = spe_allocate_available_register(f);
326
327 /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
328 spe_load_int(f, mask_reg, 0xff);
329
330 /* XXX there may be more clever ways to implement the following code */
331 switch (color_format) {
332 case PIPE_FORMAT_A8R8G8B8_UNORM:
333 /* fbB = fbB & mask */
334 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
335 /* mask = mask << 8 */
336 spe_roti(f, mask_reg, mask_reg, 8);
337
338 /* fbG = fbRGBA & mask */
339 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
340 /* fbG = fbG >> 8 */
341 spe_roti(f, fbG_reg, fbG_reg, -8);
342 /* mask = mask << 8 */
343 spe_roti(f, mask_reg, mask_reg, 8);
344
345 /* fbR = fbRGBA & mask */
346 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
347 /* fbR = fbR >> 16 */
348 spe_roti(f, fbR_reg, fbR_reg, -16);
349 /* mask = mask << 8 */
350 spe_roti(f, mask_reg, mask_reg, 8);
351
352 /* fbA = fbRGBA & mask */
353 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
354 /* fbA = fbA >> 24 */
355 spe_roti(f, fbA_reg, fbA_reg, -24);
356 break;
357
358 case PIPE_FORMAT_B8G8R8A8_UNORM:
359 /* fbA = fbA & mask */
360 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
361 /* mask = mask << 8 */
362 spe_roti(f, mask_reg, mask_reg, 8);
363
364 /* fbR = fbRGBA & mask */
365 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
366 /* fbR = fbR >> 8 */
367 spe_roti(f, fbR_reg, fbR_reg, -8);
368 /* mask = mask << 8 */
369 spe_roti(f, mask_reg, mask_reg, 8);
370
371 /* fbG = fbRGBA & mask */
372 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
373 /* fbG = fbG >> 16 */
374 spe_roti(f, fbG_reg, fbG_reg, -16);
375 /* mask = mask << 8 */
376 spe_roti(f, mask_reg, mask_reg, 8);
377
378 /* fbB = fbRGBA & mask */
379 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
380 /* fbB = fbB >> 24 */
381 spe_roti(f, fbB_reg, fbB_reg, -24);
382 break;
383
384 default:
385 ASSERT(0);
386 }
387
388 /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
389 spe_cuflt(f, fbR_reg, fbR_reg, 8);
390 spe_cuflt(f, fbG_reg, fbG_reg, 8);
391 spe_cuflt(f, fbB_reg, fbB_reg, 8);
392 spe_cuflt(f, fbA_reg, fbA_reg, 8);
393
394 spe_release_register(f, mask_reg);
395 }
396
397 /*
398 * Compute Src RGB terms. We're actually looking for the value
399 * of (the appropriate RGB factors) * (the incoming source RGB color),
400 * because in some cases (like PIPE_BLENDFACTOR_ONE and
401 * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
402 */
403 switch (blend->rgb_src_factor) {
404 case PIPE_BLENDFACTOR_ONE:
405 /* factors = (1,1,1), so term = (R,G,B) */
406 spe_move(f, term1R_reg, fragR_reg);
407 spe_move(f, term1G_reg, fragG_reg);
408 spe_move(f, term1B_reg, fragB_reg);
409 break;
410 case PIPE_BLENDFACTOR_ZERO:
411 /* factors = (0,0,0), so term = (0,0,0) */
412 spe_load_float(f, term1R_reg, 0.0f);
413 spe_load_float(f, term1G_reg, 0.0f);
414 spe_load_float(f, term1B_reg, 0.0f);
415 break;
416 case PIPE_BLENDFACTOR_SRC_COLOR:
417 /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
418 spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
419 spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
420 spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
421 break;
422 case PIPE_BLENDFACTOR_SRC_ALPHA:
423 /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
424 spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
425 spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
426 spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
427 break;
428 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
429 /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
430 * or in other words term = (R-R*R, G-G*G, B-B*B)
431 * fnms(a,b,c,d) computes a = d - b*c
432 */
433 spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
434 spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
435 spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
436 break;
437 case PIPE_BLENDFACTOR_DST_COLOR:
438 /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
439 spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
440 spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
441 spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
442 break;
443 case PIPE_BLENDFACTOR_INV_DST_COLOR:
444 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
445 * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
446 * fnms(a,b,c,d) computes a = d - b*c
447 */
448 spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
449 spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
450 spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
451 break;
452 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
453 /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
454 * or term = (R-R*A,G-G*A,B-B*A)
455 * fnms(a,b,c,d) computes a = d - b*c
456 */
457 spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
458 spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
459 spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
460 break;
461 case PIPE_BLENDFACTOR_DST_ALPHA:
462 /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
463 spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
464 spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
465 spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
466 break;
467 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
468 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
469 * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
470 * fnms(a,b,c,d) computes a = d - b*c
471 */
472 spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
473 spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
474 spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
475 break;
476 case PIPE_BLENDFACTOR_CONST_COLOR:
477 /* We need the optional constant color registers */
478 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
479 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
480 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
481 /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
482 spe_fm(f, term1R_reg, fragR_reg, constR_reg);
483 spe_fm(f, term1G_reg, fragG_reg, constG_reg);
484 spe_fm(f, term1B_reg, fragB_reg, constB_reg);
485 break;
486 case PIPE_BLENDFACTOR_CONST_ALPHA:
487 /* we'll need the optional constant alpha register */
488 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
489 /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
490 spe_fm(f, term1R_reg, fragR_reg, constA_reg);
491 spe_fm(f, term1G_reg, fragG_reg, constA_reg);
492 spe_fm(f, term1B_reg, fragB_reg, constA_reg);
493 break;
494 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
495 /* We need the optional constant color registers */
496 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
497 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
498 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
499 /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
500 * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
501 * fnms(a,b,c,d) computes a = d - b*c
502 */
503 spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
504 spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
505 spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
506 break;
507 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
508 /* We need the optional constant color registers */
509 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
510 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
511 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
512 /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
513 * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
514 * fnms(a,b,c,d) computes a = d - b*c
515 */
516 spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
517 spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
518 spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
519 break;
520 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
521 /* We'll need the optional {1,1,1,1} register */
522 setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
523 /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
524 * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
525 * We could expand the term (as a*min(b,c) == min(a*b,a*c)
526 * as long as a is positive), but then we'd have to do three
527 * spe_float_min() functions instead of one, so this is simpler.
528 */
529 /* tmp = 1 - Afb */
530 spe_fs(f, tmp_reg, one_reg, fbA_reg);
531 /* tmp = min(A,tmp) */
532 spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
533 /* term = R*tmp */
534 spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
535 spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
536 spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
537 break;
538
539 /* These are special D3D cases involving a second color output
540 * from the fragment shader. I'm not sure we can support them
541 * yet... XXX
542 */
543 case PIPE_BLENDFACTOR_SRC1_COLOR:
544 case PIPE_BLENDFACTOR_SRC1_ALPHA:
545 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
546 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
547
548 default:
549 ASSERT(0);
550 }
551
552 /*
553 * Compute Src Alpha term. Like the above, we're looking for
554 * the full term A*factor, not just the factor itself, because
555 * in many cases we can avoid doing unnecessary multiplies.
556 */
557 switch (blend->alpha_src_factor) {
558 case PIPE_BLENDFACTOR_ZERO:
559 /* factor = 0, so term = 0 */
560 spe_load_float(f, term1A_reg, 0.0f);
561 break;
562
563 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
564 case PIPE_BLENDFACTOR_ONE:
565 /* factor = 1, so term = A */
566 spe_move(f, term1A_reg, fragA_reg);
567 break;
568
569 case PIPE_BLENDFACTOR_SRC_COLOR:
570 /* factor = A, so term = A*A */
571 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
572 break;
573 case PIPE_BLENDFACTOR_SRC_ALPHA:
574 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
575 break;
576
577 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
578 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
579 /* factor = 1-A, so term = A*(1-A) = A-A*A */
580 /* fnms(a,b,c,d) computes a = d - b*c */
581 spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
582 break;
583
584 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
585 case PIPE_BLENDFACTOR_DST_COLOR:
586 /* factor = Afb, so term = A*Afb */
587 spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
588 break;
589
590 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
591 case PIPE_BLENDFACTOR_INV_DST_COLOR:
592 /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
593 /* fnms(a,b,c,d) computes a = d - b*c */
594 spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
595 break;
596
597 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
598 case PIPE_BLENDFACTOR_CONST_COLOR:
599 /* We need the optional constA_reg register */
600 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
601 /* factor = Ac, so term = A*Ac */
602 spe_fm(f, term1A_reg, fragA_reg, constA_reg);
603 break;
604
605 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
606 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
607 /* We need the optional constA_reg register */
608 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
609 /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
610 /* fnms(a,b,c,d) computes a = d - b*c */
611 spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
612 break;
613
614 /* These are special D3D cases involving a second color output
615 * from the fragment shader. I'm not sure we can support them
616 * yet... XXX
617 */
618 case PIPE_BLENDFACTOR_SRC1_COLOR:
619 case PIPE_BLENDFACTOR_SRC1_ALPHA:
620 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
621 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
622 default:
623 ASSERT(0);
624 }
625
626 /*
627 * Compute Dest RGB term. Like the above, we're looking for
628 * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
629 * in many cases we can avoid doing unnecessary multiplies.
630 */
631 switch (blend->rgb_dst_factor) {
632 case PIPE_BLENDFACTOR_ONE:
633 /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
634 spe_move(f, term2R_reg, fbR_reg);
635 spe_move(f, term2G_reg, fbG_reg);
636 spe_move(f, term2B_reg, fbB_reg);
637 break;
638 case PIPE_BLENDFACTOR_ZERO:
639 /* factor s= (0,0,0), so term = (0,0,0) */
640 spe_load_float(f, term2R_reg, 0.0f);
641 spe_load_float(f, term2G_reg, 0.0f);
642 spe_load_float(f, term2B_reg, 0.0f);
643 break;
644 case PIPE_BLENDFACTOR_SRC_COLOR:
645 /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
646 spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
647 spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
648 spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
649 break;
650 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
651 /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
652 * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
653 * fnms(a,b,c,d) computes a = d - b*c
654 */
655 spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
656 spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
657 spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
658 break;
659 case PIPE_BLENDFACTOR_SRC_ALPHA:
660 /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
661 spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
662 spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
663 spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
664 break;
665 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
666 /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
667 /* fnms(a,b,c,d) computes a = d - b*c */
668 spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
669 spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
670 spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
671 break;
672 case PIPE_BLENDFACTOR_DST_COLOR:
673 /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
674 spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
675 spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
676 spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
677 break;
678 case PIPE_BLENDFACTOR_INV_DST_COLOR:
679 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
680 * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
681 * fnms(a,b,c,d) computes a = d - b*c
682 */
683 spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
684 spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
685 spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
686 break;
687
688 case PIPE_BLENDFACTOR_DST_ALPHA:
689 /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
690 spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
691 spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
692 spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
693 break;
694 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
695 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
696 * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
697 * fnms(a,b,c,d) computes a = d - b*c
698 */
699 spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
700 spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
701 spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
702 break;
703 case PIPE_BLENDFACTOR_CONST_COLOR:
704 /* We need the optional constant color registers */
705 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
706 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
707 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
708 /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
709 spe_fm(f, term2R_reg, fbR_reg, constR_reg);
710 spe_fm(f, term2G_reg, fbG_reg, constG_reg);
711 spe_fm(f, term2B_reg, fbB_reg, constB_reg);
712 break;
713 case PIPE_BLENDFACTOR_CONST_ALPHA:
714 /* we'll need the optional constant alpha register */
715 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
716 /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
717 spe_fm(f, term2R_reg, fbR_reg, constA_reg);
718 spe_fm(f, term2G_reg, fbG_reg, constA_reg);
719 spe_fm(f, term2B_reg, fbB_reg, constA_reg);
720 break;
721 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
722 /* We need the optional constant color registers */
723 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
724 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
725 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
726 /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
727 * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
728 * fnms(a,b,c,d) computes a = d - b*c
729 */
730 spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
731 spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
732 spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
733 break;
734 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
735 /* We need the optional constant color registers */
736 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
737 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
738 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
739 /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
740 * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
741 * fnms(a,b,c,d) computes a = d - b*c
742 */
743 spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
744 spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
745 spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
746 break;
747 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
748 ASSERT(0);
749 break;
750
751 /* These are special D3D cases involving a second color output
752 * from the fragment shader. I'm not sure we can support them
753 * yet... XXX
754 */
755 case PIPE_BLENDFACTOR_SRC1_COLOR:
756 case PIPE_BLENDFACTOR_SRC1_ALPHA:
757 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
758 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
759
760 default:
761 ASSERT(0);
762 }
763
764 /*
765 * Compute Dest Alpha term. Like the above, we're looking for
766 * the full term Afb*factor, not just the factor itself, because
767 * in many cases we can avoid doing unnecessary multiplies.
768 */
769 switch (blend->alpha_dst_factor) {
770 case PIPE_BLENDFACTOR_ONE:
771 /* factor = 1, so term = Afb */
772 spe_move(f, term2A_reg, fbA_reg);
773 break;
774 case PIPE_BLENDFACTOR_ZERO:
775 /* factor = 0, so term = 0 */
776 spe_load_float(f, term2A_reg, 0.0f);
777 break;
778
779 case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
780 case PIPE_BLENDFACTOR_SRC_COLOR:
781 /* factor = A, so term = Afb*A */
782 spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
783 break;
784
785 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
786 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
787 /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
788 /* fnms(a,b,c,d) computes a = d - b*c */
789 spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
790 break;
791
792 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
793 case PIPE_BLENDFACTOR_DST_COLOR:
794 /* factor = Afb, so term = Afb*Afb */
795 spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
796 break;
797
798 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
799 case PIPE_BLENDFACTOR_INV_DST_COLOR:
800 /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
801 /* fnms(a,b,c,d) computes a = d - b*c */
802 spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
803 break;
804
805 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
806 case PIPE_BLENDFACTOR_CONST_COLOR:
807 /* We need the optional constA_reg register */
808 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
809 /* factor = Ac, so term = Afb*Ac */
810 spe_fm(f, term2A_reg, fbA_reg, constA_reg);
811 break;
812
813 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
814 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
815 /* We need the optional constA_reg register */
816 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
817 /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
818 /* fnms(a,b,c,d) computes a = d - b*c */
819 spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
820 break;
821
822 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
823 ASSERT(0);
824 break;
825
826 /* These are special D3D cases involving a second color output
827 * from the fragment shader. I'm not sure we can support them
828 * yet... XXX
829 */
830 case PIPE_BLENDFACTOR_SRC1_COLOR:
831 case PIPE_BLENDFACTOR_SRC1_ALPHA:
832 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
833 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
834 default:
835 ASSERT(0);
836 }
837
838 /*
839 * Combine Src/Dest RGB terms as per the blend equation.
840 */
841 switch (blend->rgb_func) {
842 case PIPE_BLEND_ADD:
843 spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
844 spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
845 spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
846 break;
847 case PIPE_BLEND_SUBTRACT:
848 spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
849 spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
850 spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
851 break;
852 case PIPE_BLEND_REVERSE_SUBTRACT:
853 spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
854 spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
855 spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
856 break;
857 case PIPE_BLEND_MIN:
858 spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
859 spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
860 spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
861 break;
862 case PIPE_BLEND_MAX:
863 spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
864 spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
865 spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
866 break;
867 default:
868 ASSERT(0);
869 }
870
871 /*
872 * Combine Src/Dest A term
873 */
874 switch (blend->alpha_func) {
875 case PIPE_BLEND_ADD:
876 spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
877 break;
878 case PIPE_BLEND_SUBTRACT:
879 spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
880 break;
881 case PIPE_BLEND_REVERSE_SUBTRACT:
882 spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
883 break;
884 case PIPE_BLEND_MIN:
885 spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
886 break;
887 case PIPE_BLEND_MAX:
888 spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
889 break;
890 default:
891 ASSERT(0);
892 }
893
894 spe_release_register(f, term1R_reg);
895 spe_release_register(f, term1G_reg);
896 spe_release_register(f, term1B_reg);
897 spe_release_register(f, term1A_reg);
898
899 spe_release_register(f, term2R_reg);
900 spe_release_register(f, term2G_reg);
901 spe_release_register(f, term2B_reg);
902 spe_release_register(f, term2A_reg);
903
904 spe_release_register(f, fbR_reg);
905 spe_release_register(f, fbG_reg);
906 spe_release_register(f, fbB_reg);
907 spe_release_register(f, fbA_reg);
908
909 spe_release_register(f, tmp_reg);
910
911 /* Free any optional registers that actually got used */
912 release_const_register(f, &one_reg_set, one_reg);
913 release_const_register(f, &constR_reg_set, constR_reg);
914 release_const_register(f, &constG_reg_set, constG_reg);
915 release_const_register(f, &constB_reg_set, constB_reg);
916 release_const_register(f, &constA_reg_set, constA_reg);
917 }
918
919
920 static void
921 gen_logicop(const struct pipe_blend_state *blend,
922 struct spe_function *f,
923 int fragRGBA_reg, int fbRGBA_reg)
924 {
925 /* We've got four 32-bit RGBA packed pixels in each of
926 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
927 * reds, greens, blues, and alphas.
928 * */
929 ASSERT(blend->logicop_enable);
930
931 switch(blend->logicop_func) {
932 case PIPE_LOGICOP_CLEAR: /* 0 */
933 spe_zero(f, fragRGBA_reg);
934 break;
935 case PIPE_LOGICOP_NOR: /* ~(s | d) */
936 spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
937 break;
938 case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
939 /* andc R, A, B computes R = A & ~B */
940 spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
941 break;
942 case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
943 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
944 break;
945 case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
946 /* andc R, A, B computes R = A & ~B */
947 spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
948 break;
949 case PIPE_LOGICOP_INVERT: /* ~d */
950 /* Note that (A nor A) == ~(A|A) == ~A */
951 spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
952 break;
953 case PIPE_LOGICOP_XOR: /* s ^ d */
954 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
955 break;
956 case PIPE_LOGICOP_NAND: /* ~(s & d) */
957 spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
958 break;
959 case PIPE_LOGICOP_AND: /* s & d */
960 spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
961 break;
962 case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
963 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
964 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
965 break;
966 case PIPE_LOGICOP_NOOP: /* d */
967 spe_move(f, fragRGBA_reg, fbRGBA_reg);
968 break;
969 case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
970 /* orc R, A, B computes R = A | ~B */
971 spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
972 break;
973 case PIPE_LOGICOP_COPY: /* s */
974 break;
975 case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
976 /* orc R, A, B computes R = A | ~B */
977 spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
978 break;
979 case PIPE_LOGICOP_OR: /* s | d */
980 spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
981 break;
982 case PIPE_LOGICOP_SET: /* 1 */
983 spe_load_int(f, fragRGBA_reg, 0xffffffff);
984 break;
985 default:
986 ASSERT(0);
987 }
988 }
989
990
991 /**
992 * Generate code to pack a quad of float colors into four 32-bit integers.
993 *
994 * \param f SPE function to append instruction onto.
995 * \param color_format the dest color packing format
996 * \param r_reg register containing four red values (in/clobbered)
997 * \param g_reg register containing four green values (in/clobbered)
998 * \param b_reg register containing four blue values (in/clobbered)
999 * \param a_reg register containing four alpha values (in/clobbered)
1000 * \param rgba_reg register to store the packed RGBA colors (out)
1001 */
1002 static void
1003 gen_pack_colors(struct spe_function *f,
1004 enum pipe_format color_format,
1005 int r_reg, int g_reg, int b_reg, int a_reg,
1006 int rgba_reg)
1007 {
1008 int rg_reg = spe_allocate_available_register(f);
1009 int ba_reg = spe_allocate_available_register(f);
1010
1011 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
1012 spe_cfltu(f, r_reg, r_reg, 32);
1013 spe_cfltu(f, g_reg, g_reg, 32);
1014 spe_cfltu(f, b_reg, b_reg, 32);
1015 spe_cfltu(f, a_reg, a_reg, 32);
1016
1017 /* Shift the most significant bytes to the least significant positions.
1018 * I.e.: reg = reg >> 24
1019 */
1020 spe_rotmi(f, r_reg, r_reg, -24);
1021 spe_rotmi(f, g_reg, g_reg, -24);
1022 spe_rotmi(f, b_reg, b_reg, -24);
1023 spe_rotmi(f, a_reg, a_reg, -24);
1024
1025 /* Shift the color bytes according to the surface format */
1026 if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
1027 spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */
1028 spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */
1029 spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */
1030 }
1031 else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1032 spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */
1033 spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */
1034 spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */
1035 }
1036 else {
1037 ASSERT(0);
1038 }
1039
1040 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
1041 * Eg: after shifting according to color_format we might have:
1042 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
1043 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
1044 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
1045 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
1046 * OR-ing all those together gives us four packed colors:
1047 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
1048 */
1049 spe_or(f, rg_reg, r_reg, g_reg);
1050 spe_or(f, ba_reg, a_reg, b_reg);
1051 spe_or(f, rgba_reg, rg_reg, ba_reg);
1052
1053 spe_release_register(f, rg_reg);
1054 spe_release_register(f, ba_reg);
1055 }
1056
1057 static void
1058 gen_colormask(struct spe_function *f,
1059 uint colormask,
1060 enum pipe_format color_format,
1061 int fragRGBA_reg, int fbRGBA_reg)
1062 {
1063 /* We've got four 32-bit RGBA packed pixels in each of
1064 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
1065 * reds, greens, blues, and alphas. Further, the pixels
1066 * are packed according to the given color format, not
1067 * necessarily RGBA...
1068 */
1069 unsigned int r_mask;
1070 unsigned int g_mask;
1071 unsigned int b_mask;
1072 unsigned int a_mask;
1073
1074 /* Calculate exactly where the bits for any particular color
1075 * end up, so we can mask them correctly.
1076 */
1077 switch(color_format) {
1078 case PIPE_FORMAT_A8R8G8B8_UNORM:
1079 /* ARGB */
1080 a_mask = 0xff000000;
1081 r_mask = 0x00ff0000;
1082 g_mask = 0x0000ff00;
1083 b_mask = 0x000000ff;
1084 break;
1085 case PIPE_FORMAT_B8G8R8A8_UNORM:
1086 /* BGRA */
1087 b_mask = 0xff000000;
1088 g_mask = 0x00ff0000;
1089 r_mask = 0x0000ff00;
1090 a_mask = 0x000000ff;
1091 break;
1092 default:
1093 ASSERT(0);
1094 }
1095
1096 /* For each R, G, B, and A component we're supposed to mask out,
1097 * clear its bits. Then our mask operation later will work
1098 * as expected.
1099 */
1100 if (!(colormask & PIPE_MASK_R)) {
1101 r_mask = 0;
1102 }
1103 if (!(colormask & PIPE_MASK_G)) {
1104 g_mask = 0;
1105 }
1106 if (!(colormask & PIPE_MASK_B)) {
1107 b_mask = 0;
1108 }
1109 if (!(colormask & PIPE_MASK_A)) {
1110 a_mask = 0;
1111 }
1112
1113 /* Get a temporary register to hold the mask that will be applied to the fragment */
1114 int colormask_reg = spe_allocate_available_register(f);
1115
1116 /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
1117 * masks. Load the result value into our temporary register.
1118 */
1119 spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
1120
1121 /* Use the mask register to select between the fragment color
1122 * values and the frame buffer color values. Wherever the
1123 * mask has a 0 bit, the current frame buffer color should override
1124 * the fragment color. Wherever the mask has a 1 bit, the
1125 * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
1126 * instruction will select bits from its first operand rA wherever the
1127 * the mask bits rM are 0, and from its second operand rB wherever the
1128 * mask bits rM are 1. That means that the frame buffer color is the
1129 * first operand, and the fragment color the second.
1130 */
1131 spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
1132
1133 /* Release the temporary register and we're done */
1134 spe_release_register(f, colormask_reg);
1135 }
1136
1137 /* This function is annoyingly similar to gen_depth_test(), above, except
1138 * that instead of comparing two varying values (i.e. fragment and buffer),
1139 * we're comparing a varying value with a static value. As such, we have
1140 * access to the Compare Immediate instructions where we don't in
1141 * gen_depth_test(), which is what makes us very different.
1142 *
1143 * The return value in the stencil_pass_reg is a bitmask of valid
1144 * fragments that also passed the stencil test. The bitmask of valid
1145 * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
1146 */
1147 static void
1148 gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
1149 unsigned int mask_reg, unsigned int fbS_reg,
1150 unsigned int stencil_pass_reg)
1151 {
1152 /* Generate code that puts the set of passing fragments into the stencil_pass_reg
1153 * register, taking into account whether each fragment was active to begin with.
1154 */
1155 switch (state->func) {
1156 case PIPE_FUNC_EQUAL:
1157 /* stencil_pass = mask & (s == reference) */
1158 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1159 spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
1160 /* stencil_fail = mask & ~stencil_pass */
1161 break;
1162
1163 case PIPE_FUNC_NOTEQUAL:
1164 /* stencil_pass = mask & ~(s == reference) */
1165 spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1166 spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
1167 break;
1168
1169 case PIPE_FUNC_GREATER:
1170 /* stencil_pass = mask & (s > reference) */
1171 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1172 spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
1173 break;
1174
1175 case PIPE_FUNC_LESS: {
1176 /* stencil_pass = mask & (reference > s) */
1177 /* There's no convenient Compare Less Than Immediate instruction, so
1178 * we'll have to do this one the harder way, by loading a register and
1179 * comparing directly. Compare Logical Greater Than Word (clgt)
1180 * treats its operands as unsigned - no sign extension.
1181 */
1182 unsigned int tmp_reg = spe_allocate_available_register(f);
1183 spe_load_uint(f, tmp_reg, state->ref_value);
1184 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1185 spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
1186 spe_release_register(f, tmp_reg);
1187 break;
1188 }
1189
1190 case PIPE_FUNC_LEQUAL:
1191 /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
1192 spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
1193 spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
1194 break;
1195
1196 case PIPE_FUNC_GEQUAL: {
1197 /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
1198 /* As above, we have to do this by loading a register */
1199 unsigned int tmp_reg = spe_allocate_available_register(f);
1200 spe_load_uint(f, tmp_reg, state->ref_value);
1201 spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
1202 spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
1203 spe_release_register(f, tmp_reg);
1204 break;
1205 }
1206
1207 case PIPE_FUNC_NEVER:
1208 /* stencil_pass = mask & 0 = 0 */
1209 spe_load_uint(f, stencil_pass_reg, 0);
1210 spe_move(f, stencil_pass_reg, mask_reg); /* zmask = mask */
1211 break;
1212
1213 case PIPE_FUNC_ALWAYS:
1214 /* stencil_pass = mask & 1 = mask */
1215 spe_move(f, stencil_pass_reg, mask_reg);
1216 break;
1217 }
1218
1219 /* The fragments that passed the stencil test are now in stencil_pass_reg.
1220 * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
1221 */
1222 }
1223
1224 /* This function generates code that calculates a set of new stencil values
1225 * given the earlier values and the operation to apply. It does not
1226 * apply any tests. It is intended to be called up to 3 times
1227 * (for the stencil fail operation, for the stencil pass-z fail operation,
1228 * and for the stencil pass-z pass operation) to collect up to three
1229 * possible sets of values, and for the caller to combine them based
1230 * on the result of the tests.
1231 *
1232 * stencil_max_value should be (2^n - 1) where n is the number of bits
1233 * in the stencil buffer - in other words, it should be usable as a mask.
1234 */
1235 static void
1236 gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
1237 unsigned int stencil_ref_value, unsigned int stencil_max_value,
1238 unsigned int fbS_reg, unsigned int newS_reg)
1239 {
1240 /* The code below assumes that newS_reg and fbS_reg are not the same
1241 * register; if they can be, the calculations below will have to use
1242 * an additional temporary register. For now, mark the assumption
1243 * with an assertion that will fail if they are the same.
1244 */
1245 ASSERT(fbS_reg != newS_reg);
1246
1247 /* The code also assumes the the stencil_max_value is of the form
1248 * 2^n-1 and can therefore be used as a mask for the valid bits in
1249 * addition to a maximum. Make sure this is the case as well.
1250 * The clever math below exploits the fact that incrementing a
1251 * binary number serves to flip all the bits of a number starting at
1252 * the LSB and continuing to (and including) the first zero bit
1253 * found. That means that a number and its increment will always
1254 * have at least one bit in common (the high order bit, if nothing
1255 * else) *unless* the number is zero, *or* the number is of a form
1256 * consisting of some number of 1s in the low-order bits followed
1257 * by nothing but 0s in the high-order bits. The latter case
1258 * implies it's of the form 2^n-1.
1259 */
1260 ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
1261
1262 switch(stencil_op) {
1263 case PIPE_STENCIL_OP_KEEP:
1264 /* newS = S */
1265 spe_move(f, newS_reg, fbS_reg);
1266 break;
1267
1268 case PIPE_STENCIL_OP_ZERO:
1269 /* newS = 0 */
1270 spe_zero(f, newS_reg);
1271 break;
1272
1273 case PIPE_STENCIL_OP_REPLACE:
1274 /* newS = stencil reference value */
1275 spe_load_uint(f, newS_reg, stencil_ref_value);
1276 break;
1277
1278 case PIPE_STENCIL_OP_INCR: {
1279 /* newS = (s == max ? max : s + 1) */
1280 unsigned int equals_reg = spe_allocate_available_register(f);
1281
1282 spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
1283 /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
1284 spe_ai(f, newS_reg, fbS_reg, 1);
1285 /* Select from the current value or the new value based on the equality test */
1286 spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
1287
1288 spe_release_register(f, equals_reg);
1289 break;
1290 }
1291 case PIPE_STENCIL_OP_DECR: {
1292 /* newS = (s == 0 ? 0 : s - 1) */
1293 unsigned int equals_reg = spe_allocate_available_register(f);
1294
1295 spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
1296 /* Add Word Immediate with a (-1) value works */
1297 spe_ai(f, newS_reg, fbS_reg, -1);
1298 /* Select from the current value or the new value based on the equality test */
1299 spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
1300
1301 spe_release_register(f, equals_reg);
1302 break;
1303 }
1304 case PIPE_STENCIL_OP_INCR_WRAP:
1305 /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
1306 * do a normal add and mask off the correct bits
1307 */
1308 spe_ai(f, newS_reg, fbS_reg, 1);
1309 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1310 break;
1311
1312 case PIPE_STENCIL_OP_DECR_WRAP:
1313 /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
1314 spe_ai(f, newS_reg, fbS_reg, -1);
1315 spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
1316 break;
1317
1318 case PIPE_STENCIL_OP_INVERT:
1319 /* newS = ~s. We take advantage of the mask/max value to invert only
1320 * the valid bits for the field so we don't have to do an extra "and".
1321 */
1322 spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
1323 break;
1324
1325 default:
1326 ASSERT(0);
1327 }
1328 }
1329
1330
1331 /* This function generates code to get all the necessary possible
1332 * stencil values. For each of the output registers (fail_reg,
1333 * zfail_reg, and zpass_reg), it either allocates a new register
1334 * and calculates a new set of values based on the stencil operation,
1335 * or it reuses a register allocation and calculation done for an
1336 * earlier (matching) operation, or it reuses the fbS_reg register
1337 * (if the stencil operation is KEEP, which doesn't change the
1338 * stencil buffer).
1339 *
1340 * Since this function allocates a variable number of registers,
1341 * to avoid incurring complex logic to free them, they should
1342 * be allocated after a spe_allocate_register_set() call
1343 * and released by the corresponding spe_release_register_set() call.
1344 */
1345 static void
1346 gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
1347 unsigned int fbS_reg,
1348 unsigned int *fail_reg, unsigned int *zfail_reg,
1349 unsigned int *zpass_reg, unsigned int *back_fail_reg,
1350 unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
1351 {
1352 unsigned zfail_op, back_zfail_op;
1353
1354 /* Stenciling had better be enabled here */
1355 ASSERT(dsa->stencil[0].enabled);
1356
1357 /* If the depth test is not enabled, it is treated as though it always
1358 * passes. In particular, that means that the "zfail_op" (and the backfacing
1359 * counterpart, if active) are not considered - a failing stencil test will
1360 * trigger the "fail_op", and a passing stencil test will trigger the
1361 * "zpass_op".
1362 *
1363 * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
1364 * we keep them from being calculated.
1365 */
1366 if (dsa->depth.enabled) {
1367 zfail_op = dsa->stencil[0].zfail_op;
1368 back_zfail_op = dsa->stencil[1].zfail_op;
1369 }
1370 else {
1371 zfail_op = PIPE_STENCIL_OP_KEEP;
1372 back_zfail_op = PIPE_STENCIL_OP_KEEP;
1373 }
1374
1375 /* One-sided or front-facing stencil */
1376 if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
1377 *fail_reg = fbS_reg;
1378 }
1379 else {
1380 *fail_reg = spe_allocate_available_register(f);
1381 gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value,
1382 0xff, fbS_reg, *fail_reg);
1383 }
1384
1385 if (zfail_op == PIPE_STENCIL_OP_KEEP) {
1386 *zfail_reg = fbS_reg;
1387 }
1388 else if (zfail_op == dsa->stencil[0].fail_op) {
1389 *zfail_reg = *fail_reg;
1390 }
1391 else {
1392 *zfail_reg = spe_allocate_available_register(f);
1393 gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value,
1394 0xff, fbS_reg, *zfail_reg);
1395 }
1396
1397 if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
1398 *zpass_reg = fbS_reg;
1399 }
1400 else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
1401 *zpass_reg = *fail_reg;
1402 }
1403 else if (dsa->stencil[0].zpass_op == zfail_op) {
1404 *zpass_reg = *zfail_reg;
1405 }
1406 else {
1407 *zpass_reg = spe_allocate_available_register(f);
1408 gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value,
1409 0xff, fbS_reg, *zpass_reg);
1410 }
1411
1412 /* If two-sided stencil is enabled, we have more work to do. */
1413 if (!dsa->stencil[1].enabled) {
1414 /* This just flags that the registers need not be deallocated later */
1415 *back_fail_reg = fbS_reg;
1416 *back_zfail_reg = fbS_reg;
1417 *back_zpass_reg = fbS_reg;
1418 }
1419 else {
1420 /* Same calculations as above, but for the back stencil */
1421 if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
1422 *back_fail_reg = fbS_reg;
1423 }
1424 else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
1425 *back_fail_reg = *fail_reg;
1426 }
1427 else if (dsa->stencil[1].fail_op == zfail_op) {
1428 *back_fail_reg = *zfail_reg;
1429 }
1430 else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
1431 *back_fail_reg = *zpass_reg;
1432 }
1433 else {
1434 *back_fail_reg = spe_allocate_available_register(f);
1435 gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value,
1436 0xff, fbS_reg, *back_fail_reg);
1437 }
1438
1439 if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
1440 *back_zfail_reg = fbS_reg;
1441 }
1442 else if (back_zfail_op == dsa->stencil[0].fail_op) {
1443 *back_zfail_reg = *fail_reg;
1444 }
1445 else if (back_zfail_op == zfail_op) {
1446 *back_zfail_reg = *zfail_reg;
1447 }
1448 else if (back_zfail_op == dsa->stencil[0].zpass_op) {
1449 *back_zfail_reg = *zpass_reg;
1450 }
1451 else if (back_zfail_op == dsa->stencil[1].fail_op) {
1452 *back_zfail_reg = *back_fail_reg;
1453 }
1454 else {
1455 *back_zfail_reg = spe_allocate_available_register(f);
1456 gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value,
1457 0xff, fbS_reg, *back_zfail_reg);
1458 }
1459
1460 if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
1461 *back_zpass_reg = fbS_reg;
1462 }
1463 else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
1464 *back_zpass_reg = *fail_reg;
1465 }
1466 else if (dsa->stencil[1].zpass_op == zfail_op) {
1467 *back_zpass_reg = *zfail_reg;
1468 }
1469 else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
1470 *back_zpass_reg = *zpass_reg;
1471 }
1472 else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
1473 *back_zpass_reg = *back_fail_reg;
1474 }
1475 else if (dsa->stencil[1].zpass_op == back_zfail_op) {
1476 *back_zpass_reg = *back_zfail_reg;
1477 }
1478 else {
1479 *back_zfail_reg = spe_allocate_available_register(f);
1480 gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value,
1481 0xff, fbS_reg, *back_zpass_reg);
1482 }
1483 } /* End of calculations for back-facing stencil */
1484 }
1485
1486 static boolean
1487 gen_stencil_depth_test(struct spe_function *f,
1488 const struct pipe_depth_stencil_alpha_state *dsa,
1489 const int const facing_reg,
1490 const int mask_reg, const int fragZ_reg,
1491 const int fbZ_reg, const int fbS_reg)
1492 {
1493 /* True if we've generated code that could require writeback to the
1494 * depth and/or stencil buffers
1495 */
1496 boolean modified_buffers = false;
1497
1498 boolean need_to_calculate_stencil_values;
1499 boolean need_to_writemask_stencil_values;
1500
1501 /* Registers. We may or may not actually allocate these, depending
1502 * on whether the state values indicate that we need them.
1503 */
1504 unsigned int stencil_pass_reg, stencil_fail_reg;
1505 unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
1506 unsigned int stencil_writemask_reg;
1507 unsigned int zmask_reg;
1508 unsigned int newS_reg;
1509
1510 /* Stenciling is quite complex: up to six different configurable stencil
1511 * operations/calculations can be required (three each for front-facing
1512 * and back-facing fragments). Many of those operations will likely
1513 * be identical, so there's good reason to try to avoid calculating
1514 * the same values more than once (which unfortunately makes the code less
1515 * straightforward).
1516 *
1517 * To make register management easier, we start a new
1518 * register set; we can release all the registers in the set at
1519 * once, and avoid having to keep track of exactly which registers
1520 * we allocate. We can still allocate and free registers as
1521 * desired (if we know we no longer need a register), but we don't
1522 * have to spend the complexity to track the more difficult variant
1523 * register usage scenarios.
1524 */
1525 spe_allocate_register_set(f);
1526
1527 /* Calculate the writemask. If the writemask is trivial (either
1528 * all 0s, meaning that we don't need to calculate any stencil values
1529 * because they're not going to change the stencil anyway, or all 1s,
1530 * meaning that we have to calculate the stencil values but do not
1531 * need to mask them), we can avoid generating code. Don't forget
1532 * that we need to consider backfacing stencil, if enabled.
1533 */
1534 if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
1535 /* Trivial: don't need to calculate stencil values, and don't need to
1536 * write them back to the framebuffer.
1537 */
1538 need_to_calculate_stencil_values = false;
1539 need_to_writemask_stencil_values = false;
1540 }
1541 else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
1542 /* Still trivial, but a little less so. We need to write the stencil
1543 * values, but we don't need to mask them.
1544 */
1545 need_to_calculate_stencil_values = true;
1546 need_to_writemask_stencil_values = false;
1547 }
1548 else {
1549 /* The general case: calculate, mask, and write */
1550 need_to_calculate_stencil_values = true;
1551 need_to_writemask_stencil_values = true;
1552
1553 /* While we're here, generate code that calculates what the
1554 * writemask should be. If backface stenciling is enabled,
1555 * and the backface writemask is not the same as the frontface
1556 * writemask, we'll have to generate code that merges the
1557 * two masks into a single effective mask based on fragment facing.
1558 */
1559 stencil_writemask_reg = spe_allocate_available_register(f);
1560 spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
1561 if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
1562 unsigned int back_write_mask_reg = spe_allocate_available_register(f);
1563 spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
1564 spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
1565 spe_release_register(f, back_write_mask_reg);
1566 }
1567 }
1568
1569 /* At least one-sided stenciling must be on. Generate code that
1570 * runs the stencil test on the basic/front-facing stencil, leaving
1571 * the mask of passing stencil bits in stencil_pass_reg. This mask will
1572 * be used both to mask the set of active pixels, and also to
1573 * determine how the stencil buffer changes.
1574 *
1575 * This test will *not* change the value in mask_reg (because we don't
1576 * yet know whether to apply the two-sided stencil or one-sided stencil).
1577 */
1578 stencil_pass_reg = spe_allocate_available_register(f);
1579 gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
1580
1581 /* If two-sided stenciling is on, generate code to run the stencil
1582 * test on the backfacing stencil as well, and combine the two results
1583 * into the one correct result based on facing.
1584 */
1585 if (dsa->stencil[1].enabled) {
1586 unsigned int temp_reg = spe_allocate_available_register(f);
1587 gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
1588 spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
1589 spe_release_register(f, temp_reg);
1590 }
1591
1592 /* Generate code that, given the mask of valid fragments and the
1593 * mask of valid fragments that passed the stencil test, computes
1594 * the mask of valid fragments that failed the stencil test. We
1595 * have to do this before we run a depth test (because the
1596 * depth test should not be performed on fragments that failed the
1597 * stencil test, and because the depth test will update the
1598 * mask of valid fragments based on the results of the depth test).
1599 */
1600 stencil_fail_reg = spe_allocate_available_register(f);
1601 spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
1602 /* Now remove the stenciled-out pixels from the valid fragment mask,
1603 * so we can later use the valid fragment mask in the depth test.
1604 */
1605 spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
1606
1607 /* We may not need to calculate stencil values, if the writemask is off */
1608 if (need_to_calculate_stencil_values) {
1609 unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
1610 unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
1611
1612 /* Generate code that calculates exactly which stencil values we need,
1613 * without calculating the same value twice (say, if two different
1614 * stencil ops have the same value). This code will work for one-sided
1615 * and two-sided stenciling (so that we take into account that operations
1616 * may match between front and back stencils), and will also take into
1617 * account whether the depth test is enabled (if the depth test is off,
1618 * we don't need any of the zfail results, because the depth test always
1619 * is considered to pass if it is disabled). Any register value that
1620 * does not need to be calculated will come back with the same value
1621 * that's in fbS_reg.
1622 *
1623 * This function will allocate a variant number of registers that
1624 * will be released as part of the register set.
1625 */
1626 gen_get_stencil_values(f, dsa, fbS_reg,
1627 &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
1628 &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
1629 &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
1630
1631 /* Tricky, tricky, tricky - the things we do to create optimal
1632 * code...
1633 *
1634 * The various stencil values registers may overlap with each other
1635 * and with fbS_reg arbitrarily (as any particular operation is
1636 * only calculated once and stored in one register, no matter
1637 * how many times it is used). So we can't change the values
1638 * within those registers directly - if we change a value in a
1639 * register that's being referenced by two different calculations,
1640 * we've just unwittingly changed the second value as well...
1641 *
1642 * Avoid this by allocating new registers to hold the results
1643 * (there may be 2, if the depth test is off, or 3, if it is on).
1644 * These will be released as part of the register set.
1645 */
1646 if (!dsa->stencil[1].enabled) {
1647 /* The easy case: if two-sided stenciling is *not* enabled, we
1648 * just use the front-sided values.
1649 */
1650 stencil_fail_values = front_stencil_fail_values;
1651 stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
1652 stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
1653 }
1654 else { /* two-sided stencil enabled */
1655 /* Allocate new registers for the needed merged values */
1656 stencil_fail_values = spe_allocate_available_register(f);
1657 spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
1658 if (dsa->depth.enabled) {
1659 stencil_pass_depth_fail_values = spe_allocate_available_register(f);
1660 spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
1661 }
1662 else {
1663 stencil_pass_depth_fail_values = fbS_reg;
1664 }
1665 stencil_pass_depth_pass_values = spe_allocate_available_register(f);
1666 spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
1667 }
1668 }
1669
1670 /* We now have all the stencil values we need. We also need
1671 * the results of the depth test to figure out which
1672 * stencil values will become the new stencil values. (Even if
1673 * we aren't actually calculating stencil values, we need to apply
1674 * the depth test if it's enabled.)
1675 *
1676 * The code generated by gen_depth_test() returns the results of the
1677 * test in the given register, but also alters the mask_reg based
1678 * on the results of the test.
1679 */
1680 if (dsa->depth.enabled) {
1681 zmask_reg = spe_allocate_available_register(f);
1682 modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
1683 }
1684
1685 if (need_to_calculate_stencil_values) {
1686 /* If we need to writemask the stencil values before going into
1687 * the stencil buffer, we'll have to use a new register to
1688 * hold the new values. If not, we can just keep using the
1689 * current register.
1690 */
1691 if (need_to_writemask_stencil_values) {
1692 newS_reg = spe_allocate_available_register(f);
1693 spe_move(f, newS_reg, fbS_reg);
1694 modified_buffers = true;
1695 }
1696 else {
1697 newS_reg = fbS_reg;
1698 }
1699
1700 /* Merge in the selected stencil fail values */
1701 if (stencil_fail_values != fbS_reg) {
1702 spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
1703 }
1704
1705 /* Same for the stencil pass/depth fail values. If this calculation
1706 * is not needed (say, if depth test is off), then the
1707 * stencil_pass_depth_fail_values register will be equal to fbS_reg
1708 * and we'll skip the calculation.
1709 */
1710 if (stencil_pass_depth_fail_values != fbS_reg) {
1711 /* We don't actually have a stencil pass/depth fail mask yet.
1712 * Calculate it here from the stencil passing mask and the
1713 * depth passing mask. Note that zmask_reg *must* have been
1714 * set above if we're here.
1715 */
1716 unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
1717 spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
1718
1719 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
1720
1721 spe_release_register(f, stencil_pass_depth_fail_mask);
1722 }
1723
1724 /* Same for the stencil pass/depth pass mask */
1725 if (stencil_pass_depth_pass_values != fbS_reg) {
1726 unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
1727 spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
1728
1729 spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
1730 spe_release_register(f, stencil_pass_depth_pass_mask);
1731 }
1732
1733 /* Almost done. If we need to writemask, do it now, leaving the
1734 * results in the fbS_reg register passed in. If we don't need
1735 * to writemask, then the results are *already* in the fbS_reg,
1736 * so there's nothing more to do.
1737 */
1738
1739 if (need_to_writemask_stencil_values) {
1740 /* The Select Bytes command makes a fine writemask. Where
1741 * the mask is 0, the first (original) values are retained,
1742 * effectively masking out changes. Where the mask is 1, the
1743 * second (new) values are retained, incorporating changes.
1744 */
1745 spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
1746 }
1747 } /* done calculating stencil values */
1748
1749 /* The stencil and/or depth values have been applied, and the
1750 * mask_reg, fbS_reg, and fbZ_reg values have been updated.
1751 * We're all done, except that we've allocated a fair number
1752 * of registers that we didn't bother tracking. Release all
1753 * those registers as part of the register set, and go home.
1754 */
1755 spe_release_register_set(f);
1756
1757 /* Return true if we could have modified the stencil and/or
1758 * depth buffers.
1759 */
1760 return modified_buffers;
1761 }
1762
1763
1764 /**
1765 * Generate SPE code to implement the fragment operations (alpha test,
1766 * depth test, stencil test, blending, colormask, and final
1767 * framebuffer write) as specified by the current context state.
1768 *
1769 * Logically, this code will be called after running the fragment
1770 * shader. But under some circumstances we could run some of this
1771 * code before the fragment shader to cull fragments/quads that are
1772 * totally occluded/discarded.
1773 *
1774 * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
1775 *
1776 * See the spu_default_fragment_ops() function to see how the per-fragment
1777 * operations would be done with ordinary C code.
1778 * The code we generate here though has no branches, is SIMD, etc and
1779 * should be much faster.
1780 *
1781 * \param cell the rendering context (in)
1782 * \param f the generated function (out)
1783 */
1784 void
1785 cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
1786 {
1787 const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
1788 const struct pipe_blend_state *blend = cell->blend;
1789 const struct pipe_blend_color *blend_color = &cell->blend_color;
1790 const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
1791
1792 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
1793 const int x_reg = 3; /* uint */
1794 const int y_reg = 4; /* uint */
1795 const int color_tile_reg = 5; /* tile_t * */
1796 const int depth_tile_reg = 6; /* tile_t * */
1797 const int fragZ_reg = 7; /* vector float */
1798 const int fragR_reg = 8; /* vector float */
1799 const int fragG_reg = 9; /* vector float */
1800 const int fragB_reg = 10; /* vector float */
1801 const int fragA_reg = 11; /* vector float */
1802 const int mask_reg = 12; /* vector uint */
1803 const int facing_reg = 13; /* uint */
1804
1805 /* offset of quad from start of tile
1806 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
1807 */
1808 int quad_offset_reg;
1809
1810 int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
1811 int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
1812
1813 spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
1814
1815 if (cell->debug_flags & CELL_DEBUG_ASM) {
1816 spe_print_code(f, true);
1817 spe_indent(f, 8);
1818 spe_comment(f, -4, "Begin per-fragment ops");
1819 }
1820
1821 spe_allocate_register(f, x_reg);
1822 spe_allocate_register(f, y_reg);
1823 spe_allocate_register(f, color_tile_reg);
1824 spe_allocate_register(f, depth_tile_reg);
1825 spe_allocate_register(f, fragZ_reg);
1826 spe_allocate_register(f, fragR_reg);
1827 spe_allocate_register(f, fragG_reg);
1828 spe_allocate_register(f, fragB_reg);
1829 spe_allocate_register(f, fragA_reg);
1830 spe_allocate_register(f, mask_reg);
1831 spe_allocate_register(f, facing_reg);
1832
1833 quad_offset_reg = spe_allocate_available_register(f);
1834 fbRGBA_reg = spe_allocate_available_register(f);
1835 fbZS_reg = spe_allocate_available_register(f);
1836
1837 /* compute offset of quad from start of tile, in bytes */
1838 {
1839 int x2_reg = spe_allocate_available_register(f);
1840 int y2_reg = spe_allocate_available_register(f);
1841
1842 ASSERT(TILE_SIZE == 32);
1843
1844 spe_comment(f, 0, "Compute quad offset within tile");
1845 spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
1846 spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
1847 spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
1848 spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
1849 spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
1850
1851 spe_release_register(f, x2_reg);
1852 spe_release_register(f, y2_reg);
1853 }
1854
1855 if (dsa->alpha.enabled) {
1856 gen_alpha_test(dsa, f, mask_reg, fragA_reg);
1857 }
1858
1859 /* If we need the stencil buffers (because one- or two-sided stencil is
1860 * enabled) or the depth buffer (because the depth test is enabled),
1861 * go grab them. Note that if either one- or two-sided stencil is
1862 * enabled, dsa->stencil[0].enabled will be true.
1863 */
1864 if (dsa->depth.enabled || dsa->stencil[0].enabled) {
1865 const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
1866 boolean write_depth_stencil;
1867
1868 /* We may or may not need to allocate a register for Z or stencil values */
1869 boolean fbS_reg_set = false, fbZ_reg_set = false;
1870 unsigned int fbS_reg, fbZ_reg = 0;
1871
1872 spe_comment(f, 0, "Fetch quad's Z/stencil values from tile");
1873
1874 /* fetch quad of depth/stencil values from tile at (x,y) */
1875 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1876 /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
1877 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1878
1879 /* From the Z/stencil buffer format, pull out the bits we need for
1880 * Z and/or stencil. We'll also convert the incoming fragment Z
1881 * value in fragZ_reg from a floating point value in [0.0..1.0] to
1882 * an unsigned integer value with the appropriate resolution.
1883 */
1884 switch(zs_format) {
1885
1886 case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
1887 case PIPE_FORMAT_X8Z24_UNORM:
1888 if (dsa->depth.enabled) {
1889 /* We need the Z part at least */
1890 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1891 /* four 24-bit Z values in the low-order bits */
1892 spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
1893
1894 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1895 * to a 24-bit unsigned integer
1896 */
1897 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1898 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1899 }
1900 if (dsa->stencil[0].enabled) {
1901 setup_optional_register(f, &fbS_reg_set, &fbS_reg);
1902 /* four 8-bit Z values in the high-order bits */
1903 spe_rotmi(f, fbS_reg, fbZS_reg, -24);
1904 }
1905 break;
1906
1907 case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
1908 case PIPE_FORMAT_Z24X8_UNORM:
1909 if (dsa->depth.enabled) {
1910 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1911 /* shift by 8 to get the upper 24-bit values */
1912 spe_rotmi(f, fbS_reg, fbZS_reg, -8);
1913
1914 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1915 * to a 24-bit unsigned integer
1916 */
1917 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1918 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1919 }
1920 if (dsa->stencil[0].enabled) {
1921 setup_optional_register(f, &fbS_reg_set, &fbS_reg);
1922 /* 8-bit stencil in the low-order bits - mask them out */
1923 spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
1924 }
1925 break;
1926
1927 case PIPE_FORMAT_Z32_UNORM:
1928 if (dsa->depth.enabled) {
1929 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1930 /* Copy over 4 32-bit values */
1931 spe_move(f, fbZ_reg, fbZS_reg);
1932
1933 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1934 * to a 32-bit unsigned integer
1935 */
1936 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1937 }
1938 /* No stencil, so can't do anything there */
1939 break;
1940
1941 case PIPE_FORMAT_Z16_UNORM:
1942 if (dsa->depth.enabled) {
1943 /* XXX Not sure this is correct, but it was here before, so we're
1944 * going with it for now
1945 */
1946 setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
1947 /* Copy over 4 32-bit values */
1948 spe_move(f, fbZ_reg, fbZS_reg);
1949
1950 /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
1951 * to a 16-bit unsigned integer
1952 */
1953 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1954 spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
1955 }
1956 /* No stencil */
1957 break;
1958
1959 default:
1960 ASSERT(0); /* invalid format */
1961 }
1962
1963 /* If stencil is enabled, use the stencil-specific code
1964 * generator to generate both the stencil and depth (if needed)
1965 * tests. Otherwise, if only depth is enabled, generate
1966 * a quick depth test. The test generators themselves will
1967 * report back whether the depth/stencil buffer has to be
1968 * written back.
1969 */
1970 if (dsa->stencil[0].enabled) {
1971 /* This will perform the stencil and depth tests, and update
1972 * the mask_reg, fbZ_reg, and fbS_reg as required by the
1973 * tests.
1974 */
1975 ASSERT(fbS_reg_set);
1976 ASSERT(fbZ_reg_set);
1977 spe_comment(f, 0, "Perform stencil test");
1978
1979 write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
1980 }
1981 else if (dsa->depth.enabled) {
1982 int zmask_reg = spe_allocate_available_register(f);
1983 spe_comment(f, 0, "Perform depth test");
1984 write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
1985 spe_release_register(f, zmask_reg);
1986 }
1987 else {
1988 write_depth_stencil = false;
1989 }
1990
1991 if (write_depth_stencil) {
1992 /* Merge latest Z and Stencil values into fbZS_reg.
1993 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
1994 * fbS_reg has four 8-bit Z values in bits [7..0].
1995 */
1996 spe_comment(f, 0, "Store quad's depth/stencil values in tile");
1997 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
1998 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
1999 if (fbS_reg_set) {
2000 spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
2001 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
2002 }
2003 else {
2004 spe_move(f, fbZS_reg, fbZ_reg);
2005 }
2006 }
2007 else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
2008 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
2009 spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
2010 if (fbS_reg_set) {
2011 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
2012 }
2013 }
2014 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
2015 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
2016 }
2017 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
2018 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
2019 }
2020 else if (zs_format == PIPE_FORMAT_S8_UNORM) {
2021 ASSERT(0); /* XXX to do */
2022 }
2023 else {
2024 ASSERT(0); /* bad zs_format */
2025 }
2026
2027 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
2028 spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
2029 }
2030
2031 release_optional_register(f, &fbZ_reg_set, fbZ_reg);
2032 release_optional_register(f, &fbS_reg_set, fbS_reg);
2033 }
2034
2035 /* Get framebuffer quad/colors. We'll need these for blending,
2036 * color masking, and to obey the quad/pixel mask.
2037 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
2038 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
2039 * we could skip this load.
2040 */
2041 spe_comment(f, 0, "Fetch quad colors from tile");
2042 spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
2043
2044 if (blend->blend_enable) {
2045 spe_comment(f, 0, "Perform blending");
2046 gen_blend(blend, blend_color, f, color_format,
2047 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
2048 }
2049
2050 /*
2051 * Write fragment colors to framebuffer/tile.
2052 * This involves converting the fragment colors from float[4] to the
2053 * tile's specific format and obeying the quad/pixel mask.
2054 */
2055 {
2056 int rgba_reg = spe_allocate_available_register(f);
2057
2058 /* Pack four float colors as four 32-bit int colors */
2059 spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
2060 gen_pack_colors(f, color_format,
2061 fragR_reg, fragG_reg, fragB_reg, fragA_reg,
2062 rgba_reg);
2063
2064 if (blend->logicop_enable) {
2065 spe_comment(f, 0, "Compute logic op");
2066 gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
2067 }
2068
2069 if (blend->colormask != PIPE_MASK_RGBA) {
2070 spe_comment(f, 0, "Compute color mask");
2071 gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
2072 }
2073
2074 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
2075 * if (mask[i])
2076 * rgba[i] = rgba[i];
2077 * else
2078 * rgba[i] = framebuffer[i];
2079 */
2080 spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
2081
2082 /* Store updated quad in tile:
2083 * memory[color_tile + quad_offset] = rgba_reg;
2084 */
2085 spe_comment(f, 0, "Store quad colors into color tile");
2086 spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
2087
2088 spe_release_register(f, rgba_reg);
2089 }
2090
2091 //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
2092
2093 spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
2094
2095 spe_release_register(f, fbRGBA_reg);
2096 spe_release_register(f, fbZS_reg);
2097 spe_release_register(f, quad_offset_reg);
2098
2099 if (cell->debug_flags & CELL_DEBUG_ASM) {
2100 spe_comment(f, -4, "End per-fragment ops");
2101 }
2102 }