cell: remove unneeded blend/depth_stencil subclasses
[mesa.git] / src / gallium / drivers / cell / ppu / cell_gen_fragment.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29
30 /**
31 * Generate SPU per-fragment code (actually per-quad code).
32 * \author Brian Paul
33 */
34
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "rtasm/rtasm_ppc_spe.h"
39 #include "cell_context.h"
40 #include "cell_gen_fragment.h"
41
42
43
44 /** Do extra optimizations? */
45 #define OPTIMIZATIONS 1
46
47
48 /**
49 * Generate SPE code to perform Z/depth testing.
50 *
51 * \param dsa Gallium depth/stencil/alpha state to gen code for
52 * \param f SPE function to append instruction onto.
53 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
54 * \param ifragZ_reg register containing integer fragment Z values (in)
55 * \param ifbZ_reg register containing integer frame buffer Z values (in/out)
56 * \param zmask_reg register containing result of Z test/comparison (out)
57 */
58 static void
59 gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
60 struct spe_function *f,
61 int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
62 {
63 /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
64 * quantities. This only makes a difference for 32-bit Z values though.
65 */
66 ASSERT(dsa->depth.enabled);
67
68 switch (dsa->depth.func) {
69 case PIPE_FUNC_EQUAL:
70 /* zmask = (ifragZ == ref) */
71 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
72 /* mask = (mask & zmask) */
73 spe_and(f, mask_reg, mask_reg, zmask_reg);
74 break;
75
76 case PIPE_FUNC_NOTEQUAL:
77 /* zmask = (ifragZ == ref) */
78 spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
79 /* mask = (mask & ~zmask) */
80 spe_andc(f, mask_reg, mask_reg, zmask_reg);
81 break;
82
83 case PIPE_FUNC_GREATER:
84 /* zmask = (ifragZ > ref) */
85 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
86 /* mask = (mask & zmask) */
87 spe_and(f, mask_reg, mask_reg, zmask_reg);
88 break;
89
90 case PIPE_FUNC_LESS:
91 /* zmask = (ref > ifragZ) */
92 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
93 /* mask = (mask & zmask) */
94 spe_and(f, mask_reg, mask_reg, zmask_reg);
95 break;
96
97 case PIPE_FUNC_LEQUAL:
98 /* zmask = (ifragZ > ref) */
99 spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
100 /* mask = (mask & ~zmask) */
101 spe_andc(f, mask_reg, mask_reg, zmask_reg);
102 break;
103
104 case PIPE_FUNC_GEQUAL:
105 /* zmask = (ref > ifragZ) */
106 spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
107 /* mask = (mask & ~zmask) */
108 spe_andc(f, mask_reg, mask_reg, zmask_reg);
109 break;
110
111 case PIPE_FUNC_NEVER:
112 spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */
113 spe_move(f, zmask_reg, mask_reg); /* zmask = mask */
114 break;
115
116 case PIPE_FUNC_ALWAYS:
117 /* mask unchanged */
118 spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */
119 break;
120
121 default:
122 ASSERT(0);
123 break;
124 }
125
126 if (dsa->depth.writemask) {
127 /*
128 * If (ztest passed) {
129 * framebufferZ = fragmentZ;
130 * }
131 * OR,
132 * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
133 */
134 spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
135 }
136 }
137
138
139 /**
140 * Generate SPE code to perform alpha testing.
141 *
142 * \param dsa Gallium depth/stencil/alpha state to gen code for
143 * \param f SPE function to append instruction onto.
144 * \param mask_reg register containing quad/pixel "alive" mask (in/out)
145 * \param fragA_reg register containing four fragment alpha values (in)
146 */
147 static void
148 gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
149 struct spe_function *f, int mask_reg, int fragA_reg)
150 {
151 int ref_reg = spe_allocate_available_register(f);
152 int amask_reg = spe_allocate_available_register(f);
153
154 ASSERT(dsa->alpha.enabled);
155
156 if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
157 (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
158 /* load/splat the alpha reference float value */
159 spe_load_float(f, ref_reg, dsa->alpha.ref);
160 }
161
162 /* emit code to do the alpha comparison, updating 'mask' */
163 switch (dsa->alpha.func) {
164 case PIPE_FUNC_EQUAL:
165 /* amask = (fragA == ref) */
166 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
167 /* mask = (mask & amask) */
168 spe_and(f, mask_reg, mask_reg, amask_reg);
169 break;
170
171 case PIPE_FUNC_NOTEQUAL:
172 /* amask = (fragA == ref) */
173 spe_fceq(f, amask_reg, fragA_reg, ref_reg);
174 /* mask = (mask & ~amask) */
175 spe_andc(f, mask_reg, mask_reg, amask_reg);
176 break;
177
178 case PIPE_FUNC_GREATER:
179 /* amask = (fragA > ref) */
180 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
181 /* mask = (mask & amask) */
182 spe_and(f, mask_reg, mask_reg, amask_reg);
183 break;
184
185 case PIPE_FUNC_LESS:
186 /* amask = (ref > fragA) */
187 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
188 /* mask = (mask & amask) */
189 spe_and(f, mask_reg, mask_reg, amask_reg);
190 break;
191
192 case PIPE_FUNC_LEQUAL:
193 /* amask = (fragA > ref) */
194 spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
195 /* mask = (mask & ~amask) */
196 spe_andc(f, mask_reg, mask_reg, amask_reg);
197 break;
198
199 case PIPE_FUNC_GEQUAL:
200 /* amask = (ref > fragA) */
201 spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
202 /* mask = (mask & ~amask) */
203 spe_andc(f, mask_reg, mask_reg, amask_reg);
204 break;
205
206 case PIPE_FUNC_NEVER:
207 spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */
208 break;
209
210 case PIPE_FUNC_ALWAYS:
211 /* no-op, mask unchanged */
212 break;
213
214 default:
215 ASSERT(0);
216 break;
217 }
218
219 #if OPTIMIZATIONS
220 /* if mask == {0,0,0,0} we're all done, return */
221 {
222 /* re-use amask reg here */
223 int tmp_reg = amask_reg;
224 /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
225 spe_orx(f, tmp_reg, mask_reg);
226 /* if tmp[0] == 0 then return from function call */
227 spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
228 }
229 #endif
230
231 spe_release_register(f, ref_reg);
232 spe_release_register(f, amask_reg);
233 }
234
235 /* This pair of functions is used inline to allocate and deallocate
236 * optional constant registers. Once a constant is discovered to be
237 * needed, we will likely need it again, so we don't want to deallocate
238 * it and have to allocate and load it again unnecessarily.
239 */
240 static inline void
241 setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
242 {
243 if (*is_already_set) return;
244 *r = spe_allocate_available_register(f);
245 spe_load_float(f, *r, value);
246 *is_already_set = true;
247 }
248
249 static inline void
250 release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
251 {
252 if (!*is_already_set) return;
253 spe_release_register(f, r);
254 *is_already_set = false;
255 }
256
257 /**
258 * Generate SPE code to implement the given blend mode for a quad of pixels.
259 * \param f SPE function to append instruction onto.
260 * \param fragR_reg register with fragment red values (float) (in/out)
261 * \param fragG_reg register with fragment green values (float) (in/out)
262 * \param fragB_reg register with fragment blue values (float) (in/out)
263 * \param fragA_reg register with fragment alpha values (float) (in/out)
264 * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
265 */
266 static void
267 gen_blend(const struct pipe_blend_state *blend,
268 const struct pipe_blend_color *blend_color,
269 struct spe_function *f,
270 enum pipe_format color_format,
271 int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
272 int fbRGBA_reg)
273 {
274 int term1R_reg = spe_allocate_available_register(f);
275 int term1G_reg = spe_allocate_available_register(f);
276 int term1B_reg = spe_allocate_available_register(f);
277 int term1A_reg = spe_allocate_available_register(f);
278
279 int term2R_reg = spe_allocate_available_register(f);
280 int term2G_reg = spe_allocate_available_register(f);
281 int term2B_reg = spe_allocate_available_register(f);
282 int term2A_reg = spe_allocate_available_register(f);
283
284 int fbR_reg = spe_allocate_available_register(f);
285 int fbG_reg = spe_allocate_available_register(f);
286 int fbB_reg = spe_allocate_available_register(f);
287 int fbA_reg = spe_allocate_available_register(f);
288
289 int tmp_reg = spe_allocate_available_register(f);
290
291 /* Optional constant registers we might or might not end up using;
292 * if we do use them, make sure we only allocate them once by
293 * keeping a flag on each one.
294 */
295 boolean one_reg_set = false;
296 unsigned int one_reg;
297 boolean constR_reg_set = false, constG_reg_set = false,
298 constB_reg_set = false, constA_reg_set = false;
299 unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
300
301 ASSERT(blend->blend_enable);
302
303 /* Unpack/convert framebuffer colors from four 32-bit packed colors
304 * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
305 * Each 8-bit color component is expanded into a float in [0.0, 1.0].
306 */
307 {
308 int mask_reg = spe_allocate_available_register(f);
309
310 /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
311 spe_load_int(f, mask_reg, 0xff);
312
313 /* XXX there may be more clever ways to implement the following code */
314 switch (color_format) {
315 case PIPE_FORMAT_A8R8G8B8_UNORM:
316 /* fbB = fbB & mask */
317 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
318 /* mask = mask << 8 */
319 spe_roti(f, mask_reg, mask_reg, 8);
320
321 /* fbG = fbRGBA & mask */
322 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
323 /* fbG = fbG >> 8 */
324 spe_roti(f, fbG_reg, fbG_reg, -8);
325 /* mask = mask << 8 */
326 spe_roti(f, mask_reg, mask_reg, 8);
327
328 /* fbR = fbRGBA & mask */
329 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
330 /* fbR = fbR >> 16 */
331 spe_roti(f, fbR_reg, fbR_reg, -16);
332 /* mask = mask << 8 */
333 spe_roti(f, mask_reg, mask_reg, 8);
334
335 /* fbA = fbRGBA & mask */
336 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
337 /* fbA = fbA >> 24 */
338 spe_roti(f, fbA_reg, fbA_reg, -24);
339 break;
340
341 case PIPE_FORMAT_B8G8R8A8_UNORM:
342 /* fbA = fbA & mask */
343 spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
344 /* mask = mask << 8 */
345 spe_roti(f, mask_reg, mask_reg, 8);
346
347 /* fbR = fbRGBA & mask */
348 spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
349 /* fbR = fbR >> 8 */
350 spe_roti(f, fbR_reg, fbR_reg, -8);
351 /* mask = mask << 8 */
352 spe_roti(f, mask_reg, mask_reg, 8);
353
354 /* fbG = fbRGBA & mask */
355 spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
356 /* fbG = fbG >> 16 */
357 spe_roti(f, fbG_reg, fbG_reg, -16);
358 /* mask = mask << 8 */
359 spe_roti(f, mask_reg, mask_reg, 8);
360
361 /* fbB = fbRGBA & mask */
362 spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
363 /* fbB = fbB >> 24 */
364 spe_roti(f, fbB_reg, fbB_reg, -24);
365 break;
366
367 default:
368 ASSERT(0);
369 }
370
371 /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
372 spe_cuflt(f, fbR_reg, fbR_reg, 8);
373 spe_cuflt(f, fbG_reg, fbG_reg, 8);
374 spe_cuflt(f, fbB_reg, fbB_reg, 8);
375 spe_cuflt(f, fbA_reg, fbA_reg, 8);
376
377 spe_release_register(f, mask_reg);
378 }
379
380 /*
381 * Compute Src RGB terms. We're actually looking for the value
382 * of (the appropriate RGB factors) * (the incoming source RGB color),
383 * because in some cases (like PIPE_BLENDFACTOR_ONE and
384 * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
385 */
386 switch (blend->rgb_src_factor) {
387 case PIPE_BLENDFACTOR_ONE:
388 /* factors = (1,1,1), so term = (R,G,B) */
389 spe_move(f, term1R_reg, fragR_reg);
390 spe_move(f, term1G_reg, fragG_reg);
391 spe_move(f, term1B_reg, fragB_reg);
392 break;
393 case PIPE_BLENDFACTOR_ZERO:
394 /* factors = (0,0,0), so term = (0,0,0) */
395 spe_load_float(f, term1R_reg, 0.0f);
396 spe_load_float(f, term1G_reg, 0.0f);
397 spe_load_float(f, term1B_reg, 0.0f);
398 break;
399 case PIPE_BLENDFACTOR_SRC_COLOR:
400 /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
401 spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
402 spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
403 spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
404 break;
405 case PIPE_BLENDFACTOR_SRC_ALPHA:
406 /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
407 spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
408 spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
409 spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
410 break;
411 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
412 /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))
413 * or in other words term = (R-R*R, G-G*G, B-B*B)
414 * fnms(a,b,c,d) computes a = d - b*c
415 */
416 spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
417 spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
418 spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
419 break;
420 case PIPE_BLENDFACTOR_DST_COLOR:
421 /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
422 spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
423 spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
424 spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
425 break;
426 case PIPE_BLENDFACTOR_INV_DST_COLOR:
427 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
428 * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
429 * fnms(a,b,c,d) computes a = d - b*c
430 */
431 spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
432 spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
433 spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
434 break;
435 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
436 /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
437 * or term = (R-R*A,G-G*A,B-B*A)
438 * fnms(a,b,c,d) computes a = d - b*c
439 */
440 spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
441 spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
442 spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
443 break;
444 case PIPE_BLENDFACTOR_DST_ALPHA:
445 /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
446 spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
447 spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
448 spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
449 break;
450 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
451 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))
452 * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
453 * fnms(a,b,c,d) computes a = d - b*c
454 */
455 spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
456 spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
457 spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
458 break;
459 case PIPE_BLENDFACTOR_CONST_COLOR:
460 /* We need the optional constant color registers */
461 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
462 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
463 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
464 /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
465 spe_fm(f, term1R_reg, fragR_reg, constR_reg);
466 spe_fm(f, term1G_reg, fragG_reg, constG_reg);
467 spe_fm(f, term1B_reg, fragB_reg, constB_reg);
468 break;
469 case PIPE_BLENDFACTOR_CONST_ALPHA:
470 /* we'll need the optional constant alpha register */
471 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
472 /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
473 spe_fm(f, term1R_reg, fragR_reg, constA_reg);
474 spe_fm(f, term1G_reg, fragG_reg, constA_reg);
475 spe_fm(f, term1B_reg, fragB_reg, constA_reg);
476 break;
477 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
478 /* We need the optional constant color registers */
479 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
480 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
481 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
482 /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
483 * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
484 * fnms(a,b,c,d) computes a = d - b*c
485 */
486 spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
487 spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
488 spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
489 break;
490 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
491 /* We need the optional constant color registers */
492 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
493 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
494 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
495 /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
496 * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
497 * fnms(a,b,c,d) computes a = d - b*c
498 */
499 spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
500 spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
501 spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
502 break;
503 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
504 /* We'll need the optional {1,1,1,1} register */
505 setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
506 /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
507 * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
508 * We could expand the term (as a*min(b,c) == min(a*b,a*c)
509 * as long as a is positive), but then we'd have to do three
510 * spe_float_min() functions instead of one, so this is simpler.
511 */
512 /* tmp = 1 - Afb */
513 spe_fs(f, tmp_reg, one_reg, fbA_reg);
514 /* tmp = min(A,tmp) */
515 spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
516 /* term = R*tmp */
517 spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
518 spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
519 spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
520 break;
521
522 /* These are special D3D cases involving a second color output
523 * from the fragment shader. I'm not sure we can support them
524 * yet... XXX
525 */
526 case PIPE_BLENDFACTOR_SRC1_COLOR:
527 case PIPE_BLENDFACTOR_SRC1_ALPHA:
528 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
529 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
530
531 default:
532 ASSERT(0);
533 }
534
535 /*
536 * Compute Src Alpha term. Like the above, we're looking for
537 * the full term A*factor, not just the factor itself, because
538 * in many cases we can avoid doing unnecessary multiplies.
539 */
540 switch (blend->alpha_src_factor) {
541 case PIPE_BLENDFACTOR_ZERO:
542 /* factor = 0, so term = 0 */
543 spe_load_float(f, term1A_reg, 0.0f);
544 break;
545
546 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
547 case PIPE_BLENDFACTOR_ONE:
548 /* factor = 1, so term = A */
549 spe_move(f, term1A_reg, fragA_reg);
550 break;
551
552 case PIPE_BLENDFACTOR_SRC_COLOR:
553 /* factor = A, so term = A*A */
554 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
555 break;
556 case PIPE_BLENDFACTOR_SRC_ALPHA:
557 spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
558 break;
559
560 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
561 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
562 /* factor = 1-A, so term = A*(1-A) = A-A*A */
563 /* fnms(a,b,c,d) computes a = d - b*c */
564 spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
565 break;
566
567 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
568 case PIPE_BLENDFACTOR_DST_COLOR:
569 /* factor = Afb, so term = A*Afb */
570 spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
571 break;
572
573 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
574 case PIPE_BLENDFACTOR_INV_DST_COLOR:
575 /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
576 /* fnms(a,b,c,d) computes a = d - b*c */
577 spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
578 break;
579
580 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
581 case PIPE_BLENDFACTOR_CONST_COLOR:
582 /* We need the optional constA_reg register */
583 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
584 /* factor = Ac, so term = A*Ac */
585 spe_fm(f, term1A_reg, fragA_reg, constA_reg);
586 break;
587
588 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
589 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
590 /* We need the optional constA_reg register */
591 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
592 /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
593 /* fnms(a,b,c,d) computes a = d - b*c */
594 spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
595 break;
596
597 /* These are special D3D cases involving a second color output
598 * from the fragment shader. I'm not sure we can support them
599 * yet... XXX
600 */
601 case PIPE_BLENDFACTOR_SRC1_COLOR:
602 case PIPE_BLENDFACTOR_SRC1_ALPHA:
603 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
604 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
605 default:
606 ASSERT(0);
607 }
608
609 /*
610 * Compute Dest RGB term. Like the above, we're looking for
611 * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
612 * in many cases we can avoid doing unnecessary multiplies.
613 */
614 switch (blend->rgb_dst_factor) {
615 case PIPE_BLENDFACTOR_ONE:
616 /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
617 spe_move(f, term2R_reg, fbR_reg);
618 spe_move(f, term2G_reg, fbG_reg);
619 spe_move(f, term2B_reg, fbB_reg);
620 break;
621 case PIPE_BLENDFACTOR_ZERO:
622 /* factor s= (0,0,0), so term = (0,0,0) */
623 spe_load_float(f, term2R_reg, 0.0f);
624 spe_load_float(f, term2G_reg, 0.0f);
625 spe_load_float(f, term2B_reg, 0.0f);
626 break;
627 case PIPE_BLENDFACTOR_SRC_COLOR:
628 /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
629 spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
630 spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
631 spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
632 break;
633 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
634 /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))
635 * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
636 * fnms(a,b,c,d) computes a = d - b*c
637 */
638 spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
639 spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
640 spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
641 break;
642 case PIPE_BLENDFACTOR_SRC_ALPHA:
643 /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
644 spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
645 spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
646 spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
647 break;
648 case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
649 /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
650 /* fnms(a,b,c,d) computes a = d - b*c */
651 spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
652 spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
653 spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
654 break;
655 case PIPE_BLENDFACTOR_DST_COLOR:
656 /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
657 spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
658 spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
659 spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
660 break;
661 case PIPE_BLENDFACTOR_INV_DST_COLOR:
662 /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
663 * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
664 * fnms(a,b,c,d) computes a = d - b*c
665 */
666 spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
667 spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
668 spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
669 break;
670
671 case PIPE_BLENDFACTOR_DST_ALPHA:
672 /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
673 spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
674 spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
675 spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
676 break;
677 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
678 /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))
679 * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
680 * fnms(a,b,c,d) computes a = d - b*c
681 */
682 spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
683 spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
684 spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
685 break;
686 case PIPE_BLENDFACTOR_CONST_COLOR:
687 /* We need the optional constant color registers */
688 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
689 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
690 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
691 /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
692 spe_fm(f, term2R_reg, fbR_reg, constR_reg);
693 spe_fm(f, term2G_reg, fbG_reg, constG_reg);
694 spe_fm(f, term2B_reg, fbB_reg, constB_reg);
695 break;
696 case PIPE_BLENDFACTOR_CONST_ALPHA:
697 /* we'll need the optional constant alpha register */
698 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
699 /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
700 spe_fm(f, term2R_reg, fbR_reg, constA_reg);
701 spe_fm(f, term2G_reg, fbG_reg, constA_reg);
702 spe_fm(f, term2B_reg, fbB_reg, constA_reg);
703 break;
704 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
705 /* We need the optional constant color registers */
706 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
707 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
708 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
709 /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
710 * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
711 * fnms(a,b,c,d) computes a = d - b*c
712 */
713 spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
714 spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
715 spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
716 break;
717 case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
718 /* We need the optional constant color registers */
719 setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
720 setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
721 setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
722 /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
723 * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
724 * fnms(a,b,c,d) computes a = d - b*c
725 */
726 spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
727 spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
728 spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
729 break;
730 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
731 ASSERT(0);
732 break;
733
734 /* These are special D3D cases involving a second color output
735 * from the fragment shader. I'm not sure we can support them
736 * yet... XXX
737 */
738 case PIPE_BLENDFACTOR_SRC1_COLOR:
739 case PIPE_BLENDFACTOR_SRC1_ALPHA:
740 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
741 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
742
743 default:
744 ASSERT(0);
745 }
746
747 /*
748 * Compute Dest Alpha term. Like the above, we're looking for
749 * the full term Afb*factor, not just the factor itself, because
750 * in many cases we can avoid doing unnecessary multiplies.
751 */
752 switch (blend->alpha_dst_factor) {
753 case PIPE_BLENDFACTOR_ONE:
754 /* factor = 1, so term = Afb */
755 spe_move(f, term2A_reg, fbA_reg);
756 break;
757 case PIPE_BLENDFACTOR_ZERO:
758 /* factor = 0, so term = 0 */
759 spe_load_float(f, term2A_reg, 0.0f);
760 break;
761
762 case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
763 case PIPE_BLENDFACTOR_SRC_COLOR:
764 /* factor = A, so term = Afb*A */
765 spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
766 break;
767
768 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
769 case PIPE_BLENDFACTOR_INV_SRC_COLOR:
770 /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
771 /* fnms(a,b,c,d) computes a = d - b*c */
772 spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
773 break;
774
775 case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
776 case PIPE_BLENDFACTOR_DST_COLOR:
777 /* factor = Afb, so term = Afb*Afb */
778 spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
779 break;
780
781 case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
782 case PIPE_BLENDFACTOR_INV_DST_COLOR:
783 /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
784 /* fnms(a,b,c,d) computes a = d - b*c */
785 spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
786 break;
787
788 case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
789 case PIPE_BLENDFACTOR_CONST_COLOR:
790 /* We need the optional constA_reg register */
791 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
792 /* factor = Ac, so term = Afb*Ac */
793 spe_fm(f, term2A_reg, fbA_reg, constA_reg);
794 break;
795
796 case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
797 case PIPE_BLENDFACTOR_INV_CONST_COLOR:
798 /* We need the optional constA_reg register */
799 setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
800 /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
801 /* fnms(a,b,c,d) computes a = d - b*c */
802 spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
803 break;
804
805 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
806 ASSERT(0);
807 break;
808
809 /* These are special D3D cases involving a second color output
810 * from the fragment shader. I'm not sure we can support them
811 * yet... XXX
812 */
813 case PIPE_BLENDFACTOR_SRC1_COLOR:
814 case PIPE_BLENDFACTOR_SRC1_ALPHA:
815 case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
816 case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
817 default:
818 ASSERT(0);
819 }
820
821 /*
822 * Combine Src/Dest RGB terms as per the blend equation.
823 */
824 switch (blend->rgb_func) {
825 case PIPE_BLEND_ADD:
826 spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
827 spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
828 spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
829 break;
830 case PIPE_BLEND_SUBTRACT:
831 spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
832 spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
833 spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
834 break;
835 case PIPE_BLEND_REVERSE_SUBTRACT:
836 spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
837 spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
838 spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
839 break;
840 case PIPE_BLEND_MIN:
841 spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
842 spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
843 spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
844 break;
845 case PIPE_BLEND_MAX:
846 spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
847 spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
848 spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
849 break;
850 default:
851 ASSERT(0);
852 }
853
854 /*
855 * Combine Src/Dest A term
856 */
857 switch (blend->alpha_func) {
858 case PIPE_BLEND_ADD:
859 spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
860 break;
861 case PIPE_BLEND_SUBTRACT:
862 spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
863 break;
864 case PIPE_BLEND_REVERSE_SUBTRACT:
865 spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
866 break;
867 case PIPE_BLEND_MIN:
868 spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
869 break;
870 case PIPE_BLEND_MAX:
871 spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
872 break;
873 default:
874 ASSERT(0);
875 }
876
877 spe_release_register(f, term1R_reg);
878 spe_release_register(f, term1G_reg);
879 spe_release_register(f, term1B_reg);
880 spe_release_register(f, term1A_reg);
881
882 spe_release_register(f, term2R_reg);
883 spe_release_register(f, term2G_reg);
884 spe_release_register(f, term2B_reg);
885 spe_release_register(f, term2A_reg);
886
887 spe_release_register(f, fbR_reg);
888 spe_release_register(f, fbG_reg);
889 spe_release_register(f, fbB_reg);
890 spe_release_register(f, fbA_reg);
891
892 spe_release_register(f, tmp_reg);
893
894 /* Free any optional registers that actually got used */
895 release_const_register(f, &one_reg_set, one_reg);
896 release_const_register(f, &constR_reg_set, constR_reg);
897 release_const_register(f, &constG_reg_set, constG_reg);
898 release_const_register(f, &constB_reg_set, constB_reg);
899 release_const_register(f, &constA_reg_set, constA_reg);
900 }
901
902
903 static void
904 gen_logicop(const struct pipe_blend_state *blend,
905 struct spe_function *f,
906 int fragRGBA_reg, int fbRGBA_reg)
907 {
908 /* We've got four 32-bit RGBA packed pixels in each of
909 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
910 * reds, greens, blues, and alphas.
911 * */
912 ASSERT(blend->logicop_enable);
913
914 switch(blend->logicop_func) {
915 case PIPE_LOGICOP_CLEAR: /* 0 */
916 spe_zero(f, fragRGBA_reg);
917 break;
918 case PIPE_LOGICOP_NOR: /* ~(s | d) */
919 spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
920 break;
921 case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
922 /* andc R, A, B computes R = A & ~B */
923 spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
924 break;
925 case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
926 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
927 break;
928 case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
929 /* andc R, A, B computes R = A & ~B */
930 spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
931 break;
932 case PIPE_LOGICOP_INVERT: /* ~d */
933 /* Note that (A nor A) == ~(A|A) == ~A */
934 spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
935 break;
936 case PIPE_LOGICOP_XOR: /* s ^ d */
937 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
938 break;
939 case PIPE_LOGICOP_NAND: /* ~(s & d) */
940 spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
941 break;
942 case PIPE_LOGICOP_AND: /* s & d */
943 spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
944 break;
945 case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
946 spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
947 spe_complement(f, fragRGBA_reg, fragRGBA_reg);
948 break;
949 case PIPE_LOGICOP_NOOP: /* d */
950 spe_move(f, fragRGBA_reg, fbRGBA_reg);
951 break;
952 case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
953 /* orc R, A, B computes R = A | ~B */
954 spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
955 break;
956 case PIPE_LOGICOP_COPY: /* s */
957 break;
958 case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
959 /* orc R, A, B computes R = A | ~B */
960 spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
961 break;
962 case PIPE_LOGICOP_OR: /* s | d */
963 spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
964 break;
965 case PIPE_LOGICOP_SET: /* 1 */
966 spe_load_int(f, fragRGBA_reg, 0xffffffff);
967 break;
968 default:
969 ASSERT(0);
970 }
971 }
972
973
974 /**
975 * Generate code to pack a quad of float colors into four 32-bit integers.
976 *
977 * \param f SPE function to append instruction onto.
978 * \param color_format the dest color packing format
979 * \param r_reg register containing four red values (in/clobbered)
980 * \param g_reg register containing four green values (in/clobbered)
981 * \param b_reg register containing four blue values (in/clobbered)
982 * \param a_reg register containing four alpha values (in/clobbered)
983 * \param rgba_reg register to store the packed RGBA colors (out)
984 */
985 static void
986 gen_pack_colors(struct spe_function *f,
987 enum pipe_format color_format,
988 int r_reg, int g_reg, int b_reg, int a_reg,
989 int rgba_reg)
990 {
991 int rg_reg = spe_allocate_available_register(f);
992 int ba_reg = spe_allocate_available_register(f);
993
994 /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
995 spe_cfltu(f, r_reg, r_reg, 32);
996 spe_cfltu(f, g_reg, g_reg, 32);
997 spe_cfltu(f, b_reg, b_reg, 32);
998 spe_cfltu(f, a_reg, a_reg, 32);
999
1000 /* Shift the most significant bytes to the least significant positions.
1001 * I.e.: reg = reg >> 24
1002 */
1003 spe_rotmi(f, r_reg, r_reg, -24);
1004 spe_rotmi(f, g_reg, g_reg, -24);
1005 spe_rotmi(f, b_reg, b_reg, -24);
1006 spe_rotmi(f, a_reg, a_reg, -24);
1007
1008 /* Shift the color bytes according to the surface format */
1009 if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
1010 spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */
1011 spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */
1012 spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */
1013 }
1014 else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1015 spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */
1016 spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */
1017 spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */
1018 }
1019 else {
1020 ASSERT(0);
1021 }
1022
1023 /* Merge red, green, blue, alpha registers to make packed RGBA colors.
1024 * Eg: after shifting according to color_format we might have:
1025 * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
1026 * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
1027 * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
1028 * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
1029 * OR-ing all those together gives us four packed colors:
1030 * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
1031 */
1032 spe_or(f, rg_reg, r_reg, g_reg);
1033 spe_or(f, ba_reg, a_reg, b_reg);
1034 spe_or(f, rgba_reg, rg_reg, ba_reg);
1035
1036 spe_release_register(f, rg_reg);
1037 spe_release_register(f, ba_reg);
1038 }
1039
1040 static void
1041 gen_colormask(struct spe_function *f,
1042 uint colormask,
1043 enum pipe_format color_format,
1044 int fragRGBA_reg, int fbRGBA_reg)
1045 {
1046 /* We've got four 32-bit RGBA packed pixels in each of
1047 * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
1048 * reds, greens, blues, and alphas. Further, the pixels
1049 * are packed according to the given color format, not
1050 * necessarily RGBA...
1051 */
1052 unsigned int r_mask;
1053 unsigned int g_mask;
1054 unsigned int b_mask;
1055 unsigned int a_mask;
1056
1057 /* Calculate exactly where the bits for any particular color
1058 * end up, so we can mask them correctly.
1059 */
1060 switch(color_format) {
1061 case PIPE_FORMAT_A8R8G8B8_UNORM:
1062 /* ARGB */
1063 a_mask = 0xff000000;
1064 r_mask = 0x00ff0000;
1065 g_mask = 0x0000ff00;
1066 b_mask = 0x000000ff;
1067 break;
1068 case PIPE_FORMAT_B8G8R8A8_UNORM:
1069 /* BGRA */
1070 b_mask = 0xff000000;
1071 g_mask = 0x00ff0000;
1072 r_mask = 0x0000ff00;
1073 a_mask = 0x000000ff;
1074 break;
1075 default:
1076 ASSERT(0);
1077 }
1078
1079 /* For each R, G, B, and A component we're supposed to mask out,
1080 * clear its bits. Then our mask operation later will work
1081 * as expected.
1082 */
1083 if (!(colormask & PIPE_MASK_R)) {
1084 r_mask = 0;
1085 }
1086 if (!(colormask & PIPE_MASK_G)) {
1087 g_mask = 0;
1088 }
1089 if (!(colormask & PIPE_MASK_B)) {
1090 b_mask = 0;
1091 }
1092 if (!(colormask & PIPE_MASK_A)) {
1093 a_mask = 0;
1094 }
1095
1096 /* Get a temporary register to hold the mask that will be applied to the fragment */
1097 int colormask_reg = spe_allocate_available_register(f);
1098
1099 /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
1100 * masks. Load the result value into our temporary register.
1101 */
1102 spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
1103
1104 /* Use the mask register to select between the fragment color
1105 * values and the frame buffer color values. Wherever the
1106 * mask has a 0 bit, the current frame buffer color should override
1107 * the fragment color. Wherever the mask has a 1 bit, the
1108 * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM)
1109 * instruction will select bits from its first operand rA wherever the
1110 * the mask bits rM are 0, and from its second operand rB wherever the
1111 * mask bits rM are 1. That means that the frame buffer color is the
1112 * first operand, and the fragment color the second.
1113 */
1114 spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
1115
1116 /* Release the temporary register and we're done */
1117 spe_release_register(f, colormask_reg);
1118 }
1119
1120 /**
1121 * Generate SPE code to implement the fragment operations (alpha test,
1122 * depth test, stencil test, blending, colormask, and final
1123 * framebuffer write) as specified by the current context state.
1124 *
1125 * Logically, this code will be called after running the fragment
1126 * shader. But under some circumstances we could run some of this
1127 * code before the fragment shader to cull fragments/quads that are
1128 * totally occluded/discarded.
1129 *
1130 * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
1131 *
1132 * See the spu_default_fragment_ops() function to see how the per-fragment
1133 * operations would be done with ordinary C code.
1134 * The code we generate here though has no branches, is SIMD, etc and
1135 * should be much faster.
1136 *
1137 * \param cell the rendering context (in)
1138 * \param f the generated function (out)
1139 */
1140 void
1141 cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
1142 {
1143 const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
1144 const struct pipe_blend_state *blend = cell->blend;
1145 const struct pipe_blend_color *blend_color = &cell->blend_color;
1146 const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
1147
1148 /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
1149 const int x_reg = 3; /* uint */
1150 const int y_reg = 4; /* uint */
1151 const int color_tile_reg = 5; /* tile_t * */
1152 const int depth_tile_reg = 6; /* tile_t * */
1153 const int fragZ_reg = 7; /* vector float */
1154 const int fragR_reg = 8; /* vector float */
1155 const int fragG_reg = 9; /* vector float */
1156 const int fragB_reg = 10; /* vector float */
1157 const int fragA_reg = 11; /* vector float */
1158 const int mask_reg = 12; /* vector uint */
1159
1160 /* offset of quad from start of tile
1161 * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
1162 */
1163 int quad_offset_reg;
1164
1165 int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */
1166 int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */
1167
1168 spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
1169
1170 if (cell->debug_flags & CELL_DEBUG_ASM) {
1171 spe_print_code(f, true);
1172 spe_indent(f, 8);
1173 spe_comment(f, -4, "Begin per-fragment ops");
1174 }
1175
1176 spe_allocate_register(f, x_reg);
1177 spe_allocate_register(f, y_reg);
1178 spe_allocate_register(f, color_tile_reg);
1179 spe_allocate_register(f, depth_tile_reg);
1180 spe_allocate_register(f, fragZ_reg);
1181 spe_allocate_register(f, fragR_reg);
1182 spe_allocate_register(f, fragG_reg);
1183 spe_allocate_register(f, fragB_reg);
1184 spe_allocate_register(f, fragA_reg);
1185 spe_allocate_register(f, mask_reg);
1186
1187 quad_offset_reg = spe_allocate_available_register(f);
1188 fbRGBA_reg = spe_allocate_available_register(f);
1189 fbZS_reg = spe_allocate_available_register(f);
1190
1191 /* compute offset of quad from start of tile, in bytes */
1192 {
1193 int x2_reg = spe_allocate_available_register(f);
1194 int y2_reg = spe_allocate_available_register(f);
1195
1196 ASSERT(TILE_SIZE == 32);
1197
1198 spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
1199 spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
1200 spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
1201 spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */
1202 spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */
1203
1204 spe_release_register(f, x2_reg);
1205 spe_release_register(f, y2_reg);
1206 }
1207
1208
1209 if (dsa->alpha.enabled) {
1210 gen_alpha_test(dsa, f, mask_reg, fragA_reg);
1211 }
1212
1213 if (dsa->depth.enabled || dsa->stencil[0].enabled) {
1214 const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
1215 boolean write_depth_stencil;
1216
1217 int fbZ_reg = spe_allocate_available_register(f); /* Z values */
1218 int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
1219
1220 /* fetch quad of depth/stencil values from tile at (x,y) */
1221 /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
1222 spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1223
1224 if (dsa->depth.enabled) {
1225 /* Extract Z bits from fbZS_reg into fbZ_reg */
1226 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
1227 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
1228 int mask_reg = spe_allocate_available_register(f);
1229 spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */
1230 spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */
1231 spe_release_register(f, mask_reg);
1232 /* OK, fbZ_reg has four 24-bit Z values now */
1233 }
1234 else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
1235 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
1236 spe_rotmi(f, fbZ_reg, fbZS_reg, -8); /* fbZ = fbZS >> 8 */
1237 /* OK, fbZ_reg has four 24-bit Z values now */
1238 }
1239 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
1240 spe_move(f, fbZ_reg, fbZS_reg);
1241 /* OK, fbZ_reg has four 32-bit Z values now */
1242 }
1243 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
1244 spe_move(f, fbZ_reg, fbZS_reg);
1245 /* OK, fbZ_reg has four 16-bit Z values now */
1246 }
1247 else {
1248 ASSERT(0); /* invalid format */
1249 }
1250
1251 /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
1252 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
1253 zs_format == PIPE_FORMAT_X8Z24_UNORM ||
1254 zs_format == PIPE_FORMAT_Z24S8_UNORM ||
1255 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
1256 /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
1257 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1258 /* fragZ = fragZ >> 8 */
1259 spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
1260 }
1261 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
1262 /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
1263 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1264 }
1265 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
1266 /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
1267 spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
1268 /* fragZ = fragZ >> 16 */
1269 spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
1270 }
1271 }
1272 else {
1273 /* no Z test, but set Z to zero so we don't OR-in garbage below */
1274 spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
1275 }
1276
1277
1278 if (dsa->stencil[0].enabled) {
1279 /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
1280 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
1281 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
1282 /* XXX extract with a shift */
1283 ASSERT(0);
1284 }
1285 else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
1286 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
1287 /* XXX extract with a mask */
1288 ASSERT(0);
1289 }
1290 }
1291 else {
1292 /* no stencil test, but set to zero so we don't OR-in garbage below */
1293 spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
1294 }
1295
1296 if (dsa->stencil[0].enabled) {
1297 /* XXX this may involve depth testing too */
1298 // gen_stencil_test(dsa, f, ... );
1299 ASSERT(0);
1300 }
1301 else if (dsa->depth.enabled) {
1302 int zmask_reg = spe_allocate_available_register(f);
1303 gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
1304 spe_release_register(f, zmask_reg);
1305 }
1306
1307 /* do we need to write Z and/or Stencil back into framebuffer? */
1308 write_depth_stencil = (dsa->depth.writemask |
1309 dsa->stencil[0].write_mask |
1310 dsa->stencil[1].write_mask);
1311
1312 if (write_depth_stencil) {
1313 /* Merge latest Z and Stencil values into fbZS_reg.
1314 * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
1315 * fbS_reg has four 8-bit Z values in bits [7..0].
1316 */
1317 if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
1318 zs_format == PIPE_FORMAT_X8Z24_UNORM) {
1319 spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
1320 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
1321 }
1322 else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
1323 zs_format == PIPE_FORMAT_Z24X8_UNORM) {
1324 spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
1325 spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
1326 }
1327 else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
1328 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
1329 }
1330 else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
1331 spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
1332 }
1333 else if (zs_format == PIPE_FORMAT_S8_UNORM) {
1334 ASSERT(0); /* XXX to do */
1335 }
1336 else {
1337 ASSERT(0); /* bad zs_format */
1338 }
1339
1340 /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
1341 spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
1342 }
1343
1344 spe_release_register(f, fbZ_reg);
1345 spe_release_register(f, fbS_reg);
1346 }
1347
1348
1349 /* Get framebuffer quad/colors. We'll need these for blending,
1350 * color masking, and to obey the quad/pixel mask.
1351 * Load: fbRGBA_reg = memory[color_tile + quad_offset]
1352 * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
1353 * we could skip this load.
1354 */
1355 spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
1356
1357
1358 if (blend->blend_enable) {
1359 gen_blend(blend, blend_color, f, color_format,
1360 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
1361 }
1362
1363 /*
1364 * Write fragment colors to framebuffer/tile.
1365 * This involves converting the fragment colors from float[4] to the
1366 * tile's specific format and obeying the quad/pixel mask.
1367 */
1368 {
1369 int rgba_reg = spe_allocate_available_register(f);
1370
1371 /* Pack four float colors as four 32-bit int colors */
1372 gen_pack_colors(f, color_format,
1373 fragR_reg, fragG_reg, fragB_reg, fragA_reg,
1374 rgba_reg);
1375
1376 if (blend->logicop_enable) {
1377 gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
1378 }
1379
1380 if (blend->colormask != PIPE_MASK_RGBA) {
1381 gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
1382 }
1383
1384
1385 /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
1386 * if (mask[i])
1387 * rgba[i] = rgba[i];
1388 * else
1389 * rgba[i] = framebuffer[i];
1390 */
1391 spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
1392
1393 /* Store updated quad in tile:
1394 * memory[color_tile + quad_offset] = rgba_reg;
1395 */
1396 spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
1397
1398 spe_release_register(f, rgba_reg);
1399 }
1400
1401 //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
1402
1403 spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */
1404
1405 spe_release_register(f, fbRGBA_reg);
1406 spe_release_register(f, fbZS_reg);
1407 spe_release_register(f, quad_offset_reg);
1408
1409 if (cell->debug_flags & CELL_DEBUG_ASM) {
1410 spe_comment(f, -4, "End per-fragment ops");
1411 }
1412 }