cell: implement basic TXP instruction in fragment shaders
[mesa.git] / src / gallium / drivers / cell / spu / spu_tri.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * Triangle rendering within a tile.
30 */
31
32 #include <transpose_matrix4x4.h>
33 #include "pipe/p_compiler.h"
34 #include "pipe/p_format.h"
35 #include "util/u_math.h"
36 #include "spu_colorpack.h"
37 #include "spu_main.h"
38 #include "spu_texture.h"
39 #include "spu_tile.h"
40 #include "spu_tri.h"
41
42
43 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
44 typedef vector unsigned int mask_t;
45
46 typedef union
47 {
48 vector float v;
49 float f[4];
50 } float4;
51
52
53 /**
54 * Simplified types taken from other parts of Gallium
55 */
56 struct vertex_header {
57 vector float data[1];
58 };
59
60
61
/*
 * XXX fix this: cheap stand-in for ceilf().
 * It adds just under 1.0 and truncates toward zero, so it rounds incorrectly
 * for fractional parts smaller than 0.00001 and for some negative inputs.
 */
#undef CEILF
#define CEILF(X) ((float) (int) ((X) + 0.99999))
65
66
/* Index of each fragment within a 2x2 quad. */
#define QUAD_TOP_LEFT 0
#define QUAD_TOP_RIGHT 1
#define QUAD_BOTTOM_LEFT 2
#define QUAD_BOTTOM_RIGHT 3

/* One bit per fragment, at the bit position given by its quad index. */
#define MASK_TOP_LEFT (1 << QUAD_TOP_LEFT)
#define MASK_TOP_RIGHT (1 << QUAD_TOP_RIGHT)
#define MASK_BOTTOM_LEFT (1 << QUAD_BOTTOM_LEFT)
#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)

/* All four fragment bits set. */
#define MASK_ALL 0xf
76
77
/* Set to 1 to compile in the per-triangle vertex dump (print_vertex). */
#define DEBUG_VERTS 0
79
/**
 * Per-edge rasterization state for one triangle edge.
 */
struct edge {
   float dx;     /**< X(v1) - X(v0); only needed while setting up */
   float dy;     /**< Y(v1) - Y(v0); only needed while setting up */
   float dxdy;   /**< slope expressed as dx/dy */
   float sx, sy; /**< coordinates of the first sample point */
   int lines;    /**< how many scanlines this edge covers */
};
90
91
92 struct interp_coef
93 {
94 float4 a0;
95 float4 dadx;
96 float4 dady;
97 };
98
99
100 /**
101 * Triangle setup info (derived from draw_stage).
102 * Also used for line drawing (taking some liberties).
103 */
104 struct setup_stage {
105
106 /* Vertices are just an array of floats making up each attribute in
107 * turn. Currently fixed at 4 floats, but should change in time.
108 * Codegen will help cope with this.
109 */
110 const struct vertex_header *vmax;
111 const struct vertex_header *vmid;
112 const struct vertex_header *vmin;
113 const struct vertex_header *vprovoke;
114
115 struct edge ebot;
116 struct edge etop;
117 struct edge emaj;
118
119 float oneoverarea;
120
121 uint facing;
122
123 uint tx, ty;
124
125 int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
126
127 #if 0
128 struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
129 #else
130 struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
131 #endif
132
133 #if 0
134 struct quad_header quad;
135 #endif
136
137 struct {
138 int left[2]; /**< [0] = row0, [1] = row1 */
139 int right[2];
140 int y;
141 unsigned y_flags;
142 unsigned mask; /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
143 } span;
144 };
145
146
147
148 static struct setup_stage setup;
149
150
151
152
153 #if 0
154 /**
155 * Basically a cast wrapper.
156 */
157 static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
158 {
159 return (struct setup_stage *)stage;
160 }
161 #endif
162
163 #if 0
164 /**
165 * Clip setup.quad against the scissor/surface bounds.
166 */
167 static INLINE void
168 quad_clip(struct setup_stage *setup)
169 {
170 const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
171 const int minx = (int) cliprect->minx;
172 const int maxx = (int) cliprect->maxx;
173 const int miny = (int) cliprect->miny;
174 const int maxy = (int) cliprect->maxy;
175
176 if (setup.quad.x0 >= maxx ||
177 setup.quad.y0 >= maxy ||
178 setup.quad.x0 + 1 < minx ||
179 setup.quad.y0 + 1 < miny) {
180 /* totally clipped */
181 setup.quad.mask = 0x0;
182 return;
183 }
184 if (setup.quad.x0 < minx)
185 setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
186 if (setup.quad.y0 < miny)
187 setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
188 if (setup.quad.x0 == maxx - 1)
189 setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
190 if (setup.quad.y0 == maxy - 1)
191 setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
192 }
193 #endif
194
195 #if 0
196 /**
197 * Emit a quad (pass to next stage) with clipping.
198 */
199 static INLINE void
200 clip_emit_quad(struct setup_stage *setup)
201 {
202 quad_clip(setup);
203 if (setup.quad.mask) {
204 struct softpipe_context *sp = setup.softpipe;
205 sp->quad.first->run(sp->quad.first, &setup.quad);
206 }
207 }
208 #endif
209
210 /**
211 * Evaluate attribute coefficients (plane equations) to compute
212 * attribute values for the four fragments in a quad.
213 * Eg: four colors will be computed (in AoS format).
214 */
215 static INLINE void
216 eval_coeff(uint slot, float x, float y, vector float result[4])
217 {
218 switch (spu.vertex_info.interp_mode[slot]) {
219 case INTERP_CONSTANT:
220 result[QUAD_TOP_LEFT] =
221 result[QUAD_TOP_RIGHT] =
222 result[QUAD_BOTTOM_LEFT] =
223 result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
224 break;
225
226 case INTERP_LINEAR:
227 /* fall-through, for now */
228 default:
229 {
230 register vector float dadx = setup.coef[slot].dadx.v;
231 register vector float dady = setup.coef[slot].dady.v;
232 register vector float topLeft
233 = spu_add(setup.coef[slot].a0.v,
234 spu_add(spu_mul(spu_splats(x), dadx),
235 spu_mul(spu_splats(y), dady)));
236
237 result[QUAD_TOP_LEFT] = topLeft;
238 result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
239 result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
240 result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
241 }
242 }
243 }
244
245
246 /**
247 * As above, but return 4 vectors in SOA format.
248 * XXX this will all be re-written someday.
249 */
250 static INLINE void
251 eval_coeff_soa(uint slot, float x, float y, vector float result[4])
252 {
253 eval_coeff(slot, x, y, result);
254 _transpose_matrix4x4(result, result);
255 }
256
257
258
259 static INLINE vector float
260 eval_z(float x, float y)
261 {
262 const uint slot = 0;
263 const float dzdx = setup.coef[slot].dadx.f[2];
264 const float dzdy = setup.coef[slot].dady.f[2];
265 const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
266 const vector float topLeftv = spu_splats(topLeft);
267 const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
268 return spu_add(topLeftv, derivs);
269 }
270
271
272 /**
273 * Emit a quad (pass to next stage). No clipping is done.
274 * Note: about 1/5 to 1/7 of the time, mask is zero and this function
275 * should be skipped. But adding the test for that slows things down
276 * overall.
277 */
278 static INLINE void
279 emit_quad( int x, int y, mask_t mask)
280 {
281 /* If any bits in mask are set... */
282 if (spu_extract(spu_orx(mask), 0)) {
283 const int ix = x - setup.cliprect_minx;
284 const int iy = y - setup.cliprect_miny;
285
286 spu.cur_ctile_status = TILE_STATUS_DIRTY;
287 spu.cur_ztile_status = TILE_STATUS_DIRTY;
288
289 if (0/*spu.texture[0].start*/) {
290 /*
291 * Temporary texture mapping path
292 * This will go away when fragment programs support TEX inst.
293 */
294 const uint unit = 0;
295 vector float colors[4];
296 vector float texcoords[4];
297 eval_coeff(2, (float) x, (float) y, texcoords);
298
299 if (spu_extract(mask, 0))
300 colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
301 if (spu_extract(mask, 1))
302 colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
303 if (spu_extract(mask, 2))
304 colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
305 if (spu_extract(mask, 3))
306 colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
307
308
309 if (spu.texture[1].start) {
310 /* multi-texture mapping */
311 const uint unit = 1;
312 vector float colors1[4];
313
314 eval_coeff(2, (float) x, (float) y, texcoords);
315
316 if (spu_extract(mask, 0))
317 colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
318 if (spu_extract(mask, 1))
319 colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
320 if (spu_extract(mask, 2))
321 colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
322 if (spu_extract(mask, 3))
323 colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
324
325 /* hack: modulate first texture by second */
326 colors[0] = spu_mul(colors[0], colors1[0]);
327 colors[1] = spu_mul(colors[1], colors1[1]);
328 colors[2] = spu_mul(colors[2], colors1[2]);
329 colors[3] = spu_mul(colors[3], colors1[3]);
330 }
331
332 {
333 /* Convert fragment data from AoS to SoA format.
334 * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
335 * This is temporary!
336 */
337 vector float soa_frag[4];
338 _transpose_matrix4x4(soa_frag, colors);
339
340 vector float fragZ = eval_z((float) x, (float) y);
341
342 /* Do all per-fragment/quad operations here, including:
343 * alpha test, z test, stencil test, blend and framebuffer writing.
344 */
345 spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
346 fragZ,
347 soa_frag[0], soa_frag[1],
348 soa_frag[2], soa_frag[3],
349 mask,
350 setup.facing);
351 }
352
353 }
354 else {
355 /*
356 * Run fragment shader, execute per-fragment ops, update fb/tile.
357 */
358 vector float inputs[4*4], outputs[2*4];
359 vector float fragZ = eval_z((float) x, (float) y);
360
361 /* setup inputs */
362 #if 0
363 eval_coeff_soa(1, (float) x, (float) y, inputs);
364 #else
365 uint i;
366 for (i = 0; i < spu.vertex_info.num_attribs; i++) {
367 eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
368 }
369 #endif
370 ASSERT(spu.fragment_program);
371 ASSERT(spu.fragment_ops);
372
373 /* Execute the current fragment program */
374 spu.fragment_program(inputs, outputs, spu.constants);
375
376 /* Execute per-fragment/quad operations, including:
377 * alpha test, z test, stencil test, blend and framebuffer writing.
378 */
379 spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
380 fragZ,
381 outputs[0*4+0],
382 outputs[0*4+1],
383 outputs[0*4+2],
384 outputs[0*4+3],
385 mask,
386 setup.facing);
387 }
388 }
389 }
390
391
/**
 * Map an X or Y coordinate to the coordinate of the 2x2 block/quad that
 * contains it, i.e. round down to an even number.
 */
static INLINE int block( int x )
{
   /* Dropping the lowest bit yields the even coordinate at or below x. */
   return x - (x & 1);
}
400
401
402 /**
403 * Compute mask which indicates which pixels in the 2x2 quad are actually inside
404 * the triangle's bounds.
405 * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
406 */
407 static INLINE mask_t calculate_mask( int x )
408 {
409 /* This is a little tricky.
410 * Use & instead of && to avoid branches.
411 * Use negation to convert true/false to ~0/0 values.
412 */
413 mask_t mask;
414 mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0);
415 mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
416 mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2);
417 mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
418 return mask;
419 }
420
421
422 /**
423 * Render a horizontal span of quads
424 */
425 static void flush_spans( void )
426 {
427 int minleft, maxright;
428 int x;
429
430 switch (setup.span.y_flags) {
431 case 0x3:
432 /* both odd and even lines written (both quad rows) */
433 minleft = MIN2(setup.span.left[0], setup.span.left[1]);
434 maxright = MAX2(setup.span.right[0], setup.span.right[1]);
435 break;
436
437 case 0x1:
438 /* only even line written (quad top row) */
439 minleft = setup.span.left[0];
440 maxright = setup.span.right[0];
441 break;
442
443 case 0x2:
444 /* only odd line written (quad bottom row) */
445 minleft = setup.span.left[1];
446 maxright = setup.span.right[1];
447 break;
448
449 default:
450 return;
451 }
452
453
454 /* OK, we're very likely to need the tile data now.
455 * clear or finish waiting if needed.
456 */
457 if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
458 /* wait for mfc_get() to complete */
459 //printf("SPU: %u: waiting for ctile\n", spu.init.id);
460 wait_on_mask(1 << TAG_READ_TILE_COLOR);
461 spu.cur_ctile_status = TILE_STATUS_CLEAN;
462 }
463 else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
464 //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
465 clear_c_tile(&spu.ctile);
466 spu.cur_ctile_status = TILE_STATUS_DIRTY;
467 }
468 ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
469
470 if (spu.read_depth) {
471 if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
472 /* wait for mfc_get() to complete */
473 //printf("SPU: %u: waiting for ztile\n", spu.init.id);
474 wait_on_mask(1 << TAG_READ_TILE_Z);
475 spu.cur_ztile_status = TILE_STATUS_CLEAN;
476 }
477 else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
478 //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
479 clear_z_tile(&spu.ztile);
480 spu.cur_ztile_status = TILE_STATUS_DIRTY;
481 }
482 ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
483 }
484
485 /* XXX this loop could be moved into the above switch cases and
486 * calculate_mask() could be simplified a bit...
487 */
488 for (x = block(minleft); x <= block(maxright); x += 2) {
489 #if 1
490 emit_quad( x, setup.span.y, calculate_mask( x ));
491 #endif
492 }
493
494 setup.span.y = 0;
495 setup.span.y_flags = 0;
496 setup.span.right[0] = 0;
497 setup.span.right[1] = 0;
498 }
499
#if DEBUG_VERTS
/**
 * Debug helper: dump every attribute of vertex \p v to stderr, one
 * four-component attribute per line.
 * The attribute count comes from spu.vertex_info.num_attribs and the
 * components are read with spu_extract(), so this compiles when
 * DEBUG_VERTS is enabled.
 */
static void print_vertex(const struct vertex_header *v)
{
   uint i;
   fprintf(stderr, "Vertex: (%p)\n", (const void *) v);
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      fprintf(stderr, " %d: %f %f %f %f\n", (int) i,
              spu_extract(v->data[i], 0), spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2), spu_extract(v->data[i], 3));
   }
}
#endif
511
512
513 static boolean setup_sort_vertices(const struct vertex_header *v0,
514 const struct vertex_header *v1,
515 const struct vertex_header *v2)
516 {
517
518 #if DEBUG_VERTS
519 fprintf(stderr, "Triangle:\n");
520 print_vertex(v0);
521 print_vertex(v1);
522 print_vertex(v2);
523 #endif
524
525 setup.vprovoke = v2;
526
527 /* determine bottom to top order of vertices */
528 {
529 float y0 = spu_extract(v0->data[0], 1);
530 float y1 = spu_extract(v1->data[0], 1);
531 float y2 = spu_extract(v2->data[0], 1);
532 if (y0 <= y1) {
533 if (y1 <= y2) {
534 /* y0<=y1<=y2 */
535 setup.vmin = v0;
536 setup.vmid = v1;
537 setup.vmax = v2;
538 }
539 else if (y2 <= y0) {
540 /* y2<=y0<=y1 */
541 setup.vmin = v2;
542 setup.vmid = v0;
543 setup.vmax = v1;
544 }
545 else {
546 /* y0<=y2<=y1 */
547 setup.vmin = v0;
548 setup.vmid = v2;
549 setup.vmax = v1;
550 }
551 }
552 else {
553 if (y0 <= y2) {
554 /* y1<=y0<=y2 */
555 setup.vmin = v1;
556 setup.vmid = v0;
557 setup.vmax = v2;
558 }
559 else if (y2 <= y1) {
560 /* y2<=y1<=y0 */
561 setup.vmin = v2;
562 setup.vmid = v1;
563 setup.vmax = v0;
564 }
565 else {
566 /* y1<=y2<=y0 */
567 setup.vmin = v1;
568 setup.vmid = v2;
569 setup.vmax = v0;
570 }
571 }
572 }
573
574 /* Check if triangle is completely outside the tile bounds */
575 if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
576 return FALSE;
577 if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
578 return FALSE;
579 if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
580 spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
581 spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
582 return FALSE;
583 if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
584 spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
585 spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
586 return FALSE;
587
588 setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
589 setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
590 setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
591 setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
592 setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
593 setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
594
595 /*
596 * Compute triangle's area. Use 1/area to compute partial
597 * derivatives of attributes later.
598 *
599 * The area will be the same as prim->det, but the sign may be
600 * different depending on how the vertices get sorted above.
601 *
602 * To determine whether the primitive is front or back facing we
603 * use the prim->det value because its sign is correct.
604 */
605 {
606 const float area = (setup.emaj.dx * setup.ebot.dy -
607 setup.ebot.dx * setup.emaj.dy);
608
609 setup.oneoverarea = 1.0f / area;
610 /*
611 _mesa_printf("%s one-over-area %f area %f det %f\n",
612 __FUNCTION__, setup.oneoverarea, area, prim->det );
613 */
614 }
615
616 #if 0
617 /* We need to know if this is a front or back-facing triangle for:
618 * - the GLSL gl_FrontFacing fragment attribute (bool)
619 * - two-sided stencil test
620 */
621 setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
622 #endif
623
624 return TRUE;
625 }
626
627
628 /**
629 * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
630 * The value value comes from vertex->data[slot].
631 * The result will be put into setup.coef[slot].a0.
632 * \param slot which attribute slot
633 */
634 static INLINE void
635 const_coeff(uint slot)
636 {
637 setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
638 setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
639 setup.coef[slot].a0.v = setup.vprovoke->data[slot];
640 }
641
642
643 /**
644 * Compute a0, dadx and dady for a linearly interpolated coefficient,
645 * for a triangle.
646 */
647 static INLINE void
648 tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
649 {
650 uint i;
651 const float *vmin_d = (float *) &setup.vmin->data[slot];
652 const float *vmid_d = (float *) &setup.vmid->data[slot];
653 const float *vmax_d = (float *) &setup.vmax->data[slot];
654 const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
655 const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
656
657 for (i = firstComp; i < lastComp; i++) {
658 float botda = vmid_d[i] - vmin_d[i];
659 float majda = vmax_d[i] - vmin_d[i];
660 float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
661 float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
662
663 ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
664
665 setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
666 setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
667
668 /* calculate a0 as the value which would be sampled for the
669 * fragment at (0,0), taking into account that we want to sample at
670 * pixel centers, in other words (0.5, 0.5).
671 *
672 * this is neat but unfortunately not a good way to do things for
673 * triangles with very large values of dadx or dady as it will
674 * result in the subtraction and re-addition from a0 of a very
675 * large number, which means we'll end up loosing a lot of the
676 * fractional bits and precision from a0. the way to fix this is
677 * to define a0 as the sample at a pixel center somewhere near vmin
678 * instead - i'll switch to this later.
679 */
680 setup.coef[slot].a0.f[i] = (vmin_d[i] -
681 (setup.coef[slot].dadx.f[i] * x +
682 setup.coef[slot].dady.f[i] * y));
683 }
684
685 /*
686 _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
687 slot, "xyzw"[i],
688 setup.coef[slot].a0[i],
689 setup.coef[slot].dadx.f[i],
690 setup.coef[slot].dady.f[i]);
691 */
692 }
693
694
695 /**
696 * As above, but interp setup all four vector components.
697 */
698 static INLINE void
699 tri_linear_coeff4(uint slot)
700 {
701 const vector float vmin_d = setup.vmin->data[slot];
702 const vector float vmid_d = setup.vmid->data[slot];
703 const vector float vmax_d = setup.vmax->data[slot];
704 const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
705 const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
706
707 vector float botda = vmid_d - vmin_d;
708 vector float majda = vmax_d - vmin_d;
709
710 vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
711 spu_mul(botda, spu_splats(setup.emaj.dy)));
712 vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
713 spu_mul(majda, spu_splats(setup.ebot.dx)));
714
715 setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
716 setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
717
718 vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
719 vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
720
721 setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
722 }
723
724
725
#if 0
/**
 * Compute a0, dadx and dady for a perspective-corrected interpolant,
 * for a triangle.
 * The vertex value is multiplied by 1/w before the plane coefficients
 * (a0, dadx, dady) are computed.  When the value is later evaluated at a
 * fragment position, the interpolated value has to be divided by the
 * interpolated W at that fragment.
 *
 * Disabled: kept as the reference for the perspective path, which
 * setup_tri_coefficients() currently approximates with linear setup.
 */
static void tri_persp_coeff( unsigned slot,
                             unsigned i )
{
   /* premultiply by 1/w:
    */
   float lo = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
   float mid = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
   float hi = setup.vmax->data[slot][i] * setup.vmax->data[0][3];

   float botda = mid - lo;
   float majda = hi - lo;
   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;

   assert(slot < PIPE_MAX_SHADER_INPUTS);
   assert(i <= 3);

   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
   setup.coef[slot].a0.f[i] = (lo -
                               (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) +
                                setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
}
#endif
767
768
769 /**
770 * Compute the setup.coef[] array dadx, dady, a0 values.
771 * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
772 */
773 static void setup_tri_coefficients(void)
774 {
775 #if 1
776 uint i;
777
778 for (i = 0; i < spu.vertex_info.num_attribs; i++) {
779 switch (spu.vertex_info.interp_mode[i]) {
780 case INTERP_NONE:
781 break;
782 case INTERP_POS:
783 /*tri_linear_coeff(i, 2, 3);*/
784 /* XXX interp W if PERSPECTIVE... */
785 tri_linear_coeff4(i);
786 break;
787 case INTERP_CONSTANT:
788 const_coeff(i);
789 break;
790 case INTERP_LINEAR:
791 tri_linear_coeff4(i);
792 break;
793 case INTERP_PERSPECTIVE:
794 tri_linear_coeff4(i); /* temporary */
795 break;
796 default:
797 ASSERT(0);
798 }
799 }
800 #else
801 ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
802 ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
803 spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
804 tri_linear_coeff(0, 2, 3); /* slot 0, z */
805 tri_linear_coeff(1, 0, 4); /* slot 1, color */
806 #endif
807 }
808
809
810 static void setup_tri_edges(void)
811 {
812 float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
813 float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
814
815 float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
816 float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
817 float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
818
819 setup.emaj.sy = CEILF(vmin_y);
820 setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
821 setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
822 setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
823
824 setup.etop.sy = CEILF(vmid_y);
825 setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
826 setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
827 setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
828
829 setup.ebot.sy = CEILF(vmin_y);
830 setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
831 setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
832 setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
833 }
834
835
836 /**
837 * Render the upper or lower half of a triangle.
838 * Scissoring/cliprect is applied here too.
839 */
840 static void subtriangle( struct edge *eleft,
841 struct edge *eright,
842 unsigned lines )
843 {
844 const int minx = setup.cliprect_minx;
845 const int maxx = setup.cliprect_maxx;
846 const int miny = setup.cliprect_miny;
847 const int maxy = setup.cliprect_maxy;
848 int y, start_y, finish_y;
849 int sy = (int)eleft->sy;
850
851 ASSERT((int)eleft->sy == (int) eright->sy);
852
853 /* clip top/bottom */
854 start_y = sy;
855 finish_y = sy + lines;
856
857 if (start_y < miny)
858 start_y = miny;
859
860 if (finish_y > maxy)
861 finish_y = maxy;
862
863 start_y -= sy;
864 finish_y -= sy;
865
866 /*
867 _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
868 */
869
870 for (y = start_y; y < finish_y; y++) {
871
872 /* avoid accumulating adds as floats don't have the precision to
873 * accurately iterate large triangle edges that way. luckily we
874 * can just multiply these days.
875 *
876 * this is all drowned out by the attribute interpolation anyway.
877 */
878 int left = (int)(eleft->sx + y * eleft->dxdy);
879 int right = (int)(eright->sx + y * eright->dxdy);
880
881 /* clip left/right */
882 if (left < minx)
883 left = minx;
884 if (right > maxx)
885 right = maxx;
886
887 if (left < right) {
888 int _y = sy + y;
889 if (block(_y) != setup.span.y) {
890 flush_spans();
891 setup.span.y = block(_y);
892 }
893
894 setup.span.left[_y&1] = left;
895 setup.span.right[_y&1] = right;
896 setup.span.y_flags |= 1<<(_y&1);
897 }
898 }
899
900
901 /* save the values so that emaj can be restarted:
902 */
903 eleft->sx += lines * eleft->dxdy;
904 eright->sx += lines * eright->dxdy;
905 eleft->sy += lines;
906 eright->sy += lines;
907 }
908
/**
 * Z component of the cross product of the 2D edge vectors (v0 - v2) and
 * (v1 - v2).  Only elements [0] (x) and [1] (y) of each vertex are read.
 * The sign of the result tells the winding of the triangle.
 */
static float
determinant( const float *v0,
             const float *v1,
             const float *v2 )
{
   /* edge vectors e = v0 - v2, f = v1 - v2 */
   const float e[2] = { v0[0] - v2[0], v0[1] - v2[1] };
   const float f[2] = { v1[0] - v2[0], v1[1] - v2[1] };

   /* det = cross(e,f).z */
   return e[0] * f[1] - e[1] * f[0];
}
923
924
925 /**
926 * Draw triangle into tile at (tx, ty) (tile coords)
927 * The tile data should have already been fetched.
928 */
929 boolean
930 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
931 {
932 setup.tx = tx;
933 setup.ty = ty;
934
935 /* set clipping bounds to tile bounds */
936 setup.cliprect_minx = tx * TILE_SIZE;
937 setup.cliprect_miny = ty * TILE_SIZE;
938 setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
939 setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
940
941 /* Before we sort vertices, determine the facing of the triangle,
942 * which will be needed for front/back-face stencil application
943 */
944 float det = determinant(v0, v1, v2);
945 setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
946
947 if (!setup_sort_vertices((struct vertex_header *) v0,
948 (struct vertex_header *) v1,
949 (struct vertex_header *) v2)) {
950 return FALSE; /* totally clipped */
951 }
952
953 setup_tri_coefficients();
954 setup_tri_edges();
955
956 setup.span.y = 0;
957 setup.span.y_flags = 0;
958 setup.span.right[0] = 0;
959 setup.span.right[1] = 0;
960 /* setup.span.z_mode = tri_z_mode( setup.ctx ); */
961
962 /* init_constant_attribs( setup ); */
963
964 if (setup.oneoverarea < 0.0) {
965 /* emaj on left:
966 */
967 subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
968 subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
969 }
970 else {
971 /* emaj on right:
972 */
973 subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
974 subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
975 }
976
977 flush_spans();
978
979 return TRUE;
980 }