Merge branch 'mesa_7_5_branch'
[mesa.git] / src / gallium / drivers / cell / spu / spu_render.c
1 /**************************************************************************
2 *
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include <stdio.h>
30 #include <libmisc.h>
31 #include <spu_mfcio.h>
32
33 #include "spu_main.h"
34 #include "spu_render.h"
35 #include "spu_shuffle.h"
36 #include "spu_tri.h"
37 #include "spu_tile.h"
38 #include "cell/common.h"
39 #include "util/u_memory.h"
40
41
42 /**
43 * Given a rendering command's bounding box (in pixels) compute the
44 * location of the corresponding screen tile bounding box.
45 */
46 static INLINE void
47 tile_bounding_box(const struct cell_command_render *render,
48 uint *txmin, uint *tymin,
49 uint *box_num_tiles, uint *box_width_tiles)
50 {
51 #if 0
52 /* Debug: full-window bounding box */
53 uint txmax = spu.fb.width_tiles - 1;
54 uint tymax = spu.fb.height_tiles - 1;
55 *txmin = 0;
56 *tymin = 0;
57 *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
58 *box_width_tiles = spu.fb.width_tiles;
59 (void) render;
60 (void) txmax;
61 (void) tymax;
62 #else
63 uint txmax, tymax, box_height_tiles;
64
65 *txmin = (uint) render->xmin / TILE_SIZE;
66 *tymin = (uint) render->ymin / TILE_SIZE;
67 txmax = (uint) render->xmax / TILE_SIZE;
68 tymax = (uint) render->ymax / TILE_SIZE;
69 if (txmax >= spu.fb.width_tiles)
70 txmax = spu.fb.width_tiles-1;
71 if (tymax >= spu.fb.height_tiles)
72 tymax = spu.fb.height_tiles-1;
73 *box_width_tiles = txmax - *txmin + 1;
74 box_height_tiles = tymax - *tymin + 1;
75 *box_num_tiles = *box_width_tiles * box_height_tiles;
76 #endif
77 #if 0
78 printf("SPU %u: bounds: %g, %g ... %g, %g\n", spu.init.id,
79 render->xmin, render->ymin, render->xmax, render->ymax);
80 printf("SPU %u: tiles: %u, %u .. %u, %u\n",
81 spu.init.id, *txmin, *tymin, txmax, tymax);
82 ASSERT(render->xmin <= render->xmax);
83 ASSERT(render->ymin <= render->ymax);
84 #endif
85 }
86
87
88 /** Check if the tile at (tx,ty) belongs to this SPU */
89 static INLINE boolean
90 my_tile(uint tx, uint ty)
91 {
92 return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
93 }
94
95
96 /**
97 * Start fetching non-clear color/Z tiles from main memory
98 */
99 static INLINE void
100 get_cz_tiles(uint tx, uint ty)
101 {
102 if (spu.read_depth_stencil) {
103 if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
104 //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
105 get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
106 spu.cur_ztile_status = TILE_STATUS_GETTING;
107 }
108 }
109
110 if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
111 //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
112 get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
113 spu.cur_ctile_status = TILE_STATUS_GETTING;
114 }
115 }
116
117
118 /**
119 * Start putting dirty color/Z tiles back to main memory
120 */
121 static INLINE void
122 put_cz_tiles(uint tx, uint ty)
123 {
124 if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
125 /* tile was modified and needs to be written back */
126 //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
127 put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
128 spu.cur_ztile_status = TILE_STATUS_DEFINED;
129 }
130 else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
131 /* tile was never used */
132 spu.cur_ztile_status = TILE_STATUS_DEFINED;
133 //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
134 }
135
136 if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
137 /* tile was modified and needs to be written back */
138 //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
139 put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
140 spu.cur_ctile_status = TILE_STATUS_DEFINED;
141 }
142 else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
143 /* tile was never used */
144 spu.cur_ctile_status = TILE_STATUS_DEFINED;
145 //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
146 }
147 }
148
149
150 /**
151 * Wait for 'put' of color/z tiles to complete.
152 */
153 static INLINE void
154 wait_put_cz_tiles(void)
155 {
156 wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
157 if (spu.read_depth_stencil) {
158 wait_on_mask(1 << TAG_WRITE_TILE_Z);
159 }
160 }
161
162
163 /**
164 * Render primitives
165 * \param pos_incr returns value indicating how may words to skip after
166 * this command in the batch buffer
167 */
168 void
169 cmd_render(const struct cell_command_render *render, uint *pos_incr)
170 {
171 /* we'll DMA into these buffers */
172 ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
173 const uint vertex_size = render->vertex_size; /* in bytes */
174 /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
175 uint index_bytes;
176 const ubyte *vertices;
177 const ushort *indexes;
178 uint i, j;
179 uint num_tiles;
180
181 D_PRINTF(CELL_DEBUG_CMD,
182 "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
183 render->prim_type,
184 render->num_verts,
185 render->num_indexes,
186 render->inline_verts);
187
188 ASSERT(sizeof(*render) % 4 == 0);
189 ASSERT(total_vertex_bytes % 16 == 0);
190 ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
191 ASSERT(render->num_indexes % 3 == 0);
192
193
194 /* indexes are right after the render command in the batch buffer */
195 indexes = (const ushort *) (render + 1);
196 index_bytes = ROUNDUP8(render->num_indexes * 2);
197 *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
198
199
200 if (render->inline_verts) {
201 /* Vertices are after indexes in batch buffer at next 16-byte addr */
202 vertices = (const ubyte *) render + (*pos_incr * 8);
203 vertices = (const ubyte *) align_pointer((void *) vertices, 16);
204 ASSERT_ALIGN16(vertices);
205 *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
206 }
207 else {
208 /* Begin DMA fetch of vertex buffer */
209 ubyte *src = spu.init.buffers[render->vertex_buf];
210 ubyte *dest = vertex_data;
211
212 /* skip vertex data we won't use */
213 #if 01
214 src += render->min_index * vertex_size;
215 dest += render->min_index * vertex_size;
216 total_vertex_bytes -= render->min_index * vertex_size;
217 #endif
218 ASSERT(total_vertex_bytes % 16 == 0);
219 ASSERT_ALIGN16(dest);
220 ASSERT_ALIGN16(src);
221
222 mfc_get(dest, /* in vertex_data[] array */
223 (unsigned int) src, /* src in main memory */
224 total_vertex_bytes, /* size */
225 TAG_VERTEX_BUFFER,
226 0, /* tid */
227 0 /* rid */);
228
229 vertices = vertex_data;
230
231 wait_on_mask(1 << TAG_VERTEX_BUFFER);
232 }
233
234
235 /**
236 ** find tiles which intersect the prim bounding box
237 **/
238 uint txmin, tymin, box_width_tiles, box_num_tiles;
239 tile_bounding_box(render, &txmin, &tymin,
240 &box_num_tiles, &box_width_tiles);
241
242
243 /* make sure any pending clears have completed */
244 wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
245
246
247 num_tiles = 0;
248
249 /**
250 ** loop over tiles, rendering tris
251 **/
252 for (i = 0; i < box_num_tiles; i++) {
253 const uint tx = txmin + i % box_width_tiles;
254 const uint ty = tymin + i / box_width_tiles;
255
256 ASSERT(tx < spu.fb.width_tiles);
257 ASSERT(ty < spu.fb.height_tiles);
258
259 if (!my_tile(tx, ty))
260 continue;
261
262 num_tiles++;
263
264 spu.cur_ctile_status = spu.ctile_status[ty][tx];
265 spu.cur_ztile_status = spu.ztile_status[ty][tx];
266
267 get_cz_tiles(tx, ty);
268
269 uint drawn = 0;
270
271 const qword vertex_sizes = (qword)spu_splats(vertex_size);
272 const qword verticess = (qword)spu_splats((uint)vertices);
273
274 ASSERT_ALIGN16(&indexes[0]);
275
276 const uint num_indexes = render->num_indexes;
277
278 /* loop over tris
279 * &indexes[0] will be 16 byte aligned. This loop is heavily unrolled
280 * avoiding variable rotates when extracting vertex indices.
281 */
282 for (j = 0; j < num_indexes; j += 24) {
283 /* Load three vectors, containing 24 ushort indices */
284 const qword* lower_qword = (qword*)&indexes[j];
285 const qword indices0 = lower_qword[0];
286 const qword indices1 = lower_qword[1];
287 const qword indices2 = lower_qword[2];
288
289 /* stores three indices for each tri n in slots 0, 1 and 2 of vsn */
290 /* Straightforward rotates for these */
291 qword vs0 = indices0;
292 qword vs1 = si_shlqbyi(indices0, 6);
293 qword vs3 = si_shlqbyi(indices1, 2);
294 qword vs4 = si_shlqbyi(indices1, 8);
295 qword vs6 = si_shlqbyi(indices2, 4);
296 qword vs7 = si_shlqbyi(indices2, 10);
297
298 /* For tri 2 and 5, the three indices are split across two machine
299 * words - rotate and combine */
300 const qword tmp2a = si_shlqbyi(indices0, 12);
301 const qword tmp2b = si_rotqmbyi(indices1, 12|16);
302 qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(si_from_uint(0x20)));
303
304 const qword tmp5a = si_shlqbyi(indices1, 14);
305 const qword tmp5b = si_rotqmbyi(indices2, 14|16);
306 qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(si_from_uint(0x60)));
307
308 /* unpack indices from halfword slots to word slots */
309 vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
310 vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
311 vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
312 vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
313 vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
314 vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
315 vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
316 vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));
317
318 /* Calculate address of vertex in vertices[] */
319 vs0 = si_mpya(vs0, vertex_sizes, verticess);
320 vs1 = si_mpya(vs1, vertex_sizes, verticess);
321 vs2 = si_mpya(vs2, vertex_sizes, verticess);
322 vs3 = si_mpya(vs3, vertex_sizes, verticess);
323 vs4 = si_mpya(vs4, vertex_sizes, verticess);
324 vs5 = si_mpya(vs5, vertex_sizes, verticess);
325 vs6 = si_mpya(vs6, vertex_sizes, verticess);
326 vs7 = si_mpya(vs7, vertex_sizes, verticess);
327
328 /* Select the appropriate call based on the number of vertices
329 * remaining */
330 switch(num_indexes - j) {
331 default: drawn += tri_draw(vs7, tx, ty);
332 case 21: drawn += tri_draw(vs6, tx, ty);
333 case 18: drawn += tri_draw(vs5, tx, ty);
334 case 15: drawn += tri_draw(vs4, tx, ty);
335 case 12: drawn += tri_draw(vs3, tx, ty);
336 case 9: drawn += tri_draw(vs2, tx, ty);
337 case 6: drawn += tri_draw(vs1, tx, ty);
338 case 3: drawn += tri_draw(vs0, tx, ty);
339 }
340 }
341
342 //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
343
344 /* write color/z tiles back to main framebuffer, if dirtied */
345 put_cz_tiles(tx, ty);
346
347 wait_put_cz_tiles(); /* XXX seems unnecessary... */
348
349 spu.ctile_status[ty][tx] = spu.cur_ctile_status;
350 spu.ztile_status[ty][tx] = spu.cur_ztile_status;
351 }
352
353 D_PRINTF(CELL_DEBUG_CMD,
354 "RENDER done (%u tiles hit)\n",
355 num_tiles);
356 }