1 /**************************************************************************
3 * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
31 #include <spu_mfcio.h>
34 #include "spu_render.h"
35 #include "spu_shuffle.h"
38 #include "cell/common.h"
39 #include "util/u_memory.h"
43 * Given a rendering command's bounding box (in pixels) compute the
44 * location of the corresponding screen tile bounding box.
/**
 * Compute the box of screen tiles covered by a render command's bounds.
 *
 * \param render           render command; its xmin/ymin/xmax/ymax fields
 *                         are read
 * \param txmin            returns the tile column of the box's first tile
 * \param tymin            returns the tile row of the box's first tile
 * \param box_num_tiles    returns the number of tiles in the box
 * \param box_width_tiles  returns the width of the box, in tiles
 *
 * NOTE(review): this view of the function contains two bodies (txmax is
 * declared twice, and the first body never stores *txmin / *tymin here).
 * Presumably they are alternatives selected by preprocessor conditionals
 * that are not visible in this view -- confirm against the full file.
 */
47 tile_bounding_box(const struct cell_command_render
*render
,
48 uint
*txmin
, uint
*tymin
,
49 uint
*box_num_tiles
, uint
*box_width_tiles
)
52 /* Debug: full-window bounding box */
53 uint txmax
= spu
.fb
.width_tiles
- 1;
54 uint tymax
= spu
.fb
.height_tiles
- 1;
/* Debug variant: the box is the whole framebuffer, so it holds every tile
 * and is one full framebuffer row of tiles wide. */
57 *box_num_tiles
= spu
.fb
.width_tiles
* spu
.fb
.height_tiles
;
58 *box_width_tiles
= spu
.fb
.width_tiles
;
/* Variant that derives the box from the command's own bounds. */
63 uint txmax
, tymax
, box_height_tiles
;
/* Convert each bound to a tile index.  The cast binds tighter than '/', so
 * the bound is converted to uint first and the division by TILE_SIZE is an
 * integer division. */
65 *txmin
= (uint
) render
->xmin
/ TILE_SIZE
;
66 *tymin
= (uint
) render
->ymin
/ TILE_SIZE
;
67 txmax
= (uint
) render
->xmax
/ TILE_SIZE
;
68 tymax
= (uint
) render
->ymax
/ TILE_SIZE
;
/* Clamp the max tile indices to the last tile column/row of the framebuffer.
 * NOTE(review): *txmin / *tymin are not clamped here; if a min index could
 * exceed the clamped max, the unsigned subtractions below would wrap --
 * confirm that callers guarantee in-range minimum bounds. */
69 if (txmax
>= spu
.fb
.width_tiles
)
70 txmax
= spu
.fb
.width_tiles
-1;
71 if (tymax
>= spu
.fb
.height_tiles
)
72 tymax
= spu
.fb
.height_tiles
-1;
/* Box dimensions in tiles; the "+ 1" makes both end tiles inclusive. */
73 *box_width_tiles
= txmax
- *txmin
+ 1;
74 box_height_tiles
= tymax
- *tymin
+ 1;
75 *box_num_tiles
= *box_width_tiles
* box_height_tiles
;
/* Diagnostics: print the pixel bounds and the resulting tile bounds, then
 * check that the incoming bounds are ordered (min <= max).
 * NOTE(review): presumably this section is normally compiled out; the
 * directives controlling it are not visible in this view -- confirm. */
78 printf("SPU %u: bounds: %g, %g ... %g, %g\n", spu
.init
.id
,
79 render
->xmin
, render
->ymin
, render
->xmax
, render
->ymax
);
80 printf("SPU %u: tiles: %u, %u .. %u, %u\n",
81 spu
.init
.id
, *txmin
, *tymin
, txmax
, tymax
);
82 ASSERT(render
->xmin
<= render
->xmax
);
83 ASSERT(render
->ymin
<= render
->ymax
);
88 /** Check if the tile at (tx,ty) belongs to this SPU */
/*
 * Ownership test for the tile at column tx, row ty.
 *
 * The tile's row-major index (spu.fb.width_tiles * ty + tx) is reduced
 * modulo spu.init.num_spus and compared with spu.init.id, i.e. tiles are
 * dealt out to the SPUs round-robin in row-major order.
 *
 * Returns the result of that comparison: non-zero when the tile maps to
 * this SPU's id, zero otherwise.
 */
90 my_tile(uint tx
, uint ty
)
92 return (spu
.fb
.width_tiles
* ty
+ tx
) % spu
.init
.num_spus
== spu
.init
.id
;
97 * Start fetching non-clear color/Z tiles from main memory
/*
 * Issue get_tile() requests for the Z and color tiles at (tx, ty).
 *
 * A tile whose current status (spu.cur_ztile_status / spu.cur_ctile_status)
 * is TILE_STATUS_CLEAR is not requested.  Each tile that is requested has
 * its current status set to TILE_STATUS_GETTING.  This function does not
 * itself wait on the read tags.
 *
 * \param tx  tile column
 * \param ty  tile row
 */
100 get_cz_tiles(uint tx
, uint ty
)
/* The Z tile is only considered when depth/stencil reads are enabled. */
102 if (spu
.read_depth_stencil
) {
103 if (spu
.cur_ztile_status
!= TILE_STATUS_CLEAR
) {
104 //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
/* NOTE(review): the last get_tile() argument is 1 for the Z tile and 0 for
 * the color tile below; presumably it selects the depth vs. color surface
 * -- confirm against get_tile(). */
105 get_tile(tx
, ty
, &spu
.ztile
, TAG_READ_TILE_Z
, 1);
106 spu
.cur_ztile_status
= TILE_STATUS_GETTING
;
/* Color tile: requested unless its current status is CLEAR. */
110 if (spu
.cur_ctile_status
!= TILE_STATUS_CLEAR
) {
111 //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
112 get_tile(tx
, ty
, &spu
.ctile
, TAG_READ_TILE_COLOR
, 0);
113 spu
.cur_ctile_status
= TILE_STATUS_GETTING
;
119 * Start putting dirty color/Z tiles back to main memory
/*
 * Issue put_tile() requests for whichever of the Z and color tiles at
 * (tx, ty) is dirty, and settle the current tile status values.
 *
 * For each of the two tiles:
 *  - status TILE_STATUS_DIRTY:   put_tile() is called and the status
 *                                becomes TILE_STATUS_DEFINED;
 *  - status TILE_STATUS_GETTING: nothing is written; the status becomes
 *                                TILE_STATUS_DEFINED;
 *  - any other status is left unchanged.
 *
 * This function does not itself wait on the write tags.
 *
 * \param tx  tile column
 * \param ty  tile row
 */
122 put_cz_tiles(uint tx
, uint ty
)
/* Z tile */
124 if (spu
.cur_ztile_status
== TILE_STATUS_DIRTY
) {
125 /* tile was modified and needs to be written back */
126 //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
/* NOTE(review): the last put_tile() argument is 1 for Z and 0 for color;
 * presumably it selects the depth vs. color surface -- confirm against
 * put_tile(). */
127 put_tile(tx
, ty
, &spu
.ztile
, TAG_WRITE_TILE_Z
, 1);
128 spu
.cur_ztile_status
= TILE_STATUS_DEFINED
;
130 else if (spu
.cur_ztile_status
== TILE_STATUS_GETTING
) {
131 /* tile was never used */
132 spu
.cur_ztile_status
= TILE_STATUS_DEFINED
;
133 //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
/* Color tile: same handling as the Z tile, using the color write tag. */
136 if (spu
.cur_ctile_status
== TILE_STATUS_DIRTY
) {
137 /* tile was modified and needs to be written back */
138 //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
139 put_tile(tx
, ty
, &spu
.ctile
, TAG_WRITE_TILE_COLOR
, 0);
140 spu
.cur_ctile_status
= TILE_STATUS_DEFINED
;
142 else if (spu
.cur_ctile_status
== TILE_STATUS_GETTING
) {
143 /* tile was never used */
144 spu
.cur_ctile_status
= TILE_STATUS_DEFINED
;
145 //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
151 * Wait for 'put' of color/z tiles to complete.
/*
 * Wait on the tags used for writing tiles back: TAG_WRITE_TILE_COLOR
 * always, and TAG_WRITE_TILE_Z only when spu.read_depth_stencil is set.
 * Takes no arguments.
 */
154 wait_put_cz_tiles(void)
/* wait_on_mask() takes a bit mask, hence the (1 << tag) form. */
156 wait_on_mask(1 << TAG_WRITE_TILE_COLOR
);
/* The Z write tag is only waited on when depth/stencil reads are enabled. */
157 if (spu
.read_depth_stencil
) {
158 wait_on_mask(1 << TAG_WRITE_TILE_Z
);
165 * \param pos_incr returns value indicating how many words to skip after
166 * this command in the batch buffer
169 cmd_render(const struct cell_command_render
*render
, uint
*pos_incr
)
171 /* we'll DMA into these buffers */
172 ubyte vertex_data
[CELL_BUFFER_SIZE
] ALIGN16_ATTRIB
;
173 const uint vertex_size
= render
->vertex_size
; /* in bytes */
174 /*const*/ uint total_vertex_bytes
= render
->num_verts
* vertex_size
;
176 const ubyte
*vertices
;
177 const ushort
*indexes
;
181 D_PRINTF(CELL_DEBUG_CMD
,
182 "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
186 render
->inline_verts
);
188 ASSERT(sizeof(*render
) % 4 == 0);
189 ASSERT(total_vertex_bytes
% 16 == 0);
190 ASSERT(render
->prim_type
== PIPE_PRIM_TRIANGLES
);
191 ASSERT(render
->num_indexes
% 3 == 0);
194 /* indexes are right after the render command in the batch buffer */
195 indexes
= (const ushort
*) (render
+ 1);
196 index_bytes
= ROUNDUP8(render
->num_indexes
* 2);
197 *pos_incr
= index_bytes
/ 8 + sizeof(*render
) / 8;
200 if (render
->inline_verts
) {
201 /* Vertices are after indexes in batch buffer at next 16-byte addr */
202 vertices
= (const ubyte
*) render
+ (*pos_incr
* 8);
203 vertices
= (const ubyte
*) align_pointer((void *) vertices
, 16);
204 ASSERT_ALIGN16(vertices
);
205 *pos_incr
= ((vertices
+ total_vertex_bytes
) - (ubyte
*) render
) / 8;
208 /* Begin DMA fetch of vertex buffer */
209 ubyte
*src
= spu
.init
.buffers
[render
->vertex_buf
];
210 ubyte
*dest
= vertex_data
;
212 /* skip vertex data we won't use */
214 src
+= render
->min_index
* vertex_size
;
215 dest
+= render
->min_index
* vertex_size
;
216 total_vertex_bytes
-= render
->min_index
* vertex_size
;
218 ASSERT(total_vertex_bytes
% 16 == 0);
219 ASSERT_ALIGN16(dest
);
222 mfc_get(dest
, /* in vertex_data[] array */
223 (unsigned int) src
, /* src in main memory */
224 total_vertex_bytes
, /* size */
229 vertices
= vertex_data
;
231 wait_on_mask(1 << TAG_VERTEX_BUFFER
);
236 ** find tiles which intersect the prim bounding box
238 uint txmin
, tymin
, box_width_tiles
, box_num_tiles
;
239 tile_bounding_box(render
, &txmin
, &tymin
,
240 &box_num_tiles
, &box_width_tiles
);
243 /* make sure any pending clears have completed */
244 wait_on_mask(1 << TAG_SURFACE_CLEAR
); /* XXX temporary */
250 ** loop over tiles, rendering tris
252 for (i
= 0; i
< box_num_tiles
; i
++) {
253 const uint tx
= txmin
+ i
% box_width_tiles
;
254 const uint ty
= tymin
+ i
/ box_width_tiles
;
256 ASSERT(tx
< spu
.fb
.width_tiles
);
257 ASSERT(ty
< spu
.fb
.height_tiles
);
259 if (!my_tile(tx
, ty
))
264 spu
.cur_ctile_status
= spu
.ctile_status
[ty
][tx
];
265 spu
.cur_ztile_status
= spu
.ztile_status
[ty
][tx
];
267 get_cz_tiles(tx
, ty
);
271 const qword vertex_sizes
= (qword
)spu_splats(vertex_size
);
272 const qword verticess
= (qword
)spu_splats((uint
)vertices
);
274 ASSERT_ALIGN16(&indexes
[0]);
276 const uint num_indexes
= render
->num_indexes
;
279 * &indexes[0] will be 16 byte aligned. This loop is heavily unrolled
280 * avoiding variable rotates when extracting vertex indices.
282 for (j
= 0; j
< num_indexes
; j
+= 24) {
283 /* Load three vectors, containing 24 ushort indices */
284 const qword
* lower_qword
= (qword
*)&indexes
[j
];
285 const qword indices0
= lower_qword
[0];
286 const qword indices1
= lower_qword
[1];
287 const qword indices2
= lower_qword
[2];
289 /* stores three indices for each tri n in slots 0, 1 and 2 of vsn */
290 /* Straightforward rotates for these */
291 qword vs0
= indices0
;
292 qword vs1
= si_shlqbyi(indices0
, 6);
293 qword vs3
= si_shlqbyi(indices1
, 2);
294 qword vs4
= si_shlqbyi(indices1
, 8);
295 qword vs6
= si_shlqbyi(indices2
, 4);
296 qword vs7
= si_shlqbyi(indices2
, 10);
298 /* For tri 2 and 5, the three indices are split across two machine
299 * words - rotate and combine */
300 const qword tmp2a
= si_shlqbyi(indices0
, 12);
301 const qword tmp2b
= si_rotqmbyi(indices1
, 12|16);
302 qword vs2
= si_selb(tmp2a
, tmp2b
, si_fsmh(si_from_uint(0x20)));
304 const qword tmp5a
= si_shlqbyi(indices1
, 14);
305 const qword tmp5b
= si_rotqmbyi(indices2
, 14|16);
306 qword vs5
= si_selb(tmp5a
, tmp5b
, si_fsmh(si_from_uint(0x60)));
308 /* unpack indices from halfword slots to word slots */
309 vs0
= si_shufb(vs0
, vs0
, SHUFB8(0,A
,0,B
,0,C
,0,0));
310 vs1
= si_shufb(vs1
, vs1
, SHUFB8(0,A
,0,B
,0,C
,0,0));
311 vs2
= si_shufb(vs2
, vs2
, SHUFB8(0,A
,0,B
,0,C
,0,0));
312 vs3
= si_shufb(vs3
, vs3
, SHUFB8(0,A
,0,B
,0,C
,0,0));
313 vs4
= si_shufb(vs4
, vs4
, SHUFB8(0,A
,0,B
,0,C
,0,0));
314 vs5
= si_shufb(vs5
, vs5
, SHUFB8(0,A
,0,B
,0,C
,0,0));
315 vs6
= si_shufb(vs6
, vs6
, SHUFB8(0,A
,0,B
,0,C
,0,0));
316 vs7
= si_shufb(vs7
, vs7
, SHUFB8(0,A
,0,B
,0,C
,0,0));
318 /* Calculate address of vertex in vertices[] */
319 vs0
= si_mpya(vs0
, vertex_sizes
, verticess
);
320 vs1
= si_mpya(vs1
, vertex_sizes
, verticess
);
321 vs2
= si_mpya(vs2
, vertex_sizes
, verticess
);
322 vs3
= si_mpya(vs3
, vertex_sizes
, verticess
);
323 vs4
= si_mpya(vs4
, vertex_sizes
, verticess
);
324 vs5
= si_mpya(vs5
, vertex_sizes
, verticess
);
325 vs6
= si_mpya(vs6
, vertex_sizes
, verticess
);
326 vs7
= si_mpya(vs7
, vertex_sizes
, verticess
);
328 /* Select the appropriate call based on the number of vertices
330 switch(num_indexes
- j
) {
331 default: drawn
+= tri_draw(vs7
, tx
, ty
);
332 case 21: drawn
+= tri_draw(vs6
, tx
, ty
);
333 case 18: drawn
+= tri_draw(vs5
, tx
, ty
);
334 case 15: drawn
+= tri_draw(vs4
, tx
, ty
);
335 case 12: drawn
+= tri_draw(vs3
, tx
, ty
);
336 case 9: drawn
+= tri_draw(vs2
, tx
, ty
);
337 case 6: drawn
+= tri_draw(vs1
, tx
, ty
);
338 case 3: drawn
+= tri_draw(vs0
, tx
, ty
);
342 //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
344 /* write color/z tiles back to main framebuffer, if dirtied */
345 put_cz_tiles(tx
, ty
);
347 wait_put_cz_tiles(); /* XXX seems unnecessary... */
349 spu
.ctile_status
[ty
][tx
] = spu
.cur_ctile_status
;
350 spu
.ztile_status
[ty
][tx
] = spu
.cur_ztile_status
;
353 D_PRINTF(CELL_DEBUG_CMD
,
354 "RENDER done (%u tiles hit)\n",