1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "util/u_memory.h"
29 #include "util/u_math.h"
30 #include "util/u_cpu_detect.h"
31 #include "util/u_surface.h"
33 #include "lp_bin_queue.h"
37 #include "lp_rast_priv.h"
38 #include "lp_tile_soa.h"
39 #include "lp_bld_debug.h"
45 * Called by rasterization threads to get the next chunk of work.
46 * We use a lock to make sure that all the threads get the same bins.
48 static struct lp_bins
*
49 get_next_full_bin( struct lp_rasterizer
*rast
)
51 pipe_mutex_lock( rast
->get_bin_mutex
);
52 if (!rast
->curr_bins
) {
53 /* this will wait until there's something in the queue */
54 rast
->curr_bins
= lp_bins_dequeue( rast
->full_bins
);
55 rast
->release_count
= 0;
57 lp_bin_iter_begin( rast
->curr_bins
);
59 pipe_mutex_unlock( rast
->get_bin_mutex
);
60 return rast
->curr_bins
;
65 * Called by rasterization threads after they've finished with
66 * the current bin. When all threads have called this, we reset
67 * the bin and put it into the 'empty bins' queue.
70 release_current_bin( struct lp_rasterizer
*rast
)
72 pipe_mutex_lock( rast
->get_bin_mutex
);
73 rast
->release_count
++;
74 if (rast
->release_count
== rast
->num_threads
) {
75 assert(rast
->curr_bins
);
76 lp_reset_bins( rast
->curr_bins
);
77 lp_bins_enqueue( rast
->empty_bins
, rast
->curr_bins
);
78 rast
->curr_bins
= NULL
;
80 pipe_mutex_unlock( rast
->get_bin_mutex
);
86 * Begin the rasterization phase.
87 * Map the framebuffer surfaces. Initialize the 'rast' state.
90 lp_rast_begin( struct lp_rasterizer
*rast
,
91 const struct pipe_framebuffer_state
*fb
,
93 boolean write_zstencil
)
95 struct pipe_screen
*screen
= rast
->screen
;
96 struct pipe_surface
*cbuf
, *zsbuf
;
98 LP_DBG(DEBUG_RAST
, "%s\n", __FUNCTION__
);
100 util_copy_framebuffer_state(&rast
->state
.fb
, fb
);
102 rast
->state
.write_zstencil
= write_zstencil
;
103 rast
->state
.write_color
= write_color
;
105 rast
->check_for_clipped_tiles
= (fb
->width
% TILE_SIZE
!= 0 ||
106 fb
->height
% TILE_SIZE
!= 0);
108 /* XXX support multiple color buffers here */
109 cbuf
= rast
->state
.fb
.cbufs
[0];
111 rast
->cbuf_transfer
= screen
->get_tex_transfer(rast
->screen
,
116 PIPE_TRANSFER_READ_WRITE
,
118 fb
->width
, fb
->height
);
119 if (!rast
->cbuf_transfer
)
122 rast
->cbuf_map
= screen
->transfer_map(rast
->screen
,
123 rast
->cbuf_transfer
);
128 zsbuf
= rast
->state
.fb
.zsbuf
;
130 rast
->zsbuf_transfer
= screen
->get_tex_transfer(rast
->screen
,
135 PIPE_TRANSFER_READ_WRITE
,
137 fb
->width
, fb
->height
);
138 if (!rast
->zsbuf_transfer
)
141 rast
->zsbuf_map
= screen
->transfer_map(rast
->screen
,
142 rast
->zsbuf_transfer
);
143 if (!rast
->zsbuf_map
)
152 * Finish the rasterization phase.
153 * Unmap framebuffer surfaces.
156 lp_rast_end( struct lp_rasterizer
*rast
)
158 struct pipe_screen
*screen
= rast
->screen
;
161 screen
->transfer_unmap(screen
, rast
->cbuf_transfer
);
164 screen
->transfer_unmap(screen
, rast
->zsbuf_transfer
);
166 if (rast
->cbuf_transfer
)
167 screen
->tex_transfer_destroy(rast
->cbuf_transfer
);
169 if (rast
->zsbuf_transfer
)
170 screen
->tex_transfer_destroy(rast
->zsbuf_transfer
);
172 rast
->cbuf_transfer
= NULL
;
173 rast
->zsbuf_transfer
= NULL
;
174 rast
->cbuf_map
= NULL
;
175 rast
->zsbuf_map
= NULL
;
180 * Begining rasterization of a tile.
181 * \param x window X position of the tile, in pixels
182 * \param y window Y position of the tile, in pixels
185 lp_rast_start_tile( struct lp_rasterizer
*rast
,
186 unsigned thread_index
,
187 unsigned x
, unsigned y
)
189 LP_DBG(DEBUG_RAST
, "%s %d,%d\n", __FUNCTION__
, x
, y
);
191 rast
->tasks
[thread_index
].x
= x
;
192 rast
->tasks
[thread_index
].y
= y
;
197 * Clear the rasterizer's current color tile.
198 * This is a bin command called during bin processing.
200 void lp_rast_clear_color( struct lp_rasterizer
*rast
,
201 unsigned thread_index
,
202 const union lp_rast_cmd_arg arg
)
204 const uint8_t *clear_color
= arg
.clear_color
;
205 uint8_t *color_tile
= rast
->tasks
[thread_index
].tile
.color
;
207 LP_DBG(DEBUG_RAST
, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__
,
213 if (clear_color
[0] == clear_color
[1] &&
214 clear_color
[1] == clear_color
[2] &&
215 clear_color
[2] == clear_color
[3]) {
216 memset(color_tile
, clear_color
[0], TILE_SIZE
* TILE_SIZE
* 4);
220 for (y
= 0; y
< TILE_SIZE
; y
++)
221 for (x
= 0; x
< TILE_SIZE
; x
++)
222 for (chan
= 0; chan
< 4; ++chan
)
223 TILE_PIXEL(color_tile
, x
, y
, chan
) = clear_color
[chan
];
229 * Clear the rasterizer's current z/stencil tile.
230 * This is a bin command called during bin processing.
232 void lp_rast_clear_zstencil( struct lp_rasterizer
*rast
,
233 unsigned thread_index
,
234 const union lp_rast_cmd_arg arg
)
237 uint32_t *depth_tile
= rast
->tasks
[thread_index
].tile
.depth
;
239 LP_DBG(DEBUG_RAST
, "%s 0x%x\n", __FUNCTION__
, arg
.clear_zstencil
);
241 for (i
= 0; i
< TILE_SIZE
; i
++)
242 for (j
= 0; j
< TILE_SIZE
; j
++)
243 depth_tile
[i
*TILE_SIZE
+ j
] = arg
.clear_zstencil
;
248 * Load tile color from the framebuffer surface.
249 * This is a bin command called during bin processing.
251 void lp_rast_load_color( struct lp_rasterizer
*rast
,
252 unsigned thread_index
,
253 const union lp_rast_cmd_arg arg
)
255 LP_DBG(DEBUG_RAST
, "%s\n", __FUNCTION__
);
257 /* call u_tile func to load colors from surface */
262 * Load tile z/stencil from the framebuffer surface.
263 * This is a bin command called during bin processing.
265 void lp_rast_load_zstencil( struct lp_rasterizer
*rast
,
266 unsigned thread_index
,
267 const union lp_rast_cmd_arg arg
)
269 LP_DBG(DEBUG_RAST
, "%s\n", __FUNCTION__
);
271 /* call u_tile func to load depth (and stencil?) from surface */
275 void lp_rast_set_state( struct lp_rasterizer
*rast
,
276 unsigned thread_index
,
277 const union lp_rast_cmd_arg arg
)
279 const struct lp_rast_state
*state
= arg
.set_state
;
281 LP_DBG(DEBUG_RAST
, "%s %p\n", __FUNCTION__
, (void *) state
);
283 /* just set the current state pointer for this rasterizer */
284 rast
->tasks
[thread_index
].current_state
= state
;
293 * Run the shader on all blocks in a tile. This is used when a tile is
294 * completely contained inside a triangle.
295 * This is a bin command called during bin processing.
297 void lp_rast_shade_tile( struct lp_rasterizer
*rast
,
298 unsigned thread_index
,
299 const union lp_rast_cmd_arg arg
)
301 const struct lp_rast_shader_inputs
*inputs
= arg
.shade_tile
;
302 const unsigned tile_x
= rast
->tasks
[thread_index
].x
;
303 const unsigned tile_y
= rast
->tasks
[thread_index
].y
;
304 const unsigned mask
= ~0;
307 LP_DBG(DEBUG_RAST
, "%s\n", __FUNCTION__
);
309 /* Use the existing preference for 4x4 (four quads) shading:
311 for (y
= 0; y
< TILE_SIZE
; y
+= 4)
312 for (x
= 0; x
< TILE_SIZE
; x
+= 4)
313 lp_rast_shade_quads( rast
,
323 * Compute shading for a 4x4 block of pixels.
324 * This is a bin command called during bin processing.
326 void lp_rast_shade_quads( struct lp_rasterizer
*rast
,
327 unsigned thread_index
,
328 const struct lp_rast_shader_inputs
*inputs
,
329 unsigned x
, unsigned y
,
333 const struct lp_rast_state
*state
= rast
->tasks
[thread_index
].current_state
;
334 struct lp_rast_tile
*tile
= &rast
->tasks
[thread_index
].tile
;
337 uint32_t ALIGN16_ATTRIB masks
[2][2][2][2];
344 assert(x
% TILE_VECTOR_WIDTH
== 0);
345 assert(y
% TILE_VECTOR_HEIGHT
== 0);
347 /* mask: the rasterizer wants to treat pixels in 4x4 blocks, but
348 * the pixel shader wants to swizzle them into 4 2x2 quads.
350 * Additionally, the pixel shader wants masks as full dword ~0,
351 * while the rasterizer wants to pack per-pixel bits tightly.
355 for (qy
= 0; qy
< 2; ++qy
)
356 for (qx
= 0; qx
< 2; ++qx
)
357 for (iy
= 0; iy
< 2; ++iy
)
358 for (ix
= 0; ix
< 2; ++ix
)
359 masks
[qy
][qx
][iy
][ix
] = mask
& (1 << (qy
*8+iy
*4+qx
*2+ix
)) ? ~0 : 0;
361 masks
[0][0][0][0] = mask
& (1 << (0*8+0*4+0*2+0)) ? ~0 : 0;
362 masks
[0][0][0][1] = mask
& (1 << (0*8+0*4+0*2+1)) ? ~0 : 0;
363 masks
[0][0][1][0] = mask
& (1 << (0*8+1*4+0*2+0)) ? ~0 : 0;
364 masks
[0][0][1][1] = mask
& (1 << (0*8+1*4+0*2+1)) ? ~0 : 0;
365 masks
[0][1][0][0] = mask
& (1 << (0*8+0*4+1*2+0)) ? ~0 : 0;
366 masks
[0][1][0][1] = mask
& (1 << (0*8+0*4+1*2+1)) ? ~0 : 0;
367 masks
[0][1][1][0] = mask
& (1 << (0*8+1*4+1*2+0)) ? ~0 : 0;
368 masks
[0][1][1][1] = mask
& (1 << (0*8+1*4+1*2+1)) ? ~0 : 0;
370 masks
[1][0][0][0] = mask
& (1 << (1*8+0*4+0*2+0)) ? ~0 : 0;
371 masks
[1][0][0][1] = mask
& (1 << (1*8+0*4+0*2+1)) ? ~0 : 0;
372 masks
[1][0][1][0] = mask
& (1 << (1*8+1*4+0*2+0)) ? ~0 : 0;
373 masks
[1][0][1][1] = mask
& (1 << (1*8+1*4+0*2+1)) ? ~0 : 0;
374 masks
[1][1][0][0] = mask
& (1 << (1*8+0*4+1*2+0)) ? ~0 : 0;
375 masks
[1][1][0][1] = mask
& (1 << (1*8+0*4+1*2+1)) ? ~0 : 0;
376 masks
[1][1][1][0] = mask
& (1 << (1*8+1*4+1*2+0)) ? ~0 : 0;
377 masks
[1][1][1][1] = mask
& (1 << (1*8+1*4+1*2+1)) ? ~0 : 0;
380 assert((x
% 2) == 0);
381 assert((y
% 2) == 0);
386 /* offset of the 16x16 pixel block within the tile */
387 block_offset
= ((iy
/4)*(16*16) + (ix
/4)*16);
390 color
= tile
->color
+ 4 * block_offset
;
393 depth
= tile
->depth
+ block_offset
;
395 /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
396 assert(lp_check_alignment(masks
, 16));
398 assert(lp_check_alignment(depth
, 16));
399 assert(lp_check_alignment(color
, 16));
400 assert(lp_check_alignment(state
->jit_context
.blend_color
, 16));
403 state
->jit_function( &state
->jit_context
,
412 struct lp_rast_tile
*tile
= &rast
->tile
;
420 for (q
= 0; q
< 4; ++q
)
421 for(iy
= 0; iy
< 2; ++iy
)
422 for(ix
= 0; ix
< 2; ++ix
)
423 if(masks
[q
] & (1 << (iy
*2 + ix
)))
424 for (chan_index
= 0; chan_index
< NUM_CHANNELS
; ++chan_index
)
425 TILE_PIXEL(tile
->color
, x
+ q
*2 + ix
, y
+ iy
, chan_index
) = 0xff;
436 * Write the rasterizer's color tile to the framebuffer.
438 static void lp_rast_store_color( struct lp_rasterizer
*rast
,
439 unsigned thread_index
)
441 const unsigned x
= rast
->tasks
[thread_index
].x
;
442 const unsigned y
= rast
->tasks
[thread_index
].y
;
446 if (x
+ w
> rast
->state
.fb
.width
)
447 w
-= x
+ w
- rast
->state
.fb
.width
;
449 if (y
+ h
> rast
->state
.fb
.height
)
450 h
-= y
+ h
- rast
->state
.fb
.height
;
454 assert(w
<= TILE_SIZE
);
455 assert(h
<= TILE_SIZE
);
457 LP_DBG(DEBUG_RAST
, "%s [%u] %d,%d %dx%d\n", __FUNCTION__
,
458 thread_index
, x
, y
, w
, h
);
460 lp_tile_write_4ub(rast
->cbuf_transfer
->format
,
461 rast
->tasks
[thread_index
].tile
.color
,
463 rast
->cbuf_transfer
->stride
,
/**
 * Copy a w x h rectangle of 32-bit values into a destination buffer.
 *
 * \param src  source values, read consecutively: w values for the first
 *             row, then w values for the next row, and so on
 * \param dst  start of the destination buffer
 * \param dst_stride  distance in bytes between destination rows
 * \param x0, y0  position of the rectangle in the destination, in pixels
 * \param w, h  size of the rectangle, in pixels
 *
 * NOTE(review): source rows are consumed packed (w values apart).  The
 * caller in this file passes a TILE_SIZE-wide tile with a possibly
 * clipped w - confirm that is intended.
 */
static void
lp_tile_write_z32(const uint32_t *src, uint8_t *dst, unsigned dst_stride,
                  unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const size_t row_bytes = w * sizeof *src;
   uint8_t *dst_row = dst + y0*dst_stride + x0 * sizeof *src;
   unsigned y;

   for (y = 0; y < h; ++y) {
      /* byte-wise copy: dst is a byte buffer, so avoid a uint32_t* cast */
      memcpy(dst_row, src, row_bytes);
      src += w;
      dst_row += dst_stride;
   }
}
485 * Write the rasterizer's z/stencil tile to the framebuffer.
487 static void lp_rast_store_zstencil( struct lp_rasterizer
*rast
,
488 unsigned thread_index
)
490 const unsigned x
= rast
->tasks
[thread_index
].x
;
491 const unsigned y
= rast
->tasks
[thread_index
].y
;
492 unsigned w
= TILE_SIZE
;
493 unsigned h
= TILE_SIZE
;
495 if (x
+ w
> rast
->state
.fb
.width
)
496 w
-= x
+ w
- rast
->state
.fb
.width
;
498 if (y
+ h
> rast
->state
.fb
.height
)
499 h
-= y
+ h
- rast
->state
.fb
.height
;
501 LP_DBG(DEBUG_RAST
, "%s %d,%d %dx%d\n", __FUNCTION__
, x
, y
, w
, h
);
503 assert(rast
->zsbuf_transfer
->format
== PIPE_FORMAT_Z32_UNORM
);
504 lp_tile_write_z32(rast
->tasks
[thread_index
].tile
.depth
,
506 rast
->zsbuf_transfer
->stride
,
512 * Write the rasterizer's tiles to the framebuffer.
515 lp_rast_end_tile( struct lp_rasterizer
*rast
,
516 unsigned thread_index
)
518 LP_DBG(DEBUG_RAST
, "%s\n", __FUNCTION__
);
520 if (rast
->state
.write_color
)
521 lp_rast_store_color(rast
, thread_index
);
523 if (rast
->state
.write_zstencil
)
524 lp_rast_store_zstencil(rast
, thread_index
);
529 * Rasterize commands for a single bin.
530 * \param x, y position of the bin's tile in the framebuffer
531 * Must be called between lp_rast_begin() and lp_rast_end().
535 rasterize_bin( struct lp_rasterizer
*rast
,
536 unsigned thread_index
,
537 const struct cmd_bin
*bin
,
540 const struct cmd_block_list
*commands
= &bin
->commands
;
541 struct cmd_block
*block
;
544 lp_rast_start_tile( rast
, thread_index
, x
, y
);
546 /* simply execute each of the commands in the block list */
547 for (block
= commands
->head
; block
; block
= block
->next
) {
548 for (k
= 0; k
< block
->count
; k
++) {
549 block
->cmd
[k
]( rast
, thread_index
, block
->arg
[k
] );
553 lp_rast_end_tile( rast
, thread_index
);
558 * Rasterize/execute all bins.
562 rasterize_bins( struct lp_rasterizer
*rast
,
563 unsigned thread_index
,
564 struct lp_bins
*bins
,
567 /* loop over tile bins, rasterize each */
571 for (i
= 0; i
< bins
->tiles_x
; i
++) {
572 for (j
= 0; j
< bins
->tiles_y
; j
++) {
573 struct cmd_bin
*bin
= lp_get_bin(bins
, i
, j
);
574 rasterize_bin( rast
, thread_index
,
575 bin
, i
* TILE_SIZE
, j
* TILE_SIZE
);
585 while ((bin
= lp_bin_iter_next(bins
, &x
, &y
))) {
586 rasterize_bin( rast
, thread_index
, bin
, x
* TILE_SIZE
, y
* TILE_SIZE
);
594 * Called by setup module when it has something for us to render.
597 lp_rasterize_bins( struct lp_rasterizer
*rast
,
598 struct lp_bins
*bins
,
599 const struct pipe_framebuffer_state
*fb
,
602 boolean debug
= false;
604 LP_DBG(DEBUG_SETUP
, "%s\n", __FUNCTION__
);
608 printf("rasterize bins:\n");
609 printf(" data size: %u\n", lp_bin_data_size(bins
));
610 for (y
= 0; y
< bins
->tiles_y
; y
++) {
611 for (x
= 0; x
< bins
->tiles_x
; x
++) {
612 printf(" bin %u, %u size: %u\n", x
, y
,
613 lp_bin_cmd_size(bins
, x
, y
));
618 lp_rast_begin( rast
, fb
,
620 fb
->zsbuf
!= NULL
&& write_depth
);
622 if (rast
->num_threads
== 0) {
624 lp_bin_iter_begin( bins
);
625 rasterize_bins( rast
, 0, bins
, write_depth
);
627 /* reset bins and put into the empty queue */
628 lp_reset_bins( bins
);
629 lp_bins_enqueue( rast
->empty_bins
, bins
);
632 /* threaded rendering! */
635 lp_bins_enqueue( rast
->full_bins
, bins
);
637 /* XXX need to move/fix these */
639 rast
->write_depth
= write_depth
;
641 /*lp_bin_iter_begin( bins );*/
643 /* signal the threads that there's work to do */
644 for (i
= 0; i
< rast
->num_threads
; i
++) {
645 pipe_semaphore_signal(&rast
->tasks
[i
].work_ready
);
648 /* wait for work to complete */
649 for (i
= 0; i
< rast
->num_threads
; i
++) {
650 pipe_semaphore_wait(&rast
->tasks
[i
].work_done
);
656 LP_DBG(DEBUG_SETUP
, "%s done \n", __FUNCTION__
);
661 * This is the thread's main entrypoint.
662 * It's a simple loop:
665 * 3. signal that we're done
668 thread_func( void *init_data
)
670 struct lp_rasterizer_task
*task
= (struct lp_rasterizer_task
*) init_data
;
671 struct lp_rasterizer
*rast
= task
->rast
;
672 boolean debug
= false;
675 struct lp_bins
*bins
;
679 debug_printf("thread %d waiting for work\n", task
->thread_index
);
680 pipe_semaphore_wait(&task
->work_ready
);
682 bins
= get_next_full_bin( rast
);
687 debug_printf("thread %d doing work\n", task
->thread_index
);
688 rasterize_bins(rast
, task
->thread_index
,
689 bins
, rast
->write_depth
);
691 release_current_bin( rast
);
693 /* signal done with work */
695 debug_printf("thread %d done working\n", task
->thread_index
);
696 pipe_semaphore_signal(&task
->work_done
);
704 * Initialize semaphores and spawn the threads.
707 create_rast_threads(struct lp_rasterizer
*rast
)
711 rast
->num_threads
= util_cpu_caps
.nr_cpus
;
712 rast
->num_threads
= debug_get_num_option("LP_NUM_THREADS", rast
->num_threads
);
713 rast
->num_threads
= MIN2(rast
->num_threads
, MAX_THREADS
);
715 /* NOTE: if num_threads is zero, we won't use any threads */
716 for (i
= 0; i
< rast
->num_threads
; i
++) {
717 pipe_semaphore_init(&rast
->tasks
[i
].work_ready
, 0);
718 pipe_semaphore_init(&rast
->tasks
[i
].work_done
, 0);
719 rast
->threads
[i
] = pipe_thread_create(thread_func
,
720 (void *) &rast
->tasks
[i
]);
727 * Create new lp_rasterizer.
728 * \param empty the queue to put empty bins on after we've finished
731 struct lp_rasterizer
*
732 lp_rast_create( struct pipe_screen
*screen
, struct lp_bins_queue
*empty
)
734 struct lp_rasterizer
*rast
;
737 rast
= CALLOC_STRUCT(lp_rasterizer
);
741 rast
->screen
= screen
;
743 rast
->empty_bins
= empty
;
744 rast
->full_bins
= lp_bins_queue_create();
746 for (i
= 0; i
< Elements(rast
->tasks
); i
++) {
747 rast
->tasks
[i
].tile
.color
= align_malloc( TILE_SIZE
*TILE_SIZE
*4, 16 );
748 rast
->tasks
[i
].tile
.depth
= align_malloc( TILE_SIZE
*TILE_SIZE
*4, 16 );
749 rast
->tasks
[i
].rast
= rast
;
750 rast
->tasks
[i
].thread_index
= i
;
753 create_rast_threads(rast
);
761 void lp_rast_destroy( struct lp_rasterizer
*rast
)
765 util_unreference_framebuffer_state(&rast
->state
.fb
);
767 for (i
= 0; i
< Elements(rast
->tasks
); i
++) {
768 align_free(rast
->tasks
[i
].tile
.depth
);
769 align_free(rast
->tasks
[i
].tile
.color
);