X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fllvmpipe%2Flp_rast.c;h=9d4f9f8d0276af5d151cc67a1f478e3fa6115861;hb=c7c59f75e59510be890bc77a9257c14ffd5b7b59;hp=62a82e307882209f51bf7af4d520c871f4537e84;hpb=dc5dc4fd943e1da5207e0489c355e9a7ba1dff87;p=mesa.git diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index 62a82e30788..9d4f9f8d027 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -31,16 +31,20 @@ #include "util/u_rect.h" #include "util/u_surface.h" #include "util/u_pack_color.h" +#include "util/u_string.h" +#include "util/u_thread.h" -#include "os/os_time.h" +#include "util/os_time.h" #include "lp_scene_queue.h" +#include "lp_context.h" #include "lp_debug.h" #include "lp_fence.h" #include "lp_perf.h" #include "lp_query.h" #include "lp_rast.h" #include "lp_rast_priv.h" +#include "gallivm/lp_bld_format.h" #include "gallivm/lp_bld_debug.h" #include "lp_scene.h" #include "lp_tex_sample.h" @@ -61,7 +65,6 @@ static void lp_rast_begin( struct lp_rasterizer *rast, struct lp_scene *scene ) { - rast->curr_scene = scene; LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); @@ -81,7 +84,7 @@ lp_rast_end( struct lp_rasterizer *rast ) /** - * Begining rasterization of a tile. + * Beginning rasterization of a tile. * \param x window X position of the tile, in pixels * \param y window Y position of the tile, in pixels */ @@ -90,6 +93,9 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, const struct cmd_bin *bin, int x, int y) { + unsigned i; + struct lp_scene *scene = task->scene; + LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y); task->bin = bin; @@ -100,9 +106,21 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, task->height = TILE_SIZE + y * TILE_SIZE > task->scene->fb.height ? task->scene->fb.height - y * TILE_SIZE : TILE_SIZE; - /* reset pointers to color and depth tile(s) */ - memset(task->color_tiles, 0, sizeof(task->color_tiles)); - task->depth_tile = NULL; + task->thread_data.vis_counter = 0; + task->thread_data.ps_invocations = 0; + + for (i = 0; i < task->scene->fb.nr_cbufs; i++) { + if (task->scene->fb.cbufs[i]) { + task->color_tiles[i] = scene->cbufs[i].map + + scene->cbufs[i].stride * task->y + + scene->cbufs[i].format_bytes * task->x; + } + } + if (task->scene->fb.zsbuf) { + task->depth_tile = scene->zsbuf.map + + scene->zsbuf.stride * task->y + + scene->zsbuf.format_bytes * task->x; + } } @@ -116,85 +134,42 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_scene *scene = task->scene; + unsigned cbuf = arg.clear_rb->cbuf; + union util_color uc; + enum pipe_format format; - if (scene->fb.nr_cbufs) { - unsigned i; - union util_color uc; - - if (util_format_is_pure_integer(scene->fb.cbufs[0]->format)) { - /* - * We expect int/uint clear values here, though some APIs - * might disagree (but in any case util_pack_color() - * couldn't handle it)... - */ - LP_DBG(DEBUG_RAST, "%s pure int 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, - arg.clear_color.ui[0], - arg.clear_color.ui[1], - arg.clear_color.ui[2], - arg.clear_color.ui[3]); - - for (i = 0; i < scene->fb.nr_cbufs; i++) { - enum pipe_format format = scene->fb.cbufs[i]->format; - - if (util_format_is_pure_sint(format)) { - util_format_write_4i(format, arg.clear_color.i, 0, &uc, 0, 0, 0, 1, 1); - } - else { - assert(util_format_is_pure_uint(format)); - util_format_write_4ui(format, arg.clear_color.ui, 0, &uc, 0, 0, 0, 1, 1); - } - - util_fill_box(scene->cbufs[i].map, - format, - scene->cbufs[i].stride, - scene->cbufs[i].layer_stride, - task->x, - task->y, - 0, - task->width, - task->height, - scene->fb_max_layer + 1, - &uc); - } - } - else { - uint8_t clear_color[4]; - - for (i = 0; i < 4; ++i) { - clear_color[i] = float_to_ubyte(arg.clear_color.f[i]); - } + /* we never bin clear commands for non-existing buffers */ + assert(cbuf < scene->fb.nr_cbufs); + assert(scene->fb.cbufs[cbuf]); - LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, - clear_color[0], - clear_color[1], - clear_color[2], - clear_color[3]); - - for (i = 0; i < scene->fb.nr_cbufs; i++) { - util_pack_color(arg.clear_color.f, - scene->fb.cbufs[i]->format, &uc); - - util_fill_box(scene->cbufs[i].map, - scene->fb.cbufs[i]->format, - scene->cbufs[i].stride, - scene->cbufs[i].layer_stride, - task->x, - task->y, - 0, - task->width, - task->height, - scene->fb_max_layer + 1, - &uc); - } - } - } + format = scene->fb.cbufs[cbuf]->format; + uc = arg.clear_rb->color_val; + /* + * this is pretty rough since we have target format (bunch of bytes...) here. + * dump it as raw 4 dwords. + */ + LP_DBG(DEBUG_RAST, "%s clear value (target format %d) raw 0x%x,0x%x,0x%x,0x%x\n", + __FUNCTION__, format, uc.ui[0], uc.ui[1], uc.ui[2], uc.ui[3]); + + + util_fill_box(scene->cbufs[cbuf].map, + format, + scene->cbufs[cbuf].stride, + scene->cbufs[cbuf].layer_stride, + task->x, + task->y, + 0, + task->width, + task->height, + scene->fb_max_layer + 1, + &uc); + + /* this will increase for each rb which probably doesn't mean much */ LP_COUNT(nr_color_tile_clear); } - - /** * Clear the rasterizer's current z/stencil tile. * This is a bin command called during bin processing. @@ -225,7 +200,7 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, if (scene->fb.zsbuf) { unsigned layer; - uint8_t *dst_layer = lp_rast_get_unswizzled_depth_tile_pointer(task, LP_TEX_USAGE_READ_WRITE); + uint8_t *dst_layer = task->depth_tile; block_size = util_format_get_blocksize(scene->fb.zsbuf->format); clear_value &= clear_mask; @@ -352,18 +327,27 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, /* color buffer */ for (i = 0; i < scene->fb.nr_cbufs; i++){ - stride[i] = scene->cbufs[i].stride; - color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, tile_x + x, - tile_y + y, inputs->layer); + if (scene->fb.cbufs[i]) { + stride[i] = scene->cbufs[i].stride; + color[i] = lp_rast_get_color_block_pointer(task, i, tile_x + x, + tile_y + y, inputs->layer); + } + else { + stride[i] = 0; + color[i] = NULL; + } } /* depth buffer */ if (scene->zsbuf.map) { - depth = lp_rast_get_unswizzled_depth_block_pointer(task, tile_x + x, - tile_y + y, inputs->layer); + depth = lp_rast_get_depth_block_pointer(task, tile_x + x, + tile_y + y, inputs->layer); depth_stride = scene->zsbuf.stride; } + /* Propagate non-interpolated raster state. */ + task->thread_data.raster_state.viewport_index = inputs->viewport_index; + /* run shader on 4x4 block */ BEGIN_JIT_CALL(state, task); variant->jit_function[RAST_WHOLE]( &state->jit_context, @@ -438,14 +422,21 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, /* color buffer */ for (i = 0; i < scene->fb.nr_cbufs; i++) { - stride[i] = scene->cbufs[i].stride; - color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, x, y, inputs->layer); + if (scene->fb.cbufs[i]) { + stride[i] = scene->cbufs[i].stride; + color[i] = lp_rast_get_color_block_pointer(task, i, x, y, + inputs->layer); + } + else { + stride[i] = 0; + color[i] = NULL; + } } /* depth buffer */ if (scene->zsbuf.map) { depth_stride = scene->zsbuf.stride; - depth = lp_rast_get_unswizzled_depth_block_pointer(task, x, y, inputs->layer); + depth = lp_rast_get_depth_block_pointer(task, x, y, inputs->layer); } assert(lp_check_alignment(state->jit_context.u8_blend_color, 16)); @@ -455,10 +446,9 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, * allocated 4x4 blocks hence need to filter them out here. */ if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) { - if (task->query[PIPE_QUERY_PIPELINE_STATISTICS]) { - /* not very accurate would need a popcount on the mask */ - task->ps_invocations++; - } + /* Propagate non-interpolated raster state. */ + task->thread_data.raster_state.viewport_index = inputs->viewport_index; + /* run shader on 4x4 block */ BEGIN_JIT_CALL(state, task); variant->jit_function[RAST_EDGE_TEST](&state->jit_context, @@ -490,28 +480,19 @@ lp_rast_begin_query(struct lp_rasterizer_task *task, { struct llvmpipe_query *pq = arg.query_obj; - assert(task->query[pq->type] == NULL); - switch (pq->type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - task->thread_data.vis_counter = 0; + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + pq->start[task->thread_index] = task->thread_data.vis_counter; break; case PIPE_QUERY_PIPELINE_STATISTICS: - task->ps_invocations = 0; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - case PIPE_QUERY_TIMESTAMP_DISJOINT: + pq->start[task->thread_index] = task->thread_data.ps_invocations; break; default: assert(0); break; } - - task->query[pq->type] = pq; } @@ -525,36 +506,27 @@ lp_rast_end_query(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { struct llvmpipe_query *pq = arg.query_obj; - assert(task->query[pq->type] == pq || - pq->type == PIPE_QUERY_TIMESTAMP || - pq->type == PIPE_QUERY_GPU_FINISHED); switch (pq->type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - pq->count[task->thread_index] += task->thread_data.vis_counter; + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + pq->end[task->thread_index] += + task->thread_data.vis_counter - pq->start[task->thread_index]; + pq->start[task->thread_index] = 0; break; case PIPE_QUERY_TIMESTAMP: - pq->count[task->thread_index] = os_time_get_nano(); + pq->end[task->thread_index] = os_time_get_nano(); break; case PIPE_QUERY_PIPELINE_STATISTICS: - pq->count[task->thread_index] += task->ps_invocations; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - case PIPE_QUERY_TIMESTAMP_DISJOINT: - case PIPE_QUERY_GPU_FINISHED: + pq->end[task->thread_index] += + task->thread_data.ps_invocations - pq->start[task->thread_index]; + pq->start[task->thread_index] = 0; break; default: assert(0); break; } - - if (task->query[pq->type] == pq) { - task->query[pq->type] = NULL; - } } @@ -575,10 +547,8 @@ lp_rast_tile_end(struct lp_rasterizer_task *task) { unsigned i; - for (i = 0; i < PIPE_QUERY_TYPES; ++i) { - if (task->query[i]) { - lp_rast_end_query(task, lp_rast_arg_query(task->query[i])); - } + for (i = 0; i < task->scene->num_active_queries; ++i) { + lp_rast_end_query(task, lp_rast_arg_query(task->scene->active_queries[i])); } /* debug */ @@ -608,6 +578,17 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] = lp_rast_begin_query, lp_rast_end_query, lp_rast_set_state, + lp_rast_triangle_32_1, + lp_rast_triangle_32_2, + lp_rast_triangle_32_3, + lp_rast_triangle_32_4, + lp_rast_triangle_32_5, + lp_rast_triangle_32_6, + lp_rast_triangle_32_7, + lp_rast_triangle_32_8, + lp_rast_triangle_32_3_4, + lp_rast_triangle_32_3_16, + lp_rast_triangle_32_4_16 }; @@ -683,7 +664,18 @@ rasterize_scene(struct lp_rasterizer_task *task, { task->scene = scene; - if (!task->rast->no_rast && !scene->discard) { + /* Clear the cache tags. This should not always be necessary but + simpler for now. */ +#if LP_USE_TEXTURE_CACHE + memset(task->thread_data.cache->cache_tags, 0, + sizeof(task->thread_data.cache->cache_tags)); +#if LP_BUILD_FORMAT_CACHE_DEBUG + task->thread_data.cache->cache_access_total = 0; + task->thread_data.cache->cache_access_miss = 0; +#endif +#endif + + if (!task->rast->no_rast) { /* loop over scene bins, rasterize each */ { struct cmd_bin *bin; @@ -698,6 +690,20 @@ rasterize_scene(struct lp_rasterizer_task *task, } +#if LP_BUILD_FORMAT_CACHE_DEBUG + { + uint64_t total, miss; + total = task->thread_data.cache->cache_access_total; + miss = task->thread_data.cache->cache_access_miss; + if (total) { + debug_printf("thread %d cache access %llu miss %llu hit rate %f\n", + task->thread_index, (long long unsigned)total, + (long long unsigned)miss, + (float)(total - miss)/(float)total); + } + } +#endif + if (scene->fence) { lp_fence_signal(scene->fence); } @@ -717,6 +723,12 @@ lp_rast_queue_scene( struct lp_rasterizer *rast, if (rast->num_threads == 0) { /* no threading */ + unsigned fpstate = util_fpstate_get(); + + /* Make sure that denorms are treated like zeros. This is + * the behavior required by D3D10. OpenGL doesn't care. + */ + util_fpstate_set_denorms_to_zero(fpstate); lp_rast_begin( rast, scene ); @@ -724,6 +736,8 @@ lp_rast_queue_scene( struct lp_rasterizer *rast, lp_rast_end( rast ); + util_fpstate_set(fpstate); + rast->curr_scene = NULL; } else { @@ -766,11 +780,23 @@ lp_rast_finish( struct lp_rasterizer *rast ) * 2. do work * 3. signal that we're done */ -static PIPE_THREAD_ROUTINE( thread_function, init_data ) +static int +thread_function(void *init_data) { struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data; struct lp_rasterizer *rast = task->rast; boolean debug = false; + char thread_name[16]; + unsigned fpstate; + + util_snprintf(thread_name, sizeof thread_name, "llvmpipe-%u", task->thread_index); + u_thread_setname(thread_name); + + /* Make sure that denorms are treated like zeros. This is + * the behavior required by D3D10. OpenGL doesn't care. + */ + fpstate = util_fpstate_get(); + util_fpstate_set_denorms_to_zero(fpstate); while (1) { /* wait for work */ @@ -793,7 +819,7 @@ static PIPE_THREAD_ROUTINE( thread_function, init_data ) /* Wait for all threads to get here so that threads[1+] don't * get a null rast->curr_scene pointer. */ - pipe_barrier_wait( &rast->barrier ); + util_barrier_wait( &rast->barrier ); /* do work */ if (debug) @@ -803,7 +829,7 @@ static PIPE_THREAD_ROUTINE( thread_function, init_data ) rast->curr_scene); /* wait for all threads to finish with this scene */ - pipe_barrier_wait( &rast->barrier ); + util_barrier_wait( &rast->barrier ); /* XXX: shouldn't be necessary: */ @@ -818,7 +844,11 @@ static PIPE_THREAD_ROUTINE( thread_function, init_data ) pipe_semaphore_signal(&task->work_done); } - return NULL; +#ifdef _WIN32 + pipe_semaphore_signal(&task->work_done); +#endif + + return 0; } @@ -834,7 +864,7 @@ create_rast_threads(struct lp_rasterizer *rast) for (i = 0; i < rast->num_threads; i++) { pipe_semaphore_init(&rast->tasks[i].work_ready, 0); pipe_semaphore_init(&rast->tasks[i].work_done, 0); - rast->threads[i] = pipe_thread_create(thread_function, + rast->threads[i] = u_thread_create(thread_function, (void *) &rast->tasks[i]); } } @@ -862,10 +892,15 @@ lp_rast_create( unsigned num_threads ) goto no_full_scenes; } - for (i = 0; i < Elements(rast->tasks); i++) { + for (i = 0; i < MAX2(1, num_threads); i++) { struct lp_rasterizer_task *task = &rast->tasks[i]; task->rast = rast; task->thread_index = i; + task->thread_data.cache = align_malloc(sizeof(struct lp_build_format_cache), + 16); + if (!task->thread_data.cache) { + goto no_thread_data_cache; + } } rast->num_threads = num_threads; @@ -875,12 +910,22 @@ lp_rast_create( unsigned num_threads ) create_rast_threads(rast); /* for synchronizing rasterization threads */ - pipe_barrier_init( &rast->barrier, rast->num_threads ); + if (rast->num_threads > 0) { + util_barrier_init( &rast->barrier, rast->num_threads ); + } memset(lp_dummy_tile, 0, sizeof lp_dummy_tile); return rast; +no_thread_data_cache: + for (i = 0; i < MAX2(1, rast->num_threads); i++) { + if (rast->tasks[i].thread_data.cache) { + align_free(rast->tasks[i].thread_data.cache); + } + } + + lp_scene_queue_destroy(rast->full_scenes); no_full_scenes: FREE(rast); no_rast: @@ -903,9 +948,15 @@ void lp_rast_destroy( struct lp_rasterizer *rast ) pipe_semaphore_signal(&rast->tasks[i].work_ready); } - /* Wait for threads to terminate before cleaning up per-thread data */ + /* Wait for threads to terminate before cleaning up per-thread data. + * We don't actually call pipe_thread_wait to avoid dead lock on Windows + * per https://bugs.freedesktop.org/show_bug.cgi?id=76252 */ for (i = 0; i < rast->num_threads; i++) { - pipe_thread_wait(rast->threads[i]); +#ifdef _WIN32 + pipe_semaphore_wait(&rast->tasks[i].work_done); +#else + thrd_join(rast->threads[i], NULL); +#endif } /* Clean up per-thread data */ @@ -913,9 +964,14 @@ void lp_rast_destroy( struct lp_rasterizer *rast ) pipe_semaphore_destroy(&rast->tasks[i].work_ready); pipe_semaphore_destroy(&rast->tasks[i].work_done); } + for (i = 0; i < MAX2(1, rast->num_threads); i++) { + align_free(rast->tasks[i].thread_data.cache); + } /* for synchronizing rasterization threads */ - pipe_barrier_destroy( &rast->barrier ); + if (rast->num_threads > 0) { + util_barrier_destroy( &rast->barrier ); + } lp_scene_queue_destroy(rast->full_scenes); @@ -923,11 +979,3 @@ void lp_rast_destroy( struct lp_rasterizer *rast ) } -/** Return number of rasterization threads */ -unsigned -lp_rast_get_num_threads( struct lp_rasterizer *rast ) -{ - return rast->num_threads; -} - -