radeonsi: upload constants into VRAM instead of GTT
author Marek Olšák <marek.olsak@amd.com>
Wed, 15 Feb 2017 18:50:15 +0000 (19:50 +0100)
committer Marek Olšák <marek.olsak@amd.com>
Sat, 18 Feb 2017 00:22:08 +0000 (01:22 +0100)
This lowers lgkm wait cycles by 30% on VI under normal conditions.
There might be a measurable improvement when CE is disabled (radeon)
or under L2 thrashing.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeon/r600_pipe_common.c
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_state.c

index d573b39d7c06c661471e1277ea961e1f55edc196..1781584f5ffdaabd0e30b04a5cf272d1f48250b1 100644 (file)
@@ -607,7 +607,11 @@ bool r600_common_context_init(struct r600_common_context *rctx,
                                                  0, PIPE_USAGE_STREAM);
        if (!rctx->b.stream_uploader)
                return false;
-       rctx->b.const_uploader = rctx->b.stream_uploader;
+
+       rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024,
+                                                0, PIPE_USAGE_DEFAULT);
+       if (!rctx->b.const_uploader)
+               return false;
 
        rctx->ctx = rctx->ws->ctx_create(rctx->ws);
        if (!rctx->ctx)
@@ -649,9 +653,10 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
        if (rctx->ctx)
                rctx->ws->ctx_destroy(rctx->ctx);
 
-       if (rctx->b.stream_uploader) {
+       if (rctx->b.stream_uploader)
                u_upload_destroy(rctx->b.stream_uploader);
-       }
+       if (rctx->b.const_uploader)
+               u_upload_destroy(rctx->b.const_uploader);
 
        slab_destroy_child(&rctx->pool_transfers);
 
index 381837c8a57b9a902f9885903033fd10d5b586ad..88d72c1ea2aff8c8b6e78fbe3eb85191efbfc40f 100644 (file)
@@ -503,7 +503,7 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
 
                dispatch.kernarg_address = kernel_args_va;
 
-               u_upload_data(sctx->b.b.stream_uploader, 0, sizeof(dispatch),
+               u_upload_data(sctx->b.b.const_uploader, 0, sizeof(dispatch),
                               256, &dispatch, &dispatch_offset,
                               (struct pipe_resource**)&dispatch_buf);
 
@@ -565,7 +565,7 @@ static void si_upload_compute_input(struct si_context *sctx,
        /* The extra num_work_size_bytes are for work group / work item size information */
        kernel_args_size = program->input_size + num_work_size_bytes;
 
-       u_upload_alloc(sctx->b.b.stream_uploader, 0, kernel_args_size,
+       u_upload_alloc(sctx->b.b.const_uploader, 0, kernel_args_size,
                       sctx->screen->b.info.tcc_cache_line_size,
                       &kernel_args_offset,
                       (struct pipe_resource**)&input_buffer, &kernel_args_ptr);
index b4f1fbfb2131b06f4f6e884acc8b0c5b7ac2f617..a41b243eec8da5d40e7dcc1c53f98cf12d56fd06 100644 (file)
@@ -235,7 +235,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
        } else {
                void *ptr;
 
-               u_upload_alloc(sctx->b.b.stream_uploader, 0, list_size,
+               u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
                               sctx->screen->b.info.tcc_cache_line_size,
                               &desc->buffer_offset,
                               (struct pipe_resource**)&desc->buffer, &ptr);
@@ -963,7 +963,7 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
         * directly through a staging buffer and don't go through
         * the fine-grained upload path.
         */
-       u_upload_alloc(sctx->b.b.stream_uploader, 0,
+       u_upload_alloc(sctx->b.b.const_uploader, 0,
                       desc_list_byte_size,
                       si_optimal_tcc_alignment(sctx, desc_list_byte_size),
                       &desc->buffer_offset,
@@ -1051,7 +1051,7 @@ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuf
 {
        void *tmp;
 
-       u_upload_alloc(sctx->b.b.stream_uploader, 0, size,
+       u_upload_alloc(sctx->b.b.const_uploader, 0, size,
                       si_optimal_tcc_alignment(sctx, size),
                       const_offset,
                       (struct pipe_resource**)rbuffer, &tmp);
index f53f8dd8ee972791f21e99534cb1b5cb5a632082..81592a7e8587b2210b517be3de22baed294b3ade 100644 (file)
@@ -3526,8 +3526,11 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
                                assert(src->stride == 0);
 
                                /* Assume the attrib has 4 dwords like the vbo
-                                * module. This is also a good upper bound. */
-                               u_upload_data(sctx->b.b.stream_uploader, 0, 16, 16,
+                                * module. This is also a good upper bound.
+                                *
+                                * Use const_uploader to upload into VRAM directly.
+                                */
+                               u_upload_data(sctx->b.b.const_uploader, 0, 16, 16,
                                              src->user_buffer,
                                              &dsti->buffer_offset,
                                              &dsti->buffer);