r600: fork and import gallium/radeon
[mesa.git] / src / gallium / drivers / radeonsi / si_compute.c
index 79b107e96fc6f9dd5f5c75711971eb6e18b697f5..3987eecca8e9984f27198ace002745c94e476b33 100644 (file)
@@ -151,6 +151,7 @@ static void *si_create_compute_state(
        struct si_screen *sscreen = (struct si_screen *)ctx->screen;
        struct si_compute *program = CALLOC_STRUCT(si_compute);
 
+       pipe_reference_init(&program->reference, 1);
        program->screen = (struct si_screen *)ctx->screen;
        program->ir_type = cso->ir_type;
        program->local_size = cso->req_local_mem;
@@ -174,7 +175,7 @@ static void *si_create_compute_state(
 
                if ((sctx->b.debug.debug_message && !sctx->b.debug.async) ||
                    sctx->is_debug ||
-                   r600_can_dump_shader(&sscreen->b, PIPE_SHADER_COMPUTE))
+                   si_can_dump_shader(&sscreen->b, PIPE_SHADER_COMPUTE))
                        si_create_compute_state_async(program, -1);
                else
                        util_queue_add_job(&sscreen->shader_compiler_queue,
@@ -265,11 +266,6 @@ static void si_initialize_compute(struct si_context *sctx)
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        uint64_t bc_va;
 
-       radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
-       radeon_emit(cs, 0);
-       radeon_emit(cs, 0);
-       radeon_emit(cs, 0);
-
        radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
        /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
        radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
@@ -332,7 +328,7 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
                r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
 
                sctx->compute_scratch_buffer = (struct r600_resource*)
-                       r600_aligned_buffer_create(&sctx->screen->b.b,
+                       si_aligned_buffer_create(&sctx->screen->b.b,
                                                   R600_RESOURCE_FLAG_UNMAPPABLE,
                                                   PIPE_USAGE_DEFAULT,
                                                   scratch_needed, 256);
@@ -723,6 +719,13 @@ static void si_emit_dispatch_packets(struct si_context *sctx,
        radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
        radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
 
+       unsigned dispatch_initiator =
+               S_00B800_COMPUTE_SHADER_EN(1) |
+               S_00B800_FORCE_START_AT_000(1) |
+               /* If the KMD allows it (there is a KMD hw register for it),
+                * allow launching waves out-of-order. (same as Vulkan) */
+               S_00B800_ORDER_MODE(sctx->b.chip_class >= CIK);
+
        if (info->indirect) {
                uint64_t base_va = r600_resource(info->indirect)->gpu_address;
 
@@ -739,14 +742,14 @@ static void si_emit_dispatch_packets(struct si_context *sctx,
                radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
                                PKT3_SHADER_TYPE_S(1));
                radeon_emit(cs, info->indirect_offset);
-               radeon_emit(cs, 1);
+               radeon_emit(cs, dispatch_initiator);
        } else {
                radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
                                PKT3_SHADER_TYPE_S(1));
                radeon_emit(cs, info->grid[0]);
                radeon_emit(cs, info->grid[1]);
                radeon_emit(cs, info->grid[2]);
-               radeon_emit(cs, 1);
+               radeon_emit(cs, dispatch_initiator);
        }
 }
 
@@ -779,6 +782,11 @@ static void si_launch_grid(
            program->shader.compilation_failed)
                return;
 
+       if (sctx->b.last_num_draw_calls != sctx->b.num_draw_calls) {
+               si_update_fb_dirtiness_after_rendering(sctx);
+               sctx->b.last_num_draw_calls = sctx->b.num_draw_calls;
+       }
+
        si_decompress_compute_textures(sctx);
 
        /* Add buffer sizes for memory checking in need_cs_space. */
@@ -788,8 +796,9 @@ static void si_launch_grid(
        if (info->indirect) {
                r600_context_add_resource_size(ctx, info->indirect);
 
-               /* The hw doesn't read the indirect buffer via TC L2. */
-               if (r600_resource(info->indirect)->TC_L2_dirty) {
+               /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+               if (sctx->b.chip_class <= VI &&
+                   r600_resource(info->indirect)->TC_L2_dirty) {
                        sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
                        r600_resource(info->indirect)->TC_L2_dirty = false;
                }
@@ -797,6 +806,9 @@ static void si_launch_grid(
 
        si_need_cs_space(sctx);
 
+       if (sctx->b.log)
+               si_log_compute_state(sctx, sctx->b.log);
+
        if (!sctx->cs_shader_state.initialized)
                si_initialize_compute(sctx);
 
@@ -808,7 +820,7 @@ static void si_launch_grid(
                return;
 
        si_upload_compute_shader_descriptors(sctx);
-       si_emit_compute_shader_userdata(sctx);
+       si_emit_compute_shader_pointers(sctx);
 
        if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
                sctx->atoms.s.render_cond->emit(&sctx->b,
@@ -837,11 +849,10 @@ static void si_launch_grid(
        if (program->ir_type == PIPE_SHADER_IR_TGSI)
                si_setup_tgsi_grid(sctx, info);
 
-       si_ce_pre_draw_synchronization(sctx);
-
        si_emit_dispatch_packets(sctx, info);
 
-       si_ce_post_draw_synchronization(sctx);
+       if (unlikely(sctx->current_saved_cs))
+               si_trace_emit(sctx);
 
        sctx->compute_is_busy = true;
        sctx->b.num_compute_calls++;
@@ -852,20 +863,24 @@ static void si_launch_grid(
                sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 }
 
+void si_destroy_compute(struct si_compute *program) /* Tear down a compute program: drop its compile job (TGSI only), destroy the embedded shader, free the struct. */
+{ /* `program` must be non-NULL: it is dereferenced unconditionally below. */
+       if (program->ir_type == PIPE_SHADER_IR_TGSI) { /* per this check, only TGSI programs have a compile job and `ready` fence to clean up */
+               util_queue_drop_job(&program->screen->shader_compiler_queue, /* queue is reached via the program's own screen pointer, so no pipe_context is needed */
+                                   &program->ready); /* NOTE(review): presumably removes or waits out a still-pending compile job -- confirm against util_queue */
+               util_queue_fence_destroy(&program->ready); /* destroyed only after the drop above, which is handed the same fence */
+       }
+
+       si_shader_destroy(&program->shader); /* shader is a member of the program (passed by address), so it is handled before the struct is freed */
+       FREE(program); /* `program` is invalid from here on; callers must not touch it again */
+}
 
 static void si_delete_compute_state(struct pipe_context *ctx, void* state){
        struct si_compute *program = (struct si_compute *)state;
        struct si_context *sctx = (struct si_context*)ctx;
 
-       if (!state) {
+       if (!state)
                return;
-       }
-
-       if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-               util_queue_drop_job(&sctx->screen->shader_compiler_queue,
-                                   &program->ready);
-               util_queue_fence_destroy(&program->ready);
-       }
 
        if (program == sctx->cs_shader_state.program)
                sctx->cs_shader_state.program = NULL;
@@ -873,8 +888,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){
        if (program == sctx->cs_shader_state.emitted_program)
                sctx->cs_shader_state.emitted_program = NULL;
 
-       si_shader_destroy(&program->shader);
-       FREE(program);
+       si_compute_reference(&program, NULL);
 }
 
 static void si_set_compute_resources(struct pipe_context * ctx_,