radeonsi/compute: Add some more debug printfs

[mesa.git] / src / gallium / drivers / radeonsi / si_compute.c
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c

index 7e05be551241fd99be3e567e17096aa51ac61646..56b511848c96edda5e3f5e3a4cd2baf384f0ae2e 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -27,7 +27,6 @@
  #include "util/u_upload_mgr.h"
  #include "radeon/r600_pipe_common.h"
  #include "radeon/radeon_elf_util.h"
-#include "radeon/radeon_llvm_util.h"
  
  #include "radeon/r600_cs.h"
  #include "si_pipe.h"
@@ -70,6 +69,7 @@ static void *si_create_compute_state(
  
                 sel.tokens = tgsi_dup_tokens(cso->prog);
                 if (!sel.tokens) {
+                       FREE(program);
                         return NULL;
                 }
  
@@ -84,6 +84,7 @@ static void *si_create_compute_state(
                 if (si_shader_create(sscreen, sctx->tm, &program->shader,
                                      &sctx->b.debug)) {
                         FREE(sel.tokens);
+                       FREE(program);
                         return NULL;
                 }
  
@@ -162,8 +163,7 @@ static void si_initialize_compute(struct si_context *sctx)
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);
  
-       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
-       radeon_emit(cs, 0);
+       radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
         /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
         radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
         radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff));
@@ -206,9 +206,7 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
                 scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
  
         if (scratch_bo_size < scratch_needed) {
-               pipe_resource_reference(
-                       (struct pipe_resource**)&sctx->compute_scratch_buffer,
-                       NULL);
+               r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
  
                 sctx->compute_scratch_buffer =
                                 si_resource_create_custom(&sctx->screen->b.b,
@@ -290,7 +288,7 @@ static bool si_switch_compute_shader(struct si_context *sctx,
         shader_va = shader->bo->gpu_address + offset;
  
         radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
-                                 RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
+                                 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
  
         radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
         radeon_emit(cs, shader_va >> 8);
@@ -300,12 +298,17 @@ static bool si_switch_compute_shader(struct si_context *sctx,
         radeon_emit(cs, config->rsrc1);
         radeon_emit(cs, config->rsrc2);
  
+       COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x "
+               "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2);
+
         radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
                   S_00B860_WAVES(sctx->scratch_waves)
                      | S_00B860_WAVESIZE(config->scratch_bytes_per_wave >> 10));
  
         sctx->cs_shader_state.emitted_program = program;
         sctx->cs_shader_state.offset = offset;
+       sctx->cs_shader_state.uses_scratch =
+               config->scratch_bytes_per_wave != 0;
  
         return true;
  }
@@ -357,7 +360,7 @@ static void si_upload_compute_input(struct si_context *sctx,
         radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
                         S_008F04_STRIDE(0));
  
-       pipe_resource_reference((struct pipe_resource**)&input_buffer, NULL);
+       r600_resource_reference(&input_buffer, NULL);
  }
  
  static void si_setup_tgsi_grid(struct si_context *sctx,
@@ -399,6 +402,11 @@ static void si_emit_dispatch_packets(struct si_context *sctx,
  {
         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
         bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
+       unsigned waves_per_threadgroup =
+               DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
+
+       radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+                         S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0));
  
         radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
@@ -439,16 +447,45 @@ static void si_launch_grid(
         struct si_context *sctx = (struct si_context*)ctx;
         struct si_compute *program = sctx->cs_shader_state.program;
         int i;
+       /* HW bug workaround when CS threadgroups > 256 threads and async
+        * compute isn't used, i.e. only one compute job can run at a time.
+        * If async compute is possible, the threadgroup size must be limited
+        * to 256 threads on all queues to avoid the bug.
+        * Only SI and certain CIK chips are affected.
+        */
+       bool cs_regalloc_hang =
+               (sctx->b.chip_class == SI ||
+                sctx->b.family == CHIP_BONAIRE ||
+                sctx->b.family == CHIP_KABINI) &&
+               info->block[0] * info->block[1] * info->block[2] > 256;
+
+       if (cs_regalloc_hang)
+               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                                SI_CONTEXT_CS_PARTIAL_FLUSH;
  
         si_decompress_compute_textures(sctx);
  
+       /* Add buffer sizes for memory checking in need_cs_space. */
+       r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
+       /* TODO: add the scratch buffer */
+
+       if (info->indirect) {
+               r600_context_add_resource_size(ctx, info->indirect);
+
+               /* The hw doesn't read the indirect buffer via TC L2. */
+               if (r600_resource(info->indirect)->TC_L2_dirty) {
+                       sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+                       r600_resource(info->indirect)->TC_L2_dirty = false;
+               }
+       }
+
         si_need_cs_space(sctx);
  
         if (!sctx->cs_shader_state.initialized)
                 si_initialize_compute(sctx);
  
         if (sctx->b.flags)
-               si_emit_cache_flush(sctx, NULL);
+               si_emit_cache_flush(sctx);
  
         if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
                 return;
@@ -485,6 +522,14 @@ static void si_launch_grid(
         si_emit_dispatch_packets(sctx, info);
  
         si_ce_post_draw_synchronization(sctx);
+
+       sctx->compute_is_busy = true;
+       sctx->b.num_compute_calls++;
+       if (sctx->cs_shader_state.uses_scratch)
+               sctx->b.num_spill_compute_calls++;
+
+       if (cs_regalloc_hang)
+               sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
  }