radeonsi: add compute_last_block to configure the partial block fields
author Jiang, Sonny <Sonny.Jiang@amd.com>
Tue, 8 Jan 2019 19:47:07 +0000 (19:47 +0000)
committer Marek Olšák <marek.olsak@amd.com>
Tue, 22 Jan 2019 17:22:46 +0000 (12:22 -0500)
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_pipe.h

index cbcd8e79c7b21b18a2b9ad83e9b35e793097d466..1f003cd36f2734f05c2fc52a4aaec865d9226650 100644 (file)
@@ -797,11 +797,6 @@ static void si_emit_dispatch_packets(struct si_context *sctx,
        radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
                          compute_resource_limits);
 
-       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-       radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
-       radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
-       radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
-
        unsigned dispatch_initiator =
                S_00B800_COMPUTE_SHADER_EN(1) |
                S_00B800_FORCE_START_AT_000(1) |
@@ -809,6 +804,33 @@ static void si_emit_dispatch_packets(struct si_context *sctx,
                 * allow launching waves out-of-order. (same as Vulkan) */
                S_00B800_ORDER_MODE(sctx->chip_class >= CIK);
 
+       uint *last_block = sctx->compute_last_block;
+       bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
+
+       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+
+       if (partial_block_en) {
+               unsigned partial[3];
+
+               /* If a dimension has no partial block, program the full block size there, not 0. */
+               partial[0] = last_block[0] ? last_block[0] : info->block[0];
+               partial[1] = last_block[1] ? last_block[1] : info->block[1];
+               partial[2] = last_block[2] ? last_block[2] : info->block[2];
+
+               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) |
+                               S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
+               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) |
+                               S_00B820_NUM_THREAD_PARTIAL(partial[1]));
+               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) |
+                               S_00B824_NUM_THREAD_PARTIAL(partial[2]));
+
+               dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
+       } else {
+               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
+               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
+               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+       }
+
        if (info->indirect) {
                uint64_t base_va = r600_resource(info->indirect)->gpu_address;
 
index 89a93182ed30b4b64e80e6cda2e24358bc2ef99e..37eb15f539eb0c5010d062335835a6a2ab5de118 100644 (file)
@@ -896,6 +896,28 @@ struct si_context {
        uint32_t                        vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
        uint32_t                        cs_user_data[4];
 
+        /**
+         * last_block allows disabling threads at the farthermost grid boundary.
+         * Full blocks as specified by "block" are launched, but the threads
+         * outside of "last_block" dimensions are disabled.
+         *
+         * If a block touches the grid boundary in the i-th axis, threads with
+         * THREAD_ID[i] >= last_block[i] are disabled.
+         *
+         * If last_block[i] is 0, it has the same behavior as last_block[i] = block[i],
+         * meaning no effect.
+         *
+         * It's equivalent to doing this at the beginning of the compute shader:
+         *
+         *   for (i = 0; i < 3; i++) {
+         *      if (block_id[i] == grid[i] - 1 &&
+         *          last_block[i] && thread_id[i] >= last_block[i])
+         *         return;
+         *   }
+         * (this could be moved into pipe_grid_info)
+         */
+        uint compute_last_block[3];
+
        /* Vertex and index buffers. */
        bool                            vertex_buffers_dirty;
        bool                            vertex_buffer_pointer_dirty;
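The behavior documented in the new comment can be modeled host-side; the sketch below is standalone illustration of the documented semantics, not code from the patch, and thread_enabled() is a hypothetical name. A thread is disabled only when its block touches the grid boundary on an axis whose last_block value is non-zero and its thread_id on that axis is >= last_block:

  #include <stdbool.h>
  #include <stdio.h>

  /* Host-side model of the compute_last_block semantics described above:
   * returns true if the thread executes, false if it is disabled. */
  static bool thread_enabled(const unsigned grid[3],       /* grid size in blocks */
                             const unsigned last_block[3], /* 0 == full block */
                             const unsigned block_id[3],
                             const unsigned thread_id[3])
  {
          for (int i = 0; i < 3; i++) {
                  if (block_id[i] == grid[i] - 1 &&
                      last_block[i] && thread_id[i] >= last_block[i])
                          return false;
          }
          return true;
  }

  int main(void)
  {
          const unsigned grid[3] = {16, 1, 1};
          const unsigned last_block[3] = {40, 0, 0}; /* last block keeps 40 of 64 threads */
          const unsigned boundary_block[3] = {15, 0, 0};
          const unsigned t39[3] = {39, 0, 0}, t40[3] = {40, 0, 0};

          /* Prints "1 0": thread 39 runs, thread 40 is disabled. */
          printf("%d %d\n",
                 thread_enabled(grid, last_block, boundary_block, t39),
                 thread_enabled(grid, last_block, boundary_block, t40));
          return 0;
  }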