From 05f594f2291f146f5f8704f80fb475dfb21b66fb Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 1 Dec 2017 04:06:19 +0000 Subject: [PATCH] r600/atomic: add cayman version of atomic save/restore from GDS (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit On Cayman we don't use the append/consume counters (fglrx doesn't) and they don't seem to work well with compute shaders. This just uses GDS instead to do the atomic operations. v1.1: remove unused line. v2: use EOS on cayman, it appears to work. Acked-by: Nicolai Hähnle Signed-off-by: Dave Airlie --- src/gallium/drivers/r600/evergreen_state.c | 57 ++++++++++++- src/gallium/drivers/r600/r600_shader.c | 93 +++++++++++++++++----- 2 files changed, 126 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 6bca35e850f..a1d2e0cd14b 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2672,6 +2672,7 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx) r600_store_value(cb, 0x76543210); /* CM_R_028BD4_PA_SC_CENTROID_PRIORITY_0 */ r600_store_value(cb, 0xfedcba98); /* CM_R_028BD8_PA_SC_CENTROID_PRIORITY_1 */ + r600_store_context_reg(cb, R_028724_GDS_ADDR_SIZE, 0x3fff); r600_store_context_reg_seq(cb, R_0288E8_SQ_LDS_ALLOC, 2); r600_store_value(cb, 0); /* R_0288E8_SQ_LDS_ALLOC */ r600_store_value(cb, 0); /* R_0288EC_SQ_LDS_ALLOC_PS */ @@ -4627,6 +4628,51 @@ static void evergreen_emit_event_write_eos(struct r600_context *rctx, radeon_emit(cs, reloc); } +static void cayman_emit_event_write_eos(struct r600_context *rctx, + struct r600_shader_atomic *atomic, + struct r600_resource *resource, + uint32_t pkt_flags) +{ + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + uint32_t event = EVENT_TYPE_PS_DONE; + uint32_t reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, + resource, + RADEON_USAGE_WRITE, + RADEON_PRIO_SHADER_RW_BUFFER); 
+ uint64_t dst_offset = resource->gpu_address + (atomic->start * 4); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags); + radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6)); + radeon_emit(cs, (dst_offset) & 0xffffffff); + radeon_emit(cs, (1 << 29) | ((dst_offset >> 32) & 0xff)); + radeon_emit(cs, (atomic->hw_idx) | (1 << 16)); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); +} + +/* writes count from a buffer into GDS */ +static void cayman_write_count_to_gds(struct r600_context *rctx, + struct r600_shader_atomic *atomic, + struct r600_resource *resource, + uint32_t pkt_flags) +{ + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, + resource, + RADEON_USAGE_READ, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t dst_offset = resource->gpu_address + (atomic->start * 4); + + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0) | pkt_flags); + radeon_emit(cs, dst_offset & 0xffffffff); + radeon_emit(cs, PKT3_CP_DMA_CP_SYNC | PKT3_CP_DMA_DST_SEL(1) | ((dst_offset >> 32) & 0xff));// GDS + radeon_emit(cs, atomic->hw_idx * 4); + radeon_emit(cs, 0); + radeon_emit(cs, PKT3_CP_DMA_CMD_DAS | 4); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); +} + bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx, struct r600_shader_atomic *combined_atomics, uint8_t *atomic_used_mask_p) @@ -4674,7 +4720,10 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx, struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer); assert(resource); - evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags); + if (rctx->b.chip_class == CAYMAN) + cayman_write_count_to_gds(rctx, atomic, resource, pkt_flags); + else + evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags); } *atomic_used_mask_p = atomic_used_mask; return true; @@ -4702,8 +4751,12 @@ void evergreen_emit_atomic_buffer_save(struct r600_context *rctx, struct 
r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer); assert(resource); - evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags); + if (rctx->b.chip_class == CAYMAN) + cayman_emit_event_write_eos(rctx, atomic, resource, pkt_flags); + else + evergreen_emit_event_write_eos(rctx, atomic, resource, pkt_flags); } + ++rctx->append_fence_id; reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, r600_resource(rctx->append_fence), diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 5d78e4f8ade..da74de04de3 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -7809,6 +7809,53 @@ static int find_hw_atomic_counter(struct r600_shader_ctx *ctx, return -1; } +static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx, + int *uav_id_p, int *uav_index_mode_p) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + int uav_id, uav_index_mode = 0; /* always written back below; only the non-Cayman indirect path sets it to 2 */ + int r; + bool is_cm = (ctx->bc->chip_class == CAYMAN); + + uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]); + + if (inst->Src[0].Register.Indirect) { + if (is_cm) { + struct r600_bytecode_alu alu; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_LSHL_INT; + alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index); + alu.src[0].chan = 0; + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 2; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 0; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + ctx->temp_reg, 0, + ctx->temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, uav_id * 4); + if (r) + return r; + } else + uav_index_mode = 2; + } else if (is_cm) { + r = single_alu_op2(ctx, ALU_OP1_MOV, + ctx->temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, uav_id * 4, + 0, 0); + if (r) + return r; + } + *uav_id_p = uav_id; + *uav_index_mode_p = uav_index_mode; + return 0; 
+} static int tgsi_load_gds(struct r600_shader_ctx *ctx) { @@ -7817,27 +7864,27 @@ static int tgsi_load_gds(struct r600_shader_ctx *ctx) struct r600_bytecode_gds gds; int uav_id = 0; int uav_index_mode = 0; + bool is_cm = (ctx->bc->chip_class == CAYMAN); - uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]); - - if (inst->Src[0].Register.Indirect) - uav_index_mode = 2; + r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode); + if (r) + return r; memset(&gds, 0, sizeof(struct r600_bytecode_gds)); gds.op = FETCH_OP_GDS_READ_RET; gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; - gds.uav_id = uav_id; - gds.uav_index_mode = uav_index_mode; + gds.uav_id = is_cm ? 0 : uav_id; + gds.uav_index_mode = is_cm ? 0 : uav_index_mode; gds.src_gpr = ctx->temp_reg; - gds.src_sel_x = 4; + gds.src_sel_x = (is_cm) ? 0 : 4; gds.src_sel_y = 4; gds.src_sel_z = 4; gds.dst_sel_x = 0; gds.dst_sel_y = 7; gds.dst_sel_z = 7; gds.dst_sel_w = 7; - gds.src_gpr2 = ctx->temp_reg; - gds.alloc_consume = 1; + gds.src_gpr2 = 0; + gds.alloc_consume = !is_cm; r = r600_bytecode_add_gds(ctx->bc, &gds); if (r) return r; @@ -8369,16 +8416,16 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) int r; int uav_id = 0; int uav_index_mode = 0; + bool is_cm = (ctx->bc->chip_class == CAYMAN); if (gds_op == -1) { fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode); return -1; } - uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]); - - if (inst->Src[0].Register.Indirect) - uav_index_mode = 2; + r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode); + if (r) + return r; if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) { int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]); @@ -8388,7 +8435,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; alu.dst.sel = ctx->temp_reg; - alu.dst.chan = 0; + alu.dst.chan = is_cm ? 
1 : 0; alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; alu.src[0].value = abs_value; alu.last = 1; @@ -8400,7 +8447,7 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; alu.dst.sel = ctx->temp_reg; - alu.dst.chan = 0; + alu.dst.chan = is_cm ? 1 : 0; r600_bytecode_src(&alu.src[0], &ctx->src[2], 0); alu.last = 1; alu.dst.write = 1; @@ -8409,21 +8456,23 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) return r; } + memset(&gds, 0, sizeof(struct r600_bytecode_gds)); gds.op = gds_op; gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; - gds.uav_id = uav_id; - gds.uav_index_mode = uav_index_mode; + gds.uav_id = is_cm ? 0 : uav_id; + gds.uav_index_mode = is_cm ? 0 : uav_index_mode; gds.src_gpr = ctx->temp_reg; - gds.src_gpr2 = ctx->temp_reg; - gds.src_sel_x = 4; - gds.src_sel_y = 0; - gds.src_sel_z = 4; + gds.src_gpr2 = 0; + gds.src_sel_x = is_cm ? 0 : 4; + gds.src_sel_y = is_cm ? 1 : 0; + gds.src_sel_z = 7; gds.dst_sel_x = 0; gds.dst_sel_y = 7; gds.dst_sel_z = 7; gds.dst_sel_w = 7; - gds.alloc_consume = 1; + gds.alloc_consume = !is_cm; + r = r600_bytecode_add_gds(ctx->bc, &gds); if (r) return r; -- 2.30.2