+
+#define AC_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id) & 0xffff)) /* 0xcafe in the upper 16 bits, low 16 bits of id below */
+
+/*
+ * Emit a trace point into the gfx command stream.
+ *
+ * Does nothing on chips older than EVERGREEN.  Otherwise it increments
+ * rctx->trace_id, emits a MEM_WRITE packet that targets the GPU address of
+ * rctx->trace_buf with the new id as data, and finishes with a NOP packet
+ * whose payload is the same id run through AC_ENCODE_TRACE_POINT().
+ */
+void eg_trace_emit(struct r600_context *rctx)
+{
+	struct radeon_cmdbuf *cmdbuf = rctx->b.gfx.cs;
+	unsigned trace_reloc;
+
+	if (rctx->b.chip_class < EVERGREEN)
+		return;
+
+	/* This must be done after r600_need_cs_space. */
+	trace_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+						(struct r600_resource*)rctx->trace_buf,
+						RADEON_USAGE_WRITE,
+						RADEON_PRIO_CP_DMA);
+
+	rctx->trace_id += 1;
+
+	/*
+	 * NOTE(review): the trace buffer is registered a second time with
+	 * different usage/priority and the returned index is ignored; it
+	 * looks redundant with the call above -- confirm before removing.
+	 */
+	radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rctx->trace_buf,
+				  RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
+
+	/* MEM_WRITE: address low, address high + flags, two data dwords. */
+	radeon_emit(cmdbuf, PKT3(PKT3_MEM_WRITE, 3, 0));
+	radeon_emit(cmdbuf, rctx->trace_buf->gpu_address);
+	radeon_emit(cmdbuf, (rctx->trace_buf->gpu_address >> 32) |
+			    MEM_WRITE_32_BITS | MEM_WRITE_CONFIRM);
+	radeon_emit(cmdbuf, rctx->trace_id);
+	radeon_emit(cmdbuf, 0);
+
+	/* NOP packet carrying the relocation index of the trace buffer. */
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, trace_reloc);
+
+	/* NOP packet carrying the encoded trace id. */
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, AC_ENCODE_TRACE_POINT(rctx->trace_id));
+}
+
+/*
+ * Emit a SET_APPEND_CNT packet for a single atomic counter.
+ *
+ * The register operand is derived from R_02872C_GDS_APPEND_COUNT_0 plus
+ * atomic->hw_idx * 4, expressed as a dword index relative to
+ * EVERGREEN_CONTEXT_REG_OFFSET.  The address operand is the GPU address of
+ * `resource` plus atomic->start * 4.  `resource` is added to the buffer
+ * list with read usage, and `pkt_flags` is OR'ed into the packet header.
+ */
+static void evergreen_emit_set_append_cnt(struct r600_context *rctx,
+					  struct r600_shader_atomic *atomic,
+					  struct r600_resource *resource,
+					  uint32_t pkt_flags)
+{
+	struct radeon_cmdbuf *cmdbuf = rctx->b.gfx.cs;
+	const unsigned buf_reloc =
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, resource,
+					  RADEON_USAGE_READ,
+					  RADEON_PRIO_SHADER_RW_BUFFER);
+	const uint64_t count_va = resource->gpu_address + (atomic->start * 4);
+	const uint32_t append_cnt_base = R_02872C_GDS_APPEND_COUNT_0;
+	const uint32_t reg_index =
+		(append_cnt_base + atomic->hw_idx * 4 - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+
+	radeon_emit(cmdbuf, PKT3(PKT3_SET_APPEND_CNT, 2, 0) | pkt_flags);
+	radeon_emit(cmdbuf, (reg_index << 16) | 0x3);
+	/* Low address dword with the two lowest bits masked off. */
+	radeon_emit(cmdbuf, count_va & 0xfffffffc);
+	radeon_emit(cmdbuf, (count_va >> 32) & 0xff);
+
+	/* NOP packet carrying the relocation index of the counter buffer. */
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, buf_reloc);
+}
+
+/*
+ * Emit an EVENT_WRITE_EOS packet for a single atomic counter (the variant
+ * used when the chip is not CAYMAN).
+ *
+ * The event is EVENT_TYPE_CS_DONE when `pkt_flags` equals
+ * RADEON_CP_PACKET3_COMPUTE_MODE and EVENT_TYPE_PS_DONE otherwise.  The
+ * address operand is the GPU address of `resource` plus atomic->start * 4;
+ * the final data dword is the dword index of the GDS_APPEND_COUNT register
+ * selected by atomic->hw_idx.  `resource` is added to the buffer list with
+ * write usage.
+ */
+static void evergreen_emit_event_write_eos(struct r600_context *rctx,
+					   struct r600_shader_atomic *atomic,
+					   struct r600_resource *resource,
+					   uint32_t pkt_flags)
+{
+	struct radeon_cmdbuf *cmdbuf = rctx->b.gfx.cs;
+	const uint32_t done_event =
+		(pkt_flags == RADEON_CP_PACKET3_COMPUTE_MODE) ? EVENT_TYPE_CS_DONE
+							      : EVENT_TYPE_PS_DONE;
+	const uint32_t append_cnt_base = R_02872C_GDS_APPEND_COUNT_0;
+	const uint32_t buf_reloc =
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, resource,
+					  RADEON_USAGE_WRITE,
+					  RADEON_PRIO_SHADER_RW_BUFFER);
+	const uint64_t count_va = resource->gpu_address + (atomic->start * 4);
+	const uint32_t reg_index = (append_cnt_base + atomic->hw_idx * 4) >> 2;
+
+	radeon_emit(cmdbuf, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
+	radeon_emit(cmdbuf, EVENT_TYPE(done_event) | EVENT_INDEX(6));
+	radeon_emit(cmdbuf, (count_va) & 0xffffffff);
+	/* Bits 29+ hold 0 here; the Cayman variant and the fence use 1 and 2. */
+	radeon_emit(cmdbuf, (0 << 29) | ((count_va >> 32) & 0xff));
+	radeon_emit(cmdbuf, reg_index);
+
+	/* NOP packet carrying the relocation index of the counter buffer. */
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, buf_reloc);
+}
+
+/*
+ * Emit an EVENT_WRITE_EOS packet for a single atomic counter on CAYMAN.
+ *
+ * The event is EVENT_TYPE_CS_DONE when `pkt_flags` equals
+ * RADEON_CP_PACKET3_COMPUTE_MODE and EVENT_TYPE_PS_DONE otherwise.  The
+ * address operand is the GPU address of `resource` plus atomic->start * 4;
+ * the final data dword combines atomic->hw_idx with bit 16 set.
+ * `resource` is added to the buffer list with write usage.
+ */
+static void cayman_emit_event_write_eos(struct r600_context *rctx,
+					struct r600_shader_atomic *atomic,
+					struct r600_resource *resource,
+					uint32_t pkt_flags)
+{
+	struct radeon_cmdbuf *cmdbuf = rctx->b.gfx.cs;
+	const uint32_t done_event =
+		(pkt_flags == RADEON_CP_PACKET3_COMPUTE_MODE) ? EVENT_TYPE_CS_DONE
+							      : EVENT_TYPE_PS_DONE;
+	const uint32_t buf_reloc =
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, resource,
+					  RADEON_USAGE_WRITE,
+					  RADEON_PRIO_SHADER_RW_BUFFER);
+	const uint64_t count_va = resource->gpu_address + (atomic->start * 4);
+
+	radeon_emit(cmdbuf, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
+	radeon_emit(cmdbuf, EVENT_TYPE(done_event) | EVENT_INDEX(6));
+	radeon_emit(cmdbuf, (count_va) & 0xffffffff);
+	/* Bits 29+ hold 1 here; the non-Cayman variant uses 0. */
+	radeon_emit(cmdbuf, (1 << 29) | ((count_va >> 32) & 0xff));
+	radeon_emit(cmdbuf, (atomic->hw_idx) | (1 << 16));
+
+	/* NOP packet carrying the relocation index of the counter buffer. */
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, buf_reloc);
+}
+
+/*
+ * Writes a count from a buffer into GDS (CAYMAN path).
+ *
+ * Emits a CP_DMA packet whose first address is the GPU address of
+ * `resource` plus atomic->start * 4 and whose second address is
+ * atomic->hw_idx * 4, with a size field of 4.  `resource` is added to the
+ * buffer list with read usage, and `pkt_flags` is OR'ed into the packet
+ * header.
+ */
+static void cayman_write_count_to_gds(struct r600_context *rctx,
+				      struct r600_shader_atomic *atomic,
+				      struct r600_resource *resource,
+				      uint32_t pkt_flags)
+{
+	struct radeon_cmdbuf *cmdbuf = rctx->b.gfx.cs;
+	const unsigned buf_reloc =
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, resource,
+					  RADEON_USAGE_READ,
+					  RADEON_PRIO_SHADER_RW_BUFFER);
+	const uint64_t count_va = resource->gpu_address + (atomic->start * 4);
+
+	radeon_emit(cmdbuf, PKT3(PKT3_CP_DMA, 4, 0) | pkt_flags);
+	radeon_emit(cmdbuf, count_va & 0xffffffff);
+	/* DST_SEL(1): destination is GDS */
+	radeon_emit(cmdbuf, PKT3_CP_DMA_CP_SYNC | PKT3_CP_DMA_DST_SEL(1) |
+			    ((count_va >> 32) & 0xff));
+	radeon_emit(cmdbuf, atomic->hw_idx * 4);
+	radeon_emit(cmdbuf, 0);
+	radeon_emit(cmdbuf, PKT3_CP_DMA_CMD_DAS | 4);
+
+	/* NOP packet carrying the relocation index of the counter buffer. */
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, buf_reloc);
+}
+
+/*
+ * Flatten the atomic counter ranges of the active shaders into per-counter
+ * entries.
+ *
+ * When `cs_shader` is non-null only that shader is examined; otherwise
+ * every entry of rctx->hw_shader_stages (EG_NUM_HW_STAGES of them) is
+ * walked and stages without a shader are skipped.  Each range is expanded
+ * into one entry per counter, stored at combined_atomics[hw_idx + n].  A
+ * slot whose bit is already set in the mask is left untouched, so the
+ * first range that claims a slot wins.  The resulting bitmask of filled
+ * slots is written to *atomic_used_mask_p.
+ *
+ * NOTE(review): the mask is 8 bits wide, so slots >= 8 are never recorded
+ * in it -- presumably callers guarantee hw_idx + n < 8; confirm.
+ */
+void evergreen_emit_atomic_buffer_setup_count(struct r600_context *rctx,
+					      struct r600_pipe_shader *cs_shader,
+					      struct r600_shader_atomic *combined_atomics,
+					      uint8_t *atomic_used_mask_p)
+{
+	const bool is_compute = cs_shader ? true : false;
+	const int num_stages = is_compute ? 1 : EG_NUM_HW_STAGES;
+	uint8_t used_mask = 0;
+	int stage;
+
+	for (stage = 0; stage < num_stages; stage++) {
+		struct r600_pipe_shader *pshader =
+			is_compute ? cs_shader : rctx->hw_shader_stages[stage].shader;
+		uint8_t num_ranges;
+		int range;
+
+		if (!pshader)
+			continue;
+
+		/* A shader with zero ranges simply skips the loop below. */
+		num_ranges = pshader->shader.nhwatomic_ranges;
+
+		for (range = 0; range < num_ranges; range++) {
+			struct r600_shader_atomic *src = &pshader->shader.atomics[range];
+			const int count = src->end - src->start + 1;
+			int n;
+
+			for (n = 0; n < count; n++) {
+				const unsigned slot = src->hw_idx + n;
+				const unsigned slot_bit = 1u << slot;
+				struct r600_shader_atomic *dst;
+
+				/* slot already claimed by an earlier range or stage */
+				if (used_mask & slot_bit)
+					continue;
+
+				dst = &combined_atomics[slot];
+				dst->hw_idx = slot;
+				dst->buffer_id = src->buffer_id;
+				dst->start = src->start + n;
+				dst->end = dst->start + 1;
+				used_mask |= slot_bit;
+			}
+		}
+	}
+
+	*atomic_used_mask_p = used_mask;
+}
+
+/*
+ * Emit the per-counter setup packet for every slot set in
+ * `atomic_used_mask`.
+ *
+ * For each set bit the matching combined_atomics entry is looked up, its
+ * backing buffer is taken from rctx->atomic_buffer_state, and either
+ * cayman_write_count_to_gds() (CAYMAN) or evergreen_emit_set_append_cnt()
+ * (any other chip class) is called.  When `is_compute` is true the packets
+ * are flagged with RADEON_CP_PACKET3_COMPUTE_MODE.
+ */
+void evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
+					bool is_compute,
+					struct r600_shader_atomic *combined_atomics,
+					uint8_t atomic_used_mask)
+{
+	struct r600_atomic_buffer_state *abuf_state = &rctx->atomic_buffer_state;
+	const unsigned pkt_flags = is_compute ? RADEON_CP_PACKET3_COMPUTE_MODE : 0;
+	uint32_t pending = atomic_used_mask;
+
+	/* An empty mask falls straight through the loop. */
+	while (pending) {
+		const unsigned slot = u_bit_scan(&pending);
+		struct r600_shader_atomic *entry = &combined_atomics[slot];
+		struct r600_resource *counter_buf =
+			r600_resource(abuf_state->buffer[entry->buffer_id].buffer);
+		assert(counter_buf);
+
+		if (rctx->b.chip_class == CAYMAN)
+			cayman_write_count_to_gds(rctx, entry, counter_buf, pkt_flags);
+		else
+			evergreen_emit_set_append_cnt(rctx, entry, counter_buf, pkt_flags);
+	}
+}
+
+/*
+ * Emit the packets that write atomic counters back to their buffers,
+ * followed by a fence write and a wait on that fence.
+ *
+ * Returns without emitting anything when *atomic_used_mask_p is zero.
+ * Otherwise, for every set bit, the matching combined_atomics entry gets an
+ * EVENT_WRITE_EOS packet through cayman_emit_event_write_eos() (CAYMAN) or
+ * evergreen_emit_event_write_eos() (any other chip class).  Afterwards
+ * rctx->append_fence_id is incremented, one more EVENT_WRITE_EOS is
+ * emitted with that id as data and rctx->append_fence as the target, and a
+ * WAIT_REG_MEM packet on the same address with the same reference value
+ * follows.  When `is_compute` is true all packets are flagged with
+ * RADEON_CP_PACKET3_COMPUTE_MODE and the event is EVENT_TYPE_CS_DONE
+ * instead of EVENT_TYPE_PS_DONE.
+ */
+void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
+				       bool is_compute,
+				       struct r600_shader_atomic *combined_atomics,
+				       uint8_t *atomic_used_mask_p)
+{
+	struct radeon_cmdbuf *cmdbuf = rctx->b.gfx.cs;
+	struct r600_atomic_buffer_state *abuf_state = &rctx->atomic_buffer_state;
+	const uint32_t pkt_flags = is_compute ? RADEON_CP_PACKET3_COMPUTE_MODE : 0;
+	const uint32_t done_event =
+		(pkt_flags == RADEON_CP_PACKET3_COMPUTE_MODE) ? EVENT_TYPE_CS_DONE
+							      : EVENT_TYPE_PS_DONE;
+	uint32_t pending = *atomic_used_mask_p;
+	uint64_t fence_va;
+	unsigned fence_reloc;
+
+	/* Nothing in use: skip the counter writes and the fence as well. */
+	if (!pending)
+		return;
+
+	do {
+		const unsigned slot = u_bit_scan(&pending);
+		struct r600_shader_atomic *entry = &combined_atomics[slot];
+		struct r600_resource *counter_buf =
+			r600_resource(abuf_state->buffer[entry->buffer_id].buffer);
+		assert(counter_buf);
+
+		if (rctx->b.chip_class == CAYMAN)
+			cayman_emit_event_write_eos(rctx, entry, counter_buf, pkt_flags);
+		else
+			evergreen_emit_event_write_eos(rctx, entry, counter_buf, pkt_flags);
+	} while (pending);
+
+	rctx->append_fence_id++;
+	fence_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+						r600_resource(rctx->append_fence),
+						RADEON_USAGE_READWRITE,
+						RADEON_PRIO_SHADER_RW_BUFFER);
+	fence_va = r600_resource(rctx->append_fence)->gpu_address;
+
+	/* Fence write: the new fence id is the data dword. */
+	radeon_emit(cmdbuf, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
+	radeon_emit(cmdbuf, EVENT_TYPE(done_event) | EVENT_INDEX(6));
+	radeon_emit(cmdbuf, fence_va & 0xffffffff);
+	radeon_emit(cmdbuf, (2 << 29) | ((fence_va >> 32) & 0xff));
+	radeon_emit(cmdbuf, rctx->append_fence_id);
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, fence_reloc);
+
+	/*
+	 * Wait on the fence location: GEQUAL compare against the fence id
+	 * with a full 32-bit mask (presumably this stalls until the write
+	 * above has landed -- confirm against the packet documentation).
+	 */
+	radeon_emit(cmdbuf, PKT3(PKT3_WAIT_REG_MEM, 5, 0) | pkt_flags);
+	radeon_emit(cmdbuf, WAIT_REG_MEM_GEQUAL | WAIT_REG_MEM_MEMORY | (1 << 8));
+	radeon_emit(cmdbuf, fence_va & 0xffffffff);
+	radeon_emit(cmdbuf, ((fence_va >> 32) & 0xff));
+	radeon_emit(cmdbuf, rctx->append_fence_id);
+	radeon_emit(cmdbuf, 0xffffffff);
+	radeon_emit(cmdbuf, 0xa);
+	radeon_emit(cmdbuf, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cmdbuf, fence_reloc);
+}