This adds support for the evergreen/cayman atomic counters.
These are implemented using GDS append/consume counters. The values
for each counter are loaded before drawing and saved after each draw
using special CP packets.
v2: move hw atomic assignment into driver.
v3: fix messing up caps (Gert Wollny), only store ranges in driver,
drop buffers.
Signed-off-by: Dave Airlie <airlied@redhat.com>
Acked-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Tested-By: Gert Wollny <gw.fossdev@gmail.com>
rctx->tess_state_dirty = true;
}
+/**
+ * Bind or unbind a range of hardware atomic counter buffers.
+ *
+ * \param ctx         pipe context (an r600_context)
+ * \param start_slot  first slot of rctx->atomic_buffer_state to update
+ * \param count       number of consecutive slots to update
+ * \param buffers     array of \p count bindings, or NULL to unbind the range
+ *
+ * For every slot the stored resource pointer is replaced through
+ * pipe_resource_reference() and the offset/size are copied. A NULL array or
+ * a NULL resource clears the slot. enabled_mask has bit N set exactly while
+ * slot N holds a resource.
+ */
+static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx,
+					    unsigned start_slot,
+					    unsigned count,
+					    const struct pipe_shader_buffer *buffers)
+{
+	struct r600_context *rctx = (struct r600_context *)ctx;
+	struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
+	unsigned slot, idx;
+
+	/*
+	 * astate->buffer[] only has EG_MAX_ATOMIC_BUFFERS entries, so never
+	 * walk past it even if the caller hands in an oversized range.
+	 */
+	for (slot = start_slot, idx = 0;
+	     idx < count && slot < EG_MAX_ATOMIC_BUFFERS;
+	     slot++, idx++) {
+		struct pipe_shader_buffer *abuf = &astate->buffer[slot];
+		const struct pipe_shader_buffer *buf;
+
+		if (!buffers || !buffers[idx].buffer) {
+			/* Unbind: drop our reference and mark the slot unused. */
+			pipe_resource_reference(&abuf->buffer, NULL);
+			astate->enabled_mask &= ~(1u << slot);
+			continue;
+		}
+		buf = &buffers[idx];
+
+		pipe_resource_reference(&abuf->buffer, buf->buffer);
+		abuf->buffer_offset = buf->buffer_offset;
+		abuf->buffer_size = buf->buffer_size;
+		astate->enabled_mask |= (1u << slot);
+	}
+}
+
void evergreen_init_state_functions(struct r600_context *rctx)
{
unsigned id = 1;
rctx->b.b.set_polygon_stipple = evergreen_set_polygon_stipple;
rctx->b.b.set_min_samples = evergreen_set_min_samples;
rctx->b.b.set_tess_state = evergreen_set_tess_state;
+ rctx->b.b.set_hw_atomic_buffers = evergreen_set_hw_atomic_buffers;
if (rctx->b.chip_class == EVERGREEN)
rctx->b.b.get_sample_position = evergreen_get_sample_position;
else
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(cs, AC_ENCODE_TRACE_POINT(rctx->trace_id));
}
+
+/**
+ * Collect the atomic counters used by the bound hardware shader stages and
+ * emit one SET_APPEND_CNT packet per counter ahead of the draw.
+ *
+ * \param rctx                 r600 context
+ * \param combined_atomics     caller-provided array, indexed by hardware
+ *                             counter number, filled with one entry per
+ *                             counter in use
+ * \param atomic_used_mask_p   out: bit N set when combined_atomics[N] was
+ *                             filled in
+ * \return always true
+ */
+bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
+					struct r600_shader_atomic *combined_atomics,
+					uint8_t *atomic_used_mask_p)
+{
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
+	unsigned pkt_flags = 0;
+	uint8_t used = 0;
+	int stage, r, k;
+
+	/* Pass 1: flatten every stage's counter ranges into single counters. */
+	for (stage = 0; stage < EG_NUM_HW_STAGES; stage++) {
+		struct r600_pipe_shader *pshader = rctx->hw_shader_stages[stage].shader;
+		uint8_t nranges;
+
+		if (!pshader)
+			continue;
+
+		nranges = pshader->shader.nhwatomic_ranges;
+		for (r = 0; r < nranges; r++) {
+			struct r600_shader_atomic *range = &pshader->shader.atomics[r];
+			int ncounters = range->end - range->start + 1;
+
+			for (k = 0; k < ncounters; k++) {
+				unsigned hw_slot = range->hw_idx + k;
+				struct r600_shader_atomic *out;
+
+				/* Already recorded by an earlier stage. */
+				if (used & (1u << hw_slot))
+					continue;
+
+				out = &combined_atomics[hw_slot];
+				out->hw_idx = hw_slot;
+				out->buffer_id = range->buffer_id;
+				out->start = range->start + k;
+				out->end = out->start + 1;
+				used |= (1u << hw_slot);
+			}
+		}
+	}
+
+	/* Pass 2: one packet (plus relocation NOP) per counter in use. */
+	uint32_t pending = used;
+	while (pending) {
+		unsigned slot = u_bit_scan(&pending);
+		struct r600_shader_atomic *atomic = &combined_atomics[slot];
+		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
+		unsigned reloc;
+		uint64_t src_va;
+		uint32_t reg_val;
+
+		assert(resource);
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+						  resource,
+						  RADEON_USAGE_READ,
+						  RADEON_PRIO_SHADER_RW_BUFFER);
+
+		/* Each counter occupies one dword in its buffer. */
+		src_va = resource->gpu_address + (atomic->start * 4);
+
+		/* Dword index of this counter's append-count register,
+		 * relative to the context register block. */
+		reg_val = (R_02872C_GDS_APPEND_COUNT_0 + atomic->hw_idx * 4 -
+			   EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
+
+		radeon_emit(cs, PKT3(PKT3_SET_APPEND_CNT, 2, 0) | pkt_flags);
+		radeon_emit(cs, (reg_val << 16) | 0x3);
+		radeon_emit(cs, src_va & 0xfffffffc);
+		radeon_emit(cs, (src_va >> 32) & 0xff);
+		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+		radeon_emit(cs, reloc);
+	}
+
+	*atomic_used_mask_p = used;
+	return true;
+}
+
+/**
+ * Emit the packets that write the counters used by the last draw back to
+ * their buffers, followed by a fence write and a wait on that fence.
+ *
+ * \param rctx                 r600 context
+ * \param combined_atomics     per-counter entries produced by
+ *                             evergreen_emit_atomic_buffer_setup()
+ * \param atomic_used_mask_p   in: bit N set when combined_atomics[N] is valid
+ *
+ * The set of counters comes solely from *atomic_used_mask_p; the context's
+ * enabled_mask is not consulted here.
+ */
+void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
+				       struct r600_shader_atomic *combined_atomics,
+				       uint8_t *atomic_used_mask_p)
+{
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
+	uint32_t pkt_flags = 0;
+	uint32_t event = EVENT_TYPE_PS_DONE;
+	uint32_t mask = *atomic_used_mask_p;
+	uint64_t dst_offset;
+	unsigned reloc;
+
+	while (mask) {
+		unsigned atomic_index = u_bit_scan(&mask);
+		struct r600_shader_atomic *atomic = &combined_atomics[atomic_index];
+		struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
+		assert(resource);
+
+		uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0;
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+						  resource,
+						  RADEON_USAGE_WRITE,
+						  RADEON_PRIO_SHADER_RW_BUFFER);
+		/* Each counter occupies one dword in its buffer. */
+		dst_offset = resource->gpu_address + (atomic->start * 4);
+		uint32_t reg_val = (base_reg_0 + atomic->hw_idx * 4) >> 2;
+
+		/* NOTE(review): the 0 in bits 31:29 presumably selects
+		 * "store append count" - confirm against the PM4 docs. */
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
+		radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6));
+		radeon_emit(cs, (dst_offset) & 0xffffffff);
+		radeon_emit(cs, (0 << 29) | ((dst_offset >> 32) & 0xff));
+		radeon_emit(cs, reg_val);
+		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+		radeon_emit(cs, reloc);
+	}
+
+	/* Write a fresh fence value to the append_fence buffer ... */
+	++rctx->append_fence_id;
+	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+					  r600_resource(rctx->append_fence),
+					  RADEON_USAGE_READWRITE,
+					  RADEON_PRIO_SHADER_RW_BUFFER);
+	dst_offset = r600_resource(rctx->append_fence)->gpu_address;
+	/* NOTE(review): the 2 in bits 31:29 presumably selects "store 32-bit
+	 * data" - confirm against the PM4 docs. */
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
+	radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6));
+	radeon_emit(cs, dst_offset & 0xffffffff);
+	radeon_emit(cs, (2 << 29) | ((dst_offset >> 32) & 0xff));
+	radeon_emit(cs, rctx->append_fence_id);
+	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cs, reloc);
+
+	/* ... then wait until that memory location is >= the value written.
+	 * NOTE(review): meaning of (1 << 8) and the 0xa poll value is not
+	 * visible here - confirm against the PM4 docs. */
+	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0) | pkt_flags);
+	radeon_emit(cs, WAIT_REG_MEM_GEQUAL | WAIT_REG_MEM_MEMORY | (1 << 8));
+	radeon_emit(cs, dst_offset & 0xffffffff);
+	radeon_emit(cs, ((dst_offset >> 32) & 0xff));
+	radeon_emit(cs, rctx->append_fence_id);
+	radeon_emit(cs, 0xffffffff);
+	radeon_emit(cs, 0xa);
+	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+	radeon_emit(cs, reloc);
+}
r600_resource_reference(&rctx->dummy_cmask, NULL);
r600_resource_reference(&rctx->dummy_fmask, NULL);
+ if (rctx->append_fence)
+ pipe_resource_reference((struct pipe_resource**)&rctx->append_fence, NULL);
for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL);
free(rctx->driver_consts[sh].constants);
rctx->b.family == CHIP_CAICOS ||
rctx->b.family == CHIP_CAYMAN ||
rctx->b.family == CHIP_ARUBA);
+
+ rctx->append_fence = pipe_buffer_create(rctx->b.b.screen, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_DEFAULT, 32);
break;
default:
R600_ERR("Unsupported chip class %d.\n", rctx->b.chip_class);
case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+ return 0;
case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+ if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics)
+ return 8;
+ return 0;
case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+ /* having to allocate the atomics out amongst shaders stages is messy,
+ so give compute 8 buffers and all the others one */
+ if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics) {
+ return EG_MAX_ATOMIC_BUFFERS;
+ }
return 0;
case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
/* due to a bug in the shader compiler, some loops hang
/* Create the auxiliary context. This must be done last. */
rscreen->b.aux_context = rscreen->b.b.context_create(&rscreen->b.b, NULL, 0);
+ rscreen->has_atomics = rscreen->b.info.drm_minor >= 44;
#if 0 /* This is for testing whether aux_context and buffer clearing work correctly. */
struct pipe_resource templ = {};
#define R600_MAX_DRIVER_CONST_BUFFERS 3
#define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + R600_MAX_DRIVER_CONST_BUFFERS)
+#define EG_MAX_ATOMIC_BUFFERS 8
+
/* start driver buffers after user buffers */
#define R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS)
#define R600_UCP_SIZE (4*4*8)
struct r600_common_screen b;
bool has_msaa;
bool has_compressed_msaa_texturing;
+ bool has_atomics;
/*for compute global memory binding, we allocate stuff here, instead of
* buffers.
struct r600_pipe_shader *shader;
};
+struct r600_atomic_buffer_state { /* per-context bindings of HW atomic counter buffers */
+ uint32_t enabled_mask; /* bit N is set while buffer[N] has a resource bound */
+ uint32_t dirty_mask; /* NOTE(review): not referenced by the code visible in this patch */
+ struct pipe_shader_buffer buffer[EG_MAX_ATOMIC_BUFFERS]; /* resource, offset and size for each slot */
+};
+
struct r600_context {
struct r600_common_context b;
struct r600_screen *screen;
struct r600_config_state config_state;
struct r600_stencil_ref_state stencil_ref;
struct r600_vgt_state vgt_state;
+ struct r600_atomic_buffer_state atomic_buffer_state;
/* Shaders and shader resources. */
struct r600_cso_state vertex_fetch_shader;
struct r600_shader_state hw_shader_stages[EG_NUM_HW_STAGES];
struct r600_resource *last_trace_buf;
struct r600_resource *trace_buf;
unsigned trace_id;
+
+ struct pipe_resource *append_fence;
+ uint32_t append_fence_id;
};
static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
void eg_trace_emit(struct r600_context *rctx);
void eg_dump_debug_state(struct pipe_context *ctx, FILE *f,
unsigned flags);
+
+struct r600_shader_atomic;
+bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
+ struct r600_shader_atomic *combined_atomics,
+ uint8_t *atomic_used_mask_p);
+void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
+ struct r600_shader_atomic *combined_atomics,
+ uint8_t *atomic_used_mask_p);
+
#endif
/* disable SB for shaders using doubles */
use_sb &= !shader->shader.uses_doubles;
+ use_sb &= !shader->shader.uses_atomics;
+
/* Check if the bytecode has already been built. */
if (!shader->shader.bc.bytecode) {
r = r600_bytecode_build(&shader->shader.bc);
if (i->Src[j].Register.Dimension) {
switch (i->Src[j].Register.File) {
case TGSI_FILE_CONSTANT:
+ case TGSI_FILE_HW_ATOMIC:
break;
case TGSI_FILE_INPUT:
if (ctx->type == PIPE_SHADER_GEOMETRY ||
case TGSI_FILE_ADDRESS:
break;
+ case TGSI_FILE_HW_ATOMIC:
+ i = ctx->shader->nhwatomic_ranges;
+ ctx->shader->atomics[i].start = d->Range.First;
+ ctx->shader->atomics[i].end = d->Range.Last;
+ ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
+ ctx->shader->atomics[i].array_id = d->Array.ArrayID;
+ ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
+ ctx->shader->nhwatomic_ranges++;
+ ctx->shader->nhwatomic += count;
+ break;
+
case TGSI_FILE_SYSTEM_VALUE:
if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
shader->indirect_files = ctx.info.indirect_files;
shader->uses_doubles = ctx.info.uses_doubles;
+ shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
shader->nsys_inputs = 0;
indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
shader->vs_as_gs_a = key.vs.as_gs_a;
shader->vs_as_es = key.vs.as_es;
shader->vs_as_ls = key.vs.as_ls;
+ shader->atomic_base = key.vs.first_atomic_counter;
if (shader->vs_as_es)
ring_outputs = true;
if (shader->vs_as_ls)
break;
case PIPE_SHADER_GEOMETRY:
ring_outputs = true;
+ shader->atomic_base = key.gs.first_atomic_counter;
break;
case PIPE_SHADER_TESS_CTRL:
shader->tcs_prim_mode = key.tcs.prim_mode;
+ shader->atomic_base = key.tcs.first_atomic_counter;
lds_outputs = true;
lds_inputs = true;
break;
case PIPE_SHADER_TESS_EVAL:
shader->tes_as_es = key.tes.as_es;
+ shader->atomic_base = key.tes.first_atomic_counter;
lds_inputs = true;
if (shader->tes_as_es)
ring_outputs = true;
break;
case PIPE_SHADER_FRAGMENT:
shader->two_side = key.ps.color_two_side;
+ shader->atomic_base = key.ps.first_atomic_counter;
break;
default:
break;
return 0;
}
+/*
+ * Map a TGSI HW_ATOMIC source operand to a hardware counter number.
+ *
+ * Indirect operands are matched by array ID and yield the hw_idx of the
+ * matching range. Direct operands are matched by buffer index and counter
+ * index and yield hw_idx plus the offset into the range. Asserts and
+ * returns -1 when no declared range matches.
+ */
+static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
+				  struct tgsi_full_src_register *src)
+{
+	struct r600_shader_atomic *range = ctx->shader->atomics;
+	struct r600_shader_atomic *range_end = range + ctx->shader->nhwatomic_ranges;
+
+	if (src->Register.Indirect) {
+		for (; range != range_end; range++) {
+			if (src->Indirect.ArrayID == range->array_id)
+				return range->hw_idx;
+		}
+	} else {
+		uint32_t index = src->Register.Index;
+
+		for (; range != range_end; range++) {
+			if (range->buffer_id == src->Dimension.Index &&
+			    index >= range->start && index <= range->end)
+				return range->hw_idx + (index - range->start);
+		}
+	}
+
+	assert(0);
+	return -1;
+}
+
+
+/*
+ * Emit a GDS READ_RET fetch for a TGSI LOAD whose source is a HW atomic
+ * counter. The result is written to the X channel of the destination
+ * register. Returns 0 on success or the error from r600_bytecode_add_gds().
+ */
+static int tgsi_load_gds(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bytecode_gds gds;
+	int counter = find_hw_atomic_counter(ctx, &inst->Src[0]);
+	int index_mode = inst->Src[0].Register.Indirect ? 2 : 0;
+	int r;
+
+	memset(&gds, 0, sizeof(gds));
+	gds.op = FETCH_OP_GDS_READ_RET;
+	gds.uav_id = counter;
+	gds.uav_index_mode = index_mode;
+	gds.alloc_consume = 1;
+
+	/* Destination: only the X channel receives the fetched value. */
+	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
+	gds.dst_sel_x = 0;
+	gds.dst_sel_y = 7;
+	gds.dst_sel_z = 7;
+	gds.dst_sel_w = 7;
+
+	/* Sources: a plain read passes no data operand. */
+	gds.src_gpr = ctx->temp_reg;
+	gds.src_gpr2 = ctx->temp_reg;
+	gds.src_sel_x = 4;
+	gds.src_sel_y = 4;
+	gds.src_sel_z = 4;
+
+	r = r600_bytecode_add_gds(ctx->bc, &gds);
+	if (r)
+		return r;
+
+	ctx->bc->cf_last->vpm = 1;
+	return 0;
+}
+
+/*
+ * Translate TGSI LOAD. Only loads from HW atomic counters are implemented;
+ * any other source file is reported as unsupported instead of silently
+ * producing no bytecode.
+ */
+static int tgsi_load(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+
+	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
+		return tgsi_load_gds(ctx);
+
+	return tgsi_unsupported(ctx);
+}
+
+/*
+ * Return the GDS fetch opcode implementing the given TGSI atomic opcode,
+ * or -1 when there is no mapping for it.
+ */
+static int get_gds_op(int opcode)
+{
+	static const struct {
+		int tgsi_op;
+		int gds_op;
+	} gds_ops[] = {
+		{ TGSI_OPCODE_ATOMUADD, FETCH_OP_GDS_ADD_RET },
+		{ TGSI_OPCODE_ATOMAND,  FETCH_OP_GDS_AND_RET },
+		{ TGSI_OPCODE_ATOMOR,   FETCH_OP_GDS_OR_RET },
+		{ TGSI_OPCODE_ATOMXOR,  FETCH_OP_GDS_XOR_RET },
+		{ TGSI_OPCODE_ATOMUMIN, FETCH_OP_GDS_MIN_UINT_RET },
+		{ TGSI_OPCODE_ATOMUMAX, FETCH_OP_GDS_MAX_UINT_RET },
+		{ TGSI_OPCODE_ATOMXCHG, FETCH_OP_GDS_XCHG_RET },
+		{ TGSI_OPCODE_ATOMCAS,  FETCH_OP_GDS_CMP_XCHG_RET },
+	};
+	unsigned n;
+
+	for (n = 0; n < sizeof(gds_ops) / sizeof(gds_ops[0]); n++) {
+		if (gds_ops[n].tgsi_op == opcode)
+			return gds_ops[n].gds_op;
+	}
+	return -1;
+}
+
+/*
+ * Emit a GDS atomic operation on a HW atomic counter.
+ *
+ * The data operand (TGSI source 2) is first moved into temp_reg.x, then a
+ * returning GDS op is emitted whose result lands in the X channel of the
+ * destination register.
+ *
+ * For ATOMUADD with a negative immediate the op is turned into a GDS SUB of
+ * the magnitude. For every other op the immediate is passed through
+ * unmodified, since it is a bit pattern or an unsigned value rather than a
+ * signed quantity.
+ *
+ * Returns 0 on success, -1 for an opcode with no GDS equivalent, or the
+ * error from the bytecode builder.
+ *
+ * NOTE(review): only one data operand is set up here; whether that is
+ * sufficient for FETCH_OP_GDS_CMP_XCHG_RET should be confirmed.
+ */
+static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bytecode_gds gds;
+	struct r600_bytecode_alu alu;
+	int gds_op = get_gds_op(inst->Instruction.Opcode);
+	int r;
+	int uav_id = 0;
+	int uav_index_mode = 0;
+
+	if (gds_op == -1) {
+		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
+		return -1;
+	}
+
+	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
+
+	if (inst->Src[0].Register.Indirect)
+		uav_index_mode = 2;
+
+	/* MOV temp_reg.x, <operand> */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MOV;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 0;
+	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
+		uint32_t operand = ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX];
+
+		/*
+		 * Only an add is rewritten as a subtract of the magnitude.
+		 * Unsigned negation is well defined for every value, unlike
+		 * abs(INT_MIN).
+		 */
+		if (gds_op == FETCH_OP_GDS_ADD_RET && (operand & 0x80000000u)) {
+			gds_op = FETCH_OP_GDS_SUB_RET;
+			operand = 0u - operand;
+		}
+		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+		alu.src[0].value = operand;
+	} else {
+		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
+	}
+	alu.last = 1;
+	alu.dst.write = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
+	gds.op = gds_op;
+	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
+	gds.uav_id = uav_id;
+	gds.uav_index_mode = uav_index_mode;
+	gds.src_gpr = ctx->temp_reg;
+	gds.src_gpr2 = ctx->temp_reg;
+	gds.src_sel_x = 4;
+	gds.src_sel_y = 0;	/* the operand moved into temp_reg.x above */
+	gds.src_sel_z = 4;
+	gds.dst_sel_x = 0;
+	gds.dst_sel_y = 7;
+	gds.dst_sel_z = 7;
+	gds.dst_sel_w = 7;
+	gds.alloc_consume = 1;
+	r = r600_bytecode_add_gds(ctx->bc, &gds);
+	if (r)
+		return r;
+	ctx->bc->cf_last->vpm = 1;
+	return 0;
+}
+
+/*
+ * Translate a TGSI atomic opcode. Only atomics on HW atomic counters are
+ * implemented; any other target file is reported as unsupported instead of
+ * silently producing no bytecode.
+ */
+static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+
+	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
+		return tgsi_atomic_op_gds(ctx);
+
+	return tgsi_unsupported(ctx);
+}
+
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
- [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
+ [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
[163] = { ALU_OP0_NOP, tgsi_unsupported},
[164] = { ALU_OP0_NOP, tgsi_unsupported},
[165] = { ALU_OP0_NOP, tgsi_unsupported},
[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
- [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
+ [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
- [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
+ [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
[163] = { ALU_OP0_NOP, tgsi_unsupported},
[164] = { ALU_OP0_NOP, tgsi_unsupported},
[165] = { ALU_OP0_NOP, tgsi_unsupported},
[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
- [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
- [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
+ [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
+ [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
int ring_offset;
};
+struct r600_shader_atomic { /* one declared range of HW atomic counters in a shader */
+ unsigned start, end; /* first and last counter index from the TGSI declaration range; end is inclusive there */
+ unsigned buffer_id; /* atomic buffer index taken from the declaration's 2D dimension */
+ unsigned hw_idx; /* hardware counter number assigned to the counter at `start` */
+ unsigned array_id; /* TGSI array ID, used to resolve indirect accesses */
+};
+
struct r600_shader {
unsigned processor_type;
struct r600_bytecode bc;
unsigned ninput;
unsigned noutput;
+ unsigned nhwatomic;
unsigned nlds;
unsigned nsys_inputs;
struct r600_shader_io input[64];
struct r600_shader_io output[64];
+ struct r600_shader_atomic atomics[8];
+ unsigned nhwatomic_ranges;
boolean uses_kill;
boolean fs_write_all;
boolean two_side;
struct r600_shader_array * arrays;
boolean uses_doubles;
+ boolean uses_atomics;
+ uint8_t atomic_base;
};
union r600_shader_key {
struct {
unsigned nr_cbufs:4;
+ unsigned first_atomic_counter:4; /* first HW atomic counter number for this stage; copied to shader->atomic_base */
unsigned color_two_side:1;
unsigned alpha_to_one:1;
} ps;
struct {
unsigned prim_id_out:8;
+ unsigned first_atomic_counter:4; /* first HW atomic counter number for this stage */
unsigned as_es:1; /* export shader */
unsigned as_ls:1; /* local shader */
unsigned as_gs_a:1;
} vs;
struct {
+ unsigned first_atomic_counter:4; /* first HW atomic counter number for this stage */
unsigned as_es:1;
} tes;
struct {
+ unsigned first_atomic_counter:4; /* first HW atomic counter number for this stage */
unsigned prim_mode:3;
} tcs;
+ struct {
+ unsigned first_atomic_counter:4; /* first HW atomic counter number for this stage */
+ } gs;
};
struct r600_shader_array {
}
}
+/*
+ * Return the number of HW atomic counters declared by the stages that are
+ * numbered ahead of `shader`, i.e. the first counter number available to it.
+ *
+ * Counters are handed out in the order PS, VS, GS, TES, TCS, so each stage
+ * starts after the sum of the counts of all stages before it in that order.
+ * Fragment, compute and unknown stages start at 0. The geometry shader is
+ * the only one checked for NULL; the others are dereferenced as-is.
+ */
+static int r600_get_hw_atomic_count(const struct pipe_context *ctx,
+				    enum pipe_shader_type shader)
+{
+	const struct r600_context *rctx = (struct r600_context *)ctx;
+	int first = 0;
+
+	/* Each case adds the stage just ahead of it, then falls through to
+	 * pick up every earlier stage as well. */
+	switch (shader) {
+	case PIPE_SHADER_TESS_CTRL:
+		first += rctx->tes_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
+		/* fallthrough */
+	case PIPE_SHADER_TESS_EVAL:
+		if (rctx->gs_shader)
+			first += rctx->gs_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
+		/* fallthrough */
+	case PIPE_SHADER_GEOMETRY:
+		first += rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
+		/* fallthrough */
+	case PIPE_SHADER_VERTEX:
+		first += rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
+		break;
+	case PIPE_SHADER_FRAGMENT:
+	case PIPE_SHADER_COMPUTE:
+	default:
+		break;
+	}
+	return first;
+}
+
/* Compute the key for the hw shader variant */
static inline void r600_shader_selector_key(const struct pipe_context *ctx,
const struct r600_pipe_shader_selector *sel,
key->vs.as_gs_a = true;
key->vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
}
+ key->vs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_VERTEX);
break;
}
case PIPE_SHADER_GEOMETRY:
+ key->gs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_GEOMETRY);
break;
case PIPE_SHADER_FRAGMENT: {
+ key->ps.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_FRAGMENT);
key->ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
key->ps.alpha_to_one = rctx->alpha_to_one &&
rctx->rasterizer && rctx->rasterizer->multisample_enable &&
}
case PIPE_SHADER_TESS_EVAL:
key->tes.as_es = (rctx->gs_shader != NULL);
+ key->tes.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_EVAL);
break;
case PIPE_SHADER_TESS_CTRL:
key->tcs.prim_mode = rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+ key->tcs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_CTRL);
break;
default:
assert(0);
unsigned num_patches, dirty_tex_counter, index_offset = 0;
unsigned index_size = info->index_size;
int index_bias;
+ struct r600_shader_atomic combined_atomics[8];
+ uint8_t atomic_used_mask;
if (!info->indirect && !info->count && (index_size || !info->count_from_stream_output)) {
return;
: (rctx->tes_shader)? rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]
: info->mode;
+ if (rctx->b.chip_class >= EVERGREEN)
+ evergreen_emit_atomic_buffer_setup(rctx, combined_atomics, &atomic_used_mask);
+
if (index_size) {
index_offset += info->start * index_size;
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT));
}
+
+ if (rctx->b.chip_class >= EVERGREEN)
+ evergreen_emit_atomic_buffer_save(rctx, combined_atomics, &atomic_used_mask);
+
if (rctx->trace_buf)
eg_trace_emit(rctx);
#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8)
#define PKT3_WAIT_REG_MEM 0x3C
#define WAIT_REG_MEM_EQUAL 3
+#define WAIT_REG_MEM_GEQUAL 5
+#define WAIT_REG_MEM_MEMORY (1 << 4)
#define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x) & 0x3) << 4)
#define PKT3_COPY_DATA 0x40
#define COPY_DATA_SRC_SEL(x) ((x) & 0xf)