memset(results, 0, buffer->b.b.width0);
if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
- query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
unsigned max_rbs = rscreen->info.num_render_backends;
unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
unsigned num_results;
switch (query_type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
query->result_size = 16 * rscreen->info.num_render_backends;
query->result_size += 16; /* for the fence + alignment */
query->num_cs_dw_begin = 6;
unsigned type, int diff)
{
if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
- type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
bool old_enable = rctx->num_occlusion_queries != 0;
bool old_perfect_enable =
rctx->num_perfect_occlusion_queries != 0;
rctx->num_occlusion_queries += diff;
assert(rctx->num_occlusion_queries >= 0);
- if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
+ if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
rctx->num_perfect_occlusion_queries += diff;
assert(rctx->num_perfect_occlusion_queries >= 0);
}
switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
radeon_emit(cs, va);
switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
va += 8;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
op = PRED_OP(PREDICATION_OP_ZPASS);
break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
switch (rquery->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
params->start_offset = 0;
params->end_offset = 8;
params->fence_offset = max_rbs * 16;
}
break;
}
- case PIPE_QUERY_OCCLUSION_PREDICATE: {
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
for (unsigned i = 0; i < max_rbs; ++i) {
unsigned results_base = i * 16;
result->b = result->b ||
* 1.x = fence_offset
* 1.y = pair_stride
* 1.z = pair_count
+ * 1.w = result_offset
+ * 2.x = buffer0 offset
*
* BUFFER[0] = query result buffer
* BUFFER[1] = previous summary buffer
"DCL BUFFER[0]\n"
"DCL BUFFER[1]\n"
"DCL BUFFER[2]\n"
- "DCL CONST[0][0..1]\n"
+ "DCL CONST[0][0..2]\n"
"DCL TEMP[0..5]\n"
"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
"IMM[1] UINT32 {1, 2, 4, 8}\n"
"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
"UIF TEMP[5]\n"
/* Check result availability. */
- "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
+ "UADD TEMP[1].x, CONST[0][1].xxxx, CONST[0][2].xxxx\n"
+ "LOAD TEMP[1].x, BUFFER[0], TEMP[1].xxxx\n"
"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
"MOV TEMP[1], TEMP[0].zzzz\n"
"NOT TEMP[0].z, TEMP[0].zzzz\n"
/* Load result if available. */
"UIF TEMP[1]\n"
- "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
+ "UADD TEMP[0].x, IMM[0].xxxx, CONST[0][2].xxxx\n"
+ "LOAD TEMP[0].xy, BUFFER[0], TEMP[0].xxxx\n"
"ENDIF\n"
"ELSE\n"
/* Load previously accumulated result if requested. */
/* Load fence and check result availability */
"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
+ "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n"
"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
"NOT TEMP[0].z, TEMP[0].zzzz\n"
/* Load start and end. */
"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
+ "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n"
"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
"UIF TEMP[4]\n"
/* Store accumulated data for chaining. */
- "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
+ "STORE BUFFER[2].xyz, CONST[0][1].wwww, TEMP[0]\n"
"ELSE\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
"UIF TEMP[4]\n"
/* Store result availability. */
"NOT TEMP[0].z, TEMP[0]\n"
"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
- "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
+ "STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].zzzz\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
"UIF TEMP[4]\n"
- "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
+ "STORE BUFFER[2].y, CONST[0][1].wwww, IMM[0].xxxx\n"
"ENDIF\n"
"ELSE\n"
/* Store result if it is available. */
"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
"UIF TEMP[4]\n"
- "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
+ "STORE BUFFER[2].xy, CONST[0][1].wwww, TEMP[0].xyxy\n"
"ELSE\n"
/* Clamping */
"UIF TEMP[0].yyyy\n"
"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
"ENDIF\n"
- "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+ "STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].xxxx\n"
"ENDIF\n"
"ENDIF\n"
"ENDIF\n"
uint32_t fence_offset;
uint32_t pair_stride;
uint32_t pair_count;
+ uint32_t buffer_offset;
+ uint32_t buffer0_offset;
} consts;
if (!rctx->query_result_shader) {
consts.config = 0;
if (index < 0)
consts.config |= 4;
- if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE)
+ if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
consts.config |= 8;
else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
params.start_offset += qbuf->results_end - query->result_size;
}
- rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
-
ssbo[0].buffer = &qbuf->buf->b.b;
- ssbo[0].buffer_offset = params.start_offset;
- ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
-
+ ssbo[0].buffer_offset = params.start_offset & ~0xff;
+ ssbo[0].buffer_size = qbuf->results_end - ssbo[0].buffer_offset;
+ consts.buffer0_offset = (params.start_offset & 0xff);
if (!qbuf->previous) {
+
ssbo[2].buffer = resource;
- ssbo[2].buffer_offset = offset;
- ssbo[2].buffer_size = 8;
+ ssbo[2].buffer_offset = offset & ~0xff;
+ ssbo[2].buffer_size = offset + 8;
+ consts.buffer_offset = (offset & 0xff);
+ } else
+ consts.buffer_offset = 0;
- }
+ rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
va += params.fence_offset;
- r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
+ r600_gfx_wait_fence(rctx, qbuf->buf, va, 0x80000000, 0x80000000);
}
rctx->b.launch_grid(&rctx->b, &grid);
int r;
/* validate this for other ops */
- assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT);
+ assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
alu.op = op;
alu.src[0].sel = src0_sel;
return 0;
}
-static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
+static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
+ struct r600_bytecode_alu_src *src)
{
struct r600_bytecode_alu alu;
int r;
alu.dst.write = 1;
alu.dst.chan = 0;
- r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ alu.src[0] = *src;
alu.src[1].sel = V_SQ_ALU_SRC_0;
alu.src[1].chan = 0;
}
#endif
-static int emit_if(struct r600_shader_ctx *ctx, int opcode)
+static int emit_if(struct r600_shader_ctx *ctx, int opcode,
+ struct r600_bytecode_alu_src *src)
{
int alu_type = CF_OP_ALU_PUSH_BEFORE;
alu_type = CF_OP_ALU;
}
- emit_logic_pred(ctx, opcode, alu_type);
+ emit_logic_pred(ctx, opcode, alu_type, src);
r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
static int tgsi_if(struct r600_shader_ctx *ctx)
{
- return emit_if(ctx, ALU_OP2_PRED_SETNE);
+ struct r600_bytecode_alu_src alu_src;
+ r600_bytecode_src(&alu_src, &ctx->src[0], 0);
+
+ return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
}
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
- return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
+ struct r600_bytecode_alu_src alu_src;
+ r600_bytecode_src(&alu_src, &ctx->src[0], 0);
+ return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
}
static int tgsi_else(struct r600_shader_ctx *ctx)
return 0;
}
+static int emit_u64add(struct r600_shader_ctx *ctx, int op,
+ int treg,
+ int src0_sel, int src0_chan,
+ int src1_sel, int src1_chan)
+{
+ struct r600_bytecode_alu alu;
+ int r;
+ int opc;
+
+ if (op == ALU_OP2_ADD_INT)
+ opc = ALU_OP2_ADDC_UINT;
+ else
+ opc = ALU_OP2_SUBB_UINT;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = op; ;
+ alu.dst.sel = treg;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+ alu.src[0].sel = src0_sel;
+ alu.src[0].chan = src0_chan + 0;
+ alu.src[1].sel = src1_sel;
+ alu.src[1].chan = src1_chan + 0;
+ alu.src[1].neg = 0;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = op;
+ alu.dst.sel = treg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ alu.src[0].sel = src0_sel;
+ alu.src[0].chan = src0_chan + 1;
+ alu.src[1].sel = src1_sel;
+ alu.src[1].chan = src1_chan + 1;
+ alu.src[1].neg = 0;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = opc;
+ alu.dst.sel = treg;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
+ alu.last = 1;
+ alu.src[0].sel = src0_sel;
+ alu.src[0].chan = src0_chan + 0;
+ alu.src[1].sel = src1_sel;
+ alu.src[1].chan = src1_chan + 0;
+ alu.src[1].neg = 0;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = op;
+ alu.dst.sel = treg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ alu.src[0].sel = treg;
+ alu.src[0].chan = 1;
+ alu.src[1].sel = treg;
+ alu.src[1].chan = 2;
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ return 0;
+}
+
+static int egcm_u64add(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ struct r600_bytecode_alu alu;
+ int r;
+ int treg = ctx->temp_reg;
+ int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
+
+ if (ctx->src[1].neg) {
+ op = ALU_OP2_SUB_INT;
+ opc = ALU_OP2_SUBB_UINT;
+ }
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = op; ;
+ alu.dst.sel = treg;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
+ alu.src[1].neg = 0;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = op;
+ alu.dst.sel = treg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
+ alu.src[1].neg = 0;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = opc ;
+ alu.dst.sel = treg;
+ alu.dst.chan = 2;
+ alu.dst.write = 1;
+ alu.last = 1;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
+ alu.src[1].neg = 0;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = op;
+ tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
+ alu.src[0].sel = treg;
+ alu.src[0].chan = 1;
+ alu.src[1].sel = treg;
+ alu.src[1].chan = 2;
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP1_MOV;
+ tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
+ alu.src[0].sel = treg;
+ alu.src[0].chan = 0;
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ return 0;
+}
+
/* Emit TGSI U64MUL: 64-bit unsigned multiply, keeping the low 64 bits.
 *
 *   result.x = lo32(a.x * b.x)
 *   result.y = hi32(a.x * b.x) + lo32(a.x * b.y) + lo32(a.y * b.x)
 *
 * (the upper 64 bits of the full 128-bit product are discarded).
 * Multiplies go through emit_mul_int_op — presumably the helper that
 * handles the multi-slot MULLO/MULHI encoding on this hardware;
 * defined elsewhere in this file, TODO confirm.
 * Returns 0 on success or the first emission error code.
 */
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y (cross term, low 32 bits only) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x (cross term, low 32 bits only) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w — sum of the two cross terms */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z — high dword of the 64-bit product */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x (no 'last' bit: grouped with the dst.y MOV below) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
+
/* Emit a 64-bit unsigned "greater or equal" compare.
 *
 * treg.x receives the truth value of
 *   (src0.hi > src1.hi) || ((src0.hi == src1.hi) && (src0.lo >= src1.lo))
 * where each operand's low dword is at srcN_base_chan and its high
 * dword at srcN_base_chan + 1.  treg.y and treg.z are clobbered as
 * scratch.
 * Returns 0 on success or the error from single_alu_op2().
 */
static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
	/* treg.y = src0.hi > src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = src0.lo >= src1.lo */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	/* treg.z = src0.hi == src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = (lo >= lo) && (hi == hi) */
	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	/* treg.x |= (hi > hi) — the final 64-bit >= result */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}
+
/* Emit TGSI U64DIV as an unrolled binary long division.
 *
 * This isn't a complete div — it's just enough for the query-buffer
 * (qbo) result shader to work: the divisor must be a literal whose
 * high dword is zero, and exactly one 64-bit division (WriteMask 0x3)
 * may be requested.  Returns -1 when those preconditions fail, 0 on
 * success, or an emission error code.
 *
 * The algorithm mirrors the classic shift-subtract division: two
 * unrolled 31-iteration loops (numerator-high phase, then full 64-bit
 * phase), each followed by a manually peeled final iteration.  Because
 * the divisor is a literal, every shifted divisor value is computed on
 * the CPU and emitted as a literal.
 */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a const with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp quot are tmp_num.zw */
	/* alu_denom_hi is resolved but unused — it is known to be the
	 * zero literal checked above. */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* tmp_num.zw = 0 — quotient accumulator starts empty */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg 0 is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check denom hi for 0, but we know it is already */
	/* t0.y = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* enter the high-phase only when the numerator's high dword can
	 * contain the divisor at all */
	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb t0.x = msb(src[1].x) first — divisor is a literal, so
	 * the bit position is computed on the CPU */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here: high-dword phase, bit 31 down to bit 1 */
	for (i = 0; i < 31; i++) {
		/* t0.z = this iteration is within range of log2_denom */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* t0.y = tmp_num.y >= denom_lo_shl */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		/* subtract the shifted divisor and set the quotient bit */
		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* close the "num_hi >= denom_lo" conditional */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll: full 64-bit phase */
	for (i = 0; i < 31; i++) {
		/* t0.y = this iteration is within range of log2_denom */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* 64-bit shifted divisor, computed on the CPU, loaded
		 * into treg.zw as two literal dwords */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		/* sub_tmp.x = remainder >= shifted divisor (64-bit) */
		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;


		/* remainder -= shifted divisor (64-bit subtract) */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		/* set this quotient bit in the low dword */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* Dst.xy = quotient (tmp_num.zw) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
+
+static int egcm_u64sne(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ struct r600_bytecode_alu alu;
+ int r;
+ int treg = ctx->temp_reg;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP2_SETNE_INT;
+ alu.dst.sel = treg;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP2_SETNE_INT;
+ alu.dst.sel = treg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP2_OR_INT;
+ tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
+ alu.src[0].sel = treg;
+ alu.src[0].chan = 0;
+ alu.src[1].sel = treg;
+ alu.src[1].chan = 1;
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ return 0;
+}
+
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
+ [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
+ [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
+ [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
+ [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};
[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
+ [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
+ [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
+ [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
+ [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};