From: Dave Airlie <airlied@redhat.com>
Date: Fri, 20 Feb 2015 00:47:15 +0000 (+1000)
Subject: r600g: add doubles support for CAYMAN
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=3be5ee1574e0a3494c0550f4b7ae09117855d749;p=mesa.git

r600g: add doubles support for CAYMAN

Only a subset of AMD GPUs supported by r600g support doubles,
CAYMAN and CYPRESS are probably all we'll try and support, however
I don't have a CYPRESS so ignore that for now.

This disables SB support for doubles, as we think we need to
make the scheduler smarter to introduce delay slots.

[airlied: pushing this to avoid pain of rebasing, it mostly
works on cayman only so far, Glenn has some ideas about
delay slot issues we need to look into. turned off by
default for now]

Signed-off-by: Dave Airlie <airlied@redhat.com>
---

diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 9b356bad4a9..bc6980660a5 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -252,6 +252,12 @@ static int alu_uses_rel(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
 	return 0;
 }
 
+static int is_alu_64bit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
+{
+	const struct alu_op_info *op = r600_isa_alu(alu->op);
+	return (op->flags & AF_64);
+}
+
 static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
 {
 	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
@@ -576,6 +582,12 @@ static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
 
 	for (i = 0; i < max_slots; ++i) {
 		if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) && !prev[i]->dst.rel) {
+
+			if (is_alu_64bit_inst(bc, prev[i])) {
+				gpr[i] = -1;
+				continue;
+			}
+
 			gpr[i] = prev[i]->dst.sel;
 			/* cube writes more than PV.X */
 			if (is_alu_reduction_inst(bc, prev[i]))
@@ -591,6 +603,8 @@ static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
 		if(!alu)
 			continue;
 
+		if (is_alu_64bit_inst(bc, alu))
+			continue;
 		num_src = r600_bytecode_get_num_operands(bc, alu);
 		for (src = 0; src < num_src; ++src) {
 			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index b282907db5f..7cf3a090908 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -280,4 +280,19 @@ void eg_bytecode_export_read(struct r600_bytecode *bc,
 
 void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
 			   unsigned *num_format, unsigned *format_comp, unsigned *endian);
+
+static inline int fp64_switch(int i)
+{
+	switch (i) {
+	case 0:
+		return 1;
+	case 1:
+		return 0;
+	case 2:
+		return 3;
+	case 3:
+		return 2;
+	}
+	return 0;
+}
 #endif
diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h
index fdbe1c00d20..27fc1e8e8be 100644
--- a/src/gallium/drivers/r600/r600_isa.h
+++ b/src/gallium/drivers/r600/r600_isa.h
@@ -339,11 +339,11 @@ static const struct alu_op_info alu_op_table[] = {
 		{"PRED_SETGT_64",             2, { 0x7C, 0xC7 },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_PRED | AF_CC_GT | AF_64 },
 		{"PRED_SETE_64",              2, { 0x7D, 0xC8 },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_PRED | AF_CC_E | AF_64 },
 		{"PRED_SETGE_64",             2, { 0x7E, 0xC9 },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_PRED | AF_CC_GE | AF_64 },
-		{"MUL_64",                    2, { 0x1B, 0xCA },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_64 },
+		{"MUL_64",                    2, { 0x1B, 0xCA },{   AF_V,  AF_V,  AF_V,  AF_4V}, AF_64 },
 		{"ADD_64",                    2, { 0x17, 0xCB },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_64 },
 		{"MOVA_INT",                  1, { 0x18, 0xCC },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_MOVA },
-		{"FLT64_TO_FLT32",            1, { 0x1C, 0xCD },{   AF_V,  AF_V,  AF_V,  AF_V},  0 },
-		{"FLT32_TO_FLT64",            1, { 0x1D, 0xCE },{   AF_V,  AF_V,  AF_V,  AF_V},  0 },
+		{"FLT64_TO_FLT32",            1, { 0x1C, 0xCD },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_64 },
+		{"FLT32_TO_FLT64",            1, { 0x1D, 0xCE },{   AF_V,  AF_V,  AF_V,  AF_V},  AF_64 },
 		{"SAD_ACCUM_PREV_UINT",       2, {   -1, 0xCF },{      0,     0,  AF_V,  AF_V},  AF_UINT_DST | AF_PREV_NEXT },
 		{"DOT",                       2, {   -1, 0xD0 },{      0,     0,  AF_V,  AF_V},  AF_PREV_NEXT },
 		{"MUL_PREV",                  1, {   -1, 0xD1 },{      0,     0,  AF_V,  AF_V},  AF_PREV_INTERLEAVE },
@@ -369,7 +369,7 @@ static const struct alu_op_info alu_op_table[] = {
 		{"FMA",                       3, {   -1, 0x07 },{      0,     0,  AF_V,  AF_V},  0 },
 		{"MULADD_INT24",              3, {   -1, 0x08 },{      0,     0,     0,  AF_V},  AF_INT_DST | AF_24 },
 		{"CNDNE_64",                  3, {   -1, 0x09 },{      0,     0,  AF_V,  AF_V},  AF_CMOV | AF_64 },
-		{"FMA_64",                    3, {   -1, 0x0A },{      0,     0,  AF_V,  AF_V},  AF_64 },
+		{"FMA_64",                    3, {   -1, 0x0A },{      0,     0,  AF_V,  AF_4V}, AF_64 },
 		{"LERP_UINT",                 3, {   -1, 0x0B },{      0,     0,  AF_V,  AF_V},  AF_UINT_DST },
 		{"BIT_ALIGN_INT",             3, {   -1, 0x0C },{      0,     0,  AF_V,  AF_V},  AF_INT_DST },
 		{"BYTE_ALIGN_INT",            3, {   -1, 0x0D },{      0,     0,  AF_V,  AF_V},  AF_INT_DST },
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 25a251e8187..1489298a097 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -47,7 +47,7 @@ MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
 These 8xx t-slot only opcodes become vector ops, with all four 
 slots expecting the arguments on sources a and b. Result is 
 broadcast to all channels.
-MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
+MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
 These 8xx t-slot only opcodes become vector ops in the z, y, and 
 x slots.
 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
@@ -167,6 +167,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
     }
 	/* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */
 	use_sb &= !shader->shader.uses_index_registers;
+	/* disable SB for shaders using doubles */
+	use_sb &= !shader->shader.uses_doubles;
 
 	/* Check if the bytecode has already been built.  When using the llvm
 	 * backend, r600_shader_from_tgsi() will take care of building the
@@ -343,7 +345,7 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
 	int j;
 
-	if (i->Instruction.NumDstRegs > 1) {
+	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
 		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
 		return -EINVAL;
 	}
@@ -1907,6 +1909,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.tokens = tokens;
 	tgsi_scan_shader(tokens, &ctx.info);
 	shader->indirect_files = ctx.info.indirect_files;
+
+	shader->uses_doubles = ctx.info.uses_doubles;
+
 	indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
 	tgsi_parse_init(&ctx.parse, tokens);
 	ctx.type = ctx.info.processor;
@@ -2679,6 +2684,167 @@ static int tgsi_last_instruction(unsigned writemask)
 	return lasti;
 }
 
+
+
+static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	unsigned write_mask = inst->Dst[0].Register.WriteMask;
+	struct r600_bytecode_alu alu;
+	int i, j, r, lasti = tgsi_last_instruction(write_mask);
+	int use_tmp = 0;
+
+	if (singledest) {
+		switch (write_mask) {
+		case 0x1:
+			write_mask = 0x3;
+			break;
+		case 0x2:
+			use_tmp = 1;
+			write_mask = 0x3;
+			break;
+		case 0x4:
+			write_mask = 0xc;
+			break;
+		case 0x8:
+			write_mask = 0xc;
+			use_tmp = 3;
+			break;
+		}
+	}
+
+	lasti = tgsi_last_instruction(write_mask);
+	for (i = 0; i <= lasti; i++) {
+
+		if (!(write_mask & (1 << i)))
+			continue;
+
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+		if (singledest) {
+			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+			if (use_tmp) {
+				alu.dst.sel = ctx->temp_reg;
+				alu.dst.chan = i;
+				alu.dst.write = 1;
+			}
+			if (i == 1 || i == 3)
+				alu.dst.write = 0;
+		} else
+			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+
+		alu.op = ctx->inst_info->op;
+		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
+			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+		} else if (!swap) {
+			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
+				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
+			}
+		} else {
+			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
+			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
+		}
+
+		/* handle some special cases */
+		if (i == 1 || i == 3) {
+			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
+			case TGSI_OPCODE_SUB:
+				r600_bytecode_src_toggle_neg(&alu.src[1]);
+				break;
+			case TGSI_OPCODE_DABS:
+				r600_bytecode_src_set_abs(&alu.src[0]);
+				break;
+			default:
+				break;
+			}
+		}
+		if (i == lasti) {
+			alu.last = 1;
+		}
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+
+	if (use_tmp) {
+		write_mask = inst->Dst[0].Register.WriteMask;
+
+		/* move result from temp to dst */
+		for (i = 0; i <= lasti; i++) {
+			if (!(write_mask & (1 << i)))
+				continue;
+
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP1_MOV;
+			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+			alu.src[0].sel = ctx->temp_reg;
+			alu.src[0].chan = use_tmp - 1;
+			alu.last = (i == lasti);
+
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+	}
+	return 0;
+}
+
+static int tgsi_op2_64(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	unsigned write_mask = inst->Dst[0].Register.WriteMask;
+	/* confirm writemaskiing */
+	if ((write_mask & 0x3) != 0x3 &&
+	    (write_mask & 0xc) != 0xc) {
+		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
+		return -1;
+	}
+	return tgsi_op2_64_params(ctx, false, false);
+}
+
+static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
+{
+	return tgsi_op2_64_params(ctx, true, false);
+}
+
+static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
+{
+	return tgsi_op2_64_params(ctx, true, true);
+}
+
+static int tgsi_op3_64(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bytecode_alu alu;
+	int i, j, r;
+	int lasti = 3;
+	int tmp = r600_get_temp(ctx);
+
+	for (i = 0; i < lasti + 1; i++) {
+
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ctx->inst_info->op;
+		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
+			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
+		}
+
+		if (inst->Dst[0].Register.WriteMask & (1 << i))
+			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+		else
+			alu.dst.sel = tmp;
+
+		alu.dst.chan = i;
+		alu.is_op3 = 1;
+		if (i == lasti) {
+			alu.last = 1;
+		}
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -2795,6 +2961,152 @@ static int tgsi_ineg(struct r600_shader_ctx *ctx)
 
 }
 
+static int tgsi_dneg(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bytecode_alu alu;
+	int i, r;
+	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+
+	for (i = 0; i < lasti + 1; i++) {
+
+		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+			continue;
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP1_MOV;
+
+		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+
+		if (i == 1 || i == 3)
+			r600_bytecode_src_toggle_neg(&alu.src[0]);
+		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+
+		if (i == lasti) {
+			alu.last = 1;
+		}
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+
+}
+
+static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bytecode_alu alu;
+	unsigned write_mask = inst->Dst[0].Register.WriteMask;
+	int i, j, r;
+	int firsti = write_mask == 0xc ? 2 : 0;
+
+	for (i = 0; i <= 3; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ctx->inst_info->op;
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
+			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
+		}
+
+		if (i == 3)
+			alu.last = 1;
+
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+
+	/* MOV first two channels to writemask dst0 */
+	for (i = 0; i <= 1; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP1_MOV;
+		alu.src[0].chan = i + 2;
+		alu.src[0].sel = ctx->temp_reg;
+
+		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
+		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
+		alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+
+	for (i = 0; i <= 3; i++) {
+		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
+			/* MOV third channels to writemask dst1 */
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP1_MOV;
+			alu.src[0].chan = 1;
+			alu.src[0].sel = ctx->temp_reg;
+
+			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
+			alu.last = 1;
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+			break;
+		}
+	}
+	return 0;
+}
+
+static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	int i, r;
+	struct r600_bytecode_alu alu;
+	int last_slot = 3;
+	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+	int t1 = ctx->temp_reg;
+
+	/* these have to write the result to X/Y by the looks of it */
+	for (i = 0 ; i < last_slot; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ctx->inst_info->op;
+
+		/* should only be one src regs */
+		assert (inst->Instruction.NumSrcRegs == 1);
+
+		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
+		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
+
+		/* RSQ should take the absolute value of src */
+		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
+		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
+			r600_bytecode_src_set_abs(&alu.src[1]);
+		}
+		alu.dst.sel = t1;
+		alu.dst.chan = i;
+		alu.dst.write = (i == 0 || i == 1);
+
+		if (i == last_slot - 1)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+
+	for (i = 0 ; i <= lasti; i++) {
+		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+			continue;
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP1_MOV;
+		alu.src[0].sel = t1;
+		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
+		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+		alu.dst.write = 1;
+		if (i == lasti)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -2873,6 +3185,55 @@ static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
 	return 0;
 }
 
+
+static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	int i, j, k, r;
+	struct r600_bytecode_alu alu;
+	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+	int t1 = ctx->temp_reg;
+
+	for (k = 0; k <= 2; k++) {
+		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
+			continue;
+
+		for (i = 0; i < 4; i++) {
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ctx->inst_info->op;
+			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
+				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
+			}
+			alu.dst.sel = t1;
+			alu.dst.chan = i;
+			alu.dst.write = 1;
+			if (i == 3)
+				alu.last = 1;
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+	}
+
+	for (i = 0; i <= lasti; i++) {
+		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+			continue;
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP1_MOV;
+		alu.src[0].sel = t1;
+		alu.src[0].chan = i;
+		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+		alu.dst.write = 1;
+		if (i == lasti)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
 /*
  * r600 - trunc to -PI..PI range
  * r700 - normalize by dividing by 2PI
@@ -7965,5 +8326,28 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
+	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
+	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
+	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
+	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
+	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
+	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
+	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
+	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
+	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
+	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
+	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
+	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
+	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
+	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
+	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
+	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
+	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
+	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
+	[TGSI_OPCODE_D2I]	= { ALU_OP0_NOP, tgsi_unsupported},
+	[TGSI_OPCODE_I2D]	= { ALU_OP0_NOP, tgsi_unsupported},
+	[TGSI_OPCODE_D2U]	= { ALU_OP0_NOP, tgsi_unsupported},
+	[TGSI_OPCODE_U2D]	= { ALU_OP0_NOP, tgsi_unsupported},
+	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
 };
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index f49e1c1d64e..48de9cdb156 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -89,6 +89,8 @@ struct r600_shader {
 	unsigned		vs_as_gs_a;
 	unsigned                ps_prim_id_input;
 	struct r600_shader_array * arrays;
+
+	boolean			uses_doubles;
 };
 
 union r600_shader_key {