X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fr600%2Fr600_shader.c;h=903a66302632f38161324b112778b8045a287468;hb=22fc5eff803d14e99e9f86db223ceedcef1b9108;hp=a462691f7aa7af0a1744f9a135bc4a5507b6154a;hpb=df155a73f4ba43fcc720c7b70b375ec9ed41bb89;p=mesa.git

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index a462691f7aa..903a6630263 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -197,6 +197,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 
 	use_sb &= !shader->shader.uses_atomics;
 	use_sb &= !shader->shader.uses_images;
+	use_sb &= !shader->shader.uses_helper_invocation;
 
 	/* Check if the bytecode has already been built. */
 	if (!shader->shader.bc.bytecode) {
@@ -346,6 +347,7 @@ struct r600_shader_ctx {
 	boolean clip_vertex_write;
 	unsigned cv_output;
 	unsigned edgeflag_output;
+	int helper_invoc_reg;
 	int cs_block_size_reg;
 	int cs_grid_size_reg;
 	bool cs_block_size_loaded, cs_grid_size_loaded;
@@ -1109,7 +1111,6 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 
 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
 				location = TGSI_INTERPOLATE_LOC_CENTER;
-				inputs[1].enabled = true; /* needs SAMPLEID */
 			} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
 				location = TGSI_INTERPOLATE_LOC_CENTER;
 				/* Needs sample positions, currently those are always available */
@@ -1137,6 +1138,24 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 
 	tgsi_parse_free(&parse);
 
+	if (ctx->info.reads_samplemask &&
+	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
+		inputs[1].enabled = true;
+	}
+
+	if (ctx->bc->chip_class >= EVERGREEN) {
+		int num_baryc = 0;
+		/* assign gpr to each interpolator according to priority */
+		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
+			if (ctx->eg_interpolators[i].enabled) {
+				ctx->eg_interpolators[i].ij_index = num_baryc;
+				num_baryc++;
+			}
+		}
+		num_baryc = (num_baryc + 1) >> 1;
+		gpr_offset += num_baryc;
+	}
+
 	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
 		boolean enabled = inputs[i].enabled;
 		int *reg = inputs[i].reg;
@@ -1163,18 +1182,21 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
  * for evergreen we need to scan the shader to find the number of GPRs we need to
  * reserve for interpolation and system values
  *
- * we need to know if we are going to emit
- * any sample or centroid inputs
+ * we need to know if we are going to emit any sample or centroid inputs
  * if perspective and linear are required
  */
 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
 {
 	unsigned i;
-	int num_baryc;
-	struct tgsi_parse_context parse;
 
 	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
 
+	/*
+	 * Could get this information from the shader info. But right now
+	 * we interpolate all declared inputs, whereas the shader info will
+	 * only contain the bits if the inputs are actually used, so it might
+	 * not be safe...
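+	 * (an input that is declared but never read has no bits set in the
+	 * info, yet it still gets an interpolator assigned here).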
+ */ for (i = 0; i < ctx->info.num_inputs; i++) { int k; /* skip position/face/mask/sampleid */ @@ -1191,53 +1213,9 @@ static int evergreen_gpr_count(struct r600_shader_ctx *ctx) ctx->eg_interpolators[k].enabled = TRUE; } - if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) { - return 0; - } - - /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */ - while (!tgsi_parse_end_of_tokens(&parse)) { - tgsi_parse_token(&parse); - - if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) { - const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID) - { - int interpolate, location, k; - - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { - location = TGSI_INTERPOLATE_LOC_CENTER; - } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { - location = TGSI_INTERPOLATE_LOC_CENTER; - } else { - location = TGSI_INTERPOLATE_LOC_CENTROID; - } - - interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index]; - k = eg_get_interpolator_index(interpolate, location); - if (k >= 0) - ctx->eg_interpolators[k].enabled = true; - } - } - } - - tgsi_parse_free(&parse); - - /* assign gpr to each interpolator according to priority */ - num_baryc = 0; - for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) { - if (ctx->eg_interpolators[i].enabled) { - ctx->eg_interpolators[i].ij_index = num_baryc; - num_baryc ++; - } - } - /* XXX PULL MODEL and LINE STIPPLE */ - num_baryc = (num_baryc + 1) >> 1; - return allocate_system_value_inputs(ctx, num_baryc); + return allocate_system_value_inputs(ctx, 0); } /* sample_id_sel == NULL means fetch for current sample */ @@ -1246,8 +1224,6 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ struct r600_bytecode_vtx vtx; int r, t1; - assert(ctx->fixed_pt_position_gpr != -1); - t1 = r600_get_temp(ctx); memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); @@ -1255,6 +1231,8 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; if (sample_id == NULL) { + assert(ctx->fixed_pt_position_gpr != -1); + vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w; vtx.src_sel_x = 3; } @@ -1295,6 +1273,75 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ return t1; } +static int eg_load_helper_invocation(struct r600_shader_ctx *ctx) +{ + int r; + struct r600_bytecode_alu alu; + + /* do a vtx fetch with wqm set on the vtx fetch */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->helper_invoc_reg; + alu.dst.chan = 0; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0xffffffff; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* do a vtx fetch in VPM mode */ + struct r600_bytecode_vtx vtx; + memset(&vtx, 0, sizeof(vtx)); + vtx.op = FETCH_OP_GET_BUFFER_RESINFO; + vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; + vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; + vtx.src_gpr = 0; + vtx.mega_fetch_count = 16; /* no idea here really... 
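(16 matches the GET_BUFFER_RESINFO fetch in r600_do_buffer_txq below)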
*/ + vtx.dst_gpr = ctx->helper_invoc_reg; + vtx.dst_sel_x = 4; + vtx.dst_sel_y = 7; /* SEL_Y */ + vtx.dst_sel_z = 7; /* SEL_Z */ + vtx.dst_sel_w = 7; /* SEL_W */ + vtx.data_format = FMT_32; + if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) + return r; + ctx->bc->cf_last->vpm = 1; + return 0; +} + +static int cm_load_helper_invocation(struct r600_shader_ctx *ctx) +{ + int r; + struct r600_bytecode_alu alu; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->helper_invoc_reg; + alu.dst.chan = 0; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0xffffffff; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->helper_invoc_reg; + alu.dst.chan = 0; + alu.src[0].sel = V_SQ_ALU_SRC_0; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE); + if (r) + return r; + + return ctx->helper_invoc_reg; +} + static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block) { struct r600_bytecode_vtx vtx; @@ -1458,6 +1505,12 @@ static void tgsi_src(struct r600_shader_ctx *ctx, r600_src->sel = load_block_grid_size(ctx, false); } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) { r600_src->sel = load_block_grid_size(ctx, true); + } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) { + r600_src->sel = ctx->helper_invoc_reg; + r600_src->swizzle[0] = 0; + r600_src->swizzle[1] = 0; + r600_src->swizzle[2] = 0; + r600_src->swizzle[3] = 0; } } else { if (tgsi_src->Register.Indirect) @@ -2136,7 +2189,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output for (i = 0; i < so->num_outputs; i++) { struct r600_bytecode_output output; - if (stream != -1 && stream != so->output[i].output_buffer) + if (stream != -1 && stream != so->output[i].stream) continue; memset(&output, 0, sizeof(struct r600_bytecode_output)); @@ -3120,6 +3173,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, tgsi_scan_shader(tokens, &ctx.info); shader->indirect_files = ctx.info.indirect_files; + shader->uses_helper_invocation = false; shader->uses_doubles = ctx.info.uses_doubles; shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC]; shader->nsys_inputs = 0; @@ -3169,7 +3223,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, break; case PIPE_SHADER_COMPUTE: shader->rat_base = 0; - shader->image_size_const_offset = 0; + shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER]; break; default: break; @@ -3193,6 +3247,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.clip_vertex_write = 0; ctx.thread_id_gpr_loaded = false; + ctx.helper_invoc_reg = -1; ctx.cs_block_size_reg = -1; ctx.cs_grid_size_reg = -1; ctx.cs_block_size_loaded = false; @@ -3238,6 +3293,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); else ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]); + + for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { + if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) { + ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++; + shader->uses_helper_invocation = true; + } + } } if (ctx.type == PIPE_SHADER_GEOMETRY) { 
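 		/* presumably the first two GPRs are reserved here for the incoming
 		 * vertex offsets; cf. the gs_rotated_input defaults of 0 and 1 below */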
/* FIXME 1 would be enough in some cases (3 or less input vertices) */ @@ -3282,41 +3344,38 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[TGSI_FILE_CONSTANT] = 512; ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; - ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + - ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2; + + int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] + + ctx.info.file_max[TGSI_FILE_TEMPORARY]; + ctx.bc->ar_reg = ++regno; + ctx.bc->index_reg[0] = ++regno; + ctx.bc->index_reg[1] = ++regno; if (ctx.type == PIPE_SHADER_TESS_CTRL) { - ctx.tess_input_info = ctx.bc->ar_reg + 3; - ctx.tess_output_info = ctx.bc->ar_reg + 4; - ctx.temp_reg = ctx.bc->ar_reg + 5; + ctx.tess_input_info = ++regno; + ctx.tess_output_info = ++regno; } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { ctx.tess_input_info = 0; - ctx.tess_output_info = ctx.bc->ar_reg + 3; - ctx.temp_reg = ctx.bc->ar_reg + 4; + ctx.tess_output_info = ++regno; } else if (ctx.type == PIPE_SHADER_GEOMETRY) { - ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3; - ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4; - ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5; - ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6; - ctx.temp_reg = ctx.bc->ar_reg + 7; + ctx.gs_export_gpr_tregs[0] = ++regno; + ctx.gs_export_gpr_tregs[1] = ++regno; + ctx.gs_export_gpr_tregs[2] = ++regno; + ctx.gs_export_gpr_tregs[3] = ++regno; if (ctx.shader->gs_tri_strip_adj_fix) { - ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7; - ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8; - ctx.temp_reg += 2; + ctx.gs_rotated_input[0] = ++regno; + ctx.gs_rotated_input[1] = ++regno; } else { ctx.gs_rotated_input[0] = 0; ctx.gs_rotated_input[1] = 1; } - } else { - ctx.temp_reg = ctx.bc->ar_reg + 3; } if (shader->uses_images) { - ctx.thread_id_gpr = ctx.temp_reg++; + ctx.thread_id_gpr = ++regno; ctx.thread_id_gpr_loaded = false; } + ctx.temp_reg = ++regno; shader->max_arrays = 0; shader->num_arrays = 0; @@ -3439,6 +3498,64 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) shader->nr_ps_max_color_exports = 8; + if (ctx.shader->uses_helper_invocation) { + if (ctx.bc->chip_class == CAYMAN) + r = cm_load_helper_invocation(&ctx); + else + r = eg_load_helper_invocation(&ctx); + if (r) + return r; + } + + /* + * XXX this relies on fixed_pt_position_gpr only being present when + * this shader should be executed per sample. Should be the case for now... + */ + if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) { + /* + * Fix up sample mask. The hw always gives us coverage mask for + * the pixel. However, for per-sample shading, we need the + * coverage for the shader invocation only. + * Also, with disabled msaa, only the first bit should be set + * (luckily the same fixup works for both problems). + * For now, we can only do it if we know this shader is always + * executed per sample (due to usage of bits in the shader + * forcing per-sample execution). + * If the fb is not multisampled, we'd do unnecessary work but + * it should still be correct. + * It will however do nothing for sample shading according + * to MinSampleShading. 
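+	 * (In short, the two instructions below compute
+	 * face_gpr.z &= 1 << fixed_pt_position_gpr.w.)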
+ */ + struct r600_bytecode_alu alu; + int tmp = r600_get_temp(&ctx); + assert(ctx.face_gpr != -1); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + + alu.op = ALU_OP2_LSHL_INT; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0x1; + alu.src[1].sel = ctx.fixed_pt_position_gpr; + alu.src[1].chan = 3; + alu.dst.sel = tmp; + alu.dst.chan = 0; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_AND_INT; + alu.src[0].sel = tmp; + alu.src[1].sel = ctx.face_gpr; + alu.src[1].chan = 2; + alu.dst.sel = ctx.face_gpr; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) + return r; + } + if (ctx.fragcoord_input >= 0) { if (ctx.bc->chip_class == CAYMAN) { for (j = 0 ; j < 4; j++) { @@ -3780,6 +3897,17 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].array_base = shader->output[i].sid; output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; shader->nr_ps_color_exports++; + shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4)); + + /* If the i-th target format is set, all previous target formats must + * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well. + */ + if (shader->output[i].sid > 0) + for (unsigned x = 0; x < shader->output[i].sid; x++) + shader->ps_color_export_mask |= (1 << (x*4)); + + if (shader->output[i].sid > shader->ps_export_highest) + shader->ps_export_highest = shader->output[i].sid; if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) { for (k = 1; k < max_color_exports; k++) { j++; @@ -3795,6 +3923,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].op = CF_OP_EXPORT; output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; shader->nr_ps_color_exports++; + shader->ps_color_export_mask |= (0xf << (j * 4)); } } } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { @@ -3883,6 +4012,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].op = CF_OP_EXPORT; j++; shader->nr_ps_color_exports++; + shader->ps_color_export_mask = 0xf; } noutput = j; @@ -4395,44 +4525,109 @@ static int egcm_int_to_double(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; - int i, r; - int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + int i, c, r; + int write_mask = inst->Dst[0].Register.WriteMask; + int temp_reg = r600_get_temp(ctx); assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D || inst->Instruction.Opcode == TGSI_OPCODE_U2D); - for (i = 0; i <= (lasti+1)/2; i++) { - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ctx->inst_info->op; - - r600_bytecode_src(&alu.src[0], &ctx->src[0], i); - alu.dst.sel = ctx->temp_reg; - alu.dst.chan = i; - alu.dst.write = 1; - alu.last = 1; + for (c = 0; c < 2; c++) { + int dchan = c * 2; + if (write_mask & (0x3 << dchan)) { + /* split into 24-bit int and 8-bit int */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_AND_INT; + alu.dst.sel = temp_reg; + alu.dst.chan = dchan; + r600_bytecode_src(&alu.src[0], &ctx->src[0], c); + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 0xffffff00; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = 
ALU_OP2_AND_INT; + alu.dst.sel = temp_reg; + alu.dst.chan = dchan + 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], c); + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 0xff; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } } - for (i = 0; i <= lasti; i++) { - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_FLT32_TO_FLT64; + for (c = 0; c < 2; c++) { + int dchan = c * 2; + if (write_mask & (0x3 << dchan)) { + for (i = dchan; i <= dchan + 1; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT; - alu.src[0].chan = i/2; - if (i%2 == 0) - alu.src[0].sel = ctx->temp_reg; - else { - alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; - alu.src[0].value = 0x0; + alu.src[0].sel = temp_reg; + alu.src[0].chan = i; + alu.dst.sel = temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + if (ctx->bc->chip_class == CAYMAN) + alu.last = i == dchan + 1; + else + alu.last = 1; /* trans only ops on evergreen */ + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } } - tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); - alu.last = i == lasti; + } - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; + for (c = 0; c < 2; c++) { + int dchan = c * 2; + if (write_mask & (0x3 << dchan)) { + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_FLT32_TO_FLT64; + + alu.src[0].chan = dchan + (i / 2); + if (i == 0 || i == 2) + alu.src[0].sel = temp_reg; + else { + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0x0; + } + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.last = i == 3; + alu.dst.write = 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + for (i = 0; i <= 1; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD_64; + + alu.src[0].chan = fp64_switch(i); + alu.src[0].sel = ctx->temp_reg; + + alu.src[1].chan = fp64_switch(i + 2); + alu.src[1].sel = ctx->temp_reg; + tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst); + alu.last = i == 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } } return 0; @@ -6850,7 +7045,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l return 0; } -static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset) +static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; int r; @@ -6876,7 +7071,7 @@ static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offs struct r600_bytecode_vtx vtx; memset(&vtx, 0, sizeof(vtx)); vtx.op = FETCH_OP_GET_BUFFER_RESINFO; - vtx.buffer_id = id + R600_MAX_CONST_BUFFERS; + vtx.buffer_id = id + eg_buffer_base; vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; vtx.src_gpr = 0; vtx.mega_fetch_count = 16; /* no idea here really... 
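(same value as the helper-invocation fetch above)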
*/ @@ -6950,7 +7145,7 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { if (ctx->bc->chip_class < EVERGREEN) ctx->shader->uses_tex_buffers = true; - return r600_do_buffer_txq(ctx, 1, 0); + return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS); } else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { if (ctx->bc->chip_class < EVERGREEN) @@ -7302,6 +7497,168 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) } } + if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { + /* Gather4 should follow the same rules as bilinear filtering, but the hardware + * incorrectly forces nearest filtering if the texture format is integer. + * The only effect it has on Gather4, which always returns 4 texels for + * bilinear filtering, is that the final coordinates are off by 0.5 of + * the texel size. + * + * The workaround is to subtract 0.5 from the unnormalized coordinates, + * or (0.5 / size) from the normalized coordinates. + */ + if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT || + inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) { + int treg = r600_get_temp(ctx); + + /* mov array and comparison oordinate to temp_reg if needed */ + if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || + inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || + inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) { + int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2; + for (i = 2; i <= end; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = (i == end); + r600_bytecode_src(&alu.src[0], &ctx->src[0], i); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + + if (inst->Texture.Texture == TGSI_TEXTURE_RECT || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) { + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = i == 1; + if (src_loaded) { + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = i; + } else + r600_bytecode_src(&alu.src[0], &ctx->src[0], i); + alu.src[1].sel = V_SQ_ALU_SRC_0_5; + alu.src[1].neg = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } else { + /* execute a TXQ */ + memset(&tex, 0, sizeof(struct r600_bytecode_tex)); + tex.op = FETCH_OP_GET_TEXTURE_RESINFO; + tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); + tex.sampler_index_mode = sampler_index_mode; + tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; + tex.resource_index_mode = sampler_index_mode; + tex.dst_gpr = treg; + tex.src_sel_x = 4; + tex.src_sel_y = 4; + tex.src_sel_z = 4; + tex.src_sel_w = 4; + tex.dst_sel_x = 0; + tex.dst_sel_y = 1; + tex.dst_sel_z = 7; + tex.dst_sel_w = 7; + r = r600_bytecode_add_tex(ctx->bc, &tex); + if (r) + return r; + + /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */ + if (ctx->bc->chip_class == CAYMAN) { + /* */ + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_INT_TO_FLT; + alu.dst.sel = treg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.last = (i == 1) ? 
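/* only the last channel closes the ALU group */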
1 : 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + for (j = 0; j < 2; j++) { + for (i = 0; i < 3; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_RECIP_IEEE; + alu.src[0].sel = treg; + alu.src[0].chan = j; + alu.dst.sel = treg; + alu.dst.chan = i; + if (i == 2) + alu.last = 1; + if (i == j) + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + } else { + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_INT_TO_FLT; + alu.dst.sel = treg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_RECIP_IEEE; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.dst.sel = treg; + alu.dst.chan = i; + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP3_MULADD; + alu.is_op3 = 1; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = i == 1; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.src[1].sel = V_SQ_ALU_SRC_0_5; + alu.src[1].neg = 1; + if (src_loaded) { + alu.src[2].sel = ctx->temp_reg; + alu.src[2].chan = i; + } else + r600_bytecode_src(&alu.src[2], &ctx->src[0], i); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + src_loaded = TRUE; + src_gpr = ctx->temp_reg; + } + } + if (src_requires_loading && !src_loaded) { for (i = 0; i < 4; i++) { memset(&alu, 0, sizeof(struct r600_bytecode_alu)); @@ -7619,15 +7976,15 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) tex.inst_mod = texture_component_select; if (ctx->bc->chip_class == CAYMAN) { - /* GATHER4 result order is different from TGSI TG4 */ - tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7; - tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7; - tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7; + tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; + tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; + tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; } else { - tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; - tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; - tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; + /* GATHER4 result order is different from TGSI TG4 */ + tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7; + tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7; + tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7; tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; } } @@ -8520,6 +8877,33 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) if (r) return r; + if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) { + if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) { + int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = is_cm ? 
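/* must land in the channel gds.src_sel_z reads below */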
2 : 1; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = value; + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } else { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = is_cm ? 2 : 1; + r600_bytecode_src(&alu.src[0], &ctx->src[3], 0); + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) { int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]); int abs_value = abs(value); @@ -8559,7 +8943,10 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) gds.src_gpr2 = 0; gds.src_sel_x = is_cm ? 0 : 4; gds.src_sel_y = is_cm ? 1 : 0; - gds.src_sel_z = 7; + if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) + gds.src_sel_z = is_cm ? 2 : 1; + else + gds.src_sel_z = 7; gds.dst_sel_x = 0; gds.dst_sel_y = 7; gds.dst_sel_z = 7; @@ -8664,7 +9051,11 @@ static int tgsi_resq(struct r600_shader_ctx *ctx) (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { if (ctx->bc->chip_class < EVERGREEN) ctx->shader->uses_tex_buffers = true; - return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset); + unsigned eg_buffer_base = 0; + eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET; + if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) + eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE]; + return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base); } if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&