From: Tom Stellard <thomas.stellard@amd.com>
Date: Tue, 17 May 2016 18:10:51 +0000 (+0000)
Subject: radeon/llvm: Use alloca instructions for larger arrays
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=513fccdfb68e6a71180e21827f071617c93fd09b;p=mesa.git

radeon/llvm: Use alloca instructions for larger arrays

We were storing arrays in vectors, which was leading to some really bad
spill code for large arrays.  allocas instructions are a better fit for
arrays and LLVM optimizations are more geared toward dealing with
allocas instead of vectors.

For arrays that have 16 or less 32-bit elements, we will continue to use
vectors, because this will force LLVM to store them in registers and
use indirect registers, which is usually faster for small arrays.

In the future we should use allocas for all arrays and teach LLVM
how to store allocas in registers.

This fixes the piglit test:

spec/glsl-1.50/execution/geometry/max-input-component

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 61afa7aa776..cc4bd5b7254 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -50,6 +50,11 @@ struct radeon_llvm_loop {
 	LLVMBasicBlockRef endloop_block;
 };
 
+struct radeon_llvm_array {
+	struct tgsi_declaration_range range;
+	LLVMValueRef alloca;
+};
+
 struct radeon_llvm_context {
 	struct lp_build_tgsi_soa_context soa;
 
@@ -96,7 +101,7 @@ struct radeon_llvm_context {
 	unsigned loop_depth;
 	unsigned loop_depth_max;
 
-	struct tgsi_declaration_range *arrays;
+	struct radeon_llvm_array *arrays;
 
 	LLVMValueRef main_fn;
 	LLVMTypeRef return_type;
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 3def486883d..6d31f7072fc 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -83,11 +83,25 @@ static LLVMValueRef emit_swizzle(
 
 static struct tgsi_declaration_range
 get_array_range(struct lp_build_tgsi_context *bld_base,
-		unsigned File, const struct tgsi_ind_register *reg)
+		unsigned File, unsigned reg_index,
+		const struct tgsi_ind_register *reg)
 {
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 
-	if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
+	if (!reg) {
+		unsigned i;
+		unsigned num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+		for (i = 0; i < num_arrays; i++) {
+			const struct tgsi_declaration_range *range =
+						&ctx->arrays[i].range;
+
+			if (reg_index >= range->First && reg_index <= range->Last) {
+				return ctx->arrays[i].range;
+			}
+		}
+	}
+
+	if (File != TGSI_FILE_TEMPORARY || !reg || reg->ArrayID == 0 ||
 	    reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
 		struct tgsi_declaration_range range;
 		range.First = 0;
@@ -95,7 +109,32 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
 		return range;
 	}
 
-	return ctx->arrays[reg->ArrayID - 1];
+	return ctx->arrays[reg->ArrayID - 1].range;
+}
+
+static LLVMValueRef get_alloca_for_array(
+	struct lp_build_tgsi_context *bld_base,
+	unsigned file,
+	unsigned index) {
+
+	unsigned i;
+	unsigned num_arrays;
+	struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
+
+	if (file != TGSI_FILE_TEMPORARY) {
+		return NULL;
+	}
+
+	num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+	for (i = 0; i < num_arrays; i++) {
+		const struct tgsi_declaration_range *range =
+						&ctx->arrays[i].range;
+
+		if (index >= range->First && index <= range->Last) {
+			return ctx->arrays[i].alloca;
+		}
+	}
+	return NULL;
 }
 
 static LLVMValueRef
@@ -106,6 +145,9 @@ emit_array_index(
 {
 	struct gallivm_state * gallivm = bld->bld_base.base.gallivm;
 
+	if (!reg) {
+		return lp_build_const_int32(gallivm, offset);
+	}
 	LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, bld->addr[reg->Index][reg->Swizzle], "");
 	return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), "");
 }
@@ -155,7 +197,7 @@ emit_array_fetch(
 		tmp_reg.Register.Index = i + range.First;
 		LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle);
 		result = LLVMBuildInsertElement(builder, result, temp,
-			lp_build_const_int32(gallivm, i), "");
+			lp_build_const_int32(gallivm, i), "array_vector");
 	}
 	return result;
 }
@@ -170,13 +212,35 @@ load_value_from_array(
 	const struct tgsi_ind_register *reg_indirect)
 {
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-	struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
+	LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
+	LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
+	LLVMValueRef ptr, val, indices[2];
+
+	if (!array) {
+		/* Handle the case where the array is stored as a vector. */
+		return LLVMBuildExtractElement(builder,
+				emit_array_fetch(bld_base, file, type, range, swizzle),
+				index, "");
+	}
 
-	return LLVMBuildExtractElement(builder,
-			emit_array_fetch(bld_base, file, type, range, swizzle),
-			emit_array_index(bld, reg_indirect, reg_index - range.First), "");
+	index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
+	index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, swizzle), "");
+	indices[0] = bld_base->uint_bld.zero;
+	indices[1] = index;
+	ptr = LLVMBuildGEP(builder, array, indices, 2, "");
+	val = LLVMBuildLoad(builder, ptr, "");
+	if (tgsi_type_is_64bit(type)) {
+		LLVMValueRef ptr_hi, val_hi;
+		indices[0] = lp_build_const_int32(gallivm, 1);
+		ptr_hi = LLVMBuildGEP(builder, ptr, indices, 1, "");
+		val_hi = LLVMBuildLoad(builder, ptr_hi, "");
+		val = radeon_llvm_emit_fetch_64bit(bld_base, type, val, val_hi);
 
+	}
+	return val;
 }
 
 static LLVMValueRef store_value_to_array(
@@ -188,13 +252,26 @@ static LLVMValueRef store_value_to_array(
 	const struct tgsi_ind_register *reg_indirect)
 {
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-	struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
-
-	return LLVMBuildInsertElement(builder,
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
+	LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
+	LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
+
+	if (array) {
+		LLVMValueRef indices[2];
+		index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
+		index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, chan_index), "");
+		indices[0] = bld_base->uint_bld.zero;
+		indices[1] = index;
+		LLVMValueRef pointer = LLVMBuildGEP(builder, array, indices, 2, "");
+		LLVMBuildStore(builder, value, pointer);
+		return NULL;
+	} else {
+		return LLVMBuildInsertElement(builder,
 				emit_array_fetch(bld_base, file, TGSI_TYPE_FLOAT, range, chan_index),
-				value,  emit_array_index(bld, reg_indirect, reg_index - range.First), "");
-	return NULL;
+				value, index, "");
+	}
 }
 
 LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
@@ -218,8 +295,9 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
 	}
 
 	if (reg->Register.Indirect) {
-		return load_value_from_array(bld_base, reg->Register.File, type,
+		LLVMValueRef load = load_value_from_array(bld_base, reg->Register.File, type,
 				swizzle, reg->Register.Index, &reg->Indirect);
+		return bitcast(bld_base, type, load);
 	}
 
 	switch(reg->Register.File) {
@@ -258,6 +336,11 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
 						 LLVMBuildLoad(builder, ptr, ""),
 						 LLVMBuildLoad(builder, ptr2, ""));
 		}
+		LLVMValueRef array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
+		if (array) {
+			return bitcast(bld_base, type, load_value_from_array(bld_base, reg->Register.File, type,
+					swizzle, reg->Register.Index, NULL));
+		}
 		result = LLVMBuildLoad(builder, ptr, "");
 		break;
 
@@ -310,6 +393,7 @@ static void emit_declaration(
 	const struct tgsi_full_declaration *decl)
 {
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
 	unsigned first, last, i, idx;
 	switch(decl->Declaration.File) {
 	case TGSI_FILE_ADDRESS:
@@ -327,13 +411,36 @@ static void emit_declaration(
 	}
 
 	case TGSI_FILE_TEMPORARY:
+	{
+		unsigned decl_size;
+		first = decl->Range.First;
+		last = decl->Range.Last;
+		decl_size = 4 * ((last - first) + 1);
 		if (decl->Declaration.Array) {
+			unsigned id = decl->Array.ArrayID - 1;
 			if (!ctx->arrays) {
 				int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
-				ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * size);
+				ctx->arrays = CALLOC(size, sizeof(ctx->arrays[0]));
+			for (i = 0; i < size; ++i) {
+				assert(!ctx->arrays[i].alloca);}
 			}
 
-			ctx->arrays[decl->Array.ArrayID - 1] = decl->Range;
+			ctx->arrays[id].range = decl->Range;
+
+			/* If the array is more than 16 elements (each element
+			 * is 32-bits), then store it in a vector.  Storing the
+			 * array in a vector will causes the compiler to store
+			 * the array in registers and access it using indirect
+			 * addressing.  16 is number of vector elements that
+			 * LLVM will store in a register.
+			 * FIXME: We shouldn't need to do this.  LLVM should be
+			 * smart enough to promote allocas int registers when
+			 * profitable.
+			 */
+			if (decl_size > 16) {
+				ctx->arrays[id].alloca = LLVMBuildAlloca(builder,
+					LLVMArrayType(bld_base->base.vec_type, decl_size),"array");
+			}
 		}
 		first = decl->Range.First;
 		last = decl->Range.Last;
@@ -350,7 +457,7 @@ static void emit_declaration(
 			}
 		}
 		break;
-
+	}
 	case TGSI_FILE_INPUT:
 	{
 		unsigned idx;
@@ -460,11 +567,16 @@ void radeon_llvm_emit_store(
 
 		if (reg->Register.Indirect) {
 			struct tgsi_declaration_range range = get_array_range(bld_base,
-				reg->Register.File, &reg->Indirect);
+				reg->Register.File, reg->Register.Index, &reg->Indirect);
 
         		unsigned i, size = range.Last - range.First + 1;
-			LLVMValueRef array = store_value_to_array(bld_base, value, reg->Register.File, chan_index,
-			                                          reg->Register.Index, &reg->Indirect);
+			unsigned file = reg->Register.File;
+			unsigned reg_index = reg->Register.Index;
+			LLVMValueRef array = store_value_to_array(bld_base, value, file, chan_index,
+			                                          reg_index, &reg->Indirect);
+	                if (get_alloca_for_array(bld_base, file, reg_index)) {
+				continue;
+			}
         		for (i = 0; i < size; ++i) {
 				switch(reg->Register.File) {
 				case TGSI_FILE_OUTPUT:
@@ -478,7 +590,7 @@ void radeon_llvm_emit_store(
 					break;
 
 				default:
-					return;
+					continue;
 				}
 				value = LLVMBuildExtractElement(builder, array, 
 					lp_build_const_int32(gallivm, i), "");
@@ -494,14 +606,23 @@ void radeon_llvm_emit_store(
 				break;
 
 			case TGSI_FILE_TEMPORARY:
+			{
+				LLVMValueRef array;
 				if (reg->Register.Index >= ctx->temps_count)
 					continue;
+				array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
+
+				if (array) {
+					store_value_to_array(bld_base, value, reg->Register.File, chan_index, reg->Register.Index,
+								NULL);
+					continue;
+				}
 				temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index];
 				if (tgsi_type_is_64bit(dtype))
 					temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1];
 
 				break;
-
+			}
 			default:
 				return;
 			}