radeon/llvm: Use alloca instructions for larger arrays [revert a revert]
authorMarek Olšák <marek.olsak@amd.com>
Mon, 25 Jul 2016 19:26:05 +0000 (21:26 +0200)
committerMarek Olšák <marek.olsak@amd.com>
Tue, 26 Jul 2016 21:31:56 +0000 (23:31 +0200)
This reverts commit f84e9d749fbb6da73a60fb70e6725db773c9b8f8.

Bioshock Infinite no longer hangs.

src/gallium/drivers/radeon/radeon_llvm.h
src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c

index d456a92bc2706ad5f7457eef777b63dc653a07cd..13f333631477c231d531fef31dc5b31f1a3ae11f 100644 (file)
@@ -50,6 +50,11 @@ struct radeon_llvm_loop {
        LLVMBasicBlockRef endloop_block;
 };
 
+struct radeon_llvm_array {
+       struct tgsi_declaration_range range;
+       LLVMValueRef alloca;
+};
+
 struct radeon_llvm_context {
        struct lp_build_tgsi_soa_context soa;
 
@@ -96,7 +101,7 @@ struct radeon_llvm_context {
        unsigned loop_depth;
        unsigned loop_depth_max;
 
-       struct tgsi_declaration_range *arrays;
+       struct radeon_llvm_array *arrays;
 
        LLVMValueRef main_fn;
        LLVMTypeRef return_type;
index ed8fd3001dc0695c3ddb7bb0f4c699b26d76fd41..d382496387fbd09569c3f17ee101d89e9f2e0e9d 100644 (file)
@@ -112,11 +112,25 @@ static LLVMValueRef emit_swizzle(struct lp_build_tgsi_context *bld_base,
 
 static struct tgsi_declaration_range
 get_array_range(struct lp_build_tgsi_context *bld_base,
-               unsigned File, const struct tgsi_ind_register *reg)
+               unsigned File, unsigned reg_index,
+               const struct tgsi_ind_register *reg)
 {
        struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
 
-       if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
+       if (!reg) {
+               unsigned i;
+               unsigned num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+               for (i = 0; i < num_arrays; i++) {
+                       const struct tgsi_declaration_range *range =
+                                               &ctx->arrays[i].range;
+
+                       if (reg_index >= range->First && reg_index <= range->Last) {
+                               return ctx->arrays[i].range;
+                       }
+               }
+       }
+
+       if (File != TGSI_FILE_TEMPORARY || !reg || reg->ArrayID == 0 ||
            reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
                struct tgsi_declaration_range range;
                range.First = 0;
@@ -124,7 +138,30 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
                return range;
        }
 
-       return ctx->arrays[reg->ArrayID - 1];
+       return ctx->arrays[reg->ArrayID - 1].range;
+}
+
+static LLVMValueRef get_alloca_for_array(struct lp_build_tgsi_context *bld_base,
+                                        unsigned file,
+                                        unsigned index)
+{
+       unsigned i;
+       unsigned num_arrays;
+       struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
+
+       if (file != TGSI_FILE_TEMPORARY)
+               return NULL;
+
+       num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+       for (i = 0; i < num_arrays; i++) {
+               const struct tgsi_declaration_range *range =
+                                               &ctx->arrays[i].range;
+
+               if (index >= range->First && index <= range->Last) {
+                       return ctx->arrays[i].alloca;
+               }
+       }
+       return NULL;
 }
 
 static LLVMValueRef
@@ -134,6 +171,9 @@ emit_array_index(struct lp_build_tgsi_soa_context *bld,
 {
        struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
 
+       if (!reg) {
+               return lp_build_const_int32(gallivm, offset);
+       }
        LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, bld->addr[reg->Index][reg->Swizzle], "");
        return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), "");
 }
@@ -181,7 +221,7 @@ emit_array_fetch(struct lp_build_tgsi_context *bld_base,
                tmp_reg.Register.Index = i + range.First;
                LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle);
                result = LLVMBuildInsertElement(builder, result, temp,
-                       lp_build_const_int32(gallivm, i), "");
+                       lp_build_const_int32(gallivm, i), "array_vector");
        }
        return result;
 }
@@ -195,13 +235,35 @@ load_value_from_array(struct lp_build_tgsi_context *bld_base,
                      const struct tgsi_ind_register *reg_indirect)
 {
        struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
+       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       LLVMBuilderRef builder = gallivm->builder;
+       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
+       LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
+       LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
+       LLVMValueRef ptr, val, indices[2];
+
+       if (!array) {
+               /* Handle the case where the array is stored as a vector. */
+               return LLVMBuildExtractElement(builder,
+                               emit_array_fetch(bld_base, file, type, range, swizzle),
+                               index, "");
+       }
 
-       return LLVMBuildExtractElement(builder,
-                       emit_array_fetch(bld_base, file, type, range, swizzle),
-                       emit_array_index(bld, reg_indirect, reg_index - range.First), "");
+       index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
+       index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, swizzle), "");
+       indices[0] = bld_base->uint_bld.zero;
+       indices[1] = index;
+       ptr = LLVMBuildGEP(builder, array, indices, 2, "");
+       val = LLVMBuildLoad(builder, ptr, "");
+       if (tgsi_type_is_64bit(type)) {
+               LLVMValueRef ptr_hi, val_hi;
+               indices[0] = lp_build_const_int32(gallivm, 1);
+               ptr_hi = LLVMBuildGEP(builder, ptr, indices, 1, "");
+               val_hi = LLVMBuildLoad(builder, ptr_hi, "");
+               val = radeon_llvm_emit_fetch_64bit(bld_base, type, val, val_hi);
 
+       }
+       return val;
 }
 
 static LLVMValueRef
@@ -213,13 +275,26 @@ store_value_to_array(struct lp_build_tgsi_context *bld_base,
                     const struct tgsi_ind_register *reg_indirect)
 {
        struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
-
-       return LLVMBuildInsertElement(builder,
+       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       LLVMBuilderRef builder = gallivm->builder;
+       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
+       LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
+       LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
+
+       if (array) {
+               LLVMValueRef indices[2];
+               index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
+               index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, chan_index), "");
+               indices[0] = bld_base->uint_bld.zero;
+               indices[1] = index;
+               LLVMValueRef pointer = LLVMBuildGEP(builder, array, indices, 2, "");
+               LLVMBuildStore(builder, value, pointer);
+               return NULL;
+       } else {
+               return LLVMBuildInsertElement(builder,
                                emit_array_fetch(bld_base, file, TGSI_TYPE_FLOAT, range, chan_index),
-                               value,  emit_array_index(bld, reg_indirect, reg_index - range.First), "");
-       return NULL;
+                               value, index, "");
+       }
 }
 
 LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
@@ -243,8 +318,9 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
        }
 
        if (reg->Register.Indirect) {
-               return load_value_from_array(bld_base, reg->Register.File, type,
+               LLVMValueRef load = load_value_from_array(bld_base, reg->Register.File, type,
                                swizzle, reg->Register.Index, &reg->Indirect);
+               return bitcast(bld_base, type, load);
        }
 
        switch(reg->Register.File) {
@@ -283,6 +359,11 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
                                                 LLVMBuildLoad(builder, ptr, ""),
                                                 LLVMBuildLoad(builder, ptr2, ""));
                }
+               LLVMValueRef array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
+               if (array) {
+                       return bitcast(bld_base, type, load_value_from_array(bld_base, reg->Register.File, type,
+                                       swizzle, reg->Register.Index, NULL));
+               }
                result = LLVMBuildLoad(builder, ptr, "");
                break;
 
@@ -333,6 +414,7 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
                             const struct tgsi_full_declaration *decl)
 {
        struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
+       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
        unsigned first, last, i, idx;
        switch(decl->Declaration.File) {
        case TGSI_FILE_ADDRESS:
@@ -350,13 +432,36 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
        }
 
        case TGSI_FILE_TEMPORARY:
+       {
+               unsigned decl_size;
+               first = decl->Range.First;
+               last = decl->Range.Last;
+               decl_size = 4 * ((last - first) + 1);
                if (decl->Declaration.Array) {
+                       unsigned id = decl->Array.ArrayID - 1;
                        if (!ctx->arrays) {
                                int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
-                               ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * size);
+                               ctx->arrays = CALLOC(size, sizeof(ctx->arrays[0]));
+                       for (i = 0; i < size; ++i) {
+                               assert(!ctx->arrays[i].alloca);}
                        }
 
-                       ctx->arrays[decl->Array.ArrayID - 1] = decl->Range;
+                       ctx->arrays[id].range = decl->Range;
+
+                       /* If the array is more than 16 elements (each element
+                        * is 32-bits), then store it in a vector.  Storing the
+                        * array in a vector will causes the compiler to store
+                        * the array in registers and access it using indirect
+                        * addressing.  16 is number of vector elements that
+                        * LLVM will store in a register.
+                        * FIXME: We shouldn't need to do this.  LLVM should be
+                        * smart enough to promote allocas int registers when
+                        * profitable.
+                        */
+                       if (decl_size > 16) {
+                               ctx->arrays[id].alloca = LLVMBuildAlloca(builder,
+                                       LLVMArrayType(bld_base->base.vec_type, decl_size),"array");
+                       }
                }
                first = decl->Range.First;
                last = decl->Range.Last;
@@ -373,7 +478,7 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
                        }
                }
                break;
-
+       }
        case TGSI_FILE_INPUT:
        {
                unsigned idx;
@@ -482,11 +587,16 @@ void radeon_llvm_emit_store(struct lp_build_tgsi_context *bld_base,
 
                if (reg->Register.Indirect) {
                        struct tgsi_declaration_range range = get_array_range(bld_base,
-                               reg->Register.File, &reg->Indirect);
+                               reg->Register.File, reg->Register.Index, &reg->Indirect);
 
                        unsigned i, size = range.Last - range.First + 1;
-                       LLVMValueRef array = store_value_to_array(bld_base, value, reg->Register.File, chan_index,
-                                                                 reg->Register.Index, &reg->Indirect);
+                       unsigned file = reg->Register.File;
+                       unsigned reg_index = reg->Register.Index;
+                       LLVMValueRef array = store_value_to_array(bld_base, value, file, chan_index,
+                                                                 reg_index, &reg->Indirect);
+                       if (get_alloca_for_array(bld_base, file, reg_index)) {
+                               continue;
+                       }
                        for (i = 0; i < size; ++i) {
                                switch(reg->Register.File) {
                                case TGSI_FILE_OUTPUT:
@@ -500,7 +610,7 @@ void radeon_llvm_emit_store(struct lp_build_tgsi_context *bld_base,
                                        break;
 
                                default:
-                                       return;
+                                       continue;
                                }
                                value = LLVMBuildExtractElement(builder, array, 
                                        lp_build_const_int32(gallivm, i), "");
@@ -516,14 +626,23 @@ void radeon_llvm_emit_store(struct lp_build_tgsi_context *bld_base,
                                break;
 
                        case TGSI_FILE_TEMPORARY:
+                       {
+                               LLVMValueRef array;
                                if (reg->Register.Index >= ctx->temps_count)
                                        continue;
+                               array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
+
+                               if (array) {
+                                       store_value_to_array(bld_base, value, reg->Register.File, chan_index, reg->Register.Index,
+                                                               NULL);
+                                       continue;
+                               }
                                temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index];
                                if (tgsi_type_is_64bit(dtype))
                                        temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1];
 
                                break;
-
+                       }
                        default:
                                return;
                        }