From: Roland Scheidegger Date: Wed, 8 Apr 2015 22:49:11 +0000 (+0200) Subject: gallivm: don't use control flow when doing indirect constant buffer lookups X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=586536a4e1c34725b3b38c3425db569fac0c91e9;p=mesa.git gallivm: don't use control flow when doing indirect constant buffer lookups llvm goes crazy when doing that, using way more memory and time, though there's probably more to it - this points to a very much similar issue as fixed in 8a9f5ecdb116d0449d63f7b94efbfa8b205d826f. In any case I've seen a quite plain looking vertex shader with just ~50 simple tgsi instructions (but with a dozen or so such indirect constant buffer lookups) go from a terribly high ~440ms compile time (consuming 25MB of memory in the process) down to a still awful ~230ms and 13MB with this fix (with llvm 3.3), so there's still obvious improvements possible (but I have no clue why it's so slow...). The resulting shader is most likely also faster (certainly seemed so though I don't have any hard numbers as it may have been influenced by compile times) since generally fetching constants outside the buffer range is most likely an app error (that is we expect all indices to be valid). It is possible this fixes some mysterious vertex shader slowdowns we've seen ever since we are conforming to newer apis at least partially (the main draw loop also has similar looking conditionals which we probably could do without - if not for the fetch at least for the additional elts condition.) v2: use static vars for the fake bufs, minor code cleanups Reviewed-by: Jose Fonseca --- diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 0dfafdca51a..d17d6959b44 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -271,30 +271,38 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, static void llvm_middle_end_bind_parameters(struct draw_pt_middle_end *middle) { + static const float fake_const_buf[4]; struct llvm_middle_end *fpme = llvm_middle_end(middle); struct draw_context *draw = fpme->draw; + struct draw_llvm *llvm = fpme->llvm; unsigned i; - for (i = 0; i < Elements(fpme->llvm->jit_context.vs_constants); ++i) { + for (i = 0; i < Elements(llvm->jit_context.vs_constants); ++i) { int num_consts = draw->pt.user.vs_constants_size[i] / (sizeof(float) * 4); - fpme->llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i]; - fpme->llvm->jit_context.num_vs_constants[i] = num_consts; + llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i]; + llvm->jit_context.num_vs_constants[i] = num_consts; + if (num_consts == 0) { + llvm->jit_context.vs_constants[i] = fake_const_buf; + } } - for (i = 0; i < Elements(fpme->llvm->gs_jit_context.constants); ++i) { + for (i = 0; i < Elements(llvm->gs_jit_context.constants); ++i) { int num_consts = draw->pt.user.gs_constants_size[i] / (sizeof(float) * 4); - fpme->llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i]; - fpme->llvm->gs_jit_context.num_constants[i] = num_consts; + llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i]; + llvm->gs_jit_context.num_constants[i] = num_consts; + if (num_consts == 0) { + llvm->gs_jit_context.constants[i] = fake_const_buf; + } } - fpme->llvm->jit_context.planes = + llvm->jit_context.planes = (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0]; - fpme->llvm->gs_jit_context.planes = + llvm->gs_jit_context.planes = (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0]; - fpme->llvm->jit_context.viewports = draw->viewports; - fpme->llvm->gs_jit_context.viewports = draw->viewports; + llvm->jit_context.viewports = draw->viewports; + llvm->gs_jit_context.viewports = draw->viewports; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 17b68ff7c28..448c99d3547 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -944,20 +944,38 @@ gather_outputs(struct lp_build_tgsi_soa_context * bld) * with a little work. */ static LLVMValueRef -build_gather(struct lp_build_context *bld, +build_gather(struct lp_build_tgsi_context *bld_base, LLVMValueRef base_ptr, LLVMValueRef indexes, - LLVMValueRef *overflow_mask) + LLVMValueRef overflow_mask) { - LLVMBuilderRef builder = bld->gallivm->builder; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + struct lp_build_context *bld = &bld_base->base; LLVMValueRef res = bld->undef; unsigned i; - LLVMValueRef temp_ptr = NULL; + + /* + * overflow_mask is a vector telling us which channels + * in the vector overflowed. We use the overflow behavior for + * constant buffers which is defined as: + * Out of bounds access to constant buffer returns 0 in all + * components. Out of bounds behavior is always with respect + * to the size of the buffer bound at that slot. + */ if (overflow_mask) { - temp_ptr = lp_build_alloca( - bld->gallivm, - lp_build_vec_type(bld->gallivm, bld->type), ""); + /* + * We avoid per-element control flow here (also due to llvm going crazy, + * though I suspect it's better anyway since overflow is likely rare). + * Note that since we still fetch from buffers even if num_elements was + * zero (in this case we'll fetch from index zero) the jit func callers + * MUST provide valid fake constant buffers of size 4x32 (the values do + * not matter), otherwise we'd still need (not per element though) + * control flow. + */ + indexes = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes); } /* @@ -968,53 +986,16 @@ build_gather(struct lp_build_context *bld, LLVMValueRef index = LLVMBuildExtractElement(builder, indexes, ii, ""); LLVMValueRef scalar_ptr, scalar; - LLVMValueRef overflow; - struct lp_build_if_state if_ctx; - - /* - * overflow_mask is a boolean vector telling us which channels - * in the vector overflowed. We use the overflow behavior for - * constant buffers which is defined as: - * Out of bounds access to constant buffer returns 0 in all - * componenets. Out of bounds behavior is always with respect - * to the size of the buffer bound at that slot. - */ - if (overflow_mask) { - overflow = LLVMBuildExtractElement(builder, *overflow_mask, - ii, ""); - lp_build_if(&if_ctx, bld->gallivm, overflow); - { - LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, ""); - val = LLVMBuildInsertElement( - builder, val, - LLVMConstNull(LLVMFloatTypeInContext(bld->gallivm->context)), - ii, ""); - LLVMBuildStore(builder, val, temp_ptr); - } - lp_build_else(&if_ctx); - { - LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, ""); - - scalar_ptr = LLVMBuildGEP(builder, base_ptr, - &index, 1, "gather_ptr"); - scalar = LLVMBuildLoad(builder, scalar_ptr, ""); - - val = LLVMBuildInsertElement(builder, val, scalar, ii, ""); - LLVMBuildStore(builder, val, temp_ptr); - } - lp_build_endif(&if_ctx); - } else { - scalar_ptr = LLVMBuildGEP(builder, base_ptr, - &index, 1, "gather_ptr"); - scalar = LLVMBuildLoad(builder, scalar_ptr, ""); + scalar_ptr = LLVMBuildGEP(builder, base_ptr, + &index, 1, "gather_ptr"); + scalar = LLVMBuildLoad(builder, scalar_ptr, ""); - res = LLVMBuildInsertElement(builder, res, scalar, ii, ""); - } + res = LLVMBuildInsertElement(builder, res, scalar, ii, ""); } if (overflow_mask) { - res = LLVMBuildLoad(builder, temp_ptr, "gather_val"); + res = lp_build_select(bld, overflow_mask, bld->zero, res); } return res; @@ -1247,17 +1228,15 @@ emit_fetch_constant( num_consts = lp_build_broadcast_scalar(uint_bld, num_consts); /* Construct a boolean vector telling us which channels * overflow the bound constant buffer */ - overflow_mask = LLVMBuildICmp(builder, LLVMIntUGE, - indirect_index, - num_consts, ""); + overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL, + indirect_index, num_consts); /* index_vec = indirect_index * 4 + swizzle */ index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2); index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec); /* Gather values from the constant buffer */ - res = build_gather(&bld_base->base, consts_ptr, index_vec, - &overflow_mask); + res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask); } else { LLVMValueRef index; /* index into the const buffer */ @@ -1319,7 +1298,7 @@ emit_fetch_immediate( FALSE); /* Gather values from the immediate register array */ - res = build_gather(&bld_base->base, imms_array, index_vec, NULL); + res = build_gather(bld_base, imms_array, index_vec, NULL); } else { LLVMValueRef lindex = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle); @@ -1373,7 +1352,7 @@ emit_fetch_input( inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, ""); /* Gather values from the input register array */ - res = build_gather(&bld_base->base, inputs_array, index_vec, NULL); + res = build_gather(bld_base, inputs_array, index_vec, NULL); } else { if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) { LLVMValueRef lindex = lp_build_const_int32(gallivm, @@ -1495,7 +1474,7 @@ emit_fetch_temporary( temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, ""); /* Gather values from the temporary register array */ - res = build_gather(&bld_base->base, temps_array, index_vec, NULL); + res = build_gather(bld_base, temps_array, index_vec, NULL); } else { LLVMValueRef temp_ptr; diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 3b0056c49cb..96cc77c250c 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -999,6 +999,7 @@ lp_setup_is_resource_referenced( const struct lp_setup_context *setup, static boolean try_update_scene_state( struct lp_setup_context *setup ) { + static const float fake_const_buf[4]; boolean new_scene = (setup->fs.stored == NULL); struct lp_scene *scene = setup->scene; unsigned i; @@ -1103,14 +1104,15 @@ try_update_scene_state( struct lp_setup_context *setup ) setup->constants[i].stored_size = current_size; setup->constants[i].stored_data = stored; } + setup->fs.current.jit_context.constants[i] = + setup->constants[i].stored_data; } else { setup->constants[i].stored_size = 0; setup->constants[i].stored_data = NULL; + setup->fs.current.jit_context.constants[i] = fake_const_buf; } - setup->fs.current.jit_context.constants[i] = - setup->constants[i].stored_data; num_constants = setup->constants[i].stored_size / (sizeof(float) * 4); setup->fs.current.jit_context.num_constants[i] = num_constants;