From: Roland Scheidegger Date: Tue, 4 Jun 2013 22:17:22 +0000 (+0200) Subject: llvmpipe: reduce alignment requirement for 1d resources from 4x4 to 4x1 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ffe2a1ca3c097661dd3f6e3ca5cfd72be184426c;p=mesa.git llvmpipe: reduce alignment requirement for 1d resources from 4x4 to 4x1 For rendering to buffers, we cannot have any y alignment. So make sure that tile clear commands only clear up to the fb width/height, not more (do this for all resources actually as clearing more seems pointless for other resources too). For the jit fs function, skip execution of the lower half of the fragment shader for the 4x4 stamp completely; for depth/stencil only load/store the values from the first row (replace other row with undef). For the blend function, also only load half the values from fs output, replace the rest with undefs so that everything still operates on the full 4x4 block to keep code the same between 4x1 and 4x4 (except for load/store of course which also needs to skip (store) or replace these values with undefs (load)), at the cost of slightly less optimal code being produced in some cases. Also reduce 1d and 1d array alignment, because they can be handled the same as buffers so don't need to waste memory. v2: don't try to run special blend code for 4x1, (very) slightly less complexity if we just use the same code as for 4x4 which may or may not make it easier to optimize in the future (as we care a lot more about 4x4 performance than 1d). v2: don't use undef values for unused fs src outputs with llvm 3.1 as it apparently can trigger a bug in llvm. 
Reviewed-by: Jose Fonseca --- diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c index df6a6c41bbf..a8bd15f8751 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c @@ -525,6 +525,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm, * * \param type the data type of the fragment depth/stencil values * \param format_desc description of the depth/stencil surface + * \param is_1d whether this resource has only one dimension * \param loop_counter the current loop iteration * \param depth_ptr pointer to the depth/stencil values of this 4x4 block * \param depth_stride stride of the depth/stencil buffer @@ -535,6 +536,7 @@ void lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, struct lp_type z_src_type, const struct util_format_description *format_desc, + boolean is_1d, LLVMValueRef depth_ptr, LLVMValueRef depth_stride, LLVMValueRef *z_fb, @@ -592,9 +594,14 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, ""); - zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); - zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); - zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, ""); + if (is_1d) { + zs_dst2 = lp_build_undef(gallivm, zs_load_type); + } + else { + zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); + zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); + zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, ""); + } *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2, LLVMConstVector(shuffles, zs_type.length), ""); @@ -648,6 +655,7 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, * * \param type the data type of the fragment depth/stencil values * 
\param format_desc description of the depth/stencil surface + * \param is_1d whether this resource has only one dimension * \param mask the alive/dead pixel mask for the quad (vector) * \param z_fb z values read from fb (with padding) * \param s_fb s values read from fb (with padding) @@ -661,6 +669,7 @@ void lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, struct lp_type z_src_type, const struct util_format_description *format_desc, + boolean is_1d, struct lp_build_mask_context *mask, LLVMValueRef z_fb, LLVMValueRef s_fb, @@ -791,7 +800,9 @@ lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, } LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1); - LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2); + if (!is_1d) { + LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2); + } } /** diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h index 2534dc309ce..d169c896711 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h @@ -74,6 +74,7 @@ void lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, struct lp_type z_src_type, const struct util_format_description *format_desc, + boolean is_1d, LLVMValueRef depth_ptr, LLVMValueRef depth_stride, LLVMValueRef *z_fb, @@ -84,6 +85,7 @@ void lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, struct lp_type z_src_type, const struct util_format_description *format_desc, + boolean is_1d, struct lp_build_mask_context *mask, LLVMValueRef z_fb, LLVMValueRef s_fb, diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index be5a286e3da..981dd712126 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -95,10 +95,10 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, task->bin = bin; task->x = x * TILE_SIZE; task->y = y * TILE_SIZE; - task->width = TILE_SIZE + x * TILE_SIZE > 
task->scene->width_aligned ? - task->scene->width_aligned - x * TILE_SIZE : TILE_SIZE; - task->height = TILE_SIZE + y * TILE_SIZE > task->scene->height_aligned ? - task->scene->height_aligned - y * TILE_SIZE : TILE_SIZE; + task->width = TILE_SIZE + x * TILE_SIZE > task->scene->fb.width ? + task->scene->fb.width - x * TILE_SIZE : TILE_SIZE; + task->height = TILE_SIZE + y * TILE_SIZE > task->scene->fb.height ? + task->scene->fb.height - y * TILE_SIZE : TILE_SIZE; /* reset pointers to color and depth tile(s) */ memset(task->color_tiles, 0, sizeof(task->color_tiles)); diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c index 2dfc7ff9ce7..771ad085a12 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.c +++ b/src/gallium/drivers/llvmpipe/lp_scene.c @@ -505,8 +505,6 @@ void lp_scene_begin_binning( struct lp_scene *scene, scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE; scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE; - scene->width_aligned = align(fb->width, LP_RASTER_BLOCK_SIZE); - scene->height_aligned = align(fb->height, LP_RASTER_BLOCK_SIZE); assert(scene->tiles_x <= TILES_X); assert(scene->tiles_y <= TILES_Y); diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h index bc6c448bc7f..fa5bbcaf013 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.h +++ b/src/gallium/drivers/llvmpipe/lp_scene.h @@ -144,10 +144,6 @@ struct lp_scene { /** list of resources referenced by the scene commands */ struct resource_ref *resources; - /** aligned scene width, height */ - unsigned width_aligned; - unsigned height_aligned; - /** Total memory used by the scene (in bytes). This sums all the * data blocks and counts all bins, state, resource references and * other random allocations within the scene. 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index a7bd836918e..260d93ce98e 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -353,7 +353,7 @@ generate_fs_loop(struct gallivm_state *gallivm, if (depth_mode & EARLY_DEPTH_TEST) { lp_build_depth_stencil_load_swizzled(gallivm, type, - zs_format_desc, + zs_format_desc, key->resource_1d, depth_ptr, depth_stride, &z_fb, &s_fb, loop_state.counter); lp_build_depth_stencil_test(gallivm, @@ -369,7 +369,8 @@ generate_fs_loop(struct gallivm_state *gallivm, !simple_shader); if (depth_mode & EARLY_DEPTH_WRITE) { - lp_build_depth_stencil_write_swizzled(gallivm, type, zs_format_desc, + lp_build_depth_stencil_write_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, NULL, NULL, NULL, loop_state.counter, depth_ptr, depth_stride, z_value, s_value); @@ -424,7 +425,7 @@ generate_fs_loop(struct gallivm_state *gallivm, } lp_build_depth_stencil_load_swizzled(gallivm, type, - zs_format_desc, + zs_format_desc, key->resource_1d, depth_ptr, depth_stride, &z_fb, &s_fb, loop_state.counter); @@ -441,7 +442,8 @@ generate_fs_loop(struct gallivm_state *gallivm, !simple_shader); /* Late Z write */ if (depth_mode & LATE_DEPTH_WRITE) { - lp_build_depth_stencil_write_swizzled(gallivm, type, zs_format_desc, + lp_build_depth_stencil_write_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, NULL, NULL, NULL, loop_state.counter, depth_ptr, depth_stride, z_value, s_value); @@ -454,7 +456,8 @@ generate_fs_loop(struct gallivm_state *gallivm, * depth value, update from zs_value with the new mask value and * write that out. 
*/ - lp_build_depth_stencil_write_swizzled(gallivm, type, zs_format_desc, + lp_build_depth_stencil_write_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, &mask, z_fb, s_fb, loop_state.counter, depth_ptr, depth_stride, z_value, s_value); @@ -508,6 +511,7 @@ generate_fs_loop(struct gallivm_state *gallivm, * * @param type fragment shader type (4x or 8x float) * @param num_fs number of fs_src + * @param is_1d whether we're outputting to a 1d resource * @param dst_channels number of output channels * @param fs_src output from fragment shader * @param dst pointer to store result @@ -1345,6 +1349,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, LLVMValueRef blend_alpha; LLVMValueRef i32_zero; LLVMValueRef check_mask; + LLVMValueRef undef_src_val; struct lp_build_mask_context mask_ctx; struct lp_type mask_type; @@ -1369,9 +1374,16 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable && util_blend_state_is_dual(&variant->key.blend, 0); + const boolean is_1d = variant->key.resource_1d; + unsigned num_fullblock_fs = is_1d ? 
2 * num_fs : num_fs; + mask_type = lp_int32_vec4_type(); mask_type.length = fs_type.length; + for (i = num_fs; i < num_fullblock_fs; i++) { + fs_mask[i] = lp_build_zero(gallivm, mask_type); + } + /* Compute the alignment of the destination pointer in bytes */ #if 0 dst_alignment = (block_width * out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8); @@ -1388,7 +1400,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, if (do_branch) { check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type)); - for (i = 0; i < num_fs; ++i) { + for (i = 0; i < num_fullblock_fs; ++i) { check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], ""); } @@ -1399,6 +1411,17 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, partial_mask |= !variant->opaque; i32_zero = lp_build_const_int32(gallivm, 0); +#if HAVE_LLVM < 0x0302 + /* + * undef triggers a crash in LLVMBuildTrunc in convert_from_blend_type in some + * cases (seen with r10g10b10a2, 128bit wide vectors) (only used for 1d case). 
+ */ + undef_src_val = lp_build_zero(gallivm, fs_type); +#else + undef_src_val = lp_build_undef(gallivm, fs_type); +#endif + + /* Get type from output format */ lp_blend_type_from_format_desc(out_format_desc, &row_type); lp_mem_type_from_format_desc(out_format_desc, &dst_type); @@ -1459,14 +1482,25 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, /* * Load shader output */ - for (i = 0; i < num_fs; ++i) { + for (i = 0; i < num_fullblock_fs; ++i) { /* Always load alpha for use in blending */ - LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], ""); + LLVMValueRef alpha; + if (i < num_fs) { + alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], ""); + } + else { + alpha = undef_src_val; + } /* Load each channel */ for (j = 0; j < dst_channels; ++j) { assert(swizzle[j] < 4); - fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], ""); + if (i < num_fs) { + fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], ""); + } + else { + fs_src[i][j] = undef_src_val; + } } /* If 3 channels then pad to include alpha for 4 element transpose */ @@ -1492,12 +1526,23 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, } if (dual_source_blend) { /* same as above except different src/dst, skip masks and comments... 
*/ - for (i = 0; i < num_fs; ++i) { - LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], ""); + for (i = 0; i < num_fullblock_fs; ++i) { + LLVMValueRef alpha; + if (i < num_fs) { + alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], ""); + } + else { + alpha = undef_src_val; + } for (j = 0; j < dst_channels; ++j) { assert(swizzle[j] < 4); - fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], ""); + if (i < num_fs) { + fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], ""); + } + else { + fs_src1[i][j] = undef_src_val; + } } if (dst_channels == 3 && !has_alpha) { fs_src1[i][3] = alpha; @@ -1518,7 +1563,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, */ fs_type.floating = 0; fs_type.sign = dst_type.sign; - for (i = 0; i < num_fs; ++i) { + for (i = 0; i < num_fullblock_fs; ++i) { for (j = 0; j < dst_channels; ++j) { fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j], lp_build_vec_type(gallivm, fs_type), ""); @@ -1533,16 +1578,16 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, /* * Pixel twiddle from fragment shader order to memory order */ - src_count = generate_fs_twiddle(gallivm, fs_type, num_fs, + src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, fs_src, src, pad_inline); if (dual_source_blend) { - generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, + generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, fs_src1, src1, pad_inline); } src_channels = dst_channels < 3 ? 
dst_channels : 4; - if (src_count != num_fs * src_channels) { - unsigned ds = src_count / (num_fs * src_channels); + if (src_count != num_fullblock_fs * src_channels) { + unsigned ds = src_count / (num_fullblock_fs * src_channels); row_type.length /= ds; fs_type.length = row_type.length; } @@ -1685,8 +1730,18 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, dst_type.length = block_width; } - load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, - dst, dst_type, dst_count, dst_alignment); + if (is_1d) { + load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, + dst, dst_type, dst_count / 4, dst_alignment); + for (i = dst_count / 4; i < dst_count; i++) { + dst[i] = lp_build_undef(gallivm, dst_type); + } + + } + else { + load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, + dst, dst_type, dst_count, dst_alignment); + } /* @@ -1761,8 +1816,14 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, /* * Store blend result to memory */ - store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, - dst, dst_type, dst_count, dst_alignment); + if (is_1d) { + store_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, + dst, dst_type, dst_count / 4, dst_alignment); + } + else { + store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, + dst, dst_type, dst_count, dst_alignment); + } if (do_branch) { lp_build_mask_end(&mask_ctx); @@ -1855,7 +1916,6 @@ generate_fragment(struct llvmpipe_context *lp, fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ fs_type.width = 32; /* 32-bit float */ fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */ - num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ memset(&blend_type, 0, sizeof blend_type); blend_type.floating = FALSE; /* values are integers */ @@ -1944,6 +2004,11 @@ generate_fragment(struct llvmpipe_context *lp, /* code generated texture sampling */ 
sampler = lp_llvm_sampler_soa_create(key->state, context_ptr); + num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ + /* for 1d resources only run "upper half" of stamp */ + if (key->resource_1d) + num_fs /= 2; + { LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs); LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type); @@ -2533,6 +2598,9 @@ make_variant_key(struct llvmpipe_context *lp, key->zsbuf_format = zsbuf_format; memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil); } + if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) { + key->resource_1d = TRUE; + } } /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */ @@ -2570,6 +2638,15 @@ make_variant_key(struct llvmpipe_context *lp, key->cbuf_format[i] = format; + /* + * Figure out if this is a 1d resource. Note that OpenGL allows crazy + * mixing of 2d textures with height 1 and 1d textures, so make sure + * we pick 1d if any cbuf or zsbuf is 1d. 
+ */ + if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[0]->texture)) { + key->resource_1d = TRUE; + } + format_desc = util_format_description(format); assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index c8dc1c33cfe..33140901c18 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -75,6 +75,7 @@ struct lp_fragment_shader_variant_key unsigned nr_sampler_views:8; /* actually derivable from just the shader */ unsigned flatshade:1; unsigned occlusion_count:1; + unsigned resource_1d:1; enum pipe_format zsbuf_format; enum pipe_format cbuf_format[PIPE_MAX_COLOR_BUFS]; diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index 22f952c8924..f1a1ed0960a 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -83,22 +83,30 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen, /* Row stride and image stride */ { - unsigned alignment, nblocksx, nblocksy, block_size; + unsigned align_x, align_y, nblocksx, nblocksy, block_size; /* For non-compressed formats we need 4x4 pixel alignment - * (for now). We also want cache line size in x direction, + * so we can read/write LP_RASTER_BLOCK_SIZE when rendering to them. + * We also want cache line size in x direction, * otherwise same cache line could end up in multiple threads. - * XXX this blows up 1d/1d array textures by a factor of 4. + * For explicit 1d resources however we reduce this to 4x1 and + * handle specially in render output code (as we need to do special + * handling there for buffers in any case). 
*/ if (util_format_is_compressed(pt->format)) - alignment = 1; - else - alignment = LP_RASTER_BLOCK_SIZE; + align_x = align_y = 1; + else { + align_x = LP_RASTER_BLOCK_SIZE; + if (llvmpipe_resource_is_1d(&lpr->base)) + align_y = 1; + else + align_y = LP_RASTER_BLOCK_SIZE; + } nblocksx = util_format_get_nblocksx(pt->format, - align(width, alignment)); + align(width, align_x)); nblocksy = util_format_get_nblocksy(pt->format, - align(height, alignment)); + align(height, align_y)); block_size = util_format_get_blocksize(pt->format); if (util_format_is_compressed(pt->format)) diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h index faba6f21025..e73d44946d2 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.h +++ b/src/gallium/drivers/llvmpipe/lp_texture.h @@ -159,6 +159,27 @@ llvmpipe_resource_is_texture(const struct pipe_resource *resource) } +static INLINE boolean +llvmpipe_resource_is_1d(const struct pipe_resource *resource) +{ + switch (resource->target) { + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return TRUE; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_3D: + case PIPE_TEXTURE_CUBE: + return FALSE; + default: + assert(0); + return FALSE; + } +} + + static INLINE unsigned llvmpipe_resource_stride(struct pipe_resource *resource, unsigned level)