From: Eric Anholt Date: Wed, 4 Jan 2017 22:08:10 +0000 (-0800) Subject: vc4: Handle partial loads/stores of tiled textures. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=25bee5ef9ea923bf0d99f5f8eb19082c449f3e53;p=mesa.git vc4: Handle partial loads/stores of tiled textures. Previously, we would load out the tile-aligned area, update the raster copy, and store it back. This was a huge cost for XPutImage calls to the screen under glamor. Instead, implement a general load/store path that walks over the source x/y writing into the corresponding pixel of the destination (using clever math from https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/). If things are aligned, we go through the previous utile-at-a-time loop. Improves x11perf -putimage10 performance by 139.777% +/- 2.83464% (n=5) Improves x11perf -putimage100 performance by 383.908% +/- 22.6297% (n=11) Improves x11perf -getimage10 performance by 2.75731% +/- 0.585054% (n=145) --- diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 381a618be2d..f2adb290614 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -181,9 +181,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx, struct vc4_resource_slice *slice = &rsc->slices[level]; if (rsc->tiled) { - uint32_t utile_w = vc4_utile_width(rsc->cpp); - uint32_t utile_h = vc4_utile_height(rsc->cpp); - /* No direct mappings of tiled, since we need to manually * tile/untile. */ @@ -204,49 +201,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx, ptrans->box.height = (ptrans->box.height + 3) >> 2; } - /* We need to align the box to utile boundaries, since that's - * what load/store operates on. This may cause us to need to - * read out the original contents in that border area. Right - * now we just read out the entire contents, including the - * middle area that will just get overwritten. - */ - uint32_t box_start_x = ptrans->box.x & (utile_w - 1); - uint32_t box_start_y = ptrans->box.y & (utile_h - 1); - bool needs_load = (usage & PIPE_TRANSFER_READ) != 0; - - if (box_start_x) { - ptrans->box.width += box_start_x; - ptrans->box.x -= box_start_x; - needs_load = true; - } - if (box_start_y) { - ptrans->box.height += box_start_y; - ptrans->box.y -= box_start_y; - needs_load = true; - } - if (ptrans->box.width & (utile_w - 1)) { - /* We only need to force a load if our border region - * we're extending into is actually part of the - * texture. - */ - uint32_t slice_width = u_minify(prsc->width0, level); - if (ptrans->box.x + ptrans->box.width != slice_width) - needs_load = true; - ptrans->box.width = align(ptrans->box.width, utile_w); - } - if (ptrans->box.height & (utile_h - 1)) { - uint32_t slice_height = u_minify(prsc->height0, level); - if (ptrans->box.y + ptrans->box.height != slice_height) - needs_load = true; - ptrans->box.height = align(ptrans->box.height, utile_h); - } - ptrans->stride = ptrans->box.width * rsc->cpp; ptrans->layer_stride = ptrans->stride * ptrans->box.height; trans->map = malloc(ptrans->layer_stride * ptrans->box.depth); - if (needs_load) { + if (usage & PIPE_TRANSFER_READ) { vc4_load_tiled_image(trans->map, ptrans->stride, buf + slice->offset + ptrans->box.z * rsc->cube_map_stride, @@ -254,9 +214,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx, slice->tiling, rsc->cpp, &ptrans->box); } - return (trans->map + - box_start_x * rsc->cpp + - box_start_y * ptrans->stride); + return trans->map; } else { ptrans->stride = slice->stride; ptrans->layer_stride = ptrans->stride; diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c index 07e1c9c5f67..2da520eb4db 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.c +++ b/src/gallium/drivers/vc4/vc4_tiling.c @@ -63,15 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * vc4_utile_height(cpp)); } -static void -check_box_utile_alignment(const struct pipe_box *box, int cpp) -{ - assert(!(box->x & (vc4_utile_width(cpp) - 1))); - assert(!(box->y & (vc4_utile_height(cpp) - 1))); - assert(!(box->width & (vc4_utile_width(cpp) - 1))); - assert(!(box->height & (vc4_utile_height(cpp) - 1))); -} - /** * Takes a utile x and y (and the number of utiles of width of the image) and * returns the offset to the utile within a VC4_TILING_FORMAT_TF image. @@ -216,8 +207,6 @@ vc4_load_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box) { - check_box_utile_alignment(box, cpp); - if (tiling_format == VC4_TILING_FORMAT_LT) { vc4_load_lt_image(dst, dst_stride, src, src_stride, @@ -240,8 +229,6 @@ vc4_store_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box) { - check_box_utile_alignment(box, cpp); - if (tiling_format == VC4_TILING_FORMAT_LT) { vc4_store_lt_image(dst, dst_stride, src, src_stride, diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c index 8c875e7bd3a..ec42a3dc2f7 100644 --- a/src/gallium/drivers/vc4/vc4_tiling_lt.c +++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -41,6 +41,12 @@ #define NEON_TAG(x) x ## _base #endif +static inline uint32_t +align_down(uint32_t val, uint32_t align) +{ + return val & ~(align - 1); +} + /** Returns the stride in bytes of a 64-byte microtile. */ static uint32_t vc4_utile_stride(int cpp) @@ -252,6 +258,66 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) #endif } +/** + * Returns the X value into the address bits for LT tiling. + * + * The LT tile load/stores rely on the X bits not intersecting with the Y + * bits. Because of this, we have to choose to put the utile index within the + * LT tile into one of the two values, and we do so in swizzle_lt_x() to make + * NPOT handling easier. + */ +static uint32_t +swizzle_lt_x(int x, int cpp) +{ + switch (cpp) { + case 1: + /* 8x8 inside of 4x4 */ + return ((x & 0x7) << (0 - 0) | + (x & ~0x7) << (6 - 3)); + case 2: + /* 8x4 inside of 4x4 */ + return ((x & 0x7) << (1 - 0) | + (x & ~0x7) << (6 - 3)); + case 4: + /* 4x4 inside of 4x4 */ + return ((x & 0x3) << (2 - 0) | + (x & ~0x3) << (6 - 2)); + case 8: + /* 2x4 inside of 4x4 */ + return ((x & 0x1) << (3 - 0) | + (x & ~0x1) << (6 - 1)); + default: + unreachable("bad cpp"); + } +} + +/** + * Returns the Y value into the address bits for LT tiling. + * + * The LT tile load/stores rely on the X bits not intersecting with the Y + * bits. + */ +static uint32_t +swizzle_lt_y(int y, int cpp) +{ + + switch (cpp) { + case 1: + /* 8x8 inside of 4x4 */ + return ((y & 0x7) << 3); + case 2: + /* 8x4 inside of 4x4 */ + return ((y & 0x3) << 4); + case 4: + /* 4x4 inside of 4x4 */ + return ((y & 0x3) << 4); + case 8: + /* 2x4 inside of 4x4 */ + return ((y & 0x3) << 4); + default: + unreachable("bad cpp"); + } +} /** * Helper for loading or storing to an LT image, where the box is aligned @@ -261,9 +327,9 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) * vc4_load_utile/vc4_store_utile helpers. */ static inline void -vc4_lt_image_helper(void *gpu, uint32_t gpu_stride, - void *cpu, uint32_t cpu_stride, - int cpp, const struct pipe_box *box, bool to_cpu) +vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) { uint32_t utile_w = vc4_utile_width(cpp); uint32_t utile_h = vc4_utile_height(cpp); @@ -289,6 +355,90 @@ vc4_lt_image_helper(void *gpu, uint32_t gpu_stride, } } +/** + * Helper for loading or storing to an LT image, where the box is not aligned + * to utiles. + * + * This walks through the raster-order data, copying to/from the corresponding + * tiled pixel. This means we don't get write-combining on stores, but the + * loop is very few CPU instructions since the memcpy will be inlined. + */ +static inline void +vc4_lt_image_unaligned(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + + /* These are the address bits for the start of the box, split out into + * x/y so that they can be incremented separately in their loops. + */ + uint32_t offs_x0 = swizzle_lt_x(box->x, cpp); + uint32_t offs_y = swizzle_lt_y(box->y, cpp); + /* The *_mask values are "what bits of the address are from x or y" */ + uint32_t x_mask = swizzle_lt_x(~0, cpp); + uint32_t y_mask = swizzle_lt_y(~0, cpp); + uint32_t incr_y = swizzle_lt_x(gpu_stride / cpp, cpp); + + assert(!(x_mask & y_mask)); + + offs_x0 += incr_y * (box->y / vc4_utile_height(cpp)); + + for (uint32_t y = 0; y < box->height; y++) { + void *gpu_row = gpu + offs_y; + + uint32_t offs_x = offs_x0; + + for (uint32_t x = 0; x < box->width; x++) { + /* Use a memcpy here to move a pixel's worth of data. + * We're relying on this function to be inlined, so + * this will get expanded into the appropriate 1, 2, + * or 4-byte move. + */ + if (to_cpu) { + memcpy(cpu + x * cpp, gpu_row + offs_x, cpp); + } else { + memcpy(gpu_row + offs_x, cpu + x * cpp, cpp); + } + + /* This math trick with x_mask increments offs_x by 1 + * in x. + */ + offs_x = (offs_x - x_mask) & x_mask; + } + + offs_y = (offs_y - y_mask) & y_mask; + /* When offs_y wraps (we hit the end of the utile), we + * increment offs_x0 by effectively the utile stride. + */ + if (!offs_y) + offs_x0 += incr_y; + + cpu += cpu_stride; + } +} + +/** + * General LT image load/store helper. + */ +static inline void +vc4_lt_image_helper(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + if (box->x & (vc4_utile_width(cpp) - 1) || + box->y & (vc4_utile_height(cpp) - 1) || + box->width & (vc4_utile_width(cpp) - 1) || + box->height & (vc4_utile_height(cpp) - 1)) { + vc4_lt_image_unaligned(gpu, gpu_stride, + cpu, cpu_stride, + cpp, box, to_cpu); + } else { + vc4_lt_image_aligned(gpu, gpu_stride, + cpu, cpu_stride, + cpp, box, to_cpu); + } +} + static inline void vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride, void *cpu, uint32_t cpu_stride,