From e64b1169d37599a9ee1c5877aa457a41c5a8d726 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Wed, 4 Jan 2017 12:40:37 -0800
Subject: [PATCH] vc4: Simplify the load/store utile functions.

They now have less of a dependency on the cpp, and don't have to do a
divide. Hacking up mesa-demos teximage to do only one subtest and not
draw points, I saw 1024x1024 glTexSubImage2D() improve by 4.86939% +/-
1.40408% (n=30) and glGetTexImage() by 2.18978% +/- 0.140268% (n=5).
---
 src/gallium/drivers/vc4/vc4_tiling.c | 32 +++++++++++++++++++---------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c
index 4bcb85b16f5..390ebe555c2 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.c
+++ b/src/gallium/drivers/vc4/vc4_tiling.c
@@ -87,6 +87,22 @@ vc4_utile_height(int cpp)
         }
 }
 
+/** Returns the stride in bytes of a 64-byte microtile. */
+static uint32_t
+vc4_utile_stride(int cpp)
+{
+        switch (cpp) {
+        case 1:
+                return 8;
+        case 2:
+        case 4:
+        case 8:
+                return 16;
+        default:
+                unreachable("bad cpp");
+        }
+}
+
 /**
  * The texture unit decides what tiling format a particular miplevel is using
  * this function, so we lay out our miptrees accordingly.
@@ -101,25 +117,21 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp)
 void
 vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp)
 {
-        uint32_t utile_h = vc4_utile_height(cpp);
-        uint32_t row_size = 64 / utile_h;
+        uint32_t src_stride = vc4_utile_stride(cpp);
 
-        for (int y = 0; y < utile_h; y++) {
-                memcpy(dst, src, row_size);
+        for (uint32_t src_offset = 0; src_offset < 64; src_offset += src_stride) {
+                memcpy(dst, src + src_offset, src_stride);
                 dst += dst_stride;
-                src += row_size;
         }
 }
 
 void
 vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp)
 {
-        uint32_t utile_h = vc4_utile_height(cpp);
-        uint32_t row_size = 64 / utile_h;
+        uint32_t dst_stride = vc4_utile_stride(cpp);
 
-        for (int y = 0; y < utile_h; y++) {
-                memcpy(dst, src, row_size);
-                dst += row_size;
+        for (uint32_t dst_offset = 0; dst_offset < 64; dst_offset += dst_stride) {
+                memcpy(dst + dst_offset, src, dst_stride);
                 src += src_stride;
         }
 }
-- 
2.30.2