From f2801f77750668fc94cb8b6369477615d30516c8 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Tue, 25 Jun 2019 08:09:58 -0700
Subject: [PATCH] panfrost: Rewrite u-interleaving code

Rather than using a magic lookup table with no explanation, let's add
liberal comments to the code to explain what this tiling scheme is and
how to encode/decode it efficiently. It's not so mysterious after all --
just reordering bits with some XORs thrown in.

v2: Correct copyright identifier. Fix spelling error. Switch space_4 to
a LUT. Fix comment typo. Use LUT instead of space_x tricks. Fall back on
the generic path rather than splitting up unaligned writes.

v3: Correct stride order (fixes crash on load). Correct coordinate
system mishap.

Signed-off-by: Alyssa Rosenzweig
Reviewed-by: Vasily Khoruzhick
Tested-by: Andreas Baierl
---
 src/panfrost/shared/pan_tiling.c | 290 ++++++++++++++++++++-----------
 1 file changed, 189 insertions(+), 101 deletions(-)

diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c
index 413cd89420b..08c9561b2ef 100644
--- a/src/panfrost/shared/pan_tiling.c
+++ b/src/panfrost/shared/pan_tiling.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2011-2013 Luc Verhaegen
  * Copyright (c) 2018 Alyssa Rosenzweig
  * Copyright (c) 2018 Vasily Khoruzhick
+ * Copyright (c) 2019 Collabora, Ltd.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,129 +25,212 @@
  *
  */
 
+#include <stdbool.h>
 #include "pan_tiling.h"
 
-uint32_t space_filler[16][16] = {
-   { 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85, },
-   { 3, 2, 7, 6, 19, 18, 23, 22, 67, 66, 71, 70, 83, 82, 87, 86, },
-   { 12, 13, 8, 9, 28, 29, 24, 25, 76, 77, 72, 73, 92, 93, 88, 89, },
-   { 15, 14, 11, 10, 31, 30, 27, 26, 79, 78, 75, 74, 95, 94, 91, 90, },
-   { 48, 49, 52, 53, 32, 33, 36, 37, 112, 113, 116, 117, 96, 97, 100, 101, },
-   { 51, 50, 55, 54, 35, 34, 39, 38, 115, 114, 119, 118, 99, 98, 103, 102, },
-   { 60, 61, 56, 57, 44, 45, 40, 41, 124, 125, 120, 121, 108, 109, 104, 105, },
-   { 63, 62, 59, 58, 47, 46, 43, 42, 127, 126, 123, 122, 111, 110, 107, 106, },
-   { 192, 193, 196, 197, 208, 209, 212, 213, 128, 129, 132, 133, 144, 145, 148, 149, },
-   { 195, 194, 199, 198, 211, 210, 215, 214, 131, 130, 135, 134, 147, 146, 151, 150, },
-   { 204, 205, 200, 201, 220, 221, 216, 217, 140, 141, 136, 137, 156, 157, 152, 153, },
-   { 207, 206, 203, 202, 223, 222, 219, 218, 143, 142, 139, 138, 159, 158, 155, 154, },
-   { 240, 241, 244, 245, 224, 225, 228, 229, 176, 177, 180, 181, 160, 161, 164, 165, },
-   { 243, 242, 247, 246, 227, 226, 231, 230, 179, 178, 183, 182, 163, 162, 167, 166, },
-   { 252, 253, 248, 249, 236, 237, 232, 233, 188, 189, 184, 185, 172, 173, 168, 169, },
-   { 255, 254, 251, 250, 239, 238, 235, 234, 191, 190, 187, 186, 175, 174, 171, 170, },
+/* This file implements software encode/decode of the tiling format used for
+ * textures and framebuffers primarily on Utgard GPUs. Names for this format
+ * include "Utgard-style tiling", "(Mali) swizzled textures", and
+ * "U-interleaved" (the former two names being used in the community
+ * Lima/Panfrost drivers; the latter name used internally at Arm).
+ * Conceptually, like any tiling scheme, the pixel reordering exploits 2D
+ * spatial locality, improving cache behaviour in both the horizontal and
+ * vertical directions.
+ *
+ * This format is tiled: first, the image dimensions must be aligned to 16
+ * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
+ * This size harmonizes with other properties of the GPU; on Midgard,
+ * framebuffer tiles are logically 16x16 (this is the tile size used in
+ * Transaction Elimination and the minimum tile size used in Hierarchical
+ * Tiling). Likewise, for a standard 4 bytes-per-pixel format (like
+ * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
+ * size.
+ *
+ * Within each 16x16 block, the bits are reordered according to this pattern:
+ *
+ * | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
+ *
+ * Basically, the X and Y bits are interleaved, with an XOR thrown in for
+ * every adjacent bit pair.
+ *
+ * Both encode and decode are cheap to implement, in hardware and in
+ * software. In hardware, the address lines are simply rerouted to reorder
+ * the bits, with some XOR gates thrown in. Software has to be a bit more
+ * clever.
+ *
+ * In software, the trick is to divide the pattern into two lines:
+ *
+ *   | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
+ * ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
+ *
+ * That is, duplicate the bits of the Y and space out the bits of the X. The
+ * top line is a function only of Y, so it can be calculated once per row
+ * and stored in a register. The bottom line is simply X with the bits
+ * spaced out. Spacing out the X is easy enough with a LUT, or by
+ * subtracting and ANDing with the mask pattern (abusing carry bits).
+ *
+ * This format is also supported on Midgard GPUs, where it *can* be used for
+ * textures and framebuffers. That said, in practice it is usually a
+ * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
+ * significantly more efficient than Utgard-style tiling and preferred for
+ * both textures and framebuffers, where possible. For texture types AFBC
+ * does not support, for instance sRGB textures and framebuffers, this
+ * tiling scheme is used instead, at a performance penalty.
+ */
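+
+/* As an illustrative sketch (not used by the driver), a direct bit-by-bit
+ * encoder for the pattern above could be written as follows, mapping
+ * in-tile coordinates to a pixel index in [0, 255]; the helper name is
+ * made up for illustration:
+ *
+ *    static unsigned
+ *    u_interleave_index(unsigned x, unsigned y)
+ *    {
+ *       unsigned index = 0;
+ *
+ *       for (unsigned b = 0; b < 4; ++b) {
+ *          unsigned xb = (x >> b) & 1;
+ *          unsigned yb = (y >> b) & 1;
+ *
+ *          // Even bit: the XOR pair. Odd bit: the plain Y bit.
+ *          index |= ((xb ^ yb) << (2 * b)) | (yb << (2 * b + 1));
+ *       }
+ *
+ *       return index;
+ *    }
+ *
+ * For example, u_interleave_index(1, 5) = 50, matching row 5, column 1 of
+ * the old space_filler table. The routines below compute the same function
+ * with two table lookups and an XOR.
+ */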
+
+/* Given the low 4 bits of the Y coordinate, we would like to duplicate
+ * every bit over. So instead of 0b1010, we would like 0b11001100. The idea
+ * is that the bits in the solely-Y places get a Y bit, and the bits in the
+ * XOR places *also* get a Y bit (to be XORed with an X bit below). */
+
+uint32_t bit_duplication[16] = {
+   0b00000000,
+   0b00000011,
+   0b00001100,
+   0b00001111,
+   0b00110000,
+   0b00110011,
+   0b00111100,
+   0b00111111,
+   0b11000000,
+   0b11000011,
+   0b11001100,
+   0b11001111,
+   0b11110000,
+   0b11110011,
+   0b11111100,
+   0b11111111,
+};
+
+/* Space out the bits of a 4-bit nibble */
+
+unsigned space_4[16] = {
+   0b0000000,
+   0b0000001,
+   0b0000100,
+   0b0000101,
+   0b0010000,
+   0b0010001,
+   0b0010100,
+   0b0010101,
+   0b1000000,
+   0b1000001,
+   0b1000100,
+   0b1000101,
+   0b1010000,
+   0b1010001,
+   0b1010100,
+   0b1010101
 };
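+
+/* A note on the "subtracting and ANDing" alternative mentioned above
+ * (illustrative only; this file uses the LUT): with the mask 0b1010101,
+ * the expression (v - mask) & mask steps a spaced value v from space_4[i]
+ * to space_4[i + 1], because the borrow ripples through the zeroed gap
+ * bits. For instance, a tile row could be walked without the LUT as:
+ *
+ *    unsigned mask = 0b1010101;
+ *    unsigned spaced_x = 0;
+ *
+ *    for (unsigned i = 0; i < 16; ++i) {
+ *       // At this point, spaced_x == space_4[i]
+ *       spaced_x = (spaced_x - mask) & mask;
+ *    }
+ */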
+
+/* The scheme uses 16x16 tiles */
+
+#define TILE_WIDTH 16
+#define TILE_HEIGHT 16
+#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
+
+/* An optimized routine to tile an aligned (x and width both multiples of
+ * 16) bpp4 texture */
+
 static void
 panfrost_store_tiled_image_bpp4(void *dst, const void *src,
                                 const struct pipe_box *box,
                                 uint32_t dst_stride,
                                 uint32_t src_stride)
 {
+   /* Precompute the offset to the beginning of the first horizontal tile
+    * we're writing to, knowing that box->x is 16-aligned. Tiles themselves
+    * are stored linearly, so we get the X tile number by shifting and then
+    * multiplying by the bytes per tile */
+
+   uint8_t *dest_start = dst + ((box->x >> 4) * PIXELS_PER_TILE * 4);
+
+   /* Iterate across the pixels we're trying to store in source order */
+
   for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
+      /* For each pixel in the destination image, figure out the part
+       * corresponding to the 16x16 block index */
+
       int block_y = y & ~0x0f;
-      int rem_y = y & 0x0F;
-      int block_start_s = block_y * dst_stride;
-      int source_start = src_y * src_stride;
-
-      for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) {
-         int block_x_s = (x >> 4) * 256;
-         int rem_x = x & 0x0F;
-
-         int index = space_filler[rem_y][rem_x];
-         const uint32_t *source = src + source_start + 4 * src_x;
-         uint32_t *dest = dst + block_start_s + 4 * (block_x_s + index);
-
-         *dest = *source;
-      }
+
+      /* In pixel coordinates (where the origin is the top-left), (0, block_y)
+       * is the top-left corner of the leftmost tile in this row. While pixels
+       * are reordered within a block, the blocks themselves are stored
+       * linearly, so multiplying block_y by the destination stride gives the
+       * byte offset of the top-left corner of the block this row is in */
+
+      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));
+
+      /* The source is linear, so compute the byte offsets of the start and
+       * end of this row in the source */
+
+      const uint32_t *source = src + (src_y * src_stride);
+      const uint32_t *source_end = source + box->width;
+
+      /* We want to duplicate the bits of the bottom nibble of Y */
+      unsigned expanded_y = bit_duplication[y & 0xF];
+
+      /* Iterate the row in source order. The outer loop walks the row one
+       * 16-pixel tile at a time; after each tile, dest advances by a whole
+       * tile's worth of pixels, since tiles are stored consecutively */
+
+      for (; source < source_end; dest += PIXELS_PER_TILE) {
+         /* Within each tile, we iterate each of the 16 pixels in the row of
+          * the tile. This loop should be unrolled. */
+
+         for (int i = 0; i < 16; ++i) {
+            /* We have the X component spaced out in space_4 and the Y
+             * component duplicated. So we just XOR them together. The X bits
+             * get the XOR like the pattern needs; the Y bits are XORed with
+             * zero, so this is a no-op for them */
+
+            unsigned index = expanded_y ^ space_4[i];
+
+            /* Copy over the pixel */
+            dest[index] = *(source++);
+         }
       }
    }
 }
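+
+/* A worked example of the addressing above (illustrative only): for pixel
+ * (x = 17, y = 5) in a bpp4 image, block_y = 0, so this row of tiles
+ * starts at byte 0; the pixel falls in X-tile 17 >> 4 = 1, at byte offset
+ * 1 * 256 * 4 = 1024; and within that tile,
+ *
+ *    index = bit_duplication[5] ^ space_4[1]
+ *          = 0b00110011 ^ 0b00000001
+ *          = 0b00110010 = 50,
+ *
+ * so the pixel lands 50 * 4 = 200 bytes into its tile, at byte 1224 of
+ * the image. */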
 
 static void
-panfrost_store_tiled_image_generic(void *dst, const void *src,
+panfrost_access_tiled_image_generic(void *dst, void *src,
                                    const struct pipe_box *box,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
-                                   uint32_t bpp)
+                                   uint32_t bpp,
+                                   bool is_store)
 {
    for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
       int block_y = y & ~0x0f;
-      int rem_y = y & 0x0F;
       int block_start_s = block_y * dst_stride;
       int source_start = src_y * src_stride;
 
+      unsigned expanded_y = bit_duplication[y & 0xF];
+
       for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) {
          int block_x_s = (x >> 4) * 256;
-         int rem_x = x & 0x0F;
+         unsigned index = expanded_y ^ space_4[x & 0xF];
 
-         int index = space_filler[rem_y][rem_x];
-         const uint8_t *src8 = src;
-         const uint8_t *source = &src8[source_start + bpp * src_x];
-         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);
+         uint8_t *src8 = src;
+         uint8_t *source = &src8[source_start + bpp * src_x];
+         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);
 
-         for (int b = 0; b < bpp; ++b)
-            dest[b] = source[b];
+         uint8_t *out = is_store ? dest : source;
+         uint8_t *in = is_store ? source : dest;
+
+         /* Write out 1-4 bytes. Written like this rather than as a loop so
+          * the compiler doesn't need to branch (just predicate) */
+
+         out[0] = in[0];
+         if (bpp > 1) {
+            out[1] = in[1];
+            if (bpp > 2) {
+               out[2] = in[2];
+               if (bpp > 3)
+                  out[3] = in[3];
+            }
+         }
       }
    }
 }
 
-static void
-panfrost_load_tiled_image_bpp4(void *dst, const void *src,
-                               const struct pipe_box *box,
-                               uint32_t dst_stride,
-                               uint32_t src_stride)
-{
-   for (int y = box->y, dest_y = 0; dest_y < box->height; ++y, ++dest_y) {
-      int block_y = y & ~0x0f;
-      int rem_y = y & 0x0F;
-      int block_start_s = block_y * src_stride;
-      int dest_start = dest_y * dst_stride;
-
-      for (int x = box->x, dest_x = 0; dest_x < box->width; ++x, ++dest_x) {
-         int block_x_s = (x >> 4) * 256;
-         int rem_x = x & 0x0F;
-
-         int index = space_filler[rem_y][rem_x];
-         uint32_t *dest = dst + dest_start + 4 * dest_x;
-         const uint32_t *source = src + block_start_s + 4 * (block_x_s + index);
-
-         *dest = *source;
-      }
-   }
-}
-
-static void
-panfrost_load_tiled_image_generic(void *dst, const void *src,
-                                  const struct pipe_box *box,
-                                  uint32_t dst_stride,
-                                  uint32_t src_stride,
-                                  uint32_t bpp)
-{
-   for (int y = box->y, dest_y = 0; dest_y < box->height; ++y, ++dest_y) {
-      int block_y = y & ~0x0f;
-      int rem_y = y & 0x0F;
-      int block_start_s = block_y * src_stride;
-      int dest_start = dest_y * dst_stride;
-
-      for (int x = box->x, dest_x = 0; dest_x < box->width; ++x, ++dest_x) {
-         int block_x_s = (x >> 4) * 256;
-         int rem_x = x & 0x0F;
-
-         int index = space_filler[rem_y][rem_x];
-         uint8_t *dst8 = dst;
-         uint8_t *dest = &dst8[dest_start + bpp * dest_x];
-         const uint8_t *source = src + block_start_s + bpp * (block_x_s + index);
-
-         for (int b = 0; b < bpp; ++b)
-            dest[b] = source[b];
-      }
-   }
-}
-
@@ -158,13 +242,23 @@ panfrost_store_tiled_image(void *dst, const void *src,
                            uint32_t src_stride,
                            uint32_t bpp)
 {
-   switch (bpp) {
-   case 4:
-      panfrost_store_tiled_image_bpp4(dst, src, box, dst_stride, src_stride);
-      break;
-   default:
-      panfrost_store_tiled_image_generic(dst, src, box, dst_stride, src_stride, bpp);
-   }
+   /* The optimized path is specifically for aligned writes */
+
+   if (box->x & 0xF || box->width & 0xF) {
+      panfrost_access_tiled_image_generic(dst, (void *) src, box,
+                                          dst_stride, src_stride, bpp, true);
+      return;
+   }
+
+   /* Otherwise, attempt to use an optimized path if we have one */
+
+   switch (bpp) {
+   case 4:
+      panfrost_store_tiled_image_bpp4(dst, (void *) src, box,
+                                      dst_stride, src_stride);
+      break;
+   default:
+      panfrost_access_tiled_image_generic(dst, (void *) src, box,
+                                          dst_stride, src_stride, bpp, true);
+      break;
+   }
 }
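+
+/* An illustrative call (not part of this patch): storing a 64x64 RGBA8888
+ * rectangle at the origin of a 256-pixel-wide tiled image, with both
+ * strides in bytes. `tiled` and `linear` are hypothetical buffers, and the
+ * box layout assumes gallium's struct pipe_box:
+ *
+ *    struct pipe_box box = { .x = 0, .y = 0, .width = 64, .height = 64 };
+ *
+ *    panfrost_store_tiled_image(tiled, linear, &box,
+ *                               256 * 4, 64 * 4, 4);
+ *
+ * Since box.x and box.width are 16-aligned and bpp is 4, this takes the
+ * optimized bpp4 path above. */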
@@ -174,11 +268,5 @@ panfrost_load_tiled_image(void *dst, const void *src,
                           uint32_t src_stride,
                           uint32_t bpp)
 {
-   switch (bpp) {
-   case 4:
-      panfrost_load_tiled_image_bpp4(dst, src, box, dst_stride, src_stride);
-      break;
-   default:
-      panfrost_load_tiled_image_generic(dst, src, box, dst_stride, src_stride, bpp);
-   }
+   /* The generic path handles loads as well; the tiled image is now the
+    * source, so the image pointers and strides are passed in swapped order */
+
+   panfrost_access_tiled_image_generic((void *) src, dst, box,
+                                       src_stride, dst_stride, bpp, false);
 }
-- 
2.30.2