* Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
* Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
* Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
+ * Copyright (c) 2019 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
*/
#include "pan_tiling.h"
+#include <stdbool.h>
+#include "util/macros.h"
-uint32_t space_filler[16][16] = {
- { 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85, },
- { 3, 2, 7, 6, 19, 18, 23, 22, 67, 66, 71, 70, 83, 82, 87, 86, },
- { 12, 13, 8, 9, 28, 29, 24, 25, 76, 77, 72, 73, 92, 93, 88, 89, },
- { 15, 14, 11, 10, 31, 30, 27, 26, 79, 78, 75, 74, 95, 94, 91, 90, },
- { 48, 49, 52, 53, 32, 33, 36, 37, 112, 113, 116, 117, 96, 97, 100, 101, },
- { 51, 50, 55, 54, 35, 34, 39, 38, 115, 114, 119, 118, 99, 98, 103, 102, },
- { 60, 61, 56, 57, 44, 45, 40, 41, 124, 125, 120, 121, 108, 109, 104, 105, },
- { 63, 62, 59, 58, 47, 46, 43, 42, 127, 126, 123, 122, 111, 110, 107, 106, },
- { 192, 193, 196, 197, 208, 209, 212, 213, 128, 129, 132, 133, 144, 145, 148, 149, },
- { 195, 194, 199, 198, 211, 210, 215, 214, 131, 130, 135, 134, 147, 146, 151, 150, },
- { 204, 205, 200, 201, 220, 221, 216, 217, 140, 141, 136, 137, 156, 157, 152, 153, },
- { 207, 206, 203, 202, 223, 222, 219, 218, 143, 142, 139, 138, 159, 158, 155, 154, },
- { 240, 241, 244, 245, 224, 225, 228, 229, 176, 177, 180, 181, 160, 161, 164, 165, },
- { 243, 242, 247, 246, 227, 226, 231, 230, 179, 178, 183, 182, 163, 162, 167, 166, },
- { 252, 253, 248, 249, 236, 237, 232, 233, 188, 189, 184, 185, 172, 173, 168, 169, },
- { 255, 254, 251, 250, 239, 238, 235, 234, 191, 190, 187, 186, 175, 174, 171, 170, },
+/* This file implements software encode/decode of the tiling format used for
+ * textures and framebuffers primarily on Utgard GPUs. Names for this format
+ * include "Utgard-style tiling", "(Mali) swizzled textures", and
+ * "U-interleaved" (the former two names being used in the community
+ * Lima/Panfrost drivers; the latter name used internally at Arm).
+ * Conceptually, like any tiling scheme, the pixel reordering aims for 2D
+ * spatial locality, to improve cache locality in both horizontal and vertical
+ * directions.
+ *
+ * This format is tiled: first, the image dimensions must be aligned to 16
+ * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
+ * This size harmonizes with other properties of the GPU; on Midgard,
+ * framebuffer tiles are logically 16x16 (this is the tile size used in
+ * Transaction Elimination and the minimum tile size used in Hierarchical
+ * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
+ * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
+ * size.
+ *
+ * Within each 16x16 block, the bits are reordered according to this pattern:
+ *
+ * | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
+ *
+ * Basically, interleaving the X and Y bits, with XORs thrown in for every
+ * adjacent bit pair.
+ *
+ * This pattern is cheap to encode and decode in both hardware and software.
+ * In hardware, lines are simply rerouted to reorder and some XOR gates are
+ * thrown in. Software has to be a bit more clever.
+ *
+ * In software, the trick is to divide the pattern into two lines:
+ *
+ * | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
+ * ^ | 0 | x3 | 0 | x2 | 0 | x1 | 0 | x0 |
+ *
+ * That is, duplicate the bits of the Y and space out the bits of the X. The
+ * top line is a function only of Y, so it can be calculated once per row and
+ * stored in a register. The bottom line is simply X with the bits spaced out.
+ * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
+ * mask pattern (abusing carry bits).
+ *
+ * This format is also supported on Midgard GPUs, where it *can* be used for
+ * textures and framebuffers. That said, in practice it is usually used as a
+ * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
+ * significantly more efficient than Utgard-style tiling and preferred for both
+ * textures and framebuffers, where possible. For unsupported texture types,
+ * for instance sRGB textures and framebuffers, this tiling scheme is used at a
+ * performance penalty, as AFBC is not compatible.
+ */
+
+/* Given the lower 4-bits of the Y coordinate, we would like to
+ * duplicate every bit over. So instead of 0b1010, we would like
+ * 0b11001100. The idea is that for the bits in the solely Y place, we
+ * get a Y place, and the bits in the XOR place *also* get a Y. */
+
+const uint32_t bit_duplication[16] = {
+ 0b00000000,
+ 0b00000011,
+ 0b00001100,
+ 0b00001111,
+ 0b00110000,
+ 0b00110011,
+ 0b00111100,
+ 0b00111111,
+ 0b11000000,
+ 0b11000011,
+ 0b11001100,
+ 0b11001111,
+ 0b11110000,
+ 0b11110011,
+ 0b11111100,
+ 0b11111111,
};
-static void
-panfrost_store_tiled_image_bpp4(void *dst, const void *src,
- const struct pipe_box *box,
- uint32_t dst_stride,
- uint32_t src_stride)
-{
- for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
- int block_y = y & ~0x0f;
- int rem_y = y & 0x0F;
- int block_start_s = block_y * dst_stride;
- int source_start = src_y * src_stride;
-
- for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) {
- int block_x_s = (x >> 4) * 256;
- int rem_x = x & 0x0F;
-
- int index = space_filler[rem_y][rem_x];
- const uint32_t *source = src + source_start + 4 * src_x;
- uint32_t *dest = dst + block_start_s + 4 * (block_x_s + index);
-
- *dest = *source;
- }
- }
+/* Space the bits out of a 4-bit nibble */
+
+const unsigned space_4[16] = {
+ 0b0000000,
+ 0b0000001,
+ 0b0000100,
+ 0b0000101,
+ 0b0010000,
+ 0b0010001,
+ 0b0010100,
+ 0b0010101,
+ 0b1000000,
+ 0b1000001,
+ 0b1000100,
+ 0b1000101,
+ 0b1010000,
+ 0b1010001,
+ 0b1010100,
+ 0b1010101
+};
+
+/* The scheme uses 16x16 tiles */
+
+#define TILE_WIDTH 16
+#define TILE_HEIGHT 16
+#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
+
+/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
+ * only needs to support copies and sizeof, so emulating with a packed
+ * structure works well enough, but if there's a native 128-bit type we may as
+ * well prefer that. */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t pan_uint128_t;
+#else
+typedef struct {
+ uint64_t lo;
+ uint64_t hi;
+} __attribute__((packed)) pan_uint128_t;
+#endif
+
+typedef struct {
+ uint16_t lo;
+ uint8_t hi;
+} __attribute__((packed)) pan_uint24_t;
+
+/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
+ *
+ * dest_start precomputes the offset to the beginning of the first horizontal
+ * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
+ * stored linearly, so we get the X tile number by shifting and then multiply
+ * by the bytes per tile.
+ *
+ * We iterate across the pixels we're trying to store in source-order. For each
+ * row in the destination image, we figure out which row of 16x16 block we're
+ * in, by slicing off the lower 4-bits (block_y).
+ *
+ * dest then precomputes the location of the top-left corner of the block the
+ * row starts in. In pixel coordinates (where the origin is the top-left),
+ * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
+ * pixels are reordered within a block, the blocks themselves are stored
+ * linearly, so multiplying block_y by the pixel stride of the destination
+ * image equals the byte offset of that top-left corner of the block this row
+ * is in.
+ *
+ * On the other hand, the source is linear so we compute the locations of the
+ * start and end of the row in the source by a simple linear addressing.
+ *
+ * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
+ * y0] value. Since this is constant across a row, we look it up per-row and
+ * store in expanded_y.
+ *
+ * Finally, we iterate each row in source order. In the outer loop, we iterate
+ * each 16 pixel tile. Within each tile, we iterate the 16 pixels (this should
+ * be unrolled), calculating the index within the tile and writing.
+ */
+
+#define TILED_ACCESS_TYPE(pixel_t, shift) \
+static ALWAYS_INLINE void \
+panfrost_access_tiled_image_##pixel_t \
+ (void *dst, void *src, \
+ uint16_t sx, uint16_t sy, \
+ uint16_t w, uint16_t h, \
+ uint32_t dst_stride, \
+ uint32_t src_stride, \
+ bool is_store) \
+{ \
+ uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
+ for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+ uint16_t block_y = y & ~0x0f; \
+ uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
+ pixel_t *source = src + (src_y * src_stride); \
+ pixel_t *source_end = source + w; \
+ unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
+ for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
+ for (uint8_t i = 0; i < 16; ++i) { \
+ unsigned index = expanded_y ^ (space_4[i] << shift); \
+ if (is_store) \
+ *((pixel_t *) (dest + index)) = *(source++); \
+ else \
+ *(source++) = *((pixel_t *) (dest + index)); \
+ } \
+ } \
+ } \
+} \
+
+TILED_ACCESS_TYPE(uint8_t, 0);
+TILED_ACCESS_TYPE(uint16_t, 1);
+TILED_ACCESS_TYPE(uint32_t, 2);
+TILED_ACCESS_TYPE(uint64_t, 3);
+TILED_ACCESS_TYPE(pan_uint128_t, 4);
+
+#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
+ const unsigned mask = (1 << tile_shift) - 1; \
+ for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+ unsigned block_y = y & ~mask; \
+ unsigned block_start_s = block_y * dst_stride; \
+ unsigned source_start = src_y * src_stride; \
+ unsigned expanded_y = bit_duplication[y & mask]; \
+ \
+ for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
+ unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
+ unsigned index = expanded_y ^ space_4[x & mask]; \
+ uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
+ uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
+ \
+ pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
+ pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
+ *outp = *inp; \
+ } \
+ } \
+}
+
+#define TILED_UNALIGNED_TYPES(store, shift) { \
+ if (bpp == 8) \
+ TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
+ else if (bpp == 16) \
+ TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
+ else if (bpp == 24) \
+ TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
+ else if (bpp == 32) \
+ TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
+ else if (bpp == 64) \
+ TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
+ else if (bpp == 128) \
+ TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}
static void
-panfrost_store_tiled_image_generic(void *dst, const void *src,
- const struct pipe_box *box,
+panfrost_access_tiled_image_generic(void *dst, void *src,
+ unsigned sx, unsigned sy,
+ unsigned w, unsigned h,
uint32_t dst_stride,
uint32_t src_stride,
- uint32_t bpp)
+ const struct util_format_description *desc,
+ bool _is_store)
{
- for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
- int block_y = y & ~0x0f;
- int rem_y = y & 0x0F;
- int block_start_s = block_y * dst_stride;
- int source_start = src_y * src_stride;
-
- for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) {
- int block_x_s = (x >> 4) * 256;
- int rem_x = x & 0x0F;
-
- int index = space_filler[rem_y][rem_x];
- const uint8_t *src8 = src;
- const uint8_t *source = &src8[source_start + bpp * src_x];
- uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);
-
- for (int b = 0; b < bpp; ++b)
- dest[b] = source[b];
- }
+ unsigned bpp = desc->block.bits;
+
+ if (desc->block.width > 1) {
+ w = DIV_ROUND_UP(w, desc->block.width);
+ h = DIV_ROUND_UP(h, desc->block.height);
+
+ if (_is_store)
+ TILED_UNALIGNED_TYPES(true, 2)
+ else
+ TILED_UNALIGNED_TYPES(false, 2)
+ } else {
+ if (_is_store)
+ TILED_UNALIGNED_TYPES(true, 4)
+ else
+ TILED_UNALIGNED_TYPES(false, 4)
}
}
-static void
-panfrost_load_tiled_image_bpp4(void *dst, const void *src,
- const struct pipe_box *box,
- uint32_t dst_stride,
- uint32_t src_stride)
+#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))
+
+static ALWAYS_INLINE void
+panfrost_access_tiled_image(void *dst, void *src,
+ unsigned x, unsigned y,
+ unsigned w, unsigned h,
+ uint32_t dst_stride,
+ uint32_t src_stride,
+ enum pipe_format format,
+ bool is_store)
{
- for (int y = box->y, dest_y = 0; dest_y < box->height; ++y, ++dest_y) {
- int block_y = y & ~0x0f;
- int rem_y = y & 0x0F;
- int block_start_s = block_y * src_stride;
- int dest_start = dest_y * dst_stride;
-
- for (int x = box->x, dest_x = 0; dest_x < box->width; ++x, ++dest_x) {
- int block_x_s = (x >> 4) * 256;
- int rem_x = x & 0x0F;
-
- int index = space_filler[rem_y][rem_x];
- uint32_t *dest = dst + dest_start + 4 * dest_x;
- const uint32_t *source = src + block_start_s + 4 * (block_x_s + index);
-
- *dest = *source;
- }
+ const struct util_format_description *desc = util_format_description(format);
+
+ if (desc->block.width > 1 || desc->block.bits == 24) {
+ panfrost_access_tiled_image_generic(dst, (void *) src,
+ x, y, w, h,
+ dst_stride, src_stride, desc, is_store);
+
+ return;
}
-}
-static void
-panfrost_load_tiled_image_generic(void *dst, const void *src,
- const struct pipe_box *box,
- uint32_t dst_stride,
- uint32_t src_stride,
- uint32_t bpp)
-{
- for (int y = box->y, dest_y = 0; dest_y < box->height; ++y, ++dest_y) {
- int block_y = y & ~0x0f;
- int rem_y = y & 0x0F;
- int block_start_s = block_y * src_stride;
- int dest_start = dest_y * dst_stride;
-
- for (int x = box->x, dest_x = 0; dest_x < box->width; ++x, ++dest_x) {
- int block_x_s = (x >> 4) * 256;
- int rem_x = x & 0x0F;
-
- int index = space_filler[rem_y][rem_x];
- uint8_t *dst8 = dst;
- uint8_t *dest = &dst8[dest_start + bpp * dest_x];
- const uint8_t *source = src + block_start_s + bpp * (block_x_s + index);
-
- for (int b = 0; b < bpp; ++b)
- dest[b] = source[b];
- }
+ unsigned bpp = desc->block.bits;
+ unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
+ unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
+ unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
+ unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;
+
+ /* First, tile the top portion */
+
+ unsigned orig_x = x, orig_y = y;
+
+ if (first_full_tile_y != y) {
+ unsigned dist = MIN2(first_full_tile_y - y, h);
+
+ panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+ x, y, w, dist,
+ dst_stride, src_stride, desc, is_store);
+
+ if (dist == h)
+ return;
+
+ y += dist;
+ h -= dist;
+ }
+
+ /* Next, the bottom portion */
+ if (last_full_tile_y != (y + h)) {
+ unsigned dist = (y + h) - last_full_tile_y;
+
+ panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
+ x, last_full_tile_y, w, dist,
+ dst_stride, src_stride, desc, is_store);
+
+ h -= dist;
+ }
+
+ /* The left portion */
+ if (first_full_tile_x != x) {
+ unsigned dist = MIN2(first_full_tile_x - x, w);
+
+ panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+ x, y, dist, h,
+ dst_stride, src_stride, desc, is_store);
+
+ if (dist == w)
+ return;
+
+ x += dist;
+ w -= dist;
+ }
+
+ /* Finally, the right portion */
+ if (last_full_tile_x != (x + w)) {
+ unsigned dist = (x + w) - last_full_tile_x;
+
+ panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
+ last_full_tile_x, y, dist, h,
+ dst_stride, src_stride, desc, is_store);
+
+ w -= dist;
}
+
+ if (bpp == 8)
+ panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
+ else if (bpp == 16)
+ panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
+ else if (bpp == 32)
+ panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
+ else if (bpp == 64)
+ panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
+ else if (bpp == 128)
+ panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
}
void
panfrost_store_tiled_image(void *dst, const void *src,
- const struct pipe_box *box,
+ unsigned x, unsigned y,
+ unsigned w, unsigned h,
uint32_t dst_stride,
uint32_t src_stride,
- uint32_t bpp)
+ enum pipe_format format)
{
- switch (bpp) {
- case 4:
- panfrost_store_tiled_image_bpp4(dst, src, box, dst_stride, src_stride);
- break;
- default:
- panfrost_store_tiled_image_generic(dst, src, box, dst_stride, src_stride, bpp);
- }
+ panfrost_access_tiled_image(dst, (void *) src,
+ x, y, w, h,
+ dst_stride, src_stride, format, true);
}
void
panfrost_load_tiled_image(void *dst, const void *src,
- const struct pipe_box *box,
+ unsigned x, unsigned y,
+ unsigned w, unsigned h,
uint32_t dst_stride,
uint32_t src_stride,
- uint32_t bpp)
+ enum pipe_format format)
{
- switch (bpp) {
- case 4:
- panfrost_load_tiled_image_bpp4(dst, src, box, dst_stride, src_stride);
- break;
- default:
- panfrost_load_tiled_image_generic(dst, src, box, dst_stride, src_stride, bpp);
- }
+ panfrost_access_tiled_image((void *) src, dst,
+ x, y, w, h,
+ src_stride, dst_stride, format, false);
}