panfrost: Extend the tiled store fast-path to loads
[mesa.git] / src/panfrost/shared/pan_tiling.c
/*
 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/macros.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering attempts to
 * preserve 2D spatial locality, improving cache behaviour in both the
 * horizontal and vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Additionally, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 block, the bits are reordered according to this pattern:
 *
 * | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Basically, the X and Y bits are interleaved, with an XOR thrown in for
 * every adjacent bit pair.
 *
 * Both encode and decode are cheap to implement, in hardware and in software.
 * In hardware, the address lines are simply rerouted to perform the
 * reordering, with a few XOR gates thrown in. Software has to be a bit more
 * clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *      | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 *    ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
 * mask pattern (abusing carry bits; a sketch of this trick follows the lookup
 * tables below).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it usually serves as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression (AFBC),
 * which is significantly more efficient than Utgard-style tiling and
 * preferred for both textures and framebuffers, where possible. For textures
 * and framebuffers AFBC cannot handle, for instance sRGB formats, this tiling
 * scheme is used instead, at a performance penalty.
 */

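/* To make the pattern concrete: the index of a pixel within a 16x16 tile can
 * be computed directly from its (x, y) coordinates inside the tile. The
 * helpers below are an illustrative sketch only; they are not used by the
 * driver (the pan_example_* names are made up), which uses the lookup tables
 * that follow instead. */

static inline unsigned
pan_example_space_bits(unsigned v)
{
   /* Spread the low 4 bits of v into the even bit positions:
    * 0b wxyz -> 0b 0w0x0y0z */
   return ((v & 1) << 0) | ((v & 2) << 1) | ((v & 4) << 2) | ((v & 8) << 3);
}

static inline unsigned
pan_example_tile_index(unsigned x, unsigned y)
{
   unsigned spaced_x = pan_example_space_bits(x & 0xF);
   unsigned spaced_y = pan_example_space_bits(y & 0xF);

   /* Multiplying a spaced value by 0b11 duplicates each bit into the
    * neighbouring (empty) position, since no carries can occur. That gives
    * the | y3 y3 | y2 y2 | y1 y1 | y0 y0 | line; XORing in the spaced X
    * yields the interleaved pattern above. */
   return (spaced_y * 0b11) ^ spaced_x;
}
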
/* Given the lower 4 bits of the Y coordinate, we would like to
 * duplicate every bit. So instead of 0b1010, we would like
 * 0b11001100. The idea is that the bits in the Y-only positions get
 * the Y value, and the bits in the XOR positions *also* get a copy of
 * Y, ready to be XORed with X. */

const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space the bits out of a 4-bit nibble */

const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

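/* The "subtracting + ANDing the mask pattern" trick mentioned above can stand
 * in for the space_4 table when walking in X order: subtracting the mask from
 * an already-spaced value increments the counter embedded in the masked bit
 * positions, with the borrow rippling across the zero gaps, and the AND then
 * clears the junk left behind in those gaps. An illustrative sketch, not used
 * by the code below (the names are made up): */

#define PAN_EXAMPLE_SPACE_MASK 0b1010101

static inline unsigned
pan_example_space_next(unsigned spaced_x)
{
   /* E.g. space_4[3] = 0b0000101:
    * (0b0000101 - 0b1010101) & 0b1010101 = 0b0010000 = space_4[4]. */
   return (spaced_x - PAN_EXAMPLE_SPACE_MASK) & PAN_EXAMPLE_SPACE_MASK;
}
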
/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * need only support copies and sizeof, so emulating it with a packed structure
 * works well enough, but if there's a native 128-bit type we may as well
 * prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif

/* Optimized routine to tile or detile an aligned ((w & 0xF) == 0) texture
 * region. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're accessing, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then
 * multiplying by the bytes per tile.
 *
 * We iterate across the pixels we're trying to access in source (linear)
 * order. For each row, we figure out which row of 16x16 blocks we're in by
 * masking off the lower 4 bits of y (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (0, block_y) is the top-left corner of the leftmost tile in this row. While
 * pixels are reordered within a block, the blocks themselves are stored
 * linearly, so multiplying block_y by the byte stride of the destination
 * image gives the byte offset of the top-left corner of the block this row
 * is in.
 *
 * On the other hand, the source is linear, so we compute the locations of the
 * start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
 * y0] value. Since this is constant across a row, we look it up per-row and
 * store it in expanded_y.
 *
 * Finally, we iterate each row in source order. In the outer loop, we iterate
 * over each 16-pixel tile. Within each tile, we iterate the 16 pixels (this
 * should be unrolled), calculating the index within the tile and copying.
 */

#define TILED_ACCESS_TYPE(pixel_t, shift) \
static ALWAYS_INLINE void \
panfrost_access_tiled_image_##pixel_t \
                              (void *dst, void *src, \
                               uint16_t sx, uint16_t sy, \
                               uint16_t w, uint16_t h, \
                               uint32_t dst_stride, \
                               uint32_t src_stride, \
                               bool is_store) \
{ \
   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      uint16_t block_y = y & ~0x0f; \
      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
      pixel_t *source = src + (src_y * src_stride); \
      pixel_t *source_end = source + w; \
      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
         for (uint8_t i = 0; i < 16; ++i) { \
            unsigned index = expanded_y ^ (space_4[i] << shift); \
            if (is_store) \
               *((pixel_t *) (dest + index)) = *(source++); \
            else \
               *(source++) = *((pixel_t *) (dest + index)); \
         } \
      } \
   } \
} \

TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

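/* Usage sketch for the generated fast paths (illustrative only; the sizes,
 * strides and coordinates are made up). In the driver these helpers are only
 * reached through panfrost_access_tiled_image below, which guarantees the
 * 16-pixel alignment they assume. */

static inline void
pan_example_store_aligned_rgba8(void *tiled_dst, void *linear_src)
{
   /* Store a 32x32, 16-aligned region of a 256-pixel-wide RGBA8888 image:
    * at 4 bytes per pixel the tiled image advances 256 * 4 bytes per pixel
    * row, and the linear staging buffer 32 * 4 bytes per row. */
   panfrost_access_tiled_image_uint32_t(tiled_dst, linear_src,
                                        16, 32,   /* sx, sy */
                                        32, 32,   /* w, h */
                                        256 * 4,  /* dst_stride */
                                        32 * 4,   /* src_stride */
                                        true);    /* is_store */
}
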
#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << tile_shift) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      unsigned block_y = y & ~mask; \
      unsigned block_start_s = block_y * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      unsigned expanded_y = bit_duplication[y & mask]; \
\
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
\
         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
         *outp = *inp; \
      } \
   } \
}

#define TILED_UNALIGNED_TYPES(store, shift) { \
   if (bpp == 8) \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
   else if (bpp == 16) \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
   else if (bpp == 32) \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
   else if (bpp == 64) \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
   else if (bpp == 128) \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}

static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    unsigned sx, unsigned sy,
                                    unsigned w, unsigned h,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    const struct util_format_description *desc,
                                    bool _is_store)
{
   unsigned bpp = desc->block.bits;

   if (desc->block.width > 1) {
      w = DIV_ROUND_UP(w, desc->block.width);
      h = DIV_ROUND_UP(h, desc->block.height);

      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
                            enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);

   if (desc->block.width > 1) {
      panfrost_access_tiled_image_generic(dst, (void *) src,
                                          x, y, w, h,
                                          dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned bpp = desc->block.bits;
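
   /* The aligned fast path below requires the region to be tile-aligned on
    * every side. Carve off any unaligned top, bottom, left and right strips
    * and push them through the generic per-pixel path; whatever remains is a
    * fully tile-aligned interior handled by the optimized routines above. */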
   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
                                          x, last_full_tile_y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
                                          last_full_tile_x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      w -= dist;
   }

   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
}

void
panfrost_store_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *) src,
                               x, y, w, h,
                               dst_stride, src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          unsigned x, unsigned y,
                          unsigned w, unsigned h,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          enum pipe_format format)
{
   panfrost_access_tiled_image((void *) src, dst,
                               x, y, w, h,
                               src_stride, dst_stride, format, false);
}
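
/* Usage sketch for the public entry points (illustrative only; the format,
 * sizes and strides below are made up). panfrost_store_tiled_image copies a
 * linear staging buffer into a tiled image; panfrost_load_tiled_image is the
 * inverse. Note that dst_stride always describes the destination of the copy,
 * so the tiled image's stride is passed as dst_stride on store and as
 * src_stride on load. */

static inline void
pan_example_readback_rect(void *linear_dst, const void *tiled_src)
{
   /* Read back a 100x50 RGBA8888 rectangle at (10, 20) from a tiled image
    * that is 512 pixels wide, into a tightly packed linear buffer. */
   panfrost_load_tiled_image(linear_dst, tiled_src,
                             10, 20,   /* x, y */
                             100, 50,  /* w, h */
                             100 * 4,  /* dst_stride: linear buffer */
                             512 * 4,  /* src_stride: tiled image */
                             PIPE_FORMAT_R8G8B8A8_UNORM);
}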