panfrost: Compile tiling routines with -O3
[mesa.git] src/panfrost/shared/pan_tiling.c
/*
 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include <stdbool.h>
#include "pan_tiling.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering aims to improve
 * 2D spatial locality, benefiting cache behaviour in both the horizontal and
 * vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 tile, pixels are addressed by an 8-bit index whose bits
 * are built from the low 4 bits of the X and Y coordinates:
 *
 *   | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Essentially, the X and Y bits are interleaved, with an XOR thrown in for
 * every adjacent bit pair.
 *
 * This scheme is cheap to encode and decode in both hardware and software.
 * In hardware, lines are simply rerouted to reorder and some XOR gates are
 * thrown in. Software has to be a bit more clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *     | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 *   ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting and ANDing
 * with the mask pattern (abusing carry bits).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
 * significantly more efficient than Utgard-style tiling and preferred for
 * both textures and framebuffers, where possible. For formats AFBC does not
 * support, for instance sRGB textures and framebuffers, this tiling scheme
 * is used instead, at a performance penalty.
 */

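/* For illustration, the in-tile index described above can also be computed
 * bit by bit. This reference helper is a minimal sketch of the mapping that
 * the LUT-based fast paths below implement; it is not used by the driver and
 * the name pan_tiled_index_ref is purely hypothetical. */

static inline unsigned
pan_tiled_index_ref(unsigned x, unsigned y)
{
   unsigned index = 0;

   /* Only the low 4 bits of each coordinate select a pixel within a tile */
   x &= 0xF;
   y &= 0xF;

   for (unsigned i = 0; i < 4; ++i) {
      unsigned xi = (x >> i) & 1;
      unsigned yi = (y >> i) & 1;

      /* Even bits hold (y ^ x), odd bits hold y, matching
       * | y3 | y3^x3 | ... | y0 | y0^x0 | */
      index |= (xi ^ yi) << (2 * i);
      index |= yi << (2 * i + 1);
   }

   return index;
}
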
/* Given the lower 4 bits of the Y coordinate, we would like to duplicate
 * every bit over. So instead of 0b1010, we would like 0b11001100. The idea
 * is that the bits in the solely-Y places get a Y, and the bits in the XOR
 * places *also* get a Y. */

uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space out the bits of a 4-bit nibble, so 0b1011 becomes 0b1000101 */

unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

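/* The two tables are related: duplicating a bit is the same as spacing it
 * out and then multiplying by 0b11. As an illustrative sketch (not used by
 * the driver, and the helper names are hypothetical), both tables could be
 * generated with the classic shift-and-mask bit-spreading trick instead of
 * being written out by hand: */

static inline unsigned
pan_space_nibble_ref(unsigned nibble)
{
   unsigned x = nibble & 0xF;

   /* Spread abcd out to 0a0b0c0d */
   x = (x | (x << 2)) & 0b00110011;
   x = (x | (x << 1)) & 0b01010101;

   return x;
}

static inline unsigned
pan_duplicate_nibble_ref(unsigned nibble)
{
   /* aabbccdd = 0a0b0c0d * 0b11 (the spaced bits never overlap, so no
    * carries are generated) */
   return pan_space_nibble_ref(nibble) * 0b11;
}
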
/* The scheme uses 16x16 tiles */

#define TILE_WIDTH      16
#define TILE_HEIGHT     16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* An optimized routine to tile an aligned ((width & 0xF) == 0) bpp4 texture */

static void
panfrost_store_tiled_image_bpp4(void *dst, const void *src,
                                const struct pipe_box *box,
                                uint32_t dst_stride,
                                uint32_t src_stride)
{
   /* Precompute the offset to the beginning of the first horizontal tile
    * we're writing to, knowing that box->x is 16-aligned. Tiles themselves
    * are stored linearly, so we get the X tile number by shifting and then
    * multiply by the bytes per tile */

   uint8_t *dest_start = dst + ((box->x >> 4) * PIXELS_PER_TILE * 4);

   /* Iterate across the pixels we're trying to store in source-order */

   for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
      /* For each pixel in the destination image, figure out the part
       * corresponding to the 16x16 block index */

      int block_y = y & ~0x0f;

      /* In pixel coordinates (where the origin is the top-left), (block_y, 0)
       * is the top-left corner of the leftmost tile in this row. While pixels
       * are reordered within a block, the blocks themselves are stored
       * linearly, so multiplying block_y by the destination's byte stride
       * gives the byte offset of the top-left corner of the block this row
       * is in */

      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));

      /* The source is actually linear, so compute the byte offset to the
       * start and end of this row in the source */

      const uint32_t *source = src + (src_y * src_stride);
      const uint32_t *source_end = source + box->width;

      /* We want to duplicate the bits of the bottom nibble of Y */
      unsigned expanded_y = bit_duplication[y & 0xF];

      /* Iterate the row in source order. The outer loop walks the row one
       * 16-pixel tile at a time; after each tile, dest advances by the size
       * of a whole tile in pixels. */

      for (; source < source_end; dest += PIXELS_PER_TILE) {
         /* Within each tile, we iterate each of the 16 pixels in the row of
          * the tile. This loop should be unrolled. */

         for (int i = 0; i < 16; ++i) {
            /* We have the X component spaced out in space_4 and we have the
             * Y component duplicated. So we just XOR them together. The X
             * bits get the XOR like the pattern needs. The Y bits are XORed
             * with zero, so this is a no-op */

            unsigned index = expanded_y ^ space_4[i];

            /* Copy over the pixel */
            dest[index] = *(source++);
         }
      }
   }
}

static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    const struct pipe_box *box,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    uint32_t bpp,
                                    bool is_store)
{
   for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
      int block_y = y & ~0x0f;
      int block_start_s = block_y * dst_stride;
      int source_start = src_y * src_stride;

      unsigned expanded_y = bit_duplication[y & 0xF];

      for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) {
         /* Offset, in pixels, of the tile containing x within its tile row */
         int block_x_s = (x >> 4) * PIXELS_PER_TILE;

         unsigned index = expanded_y ^ space_4[x & 0xF];

         uint8_t *src8 = src;
         uint8_t *source = &src8[source_start + bpp * src_x];
         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);

         uint8_t *out = is_store ? dest : source;
         uint8_t *in = is_store ? source : dest;

         uint16_t *out16 = (uint16_t *) out;
         uint16_t *in16 = (uint16_t *) in;

         uint32_t *out32 = (uint32_t *) out;
         uint32_t *in32 = (uint32_t *) in;

         uint64_t *out64 = (uint64_t *) out;
         uint64_t *in64 = (uint64_t *) in;

         /* Write out 1-16 bytes. Written like this rather than a loop so the
          * compiler can see what's going on */

         switch (bpp) {
         case 1:
            out[0] = in[0];
            break;

         case 2:
            out16[0] = in16[0];
            break;

         case 3:
            out16[0] = in16[0];
            out[2] = in[2];
            break;

         case 4:
            out32[0] = in32[0];
            break;

         case 6:
            out32[0] = in32[0];
            out16[2] = in16[2];
            break;

         case 8:
            out64[0] = in64[0];
            break;

         case 12:
            out64[0] = in64[0];
            out32[2] = in32[2];
            break;

         case 16:
            out64[0] = in64[0];
            out64[1] = in64[1];
            break;

         default:
            unreachable("Invalid bpp in software tiling");
         }
      }
   }
}

void
panfrost_store_tiled_image(void *dst, const void *src,
                           const struct pipe_box *box,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           uint32_t bpp)
{
   /* The optimized path is specifically for 16-pixel-aligned writes */

   if (box->x & 0xF || box->width & 0xF) {
      panfrost_access_tiled_image_generic(dst, (void *) src, box, dst_stride, src_stride, bpp, true);
      return;
   }

   /* Attempt to use an optimized path if we have one */

   switch (bpp) {
   case 4:
      panfrost_store_tiled_image_bpp4(dst, (void *) src, box, dst_stride, src_stride);
      break;
   default:
      panfrost_access_tiled_image_generic(dst, (void *) src, box, dst_stride, src_stride, bpp, true);
      break;
   }
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          const struct pipe_box *box,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          uint32_t bpp)
{
   /* For loads the tiled image is the source, so the generic accessor (whose
    * first argument is always the tiled image) is called with the pointers
    * and strides swapped */
   panfrost_access_tiled_image_generic((void *) src, dst, box, src_stride, dst_stride, bpp, false);
}
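
/* Usage sketch (illustrative only, not part of the driver): tiling a full
 * 64x64 RGBA8888 level from a linear staging buffer. The buffers, dimensions
 * and the helper name are hypothetical; both strides are assumed to be in
 * bytes per row of pixels, and 64x64 is already 16-aligned, so the fast bpp4
 * path above is taken. */

static inline void
pan_tile_whole_level_example(void *tiled, const void *linear)
{
   struct pipe_box box = {
      .x = 0, .y = 0,
      .width = 64, .height = 64,
   };

   /* 64 pixels * 4 bytes per pixel on both the linear and tiled sides */
   panfrost_store_tiled_image(tiled, linear, &box, 64 * 4, 64 * 4, 4);
}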