From 2091d311c9d063138d5c84bbf4afe99ca864e597 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Wed, 15 Jan 2020 13:15:01 -0500
Subject: [PATCH] panfrost: Rework linear<--->tiled conversions

There's a lot going on here (it's a ton of commits squashed together,
since otherwise this would be impossible to review...)

1. We have a fast path for linear->tiled for whole (aligned) tiles, but
we have to use a slow path for unaligned accesses. We can get a pretty
major win for partial updates by using this slow path only on the
borders of the update region, and then hitting the fast path for the
tile-aligned interior. This does require some shuffling.

2. Mark the LUTs constant, which allows the compiler to inline them,
which pairs well with loop unrolling (eliminating the memory accesses
and just becoming some immediates.. which are not as immediate on
aarch64 as I'd like..)

3. Add fast paths for bpp 1/2/8/16. These use the same algorithm and we
have native types for them, so we may as well get the fast path.

4. Drop the generic path for bpp != 1/2/4/8/16, since these formats are
generally awful, there's no way to tile them efficiently, and honestly
there's not a good reason to either. Lima doesn't support any of these
formats; Panfrost can make the opinionated choice to make them linear.

5. Specialize the unaligned routines. They don't have to be fully
generic; they just can't assume alignment. So now they should be nearly
as fast as the aligned versions (which get some extra tricks to be even
faster, though the difference might be negligible on some workloads).

6. Specialize also for the size of the tile, to allow 4x4 tiling as
well as 16x16 tiling. This allows compressed textures to be efficiently
tiled with the same routines (so we add support for tiling ASTC/ETC
textures while we're at it).
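To make the addressing concrete, here is a minimal standalone sketch of
the byte-offset math all of these paths share (pan_tiled_offset is a
hypothetical helper for this writeup, not part of the patch; the real
routines below hoist the Y term out of the inner loop and strength-reduce
the multiplies to shifts):

    #include <stdint.h>

    /* The LUTs from pan_tiling.c: space_4 spaces out a nibble's bits
     * (x3 x2 x1 x0 -> 0 x3 0 x2 0 x1 0 x0) and bit_duplication doubles
     * them (y3 y2 y1 y0 -> y3 y3 y2 y2 y1 y1 y0 y0). */
    extern const unsigned space_4[16];
    extern const uint32_t bit_duplication[16];

    /* Byte offset of the texel at (x, y) in a 16x16-tiled image, given
     * the byte stride of one pixel row and the bytes per pixel. */
    static inline uint32_t
    pan_tiled_offset(uint32_t x, uint32_t y, uint32_t stride, uint32_t bpp)
    {
       uint32_t row  = (y & ~0xFu) * stride;      /* tile rows are linear */
       uint32_t tile = (x >> 4) * 16 * 16 * bpp;  /* tiles in a row too */
       uint32_t px   = bit_duplication[y & 0xF] ^ space_4[x & 0xF];
       return row + tile + (px * bpp);
    }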
Signed-off-by: Alyssa Rosenzweig
Reviewed-by: Vasily Khoruzhick
Tested-by: Vasily Khoruzhick #lima on Mali400
Part-of:
---
 src/gallium/drivers/lima/lima_resource.c    |   4 +-
 src/gallium/drivers/panfrost/pan_resource.c |   8 +-
 src/panfrost/shared/pan_tiling.c            | 345 ++++++++++++--------
 src/panfrost/shared/pan_tiling.h            |   5 +-
 4 files changed, 210 insertions(+), 152 deletions(-)

diff --git a/src/gallium/drivers/lima/lima_resource.c b/src/gallium/drivers/lima/lima_resource.c
index 2b86466101e..a0edbe92e01 100644
--- a/src/gallium/drivers/lima/lima_resource.c
+++ b/src/gallium/drivers/lima/lima_resource.c
@@ -636,7 +636,7 @@ lima_transfer_map(struct pipe_context *pctx,
                          ptrans->box.width, ptrans->box.height,
                          ptrans->stride,
                          res->levels[level].stride,
-                         util_format_get_blocksize(pres->format));
+                         pres->format);
    }
 
    return trans->staging;
@@ -682,7 +682,7 @@ lima_transfer_unmap(struct pipe_context *pctx,
                             ptrans->box.width, ptrans->box.height,
                             res->levels[ptrans->level].stride,
                             ptrans->stride,
-                            util_format_get_blocksize(pres->format));
+                            pres->format);
       }
       free(trans->staging);
    }
diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c
index a20bd79c1ad..18c6e05ba3b 100644
--- a/src/gallium/drivers/panfrost/pan_resource.c
+++ b/src/gallium/drivers/panfrost/pan_resource.c
@@ -402,10 +402,12 @@ panfrost_resource_create_bo(struct panfrost_screen *screen, struct panfrost_reso
                 PIPE_BIND_SAMPLER_VIEW |
                 PIPE_BIND_DISPLAY_TARGET;
 
+        unsigned bpp = util_format_get_blocksizebits(res->format);
         bool is_2d = (res->target == PIPE_TEXTURE_2D);
+        bool is_sane_bpp = bpp == 8 || bpp == 16 || bpp == 32 || bpp == 64 || bpp == 128;
         bool should_tile = (res->usage != PIPE_USAGE_STREAM);
         bool must_tile = (res->bind & PIPE_BIND_DEPTH_STENCIL) && (screen->quirks & MIDGARD_SFBD);
-        bool can_tile = is_2d && ((res->bind & ~valid_binding) == 0);
+        bool can_tile = is_2d && is_sane_bpp && ((res->bind & ~valid_binding) == 0);
 
         /* FBOs we would like to checksum, if at all possible */
         bool can_checksum = !(res->bind & ~valid_binding);
@@ -667,7 +669,7 @@ panfrost_transfer_map(struct pipe_context *pctx,
                                           box->x, box->y, box->width, box->height,
                                           transfer->base.stride,
                                           rsrc->slices[level].stride,
-                                          util_format_get_blocksize(resource->format));
+                                          resource->format);
                 }
         }
 
@@ -722,7 +724,7 @@ panfrost_transfer_unmap(struct pipe_context *pctx,
                                         transfer->box.width, transfer->box.height,
                                         prsrc->slices[transfer->level].stride,
                                         transfer->stride,
-                                        util_format_get_blocksize(prsrc->base.format));
+                                        prsrc->base.format);
                         }
                 }
         }
diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c
index 158fde9718a..01cd4ca6657 100644
--- a/src/panfrost/shared/pan_tiling.c
+++ b/src/panfrost/shared/pan_tiling.c
@@ -27,7 +27,7 @@
 
 #include "pan_tiling.h"
 #include <stdbool.h>
-#include <assert.h>
+#include "util/macros.h"
 
 /* This file implements software encode/decode of the tiling format used for
  * textures and framebuffers primarily on Utgard GPUs. Names for this format
@@ -83,7 +83,7 @@
  * 0b11001100. The idea is that for the bits in the solely Y place, we
  * get a Y place, and the bits in the XOR place *also* get a Y. */
 
-uint32_t bit_duplication[16] = {
+const uint32_t bit_duplication[16] = {
    0b00000000,
    0b00000011,
    0b00001100,
@@ -104,7 +104,7 @@
 
 /* Space the bits out of a 4-bit nibble */
 
-unsigned space_4[16] = {
+const unsigned space_4[16] = {
    0b0000000,
    0b0000001,
    0b0000100,
@@ -129,69 +129,114 @@ unsigned space_4[16] = {
 #define TILE_HEIGHT 16
 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
 
-/* An optimized routine to tile an aligned (w & 0xF == 0) bpp4 texture */
-
-static void
-panfrost_store_tiled_image_bpp4(void *dst, const void *src,
-                                unsigned sx, unsigned sy,
-                                unsigned w, unsigned h,
-                                uint32_t dst_stride,
-                                uint32_t src_stride)
-{
-   /* Precompute the offset to the beginning of the first horizontal tile we're
-    * writing to, knowing that x is 16-aligned. Tiles themselves are
-    * stored linearly, so we get the X tile number by shifting and then
-    * multiply by the bytes per tile */
-
-   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * 4);
-
-   /* Iterate across the pixels we're trying to store in source-order */
-
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      /* For each pixel in the destination image, figure out the part
-       * corresponding to the 16x16 block index */
-
-      int block_y = y & ~0x0f;
-
-      /* In pixel coordinates (where the origin is the top-left), (block_y, 0)
-       * is the top-left corner of the leftmost tile in this row. While pixels
-       * are reordered within a block, the blocks themselves are stored
-       * linearly, so multiplying block_y by the pixel stride of the
-       * destination image equals the byte offset of that top-left corner of
-       * the block this row is in */
-
-      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));
-
-      /* The source is actually linear, so compute the byte offset to the start
-       * and end of this row in the source */
-
-      const uint32_t *source = src + (src_y * src_stride);
-      const uint32_t *source_end = source + w;
-
-      /* We want to duplicate the bits of the bottom nibble of Y */
-      unsigned expanded_y = bit_duplication[y & 0xF];
-
-      /* Iterate the row in source order. In the outer loop, we iterate 16
-       * bytes tiles. After each tile, we increment dest to include the size of
-       * that tile in pixels. */
-
-      for (; source < source_end; dest += PIXELS_PER_TILE) {
-         /* Within each tile, we iterate each of the 16 pixels in the row of
-          * the tile. This loop should be unrolled. */
-
-         for (int i = 0; i < 16; ++i) {
-            /* We have the X component spaced out in space_x and we have the Y
-             * component duplicated. So we just XOR them together. The X bits
-             * get the XOR like the pattern needs. The Y bits are XORing with
-             * zero so this is a no-op */
+/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
+ * must only support copies and sizeof, so emulating it with a packed
+ * structure works well enough, but if there's a native 128-bit type we may
+ * as well prefer that. */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t pan_uint128_t;
+#else
+typedef struct {
+  uint64_t lo;
+  uint64_t hi;
+} __attribute__((packed)) pan_uint128_t;
+#endif
+
+/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
+ *
+ * dest_start precomputes the offset to the beginning of the first horizontal
+ * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
+ * stored linearly, so we get the X tile number by shifting and then
+ * multiplying by the bytes per tile.
+ *
+ * We iterate across the pixels we're trying to store in source order. For
+ * each row in the destination image, we figure out which row of 16x16 blocks
+ * we're in by slicing off the lower 4 bits (block_y).
+ *
+ * dest then precomputes the location of the top-left corner of the block the
+ * row starts in. In pixel coordinates (where the origin is the top-left),
+ * (block_y, 0) is the top-left corner of the leftmost tile in this row.
+ * While pixels are reordered within a block, the blocks themselves are
+ * stored linearly, so multiplying block_y by the pixel stride of the
+ * destination image equals the byte offset of that top-left corner of the
+ * block this row is in.
+ *
+ * On the other hand, the source is linear, so we compute the locations of
+ * the start and end of the row in the source with simple linear addressing.
+ *
+ * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1
+ * y0 y0] value. Since this is constant across a row, we look it up per-row
+ * and store it in expanded_y.
+ *
+ * Finally, we iterate each row in source order. In the outer loop, we
+ * iterate each 16-pixel tile. Within each tile, we iterate the 16 pixels
+ * (this should be unrolled), calculating the index within the tile and
+ * writing.
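+ *
+ * Worked example (illustrative numbers): storing a bpp4 texel at
+ * (x, y) = (21, 3). It lies in X-tile 21 >> 4 = 1, so dest_start advances
+ * 1 * 256 * 4 = 1024 bytes. block_y = 3 & ~0xF = 0, so this is the first
+ * row of tiles. Within the tile, the index is bit_duplication[3] ^
+ * space_4[5] = 0b00001111 ^ 0b0010001 = 30, i.e. byte offset 30 * 4 = 120;
+ * the texel therefore lands 1024 + 120 = 1144 bytes into the tiled buffer.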
+ */
 
-            unsigned index = expanded_y ^ space_4[i];
+#define TILED_STORE_TYPE(pixel_t, shift) \
+static void \
+panfrost_store_tiled_image_##pixel_t \
+                              (void *dst, const void *src, \
+                               uint16_t sx, uint16_t sy, \
+                               uint16_t w, uint16_t h, \
+                               uint32_t dst_stride, \
+                               uint32_t src_stride) \
+{ \
+   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      uint16_t block_y = y & ~0x0f; \
+      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
+      const pixel_t *source = src + (src_y * src_stride); \
+      const pixel_t *source_end = source + w; \
+      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
+      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
+         for (uint8_t i = 0; i < 16; ++i) { \
+            unsigned index = expanded_y ^ (space_4[i] << shift); \
+            *((pixel_t *) (dest + index)) = *(source++); \
+         } \
+      } \
+   } \
+} \
+
+TILED_STORE_TYPE(uint8_t, 0);
+TILED_STORE_TYPE(uint16_t, 1);
+TILED_STORE_TYPE(uint32_t, 2);
+TILED_STORE_TYPE(uint64_t, 3);
+TILED_STORE_TYPE(pan_uint128_t, 4);
+
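+/* The unaligned counterpart below makes no alignment assumptions, so every
+ * pixel recomputes its tile and intra-tile index from scratch. pixel_t
+ * selects the copy width, is_store the direction (linear->tiled or back),
+ * and tile_shift the tile size: 4 for the usual 16x16-pixel tiles, 2 for
+ * the 4x4 block grid of compressed (ASTC/ETC) formats. */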
+#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
+   const unsigned mask = (1 << tile_shift) - 1; \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      unsigned block_y = y & ~mask; \
+      unsigned block_start_s = block_y * dst_stride; \
+      unsigned source_start = src_y * src_stride; \
+      unsigned expanded_y = bit_duplication[y & mask]; \
+ \
+      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
+         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
+         unsigned index = expanded_y ^ space_4[x & mask]; \
+         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
+         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
+ \
+         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
+         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
+         *outp = *inp; \
+      } \
+   } \
+}
 
-            /* Copy over the pixel */
-            dest[index] = *(source++);
-         }
-      }
-   }
 
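+/* Dispatch over a runtime bpp (desc->block.bits) to the right pixel_t
+ * specialization; the if-ladder should fold away wherever bpp is visibly
+ * constant. Any other bpp deliberately falls through and copies nothing,
+ * since such formats are kept linear (see is_sane_bpp in pan_resource.c). */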
+#define TILED_UNALIGNED_TYPES(store, shift) { \
+   if (bpp == 8) \
+      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
+   else if (bpp == 16) \
+      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
+   else if (bpp == 32) \
+      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
+   else if (bpp == 64) \
+      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
+   else if (bpp == 128) \
+      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
 }
 
 static void
@@ -200,109 +245,118 @@ panfrost_access_tiled_image_generic(void *dst, void *src,
                                     unsigned w, unsigned h,
                                     uint32_t dst_stride,
                                     uint32_t src_stride,
-                                    uint32_t bpp,
-                                    bool is_store)
+                                    const struct util_format_description *desc,
+                                    bool _is_store)
 {
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      int block_y = y & ~0x0f;
-      int block_start_s = block_y * dst_stride;
-      int source_start = src_y * src_stride;
+   unsigned bpp = desc->block.bits;
+
+   if (desc->block.width > 1) {
+      w = DIV_ROUND_UP(w, desc->block.width);
+      h = DIV_ROUND_UP(h, desc->block.height);
+
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 2)
+      else
+         TILED_UNALIGNED_TYPES(false, 2)
+   } else {
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 4)
+      else
+         TILED_UNALIGNED_TYPES(false, 4)
+   }
+}
 
-      unsigned expanded_y = bit_duplication[y & 0xF];
+#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))
 
-      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) {
-         int block_x_s = (x >> 4) * 256;
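+/* Store the linear region into the tiled image by decomposing it into up to
+ * four unaligned border strips, handled by the generic path, around a
+ * tile-aligned interior, handled by the fast path:
+ *
+ *    +---------------------+
+ *    |         top         |
+ *    +----+-----------+----+
+ *    |    |  aligned  |    |
+ *    |left|  interior |rght|
+ *    +----+-----------+----+
+ *    |        bottom       |
+ *    +---------------------+
+ */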
+void
+panfrost_store_tiled_image(void *dst, const void *src,
+                           unsigned x, unsigned y,
+                           unsigned w, unsigned h,
+                           uint32_t dst_stride,
+                           uint32_t src_stride,
+                           enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
 
-         unsigned index = expanded_y ^ space_4[x & 0xF];
+   if (desc->block.width > 1) {
+      panfrost_access_tiled_image_generic(dst, (void *) src,
+                                          x, y, w, h,
+                                          dst_stride, src_stride, desc, true);
 
-         uint8_t *src8 = src;
-         uint8_t *source = &src8[source_start + bpp * src_x];
-         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);
+      return;
+   }
 
-         uint8_t *out = is_store ? dest : source;
-         uint8_t *in = is_store ? source : dest;
+   unsigned bpp = desc->block.bits;
+   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
+   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
+   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
+   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;
 
-         uint16_t *out16 = (uint16_t *) out;
-         uint16_t *in16 = (uint16_t *) in;
+   /* First, tile the top portion */
 
-         uint32_t *out32 = (uint32_t *) out;
-         uint32_t *in32 = (uint32_t *) in;
+   unsigned orig_x = x, orig_y = y;
 
-         uint64_t *out64 = (uint64_t *) out;
-         uint64_t *in64 = (uint64_t *) in;
+   if (first_full_tile_y != y) {
+      unsigned dist = MIN2(first_full_tile_y - y, h);
 
-         /* Write out 1-16 bytes. Written like this rather than a loop so the
-          * compiler can see what's going on */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, w, dist,
+                                          dst_stride, src_stride, desc, true);
 
-         switch (bpp) {
-         case 1:
-            out[0] = in[0];
-            break;
+      if (dist == h)
+         return;
 
-         case 2:
-            out16[0] = in16[0];
-            break;
+      y += dist;
+      h -= dist;
+   }
 
-         case 3:
-            out16[0] = in16[0];
-            out[2] = in[2];
-            break;
+   /* Next, the bottom portion */
+   if (last_full_tile_y != (y + h)) {
+      unsigned dist = (y + h) - last_full_tile_y;
 
-         case 4:
-            out32[0] = in32[0];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
+                                          x, last_full_tile_y, w, dist,
+                                          dst_stride, src_stride, desc, true);
 
-         case 6:
-            out32[0] = in32[0];
-            out16[2] = in16[2];
-            break;
+      h -= dist;
+   }
 
-         case 8:
-            out64[0] = in64[0];
-            break;
+   /* The left portion */
+   if (first_full_tile_x != x) {
+      unsigned dist = MIN2(first_full_tile_x - x, w);
 
-         case 12:
-            out64[0] = in64[0];
-            out32[2] = in32[2];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);
 
-         case 16:
-            out64[0] = in64[0];
-            out64[1] = in64[1];
-            break;
+      if (dist == w)
+         return;
 
-         default:
-            assert(0); /* Invalid */
-         }
-      }
+      x += dist;
+      w -= dist;
    }
-}
 
-void
-panfrost_store_tiled_image(void *dst, const void *src,
-                           unsigned x, unsigned y,
-                           unsigned w, unsigned h,
-                           uint32_t dst_stride,
-                           uint32_t src_stride,
-                           uint32_t bpp)
-{
-   /* The optimized path is for aligned writes specifically */
-
-   if (x & 0xF || w & 0xF) {
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      return;
-   }
+   /* Finally, the right portion */
+   if (last_full_tile_x != (x + w)) {
+      unsigned dist = (x + w) - last_full_tile_x;
 
-   /* Attempt to use an optimized path if we have one */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
+                                          last_full_tile_x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);
 
-   switch (bpp) {
-   case 4:
-      panfrost_store_tiled_image_bpp4(dst, (void *) src, x, y, w, h, dst_stride, src_stride);
-      break;
-   default:
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      break;
+      w -= dist;
    }
+
+   if (bpp == 8)
+      panfrost_store_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 16)
+      panfrost_store_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 32)
+      panfrost_store_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 64)
+      panfrost_store_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 128)
+      panfrost_store_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
 }
 
 void
@@ -311,7 +365,8 @@ panfrost_load_tiled_image(void *dst, const void *src,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
-                          uint32_t bpp)
+                          enum pipe_format format)
 {
-   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, bpp, false);
+   const struct util_format_description *desc = util_format_description(format);
+   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, desc, false);
 }
diff --git a/src/panfrost/shared/pan_tiling.h b/src/panfrost/shared/pan_tiling.h
index e13d50c41e4..d8591e6dbdd 100644
--- a/src/panfrost/shared/pan_tiling.h
+++ b/src/panfrost/shared/pan_tiling.h
@@ -28,19 +28,20 @@
 #define H_PANFROST_TILING
 
 #include <stdint.h>
+#include "util/format/u_format.h"
 
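+/* Copy between a linear image and an (Utgard-style) tiled image. (x, y, w,
+ * h) select the region in pixels, both strides are in bytes, and the format
+ * supplies the bytes per pixel (and the block layout for compressed
+ * formats). */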
 void
 panfrost_load_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
-                          uint32_t bpp);
+                          enum pipe_format format);
 
 void
 panfrost_store_tiled_image(void *dst, const void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
-                           uint32_t bpp);
+                           enum pipe_format format);
 
 #endif
-- 
2.30.2