src/gallium/drivers/panfrost/pan_swizzle.c

   1 /*
   2  * Copyright (c) 2012-2013 Luc Verhaegen <libv@skynet.be>
   3  * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sub license,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the
  13  * next paragraph) shall be included in all copies or substantial portions
  14  * of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  22  * DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include <stdio.h>
  26 #include "pan_swizzle.h"
  27 #include "pan_allocate.h"
  28
  29 /* Space a group of 4-bits out. For instance, 0x7 -- that is, 0b111 -- would
  30  * become 0b10101 */
  31
  32 static inline int
  33 space_bits_4(int i)
  34 {
  35         return ((i & 0x8) << 3) |
  36                ((i & 0x4) << 2) |
  37                ((i & 0x2) << 1) |
  38                ((i & 0x1) << 0);
  39 }
  40
  41 /* Generate lookup table for the space filler curve. Note this is a 1:1
  42  * mapping, just with bits twiddled around. */
  43
  44 uint32_t space_filler[16][16];
  45 uint32_t space_filler_packed4[16][4];
  46
  47 void
  48 panfrost_generate_space_filler_indices()
  49 {
  50         for (int y = 0; y < 16; ++y) {
  51                 for (int x = 0; x < 16; ++x) {
  52                         space_filler[y][x] =
  53                                 space_bits_4(y ^ x) | (space_bits_4(y) << 1);
  54                 }
  55
  56                 for (int q = 0; q < 4; ++q) {
  57                         space_filler_packed4[y][q] =
  58                                 (space_filler[y][(q * 4) + 0] << 0) |
  59                                 (space_filler[y][(q * 4) + 1] << 8) |
  60                                 (space_filler[y][(q * 4) + 2] << 16) |
  61                                 (space_filler[y][(q * 4) + 3] << 24);
  62                 }
  63         }
  64 }
  65
  66 static void
  67 swizzle_bpp1_align16(int width, int height, int source_stride, int block_pitch,
  68                      const uint8_t *pixels,
  69                      uint8_t *ldest)
  70 {
  71         for (int y = 0; y < height; ++y) {
  72                 {
  73                         int block_y = y & ~(0x0f);
  74                         int rem_y = y & 0x0f;
  75                         uint8_t *block_start_s = ldest + (block_y * block_pitch);
  76                         const uint8_t *source_start = pixels + (y * source_stride);
  77                         const uint8_t *source_end = source_start + width;
  78
  79                         /* Operate on blocks of 16 pixels to minimise bookkeeping */
  80
  81                         for (; source_start < source_end; block_start_s += 16 * 16, source_start += 16) {
  82                                 const uint32_t *src_32 = (const uint32_t *) source_start;
  83
  84                                 for (int q = 0; q < 4; ++q) {
  85                                         uint32_t src = src_32[q];
  86                                         uint32_t spaced = space_filler_packed4[rem_y][q];
  87                                         uint16_t *bs = (uint16_t *) block_start_s;
  88
  89                                         int spacedA = (spaced >> 0) & 0xFF;
  90                                         int spacedB = (spaced >> 16) & 0xFF;
  91
  92                                         bs[spacedA >> 1] = (src >> 0) & 0xFFFF;
  93                                         bs[spacedB >> 1] = (src >> 16) & 0xFFFF;
  94                                 }
  95                         }
  96                 }
  97
  98                 ++y;
  99
 100                 if (y >= height)
 101                         break;
 102
 103                 {
 104                         int block_y = y & ~(0x0f);
 105                         int rem_y = y & 0x0f;
 106                         uint8_t *block_start_s = ldest + (block_y * block_pitch);
 107                         const uint8_t *source_start = pixels + (y * source_stride);
 108                         const uint8_t *source_end = source_start + width;
 109
 110                         /* Operate on blocks of 16 pixels to minimise bookkeeping */
 111
 112                         for (; source_start < source_end; block_start_s += 16 * 16, source_start += 16) {
 113                                 const uint32_t *src_32 = (const uint32_t *) source_start;
 114
 115                                 for (int q = 0; q < 4; ++q) {
 116                                         uint32_t src = src_32[q];
 117                                         uint32_t spaced = space_filler_packed4[rem_y][q];
 118
 119                                         block_start_s[(spaced >> 0) & 0xFF] = (src >> 0) & 0xFF;
 120                                         block_start_s[(spaced >> 8) & 0xFF] = (src >> 8) & 0xFF;
 121
 122                                         block_start_s[(spaced >> 16) & 0xFF] = (src >> 16) & 0xFF;
 123                                         block_start_s[(spaced >> 24) & 0xFF] = (src >> 24) & 0xFF;
 124                                 }
 125                         }
 126                 }
 127
 128         }
 129 }
 130
 131 static void
 132 swizzle_bpp4_align16(int width, int height, int source_stride, int block_pitch,
 133                      const uint32_t *pixels,
 134                      uint32_t *ldest)
 135 {
 136         for (int y = 0; y < height; ++y) {
 137                 int block_y = y & ~(0x0f);
 138                 int rem_y = y & 0x0f;
 139                 uint32_t *block_start_s = ldest + (block_y * block_pitch);
 140                 const uint32_t *source_start = pixels + (y * source_stride);
 141                 const uint32_t *source_end = source_start + width;
 142
 143                 /* Operate on blocks of 16 pixels to minimise bookkeeping */
 144
 145                 for (; source_start < source_end; block_start_s += 16 * 16, source_start += 16) {
 146                         for (int j = 0; j < 16; ++j)
 147                                 block_start_s[space_filler[rem_y][j]] = source_start[j];
 148                 }
 149         }
 150 }
 151
 152 void
 153 panfrost_texture_swizzle(unsigned off_x,
 154                          unsigned off_y,
 155                          int width, int height, int bytes_per_pixel, int dest_width,
 156                          const uint8_t *pixels,
 157                          uint8_t *ldest)
 158 {
 159         /* Calculate maximum size, overestimating a bit */
 160         int block_pitch = ALIGN(dest_width, 16) >> 4;
 161
 162         /* Strides must be tight, since we're only ever called indirectly */
 163         int source_stride = width * bytes_per_pixel;
 164
 165         /* Use fast path if available */
 166         if (!(off_x || off_y) && (width == dest_width)) {
 167                 if (bytes_per_pixel == 4 && (ALIGN(width, 16) == width)) {
 168                         swizzle_bpp4_align16(width, height, source_stride >> 2, (block_pitch * 256 >> 4), (const uint32_t *) pixels, (uint32_t *) ldest);
 169                         return;
 170                 } else if (bytes_per_pixel == 1 && (ALIGN(width, 16) == width)) {
 171                         swizzle_bpp1_align16(width, height, source_stride, (block_pitch * 256 >> 4), pixels, (uint8_t *) ldest);
 172                         return;
 173                 }
 174         }
 175
 176         /* Otherwise, default back on generic path */
 177
 178         for (int y = 0; y < height; ++y) {
 179                 int block_y = (y + off_y) >> 4;
 180                 int rem_y = (y + off_y) & 0x0F;
 181                 int block_start_s = block_y * block_pitch * 256;
 182                 int source_start = y * source_stride;
 183
 184                 for (int x = 0; x < width; ++x) {
 185                         int block_x_s = ((x + off_x) >> 4) * 256;
 186                         int rem_x = (x + off_x) & 0x0F;
 187
 188                         int index = space_filler[rem_y][rem_x];
 189                         const uint8_t *source = &pixels[source_start + bytes_per_pixel * x];
 190                         uint8_t *dest = ldest + bytes_per_pixel * (block_start_s + block_x_s + index);
 191
 192                         for (int b = 0; b < bytes_per_pixel; ++b)
 193                                 dest[b] = source[b];
 194                 }
 195         }
 196 }