/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file vc4_tiling_lt.c
 *
 * Helper functions from vc4_tiling.c that will be compiled for using NEON
 * assembly or not.
 *
 * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
 * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
 * sim build working.
 */

#include <string.h>

#include "pipe/p_state.h"
#include "vc4_tiling.h"

#ifdef VC4_BUILD_NEON
#define NEON_TAG(x) x ## _neon
#else
#define NEON_TAG(x) x ## _base
#endif
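
/* For example, NEON_TAG(vc4_load_lt_image) expands to
 * vc4_load_lt_image_neon when VC4_BUILD_NEON is defined and to
 * vc4_load_lt_image_base otherwise, so both variants can be linked into the
 * driver and the caller can pick between them (e.g. based on a runtime NEON
 * capability check).
 */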

/** Returns the stride in bytes of a 64-byte microtile. */
static uint32_t
vc4_utile_stride(int cpp)
{
        switch (cpp) {
        case 1:
                return 8;
        case 2:
        case 4:
        case 8:
                return 16;
        default:
                unreachable("bad cpp");
        }
}
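
/* A 64-byte utile holds 8x8 pixels at cpp == 1 (8 bytes per row) and 8x4,
 * 4x4, or 2x4 pixels at cpp == 2, 4, or 8 (16 bytes per row), which is
 * where the 8- and 16-byte strides above come from.
 */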

static void
vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
{
        uint32_t gpu_stride = vc4_utile_stride(cpp);
#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%1], %2\n"
                        "vst1.8 d1, [%1], %2\n"
                        "vst1.8 d2, [%1], %2\n"
                        "vst1.8 d3, [%1], %2\n"
                        "vst1.8 d4, [%1], %2\n"
                        "vst1.8 d5, [%1], %2\n"
                        "vst1.8 d6, [%1], %2\n"
                        "vst1.8 d7, [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%1], %3\n"
                        "vst1.8 d1, [%2], %3\n"
                        "vst1.8 d2, [%1], %3\n"
                        "vst1.8 d3, [%2], %3\n"
                        "vst1.8 d4, [%1], %3\n"
                        "vst1.8 d5, [%2], %3\n"
                        "vst1.8 d6, [%1]\n"
                        "vst1.8 d7, [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
#elif defined(PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%1], %2\n"
                        "st1 {v0.D}[1], [%1], %2\n"
                        "st1 {v1.D}[0], [%1], %2\n"
                        "st1 {v1.D}[1], [%1], %2\n"
                        "st1 {v2.D}[0], [%1], %2\n"
                        "st1 {v2.D}[1], [%1], %2\n"
                        "st1 {v3.D}[0], [%1], %2\n"
                        "st1 {v3.D}[1], [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 with a lane index can only
                         * store one 8-byte half at a time).
                         */
                        "st1 {v0.D}[0], [%1], %3\n"
                        "st1 {v0.D}[1], [%2], %3\n"
                        "st1 {v1.D}[0], [%1], %3\n"
                        "st1 {v1.D}[1], [%2], %3\n"
                        "st1 {v2.D}[0], [%1], %3\n"
                        "st1 {v2.D}[1], [%2], %3\n"
                        "st1 {v3.D}[0], [%1]\n"
                        "st1 {v3.D}[1], [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
#else
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}
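
/* Writes one 64-byte utile back out: the inverse of vc4_load_utile, with
 * the cpu-side raster lines as the source and the tiled gpu-side buffer as
 * the destination.
 */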
static void
vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
{
        uint32_t gpu_stride = vc4_utile_stride(cpp);

#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%1], %2\n"
                        "vld1.8 d1, [%1], %2\n"
                        "vld1.8 d2, [%1], %2\n"
                        "vld1.8 d3, [%1], %2\n"
                        "vld1.8 d4, [%1], %2\n"
                        "vld1.8 d5, [%1], %2\n"
                        "vld1.8 d6, [%1], %2\n"
                        "vld1.8 d7, [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%1], %3\n"
                        "vld1.8 d1, [%2], %3\n"
                        "vld1.8 d2, [%1], %3\n"
                        "vld1.8 d3, [%2], %3\n"
                        "vld1.8 d4, [%1], %3\n"
                        "vld1.8 d5, [%2], %3\n"
                        "vld1.8 d6, [%1]\n"
                        "vld1.8 d7, [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
#elif defined(PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%1], %2\n"
                        "ld1 {v0.D}[1], [%1], %2\n"
                        "ld1 {v1.D}[0], [%1], %2\n"
                        "ld1 {v1.D}[1], [%1], %2\n"
                        "ld1 {v2.D}[0], [%1], %2\n"
                        "ld1 {v2.D}[1], [%1], %2\n"
                        "ld1 {v3.D}[0], [%1], %2\n"
                        "ld1 {v3.D}[1], [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 with a lane index can only load one
                         * 8-byte half at a time).
                         */
                        "ld1 {v0.D}[0], [%1], %3\n"
                        "ld1 {v0.D}[1], [%2], %3\n"
                        "ld1 {v1.D}[0], [%1], %3\n"
                        "ld1 {v1.D}[1], [%2], %3\n"
                        "ld1 {v2.D}[0], [%1], %3\n"
                        "ld1 {v2.D}[1], [%2], %3\n"
                        "ld1 {v3.D}[0], [%1]\n"
                        "ld1 {v3.D}[1], [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
#else
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}

void
NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                            void *src, uint32_t src_stride,
                            int cpp, const struct pipe_box *box)
{
        uint32_t utile_w = vc4_utile_width(cpp);
        uint32_t utile_h = vc4_utile_height(cpp);
        uint32_t xstart = box->x;
        uint32_t ystart = box->y;

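        /* The LT layout stores each 64-byte utile contiguously, so stepping
         * one utile (utile_w pixels) to the right advances the tiled address
         * by 64 bytes; that is what the "* 64 / utile_w" term in the tiled
         * offset below computes.
         */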
        for (uint32_t y = 0; y < box->height; y += utile_h) {
                for (int x = 0; x < box->width; x += utile_w) {
                        vc4_load_utile(dst + (dst_stride * y +
                                              x * cpp),
                                       src + ((ystart + y) * src_stride +
                                              (xstart + x) * 64 / utile_w),
                                       dst_stride, cpp);
                }
        }
}

void
NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
                             void *src, uint32_t src_stride,
                             int cpp, const struct pipe_box *box)
{
        uint32_t utile_w = vc4_utile_width(cpp);
        uint32_t utile_h = vc4_utile_height(cpp);
        uint32_t xstart = box->x;
        uint32_t ystart = box->y;

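        /* Same addressing as the load path, but mirrored: the tiled-offset
         * math now applies to dst and the raster-order math to src.
         */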
        for (uint32_t y = 0; y < box->height; y += utile_h) {
                for (int x = 0; x < box->width; x += utile_w) {
                        vc4_store_utile(dst + ((ystart + y) * dst_stride +
                                               (xstart + x) * 64 / utile_w),
                                        src + (src_stride * y +
                                               x * cpp),
                                        src_stride, cpp);
                }
        }
}