src/gallium/drivers/vc4/vc4_tiling_lt.c

   1 /*
   2  * Copyright © 2017 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file vc4_tiling_lt.c
  25  *
  26  * Helper functions from vc4_tiling.c that will be compiled for using NEON
  27  * assembly or not.
  28  *
  29  * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
  30  * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
  31  * sim build working.
  32  */
  33
  34 #include <string.h>
  35 #include "pipe/p_state.h"
  36 #include "vc4_tiling.h"
  37
  38 #ifdef VC4_BUILD_NEON
  39 #define NEON_TAG(x) x ## _neon
  40 #else
  41 #define NEON_TAG(x) x ## _base
  42 #endif
  43
  44 /** Returns the stride in bytes of a 64-byte microtile. */
  45 static uint32_t
  46 vc4_utile_stride(int cpp)
  47 {
  48         switch (cpp) {
  49         case 1:
  50                 return 8;
  51         case 2:
  52         case 4:
  53         case 8:
  54                 return 16;
  55         default:
  56                 unreachable("bad cpp");
  57         }
  58 }
  59
  60 static void
  61 vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
  62 {
  63         uint32_t gpu_stride = vc4_utile_stride(cpp);
  64 #if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
  65         if (gpu_stride == 8) {
  66                 __asm__ volatile (
  67                         /* Load from the GPU in one shot, no interleave, to
  68                          * d0-d7.
  69                          */
  70                         "vldm %0, {q0, q1, q2, q3}\n"
  71                         /* Store each 8-byte line to cpu-side destination,
  72                          * incrementing it by the stride each time.
  73                          */
  74                         "vst1.8 d0, [%1], %2\n"
  75                         "vst1.8 d1, [%1], %2\n"
  76                         "vst1.8 d2, [%1], %2\n"
  77                         "vst1.8 d3, [%1], %2\n"
  78                         "vst1.8 d4, [%1], %2\n"
  79                         "vst1.8 d5, [%1], %2\n"
  80                         "vst1.8 d6, [%1], %2\n"
  81                         "vst1.8 d7, [%1]\n"
  82                         :
  83                         : "r"(gpu), "r"(cpu), "r"(cpu_stride)
  84                         : "q0", "q1", "q2", "q3");
  85         } else {
  86                 assert(gpu_stride == 16);
  87                 __asm__ volatile (
  88                         /* Load from the GPU in one shot, no interleave, to
  89                          * d0-d7.
  90                          */
  91                         "vldm %0, {q0, q1, q2, q3};\n"
  92                         /* Store each 16-byte line in 2 parts to the cpu-side
  93                          * destination.  (vld1 can only store one d-register
  94                          * at a time).
  95                          */
  96                         "vst1.8 d0, [%1], %3\n"
  97                         "vst1.8 d1, [%2], %3\n"
  98                         "vst1.8 d2, [%1], %3\n"
  99                         "vst1.8 d3, [%2], %3\n"
 100                         "vst1.8 d4, [%1], %3\n"
 101                         "vst1.8 d5, [%2], %3\n"
 102                         "vst1.8 d6, [%1]\n"
 103                         "vst1.8 d7, [%2]\n"
 104                         :
 105                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
 106                         : "q0", "q1", "q2", "q3");
 107         }
 108 #else
 109         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
 110                 memcpy(cpu, gpu + gpu_offset, gpu_stride);
 111                 cpu += cpu_stride;
 112         }
 113 #endif
 114 }
 115
 116 static void
 117 vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
 118 {
 119         uint32_t gpu_stride = vc4_utile_stride(cpp);
 120
 121 #if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
 122         if (gpu_stride == 8) {
 123                 __asm__ volatile (
 124                         /* Load each 8-byte line from cpu-side source,
 125                          * incrementing it by the stride each time.
 126                          */
 127                         "vld1.8 d0, [%1], %2\n"
 128                         "vld1.8 d1, [%1], %2\n"
 129                         "vld1.8 d2, [%1], %2\n"
 130                         "vld1.8 d3, [%1], %2\n"
 131                         "vld1.8 d4, [%1], %2\n"
 132                         "vld1.8 d5, [%1], %2\n"
 133                         "vld1.8 d6, [%1], %2\n"
 134                         "vld1.8 d7, [%1]\n"
 135                         /* Load from the GPU in one shot, no interleave, to
 136                          * d0-d7.
 137                          */
 138                         "vstm %0, {q0, q1, q2, q3}\n"
 139                         :
 140                         : "r"(gpu), "r"(cpu), "r"(cpu_stride)
 141                         : "q0", "q1", "q2", "q3");
 142         } else {
 143                 assert(gpu_stride == 16);
 144                 __asm__ volatile (
 145                         /* Load each 16-byte line in 2 parts from the cpu-side
 146                          * destination.  (vld1 can only store one d-register
 147                          * at a time).
 148                          */
 149                         "vld1.8 d0, [%1], %3\n"
 150                         "vld1.8 d1, [%2], %3\n"
 151                         "vld1.8 d2, [%1], %3\n"
 152                         "vld1.8 d3, [%2], %3\n"
 153                         "vld1.8 d4, [%1], %3\n"
 154                         "vld1.8 d5, [%2], %3\n"
 155                         "vld1.8 d6, [%1]\n"
 156                         "vld1.8 d7, [%2]\n"
 157                         /* Store to the GPU in one shot, no interleave. */
 158                         "vstm %0, {q0, q1, q2, q3}\n"
 159                         :
 160                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
 161                         : "q0", "q1", "q2", "q3");
 162         }
 163 #else
 164         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
 165                 memcpy(gpu + gpu_offset, cpu, gpu_stride);
 166                 cpu += cpu_stride;
 167         }
 168 #endif
 169
 170 }
 171
 172 void
 173 NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
 174                             void *src, uint32_t src_stride,
 175                             int cpp, const struct pipe_box *box)
 176 {
 177         uint32_t utile_w = vc4_utile_width(cpp);
 178         uint32_t utile_h = vc4_utile_height(cpp);
 179         uint32_t xstart = box->x;
 180         uint32_t ystart = box->y;
 181
 182         for (uint32_t y = 0; y < box->height; y += utile_h) {
 183                 for (int x = 0; x < box->width; x += utile_w) {
 184                         vc4_load_utile(dst + (dst_stride * y +
 185                                               x * cpp),
 186                                        src + ((ystart + y) * src_stride +
 187                                               (xstart + x) * 64 / utile_w),
 188                                        dst_stride, cpp);
 189                 }
 190         }
 191 }
 192
 193 void
 194 NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
 195                              void *src, uint32_t src_stride,
 196                              int cpp, const struct pipe_box *box)
 197 {
 198         uint32_t utile_w = vc4_utile_width(cpp);
 199         uint32_t utile_h = vc4_utile_height(cpp);
 200         uint32_t xstart = box->x;
 201         uint32_t ystart = box->y;
 202
 203         for (uint32_t y = 0; y < box->height; y += utile_h) {
 204                 for (int x = 0; x < box->width; x += utile_w) {
 205                         vc4_store_utile(dst + ((ystart + y) * dst_stride +
 206                                                (xstart + x) * 64 / utile_w),
 207                                         src + (src_stride * y +
 208                                                x * cpp),
 209                                         src_stride, cpp);
 210                 }
 211         }
 212 }