/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The layout of
 * bytes within a utile is the same on both, but the way utiles are arranged
 * within an image has changed between the two.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
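
/* A note on geometry (inferred from the helpers below rather than stated in
 * hardware documentation): a utile is always 64 bytes of GPU-side data.
 * With gpu_stride == 8 it is handled as 8 rows of 8 bytes; with
 * gpu_stride == 16 it is 4 rows of 16 bytes.  cpu_stride is the pitch of the
 * linear CPU-side buffer that rows are scattered to or gathered from.
 */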

static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%1], %2\n"
                        "vst1.8 d1, [%1], %2\n"
                        "vst1.8 d2, [%1], %2\n"
                        "vst1.8 d3, [%1], %2\n"
                        "vst1.8 d4, [%1], %2\n"
                        "vst1.8 d5, [%1], %2\n"
                        "vst1.8 d6, [%1], %2\n"
                        "vst1.8 d7, [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%1], %3\n"
                        "vst1.8 d1, [%2], %3\n"
                        "vst1.8 d2, [%1], %3\n"
                        "vst1.8 d3, [%2], %3\n"
                        "vst1.8 d4, [%1], %3\n"
                        "vst1.8 d5, [%2], %3\n"
                        "vst1.8 d6, [%1]\n"
                        "vst1.8 d7, [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%1], %2\n"
                        "st1 {v0.D}[1], [%1], %2\n"
                        "st1 {v1.D}[0], [%1], %2\n"
                        "st1 {v1.D}[1], [%1], %2\n"
                        "st1 {v2.D}[0], [%1], %2\n"
                        "st1 {v2.D}[1], [%1], %2\n"
                        "st1 {v3.D}[0], [%1], %2\n"
                        "st1 {v3.D}[1], [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (The st1 lane form stores one 64-bit
                         * lane at a time.)
                         */
                        "st1 {v0.D}[0], [%1], %3\n"
                        "st1 {v0.D}[1], [%2], %3\n"
                        "st1 {v1.D}[0], [%1], %3\n"
                        "st1 {v1.D}[1], [%2], %3\n"
                        "st1 {v2.D}[0], [%1], %3\n"
                        "st1 {v2.D}[1], [%2], %3\n"
                        "st1 {v3.D}[0], [%1]\n"
                        "st1 {v3.D}[1], [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
#else
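        /* Generic C fallback: the utile is 64 contiguous bytes on the GPU
         * side, copied one gpu_stride-sized row at a time into the linear
         * CPU buffer.
         */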
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}

static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%1], %2\n"
                        "vld1.8 d1, [%1], %2\n"
                        "vld1.8 d2, [%1], %2\n"
                        "vld1.8 d3, [%1], %2\n"
                        "vld1.8 d4, [%1], %2\n"
                        "vld1.8 d5, [%1], %2\n"
                        "vld1.8 d6, [%1], %2\n"
                        "vld1.8 d7, [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%1], %3\n"
                        "vld1.8 d1, [%2], %3\n"
                        "vld1.8 d2, [%1], %3\n"
                        "vld1.8 d3, [%2], %3\n"
                        "vld1.8 d4, [%1], %3\n"
                        "vld1.8 d5, [%2], %3\n"
                        "vld1.8 d6, [%1]\n"
                        "vld1.8 d7, [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%1], %2\n"
                        "ld1 {v0.D}[1], [%1], %2\n"
                        "ld1 {v1.D}[0], [%1], %2\n"
                        "ld1 {v1.D}[1], [%1], %2\n"
                        "ld1 {v2.D}[0], [%1], %2\n"
                        "ld1 {v2.D}[1], [%1], %2\n"
                        "ld1 {v3.D}[0], [%1], %2\n"
                        "ld1 {v3.D}[1], [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (The ld1 lane form loads one 64-bit lane
                         * at a time.)
                         */
                        "ld1 {v0.D}[0], [%1], %3\n"
                        "ld1 {v0.D}[1], [%2], %3\n"
                        "ld1 {v1.D}[0], [%1], %3\n"
                        "ld1 {v1.D}[1], [%2], %3\n"
                        "ld1 {v2.D}[0], [%1], %3\n"
                        "ld1 {v2.D}[1], [%2], %3\n"
                        "ld1 {v3.D}[0], [%1]\n"
                        "ld1 {v3.D}[1], [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
#else
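        /* Generic C fallback: gather one gpu_stride-sized row at a time from
         * the linear CPU buffer and write it into the 64 contiguous bytes of
         * the GPU-side utile.
         */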
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}
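
/* A minimal usage sketch (illustrative only, not part of the upstream
 * header): round-trip a single 64-byte utile through a linear staging
 * buffer and back.  It assumes an 8-byte GPU stride (8 rows of 8 bytes)
 * and a 128-byte CPU pitch; how utiles are arranged within a whole image
 * differs between vc4 and v3d and is not covered here.
 */
static inline void
v3d_utile_roundtrip_example(void *gpu_utile)
{
        uint8_t linear[8 * 128];

        /* Scatter the tiled utile's rows into the linear buffer... */
        v3d_load_utile(linear, 128, gpu_utile, 8);
        /* ...and gather them back into the same tiled location. */
        v3d_store_utile(gpu_utile, 8, linear, 128);
}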