/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The layout
 * within a utile stayed the same, though the way utiles are laid out across
 * the surface has changed.
 */

#include <stdint.h>
#include <string.h>

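/* Illustrative sketch (a hypothetical helper, not part of the v3d/vc4 code):
 * a utile is always 64 bytes, and the gpu_stride passed to the functions
 * below is the byte width of one utile row.  This assumes the vc4-style
 * utile dimensions of 8x8 pixels at 8bpp, 8x4 at 16bpp and 4x4 at 32bpp;
 * the real drivers derive the row width from their own utile width/height
 * helpers.
 */
static inline uint32_t
v3d_example_utile_row_bytes(uint32_t cpp)
{
        /* 8-byte rows for 8bpp utiles, 16-byte rows for everything wider. */
        return cpp == 1 ? 8 : 16;
}
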
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%1], %2\n"
                        "vst1.8 d1, [%1], %2\n"
                        "vst1.8 d2, [%1], %2\n"
                        "vst1.8 d3, [%1], %2\n"
                        "vst1.8 d4, [%1], %2\n"
                        "vst1.8 d5, [%1], %2\n"
                        "vst1.8 d6, [%1], %2\n"
                        "vst1.8 d7, [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%1], %3\n"
                        "vst1.8 d1, [%2], %3\n"
                        "vst1.8 d2, [%1], %3\n"
                        "vst1.8 d3, [%2], %3\n"
                        "vst1.8 d4, [%1], %3\n"
                        "vst1.8 d5, [%2], %3\n"
                        "vst1.8 d6, [%1]\n"
                        "vst1.8 d7, [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%1], %2\n"
                        "st1 {v0.D}[1], [%1], %2\n"
                        "st1 {v1.D}[0], [%1], %2\n"
                        "st1 {v1.D}[1], [%1], %2\n"
                        "st1 {v2.D}[0], [%1], %2\n"
                        "st1 {v2.D}[1], [%1], %2\n"
                        "st1 {v3.D}[0], [%1], %2\n"
                        "st1 {v3.D}[1], [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 can only store one 64-bit lane
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%1], %3\n"
                        "st1 {v0.D}[1], [%2], %3\n"
                        "st1 {v1.D}[0], [%1], %3\n"
                        "st1 {v1.D}[1], [%2], %3\n"
                        "st1 {v2.D}[0], [%1], %3\n"
                        "st1 {v2.D}[1], [%2], %3\n"
                        "st1 {v3.D}[0], [%1]\n"
                        "st1 {v3.D}[1], [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
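
/* Usage sketch (a hypothetical caller, not part of the v3d/vc4 code):
 * detiling a single 64-byte utile of an 8bpp surface into a linear staging
 * buffer.  The names, and the assumption that tiled_utile already points at
 * the first byte of the utile, are made up for illustration; real callers
 * compute that offset from the surface's tiling layout.
 */
static inline void
example_load_8bpp_utile(void *linear, uint32_t linear_stride,
                        void *tiled_utile)
{
        /* An 8bpp utile is 8x8 pixels, so each utile row is 8 bytes. */
        v3d_load_utile(linear, linear_stride, tiled_utile, 8);
}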

static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%1], %2\n"
                        "vld1.8 d1, [%1], %2\n"
                        "vld1.8 d2, [%1], %2\n"
                        "vld1.8 d3, [%1], %2\n"
                        "vld1.8 d4, [%1], %2\n"
                        "vld1.8 d5, [%1], %2\n"
                        "vld1.8 d6, [%1], %2\n"
                        "vld1.8 d7, [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%1], %3\n"
                        "vld1.8 d1, [%2], %3\n"
                        "vld1.8 d2, [%1], %3\n"
                        "vld1.8 d3, [%2], %3\n"
                        "vld1.8 d4, [%1], %3\n"
                        "vld1.8 d5, [%2], %3\n"
                        "vld1.8 d6, [%1]\n"
                        "vld1.8 d7, [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%1], %2\n"
                        "ld1 {v0.D}[1], [%1], %2\n"
                        "ld1 {v1.D}[0], [%1], %2\n"
                        "ld1 {v1.D}[1], [%1], %2\n"
                        "ld1 {v2.D}[0], [%1], %2\n"
                        "ld1 {v2.D}[1], [%1], %2\n"
                        "ld1 {v3.D}[0], [%1], %2\n"
                        "ld1 {v3.D}[1], [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 can only load one 64-bit lane
                         * at a time).
                         */
                        "ld1 {v0.D}[0], [%1], %3\n"
                        "ld1 {v0.D}[1], [%2], %3\n"
                        "ld1 {v1.D}[0], [%1], %3\n"
                        "ld1 {v1.D}[1], [%2], %3\n"
                        "ld1 {v2.D}[0], [%1], %3\n"
                        "ld1 {v2.D}[1], [%2], %3\n"
                        "ld1 {v3.D}[0], [%1]\n"
                        "ld1 {v3.D}[1], [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
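
/* Usage sketch (a hypothetical caller, not part of the v3d/vc4 code): tiling
 * one 4x4-pixel utile of a 32bpp surface back into the GPU buffer.  As in the
 * sketch above, tiled_utile is assumed to already point at the destination
 * utile; the per-driver tiling code decides where each utile lives within a
 * surface.
 */
static inline void
example_store_32bpp_utile(void *tiled_utile,
                          void *linear, uint32_t linear_stride)
{
        /* A 32bpp utile is 4x4 pixels, so each utile row is 16 bytes. */
        v3d_store_utile(tiled_utile, 16, linear, linear_stride);
}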