/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The layout
 * within a utile stayed the same, though the way utiles are laid out within
 * a surface has changed.
 */
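
/* A utile here is 64 bytes of GPU memory: gpu_stride is the utile's row
 * pitch in bytes (8 or 16 for the NEON fast paths below), and cpu_stride is
 * the row pitch of the raster-order CPU copy.
 */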

static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 with a lane index stores one
                         * 64-bit lane at a time).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

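        /* Fallback: copy the 64-byte utile one gpu_stride-sized row at a time. */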
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}

static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register at a
                         * time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 with a lane index loads one 64-bit
                         * lane at a time).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

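        /* Fallback: gather the raster-order CPU rows back into the 64-byte utile. */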
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
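
/* Illustrative sketch, not part of the upstream header: one way a caller
 * might use v3d_load_utile() to detile a single utile.  The function name,
 * buffer names, and the 16-byte-row / 4x4-pixel geometry are assumptions
 * made for this example only.
 */
#if 0
static void
example_detile_one_utile(void *linear, uint32_t linear_stride, void *tiled)
{
        /* With a 16-byte utile row pitch (e.g. 4 bytes per pixel, 4 pixels
         * wide), this copies the 64-byte utile into four raster-order rows,
         * each linear_stride bytes apart in the linear buffer.
         */
        v3d_load_utile(linear, linear_stride, tiled, 16);
}
#endif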