vc4: Use named parameters for the NEON inline asm.
[mesa.git] / src / broadcom / common / v3d_cpu_tiling.h
/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains utile load/store functions shared by v3d and vc4.  The layout of
 * bytes within a single 64-byte utile is the same on both; what changed
 * between them is how utiles are arranged within a surface.
 */

static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu2] "r"(cpu + 8),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 can only store one d-register
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu2] "r"(cpu + 8),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}

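/* Illustrative sketch only, not part of the original header: one way a
 * caller might use v3d_load_utile() to detile a horizontal row of utiles
 * into a linear CPU image.  It assumes an LT-style layout in which
 * consecutive utiles occupy consecutive 64-byte blocks of the GPU mapping
 * and each utile row spans gpu_stride bytes of the linear image; real
 * callers derive the utile address from the surface's tiling mode.
 */
static inline void
v3d_load_utile_row_sketch(void *cpu, uint32_t cpu_stride,
                          void *gpu, uint32_t gpu_stride,
                          uint32_t num_utiles)
{
        for (uint32_t i = 0; i < num_utiles; i++) {
                /* Each utile is 64 contiguous bytes on the GPU side and
                 * gpu_stride bytes wide in the linear CPU image.
                 */
                v3d_load_utile(cpu + i * gpu_stride, cpu_stride,
                               gpu + i * 64, gpu_stride);
        }
}
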
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu2] "r"(cpu + 8),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 can only load one d-register
                         * at a time).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        :
                        : [gpu] "r"(gpu),
                          [cpu] "r"(cpu),
                          [cpu2] "r"(cpu + 8),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
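
/* Illustrative sketch only, not part of the original header: the inverse of
 * the detiling example above, tiling a row of utiles from a linear CPU image
 * back into the GPU mapping under the same assumed LT-style layout (64
 * contiguous bytes per utile, raster order).
 */
static inline void
v3d_store_utile_row_sketch(void *gpu, uint32_t gpu_stride,
                           void *cpu, uint32_t cpu_stride,
                           uint32_t num_utiles)
{
        for (uint32_t i = 0; i < num_utiles; i++) {
                v3d_store_utile(gpu + i * 64, gpu_stride,
                                cpu + i * gpu_stride, cpu_stride);
        }
}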