src/gallium/drivers/vc4/vc4_tiling_lt.c

   1 /*
   2  * Copyright © 2017 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
/** @file vc4_tiling_lt.c
 *
 * Helper functions from vc4_tiling.c that can be compiled either with or
 * without NEON assembly.
 *
 * If VC4_BUILD_NEON is set, the functions are suffixed with _neon; otherwise
 * they are suffixed with _base.  NEON assembly is only used if __ARM_ARCH is
 * also set, to keep the x86 sim build working.
 */
  33
  34 #include <string.h>
  35 #include "pipe/p_state.h"
  36 #include "vc4_tiling.h"
  37
  38 #ifdef VC4_BUILD_NEON
  39 #define NEON_TAG(x) x ## _neon
  40 #else
  41 #define NEON_TAG(x) x ## _base
  42 #endif
  43
  44 /** Returns the stride in bytes of a 64-byte microtile. */
  45 static uint32_t
  46 vc4_utile_stride(int cpp)
  47 {
  48         switch (cpp) {
  49         case 1:
  50                 return 8;
  51         case 2:
  52         case 4:
  53         case 8:
  54                 return 16;
  55         default:
  56                 unreachable("bad cpp");
  57         }
  58 }
  59
  60 static void
  61 vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
  62 {
  63         uint32_t gpu_stride = vc4_utile_stride(cpp);
  64 #if defined(VC4_BUILD_NEON) && defined(__ARM_ARCH)
  65         if (gpu_stride == 8) {
  66                 __asm__ volatile (
  67                         /* Load from the GPU in one shot, no interleave, to
  68                          * d0-d7.
  69                          */
  70                         "vldm %0, {q0, q1, q2, q3};\n"
  71                         /* Store each 8-byte line to cpu-side destination,
  72                          * incrementing it by the stride each time.
  73                          */
  74                         "vst1.8 d0, [%1], %r2;\n"
  75                         "vst1.8 d1, [%1], %r2;\n"
  76                         "vst1.8 d2, [%1], %r2;\n"
  77                         "vst1.8 d3, [%1], %r2;\n"
  78                         "vst1.8 d4, [%1], %r2;\n"
  79                         "vst1.8 d5, [%1], %r2;\n"
  80                         "vst1.8 d6, [%1], %r2;\n"
  81                         "vst1.8 d7, [%1];\n"
  82                         :
  83                         : "r"(gpu), "r"(cpu), "r"(cpu_stride)
  84                         : "q0", "q1", "q2", "q3");
  85         } else {
  86                 assert(gpu_stride == 16);
  87                 __asm__ volatile (
  88                         /* Load from the GPU in one shot, no interleave, to
  89                          * d0-d7.
  90                          */
  91                         "vldm %0, {q0, q1, q2, q3};\n"
  92                         /* Store each 16-byte line in 2 parts to the cpu-side
  93                          * destination.  (vld1 can only store one d-register
  94                          * at a time).
  95                          */
  96                         "vst1.8 d0, [%1], %r3;\n"
  97                         "vst1.8 d1, [%2], %r3;\n"
  98                         "vst1.8 d2, [%1], %r3;\n"
  99                         "vst1.8 d3, [%2], %r3;\n"
 100                         "vst1.8 d4, [%1], %r3;\n"
 101                         "vst1.8 d5, [%2], %r3;\n"
 102                         "vst1.8 d6, [%1];\n"
 103                         "vst1.8 d7, [%2];\n"
 104                         :
 105                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
 106                         : "q0", "q1", "q2", "q3");
 107         }
 108 #else
 109         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
 110                 memcpy(cpu, gpu + gpu_offset, gpu_stride);
 111                 cpu += cpu_stride;
 112         }
 113 #endif
 114 }
 115
 116 static void
 117 vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp)
 118 {
 119         uint32_t dst_stride = vc4_utile_stride(cpp);
 120
 121         for (uint32_t dst_offset = 0; dst_offset < 64; dst_offset += dst_stride) {
 122                 memcpy(dst + dst_offset, src, dst_stride);
 123                 src += src_stride;
 124         }
 125 }
 126
 127 void
 128 NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
 129                             void *src, uint32_t src_stride,
 130                             int cpp, const struct pipe_box *box)
 131 {
 132         uint32_t utile_w = vc4_utile_width(cpp);
 133         uint32_t utile_h = vc4_utile_height(cpp);
 134         uint32_t xstart = box->x;
 135         uint32_t ystart = box->y;
 136
 137         for (uint32_t y = 0; y < box->height; y += utile_h) {
 138                 for (int x = 0; x < box->width; x += utile_w) {
 139                         vc4_load_utile(dst + (dst_stride * y +
 140                                               x * cpp),
 141                                        src + ((ystart + y) * src_stride +
 142                                               (xstart + x) * 64 / utile_w),
 143                                        dst_stride, cpp);
 144                 }
 145         }
 146 }
 147
 148 void
 149 NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
 150                              void *src, uint32_t src_stride,
 151                              int cpp, const struct pipe_box *box)
 152 {
 153         uint32_t utile_w = vc4_utile_width(cpp);
 154         uint32_t utile_h = vc4_utile_height(cpp);
 155         uint32_t xstart = box->x;
 156         uint32_t ystart = box->y;
 157
 158         for (uint32_t y = 0; y < box->height; y += utile_h) {
 159                 for (int x = 0; x < box->width; x += utile_w) {
 160                         vc4_store_utile(dst + ((ystart + y) * dst_stride +
 161                                                (xstart + x) * 64 / utile_w),
 162                                         src + (src_stride * y +
 163                                                x * cpp),
 164                                         src_stride, cpp);
 165                 }
 166         }
 167 }