/*
 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
29 #include "ir3_context.h"
30 #include "ir3_image.h"
/*
 * Handlers for instructions changed/added in a4xx:
 */
37 /* src[] = { buffer_index, offset }. No const_index */
39 emit_intrinsic_load_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
,
40 struct ir3_instruction
**dst
)
42 struct ir3_block
*b
= ctx
->block
;
43 struct ir3_instruction
*ldgb
, *src0
, *src1
, *byte_offset
, *offset
;
45 /* can this be non-const buffer_index? how do we handle that? */
46 int ibo_idx
= ir3_ssbo_to_ibo(ctx
->so
->shader
, nir_src_as_uint(intr
->src
[0]));
48 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
49 offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
51 /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
52 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
58 ldgb
= ir3_LDGB(b
, create_immed(b
, ibo_idx
), 0,
60 ldgb
->regs
[0]->wrmask
= MASK(intr
->num_components
);
61 ldgb
->cat6
.iim_val
= intr
->num_components
;
63 ldgb
->cat6
.type
= TYPE_U32
;
64 ldgb
->barrier_class
= IR3_BARRIER_BUFFER_R
;
65 ldgb
->barrier_conflict
= IR3_BARRIER_BUFFER_W
;
67 ir3_split_dest(b
, dst
, ldgb
, 0, intr
->num_components
);
70 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
72 emit_intrinsic_store_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
74 struct ir3_block
*b
= ctx
->block
;
75 struct ir3_instruction
*stgb
, *src0
, *src1
, *src2
, *byte_offset
, *offset
;
76 /* TODO handle wrmask properly, see _store_shared().. but I think
77 * it is more a PITA than that, since blob ends up loading the
78 * masked components and writing them back out.
80 unsigned wrmask
= intr
->const_index
[0];
81 unsigned ncomp
= ffs(~wrmask
) - 1;
83 /* can this be non-const buffer_index? how do we handle that? */
84 int ibo_idx
= ir3_ssbo_to_ibo(ctx
->so
->shader
, nir_src_as_uint(intr
->src
[1]));
86 byte_offset
= ir3_get_src(ctx
, &intr
->src
[2])[0];
87 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
89 /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
92 src0
= ir3_create_collect(ctx
, ir3_get_src(ctx
, &intr
->src
[0]), ncomp
);
94 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
99 stgb
= ir3_STGB(b
, create_immed(b
, ibo_idx
), 0, src0
, 0, src1
, 0, src2
, 0);
100 stgb
->cat6
.iim_val
= ncomp
;
102 stgb
->cat6
.type
= TYPE_U32
;
103 stgb
->barrier_class
= IR3_BARRIER_BUFFER_W
;
104 stgb
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
106 array_insert(b
, b
->keeps
, stgb
);
110 * SSBO atomic intrinsics
112 * All of the SSBO atomic memory operations read a value from memory,
113 * compute a new value using one of the operations below, write the new
114 * value to memory, and return the original value read.
116 * All operations take 3 sources except CompSwap that takes 4. These
119 * 0: The SSBO buffer index.
120 * 1: The offset into the SSBO buffer of the variable that the atomic
121 * operation will operate on.
122 * 2: The data parameter to the atomic function (i.e. the value to add
123 * in ssbo_atomic_add, etc).
124 * 3: For CompSwap only: the second data parameter.
126 static struct ir3_instruction
*
127 emit_intrinsic_atomic_ssbo(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
129 struct ir3_block
*b
= ctx
->block
;
130 struct ir3_instruction
*atomic
, *ssbo
, *src0
, *src1
, *src2
, *byte_offset
,
132 type_t type
= TYPE_U32
;
134 /* can this be non-const buffer_index? how do we handle that? */
135 int ibo_idx
= ir3_ssbo_to_ibo(ctx
->so
->shader
, nir_src_as_uint(intr
->src
[0]));
136 ssbo
= create_immed(b
, ibo_idx
);
138 byte_offset
= ir3_get_src(ctx
, &intr
->src
[1])[0];
139 offset
= ir3_get_src(ctx
, &intr
->src
[3])[0];
141 /* src0 is data (or uvec2(data, compare))
143 * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
145 * Note that nir already multiplies the offset by four
147 src0
= ir3_get_src(ctx
, &intr
->src
[2])[0];
149 src2
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
154 switch (intr
->intrinsic
) {
155 case nir_intrinsic_ssbo_atomic_add_ir3
:
156 atomic
= ir3_ATOMIC_ADD_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
158 case nir_intrinsic_ssbo_atomic_imin_ir3
:
159 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
162 case nir_intrinsic_ssbo_atomic_umin_ir3
:
163 atomic
= ir3_ATOMIC_MIN_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
165 case nir_intrinsic_ssbo_atomic_imax_ir3
:
166 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
169 case nir_intrinsic_ssbo_atomic_umax_ir3
:
170 atomic
= ir3_ATOMIC_MAX_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
172 case nir_intrinsic_ssbo_atomic_and_ir3
:
173 atomic
= ir3_ATOMIC_AND_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
175 case nir_intrinsic_ssbo_atomic_or_ir3
:
176 atomic
= ir3_ATOMIC_OR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
178 case nir_intrinsic_ssbo_atomic_xor_ir3
:
179 atomic
= ir3_ATOMIC_XOR_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
181 case nir_intrinsic_ssbo_atomic_exchange_ir3
:
182 atomic
= ir3_ATOMIC_XCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
184 case nir_intrinsic_ssbo_atomic_comp_swap_ir3
:
185 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
186 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
187 ir3_get_src(ctx
, &intr
->src
[3])[0],
190 src1
= ir3_get_src(ctx
, &intr
->src
[4])[0];
191 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, ssbo
, 0, src0
, 0, src1
, 0, src2
, 0);
197 atomic
->cat6
.iim_val
= 1;
199 atomic
->cat6
.type
= type
;
200 atomic
->barrier_class
= IR3_BARRIER_BUFFER_W
;
201 atomic
->barrier_conflict
= IR3_BARRIER_BUFFER_R
| IR3_BARRIER_BUFFER_W
;
203 /* even if nothing consume the result, we can't DCE the instruction: */
204 array_insert(b
, b
->keeps
, atomic
);
209 static struct ir3_instruction
*
210 get_image_offset(struct ir3_context
*ctx
, const nir_intrinsic_instr
*instr
,
211 struct ir3_instruction
* const *coords
, bool byteoff
)
213 struct ir3_block
*b
= ctx
->block
;
214 struct ir3_instruction
*offset
;
215 unsigned index
= nir_src_as_uint(instr
->src
[0]);
216 unsigned ncoords
= ir3_get_image_coords(instr
, NULL
);
218 /* to calculate the byte offset (yes, uggg) we need (up to) three
219 * const values to know the bytes per pixel, and y and z stride:
221 struct ir3_const_state
*const_state
= &ctx
->so
->shader
->const_state
;
222 unsigned cb
= regid(const_state
->offsets
.image_dims
, 0) +
223 const_state
->image_dims
.off
[index
];
225 debug_assert(const_state
->image_dims
.mask
& (1 << index
));
227 /* offset = coords.x * bytes_per_pixel: */
228 offset
= ir3_MUL_S24(b
, coords
[0], 0, create_uniform(b
, cb
+ 0), 0);
230 /* offset += coords.y * y_pitch: */
231 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 1), 0,
232 coords
[1], 0, offset
, 0);
235 /* offset += coords.z * z_pitch: */
236 offset
= ir3_MAD_S24(b
, create_uniform(b
, cb
+ 2), 0,
237 coords
[2], 0, offset
, 0);
241 /* Some cases, like atomics, seem to use dword offset instead
242 * of byte offsets.. blob just puts an extra shr.b in there
245 offset
= ir3_SHR_B(b
, offset
, 0, create_immed(b
, 2), 0);
248 return ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
254 /* src[] = { index, coord, sample_index, value }. const_index[] = {} */
256 emit_intrinsic_store_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
258 struct ir3_block
*b
= ctx
->block
;
259 struct ir3_instruction
*stib
, *offset
;
260 struct ir3_instruction
* const *value
= ir3_get_src(ctx
, &intr
->src
[3]);
261 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
262 unsigned ncoords
= ir3_get_image_coords(intr
, NULL
);
263 unsigned slot
= nir_src_as_uint(intr
->src
[0]);
264 unsigned ibo_idx
= ir3_image_to_ibo(ctx
->so
->shader
, slot
);
265 unsigned ncomp
= ir3_get_num_components_for_image_format(nir_intrinsic_format(intr
));
269 * src2 is 64b byte offset
272 offset
= get_image_offset(ctx
, intr
, coords
, true);
274 /* NOTE: stib seems to take byte offset, but stgb.typed can be used
275 * too and takes a dword offset.. not quite sure yet why blob uses
276 * one over the other in various cases.
279 stib
= ir3_STIB(b
, create_immed(b
, ibo_idx
), 0,
280 ir3_create_collect(ctx
, value
, ncomp
), 0,
281 ir3_create_collect(ctx
, coords
, ncoords
), 0,
283 stib
->cat6
.iim_val
= ncomp
;
284 stib
->cat6
.d
= ncoords
;
285 stib
->cat6
.type
= ir3_get_type_for_image_intrinsic(intr
);
286 stib
->cat6
.typed
= true;
287 stib
->barrier_class
= IR3_BARRIER_IMAGE_W
;
288 stib
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
290 array_insert(b
, b
->keeps
, stib
);
293 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
294 static struct ir3_instruction
*
295 emit_intrinsic_atomic_image(struct ir3_context
*ctx
, nir_intrinsic_instr
*intr
)
297 struct ir3_block
*b
= ctx
->block
;
298 struct ir3_instruction
*atomic
, *image
, *src0
, *src1
, *src2
;
299 struct ir3_instruction
* const *coords
= ir3_get_src(ctx
, &intr
->src
[1]);
300 unsigned ncoords
= ir3_get_image_coords(intr
, NULL
);
301 unsigned slot
= nir_src_as_uint(intr
->src
[0]);
302 unsigned ibo_idx
= ir3_image_to_ibo(ctx
->so
->shader
, slot
);
304 image
= create_immed(b
, ibo_idx
);
306 /* src0 is value (or uvec2(value, compare))
308 * src2 is 64b byte offset
310 src0
= ir3_get_src(ctx
, &intr
->src
[3])[0];
311 src1
= ir3_create_collect(ctx
, coords
, ncoords
);
312 src2
= get_image_offset(ctx
, intr
, coords
, false);
314 switch (intr
->intrinsic
) {
315 case nir_intrinsic_image_atomic_add
:
316 atomic
= ir3_ATOMIC_ADD_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
318 case nir_intrinsic_image_atomic_imin
:
319 case nir_intrinsic_image_atomic_umin
:
320 atomic
= ir3_ATOMIC_MIN_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
322 case nir_intrinsic_image_atomic_imax
:
323 case nir_intrinsic_image_atomic_umax
:
324 atomic
= ir3_ATOMIC_MAX_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
326 case nir_intrinsic_image_atomic_and
:
327 atomic
= ir3_ATOMIC_AND_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
329 case nir_intrinsic_image_atomic_or
:
330 atomic
= ir3_ATOMIC_OR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
332 case nir_intrinsic_image_atomic_xor
:
333 atomic
= ir3_ATOMIC_XOR_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
335 case nir_intrinsic_image_atomic_exchange
:
336 atomic
= ir3_ATOMIC_XCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
338 case nir_intrinsic_image_atomic_comp_swap
:
339 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
340 src0
= ir3_create_collect(ctx
, (struct ir3_instruction
*[]){
341 ir3_get_src(ctx
, &intr
->src
[4])[0],
344 atomic
= ir3_ATOMIC_CMPXCHG_G(b
, image
, 0, src0
, 0, src1
, 0, src2
, 0);
350 atomic
->cat6
.iim_val
= 1;
351 atomic
->cat6
.d
= ncoords
;
352 atomic
->cat6
.type
= ir3_get_type_for_image_intrinsic(intr
);
353 atomic
->cat6
.typed
= true;
354 atomic
->barrier_class
= IR3_BARRIER_IMAGE_W
;
355 atomic
->barrier_conflict
= IR3_BARRIER_IMAGE_R
| IR3_BARRIER_IMAGE_W
;
357 /* even if nothing consume the result, we can't DCE the instruction: */
358 array_insert(b
, b
->keeps
, atomic
);
363 const struct ir3_context_funcs ir3_a4xx_funcs
= {
364 .emit_intrinsic_load_ssbo
= emit_intrinsic_load_ssbo
,
365 .emit_intrinsic_store_ssbo
= emit_intrinsic_store_ssbo
,
366 .emit_intrinsic_atomic_ssbo
= emit_intrinsic_atomic_ssbo
,
367 .emit_intrinsic_store_image
= emit_intrinsic_store_image
,
368 .emit_intrinsic_atomic_image
= emit_intrinsic_atomic_image
,