src/freedreno/ir3/ir3_a4xx.c

   1 /*
   2  * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors:
  24  *    Rob Clark <robclark@freedesktop.org>
  25  */
  26
  27 #define GPU 400
  28
  29 #include "ir3_context.h"
  30 #include "ir3_image.h"
  31
  32 /*
  33  * Handlers for instructions changed/added in a4xx:
  34  */
  35
  36
  37 /* src[] = { buffer_index, offset }. No const_index */
  38 static void
  39 emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
  40                 struct ir3_instruction **dst)
  41 {
  42         struct ir3_block *b = ctx->block;
  43         struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset;
  44
  45         struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
  46
  47         byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
  48         offset = ir3_get_src(ctx, &intr->src[2])[0];
  49
  50         /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
  51         src0 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
  52                 byte_offset,
  53                 create_immed(b, 0),
  54         }, 2);
  55         src1 = offset;
  56
  57         ldgb = ir3_LDGB(b, ssbo, 0,
  58                         src0, 0, src1, 0);
  59         ldgb->regs[0]->wrmask = MASK(intr->num_components);
  60         ldgb->cat6.iim_val = intr->num_components;
  61         ldgb->cat6.d = 4;
  62         ldgb->cat6.type = TYPE_U32;
  63         ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
  64         ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
  65
  66         ir3_split_dest(b, dst, ldgb, 0, intr->num_components);
  67 }
  68
  69 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
  70 static void
  71 emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
  72 {
  73         struct ir3_block *b = ctx->block;
  74         struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
  75         unsigned wrmask = nir_intrinsic_write_mask(intr);
  76         unsigned ncomp = ffs(~wrmask) - 1;
  77
  78         assert(wrmask == BITFIELD_MASK(intr->num_components));
  79
  80         struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]);
  81
  82         byte_offset = ir3_get_src(ctx, &intr->src[2])[0];
  83         offset = ir3_get_src(ctx, &intr->src[3])[0];
  84
  85         /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
  86          * nir already *= 4:
  87          */
  88         src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
  89         src1 = offset;
  90         src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
  91                 byte_offset,
  92                 create_immed(b, 0),
  93         }, 2);
  94
  95         stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
  96         stgb->cat6.iim_val = ncomp;
  97         stgb->cat6.d = 4;
  98         stgb->cat6.type = TYPE_U32;
  99         stgb->barrier_class = IR3_BARRIER_BUFFER_W;
 100         stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
 101
 102         array_insert(b, b->keeps, stgb);
 103 }
 104
 105 /*
 106  * SSBO atomic intrinsics
 107  *
 108  * All of the SSBO atomic memory operations read a value from memory,
 109  * compute a new value using one of the operations below, write the new
 110  * value to memory, and return the original value read.
 111  *
 112  * All operations take 3 sources except CompSwap that takes 4. These
 113  * sources represent:
 114  *
 115  * 0: The SSBO buffer index.
 116  * 1: The offset into the SSBO buffer of the variable that the atomic
 117  *    operation will operate on.
 118  * 2: The data parameter to the atomic function (i.e. the value to add
 119  *    in ssbo_atomic_add, etc).
 120  * 3: For CompSwap only: the second data parameter.
 121  */
 122 static struct ir3_instruction *
 123 emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 124 {
 125         struct ir3_block *b = ctx->block;
 126         struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *byte_offset,
 127                 *offset;
 128         type_t type = TYPE_U32;
 129
 130         ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
 131
 132         byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
 133         offset = ir3_get_src(ctx, &intr->src[3])[0];
 134
 135         /* src0 is data (or uvec2(data, compare))
 136          * src1 is offset
 137          * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
 138          *
 139          * Note that nir already multiplies the offset by four
 140          */
 141         src0 = ir3_get_src(ctx, &intr->src[2])[0];
 142         src1 = offset;
 143         src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
 144                 byte_offset,
 145                 create_immed(b, 0),
 146         }, 2);
 147
 148         switch (intr->intrinsic) {
 149         case nir_intrinsic_ssbo_atomic_add_ir3:
 150                 atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 151                 break;
 152         case nir_intrinsic_ssbo_atomic_imin_ir3:
 153                 atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 154                 type = TYPE_S32;
 155                 break;
 156         case nir_intrinsic_ssbo_atomic_umin_ir3:
 157                 atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 158                 break;
 159         case nir_intrinsic_ssbo_atomic_imax_ir3:
 160                 atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 161                 type = TYPE_S32;
 162                 break;
 163         case nir_intrinsic_ssbo_atomic_umax_ir3:
 164                 atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 165                 break;
 166         case nir_intrinsic_ssbo_atomic_and_ir3:
 167                 atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 168                 break;
 169         case nir_intrinsic_ssbo_atomic_or_ir3:
 170                 atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 171                 break;
 172         case nir_intrinsic_ssbo_atomic_xor_ir3:
 173                 atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 174                 break;
 175         case nir_intrinsic_ssbo_atomic_exchange_ir3:
 176                 atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 177                 break;
 178         case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
 179                 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
 180                 src0 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
 181                         ir3_get_src(ctx, &intr->src[3])[0],
 182                         src0,
 183                 }, 2);
 184                 src1 = ir3_get_src(ctx, &intr->src[4])[0];
 185                 atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
 186                 break;
 187         default:
 188                 unreachable("boo");
 189         }
 190
 191         atomic->cat6.iim_val = 1;
 192         atomic->cat6.d = 4;
 193         atomic->cat6.type = type;
 194         atomic->barrier_class = IR3_BARRIER_BUFFER_W;
 195         atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
 196
 197         /* even if nothing consume the result, we can't DCE the instruction: */
 198         array_insert(b, b->keeps, atomic);
 199
 200         return atomic;
 201 }
 202
 203 static struct ir3_instruction *
 204 get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr,
 205                 struct ir3_instruction * const *coords, bool byteoff)
 206 {
 207         struct ir3_block *b = ctx->block;
 208         struct ir3_instruction *offset;
 209         unsigned index = nir_src_as_uint(instr->src[0]);
 210         unsigned ncoords = ir3_get_image_coords(instr, NULL);
 211
 212         /* to calculate the byte offset (yes, uggg) we need (up to) three
 213          * const values to know the bytes per pixel, and y and z stride:
 214          */
 215         const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
 216         unsigned cb = regid(const_state->offsets.image_dims, 0) +
 217                 const_state->image_dims.off[index];
 218
 219         debug_assert(const_state->image_dims.mask & (1 << index));
 220
 221         /* offset = coords.x * bytes_per_pixel: */
 222         offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0);
 223         if (ncoords > 1) {
 224                 /* offset += coords.y * y_pitch: */
 225                 offset = ir3_MAD_S24(b, create_uniform(b, cb + 1), 0,
 226                                 coords[1], 0, offset, 0);
 227         }
 228         if (ncoords > 2) {
 229                 /* offset += coords.z * z_pitch: */
 230                 offset = ir3_MAD_S24(b, create_uniform(b, cb + 2), 0,
 231                                 coords[2], 0, offset, 0);
 232         }
 233
 234         if (!byteoff) {
 235                 /* Some cases, like atomics, seem to use dword offset instead
 236                  * of byte offsets.. blob just puts an extra shr.b in there
 237                  * in those cases:
 238                  */
 239                 offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
 240         }
 241
 242         return ir3_create_collect(ctx, (struct ir3_instruction*[]){
 243                 offset,
 244                 create_immed(b, 0),
 245         }, 2);
 246 }
 247
 248 /* src[] = { index, coord, sample_index, value }. const_index[] = {} */
 249 static void
 250 emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 251 {
 252         struct ir3_block *b = ctx->block;
 253         struct ir3_instruction *stib, *offset;
 254         struct ir3_instruction * const *value = ir3_get_src(ctx, &intr->src[3]);
 255         struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
 256         struct ir3_instruction * ibo = ir3_image_to_ibo(ctx, intr->src[0]);
 257         unsigned ncoords = ir3_get_image_coords(intr, NULL);
 258         unsigned ncomp = ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));
 259
 260         /* src0 is value
 261          * src1 is coords
 262          * src2 is 64b byte offset
 263          */
 264
 265         offset = get_image_offset(ctx, intr, coords, true);
 266
 267         /* NOTE: stib seems to take byte offset, but stgb.typed can be used
 268          * too and takes a dword offset.. not quite sure yet why blob uses
 269          * one over the other in various cases.
 270          */
 271
 272         stib = ir3_STIB(b, ibo, 0,
 273                         ir3_create_collect(ctx, value, ncomp), 0,
 274                         ir3_create_collect(ctx, coords, ncoords), 0,
 275                         offset, 0);
 276         stib->cat6.iim_val = ncomp;
 277         stib->cat6.d = ncoords;
 278         stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
 279         stib->cat6.typed = true;
 280         stib->barrier_class = IR3_BARRIER_IMAGE_W;
 281         stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
 282
 283         array_insert(b, b->keeps, stib);
 284 }
 285
 286 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
 287 static struct ir3_instruction *
 288 emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 289 {
 290         struct ir3_block *b = ctx->block;
 291         struct ir3_instruction *atomic, *src0, *src1, *src2;
 292         struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
 293         struct ir3_instruction * image = ir3_image_to_ibo(ctx, intr->src[0]);
 294         unsigned ncoords = ir3_get_image_coords(intr, NULL);
 295
 296         /* src0 is value (or uvec2(value, compare))
 297          * src1 is coords
 298          * src2 is 64b byte offset
 299          */
 300         src0 = ir3_get_src(ctx, &intr->src[3])[0];
 301         src1 = ir3_create_collect(ctx, coords, ncoords);
 302         src2 = get_image_offset(ctx, intr, coords, false);
 303
 304         switch (intr->intrinsic) {
 305         case nir_intrinsic_image_atomic_add:
 306                 atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 307                 break;
 308         case nir_intrinsic_image_atomic_imin:
 309         case nir_intrinsic_image_atomic_umin:
 310                 atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 311                 break;
 312         case nir_intrinsic_image_atomic_imax:
 313         case nir_intrinsic_image_atomic_umax:
 314                 atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 315                 break;
 316         case nir_intrinsic_image_atomic_and:
 317                 atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 318                 break;
 319         case nir_intrinsic_image_atomic_or:
 320                 atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 321                 break;
 322         case nir_intrinsic_image_atomic_xor:
 323                 atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 324                 break;
 325         case nir_intrinsic_image_atomic_exchange:
 326                 atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 327                 break;
 328         case nir_intrinsic_image_atomic_comp_swap:
 329                 /* for cmpxchg, src0 is [ui]vec2(data, compare): */
 330                 src0 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
 331                         ir3_get_src(ctx, &intr->src[4])[0],
 332                         src0,
 333                 }, 2);
 334                 atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 335                 break;
 336         default:
 337                 unreachable("boo");
 338         }
 339
 340         atomic->cat6.iim_val = 1;
 341         atomic->cat6.d = ncoords;
 342         atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
 343         atomic->cat6.typed = true;
 344         atomic->barrier_class = IR3_BARRIER_IMAGE_W;
 345         atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
 346
 347         /* even if nothing consume the result, we can't DCE the instruction: */
 348         array_insert(b, b->keeps, atomic);
 349
 350         return atomic;
 351 }
 352
 353 const struct ir3_context_funcs ir3_a4xx_funcs = {
 354                 .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
 355                 .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
 356                 .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
 357                 .emit_intrinsic_store_image = emit_intrinsic_store_image,
 358                 .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
 359                 .emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
 360 };