/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

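/* remap a source swizzle to account for where the source's components were
 * actually allocated (SSA/REG sources may live in different channels)
 */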
static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
	struct ir2_reg_component *comps;
	unsigned swiz = 0;

	switch (src->type) {
	case IR2_SRC_SSA:
	case IR2_SRC_REG:
		break;
	default:
		return src->swizzle;
	}
	/* we need to take into account where the components were allocated */
	comps = get_reg_src(ctx, src)->comp;
	for (int i = 0; i < ncomp; i++) {
		swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
	}
	return swiz;
}

/* ALU instrs need to take into account how the output components are allocated */

/* scalar doesn't need to take into account dest swizzle */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
	/* hardware seems to take from W, but swizzle everywhere just in case */
	return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src)
{
	struct ir2_reg_component *comp = get_reg(instr)->comp;
	unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
	unsigned swiz = 0;

	/* non per component special cases */
	switch (instr->alu.vector_opc) {
	case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
		return alu_swizzle_scalar(ctx, src);
	case DOT2ADDv:
	case DOT3v:
	case DOT4v:
	case CUBEv:
		return swiz0;
	default:
		break;
	}

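	/* compose the source swizzle with the dest allocation: logical output
	 * component i was allocated to channel comp[j].c, so that channel must
	 * read the source component selected for i
	 */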
	for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
		if (instr->alu.write_mask & 1 << j) {
			if (comp[j].c != 7)
				swiz |= swiz_set(i, comp[j].c);
			i++;
		}
	}
	return swiz_merge(swiz0, swiz);
}

static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
	/* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
	unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
	return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* write_mask needs to be transformed by allocation information */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
	struct ir2_reg_component *comp = get_reg(instr)->comp;
	unsigned write_mask = 0;

	for (int i = 0; i < 4; i++) {
		if (instr->alu.write_mask & 1 << i)
			write_mask |= 1 << comp[i].c;
	}

	return write_mask;
}

/* fetch instructions can swizzle dest, but src swizzle needs conversion */

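/* fetch src swizzles are encoded as 2 bits per component, packed from bit 0 */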
static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
	unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
	unsigned swiz = 0;
	for (int i = 0; i < ncomp; i++)
		swiz |= swiz_get(alu_swiz, i) << i * 2;
	return swiz;
}

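/* fetch dst swizzles are 3 bits per channel; the 0xfff default leaves every
 * channel unwritten, and each allocated channel is pointed at fetch result
 * component i
 */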
static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
	struct ir2_reg_component *comp = get_reg(instr)->comp;
	unsigned dst_swiz = 0xfff;
	for (int i = 0; i < dst_ncomp(instr); i++) {
		dst_swiz &= ~(7 << comp[i].c * 3);
		dst_swiz |= i << comp[i].c * 3;
	}
	return dst_swiz;
}

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
	if (is_export(instr))
		return instr->alu.export;

	return get_reg(instr)->idx;
}

/* register # for src */
static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
	return get_reg_src(ctx, src)->idx;
}

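/* full src register byte: the register index, with the abs modifier in bit 7
 * (constants use the whole byte as an index and have no abs bit)
 */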
static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
	if (src->type == IR2_SRC_CONST) {
		assert(!src->abs); /* no abs bit for const */
		return src->num;
	}
	return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12 byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched,
		   instr_t *bc, bool *is_fetch)
{
	struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

	*bc = (instr_t) {};

	if (instr && instr->type == IR2_FETCH) {
		*is_fetch = true;

		bc->fetch.opc = instr->fetch.opc;
		bc->fetch.pred_select = !!instr->pred;
		bc->fetch.pred_condition = instr->pred & 1;

		struct ir2_src *src = instr->src;

		if (instr->fetch.opc == VTX_FETCH) {
			instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

			assert(instr->fetch.vtx.const_idx <= 0x1f);
			assert(instr->fetch.vtx.const_idx_sel <= 0x3);

			vtx->src_reg = src_to_reg(ctx, src);
			vtx->src_swiz = fetch_swizzle(ctx, src, 1);
			vtx->dst_reg = dst_to_reg(ctx, instr);
			vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

			vtx->must_be_one = 1;
			vtx->const_index = instr->fetch.vtx.const_idx;
			vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

			/* other fields will be patched */

			/* XXX seems like every FETCH but the first has
			 * this bit set:
			 */
			vtx->reserved3 = instr->idx ? 0x1 : 0x0;
			vtx->reserved0 = instr->idx ? 0x2 : 0x3;
		} else if (instr->fetch.opc == TEX_FETCH) {
			instr_fetch_tex_t *tex = &bc->fetch.tex;

			tex->src_reg = src_to_reg(ctx, src);
			tex->src_swiz = fetch_swizzle(ctx, src, 3);
			tex->dst_reg = dst_to_reg(ctx, instr);
			tex->dst_swiz = fetch_dst_swiz(ctx, instr);
			/* tex->const_idx = patch_fetches */
			tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
			tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
			tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
			tex->use_reg_lod = instr->src_count == 2;
			tex->sample_location = SAMPLE_CENTER;
			tex->tx_coord_denorm = instr->fetch.tex.is_rect;
		} else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
			instr_fetch_tex_t *tex = &bc->fetch.tex;

			tex->src_reg = src_to_reg(ctx, src);
			tex->src_swiz = fetch_swizzle(ctx, src, 1);
			tex->dst_reg = 0;
			tex->dst_swiz = 0xfff;

			tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
			tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
			tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->use_comp_lod = 1;
			tex->use_reg_lod = 0;
			tex->sample_location = SAMPLE_CENTER;
		} else {
			assert(0);
		}
		return;
	}

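	/* an ALU slot can co-issue a vector instruction and a scalar instruction */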
	instr_v = sched->instr;
	instr_s = sched->instr_s;

	if (instr_v) {
		struct ir2_src src1, src2, *src3;

		src1 = instr_v->src[0];
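		/* with a single source operand, src2 is just a copy of src1 */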
		src2 = instr_v->src[instr_v->src_count > 1];
		src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

		bc->alu.vector_opc = instr_v->alu.vector_opc;
		bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
		bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
		bc->alu.vector_clamp = instr_v->alu.saturate;
		bc->alu.export_data = instr_v->alu.export >= 0;

		/* single operand SETEv, use 0.0f as src2 */
		if (instr_v->src_count == 1 &&
			(bc->alu.vector_opc == SETEv ||
			 bc->alu.vector_opc == SETNEv ||
			 bc->alu.vector_opc == SETGTv ||
			 bc->alu.vector_opc == SETGTEv))
			src2 = ir2_zero(ctx);

		/* export32 instr for a20x hw binning has this bit set..
		 * it seems to do more than change the base address of constants
		 * XXX this is a hack
		 */
		bc->alu.relative_addr =
			(bc->alu.export_data && bc->alu.vector_dest == 32);

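		/* srcN_sel selects a register source (1) vs a constant source (0) */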
		bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
		bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
		bc->alu.src1_reg_negate = src1.negate;
		bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

		bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
		bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
		bc->alu.src2_reg_negate = src2.negate;
		bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

		if (src3) {
			bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
			bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
			bc->alu.src3_reg_negate = src3->negate;
			bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
		}

		bc->alu.pred_select = instr_v->pred;
	}

	if (instr_s) {
		struct ir2_src *src = instr_s->src;

		bc->alu.scalar_opc = instr_s->alu.scalar_opc;
		bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
		bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
		bc->alu.scalar_clamp = instr_s->alu.saturate;
		bc->alu.export_data = instr_s->alu.export >= 0;

		if (instr_s->src_count == 1) {
			bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
			bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
			bc->alu.src3_reg_negate = src->negate;
			bc->alu.src3_sel = src->type != IR2_SRC_CONST;
		} else {
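			/* two-source scalar op: the second operand's component is
			 * passed in alu.src1_swizzle and packed into the src3 swizzle
			 * by alu_swizzle_scalar2
			 */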
			assert(instr_s->src_count == 2);

			bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
			bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
			bc->alu.src3_reg_negate = src->negate;
			bc->alu.src3_sel = src->type != IR2_SRC_CONST;
		}

		if (instr_v)
			assert(instr_s->pred == instr_v->pred);
		bc->alu.pred_select = instr_s->pred;
	}

	*is_fetch = false;
	return;
}

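/* append an optional ALLOC CF and an EXEC CF, then reset the exec state for
 * the next group; two CFs pack into three dwords, which is why dword offsets
 * are computed as cf_idx / 2 * 3
 */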
static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
		  instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
	assert(exec->count);

	if (alloc)
		cfs[cf_idx++].alloc = *alloc;

	/* for memory alloc offset for patching */
	if (alloc && alloc->buffer_select == SQ_MEMORY &&
		ctx->info->mem_export_ptr == -1)
		ctx->info->mem_export_ptr = cf_idx / 2 * 3;

	cfs[cf_idx++].exec = *exec;
	exec->address += exec->count;
	exec->serialize = 0;
	exec->count = 0;

	return cf_idx;
}

/* assemble the final shader */
void assemble(struct ir2_context *ctx, bool binning)
{
	/* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
	 * address is 9 bits so could it be 512 ?
	 */
	instr_cf_t cfs[384];
	instr_t bytecode[384], bc;
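	/* block_addr[] records, for each ir2 block, the CF index at which the
	 * block starts; used below to patch jump targets
	 */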
	unsigned block_addr[128];
	unsigned num_cf = 0;

	/* CF instr state */
	instr_cf_exec_t exec = {.opc = EXEC};
	instr_cf_alloc_t alloc = {.opc = ALLOC};

	int sync_id, sync_id_prev = -1;
	bool is_fetch = false;
	bool need_sync = true;
	bool need_alloc = false;
	unsigned block_idx = 0;

	ctx->info->mem_export_ptr = -1;
	ctx->info->num_fetch_instrs = 0;

	/* the vertex shader always needs to allocate at least one parameter,
	 * even if no parameter will ever be exported
	 */
	if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
		alloc.buffer_select = SQ_PARAMETER_PIXEL;
		cfs[num_cf++].alloc = alloc;
	}

	block_addr[0] = 0;

	for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
		struct ir2_instr *instr = ctx->instr_sched[j].instr;

		/* catch IR2_CF since it isn't a regular instruction */
		if (instr && instr->type == IR2_CF) {
			assert(!need_alloc); /* XXX */

			/* flush any exec cf before inserting jmp */
			if (exec.count)
				num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

			cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) {
				.opc = COND_JMP,
				.address = instr->cf.block_idx, /* will be fixed later */
				.force_call = !instr->pred,
				.predicated_jmp = 1,
				.direction = instr->cf.block_idx > instr->block_idx,
				.condition = instr->pred & 1,
			};
			continue;
		}

		/* fill the 3 dwords for the instruction */
		fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

		/* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
		sync_id = 0;
		if (is_fetch)
			sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

		need_sync = sync_id != sync_id_prev;
		sync_id_prev = sync_id;

		unsigned block;
		if (ctx->instr_sched[j].instr)
			block = ctx->instr_sched[j].instr->block_idx;
		else
			block = ctx->instr_sched[j].instr_s->block_idx;

		assert(block_idx <= block);

		/* info for patching */
		if (is_fetch) {
			struct ir2_fetch_info *info =
				&ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
			info->offset = i * 3; /* add cf offset later */

			if (bc.fetch.opc == VTX_FETCH) {
				info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
			} else if (bc.fetch.opc == TEX_FETCH) {
				info->tex.samp_id = instr->fetch.tex.samp_id;
				info->tex.src_swiz = bc.fetch.tex.src_swiz;
			} else {
				ctx->info->num_fetch_instrs--;
			}
		}

		/* exec cf after 6 instr or when switching between fetch / alu */
		if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) {
			num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
			need_alloc = false;
		}

		/* update block_addrs for jmp patching */
		while (block_idx < block)
			block_addr[++block_idx] = num_cf;

		/* export - fill alloc cf */
		if (!is_fetch && bc.alu.export_data) {
			/* get the export buffer from either vector/scalar dest */
			instr_alloc_type_t buffer =
				export_buf(bc.alu.vector_dest);
			if (bc.alu.scalar_write_mask) {
				if (bc.alu.vector_write_mask)
					assert(buffer == export_buf(bc.alu.scalar_dest));
				buffer = export_buf(bc.alu.scalar_dest);
			}

			/* flush previous alloc if the buffer changes */
			bool need_new_alloc = buffer != alloc.buffer_select;

			/* memory export always in 32/33 pair, new alloc on 32 */
			if (bc.alu.vector_dest == 32)
				need_new_alloc = true;

			if (need_new_alloc && exec.count) {
				num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
				need_alloc = false;
			}

			need_alloc |= need_new_alloc;

			alloc.size = 0;
			alloc.buffer_select = buffer;

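			/* alloc size is encoded as count - 1: one parameter per
			 * fragment shader input, and position plus optional point size
			 */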
			if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == MESA_SHADER_VERTEX)
				alloc.size = ctx->f->inputs_count - 1;

			if (buffer == SQ_POSITION)
				alloc.size = ctx->so->writes_psize;
		}

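		/* two serialize bits per exec slot: 0x1 marks a fetch instruction,
		 * 0x2 marks a point where a sync between ALU/fetch types is needed
		 */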
		if (is_fetch)
			exec.serialize |= 0x1 << exec.count * 2;
		if (need_sync)
			exec.serialize |= 0x2 << exec.count * 2;

		need_sync = false;
		exec.count += 1;
		bytecode[i++] = bc;
	}

	/* final exec cf */
	exec.opc = EXEC_END;
	num_cf =
		write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

	/* insert nop to get an even # of CFs */
	if (num_cf % 2)
		cfs[num_cf++] = (instr_cf_t) {.opc = NOP};

	/* patch cf addrs */
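	/* EXEC addresses count 3-dword instruction slots; the ALU/fetch
	 * instructions are placed after the CFs, which occupy num_cf / 2 slots
	 */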
	for (int idx = 0; idx < num_cf; idx++) {
		switch (cfs[idx].opc) {
		case NOP:
		case ALLOC:
			break;
		case EXEC:
		case EXEC_END:
			cfs[idx].exec.address += num_cf / 2;
			break;
		case COND_JMP:
			cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
			break;
		default:
			assert(0);
		}
	}

	/* concatenate cfs and alu/fetch */
	uint32_t cfdwords = num_cf / 2 * 3;
	uint32_t alufetchdwords = exec.address * 3;
	uint32_t sizedwords = cfdwords + alufetchdwords;
	uint32_t *dwords = malloc(sizedwords * 4);
	assert(dwords);
	memcpy(dwords, cfs, cfdwords * 4);
	memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

	/* finalize ir2_shader_info */
	ctx->info->dwords = dwords;
	ctx->info->sizedwords = sizedwords;
	for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
		ctx->info->fetch_info[i].offset += cfdwords;

	if (fd_mesa_debug & FD_DBG_DISASM) {
		DBG("disassemble: type=%d", ctx->so->type);
		disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
	}
}