vc4: Drop dependency on r3 for color packing.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

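/* Generated instructions are built up in a simple list and then patched in
 * place (condition codes, signals, pack/unpack bits) through last_inst()
 * before being handed to the scheduler.
 */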
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
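                        /* Per the VideoCore IV VPM read setup layout,
                         * 0x1a00 selects horizontal 32-bit accesses with a
                         * stride of 1; the element count goes in bits 23:20
                         * and the starting VPM address in the low byte.
                         */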
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

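                /* Set up VPM writes the same way: horizontal 32-bit
                 * accesses with a stride of 1, starting at VPM address 0.
                 */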
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

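                /* QIR opcodes that map 1:1 onto a QPU ALU op, used by the
                 * default case below.  A() entries run on the add pipeline,
                 * M() entries on the mul pipeline.
                 */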
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
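                        /* Conditionally MOV src[0] in, then conditionally
                         * zero dst under the opposite condition.  The ^ 1
                         * relies on the QPU_COND_* encodings pairing each
                         * condition with its inverse (ZS/ZC, NS/NC) at
                         * adjacent values.
                         */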
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

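                        /* The SFU result lands in the r4 accumulator a
                         * couple of cycles after the write; the required
                         * delay is enforced when the instructions are
                         * scheduled, not here.
                         */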
                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS: {
                        /* We have to be careful not to start writing over one
                         * of our source values when incrementally writing the
                         * destination.  So, if the dst is one of the srcs, we
                         * pack that one first (and we pack 4 channels at once
                         * for the first pack).
                         */
                        struct qpu_reg first_pack = src[0];
                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == dst.mux &&
                                    src[i].addr == dst.addr) {
                                        first_pack = dst;
                                        break;
                                }
                        }
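                        /* The MUL-pipeline 8888 pack replicates the
                         * converted 8-bit value of first_pack into all four
                         * bytes of dst, so channels sourced from first_pack
                         * are already correct and get skipped below; the
                         * per-byte 8A..8D packs overwrite one channel each.
                         */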
                        queue(c, qpu_m_MOV(dst, first_pack));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);

                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == first_pack.mux &&
                                    src[i].addr == first_pack.addr) {
                                        continue;
                                }

                                queue(c, qpu_m_MOV(dst, src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        break;
                }

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
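                        /* MOV the discard condition onto itself purely to
                         * set the flags, so that the TLB Z and color writes
                         * below can be made conditional on it (ZS: only
                         * write when the discard value is zero).
                         */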
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

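                        /* 16A packs into the low half of dst and 16B into
                         * the high half.  If dst aliases src[1], emit the
                         * 16B half first so the 16A write doesn't clobber
                         * src[1] before it's read.
                         */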
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
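                        /* With the PM bit set, the UNPACK field applies to
                         * the r4 accumulator rather than to the regfile-A
                         * read, so the requested byte is unpacked straight
                         * out of r4.
                         */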
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the unpack bits, a
                         * destination in regfile A would get re-packed,
                         * so bounce through r3 in that case.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                        break;
                }

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        /* A zero entry (QPU NOP) means the op is missing
                         * from the translate table above.
                         */
                        assert(translate[qinst->op].op != 0);

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        qpu_schedule_instructions(c);

        /* The thread-end instruction can't have a VPM write or read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't have a uniform read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't do TLB operations. */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
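        /* The PROG_END signal takes effect after two delay slots, so pad
         * with two NOPs.
         */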
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}