src/gallium/drivers/vc4/vc4_program.c

   1 /*
   2  * Copyright (c) 2014 Scott Mansell
   3  * Copyright © 2014 Broadcom
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #include <inttypes.h>
  26 #include "pipe/p_state.h"
  27 #include "util/u_format.h"
  28 #include "util/u_hash.h"
  29 #include "util/u_math.h"
  30 #include "util/u_memory.h"
  31 #include "util/ralloc.h"
  32 #include "util/hash_table.h"
  33 #include "tgsi/tgsi_dump.h"
  34 #include "tgsi/tgsi_info.h"
  35 #include "tgsi/tgsi_lowering.h"
  36 #include "tgsi/tgsi_parse.h"
  37 #include "nir/tgsi_to_nir.h"
  38
  39 #include "vc4_context.h"
  40 #include "vc4_qpu.h"
  41 #include "vc4_qir.h"
  42 #ifdef USE_VC4_SIMULATOR
  43 #include "simpenrose/simpenrose.h"
  44 #endif
  45
/**
 * Shader-variant key fields common to the fragment and vertex keys; both
 * struct vc4_fs_key and struct vc4_vs_key embed this as their "base" member.
 */
struct vc4_key {
        /* The uncompiled shader this key describes a variant of. */
        struct vc4_uncompiled_shader *shader_state;
        /* Per-sampler-unit state.  The texture emission code in this file
         * reads format, compare_mode/compare_func, wrap_s/wrap_t and swizzle
         * from a key's tex[] array, indexed by sampler unit.
         */
        struct {
                enum pipe_format format;
                unsigned compare_mode:1;
                unsigned compare_func:3;
                unsigned wrap_s:3;
                unsigned wrap_t:3;
                uint8_t swizzle[4];
        } tex[VC4_MAX_TEXTURE_SAMPLERS];
        /* NOTE(review): presumably a bitmask of enabled user clip planes;
         * not referenced in the visible part of the file -- confirm.
         */
        uint8_t ucp_enables;
};
  58
/**
 * Fragment-shader variant key: the shared base key followed by the
 * fragment-only state that selects a compiled variant.
 */
struct vc4_fs_key {
        struct vc4_key base;
        enum pipe_format color_format;
        bool depth_enabled;
        bool stencil_enabled;
        bool stencil_twoside;
        bool stencil_full_writemasks;
        bool is_points;
        bool is_lines;
        bool alpha_test;
        /* Read by emit_point_coord_input(): when set, the Y point coordinate
         * is emitted as 1.0 - point_y instead of point_y.
         */
        bool point_coord_upper_left;
        bool light_twoside;
        uint8_t alpha_test_func;
        uint8_t logicop_func;
        /* NOTE(review): presumably one bit per point-sprite-enabled generic
         * input -- not referenced in the visible code; confirm.
         */
        uint32_t point_sprite_mask;

        struct pipe_rt_blend_state blend;
};
  77
/**
 * Vertex-shader variant key: the shared base key followed by the
 * vertex-only state that selects a compiled variant.
 */
struct vc4_vs_key {
        struct vc4_key base;

        /**
         * This is a proxy for the array of FS input semantics, which is
         * larger than we would want to put in the key.
         */
        uint64_t compiled_fs_id;

        /* Per-attribute vertex formats; emit_vertex_input() indexes this by
         * attribute number to decide how to unpack the VPM reads.
         */
        enum pipe_format attr_formats[8];
        bool is_coord;
        bool per_vertex_point_size;
};
  91
  92 static void
  93 resize_qreg_array(struct vc4_compile *c,
  94                   struct qreg **regs,
  95                   uint32_t *size,
  96                   uint32_t decl_size)
  97 {
  98         if (*size >= decl_size)
  99                 return;
 100
 101         uint32_t old_size = *size;
 102         *size = MAX2(*size * 2, decl_size);
 103         *regs = reralloc(c, *regs, struct qreg, *size);
 104         if (!*regs) {
 105                 fprintf(stderr, "Malloc failure\n");
 106                 abort();
 107         }
 108
 109         for (uint32_t i = old_size; i < *size; i++)
 110                 (*regs)[i] = c->undef;
 111 }
 112
 113 static struct qreg
 114 indirect_uniform_load(struct vc4_compile *c,
 115                       struct qreg indirect_offset,
 116                       unsigned offset)
 117 {
 118         struct vc4_compiler_ubo_range *range = NULL;
 119         unsigned i;
 120         for (i = 0; i < c->num_uniform_ranges; i++) {
 121                 range = &c->ubo_ranges[i];
 122                 if (offset >= range->src_offset &&
 123                     offset < range->src_offset + range->size) {
 124                         break;
 125                 }
 126         }
 127         /* The driver-location-based offset always has to be within a declared
 128          * uniform range.
 129          */
 130         assert(range);
 131         if (!range->used) {
 132                 range->used = true;
 133                 range->dst_offset = c->next_ubo_dst_offset;
 134                 c->next_ubo_dst_offset += range->size;
 135                 c->num_ubo_ranges++;
 136         };
 137
 138         offset -= range->src_offset;
 139         /* Translate the user's TGSI register index from the TGSI register
 140          * base to a byte offset.
 141          */
 142         indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4));
 143
 144         /* Adjust for where we stored the TGSI register base. */
 145         indirect_offset = qir_ADD(c, indirect_offset,
 146                                   qir_uniform_ui(c, (range->dst_offset +
 147                                                      offset)));
 148
 149         /* Clamp to [0, array size).  Note that MIN/MAX are signed. */
 150         indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
 151         indirect_offset = qir_MIN(c, indirect_offset,
 152                                   qir_uniform_ui(c, (range->dst_offset +
 153                                                      range->size - 4)));
 154
 155         qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
 156         struct qreg r4 = qir_TEX_RESULT(c);
 157         c->num_texture_samples++;
 158         return qir_MOV(c, r4);
 159 }
 160
 161 static struct qreg *
 162 ntq_get_dest(struct vc4_compile *c, nir_dest dest)
 163 {
 164         assert(!dest.is_ssa);
 165         nir_register *reg = dest.reg.reg;
 166         struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg);
 167         assert(reg->num_array_elems == 0);
 168         assert(dest.reg.base_offset == 0);
 169
 170         struct qreg *qregs = entry->data;
 171         return qregs;
 172 }
 173
 174 static struct qreg
 175 ntq_get_src(struct vc4_compile *c, nir_src src, int i)
 176 {
 177         struct hash_entry *entry;
 178         if (src.is_ssa) {
 179                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
 180                 assert(i < src.ssa->num_components);
 181         } else {
 182                 nir_register *reg = src.reg.reg;
 183                 entry = _mesa_hash_table_search(c->def_ht, reg);
 184                 assert(reg->num_array_elems == 0);
 185                 assert(src.reg.base_offset == 0);
 186                 assert(i < reg->num_components);
 187         }
 188
 189         struct qreg *qregs = entry->data;
 190         return qregs[i];
 191 }
 192
 193 static struct qreg
 194 ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
 195                 unsigned src)
 196 {
 197         assert(util_is_power_of_two(instr->dest.write_mask));
 198         unsigned chan = ffs(instr->dest.write_mask) - 1;
 199         struct qreg r = ntq_get_src(c, instr->src[src].src,
 200                                     instr->src[src].swizzle[chan]);
 201
 202         assert(!instr->src[src].abs);
 203         assert(!instr->src[src].negate);
 204
 205         return r;
 206 };
 207
 208 static struct qreg
 209 get_swizzled_channel(struct vc4_compile *c,
 210                      struct qreg *srcs, int swiz)
 211 {
 212         switch (swiz) {
 213         default:
 214         case UTIL_FORMAT_SWIZZLE_NONE:
 215                 fprintf(stderr, "warning: unknown swizzle\n");
 216                 /* FALLTHROUGH */
 217         case UTIL_FORMAT_SWIZZLE_0:
 218                 return qir_uniform_f(c, 0.0);
 219         case UTIL_FORMAT_SWIZZLE_1:
 220                 return qir_uniform_f(c, 1.0);
 221         case UTIL_FORMAT_SWIZZLE_X:
 222         case UTIL_FORMAT_SWIZZLE_Y:
 223         case UTIL_FORMAT_SWIZZLE_Z:
 224         case UTIL_FORMAT_SWIZZLE_W:
 225                 return srcs[swiz];
 226         }
 227 }
 228
 229 static inline struct qreg
 230 qir_SAT(struct vc4_compile *c, struct qreg val)
 231 {
 232         return qir_FMAX(c,
 233                         qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
 234                         qir_uniform_f(c, 0.0));
 235 }
 236
 237 static struct qreg
 238 ntq_rcp(struct vc4_compile *c, struct qreg x)
 239 {
 240         struct qreg r = qir_RCP(c, x);
 241
 242         /* Apply a Newton-Raphson step to improve the accuracy. */
 243         r = qir_FMUL(c, r, qir_FSUB(c,
 244                                     qir_uniform_f(c, 2.0),
 245                                     qir_FMUL(c, x, r)));
 246
 247         return r;
 248 }
 249
 250 static struct qreg
 251 ntq_rsq(struct vc4_compile *c, struct qreg x)
 252 {
 253         struct qreg r = qir_RSQ(c, x);
 254
 255         /* Apply a Newton-Raphson step to improve the accuracy. */
 256         r = qir_FMUL(c, r, qir_FSUB(c,
 257                                     qir_uniform_f(c, 1.5),
 258                                     qir_FMUL(c,
 259                                              qir_uniform_f(c, 0.5),
 260                                              qir_FMUL(c, x,
 261                                                       qir_FMUL(c, r, r)))));
 262
 263         return r;
 264 }
 265
 266 static struct qreg
 267 qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
 268 {
 269         struct qreg low = qir_FMUL(c, srgb, qir_uniform_f(c, 1.0 / 12.92));
 270         struct qreg high = qir_POW(c,
 271                                    qir_FMUL(c,
 272                                             qir_FADD(c,
 273                                                      srgb,
 274                                                      qir_uniform_f(c, 0.055)),
 275                                             qir_uniform_f(c, 1.0 / 1.055)),
 276                                    qir_uniform_f(c, 2.4));
 277
 278         qir_SF(c, qir_FSUB(c, srgb, qir_uniform_f(c, 0.04045)));
 279         return qir_SEL_X_Y_NS(c, low, high);
 280 }
 281
 282 static struct qreg
 283 qir_srgb_encode(struct vc4_compile *c, struct qreg linear)
 284 {
 285         struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92));
 286         struct qreg high = qir_FSUB(c,
 287                                     qir_FMUL(c,
 288                                              qir_uniform_f(c, 1.055),
 289                                              qir_POW(c,
 290                                                      linear,
 291                                                      qir_uniform_f(c, 0.41666))),
 292                                     qir_uniform_f(c, 0.055));
 293
 294         qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308)));
 295         return qir_SEL_X_Y_NS(c, low, high);
 296 }
 297
 298 static struct qreg
 299 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
 300 {
 301         struct qreg src0_hi = qir_SHR(c, src0,
 302                                       qir_uniform_ui(c, 24));
 303         struct qreg src1_hi = qir_SHR(c, src1,
 304                                       qir_uniform_ui(c, 24));
 305
 306         struct qreg hilo = qir_MUL24(c, src0_hi, src1);
 307         struct qreg lohi = qir_MUL24(c, src0, src1_hi);
 308         struct qreg lolo = qir_MUL24(c, src0, src1);
 309
 310         return qir_ADD(c, lolo, qir_SHL(c,
 311                                         qir_ADD(c, hilo, lohi),
 312                                         qir_uniform_ui(c, 24)));
 313 }
 314
 315 static void
 316 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 317 {
 318         struct qreg s, t, r, lod, proj, compare;
 319         bool is_txb = false, is_txl = false, has_proj = false;
 320         unsigned unit = instr->sampler_index;
 321
 322         for (unsigned i = 0; i < instr->num_srcs; i++) {
 323                 switch (instr->src[i].src_type) {
 324                 case nir_tex_src_coord:
 325                         s = ntq_get_src(c, instr->src[i].src, 0);
 326                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
 327                                 t = qir_uniform_f(c, 0.5);
 328                         else
 329                                 t = ntq_get_src(c, instr->src[i].src, 1);
 330                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
 331                                 r = ntq_get_src(c, instr->src[i].src, 2);
 332                         break;
 333                 case nir_tex_src_bias:
 334                         lod = ntq_get_src(c, instr->src[i].src, 0);
 335                         is_txb = true;
 336                         break;
 337                 case nir_tex_src_lod:
 338                         lod = ntq_get_src(c, instr->src[i].src, 0);
 339                         is_txl = true;
 340                         break;
 341                 case nir_tex_src_comparitor:
 342                         compare = ntq_get_src(c, instr->src[i].src, 0);
 343                         break;
 344                 case nir_tex_src_projector:
 345                         proj = qir_RCP(c, ntq_get_src(c, instr->src[i].src, 0));
 346                         s = qir_FMUL(c, s, proj);
 347                         t = qir_FMUL(c, t, proj);
 348                         has_proj = true;
 349                         break;
 350                 default:
 351                         unreachable("unknown texture source");
 352                 }
 353         }
 354
 355         struct qreg texture_u[] = {
 356                 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
 357                 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
 358                 qir_uniform(c, QUNIFORM_CONSTANT, 0),
 359                 qir_uniform(c, QUNIFORM_CONSTANT, 0),
 360         };
 361         uint32_t next_texture_u = 0;
 362
 363         /* There is no native support for GL texture rectangle coordinates, so
 364          * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
 365          * 1]).
 366          */
 367         if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
 368                 s = qir_FMUL(c, s,
 369                              qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, unit));
 370                 t = qir_FMUL(c, t,
 371                              qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, unit));
 372         }
 373
 374         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
 375                 texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
 376                                            unit | (is_txl << 16));
 377         }
 378
 379         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
 380                 struct qreg ma = qir_FMAXABS(c, qir_FMAXABS(c, s, t), r);
 381                 struct qreg rcp_ma = qir_RCP(c, ma);
 382                 s = qir_FMUL(c, s, rcp_ma);
 383                 t = qir_FMUL(c, t, rcp_ma);
 384                 r = qir_FMUL(c, r, rcp_ma);
 385
 386                 qir_TEX_R(c, r, texture_u[next_texture_u++]);
 387         } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
 388                    c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
 389                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
 390                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
 391                 qir_TEX_R(c, qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, unit),
 392                           texture_u[next_texture_u++]);
 393         }
 394
 395         if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
 396                 s = qir_SAT(c, s);
 397         }
 398
 399         if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
 400                 t = qir_SAT(c, t);
 401         }
 402
 403         qir_TEX_T(c, t, texture_u[next_texture_u++]);
 404
 405         if (is_txl || is_txb)
 406                 qir_TEX_B(c, lod, texture_u[next_texture_u++]);
 407
 408         qir_TEX_S(c, s, texture_u[next_texture_u++]);
 409
 410         c->num_texture_samples++;
 411         struct qreg r4 = qir_TEX_RESULT(c);
 412
 413         enum pipe_format format = c->key->tex[unit].format;
 414
 415         struct qreg unpacked[4];
 416         if (util_format_is_depth_or_stencil(format)) {
 417                 struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4,
 418                                                          qir_uniform_ui(c, 8)));
 419                 struct qreg normalized = qir_FMUL(c, depthf,
 420                                                   qir_uniform_f(c, 1.0f/0xffffff));
 421
 422                 struct qreg depth_output;
 423
 424                 struct qreg one = qir_uniform_f(c, 1.0f);
 425                 if (c->key->tex[unit].compare_mode) {
 426                         if (has_proj)
 427                                 compare = qir_FMUL(c, compare, proj);
 428
 429                         switch (c->key->tex[unit].compare_func) {
 430                         case PIPE_FUNC_NEVER:
 431                                 depth_output = qir_uniform_f(c, 0.0f);
 432                                 break;
 433                         case PIPE_FUNC_ALWAYS:
 434                                 depth_output = one;
 435                                 break;
 436                         case PIPE_FUNC_EQUAL:
 437                                 qir_SF(c, qir_FSUB(c, compare, normalized));
 438                                 depth_output = qir_SEL_X_0_ZS(c, one);
 439                                 break;
 440                         case PIPE_FUNC_NOTEQUAL:
 441                                 qir_SF(c, qir_FSUB(c, compare, normalized));
 442                                 depth_output = qir_SEL_X_0_ZC(c, one);
 443                                 break;
 444                         case PIPE_FUNC_GREATER:
 445                                 qir_SF(c, qir_FSUB(c, compare, normalized));
 446                                 depth_output = qir_SEL_X_0_NC(c, one);
 447                                 break;
 448                         case PIPE_FUNC_GEQUAL:
 449                                 qir_SF(c, qir_FSUB(c, normalized, compare));
 450                                 depth_output = qir_SEL_X_0_NS(c, one);
 451                                 break;
 452                         case PIPE_FUNC_LESS:
 453                                 qir_SF(c, qir_FSUB(c, compare, normalized));
 454                                 depth_output = qir_SEL_X_0_NS(c, one);
 455                                 break;
 456                         case PIPE_FUNC_LEQUAL:
 457                                 qir_SF(c, qir_FSUB(c, normalized, compare));
 458                                 depth_output = qir_SEL_X_0_NC(c, one);
 459                                 break;
 460                         }
 461                 } else {
 462                         depth_output = normalized;
 463                 }
 464
 465                 for (int i = 0; i < 4; i++)
 466                         unpacked[i] = depth_output;
 467         } else {
 468                 for (int i = 0; i < 4; i++)
 469                         unpacked[i] = qir_R4_UNPACK(c, r4, i);
 470         }
 471
 472         const uint8_t *format_swiz = vc4_get_format_swizzle(format);
 473         struct qreg texture_output[4];
 474         for (int i = 0; i < 4; i++) {
 475                 texture_output[i] = get_swizzled_channel(c, unpacked,
 476                                                          format_swiz[i]);
 477         }
 478
 479         if (util_format_is_srgb(format)) {
 480                 for (int i = 0; i < 3; i++)
 481                         texture_output[i] = qir_srgb_decode(c,
 482                                                             texture_output[i]);
 483         }
 484
 485         struct qreg *dest = ntq_get_dest(c, instr->dest);
 486         for (int i = 0; i < 4; i++) {
 487                 dest[i] = get_swizzled_channel(c, texture_output,
 488                                                c->key->tex[unit].swizzle[i]);
 489         }
 490 }
 491
 492 /**
 493  * Computes x - floor(x), which is tricky because our FTOI truncates (rounds
 494  * to zero).
 495  */
 496 static struct qreg
 497 ntq_ffract(struct vc4_compile *c, struct qreg src)
 498 {
 499         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
 500         struct qreg diff = qir_FSUB(c, src, trunc);
 501         qir_SF(c, diff);
 502         return qir_SEL_X_Y_NS(c,
 503                               qir_FADD(c, diff, qir_uniform_f(c, 1.0)),
 504                               diff);
 505 }
 506
 507 /**
 508  * Computes floor(x), which is tricky because our FTOI truncates (rounds to
 509  * zero).
 510  */
 511 static struct qreg
 512 ntq_ffloor(struct vc4_compile *c, struct qreg src)
 513 {
 514         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
 515
 516         /* This will be < 0 if we truncated and the truncation was of a value
 517          * that was < 0 in the first place.
 518          */
 519         qir_SF(c, qir_FSUB(c, src, trunc));
 520
 521         return qir_SEL_X_Y_NS(c,
 522                               qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)),
 523                               trunc);
 524 }
 525
 526 /**
 527  * Computes ceil(x), which is tricky because our FTOI truncates (rounds to
 528  * zero).
 529  */
 530 static struct qreg
 531 ntq_fceil(struct vc4_compile *c, struct qreg src)
 532 {
 533         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
 534
 535         /* This will be < 0 if we truncated and the truncation was of a value
 536          * that was > 0 in the first place.
 537          */
 538         qir_SF(c, qir_FSUB(c, trunc, src));
 539
 540         return qir_SEL_X_Y_NS(c,
 541                               qir_FADD(c, trunc, qir_uniform_f(c, 1.0)),
 542                               trunc);
 543 }
 544
 545 static struct qreg
 546 ntq_fsin(struct vc4_compile *c, struct qreg src)
 547 {
 548         float coeff[] = {
 549                 -2.0 * M_PI,
 550                 pow(2.0 * M_PI, 3) / (3 * 2 * 1),
 551                 -pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
 552                 pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
 553                 -pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
 554         };
 555
 556         struct qreg scaled_x =
 557                 qir_FMUL(c,
 558                          src,
 559                          qir_uniform_f(c, 1.0 / (M_PI * 2.0)));
 560
 561         struct qreg x = qir_FADD(c,
 562                                  ntq_ffract(c, scaled_x),
 563                                  qir_uniform_f(c, -0.5));
 564         struct qreg x2 = qir_FMUL(c, x, x);
 565         struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
 566         for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
 567                 x = qir_FMUL(c, x, x2);
 568                 sum = qir_FADD(c,
 569                                sum,
 570                                qir_FMUL(c,
 571                                         x,
 572                                         qir_uniform_f(c, coeff[i])));
 573         }
 574         return sum;
 575 }
 576
 577 static struct qreg
 578 ntq_fcos(struct vc4_compile *c, struct qreg src)
 579 {
 580         float coeff[] = {
 581                 -1.0f,
 582                 pow(2.0 * M_PI, 2) / (2 * 1),
 583                 -pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
 584                 pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
 585                 -pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
 586                 pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
 587         };
 588
 589         struct qreg scaled_x =
 590                 qir_FMUL(c, src,
 591                          qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
 592         struct qreg x_frac = qir_FADD(c,
 593                                       ntq_ffract(c, scaled_x),
 594                                       qir_uniform_f(c, -0.5));
 595
 596         struct qreg sum = qir_uniform_f(c, coeff[0]);
 597         struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
 598         struct qreg x = x2; /* Current x^2, x^4, or x^6 */
 599         for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
 600                 if (i != 1)
 601                         x = qir_FMUL(c, x, x2);
 602
 603                 struct qreg mul = qir_FMUL(c,
 604                                            x,
 605                                            qir_uniform_f(c, coeff[i]));
 606                 if (i == 0)
 607                         sum = mul;
 608                 else
 609                         sum = qir_FADD(c, sum, mul);
 610         }
 611         return sum;
 612 }
 613
 614 static struct qreg
 615 ntq_fsign(struct vc4_compile *c, struct qreg src)
 616 {
 617         qir_SF(c, src);
 618         return qir_SEL_X_Y_NC(c,
 619                               qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0)),
 620                               qir_uniform_f(c, -1.0));
 621 }
 622
 623 static struct qreg
 624 get_channel_from_vpm(struct vc4_compile *c,
 625                      struct qreg *vpm_reads,
 626                      uint8_t swiz,
 627                      const struct util_format_description *desc)
 628 {
 629         const struct util_format_channel_description *chan =
 630                 &desc->channel[swiz];
 631         struct qreg temp;
 632
 633         if (swiz > UTIL_FORMAT_SWIZZLE_W)
 634                 return get_swizzled_channel(c, vpm_reads, swiz);
 635         else if (chan->size == 32 &&
 636                  chan->type == UTIL_FORMAT_TYPE_FLOAT) {
 637                 return get_swizzled_channel(c, vpm_reads, swiz);
 638         } else if (chan->size == 32 &&
 639                    chan->type == UTIL_FORMAT_TYPE_SIGNED) {
 640                 if (chan->normalized) {
 641                         return qir_FMUL(c,
 642                                         qir_ITOF(c, vpm_reads[swiz]),
 643                                         qir_uniform_f(c,
 644                                                       1.0 / 0x7fffffff));
 645                 } else {
 646                         return qir_ITOF(c, vpm_reads[swiz]);
 647                 }
 648         } else if (chan->size == 8 &&
 649                    (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
 650                     chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
 651                 struct qreg vpm = vpm_reads[0];
 652                 if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
 653                         temp = qir_XOR(c, vpm, qir_uniform_ui(c, 0x80808080));
 654                         if (chan->normalized) {
 655                                 return qir_FSUB(c, qir_FMUL(c,
 656                                                             qir_UNPACK_8_F(c, temp, swiz),
 657                                                             qir_uniform_f(c, 2.0)),
 658                                                 qir_uniform_f(c, 1.0));
 659                         } else {
 660                                 return qir_FADD(c,
 661                                                 qir_ITOF(c,
 662                                                          qir_UNPACK_8_I(c, temp,
 663                                                                         swiz)),
 664                                                 qir_uniform_f(c, -128.0));
 665                         }
 666                 } else {
 667                         if (chan->normalized) {
 668                                 return qir_UNPACK_8_F(c, vpm, swiz);
 669                         } else {
 670                                 return qir_ITOF(c, qir_UNPACK_8_I(c, vpm, swiz));
 671                         }
 672                 }
 673         } else if (chan->size == 16 &&
 674                    (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
 675                     chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
 676                 struct qreg vpm = vpm_reads[swiz / 2];
 677
 678                 /* Note that UNPACK_16F eats a half float, not ints, so we use
 679                  * UNPACK_16_I for all of these.
 680                  */
 681                 if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
 682                         temp = qir_ITOF(c, qir_UNPACK_16_I(c, vpm, swiz % 2));
 683                         if (chan->normalized) {
 684                                 return qir_FMUL(c, temp,
 685                                                 qir_uniform_f(c, 1/32768.0f));
 686                         } else {
 687                                 return temp;
 688                         }
 689                 } else {
 690                         /* UNPACK_16I sign-extends, so we have to emit ANDs. */
 691                         temp = vpm;
 692                         if (swiz == 1 || swiz == 3)
 693                                 temp = qir_UNPACK_16_I(c, temp, 1);
 694                         temp = qir_AND(c, temp, qir_uniform_ui(c, 0xffff));
 695                         temp = qir_ITOF(c, temp);
 696
 697                         if (chan->normalized) {
 698                                 return qir_FMUL(c, temp,
 699                                                 qir_uniform_f(c, 1 / 65535.0));
 700                         } else {
 701                                 return temp;
 702                         }
 703                 }
 704         } else {
 705                 return c->undef;
 706         }
 707 }
 708
 709 static void
 710 emit_vertex_input(struct vc4_compile *c, int attr)
 711 {
 712         enum pipe_format format = c->vs_key->attr_formats[attr];
 713         uint32_t attr_size = util_format_get_blocksize(format);
 714         struct qreg vpm_reads[4];
 715
 716         c->vattr_sizes[attr] = align(attr_size, 4);
 717         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
 718                 struct qreg vpm = { QFILE_VPM, attr * 4 + i };
 719                 vpm_reads[i] = qir_MOV(c, vpm);
 720                 c->num_inputs++;
 721         }
 722
 723         bool format_warned = false;
 724         const struct util_format_description *desc =
 725                 util_format_description(format);
 726
 727         for (int i = 0; i < 4; i++) {
 728                 uint8_t swiz = desc->swizzle[i];
 729                 struct qreg result = get_channel_from_vpm(c, vpm_reads,
 730                                                           swiz, desc);
 731
 732                 if (result.file == QFILE_NULL) {
 733                         if (!format_warned) {
 734                                 fprintf(stderr,
 735                                         "vtx element %d unsupported type: %s\n",
 736                                         attr, util_format_name(format));
 737                                 format_warned = true;
 738                         }
 739                         result = qir_uniform_f(c, 0.0);
 740                 }
 741                 c->inputs[attr * 4 + i] = result;
 742         }
 743 }
 744
 745 static void
 746 emit_fragcoord_input(struct vc4_compile *c, int attr)
 747 {
 748         c->inputs[attr * 4 + 0] = qir_FRAG_X(c);
 749         c->inputs[attr * 4 + 1] = qir_FRAG_Y(c);
 750         c->inputs[attr * 4 + 2] =
 751                 qir_FMUL(c,
 752                          qir_ITOF(c, qir_FRAG_Z(c)),
 753                          qir_uniform_f(c, 1.0 / 0xffffff));
 754         c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
 755 }
 756
 757 static void
 758 emit_point_coord_input(struct vc4_compile *c, int attr)
 759 {
 760         if (c->point_x.file == QFILE_NULL) {
 761                 c->point_x = qir_uniform_f(c, 0.0);
 762                 c->point_y = qir_uniform_f(c, 0.0);
 763         }
 764
 765         c->inputs[attr * 4 + 0] = c->point_x;
 766         if (c->fs_key->point_coord_upper_left) {
 767                 c->inputs[attr * 4 + 1] = qir_FSUB(c,
 768                                                    qir_uniform_f(c, 1.0),
 769                                                    c->point_y);
 770         } else {
 771                 c->inputs[attr * 4 + 1] = c->point_y;
 772         }
 773         c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0);
 774         c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0);
 775 }
 776
 777 static struct qreg
 778 emit_fragment_varying(struct vc4_compile *c, uint8_t semantic,
 779                       uint8_t index, uint8_t swizzle)
 780 {
 781         uint32_t i = c->num_input_semantics++;
 782         struct qreg vary = {
 783                 QFILE_VARY,
 784                 i
 785         };
 786
 787         if (c->num_input_semantics >= c->input_semantics_array_size) {
 788                 c->input_semantics_array_size =
 789                         MAX2(4, c->input_semantics_array_size * 2);
 790
 791                 c->input_semantics = reralloc(c, c->input_semantics,
 792                                               struct vc4_varying_semantic,
 793                                               c->input_semantics_array_size);
 794         }
 795
 796         c->input_semantics[i].semantic = semantic;
 797         c->input_semantics[i].index = index;
 798         c->input_semantics[i].swizzle = swizzle;
 799
 800         return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
 801 }
 802
 803 static void
 804 emit_fragment_input(struct vc4_compile *c, int attr,
 805                     unsigned semantic_name, unsigned semantic_index)
 806 {
 807         for (int i = 0; i < 4; i++) {
 808                 c->inputs[attr * 4 + i] =
 809                         emit_fragment_varying(c,
 810                                               semantic_name,
 811                                               semantic_index,
 812                                               i);
 813                 c->num_inputs++;
 814         }
 815 }
 816
 817 static void
 818 emit_face_input(struct vc4_compile *c, int attr)
 819 {
 820         c->inputs[attr * 4 + 0] = qir_FSUB(c,
 821                                            qir_uniform_f(c, 1.0),
 822                                            qir_FMUL(c,
 823                                                     qir_ITOF(c, qir_FRAG_REV_FLAG(c)),
 824                                                     qir_uniform_f(c, 2.0)));
 825         c->inputs[attr * 4 + 1] = qir_uniform_f(c, 0.0);
 826         c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0);
 827         c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0);
 828 }
 829
 830 static void
 831 add_output(struct vc4_compile *c,
 832            uint32_t decl_offset,
 833            uint8_t semantic_name,
 834            uint8_t semantic_index,
 835            uint8_t semantic_swizzle)
 836 {
 837         uint32_t old_array_size = c->outputs_array_size;
 838         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
 839                           decl_offset + 1);
 840
 841         if (old_array_size != c->outputs_array_size) {
 842                 c->output_semantics = reralloc(c,
 843                                                c->output_semantics,
 844                                                struct vc4_varying_semantic,
 845                                                c->outputs_array_size);
 846         }
 847
 848         c->output_semantics[decl_offset].semantic = semantic_name;
 849         c->output_semantics[decl_offset].index = semantic_index;
 850         c->output_semantics[decl_offset].swizzle = semantic_swizzle;
 851 }
 852
 853 static void
 854 declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
 855 {
 856         unsigned array_id = c->num_uniform_ranges++;
 857         if (array_id >= c->ubo_ranges_array_size) {
 858                 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
 859                                                 array_id + 1);
 860                 c->ubo_ranges = reralloc(c, c->ubo_ranges,
 861                                          struct vc4_compiler_ubo_range,
 862                                          c->ubo_ranges_array_size);
 863         }
 864
 865         c->ubo_ranges[array_id].dst_offset = 0;
 866         c->ubo_ranges[array_id].src_offset = start;
 867         c->ubo_ranges[array_id].size = size;
 868         c->ubo_ranges[array_id].used = false;
 869 }
 870
 871 static void
 872 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 873 {
 874         /* Vectors are special in that they have non-scalarized writemasks,
 875          * and just take the first swizzle channel for each argument in order
 876          * into each writemask channel.
 877          */
 878         if (instr->op == nir_op_vec2 ||
 879             instr->op == nir_op_vec3 ||
 880             instr->op == nir_op_vec4) {
 881                 struct qreg srcs[4];
 882                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
 883                         srcs[i] = ntq_get_src(c, instr->src[i].src,
 884                                               instr->src[i].swizzle[0]);
 885                 struct qreg *dest = ntq_get_dest(c, instr->dest.dest);
 886                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
 887                         dest[i] = srcs[i];
 888                 return;
 889         }
 890
 891         /* General case: We can just grab the one used channel per src. */
 892         struct qreg src[nir_op_infos[instr->op].num_inputs];
 893         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 894                 src[i] = ntq_get_alu_src(c, instr, i);
 895         }
 896
 897         /* Pick the channel to store the output in. */
 898         assert(!instr->dest.saturate);
 899         struct qreg *dest = ntq_get_dest(c, instr->dest.dest);
 900         assert(util_is_power_of_two(instr->dest.write_mask));
 901         dest += ffs(instr->dest.write_mask) - 1;
 902
 903         switch (instr->op) {
 904         case nir_op_fmov:
 905         case nir_op_imov:
 906                 *dest = qir_MOV(c, src[0]);
 907                 break;
 908         case nir_op_fmul:
 909                 *dest = qir_FMUL(c, src[0], src[1]);
 910                 break;
 911         case nir_op_fadd:
 912                 *dest = qir_FADD(c, src[0], src[1]);
 913                 break;
 914         case nir_op_fsub:
 915                 *dest = qir_FSUB(c, src[0], src[1]);
 916                 break;
 917         case nir_op_fmin:
 918                 *dest = qir_FMIN(c, src[0], src[1]);
 919                 break;
 920         case nir_op_fmax:
 921                 *dest = qir_FMAX(c, src[0], src[1]);
 922                 break;
 923
 924         case nir_op_f2i:
 925         case nir_op_f2u:
 926                 *dest = qir_FTOI(c, src[0]);
 927                 break;
 928         case nir_op_i2f:
 929         case nir_op_u2f:
 930                 *dest = qir_ITOF(c, src[0]);
 931                 break;
 932         case nir_op_b2f:
 933                 *dest = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
 934                 break;
 935         case nir_op_b2i:
 936                 *dest = qir_AND(c, src[0], qir_uniform_ui(c, 1));
 937                 break;
 938         case nir_op_i2b:
 939         case nir_op_f2b:
 940                 qir_SF(c, src[0]);
 941                 *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
 942                 break;
 943
 944         case nir_op_iadd:
 945                 *dest = qir_ADD(c, src[0], src[1]);
 946                 break;
 947         case nir_op_ushr:
 948                 *dest = qir_SHR(c, src[0], src[1]);
 949                 break;
 950         case nir_op_isub:
 951                 *dest = qir_SUB(c, src[0], src[1]);
 952                 break;
 953         case nir_op_ishr:
 954                 *dest = qir_ASR(c, src[0], src[1]);
 955                 break;
 956         case nir_op_ishl:
 957                 *dest = qir_SHL(c, src[0], src[1]);
 958                 break;
 959         case nir_op_imin:
 960                 *dest = qir_MIN(c, src[0], src[1]);
 961                 break;
 962         case nir_op_imax:
 963                 *dest = qir_MAX(c, src[0], src[1]);
 964                 break;
 965         case nir_op_iand:
 966                 *dest = qir_AND(c, src[0], src[1]);
 967                 break;
 968         case nir_op_ior:
 969                 *dest = qir_OR(c, src[0], src[1]);
 970                 break;
 971         case nir_op_ixor:
 972                 *dest = qir_XOR(c, src[0], src[1]);
 973                 break;
 974         case nir_op_inot:
 975                 *dest = qir_NOT(c, src[0]);
 976                 break;
 977
 978         case nir_op_imul:
 979                 *dest = ntq_umul(c, src[0], src[1]);
 980                 break;
 981
 982         case nir_op_seq:
 983                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
 984                 *dest = qir_SEL_X_0_ZS(c, qir_uniform_f(c, 1.0));
 985                 break;
 986         case nir_op_sne:
 987                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
 988                 *dest = qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0));
 989                 break;
 990         case nir_op_sge:
 991                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
 992                 *dest = qir_SEL_X_0_NC(c, qir_uniform_f(c, 1.0));
 993                 break;
 994         case nir_op_slt:
 995                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
 996                 *dest = qir_SEL_X_0_NS(c, qir_uniform_f(c, 1.0));
 997                 break;
 998         case nir_op_feq:
 999                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
1000                 *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
1001                 break;
1002         case nir_op_fne:
1003                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
1004                 *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
1005                 break;
1006         case nir_op_fge:
1007                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
1008                 *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
1009                 break;
1010         case nir_op_flt:
1011                 qir_SF(c, qir_FSUB(c, src[0], src[1]));
1012                 *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
1013                 break;
1014         case nir_op_ieq:
1015                 qir_SF(c, qir_SUB(c, src[0], src[1]));
1016                 *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
1017                 break;
1018         case nir_op_ine:
1019                 qir_SF(c, qir_SUB(c, src[0], src[1]));
1020                 *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
1021                 break;
1022         case nir_op_ige:
1023                 qir_SF(c, qir_SUB(c, src[0], src[1]));
1024                 *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
1025                 break;
1026         case nir_op_ilt:
1027                 qir_SF(c, qir_SUB(c, src[0], src[1]));
1028                 *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
1029                 break;
1030
1031         case nir_op_bcsel:
1032                 qir_SF(c, src[0]);
1033                 *dest = qir_SEL_X_Y_NS(c, src[1], src[2]);
1034                 break;
1035         case nir_op_fcsel:
1036                 qir_SF(c, src[0]);
1037                 *dest = qir_SEL_X_Y_ZC(c, src[1], src[2]);
1038                 break;
1039
1040         case nir_op_frcp:
1041                 *dest = ntq_rcp(c, src[0]);
1042                 break;
1043         case nir_op_frsq:
1044                 *dest = ntq_rsq(c, src[0]);
1045                 break;
1046         case nir_op_fexp2:
1047                 *dest = qir_EXP2(c, src[0]);
1048                 break;
1049         case nir_op_flog2:
1050                 *dest = qir_LOG2(c, src[0]);
1051                 break;
1052
1053         case nir_op_ftrunc:
1054                 *dest = qir_ITOF(c, qir_FTOI(c, src[0]));
1055                 break;
1056         case nir_op_fceil:
1057                 *dest = ntq_fceil(c, src[0]);
1058                 break;
1059         case nir_op_ffract:
1060                 *dest = ntq_ffract(c, src[0]);
1061                 break;
1062         case nir_op_ffloor:
1063                 *dest = ntq_ffloor(c, src[0]);
1064                 break;
1065
1066         case nir_op_fsin:
1067                 *dest = ntq_fsin(c, src[0]);
1068                 break;
1069         case nir_op_fcos:
1070                 *dest = ntq_fcos(c, src[0]);
1071                 break;
1072
1073         case nir_op_fsign:
1074                 *dest = ntq_fsign(c, src[0]);
1075                 break;
1076
1077         case nir_op_fabs:
1078                 *dest = qir_FMAXABS(c, src[0], src[0]);
1079                 break;
1080         case nir_op_iabs:
1081                 *dest = qir_MAX(c, src[0],
1082                                 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
1083                 break;
1084
1085         default:
1086                 fprintf(stderr, "unknown NIR ALU inst: ");
1087                 nir_print_instr(&instr->instr, stderr);
1088                 fprintf(stderr, "\n");
1089                 abort();
1090         }
1091 }
1092
1093 static struct qreg
1094 vc4_blend_channel(struct vc4_compile *c,
1095                   struct qreg *dst,
1096                   struct qreg *src,
1097                   struct qreg val,
1098                   unsigned factor,
1099                   int channel)
1100 {
1101         switch(factor) {
1102         case PIPE_BLENDFACTOR_ONE:
1103                 return val;
1104         case PIPE_BLENDFACTOR_SRC_COLOR:
1105                 return qir_FMUL(c, val, src[channel]);
1106         case PIPE_BLENDFACTOR_SRC_ALPHA:
1107                 return qir_FMUL(c, val, src[3]);
1108         case PIPE_BLENDFACTOR_DST_ALPHA:
1109                 return qir_FMUL(c, val, dst[3]);
1110         case PIPE_BLENDFACTOR_DST_COLOR:
1111                 return qir_FMUL(c, val, dst[channel]);
1112         case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
1113                 if (channel != 3) {
1114                         return qir_FMUL(c,
1115                                         val,
1116                                         qir_FMIN(c,
1117                                                  src[3],
1118                                                  qir_FSUB(c,
1119                                                           qir_uniform_f(c, 1.0),
1120                                                           dst[3])));
1121                 } else {
1122                         return val;
1123                 }
1124         case PIPE_BLENDFACTOR_CONST_COLOR:
1125                 return qir_FMUL(c, val,
1126                                 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR,
1127                                             channel));
1128         case PIPE_BLENDFACTOR_CONST_ALPHA:
1129                 return qir_FMUL(c, val,
1130                                 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3));
1131         case PIPE_BLENDFACTOR_ZERO:
1132                 return qir_uniform_f(c, 0.0);
1133         case PIPE_BLENDFACTOR_INV_SRC_COLOR:
1134                 return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
1135                                                  src[channel]));
1136         case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
1137                 return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
1138                                                  src[3]));
1139         case PIPE_BLENDFACTOR_INV_DST_ALPHA:
1140                 return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
1141                                                  dst[3]));
1142         case PIPE_BLENDFACTOR_INV_DST_COLOR:
1143                 return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
1144                                                  dst[channel]));
1145         case PIPE_BLENDFACTOR_INV_CONST_COLOR:
1146                 return qir_FMUL(c, val,
1147                                 qir_FSUB(c, qir_uniform_f(c, 1.0),
1148                                          qir_uniform(c,
1149                                                      QUNIFORM_BLEND_CONST_COLOR,
1150                                                      channel)));
1151         case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
1152                 return qir_FMUL(c, val,
1153                                 qir_FSUB(c, qir_uniform_f(c, 1.0),
1154                                          qir_uniform(c,
1155                                                      QUNIFORM_BLEND_CONST_COLOR,
1156                                                      3)));
1157
1158         default:
1159         case PIPE_BLENDFACTOR_SRC1_COLOR:
1160         case PIPE_BLENDFACTOR_SRC1_ALPHA:
1161         case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
1162         case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
1163                 /* Unsupported. */
1164                 fprintf(stderr, "Unknown blend factor %d\n", factor);
1165                 return val;
1166         }
1167 }
1168
1169 static struct qreg
1170 vc4_blend_func(struct vc4_compile *c,
1171                struct qreg src, struct qreg dst,
1172                unsigned func)
1173 {
1174         switch (func) {
1175         case PIPE_BLEND_ADD:
1176                 return qir_FADD(c, src, dst);
1177         case PIPE_BLEND_SUBTRACT:
1178                 return qir_FSUB(c, src, dst);
1179         case PIPE_BLEND_REVERSE_SUBTRACT:
1180                 return qir_FSUB(c, dst, src);
1181         case PIPE_BLEND_MIN:
1182                 return qir_FMIN(c, src, dst);
1183         case PIPE_BLEND_MAX:
1184                 return qir_FMAX(c, src, dst);
1185
1186         default:
1187                 /* Unsupported. */
1188                 fprintf(stderr, "Unknown blend func %d\n", func);
1189                 return src;
1190
1191         }
1192 }
1193
1194 /**
1195  * Implements fixed function blending in shader code.
1196  *
1197  * VC4 doesn't have any hardware support for blending.  Instead, you read the
1198  * current contents of the destination from the tile buffer after having
1199  * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do
1200  * math using your output color and that destination value, and update the
1201  * output color appropriately.
1202  */
1203 static void
1204 vc4_blend(struct vc4_compile *c, struct qreg *result,
1205           struct qreg *dst_color, struct qreg *src_color)
1206 {
1207         struct pipe_rt_blend_state *blend = &c->fs_key->blend;
1208
1209         if (!blend->blend_enable) {
1210                 for (int i = 0; i < 4; i++)
1211                         result[i] = src_color[i];
1212                 return;
1213         }
1214
1215         struct qreg clamped_src[4];
1216         struct qreg clamped_dst[4];
1217         for (int i = 0; i < 4; i++) {
1218                 clamped_src[i] = qir_SAT(c, src_color[i]);
1219                 clamped_dst[i] = qir_SAT(c, dst_color[i]);
1220         }
1221         src_color = clamped_src;
1222         dst_color = clamped_dst;
1223
1224         struct qreg src_blend[4], dst_blend[4];
1225         for (int i = 0; i < 3; i++) {
1226                 src_blend[i] = vc4_blend_channel(c,
1227                                                  dst_color, src_color,
1228                                                  src_color[i],
1229                                                  blend->rgb_src_factor, i);
1230                 dst_blend[i] = vc4_blend_channel(c,
1231                                                  dst_color, src_color,
1232                                                  dst_color[i],
1233                                                  blend->rgb_dst_factor, i);
1234         }
1235         src_blend[3] = vc4_blend_channel(c,
1236                                          dst_color, src_color,
1237                                          src_color[3],
1238                                          blend->alpha_src_factor, 3);
1239         dst_blend[3] = vc4_blend_channel(c,
1240                                          dst_color, src_color,
1241                                          dst_color[3],
1242                                          blend->alpha_dst_factor, 3);
1243
1244         for (int i = 0; i < 3; i++) {
1245                 result[i] = vc4_blend_func(c,
1246                                            src_blend[i], dst_blend[i],
1247                                            blend->rgb_func);
1248         }
1249         result[3] = vc4_blend_func(c,
1250                                    src_blend[3], dst_blend[3],
1251                                    blend->alpha_func);
1252 }
1253
1254 static void
1255 clip_distance_discard(struct vc4_compile *c)
1256 {
1257         for (int i = 0; i < PIPE_MAX_CLIP_PLANES; i++) {
1258                 if (!(c->key->ucp_enables & (1 << i)))
1259                         continue;
1260
1261                 struct qreg dist = emit_fragment_varying(c,
1262                                                          TGSI_SEMANTIC_CLIPDIST,
1263                                                          i,
1264                                                          TGSI_SWIZZLE_X);
1265
1266                 qir_SF(c, dist);
1267
1268                 if (c->discard.file == QFILE_NULL)
1269                         c->discard = qir_uniform_ui(c, 0);
1270
1271                 c->discard = qir_SEL_X_Y_NS(c, qir_uniform_ui(c, ~0),
1272                                             c->discard);
1273         }
1274 }
1275
1276 static void
1277 alpha_test_discard(struct vc4_compile *c)
1278 {
1279         struct qreg src_alpha;
1280         struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0);
1281
1282         if (!c->fs_key->alpha_test)
1283                 return;
1284
1285         if (c->output_color_index != -1)
1286                 src_alpha = c->outputs[c->output_color_index + 3];
1287         else
1288                 src_alpha = qir_uniform_f(c, 1.0);
1289
1290         if (c->discard.file == QFILE_NULL)
1291                 c->discard = qir_uniform_ui(c, 0);
1292
1293         switch (c->fs_key->alpha_test_func) {
1294         case PIPE_FUNC_NEVER:
1295                 c->discard = qir_uniform_ui(c, ~0);
1296                 break;
1297         case PIPE_FUNC_ALWAYS:
1298                 break;
1299         case PIPE_FUNC_EQUAL:
1300                 qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
1301                 c->discard = qir_SEL_X_Y_ZS(c, c->discard,
1302                                             qir_uniform_ui(c, ~0));
1303                 break;
1304         case PIPE_FUNC_NOTEQUAL:
1305                 qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
1306                 c->discard = qir_SEL_X_Y_ZC(c, c->discard,
1307                                             qir_uniform_ui(c, ~0));
1308                 break;
1309         case PIPE_FUNC_GREATER:
1310                 qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
1311                 c->discard = qir_SEL_X_Y_NC(c, c->discard,
1312                                             qir_uniform_ui(c, ~0));
1313                 break;
1314         case PIPE_FUNC_GEQUAL:
1315                 qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
1316                 c->discard = qir_SEL_X_Y_NS(c, c->discard,
1317                                             qir_uniform_ui(c, ~0));
1318                 break;
1319         case PIPE_FUNC_LESS:
1320                 qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
1321                 c->discard = qir_SEL_X_Y_NS(c, c->discard,
1322                                             qir_uniform_ui(c, ~0));
1323                 break;
1324         case PIPE_FUNC_LEQUAL:
1325                 qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
1326                 c->discard = qir_SEL_X_Y_NC(c, c->discard,
1327                                             qir_uniform_ui(c, ~0));
1328                 break;
1329         }
1330 }
1331
1332 static struct qreg
1333 vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst)
1334 {
1335         switch (c->fs_key->logicop_func) {
1336         case PIPE_LOGICOP_CLEAR:
1337                 return qir_uniform_f(c, 0.0);
1338         case PIPE_LOGICOP_NOR:
1339                 return qir_NOT(c, qir_OR(c, src, dst));
1340         case PIPE_LOGICOP_AND_INVERTED:
1341                 return qir_AND(c, qir_NOT(c, src), dst);
1342         case PIPE_LOGICOP_COPY_INVERTED:
1343                 return qir_NOT(c, src);
1344         case PIPE_LOGICOP_AND_REVERSE:
1345                 return qir_AND(c, src, qir_NOT(c, dst));
1346         case PIPE_LOGICOP_INVERT:
1347                 return qir_NOT(c, dst);
1348         case PIPE_LOGICOP_XOR:
1349                 return qir_XOR(c, src, dst);
1350         case PIPE_LOGICOP_NAND:
1351                 return qir_NOT(c, qir_AND(c, src, dst));
1352         case PIPE_LOGICOP_AND:
1353                 return qir_AND(c, src, dst);
1354         case PIPE_LOGICOP_EQUIV:
1355                 return qir_NOT(c, qir_XOR(c, src, dst));
1356         case PIPE_LOGICOP_NOOP:
1357                 return dst;
1358         case PIPE_LOGICOP_OR_INVERTED:
1359                 return qir_OR(c, qir_NOT(c, src), dst);
1360         case PIPE_LOGICOP_OR_REVERSE:
1361                 return qir_OR(c, src, qir_NOT(c, dst));
1362         case PIPE_LOGICOP_OR:
1363                 return qir_OR(c, src, dst);
1364         case PIPE_LOGICOP_SET:
1365                 return qir_uniform_ui(c, ~0);
1366         case PIPE_LOGICOP_COPY:
1367         default:
1368                 return src;
1369         }
1370 }
1371
1372 /**
1373  * Applies the GL blending pipeline and returns the packed (8888) output
1374  * color.
1375  */
1376 static struct qreg
1377 blend_pipeline(struct vc4_compile *c)
1378 {
1379         enum pipe_format color_format = c->fs_key->color_format;
1380         const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
1381         struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef };
1382         struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
1383         struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
1384         struct qreg packed_dst_color = c->undef;
1385
1386         if (c->fs_key->blend.blend_enable ||
1387             c->fs_key->blend.colormask != 0xf ||
1388             c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
1389                 struct qreg r4 = qir_TLB_COLOR_READ(c);
1390                 for (int i = 0; i < 4; i++)
1391                         tlb_read_color[i] = qir_R4_UNPACK(c, r4, i);
1392                 for (int i = 0; i < 4; i++) {
1393                         dst_color[i] = get_swizzled_channel(c,
1394                                                             tlb_read_color,
1395                                                             format_swiz[i]);
1396                         if (util_format_is_srgb(color_format) && i != 3) {
1397                                 linear_dst_color[i] =
1398                                         qir_srgb_decode(c, dst_color[i]);
1399                         } else {
1400                                 linear_dst_color[i] = dst_color[i];
1401                         }
1402                 }
1403
1404                 /* Save the packed value for logic ops.  Can't reuse r4
1405                  * because other things might smash it (like sRGB)
1406                  */
1407                 packed_dst_color = qir_MOV(c, r4);
1408         }
1409
1410         struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef };
1411         const struct qreg *output_colors = (c->output_color_index != -1 ?
1412                                             c->outputs + c->output_color_index :
1413                                             undef_array);
1414         struct qreg blend_src_color[4];
1415         for (int i = 0; i < 4; i++)
1416                 blend_src_color[i] = output_colors[i];
1417
1418         struct qreg blend_color[4];
1419         vc4_blend(c, blend_color, linear_dst_color, blend_src_color);
1420
1421         if (util_format_is_srgb(color_format)) {
1422                 for (int i = 0; i < 3; i++)
1423                         blend_color[i] = qir_srgb_encode(c, blend_color[i]);
1424         }
1425
1426         /* Debug: Sometimes you're getting a black output and just want to see
1427          * if the FS is getting executed at all.  Spam magenta into the color
1428          * output.
1429          */
1430         if (0) {
1431                 blend_color[0] = qir_uniform_f(c, 1.0);
1432                 blend_color[1] = qir_uniform_f(c, 0.0);
1433                 blend_color[2] = qir_uniform_f(c, 1.0);
1434                 blend_color[3] = qir_uniform_f(c, 0.5);
1435         }
1436
1437         struct qreg swizzled_outputs[4];
1438         for (int i = 0; i < 4; i++) {
1439                 swizzled_outputs[i] = get_swizzled_channel(c, blend_color,
1440                                                            format_swiz[i]);
1441         }
1442
1443         struct qreg packed_color = c->undef;
1444         for (int i = 0; i < 4; i++) {
1445                 if (swizzled_outputs[i].file == QFILE_NULL)
1446                         continue;
1447                 if (packed_color.file == QFILE_NULL) {
1448                         packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]);
1449                 } else {
1450                         packed_color = qir_PACK_8_F(c,
1451                                                     packed_color,
1452                                                     swizzled_outputs[i],
1453                                                     i);
1454                 }
1455         }
1456
1457         if (packed_color.file == QFILE_NULL)
1458                 packed_color = qir_uniform_ui(c, 0);
1459
1460         if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
1461                 packed_color = vc4_logicop(c, packed_color, packed_dst_color);
1462         }
1463
1464         /* If the bit isn't set in the color mask, then just return the
1465          * original dst color, instead.
1466          */
1467         uint32_t colormask = 0xffffffff;
1468         for (int i = 0; i < 4; i++) {
1469                 if (format_swiz[i] < 4 &&
1470                     !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) {
1471                         colormask &= ~(0xff << (i * 8));
1472                 }
1473         }
1474         if (colormask != 0xffffffff) {
1475                 packed_color = qir_OR(c,
1476                                       qir_AND(c, packed_color,
1477                                               qir_uniform_ui(c, colormask)),
1478                                       qir_AND(c, packed_dst_color,
1479                                               qir_uniform_ui(c, ~colormask)));
1480         }
1481
1482         return packed_color;
1483 }
1484
1485 static void
1486 emit_frag_end(struct vc4_compile *c)
1487 {
1488         clip_distance_discard(c);
1489         alpha_test_discard(c);
1490         struct qreg color = blend_pipeline(c);
1491
1492         if (c->discard.file != QFILE_NULL)
1493                 qir_TLB_DISCARD_SETUP(c, c->discard);
1494
1495         if (c->fs_key->stencil_enabled) {
1496                 qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 0));
1497                 if (c->fs_key->stencil_twoside) {
1498                         qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 1));
1499                 }
1500                 if (c->fs_key->stencil_full_writemasks) {
1501                         qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 2));
1502                 }
1503         }
1504
1505         if (c->fs_key->depth_enabled) {
1506                 struct qreg z;
1507                 if (c->output_position_index != -1) {
1508                         z = qir_FTOI(c, qir_FMUL(c, c->outputs[c->output_position_index + 2],
1509                                                  qir_uniform_f(c, 0xffffff)));
1510                 } else {
1511                         z = qir_FRAG_Z(c);
1512                 }
1513                 qir_TLB_Z_WRITE(c, z);
1514         }
1515
1516         qir_TLB_COLOR_WRITE(c, color);
1517 }
1518
1519 static void
1520 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
1521 {
1522         struct qreg xyi[2];
1523
1524         for (int i = 0; i < 2; i++) {
1525                 struct qreg scale =
1526                         qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
1527
1528                 xyi[i] = qir_FTOI(c, qir_FMUL(c,
1529                                               qir_FMUL(c,
1530                                                        c->outputs[c->output_position_index + i],
1531                                                        scale),
1532                                               rcp_w));
1533         }
1534
1535         qir_VPM_WRITE(c, qir_PACK_SCALED(c, xyi[0], xyi[1]));
1536 }
1537
1538 static void
1539 emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
1540 {
1541         struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1542         struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1543
1544         qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
1545                                                           c->outputs[c->output_position_index + 2],
1546                                                           zscale),
1547                                               rcp_w),
1548                                   zoffset));
1549 }
1550
1551 static void
1552 emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
1553 {
1554         qir_VPM_WRITE(c, rcp_w);
1555 }
1556
1557 static void
1558 emit_point_size_write(struct vc4_compile *c)
1559 {
1560         struct qreg point_size;
1561
1562         if (c->output_point_size_index != -1)
1563                 point_size = c->outputs[c->output_point_size_index + 3];
1564         else
1565                 point_size = qir_uniform_f(c, 1.0);
1566
1567         /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
1568          * BCM21553).
1569          */
1570         point_size = qir_FMAX(c, point_size, qir_uniform_f(c, .125));
1571
1572         qir_VPM_WRITE(c, point_size);
1573 }
1574
1575 /**
1576  * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
1577  *
1578  * The simulator insists that there be at least one vertex attribute, so
1579  * vc4_draw.c will emit one if it wouldn't have otherwise.  The simulator also
1580  * insists that all vertex attributes loaded get read by the VS/CS, so we have
1581  * to consume it here.
1582  */
1583 static void
1584 emit_stub_vpm_read(struct vc4_compile *c)
1585 {
1586         if (c->num_inputs)
1587                 return;
1588
1589         c->vattr_sizes[0] = 4;
1590         struct qreg vpm = { QFILE_VPM, 0 };
1591         (void)qir_MOV(c, vpm);
1592         c->num_inputs++;
1593 }
1594
1595 static void
1596 emit_ucp_clipdistance(struct vc4_compile *c)
1597 {
1598         unsigned cv;
1599         if (c->output_clipvertex_index != -1)
1600                 cv = c->output_clipvertex_index;
1601         else if (c->output_position_index != -1)
1602                 cv = c->output_position_index;
1603         else
1604                 return;
1605
1606         for (int plane = 0; plane < PIPE_MAX_CLIP_PLANES; plane++) {
1607                 if (!(c->key->ucp_enables & (1 << plane)))
1608                         continue;
1609
1610                 /* Pick the next outputs[] that hasn't been written to, since
1611                  * there are no other program writes left to be processed at
1612                  * this point.  If something had been declared but not written
1613                  * (like a w component), we'll just smash over the top of it.
1614                  */
1615                 uint32_t output_index = c->num_outputs++;
1616                 add_output(c, output_index,
1617                            TGSI_SEMANTIC_CLIPDIST,
1618                            plane,
1619                            TGSI_SWIZZLE_X);
1620
1621
1622                 struct qreg dist = qir_uniform_f(c, 0.0);
1623                 for (int i = 0; i < 4; i++) {
1624                         struct qreg pos_chan = c->outputs[cv + i];
1625                         struct qreg ucp =
1626                                 qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1627                                             plane * 4 + i);
1628                         dist = qir_FADD(c, dist, qir_FMUL(c, pos_chan, ucp));
1629                 }
1630
1631                 c->outputs[output_index] = dist;
1632         }
1633 }
1634
1635 static void
1636 emit_vert_end(struct vc4_compile *c,
1637               struct vc4_varying_semantic *fs_inputs,
1638               uint32_t num_fs_inputs)
1639 {
1640         struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]);
1641
1642         emit_stub_vpm_read(c);
1643         emit_ucp_clipdistance(c);
1644
1645         emit_scaled_viewport_write(c, rcp_w);
1646         emit_zs_write(c, rcp_w);
1647         emit_rcp_wc_write(c, rcp_w);
1648         if (c->vs_key->per_vertex_point_size)
1649                 emit_point_size_write(c);
1650
1651         for (int i = 0; i < num_fs_inputs; i++) {
1652                 struct vc4_varying_semantic *input = &fs_inputs[i];
1653                 int j;
1654
1655                 for (j = 0; j < c->num_outputs; j++) {
1656                         struct vc4_varying_semantic *output =
1657                                 &c->output_semantics[j];
1658
1659                         if (input->semantic == output->semantic &&
1660                             input->index == output->index &&
1661                             input->swizzle == output->swizzle) {
1662                                 qir_VPM_WRITE(c, c->outputs[j]);
1663                                 break;
1664                         }
1665                 }
1666                 /* Emit padding if we didn't find a declared VS output for
1667                  * this FS input.
1668                  */
1669                 if (j == c->num_outputs)
1670                         qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
1671         }
1672 }
1673
1674 static void
1675 emit_coord_end(struct vc4_compile *c)
1676 {
1677         struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]);
1678
1679         emit_stub_vpm_read(c);
1680
1681         for (int i = 0; i < 4; i++)
1682                 qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
1683
1684         emit_scaled_viewport_write(c, rcp_w);
1685         emit_zs_write(c, rcp_w);
1686         emit_rcp_wc_write(c, rcp_w);
1687         if (c->vs_key->per_vertex_point_size)
1688                 emit_point_size_write(c);
1689 }
1690
/**
 * Runs the NIR lowering and optimization passes repeatedly until a full
 * iteration of the optimization passes reports no further progress.
 */
static void
vc4_optimize_nir(struct nir_shader *s)
{
        bool progress;

        do {
                /* The lowering passes run on every iteration; their results
                 * do not feed the progress flag.
                 */
                nir_lower_vars_to_ssa(s);
                nir_lower_alu_to_scalar(s);

                progress = nir_copy_prop(s);
                progress |= nir_opt_dce(s);
                progress |= nir_opt_cse(s);
                progress |= nir_opt_peephole_select(s);
                progress |= nir_opt_algebraic(s);
                progress |= nir_opt_constant_folding(s);
        } while (progress);
}
1710
1711 static int
1712 driver_location_compare(const void *in_a, const void *in_b)
1713 {
1714         const nir_variable *const *a = in_a;
1715         const nir_variable *const *b = in_b;
1716
1717         return (*a)->data.driver_location - (*b)->data.driver_location;
1718 }
1719
1720 static void
1721 ntq_setup_inputs(struct vc4_compile *c)
1722 {
1723         unsigned num_entries = 0;
1724         foreach_list_typed(nir_variable, var, node, &c->s->inputs)
1725                 num_entries++;
1726
1727         nir_variable *vars[num_entries];
1728
1729         unsigned i = 0;
1730         foreach_list_typed(nir_variable, var, node, &c->s->inputs)
1731                 vars[i++] = var;
1732
1733         /* Sort the variables so that we emit the input setup in
1734          * driver_location order.  This is required for VPM reads, whose data
1735          * is fetched into the VPM in driver_location (TGSI register index)
1736          * order.
1737          */
1738         qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1739
1740         for (unsigned i = 0; i < num_entries; i++) {
1741                 nir_variable *var = vars[i];
1742                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1743                 /* XXX: map loc slots to semantics */
1744                 unsigned semantic_name = var->data.location;
1745                 unsigned semantic_index = var->data.index;
1746                 unsigned loc = var->data.driver_location;
1747
1748                 assert(array_len == 1);
1749                 (void)array_len;
1750                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1751                                   (loc + 1) * 4);
1752
1753                 if (c->stage == QSTAGE_FRAG) {
1754                         if (semantic_name == TGSI_SEMANTIC_POSITION) {
1755                                 emit_fragcoord_input(c, loc);
1756                         } else if (semantic_name == TGSI_SEMANTIC_FACE) {
1757                                 emit_face_input(c, loc);
1758                         } else if (semantic_name == TGSI_SEMANTIC_GENERIC &&
1759                                    (c->fs_key->point_sprite_mask &
1760                                     (1 << semantic_index))) {
1761                                 emit_point_coord_input(c, loc);
1762                         } else {
1763                                 emit_fragment_input(c, loc,
1764                                                     semantic_name,
1765                                                     semantic_index);
1766                         }
1767                 } else {
1768                         emit_vertex_input(c, loc);
1769                 }
1770         }
1771 }
1772
1773 static void
1774 ntq_setup_outputs(struct vc4_compile *c)
1775 {
1776         foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
1777                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1778                 /* XXX: map loc slots to semantics */
1779                 unsigned semantic_name = var->data.location;
1780                 unsigned semantic_index = var->data.index;
1781                 unsigned loc = var->data.driver_location * 4;
1782
1783                 assert(array_len == 1);
1784                 (void)array_len;
1785
1786                 /* NIR hack to pass through
1787                  * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */
1788                 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1789                     semantic_index == -1)
1790                         semantic_index = 0;
1791
1792                 for (int i = 0; i < 4; i++) {
1793                         add_output(c,
1794                                    loc + i,
1795                                    semantic_name,
1796                                    semantic_index,
1797                                    i);
1798                 }
1799
1800                 switch (semantic_name) {
1801                 case TGSI_SEMANTIC_POSITION:
1802                         c->output_position_index = loc;
1803                         break;
1804                 case TGSI_SEMANTIC_CLIPVERTEX:
1805                         c->output_clipvertex_index = loc;
1806                         break;
1807                 case TGSI_SEMANTIC_COLOR:
1808                         c->output_color_index = loc;
1809                         break;
1810                 case TGSI_SEMANTIC_PSIZE:
1811                         c->output_point_size_index = loc;
1812                         break;
1813                 }
1814
1815         }
1816 }
1817
1818 static void
1819 ntq_setup_uniforms(struct vc4_compile *c)
1820 {
1821         foreach_list_typed(nir_variable, var, node, &c->s->uniforms) {
1822                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1823                 unsigned array_elem_size = 4 * sizeof(float);
1824
1825                 declare_uniform_range(c, var->data.driver_location * array_elem_size,
1826                                       array_len * array_elem_size);
1827
1828         }
1829 }
1830
1831 /**
1832  * Sets up the mapping from nir_register to struct qreg *.
1833  *
1834  * Each nir_register gets a struct qreg per 32-bit component being stored.
1835  */
1836 static void
1837 ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
1838 {
1839         foreach_list_typed(nir_register, nir_reg, node, list) {
1840                 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1841                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1842                                                   array_len *
1843                                                   nir_reg->num_components);
1844
1845                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1846
1847                 for (int i = 0; i < array_len * nir_reg->num_components; i++)
1848                         qregs[i] = qir_uniform_ui(c, 0);
1849         }
1850 }
1851
1852 static void
1853 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
1854 {
1855         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1856                                           instr->def.num_components);
1857         for (int i = 0; i < instr->def.num_components; i++)
1858                 qregs[i] = qir_uniform_ui(c, instr->value.u[i]);
1859
1860         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1861 }
1862
1863 static void
1864 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
1865 {
1866         const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
1867         struct qreg *dest = NULL;
1868
1869         if (info->has_dest) {
1870                 dest = ntq_get_dest(c, instr->dest);
1871         }
1872
1873         switch (instr->intrinsic) {
1874         case nir_intrinsic_load_uniform:
1875                 for (int i = 0; i < instr->num_components; i++) {
1876                         dest[i] = qir_uniform(c, QUNIFORM_UNIFORM,
1877                                               instr->const_index[0] * 4 + i);
1878                 }
1879                 break;
1880
1881         case nir_intrinsic_load_uniform_indirect:
1882                 for (int i = 0; i < instr->num_components; i++) {
1883                         dest[i] = indirect_uniform_load(c,
1884                                                         ntq_get_src(c, instr->src[0], 0),
1885                                                         (instr->const_index[0] *
1886                                                          4 + i) * sizeof(float));
1887                 }
1888
1889                 break;
1890
1891         case nir_intrinsic_load_input:
1892                 for (int i = 0; i < instr->num_components; i++)
1893                         dest[i] = c->inputs[instr->const_index[0] * 4 + i];
1894
1895                 break;
1896
1897         case nir_intrinsic_store_output:
1898                 for (int i = 0; i < instr->num_components; i++) {
1899                         c->outputs[instr->const_index[0] * 4 + i] =
1900                                 qir_MOV(c, ntq_get_src(c, instr->src[0], i));
1901                 }
1902                 c->num_outputs = MAX2(c->num_outputs,
1903                                       instr->const_index[0] * 4 +
1904                                       instr->num_components + 1);
1905                 break;
1906
1907         case nir_intrinsic_discard:
1908                 c->discard = qir_uniform_ui(c, ~0);
1909                 break;
1910
1911         case nir_intrinsic_discard_if:
1912                 if (c->discard.file == QFILE_NULL)
1913                         c->discard = qir_uniform_ui(c, 0);
1914                 c->discard = qir_OR(c, c->discard,
1915                                     ntq_get_src(c, instr->src[0], 0));
1916                 break;
1917
1918         default:
1919                 fprintf(stderr, "Unknown intrinsic: ");
1920                 nir_print_instr(&instr->instr, stderr);
1921                 fprintf(stderr, "\n");
1922                 break;
1923         }
1924 }
1925
1926 static void
1927 ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
1928 {
1929         fprintf(stderr, "general IF statements not handled.\n");
1930 }
1931
1932 static void
1933 ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
1934 {
1935         switch (instr->type) {
1936         case nir_instr_type_alu:
1937                 ntq_emit_alu(c, nir_instr_as_alu(instr));
1938                 break;
1939
1940         case nir_instr_type_intrinsic:
1941                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1942                 break;
1943
1944         case nir_instr_type_load_const:
1945                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
1946                 break;
1947
1948         case nir_instr_type_tex:
1949                 ntq_emit_tex(c, nir_instr_as_tex(instr));
1950                 break;
1951
1952         default:
1953                 fprintf(stderr, "Unknown NIR instr type: ");
1954                 nir_print_instr(instr, stderr);
1955                 fprintf(stderr, "\n");
1956                 abort();
1957         }
1958 }
1959
1960 static void
1961 ntq_emit_block(struct vc4_compile *c, nir_block *block)
1962 {
1963         nir_foreach_instr(block, instr) {
1964                 ntq_emit_instr(c, instr);
1965         }
1966 }
1967
1968 static void
1969 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
1970 {
1971         foreach_list_typed(nir_cf_node, node, node, list) {
1972                 switch (node->type) {
1973                         /* case nir_cf_node_loop: */
1974                 case nir_cf_node_block:
1975                         ntq_emit_block(c, nir_cf_node_as_block(node));
1976                         break;
1977
1978                 case nir_cf_node_if:
1979                         ntq_emit_if(c, nir_cf_node_as_if(node));
1980                         break;
1981
1982                 default:
1983                         assert(0);
1984                 }
1985         }
1986 }
1987
1988 static void
1989 ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
1990 {
1991         ntq_setup_registers(c, &impl->registers);
1992         ntq_emit_cf_list(c, &impl->body);
1993 }
1994
1995 static void
1996 nir_to_qir(struct vc4_compile *c)
1997 {
1998         ntq_setup_inputs(c);
1999         ntq_setup_outputs(c);
2000         ntq_setup_uniforms(c);
2001         ntq_setup_registers(c, &c->s->registers);
2002
2003         /* Find the main function and emit the body. */
2004         nir_foreach_overload(c->s, overload) {
2005                 assert(strcmp(overload->function->name, "main") == 0);
2006                 assert(overload->impl);
2007                 ntq_emit_impl(c, overload->impl);
2008         }
2009 }
2010
2011 static const nir_shader_compiler_options nir_options = {
2012         .lower_ffma = true,
2013         .lower_flrp = true,
2014         .lower_fpow = true,
2015         .lower_fsat = true,
2016         .lower_fsqrt = true,
2017         .lower_negate = true,
2018 };
2019
2020 static bool
2021 count_nir_instrs_in_block(nir_block *block, void *state)
2022 {
2023         int *count = (int *) state;
2024         nir_foreach_instr(block, instr) {
2025                 *count = *count + 1;
2026         }
2027         return true;
2028 }
2029
2030 static int
2031 count_nir_instrs(nir_shader *nir)
2032 {
2033         int count = 0;
2034         nir_foreach_overload(nir, overload) {
2035                 if (!overload->impl)
2036                         continue;
2037                 nir_foreach_block(overload->impl, count_nir_instrs_in_block, &count);
2038         }
2039         return count;
2040 }
2041
2042 static struct vc4_compile *
2043 vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
2044                        struct vc4_key *key)
2045 {
2046         struct vc4_compile *c = qir_compile_init();
2047
2048         c->stage = stage;
2049         c->shader_state = &key->shader_state->base;
2050         c->program_id = key->shader_state->program_id;
2051         c->variant_id = key->shader_state->compiled_variant_count++;
2052
2053         c->key = key;
2054         switch (stage) {
2055         case QSTAGE_FRAG:
2056                 c->fs_key = (struct vc4_fs_key *)key;
2057                 if (c->fs_key->is_points) {
2058                         c->point_x = emit_fragment_varying(c, ~0, ~0, 0);
2059                         c->point_y = emit_fragment_varying(c, ~0, ~0, 0);
2060                 } else if (c->fs_key->is_lines) {
2061                         c->line_x = emit_fragment_varying(c, ~0, ~0, 0);
2062                 }
2063                 break;
2064         case QSTAGE_VERT:
2065                 c->vs_key = (struct vc4_vs_key *)key;
2066                 break;
2067         case QSTAGE_COORD:
2068                 c->vs_key = (struct vc4_vs_key *)key;
2069                 break;
2070         }
2071
2072         const struct tgsi_token *tokens = key->shader_state->base.tokens;
2073         if (c->fs_key && c->fs_key->light_twoside) {
2074                 if (!key->shader_state->twoside_tokens) {
2075                         const struct tgsi_lowering_config lowering_config = {
2076                                 .color_two_side = true,
2077                         };
2078                         struct tgsi_shader_info info;
2079                         key->shader_state->twoside_tokens =
2080                                 tgsi_transform_lowering(&lowering_config,
2081                                                         key->shader_state->base.tokens,
2082                                                         &info);
2083
2084                         /* If no transformation occurred, then NULL is
2085                          * returned and we just use our original tokens.
2086                          */
2087                         if (!key->shader_state->twoside_tokens) {
2088                                 key->shader_state->twoside_tokens =
2089                                         key->shader_state->base.tokens;
2090                         }
2091                 }
2092                 tokens = key->shader_state->twoside_tokens;
2093         }
2094
2095         if (vc4_debug & VC4_DEBUG_TGSI) {
2096                 fprintf(stderr, "%s prog %d/%d TGSI:\n",
2097                         qir_get_stage_name(c->stage),
2098                         c->program_id, c->variant_id);
2099                 tgsi_dump(tokens, 0);
2100         }
2101
2102         c->s = tgsi_to_nir(tokens, &nir_options);
2103         nir_opt_global_to_local(c->s);
2104         nir_convert_to_ssa(c->s);
2105         nir_lower_idiv(c->s);
2106
2107         vc4_optimize_nir(c->s);
2108
2109         nir_remove_dead_variables(c->s);
2110
2111         nir_convert_from_ssa(c->s, false);
2112
2113         if (vc4_debug & VC4_DEBUG_SHADERDB) {
2114                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",
2115                         qir_get_stage_name(c->stage),
2116                         c->program_id, c->variant_id,
2117                         count_nir_instrs(c->s));
2118         }
2119
2120         if (vc4_debug & VC4_DEBUG_NIR) {
2121                 fprintf(stderr, "%s prog %d/%d NIR:\n",
2122                         qir_get_stage_name(c->stage),
2123                         c->program_id, c->variant_id);
2124                 nir_print_shader(c->s, stderr);
2125         }
2126
2127         nir_to_qir(c);
2128
2129         switch (stage) {
2130         case QSTAGE_FRAG:
2131                 emit_frag_end(c);
2132                 break;
2133         case QSTAGE_VERT:
2134                 emit_vert_end(c,
2135                               vc4->prog.fs->input_semantics,
2136                               vc4->prog.fs->num_inputs);
2137                 break;
2138         case QSTAGE_COORD:
2139                 emit_coord_end(c);
2140                 break;
2141         }
2142
2143         if (vc4_debug & VC4_DEBUG_QIR) {
2144                 fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",
2145                         qir_get_stage_name(c->stage),
2146                         c->program_id, c->variant_id);
2147                 qir_dump(c);
2148         }
2149
2150         qir_optimize(c);
2151         qir_lower_uniforms(c);
2152
2153         if (vc4_debug & VC4_DEBUG_QIR) {
2154                 fprintf(stderr, "%s prog %d/%d QIR:\n",
2155                         qir_get_stage_name(c->stage),
2156                         c->program_id, c->variant_id);
2157                 qir_dump(c);
2158         }
2159         qir_reorder_uniforms(c);
2160         vc4_generate_code(vc4, c);
2161
2162         if (vc4_debug & VC4_DEBUG_SHADERDB) {
2163                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",
2164                         qir_get_stage_name(c->stage),
2165                         c->program_id, c->variant_id,
2166                         c->qpu_inst_count);
2167                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n",
2168                         qir_get_stage_name(c->stage),
2169                         c->program_id, c->variant_id,
2170                         c->num_uniforms);
2171         }
2172
2173         ralloc_free(c->s);
2174
2175         return c;
2176 }
2177
2178 static void *
2179 vc4_shader_state_create(struct pipe_context *pctx,
2180                         const struct pipe_shader_state *cso)
2181 {
2182         struct vc4_context *vc4 = vc4_context(pctx);
2183         struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);
2184         if (!so)
2185                 return NULL;
2186
2187         so->base.tokens = tgsi_dup_tokens(cso->tokens);
2188         so->program_id = vc4->next_uncompiled_program_id++;
2189
2190         return so;
2191 }
2192
2193 static void
2194 copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
2195                              struct vc4_compile *c)
2196 {
2197         int count = c->num_uniforms;
2198         struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
2199
2200         uinfo->count = count;
2201         uinfo->data = ralloc_array(shader, uint32_t, count);
2202         memcpy(uinfo->data, c->uniform_data,
2203                count * sizeof(*uinfo->data));
2204         uinfo->contents = ralloc_array(shader, enum quniform_contents, count);
2205         memcpy(uinfo->contents, c->uniform_contents,
2206                count * sizeof(*uinfo->contents));
2207         uinfo->num_texture_samples = c->num_texture_samples;
2208 }
2209
2210 static struct vc4_compiled_shader *
2211 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
2212                         struct vc4_key *key)
2213 {
2214         struct hash_table *ht;
2215         uint32_t key_size;
2216         if (stage == QSTAGE_FRAG) {
2217                 ht = vc4->fs_cache;
2218                 key_size = sizeof(struct vc4_fs_key);
2219         } else {
2220                 ht = vc4->vs_cache;
2221                 key_size = sizeof(struct vc4_vs_key);
2222         }
2223
2224         struct vc4_compiled_shader *shader;
2225         struct hash_entry *entry = _mesa_hash_table_search(ht, key);
2226         if (entry)
2227                 return entry->data;
2228
2229         struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key);
2230         shader = rzalloc(NULL, struct vc4_compiled_shader);
2231
2232         shader->program_id = vc4->next_compiled_program_id++;
2233         if (stage == QSTAGE_FRAG) {
2234                 bool input_live[c->num_input_semantics];
2235
2236                 memset(input_live, 0, sizeof(input_live));
2237                 list_for_each_entry(struct qinst, inst, &c->instructions, link) {
2238                         for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
2239                                 if (inst->src[i].file == QFILE_VARY)
2240                                         input_live[inst->src[i].index] = true;
2241                         }
2242                 }
2243
2244                 shader->input_semantics = ralloc_array(shader,
2245                                                        struct vc4_varying_semantic,
2246                                                        c->num_input_semantics);
2247
2248                 for (int i = 0; i < c->num_input_semantics; i++) {
2249                         struct vc4_varying_semantic *sem = &c->input_semantics[i];
2250
2251                         if (!input_live[i])
2252                                 continue;
2253
2254                         /* Skip non-VS-output inputs. */
2255                         if (sem->semantic == (uint8_t)~0)
2256                                 continue;
2257
2258                         if (sem->semantic == TGSI_SEMANTIC_COLOR ||
2259                             sem->semantic == TGSI_SEMANTIC_BCOLOR) {
2260                                 shader->color_inputs |= (1 << shader->num_inputs);
2261                         }
2262
2263                         shader->input_semantics[shader->num_inputs] = *sem;
2264                         shader->num_inputs++;
2265                 }
2266         } else {
2267                 shader->num_inputs = c->num_inputs;
2268
2269                 shader->vattr_offsets[0] = 0;
2270                 for (int i = 0; i < 8; i++) {
2271                         shader->vattr_offsets[i + 1] =
2272                                 shader->vattr_offsets[i] + c->vattr_sizes[i];
2273
2274                         if (c->vattr_sizes[i])
2275                                 shader->vattrs_live |= (1 << i);
2276                 }
2277         }
2278
2279         copy_uniform_state_to_shader(shader, c);
2280         shader->bo = vc4_bo_alloc_mem(vc4->screen, c->qpu_insts,
2281                                       c->qpu_inst_count * sizeof(uint64_t),
2282                                       "code");
2283
2284         /* Copy the compiler UBO range state to the compiled shader, dropping
2285          * out arrays that were never referenced by an indirect load.
2286          *
2287          * (Note that QIR dead code elimination of an array access still
2288          * leaves that array alive, though)
2289          */
2290         if (c->num_ubo_ranges) {
2291                 shader->num_ubo_ranges = c->num_ubo_ranges;
2292                 shader->ubo_ranges = ralloc_array(shader, struct vc4_ubo_range,
2293                                                   c->num_ubo_ranges);
2294                 uint32_t j = 0;
2295                 for (int i = 0; i < c->num_uniform_ranges; i++) {
2296                         struct vc4_compiler_ubo_range *range =
2297                                 &c->ubo_ranges[i];
2298                         if (!range->used)
2299                                 continue;
2300
2301                         shader->ubo_ranges[j].dst_offset = range->dst_offset;
2302                         shader->ubo_ranges[j].src_offset = range->src_offset;
2303                         shader->ubo_ranges[j].size = range->size;
2304                         shader->ubo_size += c->ubo_ranges[i].size;
2305                         j++;
2306                 }
2307         }
2308         if (shader->ubo_size) {
2309                 if (vc4_debug & VC4_DEBUG_SHADERDB) {
2310                         fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
2311                                 qir_get_stage_name(c->stage),
2312                                 c->program_id, c->variant_id,
2313                                 shader->ubo_size / 4);
2314                 }
2315         }
2316
2317         qir_compile_destroy(c);
2318
2319         struct vc4_key *dup_key;
2320         dup_key = ralloc_size(shader, key_size);
2321         memcpy(dup_key, key, key_size);
2322         _mesa_hash_table_insert(ht, dup_key, shader);
2323
2324         return shader;
2325 }
2326
2327 static void
2328 vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
2329                      struct vc4_texture_stateobj *texstate)
2330 {
2331         for (int i = 0; i < texstate->num_textures; i++) {
2332                 struct pipe_sampler_view *sampler = texstate->textures[i];
2333                 struct pipe_sampler_state *sampler_state =
2334                         texstate->samplers[i];
2335
2336                 if (sampler) {
2337                         key->tex[i].format = sampler->format;
2338                         key->tex[i].swizzle[0] = sampler->swizzle_r;
2339                         key->tex[i].swizzle[1] = sampler->swizzle_g;
2340                         key->tex[i].swizzle[2] = sampler->swizzle_b;
2341                         key->tex[i].swizzle[3] = sampler->swizzle_a;
2342                         key->tex[i].compare_mode = sampler_state->compare_mode;
2343                         key->tex[i].compare_func = sampler_state->compare_func;
2344                         key->tex[i].wrap_s = sampler_state->wrap_s;
2345                         key->tex[i].wrap_t = sampler_state->wrap_t;
2346                 }
2347         }
2348
2349         key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;
2350 }
2351
2352 static void
2353 vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
2354 {
2355         struct vc4_fs_key local_key;
2356         struct vc4_fs_key *key = &local_key;
2357
2358         if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2359                             VC4_DIRTY_BLEND |
2360                             VC4_DIRTY_FRAMEBUFFER |
2361                             VC4_DIRTY_ZSA |
2362                             VC4_DIRTY_RASTERIZER |
2363                             VC4_DIRTY_FRAGTEX |
2364                             VC4_DIRTY_TEXSTATE |
2365                             VC4_DIRTY_UNCOMPILED_FS))) {
2366                 return;
2367         }
2368
2369         memset(key, 0, sizeof(*key));
2370         vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);
2371         key->base.shader_state = vc4->prog.bind_fs;
2372         key->is_points = (prim_mode == PIPE_PRIM_POINTS);
2373         key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&
2374                          prim_mode <= PIPE_PRIM_LINE_STRIP);
2375         key->blend = vc4->blend->rt[0];
2376         if (vc4->blend->logicop_enable) {
2377                 key->logicop_func = vc4->blend->logicop_func;
2378         } else {
2379                 key->logicop_func = PIPE_LOGICOP_COPY;
2380         }
2381         if (vc4->framebuffer.cbufs[0])
2382                 key->color_format = vc4->framebuffer.cbufs[0]->format;
2383
2384         key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;
2385         key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;
2386         key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;
2387         key->depth_enabled = (vc4->zsa->base.depth.enabled ||
2388                               key->stencil_enabled);
2389         if (vc4->zsa->base.alpha.enabled) {
2390                 key->alpha_test = true;
2391                 key->alpha_test_func = vc4->zsa->base.alpha.func;
2392         }
2393
2394         if (key->is_points) {
2395                 key->point_sprite_mask =
2396                         vc4->rasterizer->base.sprite_coord_enable;
2397                 key->point_coord_upper_left =
2398                         (vc4->rasterizer->base.sprite_coord_mode ==
2399                          PIPE_SPRITE_COORD_UPPER_LEFT);
2400         }
2401
2402         key->light_twoside = vc4->rasterizer->base.light_twoside;
2403
2404         struct vc4_compiled_shader *old_fs = vc4->prog.fs;
2405         vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);
2406         if (vc4->prog.fs == old_fs)
2407                 return;
2408
2409         vc4->dirty |= VC4_DIRTY_COMPILED_FS;
2410         if (vc4->rasterizer->base.flatshade &&
2411             old_fs && vc4->prog.fs->color_inputs != old_fs->color_inputs) {
2412                 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
2413         }
2414 }
2415
2416 static void
2417 vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
2418 {
2419         struct vc4_vs_key local_key;
2420         struct vc4_vs_key *key = &local_key;
2421
2422         if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2423                             VC4_DIRTY_RASTERIZER |
2424                             VC4_DIRTY_VERTTEX |
2425                             VC4_DIRTY_TEXSTATE |
2426                             VC4_DIRTY_VTXSTATE |
2427                             VC4_DIRTY_UNCOMPILED_VS |
2428                             VC4_DIRTY_COMPILED_FS))) {
2429                 return;
2430         }
2431
2432         memset(key, 0, sizeof(*key));
2433         vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
2434         key->base.shader_state = vc4->prog.bind_vs;
2435         key->compiled_fs_id = vc4->prog.fs->program_id;
2436
2437         for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
2438                 key->attr_formats[i] = vc4->vtx->pipe[i].src_format;
2439
2440         key->per_vertex_point_size =
2441                 (prim_mode == PIPE_PRIM_POINTS &&
2442                  vc4->rasterizer->base.point_size_per_vertex);
2443
2444         vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
2445         key->is_coord = true;
2446         vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
2447 }
2448
/**
 * Brings vc4->prog.fs, vc4->prog.vs and vc4->prog.cs up to date for a draw
 * using primitive type \p prim_mode.
 */
void
vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)
{
        /* FS first: the VS key includes the compiled FS's program id, and
         * the VS update is triggered by VC4_DIRTY_COMPILED_FS.
         */
        vc4_update_compiled_fs(vc4, prim_mode);
        vc4_update_compiled_vs(vc4, prim_mode);
}
2455
2456 static uint32_t
2457 fs_cache_hash(const void *key)
2458 {
2459         return _mesa_hash_data(key, sizeof(struct vc4_fs_key));
2460 }
2461
2462 static uint32_t
2463 vs_cache_hash(const void *key)
2464 {
2465         return _mesa_hash_data(key, sizeof(struct vc4_vs_key));
2466 }
2467
2468 static bool
2469 fs_cache_compare(const void *key1, const void *key2)
2470 {
2471         return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;
2472 }
2473
2474 static bool
2475 vs_cache_compare(const void *key1, const void *key2)
2476 {
2477         return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
2478 }
2479
2480 static void
2481 delete_from_cache_if_matches(struct hash_table *ht,
2482                              struct hash_entry *entry,
2483                              struct vc4_uncompiled_shader *so)
2484 {
2485         const struct vc4_key *key = entry->key;
2486
2487         if (key->shader_state == so) {
2488                 struct vc4_compiled_shader *shader = entry->data;
2489                 _mesa_hash_table_remove(ht, entry);
2490                 vc4_bo_unreference(&shader->bo);
2491                 ralloc_free(shader);
2492         }
2493 }
2494
2495 static void
2496 vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
2497 {
2498         struct vc4_context *vc4 = vc4_context(pctx);
2499         struct vc4_uncompiled_shader *so = hwcso;
2500
2501         struct hash_entry *entry;
2502         hash_table_foreach(vc4->fs_cache, entry)
2503                 delete_from_cache_if_matches(vc4->fs_cache, entry, so);
2504         hash_table_foreach(vc4->vs_cache, entry)
2505                 delete_from_cache_if_matches(vc4->vs_cache, entry, so);
2506
2507         if (so->twoside_tokens != so->base.tokens)
2508                 free((void *)so->twoside_tokens);
2509         free((void *)so->base.tokens);
2510         free(so);
2511 }
2512
2513 static void
2514 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
2515 {
2516         struct vc4_context *vc4 = vc4_context(pctx);
2517         vc4->prog.bind_fs = hwcso;
2518         vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;
2519 }
2520
2521 static void
2522 vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)
2523 {
2524         struct vc4_context *vc4 = vc4_context(pctx);
2525         vc4->prog.bind_vs = hwcso;
2526         vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
2527 }
2528
2529 void
2530 vc4_program_init(struct pipe_context *pctx)
2531 {
2532         struct vc4_context *vc4 = vc4_context(pctx);
2533
2534         pctx->create_vs_state = vc4_shader_state_create;
2535         pctx->delete_vs_state = vc4_shader_state_delete;
2536
2537         pctx->create_fs_state = vc4_shader_state_create;
2538         pctx->delete_fs_state = vc4_shader_state_delete;
2539
2540         pctx->bind_fs_state = vc4_fp_state_bind;
2541         pctx->bind_vs_state = vc4_vp_state_bind;
2542
2543         vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
2544                                                 fs_cache_compare);
2545         vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
2546                                                 vs_cache_compare);
2547 }
2548
2549 void
2550 vc4_program_fini(struct pipe_context *pctx)
2551 {
2552         struct vc4_context *vc4 = vc4_context(pctx);
2553
2554         struct hash_entry *entry;
2555         hash_table_foreach(vc4->fs_cache, entry) {
2556                 struct vc4_compiled_shader *shader = entry->data;
2557                 vc4_bo_unreference(&shader->bo);
2558                 ralloc_free(shader);
2559                 _mesa_hash_table_remove(vc4->fs_cache, entry);
2560         }
2561
2562         hash_table_foreach(vc4->vs_cache, entry) {
2563                 struct vc4_compiled_shader *shader = entry->data;
2564                 vc4_bo_unreference(&shader->bo);
2565                 ralloc_free(shader);
2566                 _mesa_hash_table_remove(vc4->vs_cache, entry);
2567         }
2568 }