src/mesa/state_tracker/st_glsl_to_tgsi.cpp

   1 /*
   2  * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
   3  * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
   4  * Copyright © 2010 Intel Corporation
   5  * Copyright © 2011 Bryan Cain
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the next
  15  * paragraph) shall be included in all copies or substantial portions of the
  16  * Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24  * DEALINGS IN THE SOFTWARE.
  25  */
  26
  27 /**
  28  * \file st_glsl_to_tgsi.cpp
  29  *
  30  * Translate GLSL IR to TGSI.
  31  */
  32
  33 #include "st_glsl_to_tgsi.h"
  34
  35 #include "compiler/glsl/glsl_parser_extras.h"
  36 #include "compiler/glsl/ir_optimization.h"
  37 #include "compiler/glsl/program.h"
  38
  39 #include "main/errors.h"
  40 #include "main/shaderobj.h"
  41 #include "main/uniforms.h"
  42 #include "main/shaderapi.h"
  43 #include "main/shaderimage.h"
  44 #include "program/prog_instruction.h"
  45
  46 #include "pipe/p_context.h"
  47 #include "pipe/p_screen.h"
  48 #include "tgsi/tgsi_ureg.h"
  49 #include "tgsi/tgsi_info.h"
  50 #include "util/u_math.h"
  51 #include "util/u_memory.h"
  52 #include "st_glsl_types.h"
  53 #include "st_program.h"
  54 #include "st_mesa_to_tgsi.h"
  55 #include "st_format.h"
  56 #include "st_nir.h"
  57 #include "st_shader_cache.h"
  58 #include "st_glsl_to_tgsi_temprename.h"
  59
  60 #include "util/hash_table.h"
  61 #include <algorithm>
  62
  /* Bitmask with one bit set per constant-like register file: state
   * variables, constants and uniforms.  Each bit position is the numeric
   * value of the corresponding PROGRAM_* register file.
   */
  63 #define PROGRAM_ANY_CONST ((1 << PROGRAM_STATE_VAR) |    \
  64                            (1 << PROGRAM_CONSTANT) |     \
  65                            (1 << PROGRAM_UNIFORM))
  66
     /* NOTE(review): presumably the maximum number of texture offsets carried
      * by one instruction -- the use site is outside this part of the file,
      * confirm there.
      */
  67 #define MAX_GLSL_TEXTURE_OFFSET 4
  68
  69 #ifndef NDEBUG
  70 #include "util/u_atomic.h"
  71 #include "util/simple_mtx.h"
  72 #include <fstream>
  73 #include <ios>
  74
  75 /* Output stream for the statistics log.  print_stats_enabled() opens it
      * in append mode on the file named by GLSL_TO_TGSI_PRINT_STATS.
      */
  76 static std::ofstream stats_log;
  77
  78 /* Helper function to check whether we want to write some statistics
  79  * of the shader conversion.
  80  */
  81
     /* Held by print_stats_enabled() while it decides, once, whether
      * statistics logging is active.
      */
  82 static simple_mtx_t print_stats_mutex = _SIMPLE_MTX_INITIALIZER_NP;
  83
  84 static inline bool print_stats_enabled ()
  85 {
  86    static int stats_enabled = 0;
  87
  88    if (!stats_enabled) {
  89       simple_mtx_lock(&print_stats_mutex);
  90       if (!stats_enabled) {
  91          const char *stats_filename = getenv("GLSL_TO_TGSI_PRINT_STATS");
  92          if (stats_filename) {
  93             bool write_header = std::ifstream(stats_filename).fail();
  94             stats_log.open(stats_filename, std::ios_base::out | std::ios_base::app);
  95             stats_enabled = stats_log.good() ? 1 : -1;
  96             if (write_header)
  97                stats_log << "arrays,temps,temps in arrays,total,instructions\n";
  98          } else {
  99             stats_enabled = -1;
 100          }
 101       }
 102       simple_mtx_unlock(&print_stats_mutex);
 103    }
 104    return stats_enabled > 0;
 105 }
 /* Execute X only when print_stats_enabled() returns true.
  *
  * NOTE(review): the expansion begins with a bare `if` and already ends in
  * ';'.  Writing `PRINT_STATS(x);` therefore yields an extra empty
  * statement, and the macro cannot be used as the body of an if/else
  * without braces.
  */
 106 #define PRINT_STATS(X) if (print_stats_enabled()) do { X; } while (false);
 107 #else
     /* Alternative branch of the enclosing preprocessor conditional: the
      * macro expands to nothing and X is never evaluated.
      */
 108 #define PRINT_STATS(X)
 109 #endif
 110
 111
 112 static unsigned is_precise(const ir_variable *ir)
 113 {
 114    if (!ir)
 115       return 0;
 116    return ir->data.precise || ir->data.invariant;
 117 }
 118
 /**
  * Storage record for one variable: the register file and index it lives
  * at, an explicit component, the ir_variable it belongs to (if any) and an
  * array id.
  */
 119 class variable_storage {
 120    DECLARE_RZALLOC_CXX_OPERATORS(variable_storage)
 121
 122 public:
        /* component always starts out as 0.  Storage in PROGRAM_ARRAY must be
         * given a non-zero array_id (checked by the assert below).
         */
 123    variable_storage(ir_variable *var, gl_register_file file, int index,
 124                     unsigned array_id = 0)
 125       : file(file), index(index), component(0), var(var), array_id(array_id)
 126    {
 127       assert(file != PROGRAM_ARRAY || array_id != 0);
 128    }
 129
        /* Register file and index within that file. */
 130    gl_register_file file;
 131    int index;
 132
 133    /* Explicit component location. This is given in terms of the GLSL-style
 134     * swizzles where each double is a single component, i.e. for 64-bit types
 135     * it can only be 0 or 1.
 136     */
 137    int component;
 138    ir_variable *var; /* variable that maps to this, if any */
        /* 0 by default; must be non-zero when file is PROGRAM_ARRAY. */
 139    unsigned array_id;
 140 };
 141
 142 class immediate_storage : public exec_node {
 143 public:
 144    immediate_storage(gl_constant_value *values, int size32, GLenum type)
 145    {
 146       memcpy(this->values, values, size32 * sizeof(gl_constant_value));
 147       this->size32 = size32;
 148       this->type = type;
 149    }
 150
 151    /* doubles are stored across 2 gl_constant_values */
 152    gl_constant_value values[4];
 153    int size32; /**< Number of 32-bit components (1-4) */
 154    GLenum type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
 155 };
 156
 /* Placeholder operands in the PROGRAM_UNDEFINED register file.  They are
  * the default arguments of emit_asm() and stand for "operand not present".
  */
 157 static const st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
 158 static const st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
 159
 /**
  * Declaration record for one shader input or output.
  *
  * glsl_to_tgsi_visitor keeps tables of these in its \c inputs and
  * \c outputs members; find_inout_array() searches such a table by
  * \c array_id and find_array_type() reports the \c base_type of the match.
  *
  * NOTE(review): fields without a comment below are documented by name
  * only; their exact meaning is set by code outside this part of the file.
  */
 160 struct inout_decl {
 161    unsigned mesa_index;
 162    unsigned array_id; /* TGSI ArrayID; 1-based: 0 means not an array */
 163    unsigned size;
 164    unsigned interp_loc;
 165    unsigned gs_out_streams;
 166    enum glsl_interp_mode interp;
 167    enum glsl_base_type base_type;
 168    ubyte usage_mask; /* GLSL-style usage-mask,  i.e. single bit per double */
 169    bool invariant;
 170 };
 171
 172 static struct inout_decl *
 173 find_inout_array(struct inout_decl *decls, unsigned count, unsigned array_id)
 174 {
 175    assert(array_id != 0);
 176
 177    for (unsigned i = 0; i < count; i++) {
 178       struct inout_decl *decl = &decls[i];
 179
 180       if (array_id == decl->array_id) {
 181          return decl;
 182       }
 183    }
 184
 185    return NULL;
 186 }
 187
 188 static enum glsl_base_type
 189 find_array_type(struct inout_decl *decls, unsigned count, unsigned array_id)
 190 {
 191    if (!array_id)
 192       return GLSL_TYPE_ERROR;
 193    struct inout_decl *decl = find_inout_array(decls, count, array_id);
 194    if (decl)
 195       return decl->base_type;
 196    return GLSL_TYPE_ERROR;
 197 }
 198
 /**
  * Declaration record stored in glsl_to_tgsi_visitor::atomic_info.
  *
  * NOTE(review): presumably one entry per hardware atomic counter range
  * (location, binding point, size, array id) -- the code that fills these
  * fields is outside this part of the file; confirm there.
  */
 199 struct hwatomic_decl {
 200    unsigned location;
 201    unsigned binding;
 202    unsigned size;
 203    unsigned array_id;
 204 };
 205
 /**
  * IR visitor used to translate GLSL IR to TGSI.
  *
  * Besides the visit() overrides it carries the state built up during
  * translation: the emitted instruction list, the immediates, declaration
  * tables for inputs/outputs/atomics, sampler and image usage, and a set of
  * capability flags.  The emit_*() helpers append to \c instructions.
  */
 206 struct glsl_to_tgsi_visitor : public ir_visitor {
 207 public:
 208    glsl_to_tgsi_visitor();
 209    ~glsl_to_tgsi_visitor();
 210
 211    struct gl_context *ctx;
 212    struct gl_program *prog;
 213    struct gl_shader_program *shader_program;
 214    struct gl_linked_shader *shader;
 215    struct gl_shader_compiler_options *options;
 216
 217    int next_temp;
 218
 219    unsigned *array_sizes;
 220    unsigned max_num_arrays;
 221    unsigned next_array;
 222
        /* Declaration tables for shader inputs and outputs.  emit_asm() looks
         * up entries of outputs by array id, passing num_outputs as the
         * number of entries to search.
         */
 223    struct inout_decl inputs[4 * PIPE_MAX_SHADER_INPUTS];
 224    unsigned num_inputs;
 225    unsigned num_input_arrays;
 226    struct inout_decl outputs[4 * PIPE_MAX_SHADER_OUTPUTS];
 227    unsigned num_outputs;
 228    unsigned num_output_arrays;
 229
 230    struct hwatomic_decl atomic_info[PIPE_MAX_HW_ATOMIC_BUFFERS];
 231    unsigned num_atomics;
 232    unsigned num_atomic_arrays;
 233    int num_address_regs;
 234    uint32_t samplers_used;
 235    glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
 236    enum tgsi_texture_type sampler_targets[PIPE_MAX_SAMPLERS];
 237    int images_used;
 238    enum tgsi_texture_type image_targets[PIPE_MAX_SHADER_IMAGES];
 239    enum pipe_format image_formats[PIPE_MAX_SHADER_IMAGES];
 240    bool image_wr[PIPE_MAX_SHADER_IMAGES];
        /* Set by emit_asm() when a relatively addressed operand lives in the
         * state-var, constant or uniform register file.
         */
 241    bool indirect_addr_consts;
 242    int wpos_transform_const;
 243
        /* native_integers is consulted by get_opcode() when picking the
         * type-specific opcode.
         */
 244    bool native_integers;
 245    bool have_sqrt;
 246    bool have_fma;
 247    bool use_shared_memory;
 248    bool has_tex_txf_lz;
        /* Copied into the precise field of every instruction that emit_asm()
         * creates.
         */
 249    bool precise;
 250    bool need_uarl;
 251
 252    variable_storage *find_variable_storage(ir_variable *var);
 253
 254    int add_constant(gl_register_file file, gl_constant_value values[8],
 255                     int size, GLenum datatype, uint16_t *swizzle_out);
 256
 257    st_src_reg get_temp(const glsl_type *type);
 258    void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);
 259
 260    st_src_reg st_src_reg_for_double(double val);
 261    st_src_reg st_src_reg_for_float(float val);
 262    st_src_reg st_src_reg_for_int(int val);
 263    st_src_reg st_src_reg_for_int64(int64_t val);
 264    st_src_reg st_src_reg_for_type(enum glsl_base_type type, int val);
 265
 266    /**
 267     * \name Visit methods
 268     *
 269     * As typical for the visitor pattern, there must be one \c visit method for
 270     * each concrete subclass of \c ir_instruction.  Virtual base classes within
 271     * the hierarchy should not have \c visit methods.
 272     */
 273    /*@{*/
 274    virtual void visit(ir_variable *);
 275    virtual void visit(ir_loop *);
 276    virtual void visit(ir_loop_jump *);
 277    virtual void visit(ir_function_signature *);
 278    virtual void visit(ir_function *);
 279    virtual void visit(ir_expression *);
 280    virtual void visit(ir_swizzle *);
 281    virtual void visit(ir_dereference_variable  *);
 282    virtual void visit(ir_dereference_array *);
 283    virtual void visit(ir_dereference_record *);
 284    virtual void visit(ir_assignment *);
 285    virtual void visit(ir_constant *);
 286    virtual void visit(ir_call *);
 287    virtual void visit(ir_return *);
 288    virtual void visit(ir_discard *);
 289    virtual void visit(ir_texture *);
 290    virtual void visit(ir_if *);
 291    virtual void visit(ir_emit_vertex *);
 292    virtual void visit(ir_end_primitive *);
 293    virtual void visit(ir_barrier *);
 294    /*@}*/
 295
 296    void visit_expression(ir_expression *, st_src_reg *) ATTRIBUTE_NOINLINE;
 297
 298    void visit_atomic_counter_intrinsic(ir_call *);
 299    void visit_ssbo_intrinsic(ir_call *);
 300    void visit_membar_intrinsic(ir_call *);
 301    void visit_shared_intrinsic(ir_call *);
 302    void visit_image_intrinsic(ir_call *);
 303    void visit_generic_intrinsic(ir_call *, enum tgsi_opcode op);
 304
 305    st_src_reg result;
 306
 307    /** List of variable_storage */
 308    struct hash_table *variables;
 309
 310    /** List of immediate_storage */
 311    exec_list immediates;
 312    unsigned num_immediates;
 313
 314    /** List of glsl_to_tgsi_instruction */
 315    exec_list instructions;
 316
        /* Single-destination form: forwards to the two-destination overload
         * below with undef_dst as the second destination.
         */
 317    glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, enum tgsi_opcode op,
 318                                       st_dst_reg dst = undef_dst,
 319                                       st_src_reg src0 = undef_src,
 320                                       st_src_reg src1 = undef_src,
 321                                       st_src_reg src2 = undef_src,
 322                                       st_src_reg src3 = undef_src);
 323
        /* Two-destination form: builds the instruction and appends it (or, for
         * 64-bit operands, one instruction per written component) to
         * instructions.
         */
 324    glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, enum tgsi_opcode op,
 325                                       st_dst_reg dst, st_dst_reg dst1,
 326                                       st_src_reg src0 = undef_src,
 327                                       st_src_reg src1 = undef_src,
 328                                       st_src_reg src2 = undef_src,
 329                                       st_src_reg src3 = undef_src);
 330
        /* Maps a generic opcode to the variant matching the source operand
         * types; dst is accepted but not consulted by the implementation.
         */
 331    enum tgsi_opcode get_opcode(enum tgsi_opcode op,
 332                                st_dst_reg dst,
 333                                st_src_reg src0, st_src_reg src1);
 334
 335    /**
 336     * Emit the correct dot-product instruction for the type of arguments
 337     */
 338    glsl_to_tgsi_instruction *emit_dp(ir_instruction *ir,
 339                                      st_dst_reg dst,
 340                                      st_src_reg src0,
 341                                      st_src_reg src1,
 342                                      unsigned elements);
 343
 344    void emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
 345                     st_dst_reg dst, st_src_reg src0);
 346
 347    void emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
 348                     st_dst_reg dst, st_src_reg src0, st_src_reg src1);
 349
 350    void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);
 351
 352    void get_deref_offsets(ir_dereference *ir,
 353                           unsigned *array_size,
 354                           unsigned *base,
 355                           uint16_t *index,
 356                           st_src_reg *reladdr,
 357                           bool opaque);
 358   void calc_deref_offsets(ir_dereference *tail,
 359                           unsigned *array_elements,
 360                           uint16_t *index,
 361                           st_src_reg *indirect,
 362                           unsigned *location);
 363    st_src_reg canonicalize_gather_offset(st_src_reg offset);
 364    bool handle_bound_deref(ir_dereference *ir);
 365
 366    bool try_emit_mad(ir_expression *ir,
 367               int mul_operand);
 368    bool try_emit_mad_for_and_not(ir_expression *ir,
 369               int mul_operand);
 370
 371    void emit_swz(ir_expression *ir);
 372
 373    bool process_move_condition(ir_rvalue *ir);
 374
 375    void simplify_cmp(void);
 376
 377    void rename_temp_registers(struct rename_reg_pair *renames);
 378    void get_first_temp_read(int *first_reads);
 379    void get_first_temp_write(int *first_writes);
 380    void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
 381    void get_last_temp_write(int *last_writes);
 382
 383    void copy_propagate(void);
 384    int eliminate_dead_code(void);
 385
 386    void split_arrays(void);
 387    void merge_two_dsts(void);
 388    void merge_registers(void);
 389    void renumber_registers(void);
 390
 391    void emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
 392                        st_dst_reg *l, st_src_reg *r,
 393                        st_src_reg *cond, bool cond_swap);
 394
 395    void print_stats();
 396
        /* Allocation context handed to placement new when emit_asm() creates
         * instructions.
         */
 397    void *mem_ctx;
 398 };
 399
 /* Three destination registers in the PROGRAM_ADDRESS file, each writing
  * only the X channel; they differ in the last constructor argument (0, 1
  * and 2).  emit_asm() loads address_reg from dst.reladdr and address_reg2
  * from dst.reladdr2.
  * NOTE(review): sampler_reladdr is presumably the address register used
  * for indirect sampler access -- its use site is outside this part of the
  * file; confirm there.
  */
 400 static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X,
 401                                            GLSL_TYPE_FLOAT, 0);
 402 static st_dst_reg address_reg2 = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X,
 403                                             GLSL_TYPE_FLOAT, 1);
 404 static st_dst_reg sampler_reladdr = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X,
 405                                                GLSL_TYPE_FLOAT, 2);
 406
 407 static void
 408 fail_link(struct gl_shader_program *prog, const char *fmt, ...)
 409    PRINTFLIKE(2, 3);
 410
 411 static void
 412 fail_link(struct gl_shader_program *prog, const char *fmt, ...)
 413 {
 414    va_list args;
 415    va_start(args, fmt);
 416    ralloc_vasprintf_append(&prog->data->InfoLog, fmt, args);
 417    va_end(args);
 418
 419    prog->data->LinkStatus = LINKING_FAILURE;
 420 }
 421
 422 int
 423 swizzle_for_size(int size)
 424 {
 425    static const int size_swizzles[4] = {
 426       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
 427       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
 428       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
 429       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
 430    };
 431
 432    assert((size >= 1) && (size <= 4));
 433    return size_swizzles[size - 1];
 434 }
 435
 436
 /**
  * Build a TGSI instruction with up to two destinations and four sources
  * and append it to the instruction list.
  *
  * Steps, in order:
  *  - resolve \c op to its type-specific variant through get_opcode();
  *  - deal with relative addressing: every source is passed through
  *    reladdr_to_temp(), and a relatively addressed \c dst gets its address
  *    registers loaded with emit_arl();
  *  - fill in the instruction and note in \c indirect_addr_consts whether a
  *    state-var/constant/uniform operand is addressed indirectly;
  *  - if either destination or src0 is 64-bit, append one instruction per
  *    bit set in the selected writemask, rewriting writemasks, swizzles and
  *    register indices so that each GLSL component maps to a channel pair;
  *    otherwise append the single instruction.
  *
  * \return the last instruction appended.
  * NOTE(review): on the 64-bit path with an empty selected writemask nothing
  * is appended and NULL is returned.
  */
 437 glsl_to_tgsi_instruction *
 438 glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, enum tgsi_opcode op,
 439                                st_dst_reg dst, st_dst_reg dst1,
 440                                st_src_reg src0, st_src_reg src1,
 441                                st_src_reg src2, st_src_reg src3)
 442 {
 443    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
 444    int num_reladdr = 0, i, j;
 445    bool dst_is_64bit[2];
 446
        /* Pick the type-specific opcode from the source operand types. */
 447    op = get_opcode(op, dst, src0, src1);
 448
 449    /* If we have to do relative addressing, we want to load the ARL
 450     * reg directly for one of the regs, and preload the other reladdr
 451     * sources into temps.
 452     */
 453    num_reladdr += dst.reladdr != NULL || dst.reladdr2;
 454    assert(!dst1.reladdr); /* should be lowered in earlier passes */
 455    num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
 456    num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
 457    num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
 458    num_reladdr += src3.reladdr != NULL || src3.reladdr2 != NULL;
 459
        /* Sources are handed over last to first; the shared counter is
         * expected to reach 0 once dst has been handled (asserted below).
         */
 460    reladdr_to_temp(ir, &src3, &num_reladdr);
 461    reladdr_to_temp(ir, &src2, &num_reladdr);
 462    reladdr_to_temp(ir, &src1, &num_reladdr);
 463    reladdr_to_temp(ir, &src0, &num_reladdr);
 464
 465    if (dst.reladdr || dst.reladdr2) {
 466       if (dst.reladdr)
 467          emit_arl(ir, address_reg, *dst.reladdr);
 468       if (dst.reladdr2)
 469          emit_arl(ir, address_reg2, *dst.reladdr2);
 470       num_reladdr--;
 471    }
 472
 473    assert(num_reladdr == 0);
 474
 475    /* inst->op has only 8 bits. */
 476    STATIC_ASSERT(TGSI_OPCODE_LAST <= 255);
 477
 478    inst->op = op;
 479    inst->precise = this->precise;
 480    inst->info = tgsi_get_opcode_info(op);
 481    inst->dst[0] = dst;
 482    inst->dst[1] = dst1;
 483    inst->src[0] = src0;
 484    inst->src[1] = src1;
 485    inst->src[2] = src2;
 486    inst->src[3] = src3;
 487    inst->is_64bit_expanded = false;
 488    inst->ir = ir;
 489    inst->dead_mask = 0;
 490    inst->tex_offsets = NULL;
 491    inst->tex_offset_num_offset = 0;
 492    inst->saturate = 0;
 493    inst->tex_shadow = 0;
 494    /* default to float, for paths where this is not initialized
 495     * (since 0==UINT which is likely wrong):
 496     */
 497    inst->tex_type = GLSL_TYPE_FLOAT;
 498
 499    /* Update indirect addressing status used by TGSI */
        /* NOTE(review): the sources are only examined when dst carries no
         * relative address, and only their reladdr (not reladdr2) is tested.
         */
 500    if (dst.reladdr || dst.reladdr2) {
 501       switch (dst.file) {
 502       case PROGRAM_STATE_VAR:
 503       case PROGRAM_CONSTANT:
 504       case PROGRAM_UNIFORM:
 505          this->indirect_addr_consts = true;
 506          break;
 507       case PROGRAM_IMMEDIATE:
 508          assert(!"immediates should not have indirect addressing");
 509          break;
 510       default:
 511          break;
 512       }
 513    }
 514    else {
 515       for (i = 0; i < 4; i++) {
 516          if (inst->src[i].reladdr) {
 517             switch (inst->src[i].file) {
 518             case PROGRAM_STATE_VAR:
 519             case PROGRAM_CONSTANT:
 520             case PROGRAM_UNIFORM:
 521                this->indirect_addr_consts = true;
 522                break;
 523             case PROGRAM_IMMEDIATE:
 524                assert(!"immediates should not have indirect addressing");
 525                break;
 526             default:
 527                break;
 528             }
 529          }
 530       }
 531    }
 532
 533    /*
 534     * This section contains the double processing.
 535     * GLSL just represents doubles as single channel values,
 536     * however most HW and TGSI represent doubles as pairs of register channels.
 537     *
 538     * so we have to fixup destination writemask/index and src swizzle/indexes.
 539     * dest writemasks need to translate from single channel write mask
 540     * to a dual-channel writemask, but also need to modify the index,
 541     * if we are touching the Z,W fields in the pre-translated writemask.
 542     *
 543     * src channels have similar index modifications along with swizzle
 544     * changes so we pick the XY, ZW pairs from the correct index.
 545     *
 546     * GLSL [0].x -> TGSI [0].xy
 547     * GLSL [0].y -> TGSI [0].zw
 548     * GLSL [0].z -> TGSI [1].xy
 549     * GLSL [0].w -> TGSI [1].zw
 550     */
        /* A destination counts as 64-bit when its own type is 64-bit, or when
         * it is an output of type ARRAY whose declaration (looked up by array
         * id) has a 64-bit base type.
         */
 551    for (j = 0; j < 2; j++) {
 552       dst_is_64bit[j] = glsl_base_type_is_64bit(inst->dst[j].type);
 553       if (!dst_is_64bit[j] && inst->dst[j].file == PROGRAM_OUTPUT &&
 554           inst->dst[j].type == GLSL_TYPE_ARRAY) {
 555          enum glsl_base_type type = find_array_type(this->outputs,
 556                                                     this->num_outputs,
 557                                                     inst->dst[j].array_id);
 558          if (glsl_base_type_is_64bit(type))
 559             dst_is_64bit[j] = true;
 560       }
 561    }
 562
 563    if (dst_is_64bit[0] || dst_is_64bit[1] ||
 564        glsl_base_type_is_64bit(inst->src[0].type)) {
 565       glsl_to_tgsi_instruction *dinst = NULL;
 566       int initial_src_swz[4], initial_src_idx[4];
 567       int initial_dst_idx[2], initial_dst_writemask[2];
 568       /* select the writemask for dst0 or dst1 */
 569       unsigned writemask = inst->dst[1].file == PROGRAM_UNDEFINED
 570          ? inst->dst[0].writemask : inst->dst[1].writemask;
 571
 572       /* copy out the writemask, index and swizzles for all src/dsts. */
 573       for (j = 0; j < 2; j++) {
 574          initial_dst_writemask[j] = inst->dst[j].writemask;
 575          initial_dst_idx[j] = inst->dst[j].index;
 576       }
 577
 578       for (j = 0; j < 4; j++) {
 579          initial_src_swz[j] = inst->src[j].swizzle;
 580          initial_src_idx[j] = inst->src[j].index;
 581       }
 582
 583       /*
 584        * scan all the components in the dst writemask
 585        * generate an instruction for each of them if required.
 586        */
           /* addr is tested against PROGRAM_UNDEFINED below so that the
            * advanced LOAD/STORE address is computed at most once.
            * NOTE(review): this relies on a default-constructed st_src_reg
            * being in the PROGRAM_UNDEFINED file -- confirm in its
            * constructor.
            */
 587       st_src_reg addr;
 588       while (writemask) {
 589
 590          int i = u_bit_scan(&writemask);
 591
 592          /* before emitting the instruction, see if we have to adjust
 593           * load / store address */
 594          if (i > 1 && (inst->op == TGSI_OPCODE_LOAD ||
 595                        inst->op == TGSI_OPCODE_STORE) &&
 596              addr.file == PROGRAM_UNDEFINED) {
 597             /* We have to advance the buffer address by 16 */
 598             addr = get_temp(glsl_type::uint_type);
 599             emit_asm(ir, TGSI_OPCODE_UADD, st_dst_reg(addr),
 600                      inst->src[0], st_src_reg_for_int(16));
 601          }
 602
 603          /* first time use previous instruction */
 604          if (dinst == NULL) {
 605             dinst = inst;
 606          } else {
 607             /* create a new instructions for subsequent attempts */
 608             dinst = new(mem_ctx) glsl_to_tgsi_instruction();
 609             *dinst = *inst;
 610             dinst->next = NULL;
 611             dinst->prev = NULL;
 612          }
 613          this->instructions.push_tail(dinst);
 614          dinst->is_64bit_expanded = true;
 615
 616          /* modify the destination if we are splitting */
 617          for (j = 0; j < 2; j++) {
 618             if (dst_is_64bit[j]) {
 619                dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
 620                dinst->dst[j].index = initial_dst_idx[j];
 621                if (i > 1) {
 622                   if (dinst->op == TGSI_OPCODE_LOAD ||
 623                       dinst->op == TGSI_OPCODE_STORE)
 624                      dinst->src[0] = addr;
 625                   if (dinst->op != TGSI_OPCODE_STORE)
 626                      dinst->dst[j].index++;
 627                }
 628             } else {
 629                /* if we aren't writing to a double, just get the bit of the
 630                 * initial writemask for this channel
 631                 */
 632                dinst->dst[j].writemask = initial_dst_writemask[j] & (1 << i);
 633             }
 634          }
 635
 636          /* modify the src registers */
 637          for (j = 0; j < 4; j++) {
 638             int swz = GET_SWZ(initial_src_swz[j], i);
 639
 640             if (glsl_base_type_is_64bit(dinst->src[j].type)) {
 641                dinst->src[j].index = initial_src_idx[j];
 642                if (swz > 1) {
 643                   dinst->src[j].double_reg2 = true;
 644                   dinst->src[j].index++;
 645                }
 646
 647                if (swz & 1)
 648                   dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W,
 649                                                         SWIZZLE_Z, SWIZZLE_W);
 650                else
 651                   dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
 652                                                         SWIZZLE_X, SWIZZLE_Y);
 653
 654             } else {
 655                /* some opcodes are special case in what they use as sources
 656                 * - [FUI]2D/[UI]2I64 is a float/[u]int src0, (D)LDEXP is
 657                 * integer src1
 658                 */
 659                if (op == TGSI_OPCODE_F2D || op == TGSI_OPCODE_U2D ||
 660                    op == TGSI_OPCODE_I2D ||
 661                    op == TGSI_OPCODE_I2I64 || op == TGSI_OPCODE_U2I64 ||
 662                    op == TGSI_OPCODE_DLDEXP || op == TGSI_OPCODE_LDEXP ||
 663                    (op == TGSI_OPCODE_UCMP && dst_is_64bit[0])) {
 664                   dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
 665                }
 666             }
 667          }
 668       }
           /* Hand back the last instruction appended; dinst is still NULL if
            * the loop above never ran.
            */
 669       inst = dinst;
 670    } else {
 671       this->instructions.push_tail(inst);
 672    }
 673
 674
 675    return inst;
 676 }
 677
 /**
  * Single-destination convenience overload: forwards to the
  * two-destination emit_asm() with undef_dst as the second destination and
  * returns its result.
  */
 678 glsl_to_tgsi_instruction *
 679 glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, enum tgsi_opcode op,
 680                                st_dst_reg dst,
 681                                st_src_reg src0, st_src_reg src1,
 682                                st_src_reg src2, st_src_reg src3)
 683 {
 684    return emit_asm(ir, op, dst, undef_dst, src0, src1, src2, src3);
 685 }
 686
 687 /**
 688  * Determines whether to use an integer, unsigned integer, float, double or
 689  * 64-bit integer opcode based on the operands and input opcode, and
      * returns the resolved opcode.  Nothing is emitted here.
      *
      * TGSI_OPCODE_MOV is returned unchanged.  Opcodes that have no row in
      * the tables below are returned unchanged as well.  \c dst is accepted
      * but not consulted.
      *
      * The operand type used for the lookup is chosen as follows: for
      * resource instructions it is src1's type; otherwise INT64, UINT64,
      * DOUBLE and FLOAT are tried in that order, each selected when either
      * src0 or src1 has that type; otherwise, with native_integers set, it is
      * src0's type with BOOL treated as INT; otherwise it stays FLOAT.
 690  */
 691 enum tgsi_opcode
 692 glsl_to_tgsi_visitor::get_opcode(enum tgsi_opcode op,
 693                                  st_dst_reg dst,
 694                                  st_src_reg src0, st_src_reg src1)
 695 {
 696    enum glsl_base_type type = GLSL_TYPE_FLOAT;
 697
 698    if (op == TGSI_OPCODE_MOV)
 699        return op;
 700
 701    assert(src0.type != GLSL_TYPE_ARRAY);
 702    assert(src0.type != GLSL_TYPE_STRUCT);
 703    assert(src1.type != GLSL_TYPE_ARRAY);
 704    assert(src1.type != GLSL_TYPE_STRUCT);
 705
 706    if (is_resource_instruction(op))
 707       type = src1.type;
 708    else if (src0.type == GLSL_TYPE_INT64 || src1.type == GLSL_TYPE_INT64)
 709       type = GLSL_TYPE_INT64;
 710    else if (src0.type == GLSL_TYPE_UINT64 || src1.type == GLSL_TYPE_UINT64)
 711       type = GLSL_TYPE_UINT64;
 712    else if (src0.type == GLSL_TYPE_DOUBLE || src1.type == GLSL_TYPE_DOUBLE)
 713       type = GLSL_TYPE_DOUBLE;
 714    else if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
 715       type = GLSL_TYPE_FLOAT;
 716    else if (native_integers)
 717       type = src0.type == GLSL_TYPE_BOOL ? GLSL_TYPE_INT : src0.type;
 718
     /* case7: one switch case for generic opcode c.  Replaces op with the
      * variant for the operand type; any type other than UINT64, INT64,
      * DOUBLE, INT and UINT gets the float variant f.
      */
 719 #define case7(c, f, i, u, d, i64, ui64)             \
 720    case TGSI_OPCODE_##c: \
 721       if (type == GLSL_TYPE_UINT64)           \
 722          op = TGSI_OPCODE_##ui64; \
 723       else if (type == GLSL_TYPE_INT64)       \
 724          op = TGSI_OPCODE_##i64; \
 725       else if (type == GLSL_TYPE_DOUBLE)       \
 726          op = TGSI_OPCODE_##d; \
 727       else if (type == GLSL_TYPE_INT)       \
 728          op = TGSI_OPCODE_##i; \
 729       else if (type == GLSL_TYPE_UINT) \
 730          op = TGSI_OPCODE_##u; \
 731       else \
 732          op = TGSI_OPCODE_##f; \
 733       break;
 734
     /* casecomp: like case7, for comparison opcodes.  SUBROUTINE is treated
      * like INT.  For the remaining types the float variant f is used only
      * with native_integers; without it the generic opcode c is kept.
      */
 735 #define casecomp(c, f, i, u, d, i64, ui64)           \
 736    case TGSI_OPCODE_##c: \
 737       if (type == GLSL_TYPE_INT64)             \
 738          op = TGSI_OPCODE_##i64; \
 739       else if (type == GLSL_TYPE_UINT64)        \
 740          op = TGSI_OPCODE_##ui64; \
 741       else if (type == GLSL_TYPE_DOUBLE)       \
 742          op = TGSI_OPCODE_##d; \
 743       else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE)       \
 744          op = TGSI_OPCODE_##i; \
 745       else if (type == GLSL_TYPE_UINT) \
 746          op = TGSI_OPCODE_##u; \
 747       else if (native_integers) \
 748          op = TGSI_OPCODE_##f; \
 749       else \
 750          op = TGSI_OPCODE_##c; \
 751       break;
 752
 753    switch (op) {
 754       /* Some instructions are initially selected without considering the type.
 755        * This fixes the type:
 756        *
 757        *    INIT     FLOAT SINT     UINT     DOUBLE   SINT64   UINT64
 758        */
           /* A LAST entry marks a type with no variant for that opcode; it is
            * caught by the assert after the switch.
            */
 759       case7(ADD,     ADD,  UADD,    UADD,    DADD,    U64ADD,  U64ADD);
 760       case7(CEIL,    CEIL, LAST,    LAST,    DCEIL,   LAST,    LAST);
 761       case7(DIV,     DIV,  IDIV,    UDIV,    DDIV,    I64DIV,  U64DIV);
 762       case7(FMA,     FMA,  UMAD,    UMAD,    DFMA,    LAST,    LAST);
 763       case7(FLR,     FLR,  LAST,    LAST,    DFLR,    LAST,    LAST);
 764       case7(FRC,     FRC,  LAST,    LAST,    DFRAC,   LAST,    LAST);
 765       case7(MUL,     MUL,  UMUL,    UMUL,    DMUL,    U64MUL,  U64MUL);
 766       case7(MAD,     MAD,  UMAD,    UMAD,    DMAD,    LAST,    LAST);
 767       case7(MAX,     MAX,  IMAX,    UMAX,    DMAX,    I64MAX,  U64MAX);
 768       case7(MIN,     MIN,  IMIN,    UMIN,    DMIN,    I64MIN,  U64MIN);
 769       case7(RCP,     RCP,  LAST,    LAST,    DRCP,    LAST,    LAST);
 770       case7(ROUND,   ROUND,LAST,    LAST,    DROUND,  LAST,    LAST);
 771       case7(RSQ,     RSQ,  LAST,    LAST,    DRSQ,    LAST,    LAST);
 772       case7(SQRT,    SQRT, LAST,    LAST,    DSQRT,   LAST,    LAST);
 773       case7(SSG,     SSG,  ISSG,    ISSG,    DSSG,    I64SSG,  I64SSG);
 774       case7(TRUNC,   TRUNC,LAST,    LAST,    DTRUNC,  LAST,    LAST);
 775
 776       case7(MOD,     LAST, MOD,     UMOD,    LAST,    I64MOD,  U64MOD);
 777       case7(SHL,     LAST, SHL,     SHL,     LAST,    U64SHL,  U64SHL);
 778       case7(IBFE,    LAST, IBFE,    UBFE,    LAST,    LAST,    LAST);
 779       case7(IMSB,    LAST, IMSB,    UMSB,    LAST,    LAST,    LAST);
 780       case7(IMUL_HI, LAST, IMUL_HI, UMUL_HI, LAST,    LAST,    LAST);
 781       case7(ISHR,    LAST, ISHR,    USHR,    LAST,    I64SHR,  U64SHR);
 782       case7(ATOMIMAX,LAST, ATOMIMAX,ATOMUMAX,LAST,    LAST,    LAST);
 783       case7(ATOMIMIN,LAST, ATOMIMIN,ATOMUMIN,LAST,    LAST,    LAST);
 784
 785       casecomp(SEQ, FSEQ, USEQ, USEQ, DSEQ, U64SEQ, U64SEQ);
 786       casecomp(SNE, FSNE, USNE, USNE, DSNE, U64SNE, U64SNE);
 787       casecomp(SGE, FSGE, ISGE, USGE, DSGE, I64SGE, U64SGE);
 788       casecomp(SLT, FSLT, ISLT, USLT, DSLT, I64SLT, U64SLT);
 789
 790       default:
 791          break;
 792    }
 793
 794    assert(op != TGSI_OPCODE_LAST);
 795    return op;
 796 }
 797
 798 glsl_to_tgsi_instruction *
 799 glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
 800                               st_dst_reg dst, st_src_reg src0, st_src_reg src1,
 801                               unsigned elements)
 802 {
 803    static const enum tgsi_opcode dot_opcodes[] = {
 804       TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
 805    };
 806
 807    return emit_asm(ir, dot_opcodes[elements - 2], dst, src0, src1);
 808 }
 809
 810 /**
 811  * Emits TGSI scalar opcodes to produce unique answers across channels.
 812  *
 813  * Some TGSI opcodes are scalar-only, like ARB_fp/vp.  The src X
 814  * channel determines the result across all channels.  So to do a vec4
 815  * of this operation, we want to emit a scalar per source channel used
 816  * to produce dest channels.
 817  */
 818 void
 819 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
 820                                   st_dst_reg dst,
 821                                   st_src_reg orig_src0, st_src_reg orig_src1)
 822 {
 823    int i, j;
 824    int done_mask = ~dst.writemask;
 825
 826    /* TGSI RCP is a scalar operation splatting results to all channels,
 827     * like ARB_fp/vp.  So emit as many RCPs as necessary to cover our
 828     * dst channels.
 829     */
 830    for (i = 0; i < 4; i++) {
 831       GLuint this_mask = (1 << i);
 832       st_src_reg src0 = orig_src0;
 833       st_src_reg src1 = orig_src1;
 834
 835       if (done_mask & this_mask)
 836          continue;
 837
 838       GLuint src0_swiz = GET_SWZ(src0.swizzle, i);
 839       GLuint src1_swiz = GET_SWZ(src1.swizzle, i);
 840       for (j = i + 1; j < 4; j++) {
 841          /* If there is another enabled component in the destination that is
 842           * derived from the same inputs, generate its value on this pass as
 843           * well.
 844           */
 845          if (!(done_mask & (1 << j)) &&
 846              GET_SWZ(src0.swizzle, j) == src0_swiz &&
 847              GET_SWZ(src1.swizzle, j) == src1_swiz) {
 848             this_mask |= (1 << j);
 849          }
 850       }
 851       src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
 852                                    src0_swiz, src0_swiz);
 853       src1.swizzle = MAKE_SWIZZLE4(src1_swiz, src1_swiz,
 854                                    src1_swiz, src1_swiz);
 855
 856       dst.writemask = this_mask;
 857       emit_asm(ir, op, dst, src0, src1);
 858       done_mask |= this_mask;
 859    }
 860 }
 861
 862 void
 863 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
 864                                   st_dst_reg dst, st_src_reg src0)
 865 {
 866    st_src_reg undef = undef_src;
 867
 868    undef.swizzle = SWIZZLE_XXXX;
 869
 870    emit_scalar(ir, op, dst, src0, undef);
 871 }
 872
 873 void
 874 glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
 875                                st_dst_reg dst, st_src_reg src0)
 876 {
 877    enum tgsi_opcode op = TGSI_OPCODE_ARL;
 878
 879    if (src0.type == GLSL_TYPE_INT || src0.type == GLSL_TYPE_UINT) {
 880       if (!this->need_uarl && src0.is_legal_tgsi_address_operand())
 881          return;
 882
 883       op = TGSI_OPCODE_UARL;
 884    }
 885
 886    assert(dst.file == PROGRAM_ADDRESS);
 887    if (dst.index >= this->num_address_regs)
 888       this->num_address_regs = dst.index + 1;
 889
 890    emit_asm(NULL, op, dst, src0);
 891 }
 892
 893 int
 894 glsl_to_tgsi_visitor::add_constant(gl_register_file file,
 895                                    gl_constant_value values[8], int size,
 896                                    GLenum datatype,
 897                                    uint16_t *swizzle_out)
 898 {
 899    if (file == PROGRAM_CONSTANT) {
 900       GLuint swizzle = swizzle_out ? *swizzle_out : 0;
 901       int result = _mesa_add_typed_unnamed_constant(this->prog->Parameters,
 902                                                     values, size, datatype,
 903                                                     &swizzle);
 904       if (swizzle_out)
 905          *swizzle_out = swizzle;
 906       return result;
 907    }
 908
 909    assert(file == PROGRAM_IMMEDIATE);
 910
 911    int index = 0;
 912    immediate_storage *entry;
 913    int size32 = size * ((datatype == GL_DOUBLE ||
 914                          datatype == GL_INT64_ARB ||
 915                          datatype == GL_UNSIGNED_INT64_ARB) ? 2 : 1);
 916    int i;
 917
 918    /* Search immediate storage to see if we already have an identical
 919     * immediate that we can use instead of adding a duplicate entry.
 920     */
 921    foreach_in_list(immediate_storage, entry, &this->immediates) {
 922       immediate_storage *tmp = entry;
 923
 924       for (i = 0; i * 4 < size32; i++) {
 925          int slot_size = MIN2(size32 - (i * 4), 4);
 926          if (tmp->type != datatype || tmp->size32 != slot_size)
 927             break;
 928          if (memcmp(tmp->values, &values[i * 4],
 929                     slot_size * sizeof(gl_constant_value)))
 930             break;
 931
 932          /* Everything matches, keep going until the full size is matched */
 933          tmp = (immediate_storage *)tmp->next;
 934       }
 935
 936       /* The full value matched */
 937       if (i * 4 >= size32)
 938          return index;
 939
 940       index++;
 941    }
 942
 943    for (i = 0; i * 4 < size32; i++) {
 944       int slot_size = MIN2(size32 - (i * 4), 4);
 945       /* Add this immediate to the list. */
 946       entry = new(mem_ctx) immediate_storage(&values[i * 4],
 947                                              slot_size, datatype);
 948       this->immediates.push_tail(entry);
 949       this->num_immediates++;
 950    }
 951    return index;
 952 }
 953
 954 st_src_reg
 955 glsl_to_tgsi_visitor::st_src_reg_for_float(float val)
 956 {
 957    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_FLOAT);
 958    union gl_constant_value uval;
 959
 960    uval.f = val;
 961    src.index = add_constant(src.file, &uval, 1, GL_FLOAT, &src.swizzle);
 962
 963    return src;
 964 }
 965
 966 st_src_reg
 967 glsl_to_tgsi_visitor::st_src_reg_for_double(double val)
 968 {
 969    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_DOUBLE);
 970    union gl_constant_value uval[2];
 971
 972    memcpy(uval, &val, sizeof(uval));
 973    src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
 974    src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
 975    return src;
 976 }
 977
 978 st_src_reg
 979 glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
 980 {
 981    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT);
 982    union gl_constant_value uval;
 983
 984    assert(native_integers);
 985
 986    uval.i = val;
 987    src.index = add_constant(src.file, &uval, 1, GL_INT, &src.swizzle);
 988
 989    return src;
 990 }
 991
 992 st_src_reg
 993 glsl_to_tgsi_visitor::st_src_reg_for_int64(int64_t val)
 994 {
 995    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT64);
 996    union gl_constant_value uval[2];
 997
 998    memcpy(uval, &val, sizeof(uval));
 999    src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
1000    src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
1001
1002    return src;
1003 }
1004
1005 st_src_reg
1006 glsl_to_tgsi_visitor::st_src_reg_for_type(enum glsl_base_type type, int val)
1007 {
1008    if (native_integers)
1009       return type == GLSL_TYPE_FLOAT ? st_src_reg_for_float(val) :
1010                                        st_src_reg_for_int(val);
1011    else
1012       return st_src_reg_for_float(val);
1013 }
1014
1015 static int
1016 attrib_type_size(const struct glsl_type *type, bool is_vs_input)
1017 {
1018    return type->count_attribute_slots(is_vs_input);
1019 }
1020
1021 static int
1022 type_size(const struct glsl_type *type)
1023 {
1024    return type->count_attribute_slots(false);
1025 }
1026
1027 static void
1028 add_buffer_to_load_and_stores(glsl_to_tgsi_instruction *inst, st_src_reg *buf,
1029                               exec_list *instructions, ir_constant *access)
1030 {
1031    /**
1032     * emit_asm() might have actually split the op into pieces, e.g. for
1033     * double stores. We have to go back and fix up all the generated ops.
1034     */
1035    enum tgsi_opcode op = inst->op;
1036    do {
1037       inst->resource = *buf;
1038       if (access)
1039          inst->buffer_access = access->value.u[0];
1040
1041       if (inst == instructions->get_head_raw())
1042          break;
1043       inst = (glsl_to_tgsi_instruction *)inst->get_prev();
1044
1045       if (inst->op == TGSI_OPCODE_UADD) {
1046          if (inst == instructions->get_head_raw())
1047             break;
1048          inst = (glsl_to_tgsi_instruction *)inst->get_prev();
1049       }
1050    } while (inst->op == op && inst->resource.file == PROGRAM_UNDEFINED);
1051 }
1052
1053 /**
1054  * If the given GLSL type is an array or matrix or a structure containing
1055  * an array/matrix member, return true.  Else return false.
1056  *
1057  * This is used to determine which kind of temp storage (PROGRAM_TEMPORARY
1058  * or PROGRAM_ARRAY) should be used for variables of this type.  Anytime
1059  * we have an array that might be indexed with a variable, we need to use
1060  * the later storage type.
1061  */
1062 static bool
1063 type_has_array_or_matrix(const glsl_type *type)
1064 {
1065    if (type->is_array() || type->is_matrix())
1066       return true;
1067
1068    if (type->is_record()) {
1069       for (unsigned i = 0; i < type->length; i++) {
1070          if (type_has_array_or_matrix(type->fields.structure[i].type)) {
1071             return true;
1072          }
1073       }
1074    }
1075
1076    return false;
1077 }
1078
1079
1080 /**
1081  * In the initial pass of codegen, we assign temporary numbers to
1082  * intermediate results.  (not SSA -- variable assignments will reuse
1083  * storage).
1084  */
1085 st_src_reg
1086 glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
1087 {
1088    st_src_reg src;
1089
1090    src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
1091    src.reladdr = NULL;
1092    src.negate = 0;
1093    src.abs = 0;
1094
1095    if (!options->EmitNoIndirectTemp && type_has_array_or_matrix(type)) {
1096       if (next_array >= max_num_arrays) {
1097          max_num_arrays += 32;
1098          array_sizes = (unsigned*)
1099             realloc(array_sizes, sizeof(array_sizes[0]) * max_num_arrays);
1100       }
1101
1102       src.file = PROGRAM_ARRAY;
1103       src.index = 0;
1104       src.array_id = next_array + 1;
1105       array_sizes[next_array] = type_size(type);
1106       ++next_array;
1107
1108    } else {
1109       src.file = PROGRAM_TEMPORARY;
1110       src.index = next_temp;
1111       next_temp += type_size(type);
1112    }
1113
1114    if (type->is_array() || type->is_record()) {
1115       src.swizzle = SWIZZLE_NOOP;
1116    } else {
1117       src.swizzle = swizzle_for_size(type->vector_elements);
1118    }
1119
1120    return src;
1121 }
1122
1123 variable_storage *
1124 glsl_to_tgsi_visitor::find_variable_storage(ir_variable *var)
1125 {
1126    struct hash_entry *entry;
1127
1128    entry = _mesa_hash_table_search(this->variables, var);
1129    if (!entry)
1130       return NULL;
1131
1132    return (variable_storage *)entry->data;
1133 }
1134
1135 void
1136 glsl_to_tgsi_visitor::visit(ir_variable *ir)
1137 {
1138    if (strcmp(ir->name, "gl_FragCoord") == 0) {
1139       this->prog->OriginUpperLeft = ir->data.origin_upper_left;
1140       this->prog->PixelCenterInteger = ir->data.pixel_center_integer;
1141    }
1142
1143    if (ir->data.mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
1144       unsigned int i;
1145       const ir_state_slot *const slots = ir->get_state_slots();
1146       assert(slots != NULL);
1147
1148       /* Check if this statevar's setup in the STATE file exactly
1149        * matches how we'll want to reference it as a
1150        * struct/array/whatever.  If not, then we need to move it into
1151        * temporary storage and hope that it'll get copy-propagated
1152        * out.
1153        */
1154       for (i = 0; i < ir->get_num_state_slots(); i++) {
1155          if (slots[i].swizzle != SWIZZLE_XYZW) {
1156             break;
1157          }
1158       }
1159
1160       variable_storage *storage;
1161       st_dst_reg dst;
1162       if (i == ir->get_num_state_slots()) {
1163          /* We'll set the index later. */
1164          storage = new(mem_ctx) variable_storage(ir, PROGRAM_STATE_VAR, -1);
1165
1166          _mesa_hash_table_insert(this->variables, ir, storage);
1167
1168          dst = undef_dst;
1169       } else {
1170          /* The variable_storage constructor allocates slots based on the size
1171           * of the type.  However, this had better match the number of state
1172           * elements that we're going to copy into the new temporary.
1173           */
1174          assert((int) ir->get_num_state_slots() == type_size(ir->type));
1175
1176          dst = st_dst_reg(get_temp(ir->type));
1177
1178          storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index,
1179                                                  dst.array_id);
1180
1181          _mesa_hash_table_insert(this->variables, ir, storage);
1182       }
1183
1184
1185       for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1186          int index = _mesa_add_state_reference(this->prog->Parameters,
1187                                                slots[i].tokens);
1188
1189          if (storage->file == PROGRAM_STATE_VAR) {
1190             if (storage->index == -1) {
1191                storage->index = index;
1192             } else {
1193                assert(index == storage->index + (int)i);
1194             }
1195          } else {
1196             /* We use GLSL_TYPE_FLOAT here regardless of the actual type of
1197              * the data being moved since MOV does not care about the type of
1198              * data it is moving, and we don't want to declare registers with
1199              * array or struct types.
1200              */
1201             st_src_reg src(PROGRAM_STATE_VAR, index, GLSL_TYPE_FLOAT);
1202             src.swizzle = slots[i].swizzle;
1203             emit_asm(ir, TGSI_OPCODE_MOV, dst, src);
1204             /* even a float takes up a whole vec4 reg in a struct/array. */
1205             dst.index++;
1206          }
1207       }
1208
1209       if (storage->file == PROGRAM_TEMPORARY &&
1210           dst.index != storage->index + (int) ir->get_num_state_slots()) {
1211          fail_link(this->shader_program,
1212                   "failed to load builtin uniform `%s'  (%d/%d regs loaded)\n",
1213                   ir->name, dst.index - storage->index,
1214                   type_size(ir->type));
1215       }
1216    }
1217 }
1218
1219 void
1220 glsl_to_tgsi_visitor::visit(ir_loop *ir)
1221 {
1222    emit_asm(NULL, TGSI_OPCODE_BGNLOOP);
1223
1224    visit_exec_list(&ir->body_instructions, this);
1225
1226    emit_asm(NULL, TGSI_OPCODE_ENDLOOP);
1227 }
1228
1229 void
1230 glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
1231 {
1232    switch (ir->mode) {
1233    case ir_loop_jump::jump_break:
1234       emit_asm(NULL, TGSI_OPCODE_BRK);
1235       break;
1236    case ir_loop_jump::jump_continue:
1237       emit_asm(NULL, TGSI_OPCODE_CONT);
1238       break;
1239    }
1240 }
1241
1242
1243 void
1244 glsl_to_tgsi_visitor::visit(ir_function_signature *ir)
1245 {
1246    assert(0);
1247    (void)ir;
1248 }
1249
1250 void
1251 glsl_to_tgsi_visitor::visit(ir_function *ir)
1252 {
1253    /* Ignore function bodies other than main() -- we shouldn't see calls to
1254     * them since they should all be inlined before we get to glsl_to_tgsi.
1255     */
1256    if (strcmp(ir->name, "main") == 0) {
1257       const ir_function_signature *sig;
1258       exec_list empty;
1259
1260       sig = ir->matching_signature(NULL, &empty, false);
1261
1262       assert(sig);
1263
1264       foreach_in_list(ir_instruction, ir, &sig->body) {
1265          ir->accept(this);
1266       }
1267    }
1268 }
1269
1270 bool
1271 glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
1272 {
1273    int nonmul_operand = 1 - mul_operand;
1274    st_src_reg a, b, c;
1275    st_dst_reg result_dst;
1276
1277    ir_expression *expr = ir->operands[mul_operand]->as_expression();
1278    if (!expr || expr->operation != ir_binop_mul)
1279       return false;
1280
1281    expr->operands[0]->accept(this);
1282    a = this->result;
1283    expr->operands[1]->accept(this);
1284    b = this->result;
1285    ir->operands[nonmul_operand]->accept(this);
1286    c = this->result;
1287
1288    this->result = get_temp(ir->type);
1289    result_dst = st_dst_reg(this->result);
1290    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1291    emit_asm(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
1292
1293    return true;
1294 }
1295
1296 /**
1297  * Emit MAD(a, -b, a) instead of AND(a, NOT(b))
1298  *
1299  * The logic values are 1.0 for true and 0.0 for false.  Logical-and is
1300  * implemented using multiplication, and logical-or is implemented using
1301  * addition.  Logical-not can be implemented as (true - x), or (1.0 - x).
1302  * As result, the logical expression (a & !b) can be rewritten as:
1303  *
1304  *     - a * !b
1305  *     - a * (1 - b)
1306  *     - (a * 1) - (a * b)
1307  *     - a + -(a * b)
1308  *     - a + (a * -b)
1309  *
1310  * This final expression can be implemented as a single MAD(a, -b, a)
1311  * instruction.
1312  */
1313 bool
1314 glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir,
1315                                                int try_operand)
1316 {
1317    const int other_operand = 1 - try_operand;
1318    st_src_reg a, b;
1319
1320    ir_expression *expr = ir->operands[try_operand]->as_expression();
1321    if (!expr || expr->operation != ir_unop_logic_not)
1322       return false;
1323
1324    ir->operands[other_operand]->accept(this);
1325    a = this->result;
1326    expr->operands[0]->accept(this);
1327    b = this->result;
1328
1329    b.negate = ~b.negate;
1330
1331    this->result = get_temp(ir->type);
1332    emit_asm(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
1333
1334    return true;
1335 }
1336
1337 void
1338 glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
1339                                       st_src_reg *reg, int *num_reladdr)
1340 {
1341    if (!reg->reladdr && !reg->reladdr2)
1342       return;
1343
1344    if (reg->reladdr)
1345       emit_arl(ir, address_reg, *reg->reladdr);
1346    if (reg->reladdr2)
1347       emit_arl(ir, address_reg2, *reg->reladdr2);
1348
1349    if (*num_reladdr != 1) {
1350       st_src_reg temp = get_temp(glsl_type::get_instance(reg->type, 4, 1));
1351
1352       emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
1353       *reg = temp;
1354    }
1355
1356    (*num_reladdr)--;
1357 }
1358
1359 void
1360 glsl_to_tgsi_visitor::visit(ir_expression *ir)
1361 {
1362    st_src_reg op[ARRAY_SIZE(ir->operands)];
1363
1364    /* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
1365     */
1366    if (!this->precise && ir->operation == ir_binop_add) {
1367       if (try_emit_mad(ir, 1))
1368          return;
1369       if (try_emit_mad(ir, 0))
1370          return;
1371    }
1372
1373    /* Quick peephole: Emit OPCODE_MAD(-a, -b, a) instead of AND(a, NOT(b))
1374     */
1375    if (!native_integers && ir->operation == ir_binop_logic_and) {
1376       if (try_emit_mad_for_and_not(ir, 1))
1377          return;
1378       if (try_emit_mad_for_and_not(ir, 0))
1379          return;
1380    }
1381
1382    if (ir->operation == ir_quadop_vector)
1383       assert(!"ir_quadop_vector should have been lowered");
1384
1385    for (unsigned int operand = 0; operand < ir->num_operands; operand++) {
1386       this->result.file = PROGRAM_UNDEFINED;
1387       ir->operands[operand]->accept(this);
1388       if (this->result.file == PROGRAM_UNDEFINED) {
1389          printf("Failed to get tree for expression operand:\n");
1390          ir->operands[operand]->print();
1391          printf("\n");
1392          exit(1);
1393       }
1394       op[operand] = this->result;
1395
1396       /* Matrix expression operands should have been broken down to vector
1397        * operations already.
1398        */
1399       assert(!ir->operands[operand]->type->is_matrix());
1400    }
1401
1402    visit_expression(ir, op);
1403 }
1404
1405 /* The non-recursive part of the expression visitor lives in a separate
1406  * function and should be prevented from being inlined, to avoid a stack
1407  * explosion when deeply nested expressions are visited.
1408  */
1409 void
1410 glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
1411 {
1412    st_src_reg result_src;
1413    st_dst_reg result_dst;
1414
1415    int vector_elements = ir->operands[0]->type->vector_elements;
1416    if (ir->operands[1] &&
1417        ir->operation != ir_binop_interpolate_at_offset &&
1418        ir->operation != ir_binop_interpolate_at_sample) {
1419       st_src_reg *swz_op = NULL;
1420       if (vector_elements > ir->operands[1]->type->vector_elements) {
1421          assert(ir->operands[1]->type->vector_elements == 1);
1422          swz_op = &op[1];
1423       } else if (vector_elements < ir->operands[1]->type->vector_elements) {
1424          assert(ir->operands[0]->type->vector_elements == 1);
1425          swz_op = &op[0];
1426       }
1427       if (swz_op) {
1428          uint16_t swizzle_x = GET_SWZ(swz_op->swizzle, 0);
1429          swz_op->swizzle = MAKE_SWIZZLE4(swizzle_x, swizzle_x,
1430                                          swizzle_x, swizzle_x);
1431       }
1432       vector_elements = MAX2(vector_elements,
1433                              ir->operands[1]->type->vector_elements);
1434    }
1435    if (ir->operands[2] &&
1436        ir->operands[2]->type->vector_elements != vector_elements) {
1437       /* This can happen with ir_triop_lrp, i.e. glsl mix */
1438       assert(ir->operands[2]->type->vector_elements == 1);
1439       uint16_t swizzle_x = GET_SWZ(op[2].swizzle, 0);
1440       op[2].swizzle = MAKE_SWIZZLE4(swizzle_x, swizzle_x,
1441                                     swizzle_x, swizzle_x);
1442    }
1443
1444    this->result.file = PROGRAM_UNDEFINED;
1445
1446    /* Storage for our result.  Ideally for an assignment we'd be using
1447     * the actual storage for the result here, instead.
1448     */
1449    result_src = get_temp(ir->type);
1450    /* convenience for the emit functions below. */
1451    result_dst = st_dst_reg(result_src);
1452    /* Limit writes to the channels that will be used by result_src later.
1453     * This does limit this temp's use as a temporary for multi-instruction
1454     * sequences.
1455     */
1456    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1457
1458    switch (ir->operation) {
1459    case ir_unop_logic_not:
1460       if (result_dst.type != GLSL_TYPE_FLOAT)
1461          emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1462       else {
1463          /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
1464           * older GPUs implement SEQ using multiple instructions (i915 uses two
1465           * SGE instructions and a MUL instruction).  Since our logic values are
1466           * 0.0 and 1.0, 1-x also implements !x.
1467           */
1468          op[0].negate = ~op[0].negate;
1469          emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0],
1470                   st_src_reg_for_float(1.0));
1471       }
1472       break;
1473    case ir_unop_neg:
1474       if (result_dst.type == GLSL_TYPE_INT64 ||
1475           result_dst.type == GLSL_TYPE_UINT64)
1476          emit_asm(ir, TGSI_OPCODE_I64NEG, result_dst, op[0]);
1477       else if (result_dst.type == GLSL_TYPE_INT ||
1478                result_dst.type == GLSL_TYPE_UINT)
1479          emit_asm(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
1480       else if (result_dst.type == GLSL_TYPE_DOUBLE)
1481          emit_asm(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
1482       else {
1483          op[0].negate = ~op[0].negate;
1484          result_src = op[0];
1485       }
1486       break;
1487    case ir_unop_subroutine_to_int:
1488       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1489       break;
1490    case ir_unop_abs:
1491       if (result_dst.type == GLSL_TYPE_FLOAT)
1492          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0].get_abs());
1493       else if (result_dst.type == GLSL_TYPE_DOUBLE)
1494          emit_asm(ir, TGSI_OPCODE_DABS, result_dst, op[0]);
1495       else if (result_dst.type == GLSL_TYPE_INT64 ||
1496                result_dst.type == GLSL_TYPE_UINT64)
1497          emit_asm(ir, TGSI_OPCODE_I64ABS, result_dst, op[0]);
1498       else
1499          emit_asm(ir, TGSI_OPCODE_IABS, result_dst, op[0]);
1500       break;
1501    case ir_unop_sign:
1502       emit_asm(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
1503       break;
1504    case ir_unop_rcp:
1505       emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
1506       break;
1507
1508    case ir_unop_exp2:
1509       emit_scalar(ir, TGSI_OPCODE_EX2, result_dst, op[0]);
1510       break;
1511    case ir_unop_exp:
1512       assert(!"not reached: should be handled by exp_to_exp2");
1513       break;
1514    case ir_unop_log:
1515       assert(!"not reached: should be handled by log_to_log2");
1516       break;
1517    case ir_unop_log2:
1518       emit_scalar(ir, TGSI_OPCODE_LG2, result_dst, op[0]);
1519       break;
1520    case ir_unop_sin:
1521       emit_scalar(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1522       break;
1523    case ir_unop_cos:
1524       emit_scalar(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1525       break;
1526    case ir_unop_saturate: {
1527       glsl_to_tgsi_instruction *inst;
1528       inst = emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1529       inst->saturate = true;
1530       break;
1531    }
1532
1533    case ir_unop_dFdx:
1534    case ir_unop_dFdx_coarse:
1535       emit_asm(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
1536       break;
1537    case ir_unop_dFdx_fine:
1538       emit_asm(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
1539       break;
1540    case ir_unop_dFdy:
1541    case ir_unop_dFdy_coarse:
1542    case ir_unop_dFdy_fine:
1543    {
1544       /* The X component contains 1 or -1 depending on whether the framebuffer
1545        * is a FBO or the window system buffer, respectively.
1546        * It is then multiplied with the source operand of DDY.
1547        */
1548       static const gl_state_index16 transform_y_state[STATE_LENGTH]
1549          = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
1550
1551       unsigned transform_y_index =
1552          _mesa_add_state_reference(this->prog->Parameters,
1553                                    transform_y_state);
1554
1555       st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
1556                                           transform_y_index,
1557                                           glsl_type::vec4_type);
1558       transform_y.swizzle = SWIZZLE_XXXX;
1559
1560       st_src_reg temp = get_temp(glsl_type::vec4_type);
1561
1562       emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
1563       emit_asm(ir, ir->operation == ir_unop_dFdy_fine ?
1564            TGSI_OPCODE_DDY_FINE : TGSI_OPCODE_DDY, result_dst, temp);
1565       break;
1566    }
1567
1568    case ir_unop_frexp_sig:
1569       emit_asm(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
1570       break;
1571
1572    case ir_unop_frexp_exp:
1573       emit_asm(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
1574       break;
1575
1576    case ir_unop_noise: {
1577       /* At some point, a motivated person could add a better
1578        * implementation of noise.  Currently not even the nvidia
1579        * binary drivers do anything more than this.  In any case, the
1580        * place to do this is in the GL state tracker, not the poor
1581        * driver.
1582        */
1583       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
1584       break;
1585    }
1586
1587    case ir_binop_add:
1588       emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1589       break;
1590    case ir_binop_sub:
1591       op[1].negate = ~op[1].negate;
1592       emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1593       break;
1594
1595    case ir_binop_mul:
1596       emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1597       break;
1598    case ir_binop_div:
1599       emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
1600       break;
1601    case ir_binop_mod:
1602       if (result_dst.type == GLSL_TYPE_FLOAT)
1603          assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1604       else
1605          emit_asm(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
1606       break;
1607
1608    case ir_binop_less:
1609       emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
1610       break;
1611    case ir_binop_gequal:
1612       emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
1613       break;
1614    case ir_binop_equal:
1615       emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1616       break;
1617    case ir_binop_nequal:
1618       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1619       break;
1620    case ir_binop_all_equal:
1621       /* "==" operator producing a scalar boolean. */
1622       if (ir->operands[0]->type->is_vector() ||
1623           ir->operands[1]->type->is_vector()) {
1624          st_src_reg temp = get_temp(native_integers ?
1625                                     glsl_type::uvec4_type :
1626                                     glsl_type::vec4_type);
1627
1628          if (native_integers) {
1629             st_dst_reg temp_dst = st_dst_reg(temp);
1630             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1631
1632             if (ir->operands[0]->type->is_boolean() &&
1633                 ir->operands[1]->as_constant() &&
1634                 ir->operands[1]->as_constant()->is_one()) {
1635                emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1636             } else {
1637                emit_asm(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
1638             }
1639
1640             /* Emit 1-3 AND operations to combine the SEQ results. */
1641             switch (ir->operands[0]->type->vector_elements) {
1642             case 2:
1643                break;
1644             case 3:
1645                temp_dst.writemask = WRITEMASK_Y;
1646                temp1.swizzle = SWIZZLE_YYYY;
1647                temp2.swizzle = SWIZZLE_ZZZZ;
1648                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1649                break;
1650             case 4:
1651                temp_dst.writemask = WRITEMASK_X;
1652                temp1.swizzle = SWIZZLE_XXXX;
1653                temp2.swizzle = SWIZZLE_YYYY;
1654                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1655                temp_dst.writemask = WRITEMASK_Y;
1656                temp1.swizzle = SWIZZLE_ZZZZ;
1657                temp2.swizzle = SWIZZLE_WWWW;
1658                emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1659             }
1660
1661             temp1.swizzle = SWIZZLE_XXXX;
1662             temp2.swizzle = SWIZZLE_YYYY;
1663             emit_asm(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
1664          } else {
1665             emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1666
1667             /* After the dot-product, the value will be an integer on the
1668              * range [0,4].  Zero becomes 1.0, and positive values become zero.
1669              */
1670             emit_dp(ir, result_dst, temp, temp, vector_elements);
1671
1672             /* Negating the result of the dot-product gives values on the range
1673              * [-4, 0].  Zero becomes 1.0, and negative values become zero.
1674              * This is achieved using SGE.
1675              */
1676             st_src_reg sge_src = result_src;
1677             sge_src.negate = ~sge_src.negate;
1678             emit_asm(ir, TGSI_OPCODE_SGE, result_dst, sge_src,
1679                      st_src_reg_for_float(0.0));
1680          }
1681       } else {
1682          emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1683       }
1684       break;
1685    case ir_binop_any_nequal:
1686       /* "!=" operator producing a scalar boolean. */
1687       if (ir->operands[0]->type->is_vector() ||
1688           ir->operands[1]->type->is_vector()) {
1689          st_src_reg temp = get_temp(native_integers ?
1690                                     glsl_type::uvec4_type :
1691                                     glsl_type::vec4_type);
1692          if (ir->operands[0]->type->is_boolean() &&
1693              ir->operands[1]->as_constant() &&
1694              ir->operands[1]->as_constant()->is_zero()) {
1695             emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1696          } else {
1697             emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1698          }
1699
1700          if (native_integers) {
1701             st_dst_reg temp_dst = st_dst_reg(temp);
1702             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1703
1704             /* Emit 1-3 OR operations to combine the SNE results. */
1705             switch (ir->operands[0]->type->vector_elements) {
1706             case 2:
1707                break;
1708             case 3:
1709                temp_dst.writemask = WRITEMASK_Y;
1710                temp1.swizzle = SWIZZLE_YYYY;
1711                temp2.swizzle = SWIZZLE_ZZZZ;
1712                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1713                break;
1714             case 4:
1715                temp_dst.writemask = WRITEMASK_X;
1716                temp1.swizzle = SWIZZLE_XXXX;
1717                temp2.swizzle = SWIZZLE_YYYY;
1718                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1719                temp_dst.writemask = WRITEMASK_Y;
1720                temp1.swizzle = SWIZZLE_ZZZZ;
1721                temp2.swizzle = SWIZZLE_WWWW;
1722                emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1723             }
1724
1725             temp1.swizzle = SWIZZLE_XXXX;
1726             temp2.swizzle = SWIZZLE_YYYY;
1727             emit_asm(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
1728          } else {
1729             /* After the dot-product, the value will be an integer on the
1730              * range [0,4].  Zero stays zero, and positive values become 1.0.
1731              */
1732             glsl_to_tgsi_instruction *const dp =
1733                   emit_dp(ir, result_dst, temp, temp, vector_elements);
1734             if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1735                /* The clamping to [0,1] can be done for free in the fragment
1736                 * shader with a saturate.
1737                 */
1738                dp->saturate = true;
1739             } else {
1740                /* Negating the result of the dot-product gives values on the
1741                 * range [-4, 0].  Zero stays zero, and negative values become
1742                 * 1.0.  This achieved using SLT.
1743                 */
1744                st_src_reg slt_src = result_src;
1745                slt_src.negate = ~slt_src.negate;
1746                emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src,
1747                         st_src_reg_for_float(0.0));
1748             }
1749          }
1750       } else {
1751          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1752       }
1753       break;
1754
1755    case ir_binop_logic_xor:
1756       if (native_integers)
1757          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1758       else
1759          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1760       break;
1761
1762    case ir_binop_logic_or: {
1763       if (native_integers) {
1764          /* If integers are used as booleans, we can use an actual "or"
1765           * instruction.
1766           */
1767          assert(native_integers);
1768          emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1769       } else {
1770          /* After the addition, the value will be an integer on the
1771           * range [0,2].  Zero stays zero, and positive values become 1.0.
1772           */
1773          glsl_to_tgsi_instruction *add =
1774             emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1775          if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1776             /* The clamping to [0,1] can be done for free in the fragment
1777              * shader with a saturate if floats are being used as boolean
1778              * values.
1779              */
1780             add->saturate = true;
1781          } else {
1782             /* Negating the result of the addition gives values on the range
1783              * [-2, 0].  Zero stays zero, and negative values become 1.0
1784              * This is achieved using SLT.
1785              */
1786             st_src_reg slt_src = result_src;
1787             slt_src.negate = ~slt_src.negate;
1788             emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src,
1789                      st_src_reg_for_float(0.0));
1790          }
1791       }
1792       break;
1793    }
1794
1795    case ir_binop_logic_and:
1796       /* If native integers are disabled, the bool args are stored as float 0.0
1797        * or 1.0, so "mul" gives us "and".  If they're enabled, just use the
1798        * actual AND opcode.
1799        */
1800       if (native_integers)
1801          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1802       else
1803          emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1804       break;
1805
1806    case ir_binop_dot:
1807       assert(ir->operands[0]->type->is_vector());
1808       assert(ir->operands[0]->type == ir->operands[1]->type);
1809       emit_dp(ir, result_dst, op[0], op[1],
1810               ir->operands[0]->type->vector_elements);
1811       break;
1812
1813    case ir_unop_sqrt:
1814       if (have_sqrt) {
1815          emit_scalar(ir, TGSI_OPCODE_SQRT, result_dst, op[0]);
1816       } else {
1817          /* This is the only instruction sequence that makes the game "Risen"
1818           * render correctly. ABS is not required for the game, but since GLSL
1819           * declares negative values as "undefined", allowing us to do whatever
1820           * we want, I choose to use ABS to match DX9 and pre-GLSL RSQ
1821           * behavior.
1822           */
1823          emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0].get_abs());
1824          emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, result_src);
1825       }
1826       break;
1827    case ir_unop_rsq:
1828       emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
1829       break;
1830    case ir_unop_i2f:
1831       if (native_integers) {
1832          emit_asm(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
1833          break;
1834       }
1835       /* fallthrough to next case otherwise */
1836    case ir_unop_b2f:
1837       if (native_integers) {
1838          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0],
1839                   st_src_reg_for_float(1.0));
1840          break;
1841       }
1842       /* fallthrough to next case otherwise */
1843    case ir_unop_i2u:
1844    case ir_unop_u2i:
1845    case ir_unop_i642u64:
1846    case ir_unop_u642i64:
1847       /* Converting between signed and unsigned integers is a no-op. */
1848       result_src = op[0];
1849       result_src.type = result_dst.type;
1850       break;
1851    case ir_unop_b2i:
1852       if (native_integers) {
1853          /* Booleans are stored as integers using ~0 for true and 0 for false.
1854           * GLSL requires that int(bool) return 1 for true and 0 for false.
1855           * This conversion is done with AND, but it could be done with NEG.
1856           */
1857          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0],
1858                   st_src_reg_for_int(1));
1859       } else {
1860          /* Booleans and integers are both stored as floats when native
1861           * integers are disabled.
1862           */
1863          result_src = op[0];
1864       }
1865       break;
1866    case ir_unop_f2i:
1867       if (native_integers)
1868          emit_asm(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
1869       else
1870          emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1871       break;
1872    case ir_unop_f2u:
1873       if (native_integers)
1874          emit_asm(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
1875       else
1876          emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1877       break;
1878    case ir_unop_bitcast_f2i:
1879    case ir_unop_bitcast_f2u:
1880       /* Make sure we don't propagate the negate modifier to integer opcodes. */
1881       if (op[0].negate || op[0].abs)
1882          emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1883       else
1884          result_src = op[0];
1885       result_src.type = ir->operation == ir_unop_bitcast_f2i ? GLSL_TYPE_INT :
1886                                                                GLSL_TYPE_UINT;
1887       break;
1888    case ir_unop_bitcast_i2f:
1889    case ir_unop_bitcast_u2f:
1890       result_src = op[0];
1891       result_src.type = GLSL_TYPE_FLOAT;
1892       break;
1893    case ir_unop_f2b:
1894       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0],
1895                st_src_reg_for_float(0.0));
1896       break;
1897    case ir_unop_d2b:
1898       emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0],
1899                st_src_reg_for_double(0.0));
1900       break;
1901    case ir_unop_i2b:
1902       if (native_integers)
1903          emit_asm(ir, TGSI_OPCODE_USNE, result_dst, op[0],
1904                   st_src_reg_for_int(0));
1905       else
1906          emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0],
1907                   st_src_reg_for_float(0.0));
1908       break;
1909    case ir_unop_bitcast_u642d:
1910    case ir_unop_bitcast_i642d:
1911       result_src = op[0];
1912       result_src.type = GLSL_TYPE_DOUBLE;
1913       break;
1914    case ir_unop_bitcast_d2i64:
1915       result_src = op[0];
1916       result_src.type = GLSL_TYPE_INT64;
1917       break;
1918    case ir_unop_bitcast_d2u64:
1919       result_src = op[0];
1920       result_src.type = GLSL_TYPE_UINT64;
1921       break;
1922    case ir_unop_trunc:
1923       emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1924       break;
1925    case ir_unop_ceil:
1926       emit_asm(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
1927       break;
1928    case ir_unop_floor:
1929       emit_asm(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
1930       break;
1931    case ir_unop_round_even:
1932       emit_asm(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
1933       break;
1934    case ir_unop_fract:
1935       emit_asm(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
1936       break;
1937
1938    case ir_binop_min:
1939       emit_asm(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
1940       break;
1941    case ir_binop_max:
1942       emit_asm(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
1943       break;
1944    case ir_binop_pow:
1945       emit_scalar(ir, TGSI_OPCODE_POW, result_dst, op[0], op[1]);
1946       break;
1947
1948    case ir_unop_bit_not:
1949       if (native_integers) {
1950          emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1951          break;
1952       }
1953    case ir_unop_u2f:
1954       if (native_integers) {
1955          emit_asm(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
1956          break;
1957       }
1958    case ir_binop_lshift:
1959    case ir_binop_rshift:
1960       if (native_integers) {
1961          enum tgsi_opcode opcode = ir->operation == ir_binop_lshift
1962             ? TGSI_OPCODE_SHL : TGSI_OPCODE_ISHR;
1963          st_src_reg count;
1964
1965          if (glsl_base_type_is_64bit(op[0].type)) {
1966             /* GLSL shift operations have 32-bit shift counts, but TGSI uses
1967              * 64 bits.
1968              */
1969             count = get_temp(glsl_type::u64vec(ir->operands[1]
1970                                                ->type->components()));
1971             emit_asm(ir, TGSI_OPCODE_U2I64, st_dst_reg(count), op[1]);
1972          } else {
1973             count = op[1];
1974          }
1975
1976          emit_asm(ir, opcode, result_dst, op[0], count);
1977          break;
1978       }
1979    case ir_binop_bit_and:
1980       if (native_integers) {
1981          emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1982          break;
1983       }
1984    case ir_binop_bit_xor:
1985       if (native_integers) {
1986          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1987          break;
1988       }
1989    case ir_binop_bit_or:
1990       if (native_integers) {
1991          emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1992          break;
1993       }
1994
1995       assert(!"GLSL 1.30 features unsupported");
1996       break;
1997
1998    case ir_binop_ubo_load: {
1999       if (ctx->Const.UseSTD430AsDefaultPacking) {
2000          ir_rvalue *block = ir->operands[0];
2001          ir_rvalue *offset = ir->operands[1];
2002          ir_constant *const_block = block->as_constant();
2003
2004          st_src_reg cbuf(PROGRAM_CONSTANT,
2005             (const_block ? const_block->value.u[0] + 1 : 1),
2006             ir->type->base_type);
2007
2008          cbuf.has_index2 = true;
2009
2010          if (!const_block) {
2011             block->accept(this);
2012             cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
2013             *cbuf.reladdr = this->result;
2014             emit_arl(ir, sampler_reladdr, this->result);
2015          }
2016
2017          /* Calculate the surface offset */
2018          offset->accept(this);
2019          st_src_reg off = this->result;
2020
2021          glsl_to_tgsi_instruction *inst =
2022             emit_asm(ir, TGSI_OPCODE_LOAD, result_dst, off);
2023
2024          if (result_dst.type == GLSL_TYPE_BOOL)
2025             emit_asm(ir, TGSI_OPCODE_USNE, result_dst, st_src_reg(result_dst),
2026                      st_src_reg_for_int(0));
2027
2028          add_buffer_to_load_and_stores(inst, &cbuf, &this->instructions,
2029                                        NULL);
2030       } else {
2031          ir_constant *const_uniform_block = ir->operands[0]->as_constant();
2032          ir_constant *const_offset_ir = ir->operands[1]->as_constant();
2033          unsigned const_offset = const_offset_ir ?
2034             const_offset_ir->value.u[0] : 0;
2035          unsigned const_block = const_uniform_block ?
2036             const_uniform_block->value.u[0] + 1 : 1;
2037          st_src_reg index_reg = get_temp(glsl_type::uint_type);
2038          st_src_reg cbuf;
2039
2040          cbuf.type = ir->type->base_type;
2041          cbuf.file = PROGRAM_CONSTANT;
2042          cbuf.index = 0;
2043          cbuf.reladdr = NULL;
2044          cbuf.negate = 0;
2045          cbuf.abs = 0;
2046          cbuf.index2D = const_block;
2047
2048          assert(ir->type->is_vector() || ir->type->is_scalar());
2049
2050          if (const_offset_ir) {
2051             /* Constant index into constant buffer */
2052             cbuf.reladdr = NULL;
2053             cbuf.index = const_offset / 16;
2054          } else {
2055             ir_expression *offset_expr = ir->operands[1]->as_expression();
2056             st_src_reg offset = op[1];
2057
2058             /* The OpenGL spec is written in such a way that accesses with
2059              * non-constant offset are almost always vec4-aligned. The only
2060              * exception to this are members of structs in arrays of structs:
2061              * each struct in an array of structs is at least vec4-aligned,
2062              * but single-element and [ui]vec2 members of the struct may be at
2063              * an offset that is not a multiple of 16 bytes.
2064              *
2065              * Here, we extract that offset, relying on previous passes to
2066              * always generate offset expressions of the form
2067              * (+ expr constant_offset).
2068              *
2069              * Note that the std430 layout, which allows more cases of
2070              * alignment less than vec4 in arrays, is not supported for
2071              * uniform blocks, so we do not have to deal with it here.
2072              */
2073             if (offset_expr && offset_expr->operation == ir_binop_add) {
2074                const_offset_ir = offset_expr->operands[1]->as_constant();
2075                if (const_offset_ir) {
2076                   const_offset = const_offset_ir->value.u[0];
2077                   cbuf.index = const_offset / 16;
2078                   offset_expr->operands[0]->accept(this);
2079                   offset = this->result;
2080                }
2081             }
2082
2083             /* Relative/variable index into constant buffer */
2084             emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), offset,
2085                  st_src_reg_for_int(4));
2086             cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
2087             memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
2088          }
2089
2090          if (const_uniform_block) {
2091             /* Constant constant buffer */
2092             cbuf.reladdr2 = NULL;
2093          } else {
2094             /* Relative/variable constant buffer */
2095             cbuf.reladdr2 = ralloc(mem_ctx, st_src_reg);
2096             memcpy(cbuf.reladdr2, &op[0], sizeof(st_src_reg));
2097          }
2098          cbuf.has_index2 = true;
2099
2100          cbuf.swizzle = swizzle_for_size(ir->type->vector_elements);
2101          if (glsl_base_type_is_64bit(cbuf.type))
2102             cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 8,
2103                                           const_offset % 16 / 8,
2104                                           const_offset % 16 / 8,
2105                                           const_offset % 16 / 8);
2106          else
2107             cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 4,
2108                                           const_offset % 16 / 4,
2109                                           const_offset % 16 / 4,
2110                                           const_offset % 16 / 4);
2111
2112          if (ir->type->is_boolean()) {
2113             emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf,
2114                      st_src_reg_for_int(0));
2115          } else {
2116             emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
2117          }
2118       }
2119       break;
2120    }
2121    case ir_triop_lrp:
2122       /* note: we have to reorder the three args here */
2123       emit_asm(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
2124       break;
2125    case ir_triop_csel:
2126       if (this->ctx->Const.NativeIntegers)
2127          emit_asm(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
2128       else {
2129          op[0].negate = ~op[0].negate;
2130          emit_asm(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
2131       }
2132       break;
2133    case ir_triop_bitfield_extract:
2134       emit_asm(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
2135       break;
2136    case ir_quadop_bitfield_insert:
2137       emit_asm(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
2138       break;
2139    case ir_unop_bitfield_reverse:
2140       emit_asm(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
2141       break;
2142    case ir_unop_bit_count:
2143       emit_asm(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
2144       break;
2145    case ir_unop_find_msb:
2146       emit_asm(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
2147       break;
2148    case ir_unop_find_lsb:
2149       emit_asm(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
2150       break;
2151    case ir_binop_imul_high:
2152       emit_asm(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
2153       break;
2154    case ir_triop_fma:
2155       /* In theory, MAD is incorrect here. */
2156       if (have_fma)
2157          emit_asm(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
2158       else
2159          emit_asm(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
2160       break;
2161    case ir_unop_interpolate_at_centroid:
2162       emit_asm(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
2163       break;
2164    case ir_binop_interpolate_at_offset: {
2165       /* The y coordinate needs to be flipped for the default fb */
2166       static const gl_state_index16 transform_y_state[STATE_LENGTH]
2167          = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
2168
2169       unsigned transform_y_index =
2170          _mesa_add_state_reference(this->prog->Parameters,
2171                                    transform_y_state);
2172
2173       st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
2174                                           transform_y_index,
2175                                           glsl_type::vec4_type);
2176       transform_y.swizzle = SWIZZLE_XXXX;
2177
2178       st_src_reg temp = get_temp(glsl_type::vec2_type);
2179       st_dst_reg temp_dst = st_dst_reg(temp);
2180
2181       emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[1]);
2182       temp_dst.writemask = WRITEMASK_Y;
2183       emit_asm(ir, TGSI_OPCODE_MUL, temp_dst, transform_y, op[1]);
2184       emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], temp);
2185       break;
2186    }
2187    case ir_binop_interpolate_at_sample:
2188       emit_asm(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
2189       break;
2190
2191    case ir_unop_d2f:
2192       emit_asm(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
2193       break;
2194    case ir_unop_f2d:
2195       emit_asm(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
2196       break;
2197    case ir_unop_d2i:
2198       emit_asm(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
2199       break;
2200    case ir_unop_i2d:
2201       emit_asm(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
2202       break;
2203    case ir_unop_d2u:
2204       emit_asm(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
2205       break;
2206    case ir_unop_u2d:
2207       emit_asm(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
2208       break;
2209    case ir_unop_unpack_double_2x32:
2210    case ir_unop_pack_double_2x32:
2211    case ir_unop_unpack_int_2x32:
2212    case ir_unop_pack_int_2x32:
2213    case ir_unop_unpack_uint_2x32:
2214    case ir_unop_pack_uint_2x32:
2215    case ir_unop_unpack_sampler_2x32:
2216    case ir_unop_pack_sampler_2x32:
2217    case ir_unop_unpack_image_2x32:
2218    case ir_unop_pack_image_2x32:
2219       emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
2220       break;
2221
2222    case ir_binop_ldexp:
2223       if (ir->operands[0]->type->is_double()) {
2224          emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
2225       } else if (ir->operands[0]->type->is_float()) {
2226          emit_asm(ir, TGSI_OPCODE_LDEXP, result_dst, op[0], op[1]);
2227       } else {
2228          assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
2229       }
2230       break;
2231
2232    case ir_unop_pack_half_2x16:
2233       emit_asm(ir, TGSI_OPCODE_PK2H, result_dst, op[0]);
2234       break;
2235    case ir_unop_unpack_half_2x16:
2236       emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
2237       break;
2238
2239    case ir_unop_get_buffer_size: {
2240       ir_constant *const_offset = ir->operands[0]->as_constant();
2241       int buf_base = ctx->st->has_hw_atomics
2242          ? 0 : ctx->Const.Program[shader->Stage].MaxAtomicBuffers;
2243       st_src_reg buffer(
2244             PROGRAM_BUFFER,
2245             buf_base + (const_offset ? const_offset->value.u[0] : 0),
2246             GLSL_TYPE_UINT);
2247       if (!const_offset) {
2248          buffer.reladdr = ralloc(mem_ctx, st_src_reg);
2249          *buffer.reladdr = op[0];
2250          emit_arl(ir, sampler_reladdr, op[0]);
2251       }
2252       emit_asm(ir, TGSI_OPCODE_RESQ, result_dst)->resource = buffer;
2253       break;
2254    }
2255
2256    case ir_unop_u2i64:
2257    case ir_unop_u2u64:
2258    case ir_unop_b2i64: {
2259       st_src_reg temp = get_temp(glsl_type::uvec4_type);
2260       st_dst_reg temp_dst = st_dst_reg(temp);
2261       unsigned orig_swz = op[0].swizzle;
2262       /*
2263        * To convert unsigned to 64-bit:
2264        * zero Y channel, copy X channel.
2265        */
2266       temp_dst.writemask = WRITEMASK_Y;
2267       if (vector_elements > 1)
2268          temp_dst.writemask |= WRITEMASK_W;
2269       emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
2270       temp_dst.writemask = WRITEMASK_X;
2271       if (vector_elements > 1)
2272           temp_dst.writemask |= WRITEMASK_Z;
2273       op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 0), GET_SWZ(orig_swz, 0),
2274                                     GET_SWZ(orig_swz, 1), GET_SWZ(orig_swz, 1));
2275       if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
2276          emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2277       else
2278          emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0], st_src_reg_for_int(1));
2279       result_src = temp;
2280       result_src.type = GLSL_TYPE_UINT64;
2281       if (vector_elements > 2) {
2282          /* Subtle: We rely on the fact that get_temp here returns the next
2283           * TGSI temporary register directly after the temp register used for
2284           * the first two components, so that the result gets picked up
2285           * automatically.
2286           */
2287          st_src_reg temp = get_temp(glsl_type::uvec4_type);
2288          st_dst_reg temp_dst = st_dst_reg(temp);
2289          temp_dst.writemask = WRITEMASK_Y;
2290          if (vector_elements > 3)
2291             temp_dst.writemask |= WRITEMASK_W;
2292          emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
2293
2294          temp_dst.writemask = WRITEMASK_X;
2295          if (vector_elements > 3)
2296             temp_dst.writemask |= WRITEMASK_Z;
2297          op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 2),
2298                                        GET_SWZ(orig_swz, 2),
2299                                        GET_SWZ(orig_swz, 3),
2300                                        GET_SWZ(orig_swz, 3));
2301          if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
2302             emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2303          else
2304             emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0],
2305                      st_src_reg_for_int(1));
2306       }
2307       break;
2308    }
2309    case ir_unop_i642i:
2310    case ir_unop_u642i:
2311    case ir_unop_u642u:
2312    case ir_unop_i642u: {
2313       st_src_reg temp = get_temp(glsl_type::uvec4_type);
2314       st_dst_reg temp_dst = st_dst_reg(temp);
2315       unsigned orig_swz = op[0].swizzle;
2316       unsigned orig_idx = op[0].index;
2317       int el;
2318       temp_dst.writemask = WRITEMASK_X;
2319
2320       for (el = 0; el < vector_elements; el++) {
2321          unsigned swz = GET_SWZ(orig_swz, el);
2322          if (swz & 1)
2323             op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_Z,
2324                                           SWIZZLE_Z, SWIZZLE_Z);
2325          else
2326             op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X,
2327                                           SWIZZLE_X, SWIZZLE_X);
2328          if (swz > 2)
2329             op[0].index = orig_idx + 1;
2330          op[0].type = GLSL_TYPE_UINT;
2331          temp_dst.writemask = WRITEMASK_X << el;
2332          emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2333       }
2334       result_src = temp;
2335       if (ir->operation == ir_unop_u642u || ir->operation == ir_unop_i642u)
2336          result_src.type = GLSL_TYPE_UINT;
2337       else
2338          result_src.type = GLSL_TYPE_INT;
2339       break;
2340    }
2341    case ir_unop_i642b:
2342       emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0],
2343                st_src_reg_for_int64(0));
2344       break;
2345    case ir_unop_i642f:
2346       emit_asm(ir, TGSI_OPCODE_I642F, result_dst, op[0]);
2347       break;
2348    case ir_unop_u642f:
2349       emit_asm(ir, TGSI_OPCODE_U642F, result_dst, op[0]);
2350       break;
2351    case ir_unop_i642d:
2352       emit_asm(ir, TGSI_OPCODE_I642D, result_dst, op[0]);
2353       break;
2354    case ir_unop_u642d:
2355       emit_asm(ir, TGSI_OPCODE_U642D, result_dst, op[0]);
2356       break;
2357    case ir_unop_i2i64:
2358       emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
2359       break;
2360    case ir_unop_f2i64:
2361       emit_asm(ir, TGSI_OPCODE_F2I64, result_dst, op[0]);
2362       break;
2363    case ir_unop_d2i64:
2364       emit_asm(ir, TGSI_OPCODE_D2I64, result_dst, op[0]);
2365       break;
2366    case ir_unop_i2u64:
2367       emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
2368       break;
2369    case ir_unop_f2u64:
2370       emit_asm(ir, TGSI_OPCODE_F2U64, result_dst, op[0]);
2371       break;
2372    case ir_unop_d2u64:
2373       emit_asm(ir, TGSI_OPCODE_D2U64, result_dst, op[0]);
2374       break;
2375       /* these might be needed */
2376    case ir_unop_pack_snorm_2x16:
2377    case ir_unop_pack_unorm_2x16:
2378    case ir_unop_pack_snorm_4x8:
2379    case ir_unop_pack_unorm_4x8:
2380
2381    case ir_unop_unpack_snorm_2x16:
2382    case ir_unop_unpack_unorm_2x16:
2383    case ir_unop_unpack_snorm_4x8:
2384    case ir_unop_unpack_unorm_4x8:
2385
2386    case ir_quadop_vector:
2387    case ir_binop_vector_extract:
2388    case ir_triop_vector_insert:
2389    case ir_binop_carry:
2390    case ir_binop_borrow:
2391    case ir_unop_ssbo_unsized_array_length:
2392       /* This operation is not supported, or should have already been handled.
2393        */
2394       assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
2395       break;
2396    }
2397
2398    this->result = result_src;
2399 }
2400
2401
2402 void
2403 glsl_to_tgsi_visitor::visit(ir_swizzle *ir)
2404 {
2405    st_src_reg src;
2406    int i;
2407    int swizzle[4];
2408
2409    /* Note that this is only swizzles in expressions, not those on the left
2410     * hand side of an assignment, which do write masking.  See ir_assignment
2411     * for that.
2412     */
2413
2414    ir->val->accept(this);
2415    src = this->result;
2416    assert(src.file != PROGRAM_UNDEFINED);
2417    assert(ir->type->vector_elements > 0);
2418
2419    for (i = 0; i < 4; i++) {
2420       if (i < ir->type->vector_elements) {
2421          switch (i) {
2422          case 0:
2423             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.x);
2424             break;
2425          case 1:
2426             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.y);
2427             break;
2428          case 2:
2429             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.z);
2430             break;
2431          case 3:
2432             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.w);
2433             break;
2434          }
2435       } else {
2436          /* If the type is smaller than a vec4, replicate the last
2437           * channel out.
2438           */
2439          swizzle[i] = swizzle[ir->type->vector_elements - 1];
2440       }
2441    }
2442
2443    src.swizzle = MAKE_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2444
2445    this->result = src;
2446 }
2447
2448 /* Test if the variable is an array. Note that geometry and
2449  * tessellation shader inputs are outputs are always arrays (except
2450  * for patch inputs), so only the array element type is considered.
2451  */
2452 static bool
2453 is_inout_array(unsigned stage, ir_variable *var, bool *remove_array)
2454 {
2455    const glsl_type *type = var->type;
2456
2457    *remove_array = false;
2458
2459    if ((stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
2460        (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out))
2461       return false;
2462
2463    if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) ||
2464         (stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) ||
2465         stage == MESA_SHADER_TESS_CTRL) &&
2466        !var->data.patch) {
2467       if (!var->type->is_array())
2468          return false; /* a system value probably */
2469
2470       type = var->type->fields.array;
2471       *remove_array = true;
2472    }
2473
2474    return type->is_array() || type->is_matrix();
2475 }
2476
2477 static unsigned
2478 st_translate_interp_loc(ir_variable *var)
2479 {
2480    if (var->data.centroid)
2481       return TGSI_INTERPOLATE_LOC_CENTROID;
2482    else if (var->data.sample)
2483       return TGSI_INTERPOLATE_LOC_SAMPLE;
2484    else
2485       return TGSI_INTERPOLATE_LOC_CENTER;
2486 }
2487
2488 void
2489 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
2490 {
2491    variable_storage *entry;
2492    ir_variable *var = ir->var;
2493    bool remove_array;
2494
2495    if (handle_bound_deref(ir->as_dereference()))
2496       return;
2497
2498    entry = find_variable_storage(ir->var);
2499
2500    if (!entry) {
2501       switch (var->data.mode) {
2502       case ir_var_uniform:
2503          entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
2504                                                var->data.param_index);
2505          _mesa_hash_table_insert(this->variables, var, entry);
2506          break;
2507       case ir_var_shader_in: {
2508          /* The linker assigns locations for varyings and attributes,
2509           * including deprecated builtins (like gl_Color), user-assign
2510           * generic attributes (glBindVertexLocation), and
2511           * user-defined varyings.
2512           */
2513          assert(var->data.location != -1);
2514
2515          const glsl_type *type_without_array = var->type->without_array();
2516          struct inout_decl *decl = &inputs[num_inputs];
2517          unsigned component = var->data.location_frac;
2518          unsigned num_components;
2519          num_inputs++;
2520
2521          if (type_without_array->is_64bit())
2522             component = component / 2;
2523          if (type_without_array->vector_elements)
2524             num_components = type_without_array->vector_elements;
2525          else
2526             num_components = 4;
2527
2528          decl->mesa_index = var->data.location;
2529          decl->interp = (glsl_interp_mode) var->data.interpolation;
2530          decl->interp_loc = st_translate_interp_loc(var);
2531          decl->base_type = type_without_array->base_type;
2532          decl->usage_mask = u_bit_consecutive(component, num_components);
2533
2534          if (is_inout_array(shader->Stage, var, &remove_array)) {
2535             decl->array_id = num_input_arrays + 1;
2536             num_input_arrays++;
2537          } else {
2538             decl->array_id = 0;
2539          }
2540
2541          if (remove_array)
2542             decl->size = type_size(var->type->fields.array);
2543          else
2544             decl->size = type_size(var->type);
2545
2546          entry = new(mem_ctx) variable_storage(var,
2547                                                PROGRAM_INPUT,
2548                                                decl->mesa_index,
2549                                                decl->array_id);
2550          entry->component = component;
2551
2552          _mesa_hash_table_insert(this->variables, var, entry);
2553
2554          break;
2555       }
2556       case ir_var_shader_out: {
2557          assert(var->data.location != -1);
2558
2559          const glsl_type *type_without_array = var->type->without_array();
2560          struct inout_decl *decl = &outputs[num_outputs];
2561          unsigned component = var->data.location_frac;
2562          unsigned num_components;
2563          num_outputs++;
2564
2565          decl->invariant = var->data.invariant;
2566
2567          if (type_without_array->is_64bit())
2568             component = component / 2;
2569          if (type_without_array->vector_elements)
2570             num_components = type_without_array->vector_elements;
2571          else
2572             num_components = 4;
2573
2574          decl->mesa_index = var->data.location + FRAG_RESULT_MAX * var->data.index;
2575          decl->base_type = type_without_array->base_type;
2576          decl->usage_mask = u_bit_consecutive(component, num_components);
2577          if (var->data.stream & (1u << 31)) {
2578             decl->gs_out_streams = var->data.stream & ~(1u << 31);
2579          } else {
2580             assert(var->data.stream < 4);
2581             decl->gs_out_streams = 0;
2582             for (unsigned i = 0; i < num_components; ++i)
2583                decl->gs_out_streams |= var->data.stream << (2 * (component + i));
2584          }
2585
2586          if (is_inout_array(shader->Stage, var, &remove_array)) {
2587             decl->array_id = num_output_arrays + 1;
2588             num_output_arrays++;
2589          } else {
2590             decl->array_id = 0;
2591          }
2592
2593          if (remove_array)
2594             decl->size = type_size(var->type->fields.array);
2595          else
2596             decl->size = type_size(var->type);
2597
2598          if (var->data.fb_fetch_output) {
2599             st_dst_reg dst = st_dst_reg(get_temp(var->type));
2600             st_src_reg src = st_src_reg(PROGRAM_OUTPUT, decl->mesa_index,
2601                                         var->type, component, decl->array_id);
2602             emit_asm(NULL, TGSI_OPCODE_FBFETCH, dst, src);
2603             entry = new(mem_ctx) variable_storage(var, dst.file, dst.index,
2604                                                   dst.array_id);
2605          } else {
2606             entry = new(mem_ctx) variable_storage(var,
2607                                                   PROGRAM_OUTPUT,
2608                                                   decl->mesa_index,
2609                                                   decl->array_id);
2610          }
2611          entry->component = component;
2612
2613          _mesa_hash_table_insert(this->variables, var, entry);
2614
2615          break;
2616       }
2617       case ir_var_system_value:
2618          entry = new(mem_ctx) variable_storage(var,
2619                                                PROGRAM_SYSTEM_VALUE,
2620                                                var->data.location);
2621          break;
2622       case ir_var_auto:
2623       case ir_var_temporary:
2624          st_src_reg src = get_temp(var->type);
2625
2626          entry = new(mem_ctx) variable_storage(var, src.file, src.index,
2627                                                src.array_id);
2628          _mesa_hash_table_insert(this->variables, var, entry);
2629
2630          break;
2631       }
2632
2633       if (!entry) {
2634          printf("Failed to make storage for %s\n", var->name);
2635          exit(1);
2636       }
2637    }
2638
2639    this->result = st_src_reg(entry->file, entry->index, var->type,
2640                              entry->component, entry->array_id);
2641    if (this->shader->Stage == MESA_SHADER_VERTEX &&
2642        var->data.mode == ir_var_shader_in &&
2643        var->type->without_array()->is_double())
2644       this->result.is_double_vertex_input = true;
2645    if (!native_integers)
2646       this->result.type = GLSL_TYPE_FLOAT;
2647 }
2648
2649 static void
2650 shrink_array_declarations(struct inout_decl *decls, unsigned count,
2651                           GLbitfield64* usage_mask,
2652                           GLbitfield64 double_usage_mask,
2653                           GLbitfield* patch_usage_mask)
2654 {
2655    unsigned i;
2656    int j;
2657
2658    /* Fix array declarations by removing unused array elements at both ends
2659     * of the arrays. For example, mat4[3] where only mat[1] is used.
2660     */
2661    for (i = 0; i < count; i++) {
2662       struct inout_decl *decl = &decls[i];
2663       if (!decl->array_id)
2664          continue;
2665
2666       /* Shrink the beginning. */
2667       for (j = 0; j < (int)decl->size; j++) {
2668          if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2669             if (*patch_usage_mask &
2670                 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2671                break;
2672          }
2673          else {
2674             if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2675                break;
2676             if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2677                break;
2678          }
2679
2680          decl->mesa_index++;
2681          decl->size--;
2682          j--;
2683       }
2684
2685       /* Shrink the end. */
2686       for (j = decl->size-1; j >= 0; j--) {
2687          if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2688             if (*patch_usage_mask &
2689                 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2690                break;
2691          }
2692          else {
2693             if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2694                break;
2695             if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2696                break;
2697          }
2698
2699          decl->size--;
2700       }
2701
2702       /* When not all entries of an array are accessed, we mark them as used
2703        * here anyway, to ensure that the input/output mapping logic doesn't get
2704        * confused.
2705        *
2706        * TODO This happens when an array isn't used via indirect access, which
2707        * some game ports do (at least eON-based). There is an optimization
2708        * opportunity here by replacing the array declaration with non-array
2709        * declarations of those slots that are actually used.
2710        */
2711       for (j = 1; j < (int)decl->size; ++j) {
2712          if (decl->mesa_index >= VARYING_SLOT_PATCH0)
2713             *patch_usage_mask |= BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j);
2714          else
2715             *usage_mask |= BITFIELD64_BIT(decl->mesa_index + j);
2716       }
2717    }
2718 }
2719
2720 void
2721 glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
2722 {
2723    ir_constant *index;
2724    st_src_reg src;
2725    bool is_2D = false;
2726    ir_variable *var = ir->variable_referenced();
2727
2728    if (handle_bound_deref(ir->as_dereference()))
2729       return;
2730
2731    /* We only need the logic provided by st_glsl_storage_type_size()
2732     * for arrays of structs. Indirect sampler and image indexing is handled
2733     * elsewhere.
2734     */
2735    int element_size = ir->type->without_array()->is_record() ?
2736       st_glsl_storage_type_size(ir->type, var->data.bindless) :
2737       type_size(ir->type);
2738
2739    index = ir->array_index->constant_expression_value(ralloc_parent(ir));
2740
2741    ir->array->accept(this);
2742    src = this->result;
2743
2744    if (!src.has_index2) {
2745       switch (this->prog->Target) {
2746       case GL_TESS_CONTROL_PROGRAM_NV:
2747          is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
2748                  !ir->variable_referenced()->data.patch;
2749          break;
2750       case GL_TESS_EVALUATION_PROGRAM_NV:
2751          is_2D = src.file == PROGRAM_INPUT &&
2752                  !ir->variable_referenced()->data.patch;
2753          break;
2754       case GL_GEOMETRY_PROGRAM_NV:
2755          is_2D = src.file == PROGRAM_INPUT;
2756          break;
2757       }
2758    }
2759
2760    if (is_2D)
2761       element_size = 1;
2762
2763    if (index) {
2764
2765       if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
2766           src.file == PROGRAM_INPUT)
2767          element_size = attrib_type_size(ir->type, true);
2768       if (is_2D) {
2769          src.index2D = index->value.i[0];
2770          src.has_index2 = true;
2771       } else
2772          src.index += index->value.i[0] * element_size;
2773    } else {
2774       /* Variable index array dereference.  It eats the "vec4" of the
2775        * base of the array and an index that offsets the TGSI register
2776        * index.
2777        */
2778       ir->array_index->accept(this);
2779
2780       st_src_reg index_reg;
2781
2782       if (element_size == 1) {
2783          index_reg = this->result;
2784       } else {
2785          index_reg = get_temp(native_integers ?
2786                               glsl_type::int_type : glsl_type::float_type);
2787
2788          emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
2789               this->result, st_src_reg_for_type(index_reg.type, element_size));
2790       }
2791
2792       /* If there was already a relative address register involved, add the
2793        * new and the old together to get the new offset.
2794        */
2795       if (!is_2D && src.reladdr != NULL) {
2796          st_src_reg accum_reg = get_temp(native_integers ?
2797                                 glsl_type::int_type : glsl_type::float_type);
2798
2799          emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
2800               index_reg, *src.reladdr);
2801
2802          index_reg = accum_reg;
2803       }
2804
2805       if (is_2D) {
2806          src.reladdr2 = ralloc(mem_ctx, st_src_reg);
2807          memcpy(src.reladdr2, &index_reg, sizeof(index_reg));
2808          src.index2D = 0;
2809          src.has_index2 = true;
2810       } else {
2811          src.reladdr = ralloc(mem_ctx, st_src_reg);
2812          memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2813       }
2814    }
2815
2816    /* Change the register type to the element type of the array. */
2817    src.type = ir->type->base_type;
2818
2819    this->result = src;
2820 }
2821
2822 void
2823 glsl_to_tgsi_visitor::visit(ir_dereference_record *ir)
2824 {
2825    unsigned int i;
2826    const glsl_type *struct_type = ir->record->type;
2827    ir_variable *var = ir->record->variable_referenced();
2828    int offset = 0;
2829
2830    if (handle_bound_deref(ir->as_dereference()))
2831       return;
2832
2833    ir->record->accept(this);
2834
2835    assert(ir->field_idx >= 0);
2836    assert(var);
2837    for (i = 0; i < struct_type->length; i++) {
2838       if (i == (unsigned) ir->field_idx)
2839          break;
2840       const glsl_type *member_type = struct_type->fields.structure[i].type;
2841       offset += st_glsl_storage_type_size(member_type, var->data.bindless);
2842    }
2843
2844    /* If the type is smaller than a vec4, replicate the last channel out. */
2845    if (ir->type->is_scalar() || ir->type->is_vector())
2846       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2847    else
2848       this->result.swizzle = SWIZZLE_NOOP;
2849
2850    this->result.index += offset;
2851    this->result.type = ir->type->base_type;
2852 }
2853
2854 /**
2855  * We want to be careful in assignment setup to hit the actual storage
2856  * instead of potentially using a temporary like we might with the
2857  * ir_dereference handler.
2858  */
2859 static st_dst_reg
2860 get_assignment_lhs(ir_dereference *ir, glsl_to_tgsi_visitor *v, int *component)
2861 {
2862    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2863     * access of a vector, it must be separated into a series conditional moves
2864     * before reaching this point (see ir_vec_index_to_cond_assign).
2865     */
2866    assert(ir->as_dereference());
2867    ir_dereference_array *deref_array = ir->as_dereference_array();
2868    if (deref_array) {
2869       assert(!deref_array->array->type->is_vector());
2870    }
2871
2872    /* Use the rvalue deref handler for the most part.  We write swizzles using
2873     * the writemask, but we do extract the base component for enhanced layouts
2874     * from the source swizzle.
2875     */
2876    ir->accept(v);
2877    *component = GET_SWZ(v->result.swizzle, 0);
2878    return st_dst_reg(v->result);
2879 }
2880
2881 /**
2882  * Process the condition of a conditional assignment
2883  *
2884  * Examines the condition of a conditional assignment to generate the optimal
2885  * first operand of a \c CMP instruction.  If the condition is a relational
2886  * operator with 0 (e.g., \c ir_binop_less), the value being compared will be
2887  * used as the source for the \c CMP instruction.  Otherwise the comparison
2888  * is processed to a boolean result, and the boolean result is used as the
2889  * operand to the CMP instruction.
2890  */
2891 bool
2892 glsl_to_tgsi_visitor::process_move_condition(ir_rvalue *ir)
2893 {
2894    ir_rvalue *src_ir = ir;
2895    bool negate = true;
2896    bool switch_order = false;
2897
2898    ir_expression *const expr = ir->as_expression();
2899
2900    if (native_integers) {
2901       if ((expr != NULL) && (expr->num_operands == 2)) {
2902          enum glsl_base_type type = expr->operands[0]->type->base_type;
2903          if (type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT ||
2904              type == GLSL_TYPE_BOOL) {
2905             if (expr->operation == ir_binop_equal) {
2906                if (expr->operands[0]->is_zero()) {
2907                   src_ir = expr->operands[1];
2908                   switch_order = true;
2909                }
2910                else if (expr->operands[1]->is_zero()) {
2911                   src_ir = expr->operands[0];
2912                   switch_order = true;
2913                }
2914             }
2915             else if (expr->operation == ir_binop_nequal) {
2916                if (expr->operands[0]->is_zero()) {
2917                   src_ir = expr->operands[1];
2918                }
2919                else if (expr->operands[1]->is_zero()) {
2920                   src_ir = expr->operands[0];
2921                }
2922             }
2923          }
2924       }
2925
2926       src_ir->accept(this);
2927       return switch_order;
2928    }
2929
2930    if ((expr != NULL) && (expr->num_operands == 2)) {
2931       bool zero_on_left = false;
2932
2933       if (expr->operands[0]->is_zero()) {
2934          src_ir = expr->operands[1];
2935          zero_on_left = true;
2936       } else if (expr->operands[1]->is_zero()) {
2937          src_ir = expr->operands[0];
2938          zero_on_left = false;
2939       }
2940
2941       /*      a is -  0  +            -  0  +
2942        * (a <  0)  T  F  F  ( a < 0)  T  F  F
2943        * (0 <  a)  F  F  T  (-a < 0)  F  F  T
2944        * (a >= 0)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
2945        * (0 >= a)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
2946        *
2947        * Note that exchanging the order of 0 and 'a' in the comparison simply
2948        * means that the value of 'a' should be negated.
2949        */
2950       if (src_ir != ir) {
2951          switch (expr->operation) {
2952          case ir_binop_less:
2953             switch_order = false;
2954             negate = zero_on_left;
2955             break;
2956
2957          case ir_binop_gequal:
2958             switch_order = true;
2959             negate = zero_on_left;
2960             break;
2961
2962          default:
2963             /* This isn't the right kind of comparison afterall, so make sure
2964              * the whole condition is visited.
2965              */
2966             src_ir = ir;
2967             break;
2968          }
2969       }
2970    }
2971
2972    src_ir->accept(this);
2973
2974    /* We use the TGSI_OPCODE_CMP (a < 0 ? b : c) for conditional moves, and the
2975     * condition we produced is 0.0 or 1.0.  By flipping the sign, we can
2976     * choose which value TGSI_OPCODE_CMP produces without an extra instruction
2977     * computing the condition.
2978     */
2979    if (negate)
2980       this->result.negate = ~this->result.negate;
2981
2982    return switch_order;
2983 }
2984
2985 void
2986 glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
2987                                      st_dst_reg *l, st_src_reg *r,
2988                                      st_src_reg *cond, bool cond_swap)
2989 {
2990    if (type->is_record()) {
2991       for (unsigned int i = 0; i < type->length; i++) {
2992          emit_block_mov(ir, type->fields.structure[i].type, l, r,
2993                         cond, cond_swap);
2994       }
2995       return;
2996    }
2997
2998    if (type->is_array()) {
2999       for (unsigned int i = 0; i < type->length; i++) {
3000          emit_block_mov(ir, type->fields.array, l, r, cond, cond_swap);
3001       }
3002       return;
3003    }
3004
3005    if (type->is_matrix()) {
3006       const struct glsl_type *vec_type;
3007
3008       vec_type = glsl_type::get_instance(type->is_double()
3009                                          ? GLSL_TYPE_DOUBLE : GLSL_TYPE_FLOAT,
3010                                          type->vector_elements, 1);
3011
3012       for (int i = 0; i < type->matrix_columns; i++) {
3013          emit_block_mov(ir, vec_type, l, r, cond, cond_swap);
3014       }
3015       return;
3016    }
3017
3018    assert(type->is_scalar() || type->is_vector());
3019
3020    l->type = type->base_type;
3021    r->type = type->base_type;
3022    if (cond) {
3023       st_src_reg l_src = st_src_reg(*l);
3024
3025       if (l_src.file == PROGRAM_OUTPUT &&
3026           this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
3027           (l_src.index == FRAG_RESULT_DEPTH ||
3028            l_src.index == FRAG_RESULT_STENCIL)) {
3029          /* This is a special case because the source swizzles will be shifted
3030           * later to account for the difference between GLSL (where they're
3031           * plain floats) and TGSI (where they're Z and Y components). */
3032          l_src.swizzle = SWIZZLE_XXXX;
3033       }
3034
3035       if (native_integers) {
3036          emit_asm(ir, TGSI_OPCODE_UCMP, *l, *cond,
3037               cond_swap ? l_src : *r,
3038               cond_swap ? *r : l_src);
3039       } else {
3040          emit_asm(ir, TGSI_OPCODE_CMP, *l, *cond,
3041               cond_swap ? l_src : *r,
3042               cond_swap ? *r : l_src);
3043       }
3044    } else {
3045       emit_asm(ir, TGSI_OPCODE_MOV, *l, *r);
3046    }
3047    l->index++;
3048    r->index++;
3049    if (type->is_dual_slot()) {
3050       l->index++;
3051       if (r->is_double_vertex_input == false)
3052          r->index++;
3053    }
3054 }
3055
3056 void
3057 glsl_to_tgsi_visitor::visit(ir_assignment *ir)
3058 {
3059    int dst_component;
3060    st_dst_reg l;
3061    st_src_reg r;
3062
3063    /* all generated instructions need to be flaged as precise */
3064    this->precise = is_precise(ir->lhs->variable_referenced());
3065    ir->rhs->accept(this);
3066    r = this->result;
3067
3068    l = get_assignment_lhs(ir->lhs, this, &dst_component);
3069
3070    {
3071       int swizzles[4];
3072       int first_enabled_chan = 0;
3073       int rhs_chan = 0;
3074       ir_variable *variable = ir->lhs->variable_referenced();
3075
3076       if (shader->Stage == MESA_SHADER_FRAGMENT &&
3077           variable->data.mode == ir_var_shader_out &&
3078           (variable->data.location == FRAG_RESULT_DEPTH ||
3079            variable->data.location == FRAG_RESULT_STENCIL)) {
3080          assert(ir->lhs->type->is_scalar());
3081          assert(ir->write_mask == WRITEMASK_X);
3082
3083          if (variable->data.location == FRAG_RESULT_DEPTH)
3084             l.writemask = WRITEMASK_Z;
3085          else {
3086             assert(variable->data.location == FRAG_RESULT_STENCIL);
3087             l.writemask = WRITEMASK_Y;
3088          }
3089       } else if (ir->write_mask == 0) {
3090          assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
3091
3092          unsigned num_elements =
3093             ir->lhs->type->without_array()->vector_elements;
3094
3095          if (num_elements) {
3096             l.writemask = u_bit_consecutive(0, num_elements);
3097          } else {
3098             /* The type is a struct or an array of (array of) structs. */
3099             l.writemask = WRITEMASK_XYZW;
3100          }
3101       } else {
3102          l.writemask = ir->write_mask;
3103       }
3104
3105       for (int i = 0; i < 4; i++) {
3106          if (l.writemask & (1 << i)) {
3107             first_enabled_chan = GET_SWZ(r.swizzle, i);
3108             break;
3109          }
3110       }
3111
3112       l.writemask = l.writemask << dst_component;
3113
3114       /* Swizzle a small RHS vector into the channels being written.
3115        *
3116        * glsl ir treats write_mask as dictating how many channels are
3117        * present on the RHS while TGSI treats write_mask as just
3118        * showing which channels of the vec4 RHS get written.
3119        */
3120       for (int i = 0; i < 4; i++) {
3121          if (l.writemask & (1 << i))
3122             swizzles[i] = GET_SWZ(r.swizzle, rhs_chan++);
3123          else
3124             swizzles[i] = first_enabled_chan;
3125       }
3126       r.swizzle = MAKE_SWIZZLE4(swizzles[0], swizzles[1],
3127                                 swizzles[2], swizzles[3]);
3128    }
3129
3130    assert(l.file != PROGRAM_UNDEFINED);
3131    assert(r.file != PROGRAM_UNDEFINED);
3132
3133    if (ir->condition) {
3134       const bool switch_order = this->process_move_condition(ir->condition);
3135       st_src_reg condition = this->result;
3136
3137       emit_block_mov(ir, ir->lhs->type, &l, &r, &condition, switch_order);
3138    } else if (ir->rhs->as_expression() &&
3139               this->instructions.get_tail() &&
3140               ir->rhs == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->ir &&
3141               !((glsl_to_tgsi_instruction *)this->instructions.get_tail())->is_64bit_expanded &&
3142               type_size(ir->lhs->type) == 1 &&
3143               l.writemask == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->dst[0].writemask) {
3144       /* To avoid emitting an extra MOV when assigning an expression to a
3145        * variable, emit the last instruction of the expression again, but
3146        * replace the destination register with the target of the assignment.
3147        * Dead code elimination will remove the original instruction.
3148        */
3149       glsl_to_tgsi_instruction *inst, *new_inst;
3150       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
3151       new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2], inst->src[3]);
3152       new_inst->saturate = inst->saturate;
3153       new_inst->resource = inst->resource;
3154       inst->dead_mask = inst->dst[0].writemask;
3155    } else {
3156       emit_block_mov(ir, ir->rhs->type, &l, &r, NULL, false);
3157    }
3158    this->precise = 0;
3159 }
3160
3161
3162 void
3163 glsl_to_tgsi_visitor::visit(ir_constant *ir)
3164 {
3165    st_src_reg src;
3166    GLdouble stack_vals[4] = { 0 };
3167    gl_constant_value *values = (gl_constant_value *) stack_vals;
3168    GLenum gl_type = GL_NONE;
3169    unsigned int i, elements;
3170    static int in_array = 0;
3171    gl_register_file file = in_array ? PROGRAM_CONSTANT : PROGRAM_IMMEDIATE;
3172
3173    /* Unfortunately, 4 floats is all we can get into
3174     * _mesa_add_typed_unnamed_constant.  So, make a temp to store an
3175     * aggregate constant and move each constant value into it.  If we
3176     * get lucky, copy propagation will eliminate the extra moves.
3177     */
3178    if (ir->type->is_record()) {
3179       st_src_reg temp_base = get_temp(ir->type);
3180       st_dst_reg temp = st_dst_reg(temp_base);
3181
3182       for (i = 0; i < ir->type->length; i++) {
3183          ir_constant *const field_value = ir->get_record_field(i);
3184          int size = type_size(field_value->type);
3185
3186          assert(size > 0);
3187
3188          field_value->accept(this);
3189          src = this->result;
3190
3191          for (unsigned j = 0; j < (unsigned int)size; j++) {
3192             emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
3193
3194             src.index++;
3195             temp.index++;
3196          }
3197       }
3198       this->result = temp_base;
3199       return;
3200    }
3201
3202    if (ir->type->is_array()) {
3203       st_src_reg temp_base = get_temp(ir->type);
3204       st_dst_reg temp = st_dst_reg(temp_base);
3205       int size = type_size(ir->type->fields.array);
3206
3207       assert(size > 0);
3208       in_array++;
3209
3210       for (i = 0; i < ir->type->length; i++) {
3211          ir->const_elements[i]->accept(this);
3212          src = this->result;
3213          for (int j = 0; j < size; j++) {
3214             emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
3215
3216             src.index++;
3217             temp.index++;
3218          }
3219       }
3220       this->result = temp_base;
3221       in_array--;
3222       return;
3223    }
3224
3225    if (ir->type->is_matrix()) {
3226       st_src_reg mat = get_temp(ir->type);
3227       st_dst_reg mat_column = st_dst_reg(mat);
3228
3229       for (i = 0; i < ir->type->matrix_columns; i++) {
3230          switch (ir->type->base_type) {
3231          case GLSL_TYPE_FLOAT:
3232             values = (gl_constant_value *)
3233                &ir->value.f[i * ir->type->vector_elements];
3234
3235             src = st_src_reg(file, -1, ir->type->base_type);
3236             src.index = add_constant(file,
3237                                      values,
3238                                      ir->type->vector_elements,
3239                                      GL_FLOAT,
3240                                      &src.swizzle);
3241             emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3242             break;
3243          case GLSL_TYPE_DOUBLE:
3244             values = (gl_constant_value *)
3245                &ir->value.d[i * ir->type->vector_elements];
3246             src = st_src_reg(file, -1, ir->type->base_type);
3247             src.index = add_constant(file,
3248                                      values,
3249                                      ir->type->vector_elements,
3250                                      GL_DOUBLE,
3251                                      &src.swizzle);
3252             if (ir->type->vector_elements >= 2) {
3253                mat_column.writemask = WRITEMASK_XY;
3254                src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
3255                                            SWIZZLE_X, SWIZZLE_Y);
3256                emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3257             } else {
3258                mat_column.writemask = WRITEMASK_X;
3259                src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X,
3260                                            SWIZZLE_X, SWIZZLE_X);
3261                emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3262             }
3263             src.index++;
3264             if (ir->type->vector_elements > 2) {
3265                if (ir->type->vector_elements == 4) {
3266                   mat_column.writemask = WRITEMASK_ZW;
3267                   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
3268                                               SWIZZLE_X, SWIZZLE_Y);
3269                   emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3270                } else {
3271                   mat_column.writemask = WRITEMASK_Z;
3272                   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_Y, SWIZZLE_Y,
3273                                               SWIZZLE_Y, SWIZZLE_Y);
3274                   emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
3275                   mat_column.writemask = WRITEMASK_XYZW;
3276                   src.swizzle = SWIZZLE_XYZW;
3277                }
3278                mat_column.index++;
3279             }
3280             break;
3281          default:
3282             unreachable("Illegal matrix constant type.\n");
3283             break;
3284          }
3285          mat_column.index++;
3286       }
3287       this->result = mat;
3288       return;
3289    }
3290
3291    elements = ir->type->vector_elements;
3292    switch (ir->type->base_type) {
3293    case GLSL_TYPE_FLOAT:
3294       gl_type = GL_FLOAT;
3295       for (i = 0; i < ir->type->vector_elements; i++) {
3296          values[i].f = ir->value.f[i];
3297       }
3298       break;
3299    case GLSL_TYPE_DOUBLE:
3300       gl_type = GL_DOUBLE;
3301       for (i = 0; i < ir->type->vector_elements; i++) {
3302          memcpy(&values[i * 2], &ir->value.d[i], sizeof(double));
3303       }
3304       break;
3305    case GLSL_TYPE_INT64:
3306       gl_type = GL_INT64_ARB;
3307       for (i = 0; i < ir->type->vector_elements; i++) {
3308          memcpy(&values[i * 2], &ir->value.d[i], sizeof(int64_t));
3309       }
3310       break;
3311    case GLSL_TYPE_UINT64:
3312       gl_type = GL_UNSIGNED_INT64_ARB;
3313       for (i = 0; i < ir->type->vector_elements; i++) {
3314          memcpy(&values[i * 2], &ir->value.d[i], sizeof(uint64_t));
3315       }
3316       break;
3317    case GLSL_TYPE_UINT:
3318       gl_type = native_integers ? GL_UNSIGNED_INT : GL_FLOAT;
3319       for (i = 0; i < ir->type->vector_elements; i++) {
3320          if (native_integers)
3321             values[i].u = ir->value.u[i];
3322          else
3323             values[i].f = ir->value.u[i];
3324       }
3325       break;
3326    case GLSL_TYPE_INT:
3327       gl_type = native_integers ? GL_INT : GL_FLOAT;
3328       for (i = 0; i < ir->type->vector_elements; i++) {
3329          if (native_integers)
3330             values[i].i = ir->value.i[i];
3331          else
3332             values[i].f = ir->value.i[i];
3333       }
3334       break;
3335    case GLSL_TYPE_BOOL:
3336       gl_type = native_integers ? GL_BOOL : GL_FLOAT;
3337       for (i = 0; i < ir->type->vector_elements; i++) {
3338          values[i].u = ir->value.b[i] ? ctx->Const.UniformBooleanTrue : 0;
3339       }
3340       break;
3341    case GLSL_TYPE_SAMPLER:
3342    case GLSL_TYPE_IMAGE:
3343       gl_type = GL_UNSIGNED_INT;
3344       elements = 2;
3345       values[0].u = ir->value.u64[0] & 0xffffffff;
3346       values[1].u = ir->value.u64[0] >> 32;
3347       break;
3348    default:
3349       assert(!"Non-float/uint/int/bool/sampler/image constant");
3350    }
3351
3352    this->result = st_src_reg(file, -1, ir->type);
3353    this->result.index = add_constant(file,
3354                                      values,
3355                                      elements,
3356                                      gl_type,
3357                                      &this->result.swizzle);
3358 }
3359
3360 void
3361 glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
3362 {
3363    exec_node *param = ir->actual_parameters.get_head();
3364    ir_dereference *deref = static_cast<ir_dereference *>(param);
3365    ir_variable *location = deref->variable_referenced();
3366    bool has_hw_atomics = st_context(ctx)->has_hw_atomics;
3367    /* Calculate the surface offset */
3368    st_src_reg offset;
3369    unsigned array_size = 0, base = 0;
3370    uint16_t index = 0;
3371    st_src_reg resource;
3372
3373    get_deref_offsets(deref, &array_size, &base, &index, &offset, false);
3374
3375    if (has_hw_atomics) {
3376       variable_storage *entry = find_variable_storage(location);
3377       st_src_reg buffer(PROGRAM_HW_ATOMIC, 0, GLSL_TYPE_ATOMIC_UINT,
3378                         location->data.binding);
3379
3380       if (!entry) {
3381          entry = new(mem_ctx) variable_storage(location, PROGRAM_HW_ATOMIC,
3382                                                num_atomics);
3383          _mesa_hash_table_insert(this->variables, location, entry);
3384
3385          atomic_info[num_atomics].location = location->data.location;
3386          atomic_info[num_atomics].binding = location->data.binding;
3387          atomic_info[num_atomics].size = location->type->arrays_of_arrays_size();
3388          if (atomic_info[num_atomics].size == 0)
3389             atomic_info[num_atomics].size = 1;
3390          atomic_info[num_atomics].array_id = 0;
3391          num_atomics++;
3392       }
3393
3394       if (offset.file != PROGRAM_UNDEFINED) {
3395          if (atomic_info[entry->index].array_id == 0) {
3396             num_atomic_arrays++;
3397             atomic_info[entry->index].array_id = num_atomic_arrays;
3398          }
3399          buffer.array_id = atomic_info[entry->index].array_id;
3400       }
3401
3402       buffer.index = index;
3403       buffer.index += location->data.offset / ATOMIC_COUNTER_SIZE;
3404       buffer.has_index2 = true;
3405
3406       if (offset.file != PROGRAM_UNDEFINED) {
3407          buffer.reladdr = ralloc(mem_ctx, st_src_reg);
3408          *buffer.reladdr = offset;
3409          emit_arl(ir, sampler_reladdr, offset);
3410       }
3411       offset = st_src_reg_for_int(0);
3412
3413       resource = buffer;
3414    } else {
3415       st_src_reg buffer(PROGRAM_BUFFER, location->data.binding,
3416                         GLSL_TYPE_ATOMIC_UINT);
3417
3418       if (offset.file != PROGRAM_UNDEFINED) {
3419          emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
3420                   offset, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
3421          emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(offset),
3422                   offset, st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE));
3423       } else {
3424          offset = st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE);
3425       }
3426       resource = buffer;
3427    }
3428
3429    ir->return_deref->accept(this);
3430    st_dst_reg dst(this->result);
3431    dst.writemask = WRITEMASK_X;
3432
3433    glsl_to_tgsi_instruction *inst;
3434
3435    if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_read) {
3436       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, offset);
3437    } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_increment) {
3438       inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
3439                       st_src_reg_for_int(1));
3440    } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_predecrement) {
3441       inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
3442                       st_src_reg_for_int(-1));
3443       emit_asm(ir, TGSI_OPCODE_ADD, dst, this->result, st_src_reg_for_int(-1));
3444    } else {
3445       param = param->get_next();
3446       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3447       val->accept(this);
3448
3449       st_src_reg data = this->result, data2 = undef_src;
3450       enum tgsi_opcode opcode;
3451       switch (ir->callee->intrinsic_id) {
3452       case ir_intrinsic_atomic_counter_add:
3453          opcode = TGSI_OPCODE_ATOMUADD;
3454          break;
3455       case ir_intrinsic_atomic_counter_min:
3456          opcode = TGSI_OPCODE_ATOMIMIN;
3457          break;
3458       case ir_intrinsic_atomic_counter_max:
3459          opcode = TGSI_OPCODE_ATOMIMAX;
3460          break;
3461       case ir_intrinsic_atomic_counter_and:
3462          opcode = TGSI_OPCODE_ATOMAND;
3463          break;
3464       case ir_intrinsic_atomic_counter_or:
3465          opcode = TGSI_OPCODE_ATOMOR;
3466          break;
3467       case ir_intrinsic_atomic_counter_xor:
3468          opcode = TGSI_OPCODE_ATOMXOR;
3469          break;
3470       case ir_intrinsic_atomic_counter_exchange:
3471          opcode = TGSI_OPCODE_ATOMXCHG;
3472          break;
3473       case ir_intrinsic_atomic_counter_comp_swap: {
3474          opcode = TGSI_OPCODE_ATOMCAS;
3475          param = param->get_next();
3476          val = ((ir_instruction *)param)->as_rvalue();
3477          val->accept(this);
3478          data2 = this->result;
3479          break;
3480       }
3481       default:
3482          assert(!"Unexpected intrinsic");
3483          return;
3484       }
3485
3486       inst = emit_asm(ir, opcode, dst, offset, data, data2);
3487    }
3488
3489    inst->resource = resource;
3490 }
3491
3492 void
3493 glsl_to_tgsi_visitor::visit_ssbo_intrinsic(ir_call *ir)
3494 {
3495    exec_node *param = ir->actual_parameters.get_head();
3496
3497    ir_rvalue *block = ((ir_instruction *)param)->as_rvalue();
3498
3499    param = param->get_next();
3500    ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
3501
3502    ir_constant *const_block = block->as_constant();
3503    int buf_base = st_context(ctx)->has_hw_atomics
3504       ? 0 : ctx->Const.Program[shader->Stage].MaxAtomicBuffers;
3505    st_src_reg buffer(
3506          PROGRAM_BUFFER,
3507          buf_base + (const_block ? const_block->value.u[0] : 0),
3508          GLSL_TYPE_UINT);
3509
3510    if (!const_block) {
3511       block->accept(this);
3512       buffer.reladdr = ralloc(mem_ctx, st_src_reg);
3513       *buffer.reladdr = this->result;
3514       emit_arl(ir, sampler_reladdr, this->result);
3515    }
3516
3517    /* Calculate the surface offset */
3518    offset->accept(this);
3519    st_src_reg off = this->result;
3520
3521    st_dst_reg dst = undef_dst;
3522    if (ir->return_deref) {
3523       ir->return_deref->accept(this);
3524       dst = st_dst_reg(this->result);
3525       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3526    }
3527
3528    glsl_to_tgsi_instruction *inst;
3529
3530    if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_load) {
3531       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
3532       if (dst.type == GLSL_TYPE_BOOL)
3533          emit_asm(ir, TGSI_OPCODE_USNE, dst, st_src_reg(dst),
3534                   st_src_reg_for_int(0));
3535    } else if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_store) {
3536       param = param->get_next();
3537       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3538       val->accept(this);
3539
3540       param = param->get_next();
3541       ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
3542       assert(write_mask);
3543       dst.writemask = write_mask->value.u[0];
3544
3545       dst.type = this->result.type;
3546       inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
3547    } else {
3548       param = param->get_next();
3549       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3550       val->accept(this);
3551
3552       st_src_reg data = this->result, data2 = undef_src;
3553       enum tgsi_opcode opcode;
3554       switch (ir->callee->intrinsic_id) {
3555       case ir_intrinsic_ssbo_atomic_add:
3556          opcode = TGSI_OPCODE_ATOMUADD;
3557          break;
3558       case ir_intrinsic_ssbo_atomic_min:
3559          opcode = TGSI_OPCODE_ATOMIMIN;
3560          break;
3561       case ir_intrinsic_ssbo_atomic_max:
3562          opcode = TGSI_OPCODE_ATOMIMAX;
3563          break;
3564       case ir_intrinsic_ssbo_atomic_and:
3565          opcode = TGSI_OPCODE_ATOMAND;
3566          break;
3567       case ir_intrinsic_ssbo_atomic_or:
3568          opcode = TGSI_OPCODE_ATOMOR;
3569          break;
3570       case ir_intrinsic_ssbo_atomic_xor:
3571          opcode = TGSI_OPCODE_ATOMXOR;
3572          break;
3573       case ir_intrinsic_ssbo_atomic_exchange:
3574          opcode = TGSI_OPCODE_ATOMXCHG;
3575          break;
3576       case ir_intrinsic_ssbo_atomic_comp_swap:
3577          opcode = TGSI_OPCODE_ATOMCAS;
3578          param = param->get_next();
3579          val = ((ir_instruction *)param)->as_rvalue();
3580          val->accept(this);
3581          data2 = this->result;
3582          break;
3583       default:
3584          assert(!"Unexpected intrinsic");
3585          return;
3586       }
3587
3588       inst = emit_asm(ir, opcode, dst, off, data, data2);
3589    }
3590
3591    param = param->get_next();
3592    ir_constant *access = NULL;
3593    if (!param->is_tail_sentinel()) {
3594       access = ((ir_instruction *)param)->as_constant();
3595       assert(access);
3596    }
3597
3598    add_buffer_to_load_and_stores(inst, &buffer, &this->instructions, access);
3599 }
3600
3601 void
3602 glsl_to_tgsi_visitor::visit_membar_intrinsic(ir_call *ir)
3603 {
3604    switch (ir->callee->intrinsic_id) {
3605    case ir_intrinsic_memory_barrier:
3606       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3607                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
3608                                   TGSI_MEMBAR_ATOMIC_BUFFER |
3609                                   TGSI_MEMBAR_SHADER_IMAGE |
3610                                   TGSI_MEMBAR_SHARED));
3611       break;
3612    case ir_intrinsic_memory_barrier_atomic_counter:
3613       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3614                st_src_reg_for_int(TGSI_MEMBAR_ATOMIC_BUFFER));
3615       break;
3616    case ir_intrinsic_memory_barrier_buffer:
3617       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3618                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER));
3619       break;
3620    case ir_intrinsic_memory_barrier_image:
3621       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3622                st_src_reg_for_int(TGSI_MEMBAR_SHADER_IMAGE));
3623       break;
3624    case ir_intrinsic_memory_barrier_shared:
3625       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3626                st_src_reg_for_int(TGSI_MEMBAR_SHARED));
3627       break;
3628    case ir_intrinsic_group_memory_barrier:
3629       emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
3630                st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
3631                                   TGSI_MEMBAR_ATOMIC_BUFFER |
3632                                   TGSI_MEMBAR_SHADER_IMAGE |
3633                                   TGSI_MEMBAR_SHARED |
3634                                   TGSI_MEMBAR_THREAD_GROUP));
3635       break;
3636    default:
3637       assert(!"Unexpected memory barrier intrinsic");
3638    }
3639 }
3640
3641 void
3642 glsl_to_tgsi_visitor::visit_shared_intrinsic(ir_call *ir)
3643 {
3644    exec_node *param = ir->actual_parameters.get_head();
3645
3646    ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
3647
3648    st_src_reg buffer(PROGRAM_MEMORY, 0, GLSL_TYPE_UINT);
3649
3650    /* Calculate the surface offset */
3651    offset->accept(this);
3652    st_src_reg off = this->result;
3653
3654    st_dst_reg dst = undef_dst;
3655    if (ir->return_deref) {
3656       ir->return_deref->accept(this);
3657       dst = st_dst_reg(this->result);
3658       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3659    }
3660
3661    glsl_to_tgsi_instruction *inst;
3662
3663    if (ir->callee->intrinsic_id == ir_intrinsic_shared_load) {
3664       inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
3665       inst->resource = buffer;
3666    } else if (ir->callee->intrinsic_id == ir_intrinsic_shared_store) {
3667       param = param->get_next();
3668       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3669       val->accept(this);
3670
3671       param = param->get_next();
3672       ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
3673       assert(write_mask);
3674       dst.writemask = write_mask->value.u[0];
3675
3676       dst.type = this->result.type;
3677       inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
3678       inst->resource = buffer;
3679    } else {
3680       param = param->get_next();
3681       ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
3682       val->accept(this);
3683
3684       st_src_reg data = this->result, data2 = undef_src;
3685       enum tgsi_opcode opcode;
3686       switch (ir->callee->intrinsic_id) {
3687       case ir_intrinsic_shared_atomic_add:
3688          opcode = TGSI_OPCODE_ATOMUADD;
3689          break;
3690       case ir_intrinsic_shared_atomic_min:
3691          opcode = TGSI_OPCODE_ATOMIMIN;
3692          break;
3693       case ir_intrinsic_shared_atomic_max:
3694          opcode = TGSI_OPCODE_ATOMIMAX;
3695          break;
3696       case ir_intrinsic_shared_atomic_and:
3697          opcode = TGSI_OPCODE_ATOMAND;
3698          break;
3699       case ir_intrinsic_shared_atomic_or:
3700          opcode = TGSI_OPCODE_ATOMOR;
3701          break;
3702       case ir_intrinsic_shared_atomic_xor:
3703          opcode = TGSI_OPCODE_ATOMXOR;
3704          break;
3705       case ir_intrinsic_shared_atomic_exchange:
3706          opcode = TGSI_OPCODE_ATOMXCHG;
3707          break;
3708       case ir_intrinsic_shared_atomic_comp_swap:
3709          opcode = TGSI_OPCODE_ATOMCAS;
3710          param = param->get_next();
3711          val = ((ir_instruction *)param)->as_rvalue();
3712          val->accept(this);
3713          data2 = this->result;
3714          break;
3715       default:
3716          assert(!"Unexpected intrinsic");
3717          return;
3718       }
3719
3720       inst = emit_asm(ir, opcode, dst, off, data, data2);
3721       inst->resource = buffer;
3722    }
3723 }
3724
3725 static void
3726 get_image_qualifiers(ir_dereference *ir, const glsl_type **type,
3727                      bool *memory_coherent, bool *memory_volatile,
3728                      bool *memory_restrict, bool *memory_read_only,
3729                      unsigned *image_format)
3730 {
3731
3732    switch (ir->ir_type) {
3733    case ir_type_dereference_record: {
3734       ir_dereference_record *deref_record = ir->as_dereference_record();
3735       const glsl_type *struct_type = deref_record->record->type;
3736       int fild_idx = deref_record->field_idx;
3737
3738       *type = struct_type->fields.structure[fild_idx].type->without_array();
3739       *memory_coherent =
3740          struct_type->fields.structure[fild_idx].memory_coherent;
3741       *memory_volatile =
3742          struct_type->fields.structure[fild_idx].memory_volatile;
3743       *memory_restrict =
3744          struct_type->fields.structure[fild_idx].memory_restrict;
3745       *memory_read_only =
3746          struct_type->fields.structure[fild_idx].memory_read_only;
3747       *image_format =
3748          struct_type->fields.structure[fild_idx].image_format;
3749       break;
3750    }
3751
3752    case ir_type_dereference_array: {
3753       ir_dereference_array *deref_arr = ir->as_dereference_array();
3754       get_image_qualifiers((ir_dereference *)deref_arr->array, type,
3755                            memory_coherent, memory_volatile, memory_restrict,
3756                            memory_read_only, image_format);
3757       break;
3758    }
3759
3760    case ir_type_dereference_variable: {
3761       ir_variable *var = ir->variable_referenced();
3762
3763       *type = var->type->without_array();
3764       *memory_coherent = var->data.memory_coherent;
3765       *memory_volatile = var->data.memory_volatile;
3766       *memory_restrict = var->data.memory_restrict;
3767       *memory_read_only = var->data.memory_read_only;
3768       *image_format = var->data.image_format;
3769       break;
3770    }
3771
3772    default:
3773       break;
3774    }
3775 }
3776
3777 void
3778 glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
3779 {
3780    exec_node *param = ir->actual_parameters.get_head();
3781
3782    ir_dereference *img = (ir_dereference *)param;
3783    const ir_variable *imgvar = img->variable_referenced();
3784    unsigned sampler_array_size = 1, sampler_base = 0;
3785    bool memory_coherent = false, memory_volatile = false,
3786         memory_restrict = false, memory_read_only = false;
3787    unsigned image_format = 0;
3788    const glsl_type *type = NULL;
3789
3790    get_image_qualifiers(img, &type, &memory_coherent, &memory_volatile,
3791                         &memory_restrict, &memory_read_only, &image_format);
3792
3793    st_src_reg reladdr;
3794    st_src_reg image(PROGRAM_IMAGE, 0, GLSL_TYPE_UINT);
3795    uint16_t index = 0;
3796    get_deref_offsets(img, &sampler_array_size, &sampler_base,
3797                      &index, &reladdr, !imgvar->contains_bindless());
3798
3799    image.index = index;
3800    if (reladdr.file != PROGRAM_UNDEFINED) {
3801       image.reladdr = ralloc(mem_ctx, st_src_reg);
3802       *image.reladdr = reladdr;
3803       emit_arl(ir, sampler_reladdr, reladdr);
3804    }
3805
3806    st_dst_reg dst = undef_dst;
3807    if (ir->return_deref) {
3808       ir->return_deref->accept(this);
3809       dst = st_dst_reg(this->result);
3810       dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
3811    }
3812
3813    glsl_to_tgsi_instruction *inst;
3814
3815    st_src_reg bindless;
3816    if (imgvar->contains_bindless()) {
3817       img->accept(this);
3818       bindless = this->result;
3819    }
3820
3821    if (ir->callee->intrinsic_id == ir_intrinsic_image_size) {
3822       dst.writemask = WRITEMASK_XYZ;
3823       inst = emit_asm(ir, TGSI_OPCODE_RESQ, dst);
3824    } else if (ir->callee->intrinsic_id == ir_intrinsic_image_samples) {
3825       st_src_reg res = get_temp(glsl_type::ivec4_type);
3826       st_dst_reg dstres = st_dst_reg(res);
3827       dstres.writemask = WRITEMASK_W;
3828       inst = emit_asm(ir, TGSI_OPCODE_RESQ, dstres);
3829       res.swizzle = SWIZZLE_WWWW;
3830       emit_asm(ir, TGSI_OPCODE_MOV, dst, res);
3831    } else {
3832       st_src_reg arg1 = undef_src, arg2 = undef_src;
3833       st_src_reg coord;
3834       st_dst_reg coord_dst;
3835       coord = get_temp(glsl_type::ivec4_type);
3836       coord_dst = st_dst_reg(coord);
3837       coord_dst.writemask = (1 << type->coordinate_components()) - 1;
3838       param = param->get_next();
3839       ((ir_dereference *)param)->accept(this);
3840       emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
3841       coord.swizzle = SWIZZLE_XXXX;
3842       switch (type->coordinate_components()) {
3843       case 4: assert(!"unexpected coord count");
3844       /* fallthrough */
3845       case 3: coord.swizzle |= SWIZZLE_Z << 6;
3846       /* fallthrough */
3847       case 2: coord.swizzle |= SWIZZLE_Y << 3;
3848       }
3849
3850       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS) {
3851          param = param->get_next();
3852          ((ir_dereference *)param)->accept(this);
3853          st_src_reg sample = this->result;
3854          sample.swizzle = SWIZZLE_XXXX;
3855          coord_dst.writemask = WRITEMASK_W;
3856          emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample);
3857          coord.swizzle |= SWIZZLE_W << 9;
3858       }
3859
3860       param = param->get_next();
3861       if (!param->is_tail_sentinel()) {
3862          ((ir_dereference *)param)->accept(this);
3863          arg1 = this->result;
3864          param = param->get_next();
3865       }
3866
3867       if (!param->is_tail_sentinel()) {
3868          ((ir_dereference *)param)->accept(this);
3869          arg2 = this->result;
3870          param = param->get_next();
3871       }
3872
3873       assert(param->is_tail_sentinel());
3874
3875       enum tgsi_opcode opcode;
3876       switch (ir->callee->intrinsic_id) {
3877       case ir_intrinsic_image_load:
3878          opcode = TGSI_OPCODE_LOAD;
3879          break;
3880       case ir_intrinsic_image_store:
3881          opcode = TGSI_OPCODE_STORE;
3882          break;
3883       case ir_intrinsic_image_atomic_add:
3884          opcode = TGSI_OPCODE_ATOMUADD;
3885          break;
3886       case ir_intrinsic_image_atomic_min:
3887          opcode = TGSI_OPCODE_ATOMIMIN;
3888          break;
3889       case ir_intrinsic_image_atomic_max:
3890          opcode = TGSI_OPCODE_ATOMIMAX;
3891          break;
3892       case ir_intrinsic_image_atomic_and:
3893          opcode = TGSI_OPCODE_ATOMAND;
3894          break;
3895       case ir_intrinsic_image_atomic_or:
3896          opcode = TGSI_OPCODE_ATOMOR;
3897          break;
3898       case ir_intrinsic_image_atomic_xor:
3899          opcode = TGSI_OPCODE_ATOMXOR;
3900          break;
3901       case ir_intrinsic_image_atomic_exchange:
3902          opcode = TGSI_OPCODE_ATOMXCHG;
3903          break;
3904       case ir_intrinsic_image_atomic_comp_swap:
3905          opcode = TGSI_OPCODE_ATOMCAS;
3906          break;
3907       default:
3908          assert(!"Unexpected intrinsic");
3909          return;
3910       }
3911
3912       inst = emit_asm(ir, opcode, dst, coord, arg1, arg2);
3913       if (opcode == TGSI_OPCODE_STORE)
3914          inst->dst[0].writemask = WRITEMASK_XYZW;
3915    }
3916
3917    if (imgvar->contains_bindless()) {
3918       inst->resource = bindless;
3919       inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
3920                                              SWIZZLE_X, SWIZZLE_Y);
3921    } else {
3922       inst->resource = image;
3923       inst->sampler_array_size = sampler_array_size;
3924       inst->sampler_base = sampler_base;
3925    }
3926
3927    inst->tex_target = type->sampler_index();
3928    inst->image_format = st_mesa_format_to_pipe_format(st_context(ctx),
3929          _mesa_get_shader_image_format(image_format));
3930    inst->read_only = memory_read_only;
3931
3932    if (memory_coherent)
3933       inst->buffer_access |= TGSI_MEMORY_COHERENT;
3934    if (memory_restrict)
3935       inst->buffer_access |= TGSI_MEMORY_RESTRICT;
3936    if (memory_volatile)
3937       inst->buffer_access |= TGSI_MEMORY_VOLATILE;
3938 }
3939
3940 void
3941 glsl_to_tgsi_visitor::visit_generic_intrinsic(ir_call *ir, enum tgsi_opcode op)
3942 {
3943    ir->return_deref->accept(this);
3944    st_dst_reg dst = st_dst_reg(this->result);
3945
3946    dst.writemask = u_bit_consecutive(0, ir->return_deref->var->type->vector_elements);
3947
3948    st_src_reg src[4] = { undef_src, undef_src, undef_src, undef_src };
3949    unsigned num_src = 0;
3950    foreach_in_list(ir_rvalue, param, &ir->actual_parameters) {
3951       assert(num_src < ARRAY_SIZE(src));
3952
3953       this->result.file = PROGRAM_UNDEFINED;
3954       param->accept(this);
3955       assert(this->result.file != PROGRAM_UNDEFINED);
3956
3957       src[num_src] = this->result;
3958       num_src++;
3959    }
3960
3961    emit_asm(ir, op, dst, src[0], src[1], src[2], src[3]);
3962 }
3963
3964 void
3965 glsl_to_tgsi_visitor::visit(ir_call *ir)
3966 {
3967    ir_function_signature *sig = ir->callee;
3968
3969    /* Filter out intrinsics */
3970    switch (sig->intrinsic_id) {
3971    case ir_intrinsic_atomic_counter_read:
3972    case ir_intrinsic_atomic_counter_increment:
3973    case ir_intrinsic_atomic_counter_predecrement:
3974    case ir_intrinsic_atomic_counter_add:
3975    case ir_intrinsic_atomic_counter_min:
3976    case ir_intrinsic_atomic_counter_max:
3977    case ir_intrinsic_atomic_counter_and:
3978    case ir_intrinsic_atomic_counter_or:
3979    case ir_intrinsic_atomic_counter_xor:
3980    case ir_intrinsic_atomic_counter_exchange:
3981    case ir_intrinsic_atomic_counter_comp_swap:
3982       visit_atomic_counter_intrinsic(ir);
3983       return;
3984
3985    case ir_intrinsic_ssbo_load:
3986    case ir_intrinsic_ssbo_store:
3987    case ir_intrinsic_ssbo_atomic_add:
3988    case ir_intrinsic_ssbo_atomic_min:
3989    case ir_intrinsic_ssbo_atomic_max:
3990    case ir_intrinsic_ssbo_atomic_and:
3991    case ir_intrinsic_ssbo_atomic_or:
3992    case ir_intrinsic_ssbo_atomic_xor:
3993    case ir_intrinsic_ssbo_atomic_exchange:
3994    case ir_intrinsic_ssbo_atomic_comp_swap:
3995       visit_ssbo_intrinsic(ir);
3996       return;
3997
3998    case ir_intrinsic_memory_barrier:
3999    case ir_intrinsic_memory_barrier_atomic_counter:
4000    case ir_intrinsic_memory_barrier_buffer:
4001    case ir_intrinsic_memory_barrier_image:
4002    case ir_intrinsic_memory_barrier_shared:
4003    case ir_intrinsic_group_memory_barrier:
4004       visit_membar_intrinsic(ir);
4005       return;
4006
4007    case ir_intrinsic_shared_load:
4008    case ir_intrinsic_shared_store:
4009    case ir_intrinsic_shared_atomic_add:
4010    case ir_intrinsic_shared_atomic_min:
4011    case ir_intrinsic_shared_atomic_max:
4012    case ir_intrinsic_shared_atomic_and:
4013    case ir_intrinsic_shared_atomic_or:
4014    case ir_intrinsic_shared_atomic_xor:
4015    case ir_intrinsic_shared_atomic_exchange:
4016    case ir_intrinsic_shared_atomic_comp_swap:
4017       visit_shared_intrinsic(ir);
4018       return;
4019
4020    case ir_intrinsic_image_load:
4021    case ir_intrinsic_image_store:
4022    case ir_intrinsic_image_atomic_add:
4023    case ir_intrinsic_image_atomic_min:
4024    case ir_intrinsic_image_atomic_max:
4025    case ir_intrinsic_image_atomic_and:
4026    case ir_intrinsic_image_atomic_or:
4027    case ir_intrinsic_image_atomic_xor:
4028    case ir_intrinsic_image_atomic_exchange:
4029    case ir_intrinsic_image_atomic_comp_swap:
4030    case ir_intrinsic_image_size:
4031    case ir_intrinsic_image_samples:
4032       visit_image_intrinsic(ir);
4033       return;
4034
4035    case ir_intrinsic_shader_clock:
4036       visit_generic_intrinsic(ir, TGSI_OPCODE_CLOCK);
4037       return;
4038
4039    case ir_intrinsic_vote_all:
4040       visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ALL);
4041       return;
4042    case ir_intrinsic_vote_any:
4043       visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ANY);
4044       return;
4045    case ir_intrinsic_vote_eq:
4046       visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_EQ);
4047       return;
4048    case ir_intrinsic_ballot:
4049       visit_generic_intrinsic(ir, TGSI_OPCODE_BALLOT);
4050       return;
4051    case ir_intrinsic_read_first_invocation:
4052       visit_generic_intrinsic(ir, TGSI_OPCODE_READ_FIRST);
4053       return;
4054    case ir_intrinsic_read_invocation:
4055       visit_generic_intrinsic(ir, TGSI_OPCODE_READ_INVOC);
4056       return;
4057
4058    case ir_intrinsic_invalid:
4059    case ir_intrinsic_generic_load:
4060    case ir_intrinsic_generic_store:
4061    case ir_intrinsic_generic_atomic_add:
4062    case ir_intrinsic_generic_atomic_and:
4063    case ir_intrinsic_generic_atomic_or:
4064    case ir_intrinsic_generic_atomic_xor:
4065    case ir_intrinsic_generic_atomic_min:
4066    case ir_intrinsic_generic_atomic_max:
4067    case ir_intrinsic_generic_atomic_exchange:
4068    case ir_intrinsic_generic_atomic_comp_swap:
4069    case ir_intrinsic_begin_invocation_interlock:
4070    case ir_intrinsic_end_invocation_interlock:
4071       unreachable("Invalid intrinsic");
4072    }
4073 }
4074
4075 void
4076 glsl_to_tgsi_visitor::calc_deref_offsets(ir_dereference *tail,
4077                                          unsigned *array_elements,
4078                                          uint16_t *index,
4079                                          st_src_reg *indirect,
4080                                          unsigned *location)
4081 {
4082    switch (tail->ir_type) {
4083    case ir_type_dereference_record: {
4084       ir_dereference_record *deref_record = tail->as_dereference_record();
4085       const glsl_type *struct_type = deref_record->record->type;
4086       int field_index = deref_record->field_idx;
4087
4088       calc_deref_offsets(deref_record->record->as_dereference(), array_elements, index, indirect, location);
4089
4090       assert(field_index >= 0);
4091       *location += struct_type->record_location_offset(field_index);
4092       break;
4093    }
4094
4095    case ir_type_dereference_array: {
4096       ir_dereference_array *deref_arr = tail->as_dereference_array();
4097
4098       void *mem_ctx = ralloc_parent(deref_arr);
4099       ir_constant *array_index =
4100          deref_arr->array_index->constant_expression_value(mem_ctx);
4101
4102       if (!array_index) {
4103          st_src_reg temp_reg;
4104          st_dst_reg temp_dst;
4105
4106          temp_reg = get_temp(glsl_type::uint_type);
4107          temp_dst = st_dst_reg(temp_reg);
4108          temp_dst.writemask = 1;
4109
4110          deref_arr->array_index->accept(this);
4111          if (*array_elements != 1)
4112             emit_asm(NULL, TGSI_OPCODE_MUL, temp_dst, this->result, st_src_reg_for_int(*array_elements));
4113          else
4114             emit_asm(NULL, TGSI_OPCODE_MOV, temp_dst, this->result);
4115
4116          if (indirect->file == PROGRAM_UNDEFINED)
4117             *indirect = temp_reg;
4118          else {
4119             temp_dst = st_dst_reg(*indirect);
4120             temp_dst.writemask = 1;
4121             emit_asm(NULL, TGSI_OPCODE_ADD, temp_dst, *indirect, temp_reg);
4122          }
4123       } else
4124          *index += array_index->value.u[0] * *array_elements;
4125
4126       *array_elements *= deref_arr->array->type->length;
4127
4128       calc_deref_offsets(deref_arr->array->as_dereference(), array_elements, index, indirect, location);
4129       break;
4130    }
4131    default:
4132       break;
4133    }
4134 }
4135
4136 void
4137 glsl_to_tgsi_visitor::get_deref_offsets(ir_dereference *ir,
4138                                         unsigned *array_size,
4139                                         unsigned *base,
4140                                         uint16_t *index,
4141                                         st_src_reg *reladdr,
4142                                         bool opaque)
4143 {
4144    GLuint shader = _mesa_program_enum_to_shader_stage(this->prog->Target);
4145    unsigned location = 0;
4146    ir_variable *var = ir->variable_referenced();
4147
4148    memset(reladdr, 0, sizeof(*reladdr));
4149    reladdr->file = PROGRAM_UNDEFINED;
4150
4151    *base = 0;
4152    *array_size = 1;
4153
4154    assert(var);
4155    location = var->data.location;
4156    calc_deref_offsets(ir, array_size, index, reladdr, &location);
4157
4158    /*
4159     * If we end up with no indirect then adjust the base to the index,
4160     * and set the array size to 1.
4161     */
4162    if (reladdr->file == PROGRAM_UNDEFINED) {
4163       *base = *index;
4164       *array_size = 1;
4165    }
4166
4167    if (opaque) {
4168       assert(location != 0xffffffff);
4169       *base += this->shader_program->data->UniformStorage[location].opaque[shader].index;
4170       *index += this->shader_program->data->UniformStorage[location].opaque[shader].index;
4171    }
4172 }
4173
4174 st_src_reg
4175 glsl_to_tgsi_visitor::canonicalize_gather_offset(st_src_reg offset)
4176 {
4177    if (offset.reladdr || offset.reladdr2 ||
4178        offset.has_index2 ||
4179        offset.file == PROGRAM_UNIFORM ||
4180        offset.file == PROGRAM_CONSTANT ||
4181        offset.file == PROGRAM_STATE_VAR) {
4182       st_src_reg tmp = get_temp(glsl_type::ivec2_type);
4183       st_dst_reg tmp_dst = st_dst_reg(tmp);
4184       tmp_dst.writemask = WRITEMASK_XY;
4185       emit_asm(NULL, TGSI_OPCODE_MOV, tmp_dst, offset);
4186       return tmp;
4187    }
4188
4189    return offset;
4190 }
4191
4192 bool
4193 glsl_to_tgsi_visitor::handle_bound_deref(ir_dereference *ir)
4194 {
4195    ir_variable *var = ir->variable_referenced();
4196
4197    if (!var || var->data.mode != ir_var_uniform || var->data.bindless ||
4198        !(ir->type->is_image() || ir->type->is_sampler()))
4199       return false;
4200
4201    /* Convert from bound sampler/image to bindless handle. */
4202    bool is_image = ir->type->is_image();
4203    st_src_reg resource(is_image ? PROGRAM_IMAGE : PROGRAM_SAMPLER, 0, GLSL_TYPE_UINT);
4204    uint16_t index = 0;
4205    unsigned array_size = 1, base = 0;
4206    st_src_reg reladdr;
4207    get_deref_offsets(ir, &array_size, &base, &index, &reladdr, true);
4208
4209    resource.index = index;
4210    if (reladdr.file != PROGRAM_UNDEFINED) {
4211       resource.reladdr = ralloc(mem_ctx, st_src_reg);
4212       *resource.reladdr = reladdr;
4213       emit_arl(ir, sampler_reladdr, reladdr);
4214    }
4215
4216    this->result = get_temp(glsl_type::uvec2_type);
4217    st_dst_reg dst(this->result);
4218    dst.writemask = WRITEMASK_XY;
4219
4220    glsl_to_tgsi_instruction *inst = emit_asm(
4221       ir, is_image ? TGSI_OPCODE_IMG2HND : TGSI_OPCODE_SAMP2HND, dst);
4222
4223    inst->tex_target = ir->type->sampler_index();
4224    inst->resource = resource;
4225    inst->sampler_array_size = array_size;
4226    inst->sampler_base = base;
4227
4228    return true;
4229 }
4230
/**
 * Translate an ir_texture node into a TGSI texture instruction.
 *
 * The operands the operation needs (coordinate, projector, shadow
 * comparator, LOD / bias / gradients, offsets, sample index, gather
 * component) are visited and packed into temporaries, the TGSI opcode is
 * chosen from ir->op, and the emitted instruction is tagged with its sampler
 * (or bindless handle), texture target and return type.  The temporary that
 * receives the result is left in this->result.
 */
void
glsl_to_tgsi_visitor::visit(ir_texture *ir)
{
   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy;
   st_src_reg offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
   st_src_reg levels_src, reladdr;
   st_dst_reg result_dst, coord_dst, cube_sc_dst;
   glsl_to_tgsi_instruction *inst = NULL;
   enum tgsi_opcode opcode = TGSI_OPCODE_NOP;
   const glsl_type *sampler_type = ir->sampler->type;
   unsigned sampler_array_size = 1, sampler_base = 0;
   bool is_cube_array = false, is_cube_shadow = false;
   ir_variable *var = ir->sampler->variable_referenced();
   unsigned i;

   /* if we are a cube array sampler or a cube shadow */
   if (sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
      is_cube_array = sampler_type->sampler_array;
      is_cube_shadow = sampler_type->sampler_shadow;
   }

   if (ir->coordinate) {
      ir->coordinate->accept(this);

      /* Put our coords in a temp.  We'll need to modify them for shadow,
       * projection, or LOD, so the only case we'd use it as-is is if
       * we're doing plain old texturing.  The optimization passes on
       * glsl_to_tgsi_visitor should handle cleaning up our mess in that case.
       */
      coord = get_temp(glsl_type::vec4_type);
      coord_dst = st_dst_reg(coord);
      coord_dst.writemask = (1 << ir->coordinate->type->vector_elements) - 1;
      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
   }

   if (ir->projector) {
      ir->projector->accept(this);
      projector = this->result;
   }

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = get_temp(ir->type);
   result_dst = st_dst_reg(result_src);
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   /* Pick the TGSI opcode and evaluate the operands specific to each
    * texture operation.
    */
   switch (ir->op) {
   case ir_tex:
      opcode = (is_cube_array && ir->shadow_comparator) ? TGSI_OPCODE_TEX2 : TGSI_OPCODE_TEX;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txb:
      if (is_cube_array || is_cube_shadow) {
         opcode = TGSI_OPCODE_TXB2;
      }
      else {
         opcode = TGSI_OPCODE_TXB;
      }
      ir->lod_info.bias->accept(this);
      lod_info = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txl:
      /* A zero LOD uses the dedicated TEX_LZ opcode when has_tex_txf_lz is
       * set; the LOD operand is then not evaluated at all.
       */
      if (this->has_tex_txf_lz && ir->lod_info.lod->is_zero()) {
         opcode = TGSI_OPCODE_TEX_LZ;
      } else {
         opcode = is_cube_array ? TGSI_OPCODE_TXL2 : TGSI_OPCODE_TXL;
         ir->lod_info.lod->accept(this);
         lod_info = this->result;
      }
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txd:
      opcode = TGSI_OPCODE_TXD;
      ir->lod_info.grad.dPdx->accept(this);
      dx = this->result;
      ir->lod_info.grad.dPdy->accept(this);
      dy = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txs:
      opcode = TGSI_OPCODE_TXQ;
      ir->lod_info.lod->accept(this);
      lod_info = this->result;
      break;
   case ir_query_levels:
      /* TXQ is emitted into a scratch temporary (levels_src); the level
       * count is copied out of its W channel when the instruction is
       * emitted below.
       */
      opcode = TGSI_OPCODE_TXQ;
      lod_info = undef_src;
      levels_src = get_temp(ir->type);
      break;
   case ir_txf:
      /* Same zero-LOD shortcut as ir_txl, using TXF_LZ. */
      if (this->has_tex_txf_lz && ir->lod_info.lod->is_zero()) {
         opcode = TGSI_OPCODE_TXF_LZ;
      } else {
         opcode = TGSI_OPCODE_TXF;
         ir->lod_info.lod->accept(this);
         lod_info = this->result;
      }
      if (ir->offset) {
         ir->offset->accept(this);
         offset[0] = this->result;
      }
      break;
   case ir_txf_ms:
      opcode = TGSI_OPCODE_TXF;
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      break;
   case ir_tg4:
      opcode = TGSI_OPCODE_TG4;
      ir->lod_info.component->accept(this);
      component = this->result;
      if (ir->offset) {
         ir->offset->accept(this);
         if (ir->offset->type->is_array()) {
            /* An array of offsets yields one register per element, each one
             * type_size(elt_type) slots after the previous.
             */
            const glsl_type *elt_type = ir->offset->type->fields.array;
            for (i = 0; i < ir->offset->type->length; i++) {
               offset[i] = this->result;
               offset[i].index += i * type_size(elt_type);
               offset[i].type = elt_type->base_type;
               offset[i].swizzle = swizzle_for_size(elt_type->vector_elements);
               offset[i] = canonicalize_gather_offset(offset[i]);
            }
         } else {
            offset[0] = canonicalize_gather_offset(this->result);
         }
      }
      break;
   case ir_lod:
      opcode = TGSI_OPCODE_LODQ;
      break;
   case ir_texture_samples:
      opcode = TGSI_OPCODE_TXQS;
      break;
   case ir_samples_identical:
      unreachable("Unexpected ir_samples_identical opcode");
   }

   if (ir->projector) {
      if (opcode == TGSI_OPCODE_TEX) {
         /* Slot the projector in as the last component of the coord. */
         coord_dst.writemask = WRITEMASK_W;
         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, projector);
         coord_dst.writemask = WRITEMASK_XYZW;
         opcode = TGSI_OPCODE_TXP;
      } else {
         st_src_reg coord_w = coord;
         coord_w.swizzle = SWIZZLE_WWWW;

         /* For the other TEX opcodes there's no projective version
          * since the last slot is taken up by LOD info.  Do the
          * projective divide now.
          */
         coord_dst.writemask = WRITEMASK_W;
         emit_asm(ir, TGSI_OPCODE_RCP, coord_dst, projector);

         /* In the case where we have to project the coordinates "by hand,"
          * the shadow comparator value must also be projected.
          */
         st_src_reg tmp_src = coord;
         if (ir->shadow_comparator) {
            /* Slot the shadow value in as the second to last component of the
             * coord.
             */
            ir->shadow_comparator->accept(this);

            tmp_src = get_temp(glsl_type::vec4_type);
            st_dst_reg tmp_dst = st_dst_reg(tmp_src);

            /* Projective division not allowed for array samplers. */
            assert(!sampler_type->sampler_array);

            tmp_dst.writemask = WRITEMASK_Z;
            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);

            tmp_dst.writemask = WRITEMASK_XY;
            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
         }

         /* coord.xyz = tmp_src.xyz * (1 / projector), using the reciprocal
          * stored in coord.w above.
          */
         coord_dst.writemask = WRITEMASK_XYZ;
         emit_asm(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);

         coord_dst.writemask = WRITEMASK_XYZW;
         coord.swizzle = SWIZZLE_XYZW;
      }
   }

   /* If projection is done and the opcode is not TGSI_OPCODE_TXP, then the
    * shadow comparator was put in the correct place (and projected) by the
    * code, above, that handles by-hand projection.
    */
   if (ir->shadow_comparator && (!ir->projector || opcode == TGSI_OPCODE_TXP)) {
      /* Slot the shadow value in as the second to last component of the
       * coord.
       */
      ir->shadow_comparator->accept(this);

      if (is_cube_array) {
         /* Cube arrays pass the comparator in its own register (cube_sc),
          * which becomes the second source of the instruction below.
          */
         cube_sc = get_temp(glsl_type::float_type);
         cube_sc_dst = st_dst_reg(cube_sc);
         cube_sc_dst.writemask = WRITEMASK_X;
         emit_asm(ir, TGSI_OPCODE_MOV, cube_sc_dst, this->result);
         /* NOTE(review): this repeats the writemask already set above and
          * looks redundant.
          */
         cube_sc_dst.writemask = WRITEMASK_X;
      }
      else {
         /* 2D array and cube samplers take the comparator in W, everything
          * else in Z.
          */
         if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_2D &&
              sampler_type->sampler_array) ||
             sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
            coord_dst.writemask = WRITEMASK_W;
         } else {
            coord_dst.writemask = WRITEMASK_Z;
         }
         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
         coord_dst.writemask = WRITEMASK_XYZW;
      }
   }

   if (ir->op == ir_txf_ms) {
      /* The sample index travels in the W channel of the coordinate. */
      coord_dst.writemask = WRITEMASK_W;
      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample_index);
      coord_dst.writemask = WRITEMASK_XYZW;
   } else if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB ||
       opcode == TGSI_OPCODE_TXF) {
      /* TGSI stores LOD or LOD bias in the last channel of the coords. */
      coord_dst.writemask = WRITEMASK_W;
      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
      coord_dst.writemask = WRITEMASK_XYZW;
   }

   st_src_reg sampler(PROGRAM_SAMPLER, 0, GLSL_TYPE_UINT);

   /* Resolve the sampler slot and any indirect offset.  The opaque
    * uniform-storage adjustment is requested only for samplers that do not
    * contain bindless resources.
    */
   uint16_t index = 0;
   get_deref_offsets(ir->sampler, &sampler_array_size, &sampler_base,
                     &index, &reladdr, !var->contains_bindless());

   sampler.index = index;
   if (reladdr.file != PROGRAM_UNDEFINED) {
      sampler.reladdr = ralloc(mem_ctx, st_src_reg);
      *sampler.reladdr = reladdr;
      emit_arl(ir, sampler_reladdr, reladdr);
   }

   /* For bindless samplers the visited sampler dereference itself is used
    * as the instruction's resource (see below).
    */
   st_src_reg bindless;
   if (var->contains_bindless()) {
      ir->sampler->accept(this);
      bindless = this->result;
   }

   /* Emit the texture instruction with the source operands that each
    * opcode form takes.
    */
   if (opcode == TGSI_OPCODE_TXD)
      inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
   else if (opcode == TGSI_OPCODE_TXQ) {
      if (ir->op == ir_query_levels) {
         /* the level is stored in W */
         inst = emit_asm(ir, opcode, st_dst_reg(levels_src), lod_info);
         result_dst.writemask = WRITEMASK_X;
         levels_src.swizzle = SWIZZLE_WWWW;
         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, levels_src);
      } else
         inst = emit_asm(ir, opcode, result_dst, lod_info);
   } else if (opcode == TGSI_OPCODE_TXQS) {
      inst = emit_asm(ir, opcode, result_dst);
   } else if (opcode == TGSI_OPCODE_TXL2 || opcode == TGSI_OPCODE_TXB2) {
      inst = emit_asm(ir, opcode, result_dst, coord, lod_info);
   } else if (opcode == TGSI_OPCODE_TEX2) {
      inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
   } else if (opcode == TGSI_OPCODE_TG4) {
      if (is_cube_array && ir->shadow_comparator) {
         inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
      } else {
         inst = emit_asm(ir, opcode, result_dst, coord, component);
      }
   } else
      inst = emit_asm(ir, opcode, result_dst, coord);

   if (ir->shadow_comparator)
      inst->tex_shadow = GL_TRUE;

   if (var->contains_bindless()) {
      /* The resource is read with an XYXY swizzle of the visited sampler
       * value.
       */
      inst->resource = bindless;
      inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
                                             SWIZZLE_X, SWIZZLE_Y);
   } else {
      inst->resource = sampler;
      inst->sampler_array_size = sampler_array_size;
      inst->sampler_base = sampler_base;
   }

   if (ir->offset) {
      if (!inst->tex_offsets)
         inst->tex_offsets = rzalloc_array(inst, st_src_reg,
                                           MAX_GLSL_TEXTURE_OFFSET);

      /* Copy the offsets up to the first entry that was never assigned. */
      for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET &&
                  offset[i].file != PROGRAM_UNDEFINED; i++)
         inst->tex_offsets[i] = offset[i];
      inst->tex_offset_num_offset = i;
   }

   inst->tex_target = sampler_type->sampler_index();
   inst->tex_type = ir->type->base_type;

   this->result = result_src;
}
4547
4548 void
4549 glsl_to_tgsi_visitor::visit(ir_return *ir)
4550 {
4551    assert(!ir->get_value());
4552
4553    emit_asm(ir, TGSI_OPCODE_RET);
4554 }
4555
4556 void
4557 glsl_to_tgsi_visitor::visit(ir_discard *ir)
4558 {
4559    if (ir->condition) {
4560       ir->condition->accept(this);
4561       st_src_reg condition = this->result;
4562
4563       /* Convert the bool condition to a float so we can negate. */
4564       if (native_integers) {
4565          st_src_reg temp = get_temp(ir->condition->type);
4566          emit_asm(ir, TGSI_OPCODE_AND, st_dst_reg(temp),
4567               condition, st_src_reg_for_float(1.0));
4568          condition = temp;
4569       }
4570
4571       condition.negate = ~condition.negate;
4572       emit_asm(ir, TGSI_OPCODE_KILL_IF, undef_dst, condition);
4573    } else {
4574       /* unconditional kil */
4575       emit_asm(ir, TGSI_OPCODE_KILL);
4576    }
4577 }
4578
4579 void
4580 glsl_to_tgsi_visitor::visit(ir_if *ir)
4581 {
4582    enum tgsi_opcode if_opcode;
4583    glsl_to_tgsi_instruction *if_inst;
4584
4585    ir->condition->accept(this);
4586    assert(this->result.file != PROGRAM_UNDEFINED);
4587
4588    if_opcode = native_integers ? TGSI_OPCODE_UIF : TGSI_OPCODE_IF;
4589
4590    if_inst = emit_asm(ir->condition, if_opcode, undef_dst, this->result);
4591
4592    this->instructions.push_tail(if_inst);
4593
4594    visit_exec_list(&ir->then_instructions, this);
4595
4596    if (!ir->else_instructions.is_empty()) {
4597       emit_asm(ir->condition, TGSI_OPCODE_ELSE);
4598       visit_exec_list(&ir->else_instructions, this);
4599    }
4600
4601    if_inst = emit_asm(ir->condition, TGSI_OPCODE_ENDIF);
4602 }
4603
4604
4605 void
4606 glsl_to_tgsi_visitor::visit(ir_emit_vertex *ir)
4607 {
4608    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
4609
4610    ir->stream->accept(this);
4611    emit_asm(ir, TGSI_OPCODE_EMIT, undef_dst, this->result);
4612 }
4613
4614 void
4615 glsl_to_tgsi_visitor::visit(ir_end_primitive *ir)
4616 {
4617    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
4618
4619    ir->stream->accept(this);
4620    emit_asm(ir, TGSI_OPCODE_ENDPRIM, undef_dst, this->result);
4621 }
4622
4623 void
4624 glsl_to_tgsi_visitor::visit(ir_barrier *ir)
4625 {
4626    assert(this->prog->Target == GL_TESS_CONTROL_PROGRAM_NV ||
4627           this->prog->Target == GL_COMPUTE_PROGRAM_NV);
4628
4629    emit_asm(ir, TGSI_OPCODE_BARRIER);
4630 }
4631
4632 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
4633 {
4634    STATIC_ASSERT(sizeof(samplers_used) * 8 >= PIPE_MAX_SAMPLERS);
4635
4636    result.file = PROGRAM_UNDEFINED;
4637    next_temp = 1;
4638    array_sizes = NULL;
4639    max_num_arrays = 0;
4640    next_array = 0;
4641    num_inputs = 0;
4642    num_outputs = 0;
4643    num_input_arrays = 0;
4644    num_output_arrays = 0;
4645    num_atomics = 0;
4646    num_atomic_arrays = 0;
4647    num_immediates = 0;
4648    num_address_regs = 0;
4649    samplers_used = 0;
4650    images_used = 0;
4651    indirect_addr_consts = false;
4652    wpos_transform_const = -1;
4653    native_integers = false;
4654    mem_ctx = ralloc_context(NULL);
4655    ctx = NULL;
4656    prog = NULL;
4657    precise = 0;
4658    shader_program = NULL;
4659    shader = NULL;
4660    options = NULL;
4661    have_sqrt = false;
4662    have_fma = false;
4663    use_shared_memory = false;
4664    has_tex_txf_lz = false;
4665    variables = NULL;
4666 }
4667
4668 static void var_destroy(struct hash_entry *entry)
4669 {
4670    variable_storage *storage = (variable_storage *)entry->data;
4671
4672    delete storage;
4673 }
4674
4675 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
4676 {
4677    _mesa_hash_table_destroy(variables, var_destroy);
4678    free(array_sizes);
4679    ralloc_free(mem_ctx);
4680 }
4681
4682 extern "C" void free_glsl_to_tgsi_visitor(glsl_to_tgsi_visitor *v)
4683 {
4684    delete v;
4685 }
4686
4687
4688 /**
4689  * Count resources used by the given gpu program (number of texture
4690  * samplers, etc).
4691  */
4692 static void
4693 count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
4694 {
4695    v->samplers_used = 0;
4696    v->images_used = 0;
4697    prog->info.textures_used_by_txf = 0;
4698
4699    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
4700       if (inst->info->is_tex) {
4701          for (int i = 0; i < inst->sampler_array_size; i++) {
4702             unsigned idx = inst->sampler_base + i;
4703             v->samplers_used |= 1u << idx;
4704
4705             debug_assert(idx < (int)ARRAY_SIZE(v->sampler_types));
4706             v->sampler_types[idx] = inst->tex_type;
4707             v->sampler_targets[idx] =
4708                st_translate_texture_target(inst->tex_target, inst->tex_shadow);
4709
4710             if (inst->op == TGSI_OPCODE_TXF || inst->op == TGSI_OPCODE_TXF_LZ) {
4711                prog->info.textures_used_by_txf |= 1u << idx;
4712             }
4713          }
4714       }
4715
4716       if (inst->tex_target == TEXTURE_EXTERNAL_INDEX)
4717          prog->ExternalSamplersUsed |= 1 << inst->resource.index;
4718
4719       if (inst->resource.file != PROGRAM_UNDEFINED && (
4720                 is_resource_instruction(inst->op) ||
4721                 inst->op == TGSI_OPCODE_STORE)) {
4722          if (inst->resource.file == PROGRAM_MEMORY) {
4723             v->use_shared_memory = true;
4724          } else if (inst->resource.file == PROGRAM_IMAGE) {
4725             for (int i = 0; i < inst->sampler_array_size; i++) {
4726                unsigned idx = inst->sampler_base + i;
4727                v->images_used |= 1 << idx;
4728                v->image_targets[idx] =
4729                   st_translate_texture_target(inst->tex_target, false);
4730                v->image_formats[idx] = inst->image_format;
4731                v->image_wr[idx] = !inst->read_only;
4732             }
4733          }
4734       }
4735    }
4736    prog->SamplersUsed = v->samplers_used;
4737
4738    if (v->shader_program != NULL)
4739       _mesa_update_shader_textures_used(v->shader_program, prog);
4740 }
4741
4742 /**
4743  * Returns the mask of channels (bitmask of WRITEMASK_X,Y,Z,W) which
4744  * are read from the given src in this instruction
4745  */
4746 static int
4747 get_src_arg_mask(st_dst_reg dst, st_src_reg src)
4748 {
4749    int read_mask = 0, comp;
4750
4751    /* Now, given the src swizzle and the written channels, find which
4752     * components are actually read
4753     */
4754    for (comp = 0; comp < 4; ++comp) {
4755       const unsigned coord = GET_SWZ(src.swizzle, comp);
4756       assert(coord < 4);
4757       if (dst.writemask & (1 << comp) && coord <= SWIZZLE_W)
4758          read_mask |= 1 << coord;
4759    }
4760
4761    return read_mask;
4762 }
4763
4764 /**
4765  * This pass replaces CMP T0, T1 T2 T0 with MOV T0, T2 when the CMP
4766  * instruction is the first instruction to write to register T0.  There are
4767  * several lowering passes done in GLSL IR (e.g. branches and
4768  * relative addressing) that create a large number of conditional assignments
4769  * that ir_to_mesa converts to CMP instructions like the one mentioned above.
4770  *
4771  * Here is why this conversion is safe:
4772  * CMP T0, T1 T2 T0 can be expanded to:
4773  * if (T1 < 0.0)
4774  *   MOV T0, T2;
4775  * else
4776  *   MOV T0, T0;
4777  *
4778  * If (T1 < 0.0) evaluates to true then our replacement MOV T0, T2 is the same
4779  * as the original program.  If (T1 < 0.0) evaluates to false, executing
4780  * MOV T0, T0 will store a garbage value in T0 since T0 is uninitialized.
4781  * Therefore, it doesn't matter that we are replacing MOV T0, T0 with MOV T0, T2
4782  * because any instruction that was going to read from T0 after this was going
4783  * to read a garbage value anyway.
4784  */
4785 void
4786 glsl_to_tgsi_visitor::simplify_cmp(void)
4787 {
4788    int tempWritesSize = 0;
4789    unsigned *tempWrites = NULL;
4790    unsigned outputWrites[VARYING_SLOT_TESS_MAX];
4791
4792    memset(outputWrites, 0, sizeof(outputWrites));
4793
4794    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4795       unsigned prevWriteMask = 0;
4796
4797       /* Give up if we encounter relative addressing or flow control. */
4798       if (inst->dst[0].reladdr || inst->dst[0].reladdr2 ||
4799           inst->dst[1].reladdr || inst->dst[1].reladdr2 ||
4800           inst->info->is_branch ||
4801           inst->op == TGSI_OPCODE_CONT ||
4802           inst->op == TGSI_OPCODE_END ||
4803           inst->op == TGSI_OPCODE_RET) {
4804          break;
4805       }
4806
4807       if (inst->dst[0].file == PROGRAM_OUTPUT) {
4808          assert(inst->dst[0].index < (signed)ARRAY_SIZE(outputWrites));
4809          prevWriteMask = outputWrites[inst->dst[0].index];
4810          outputWrites[inst->dst[0].index] |= inst->dst[0].writemask;
4811       } else if (inst->dst[0].file == PROGRAM_TEMPORARY) {
4812          if (inst->dst[0].index >= tempWritesSize) {
4813             const int inc = 4096;
4814
4815             tempWrites = (unsigned*)
4816                          realloc(tempWrites,
4817                                  (tempWritesSize + inc) * sizeof(unsigned));
4818             if (!tempWrites)
4819                return;
4820
4821             memset(tempWrites + tempWritesSize, 0, inc * sizeof(unsigned));
4822             tempWritesSize += inc;
4823          }
4824
4825          prevWriteMask = tempWrites[inst->dst[0].index];
4826          tempWrites[inst->dst[0].index] |= inst->dst[0].writemask;
4827       } else
4828          continue;
4829
4830       /* For a CMP to be considered a conditional write, the destination
4831        * register and source register two must be the same. */
4832       if (inst->op == TGSI_OPCODE_CMP
4833           && !(inst->dst[0].writemask & prevWriteMask)
4834           && inst->src[2].file == inst->dst[0].file
4835           && inst->src[2].index == inst->dst[0].index
4836           && inst->dst[0].writemask ==
4837              get_src_arg_mask(inst->dst[0], inst->src[2])) {
4838
4839          inst->op = TGSI_OPCODE_MOV;
4840          inst->info = tgsi_get_opcode_info(inst->op);
4841          inst->src[0] = inst->src[1];
4842       }
4843    }
4844
4845    free(tempWrites);
4846 }
4847
4848 static void
4849 rename_temp_handle_src(struct rename_reg_pair *renames, st_src_reg *src)
4850 {
4851    if (src && src->file == PROGRAM_TEMPORARY) {
4852       int old_idx = src->index;
4853       if (renames[old_idx].valid)
4854          src->index = renames[old_idx].new_reg;
4855    }
4856 }
4857
4858 /* Replaces all references to a temporary register index with another index. */
4859 void
4860 glsl_to_tgsi_visitor::rename_temp_registers(struct rename_reg_pair *renames)
4861 {
4862    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4863       unsigned j;
4864       for (j = 0; j < num_inst_src_regs(inst); j++) {
4865          rename_temp_handle_src(renames, &inst->src[j]);
4866          rename_temp_handle_src(renames, inst->src[j].reladdr);
4867          rename_temp_handle_src(renames, inst->src[j].reladdr2);
4868       }
4869
4870       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4871          rename_temp_handle_src(renames, &inst->tex_offsets[j]);
4872          rename_temp_handle_src(renames, inst->tex_offsets[j].reladdr);
4873          rename_temp_handle_src(renames, inst->tex_offsets[j].reladdr2);
4874       }
4875
4876       rename_temp_handle_src(renames, &inst->resource);
4877       rename_temp_handle_src(renames, inst->resource.reladdr);
4878       rename_temp_handle_src(renames, inst->resource.reladdr2);
4879
4880       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4881          if (inst->dst[j].file == PROGRAM_TEMPORARY) {
4882             int old_idx = inst->dst[j].index;
4883             if (renames[old_idx].valid)
4884                inst->dst[j].index = renames[old_idx].new_reg;
4885          }
4886          rename_temp_handle_src(renames, inst->dst[j].reladdr);
4887          rename_temp_handle_src(renames, inst->dst[j].reladdr2);
4888       }
4889    }
4890 }
4891
4892 void
4893 glsl_to_tgsi_visitor::get_first_temp_write(int *first_writes)
4894 {
4895    int depth = 0; /* loop depth */
4896    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4897    unsigned i = 0, j;
4898
4899    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4900       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4901          if (inst->dst[j].file == PROGRAM_TEMPORARY) {
4902             if (first_writes[inst->dst[j].index] == -1)
4903                 first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
4904          }
4905       }
4906
4907       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4908          if (depth++ == 0)
4909             loop_start = i;
4910       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4911          if (--depth == 0)
4912             loop_start = -1;
4913       }
4914       assert(depth >= 0);
4915       i++;
4916    }
4917 }
4918
4919 void
4920 glsl_to_tgsi_visitor::get_first_temp_read(int *first_reads)
4921 {
4922    int depth = 0; /* loop depth */
4923    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4924    unsigned i = 0, j;
4925
4926    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4927       for (j = 0; j < num_inst_src_regs(inst); j++) {
4928          if (inst->src[j].file == PROGRAM_TEMPORARY) {
4929             if (first_reads[inst->src[j].index] == -1)
4930                 first_reads[inst->src[j].index] = (depth == 0) ? i : loop_start;
4931          }
4932       }
4933       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4934          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY) {
4935             if (first_reads[inst->tex_offsets[j].index] == -1)
4936                first_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : loop_start;
4937          }
4938       }
4939       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4940          if (depth++ == 0)
4941             loop_start = i;
4942       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4943          if (--depth == 0)
4944             loop_start = -1;
4945       }
4946       assert(depth >= 0);
4947       i++;
4948    }
4949 }
4950
4951 void
4952 glsl_to_tgsi_visitor::get_last_temp_read_first_temp_write(int *last_reads, int *first_writes)
4953 {
4954    int depth = 0; /* loop depth */
4955    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
4956    unsigned i = 0, j;
4957    int k;
4958    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
4959       for (j = 0; j < num_inst_src_regs(inst); j++) {
4960          if (inst->src[j].file == PROGRAM_TEMPORARY)
4961             last_reads[inst->src[j].index] = (depth == 0) ? i : -2;
4962       }
4963       for (j = 0; j < num_inst_dst_regs(inst); j++) {
4964          if (inst->dst[j].file == PROGRAM_TEMPORARY) {
4965             if (first_writes[inst->dst[j].index] == -1)
4966                first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
4967             last_reads[inst->dst[j].index] = (depth == 0) ? i : -2;
4968          }
4969       }
4970       for (j = 0; j < inst->tex_offset_num_offset; j++) {
4971          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
4972             last_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : -2;
4973       }
4974       if (inst->op == TGSI_OPCODE_BGNLOOP) {
4975          if (depth++ == 0)
4976             loop_start = i;
4977       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
4978          if (--depth == 0) {
4979             loop_start = -1;
4980             for (k = 0; k < this->next_temp; k++) {
4981                if (last_reads[k] == -2) {
4982                   last_reads[k] = i;
4983                }
4984             }
4985          }
4986       }
4987       assert(depth >= 0);
4988       i++;
4989    }
4990 }
4991
4992 void
4993 glsl_to_tgsi_visitor::get_last_temp_write(int *last_writes)
4994 {
4995    int depth = 0; /* loop depth */
4996    int i = 0, k;
4997    unsigned j;
4998
4999    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
5000       for (j = 0; j < num_inst_dst_regs(inst); j++) {
5001          if (inst->dst[j].file == PROGRAM_TEMPORARY)
5002             last_writes[inst->dst[j].index] = (depth == 0) ? i : -2;
5003       }
5004
5005       if (inst->op == TGSI_OPCODE_BGNLOOP)
5006          depth++;
5007       else if (inst->op == TGSI_OPCODE_ENDLOOP)
5008          if (--depth == 0) {
5009             for (k = 0; k < this->next_temp; k++) {
5010                if (last_writes[k] == -2) {
5011                   last_writes[k] = i;
5012                }
5013             }
5014          }
5015       assert(depth >= 0);
5016       i++;
5017    }
5018 }
5019
5020 /*
5021  * On a basic block basis, tracks available PROGRAM_TEMPORARY register
5022  * channels for copy propagation and updates following instructions to
5023  * use the original versions.
5024  *
5025  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
5026  * will occur.  As an example, a TXP production before this pass:
5027  *
5028  * 0: MOV TEMP[1], INPUT[4].xyyy;
5029  * 1: MOV TEMP[1].w, INPUT[4].wwww;
5030  * 2: TXP TEMP[2], TEMP[1], texture[0], 2D;
5031  *
5032  * and after:
5033  *
5034  * 0: MOV TEMP[1], INPUT[4].xyyy;
5035  * 1: MOV TEMP[1].w, INPUT[4].wwww;
5036  * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
5037  *
5038  * which allows for dead code elimination on TEMP[1]'s writes.
5039  */
5040 void
5041 glsl_to_tgsi_visitor::copy_propagate(void)
5042 {
5043    glsl_to_tgsi_instruction **acp = rzalloc_array(mem_ctx,
5044                                                   glsl_to_tgsi_instruction *,
5045                                                   this->next_temp * 4);
5046    int *acp_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
5047    int level = 0;
5048
5049    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
5050       assert(inst->dst[0].file != PROGRAM_TEMPORARY
5051              || inst->dst[0].index < this->next_temp);
5052
5053       /* First, do any copy propagation possible into the src regs. */
5054       for (int r = 0; r < 3; r++) {
5055          glsl_to_tgsi_instruction *first = NULL;
5056          bool good = true;
5057          int acp_base = inst->src[r].index * 4;
5058
5059          if (inst->src[r].file != PROGRAM_TEMPORARY ||
5060              inst->src[r].reladdr ||
5061              inst->src[r].reladdr2)
5062             continue;
5063
5064          /* See if we can find entries in the ACP consisting of MOVs
5065           * from the same src register for all the swizzled channels
5066           * of this src register reference.
5067           */
5068          for (int i = 0; i < 4; i++) {
5069             int src_chan = GET_SWZ(inst->src[r].swizzle, i);
5070             glsl_to_tgsi_instruction *copy_chan = acp[acp_base + src_chan];
5071
5072             if (!copy_chan) {
5073                good = false;
5074                break;
5075             }
5076
5077             assert(acp_level[acp_base + src_chan] <= level);
5078
5079             if (!first) {
5080                first = copy_chan;
5081             } else {
5082                if (first->src[0].file != copy_chan->src[0].file ||
5083                    first->src[0].index != copy_chan->src[0].index ||
5084                    first->src[0].double_reg2 != copy_chan->src[0].double_reg2 ||
5085                    first->src[0].index2D != copy_chan->src[0].index2D) {
5086                   good = false;
5087                   break;
5088                }
5089             }
5090          }
5091
5092          if (good) {
5093             /* We've now validated that we can copy-propagate to
5094              * replace this src register reference.  Do it.
5095              */
5096             inst->src[r].file = first->src[0].file;
5097             inst->src[r].index = first->src[0].index;
5098             inst->src[r].index2D = first->src[0].index2D;
5099             inst->src[r].has_index2 = first->src[0].has_index2;
5100             inst->src[r].double_reg2 = first->src[0].double_reg2;
5101             inst->src[r].array_id = first->src[0].array_id;
5102
5103             int swizzle = 0;
5104             for (int i = 0; i < 4; i++) {
5105                int src_chan = GET_SWZ(inst->src[r].swizzle, i);
5106                glsl_to_tgsi_instruction *copy_inst = acp[acp_base + src_chan];
5107                swizzle |= (GET_SWZ(copy_inst->src[0].swizzle, src_chan) << (3 * i));
5108             }
5109             inst->src[r].swizzle = swizzle;
5110          }
5111       }
5112
5113       switch (inst->op) {
5114       case TGSI_OPCODE_BGNLOOP:
5115       case TGSI_OPCODE_ENDLOOP:
5116          /* End of a basic block, clear the ACP entirely. */
5117          memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
5118          break;
5119
5120       case TGSI_OPCODE_IF:
5121       case TGSI_OPCODE_UIF:
5122          ++level;
5123          break;
5124
5125       case TGSI_OPCODE_ENDIF:
5126       case TGSI_OPCODE_ELSE:
5127          /* Clear all channels written inside the block from the ACP, but
5128           * leaving those that were not touched.
5129           */
5130          for (int r = 0; r < this->next_temp; r++) {
5131             for (int c = 0; c < 4; c++) {
5132                if (!acp[4 * r + c])
5133                   continue;
5134
5135                if (acp_level[4 * r + c] >= level)
5136                   acp[4 * r + c] = NULL;
5137             }
5138          }
5139          if (inst->op == TGSI_OPCODE_ENDIF)
5140             --level;
5141          break;
5142
5143       default:
5144          /* Continuing the block, clear any written channels from
5145           * the ACP.
5146           */
5147          for (int d = 0; d < 2; d++) {
5148             if (inst->dst[d].file == PROGRAM_TEMPORARY && inst->dst[d].reladdr) {
5149                /* Any temporary might be written, so no copy propagation
5150                 * across this instruction.
5151                 */
5152                memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
5153             } else if (inst->dst[d].file == PROGRAM_OUTPUT &&
5154                        inst->dst[d].reladdr) {
5155                /* Any output might be written, so no copy propagation
5156                 * from outputs across this instruction.
5157                 */
5158                for (int r = 0; r < this->next_temp; r++) {
5159                   for (int c = 0; c < 4; c++) {
5160                      if (!acp[4 * r + c])
5161                         continue;
5162
5163                      if (acp[4 * r + c]->src[0].file == PROGRAM_OUTPUT)
5164                         acp[4 * r + c] = NULL;
5165                   }
5166                }
5167             } else if (inst->dst[d].file == PROGRAM_TEMPORARY ||
5168                        inst->dst[d].file == PROGRAM_OUTPUT) {
5169                /* Clear where it's used as dst. */
5170                if (inst->dst[d].file == PROGRAM_TEMPORARY) {
5171                   for (int c = 0; c < 4; c++) {
5172                      if (inst->dst[d].writemask & (1 << c))
5173                         acp[4 * inst->dst[d].index + c] = NULL;
5174                   }
5175                }
5176
5177                /* Clear where it's used as src. */
5178                for (int r = 0; r < this->next_temp; r++) {
5179                   for (int c = 0; c < 4; c++) {
5180                      if (!acp[4 * r + c])
5181                         continue;
5182
5183                      int src_chan = GET_SWZ(acp[4 * r + c]->src[0].swizzle, c);
5184
5185                      if (acp[4 * r + c]->src[0].file == inst->dst[d].file &&
5186                          acp[4 * r + c]->src[0].index == inst->dst[d].index &&
5187                          inst->dst[d].writemask & (1 << src_chan)) {
5188                         acp[4 * r + c] = NULL;
5189                      }
5190                   }
5191                }
5192             }
5193          }
5194          break;
5195       }
5196
5197       /* If this is a copy, add it to the ACP. */
5198       if (inst->op == TGSI_OPCODE_MOV &&
5199           inst->dst[0].file == PROGRAM_TEMPORARY &&
5200           !(inst->dst[0].file == inst->src[0].file &&
5201              inst->dst[0].index == inst->src[0].index) &&
5202           !inst->dst[0].reladdr &&
5203           !inst->dst[0].reladdr2 &&
5204           !inst->saturate &&
5205           inst->src[0].file != PROGRAM_ARRAY &&
5206           (inst->src[0].file != PROGRAM_OUTPUT ||
5207            this->shader->Stage != MESA_SHADER_TESS_CTRL) &&
5208           !inst->src[0].reladdr &&
5209           !inst->src[0].reladdr2 &&
5210           !inst->src[0].negate &&
5211           !inst->src[0].abs) {
5212          for (int i = 0; i < 4; i++) {
5213             if (inst->dst[0].writemask & (1 << i)) {
5214                acp[4 * inst->dst[0].index + i] = inst;
5215                acp_level[4 * inst->dst[0].index + i] = level;
5216             }
5217          }
5218       }
5219    }
5220
5221    ralloc_free(acp_level);
5222    ralloc_free(acp);
5223 }
5224
5225 static void
5226 dead_code_handle_reladdr(glsl_to_tgsi_instruction **writes, st_src_reg *reladdr)
5227 {
5228    if (reladdr && reladdr->file == PROGRAM_TEMPORARY) {
5229       /* Clear where it's used as src. */
5230       int swz = GET_SWZ(reladdr->swizzle, 0);
5231       writes[4 * reladdr->index + swz] = NULL;
5232    }
5233 }
5234
5235 /*
5236  * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for dead
5237  * code elimination.
5238  *
5239  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
5240  * will occur.  As an example, a TXP production after copy propagation but
5241  * before this pass:
5242  *
5243  * 0: MOV TEMP[1], INPUT[4].xyyy;
5244  * 1: MOV TEMP[1].w, INPUT[4].wwww;
5245  * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
5246  *
5247  * and after this pass:
5248  *
5249  * 0: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
5250  */
5251 int
5252 glsl_to_tgsi_visitor::eliminate_dead_code(void)
5253 {
5254    glsl_to_tgsi_instruction **writes = rzalloc_array(mem_ctx,
5255                                                      glsl_to_tgsi_instruction *,
5256                                                      this->next_temp * 4);
5257    int *write_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
5258    int level = 0;
5259    int removed = 0;
5260
5261    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
5262       assert(inst->dst[0].file != PROGRAM_TEMPORARY
5263              || inst->dst[0].index < this->next_temp);
5264
5265       switch (inst->op) {
5266       case TGSI_OPCODE_BGNLOOP:
5267       case TGSI_OPCODE_ENDLOOP:
5268       case TGSI_OPCODE_CONT:
5269       case TGSI_OPCODE_BRK:
5270          /* End of a basic block, clear the write array entirely.
5271           *
5272           * This keeps us from killing dead code when the writes are
5273           * on either side of a loop, even when the register isn't touched
5274           * inside the loop.  However, glsl_to_tgsi_visitor doesn't seem to emit
5275           * dead code of this type, so it shouldn't make a difference as long as
5276           * the dead code elimination pass in the GLSL compiler does its job.
5277           */
5278          memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
5279          break;
5280
5281       case TGSI_OPCODE_ENDIF:
5282       case TGSI_OPCODE_ELSE:
5283          /* Promote the recorded level of all channels written inside the
5284           * preceding if or else block to the level above the if/else block.
5285           */
5286          for (int r = 0; r < this->next_temp; r++) {
5287             for (int c = 0; c < 4; c++) {
5288                if (!writes[4 * r + c])
5289                   continue;
5290
5291                if (write_level[4 * r + c] == level)
5292                   write_level[4 * r + c] = level-1;
5293             }
5294          }
5295          if (inst->op == TGSI_OPCODE_ENDIF)
5296             --level;
5297          break;
5298
5299       case TGSI_OPCODE_IF:
5300       case TGSI_OPCODE_UIF:
5301          ++level;
5302          /* fallthrough to default case to mark the condition as read */
5303       default:
5304          /* Continuing the block, clear any channels from the write array that
5305           * are read by this instruction.
5306           */
5307          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
5308             if (inst->src[i].file == PROGRAM_TEMPORARY && inst->src[i].reladdr){
5309                /* Any temporary might be read, so no dead code elimination
5310                 * across this instruction.
5311                 */
5312                memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
5313             } else if (inst->src[i].file == PROGRAM_TEMPORARY) {
5314                /* Clear where it's used as src. */
5315                int src_chans = 1 << GET_SWZ(inst->src[i].swizzle, 0);
5316                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 1);
5317                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 2);
5318                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 3);
5319
5320                for (int c = 0; c < 4; c++) {
5321                   if (src_chans & (1 << c))
5322                      writes[4 * inst->src[i].index + c] = NULL;
5323                }
5324             }
5325             dead_code_handle_reladdr(writes, inst->src[i].reladdr);
5326             dead_code_handle_reladdr(writes, inst->src[i].reladdr2);
5327          }
5328          for (unsigned i = 0; i < inst->tex_offset_num_offset; i++) {
5329             if (inst->tex_offsets[i].file == PROGRAM_TEMPORARY && inst->tex_offsets[i].reladdr){
5330                /* Any temporary might be read, so no dead code elimination
5331                 * across this instruction.
5332                 */
5333                memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
5334             } else if (inst->tex_offsets[i].file == PROGRAM_TEMPORARY) {
5335                /* Clear where it's used as src. */
5336                int src_chans = 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 0);
5337                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 1);
5338                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 2);
5339                src_chans |= 1 << GET_SWZ(inst->tex_offsets[i].swizzle, 3);
5340
5341                for (int c = 0; c < 4; c++) {
5342                   if (src_chans & (1 << c))
5343                      writes[4 * inst->tex_offsets[i].index + c] = NULL;
5344                }
5345             }
5346             dead_code_handle_reladdr(writes, inst->tex_offsets[i].reladdr);
5347             dead_code_handle_reladdr(writes, inst->tex_offsets[i].reladdr2);
5348          }
5349
5350          if (inst->resource.file == PROGRAM_TEMPORARY) {
5351             int src_chans;
5352
5353             src_chans  = 1 << GET_SWZ(inst->resource.swizzle, 0);
5354             src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 1);
5355             src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 2);
5356             src_chans |= 1 << GET_SWZ(inst->resource.swizzle, 3);
5357
5358             for (int c = 0; c < 4; c++) {
5359                if (src_chans & (1 << c))
5360                   writes[4 * inst->resource.index + c] = NULL;
5361             }
5362          }
5363          dead_code_handle_reladdr(writes, inst->resource.reladdr);
5364          dead_code_handle_reladdr(writes, inst->resource.reladdr2);
5365
5366          for (unsigned i = 0; i < ARRAY_SIZE(inst->dst); i++) {
5367             dead_code_handle_reladdr(writes, inst->dst[i].reladdr);
5368             dead_code_handle_reladdr(writes, inst->dst[i].reladdr2);
5369          }
5370          break;
5371       }
5372
5373       /* If this instruction writes to a temporary, add it to the write array.
5374        * If there is already an instruction in the write array for one or more
5375        * of the channels, flag that channel write as dead.
5376        */
5377       for (unsigned i = 0; i < ARRAY_SIZE(inst->dst); i++) {
5378          if (inst->dst[i].file == PROGRAM_TEMPORARY &&
5379              !inst->dst[i].reladdr) {
5380             for (int c = 0; c < 4; c++) {
5381                if (inst->dst[i].writemask & (1 << c)) {
5382                   if (writes[4 * inst->dst[i].index + c]) {
5383                      if (write_level[4 * inst->dst[i].index + c] < level)
5384                         continue;
5385                      else
5386                         writes[4 * inst->dst[i].index + c]->dead_mask |= (1 << c);
5387                   }
5388                   writes[4 * inst->dst[i].index + c] = inst;
5389                   write_level[4 * inst->dst[i].index + c] = level;
5390                }
5391             }
5392          }
5393       }
5394    }
5395
5396    /* Anything still in the write array at this point is dead code. */
5397    for (int r = 0; r < this->next_temp; r++) {
5398       for (int c = 0; c < 4; c++) {
5399          glsl_to_tgsi_instruction *inst = writes[4 * r + c];
5400          if (inst)
5401             inst->dead_mask |= (1 << c);
5402       }
5403    }
5404
5405    /* Now actually remove the instructions that are completely dead and update
5406     * the writemask of other instructions with dead channels.
5407     */
5408    foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
5409       if (!inst->dead_mask || !inst->dst[0].writemask)
5410          continue;
5411       /* No amount of dead masks should remove memory stores */
5412       if (inst->info->is_store)
5413          continue;
5414
5415       if ((inst->dst[0].writemask & ~inst->dead_mask) == 0) {
5416          inst->remove();
5417          delete inst;
5418          removed++;
5419       } else {
5420          if (glsl_base_type_is_64bit(inst->dst[0].type)) {
5421             if (inst->dead_mask == WRITEMASK_XY ||
5422                 inst->dead_mask == WRITEMASK_ZW)
5423                inst->dst[0].writemask &= ~(inst->dead_mask);
5424          } else
5425             inst->dst[0].writemask &= ~(inst->dead_mask);
5426       }
5427    }
5428
5429    ralloc_free(write_level);
5430    ralloc_free(writes);
5431
5432    return removed;
5433 }
5434
5435 /* merge DFRACEXP instructions into one. */
5436 void
5437 glsl_to_tgsi_visitor::merge_two_dsts(void)
5438 {
5439    /* We never delete inst, but we may delete its successor. */
5440    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
5441       glsl_to_tgsi_instruction *inst2;
5442       unsigned defined;
5443
5444       if (num_inst_dst_regs(inst) != 2)
5445          continue;
5446
5447       if (inst->dst[0].file != PROGRAM_UNDEFINED &&
5448           inst->dst[1].file != PROGRAM_UNDEFINED)
5449          continue;
5450
5451       assert(inst->dst[0].file != PROGRAM_UNDEFINED ||
5452              inst->dst[1].file != PROGRAM_UNDEFINED);
5453
5454       if (inst->dst[0].file == PROGRAM_UNDEFINED)
5455          defined = 1;
5456       else
5457          defined = 0;
5458
5459       inst2 = (glsl_to_tgsi_instruction *) inst->next;
5460       while (!inst2->is_tail_sentinel()) {
5461          if (inst->op == inst2->op &&
5462              inst2->dst[defined].file == PROGRAM_UNDEFINED &&
5463              inst->src[0].file == inst2->src[0].file &&
5464              inst->src[0].index == inst2->src[0].index &&
5465              inst->src[0].type == inst2->src[0].type &&
5466              inst->src[0].swizzle == inst2->src[0].swizzle)
5467             break;
5468          inst2 = (glsl_to_tgsi_instruction *) inst2->next;
5469       }
5470
5471       if (inst2->is_tail_sentinel()) {
5472          /* Undefined destinations are not allowed, substitute with an unused
5473           * temporary register.
5474           */
5475          st_src_reg tmp = get_temp(glsl_type::vec4_type);
5476          inst->dst[defined ^ 1] = st_dst_reg(tmp);
5477          inst->dst[defined ^ 1].writemask = 0;
5478          continue;
5479       }
5480
5481       inst->dst[defined ^ 1] = inst2->dst[defined ^ 1];
5482       inst2->remove();
5483       delete inst2;
5484    }
5485 }
5486
5487 template <typename st_reg>
5488 void test_indirect_access(const st_reg& reg, bool *has_indirect_access)
5489 {
5490    if (reg.file == PROGRAM_ARRAY) {
5491       if (reg.reladdr || reg.reladdr2 || reg.has_index2) {
5492          has_indirect_access[reg.array_id] = true;
5493          if (reg.reladdr)
5494             test_indirect_access(*reg.reladdr, has_indirect_access);
5495          if (reg.reladdr2)
5496             test_indirect_access(*reg.reladdr2, has_indirect_access);
5497       }
5498    }
5499 }
5500
5501 template <typename st_reg>
5502 void remap_array(st_reg& reg, const int *array_remap_info,
5503                  const bool *has_indirect_access)
5504 {
5505    if (reg.file == PROGRAM_ARRAY) {
5506       if (!has_indirect_access[reg.array_id]) {
5507          reg.file = PROGRAM_TEMPORARY;
5508          reg.index = reg.index + array_remap_info[reg.array_id];
5509          reg.array_id = 0;
5510       } else {
5511          reg.array_id = array_remap_info[reg.array_id];
5512       }
5513
5514       if (reg.reladdr)
5515          remap_array(*reg.reladdr, array_remap_info, has_indirect_access);
5516
5517       if (reg.reladdr2)
5518          remap_array(*reg.reladdr2, array_remap_info, has_indirect_access);
5519    }
5520 }
5521
5522 /* One-dimensional arrays whose elements are only accessed directly are
5523  * replaced by an according set of temporary registers that then can become
5524  * subject to further optimization steps like copy propagation and
5525  * register merging.
5526  */
5527 void
5528 glsl_to_tgsi_visitor::split_arrays(void)
5529 {
5530    if (!next_array)
5531       return;
5532
5533    bool *has_indirect_access = rzalloc_array(mem_ctx, bool, next_array + 1);
5534
5535    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
5536       for (unsigned j = 0; j < num_inst_src_regs(inst); j++)
5537          test_indirect_access(inst->src[j], has_indirect_access);
5538
5539       for (unsigned j = 0; j < inst->tex_offset_num_offset; j++)
5540          test_indirect_access(inst->tex_offsets[j], has_indirect_access);
5541
5542       for (unsigned j = 0; j < num_inst_dst_regs(inst); j++)
5543          test_indirect_access(inst->dst[j], has_indirect_access);
5544
5545       test_indirect_access(inst->resource, has_indirect_access);
5546    }
5547
5548    unsigned array_offset = 0;
5549    unsigned n_remaining_arrays = 0;
5550
5551    /* Double use: For arrays that get split this value will contain
5552     * the base index of the temporary registers this array is replaced
5553     * with. For arrays that remain it contains the new array ID.
5554     */
5555    int *array_remap_info = rzalloc_array(has_indirect_access, int,
5556                                          next_array + 1);
5557
5558    for (unsigned i = 1; i <= next_array; ++i) {
5559       if (!has_indirect_access[i]) {
5560          array_remap_info[i] = this->next_temp + array_offset;
5561          array_offset += array_sizes[i - 1];
5562       } else {
5563          array_sizes[n_remaining_arrays] = array_sizes[i-1];
5564          array_remap_info[i] = ++n_remaining_arrays;
5565       }
5566    }
5567
5568    if (next_array !=  n_remaining_arrays) {
5569       foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
5570          for (unsigned j = 0; j < num_inst_src_regs(inst); j++)
5571             remap_array(inst->src[j], array_remap_info, has_indirect_access);
5572
5573          for (unsigned j = 0; j < inst->tex_offset_num_offset; j++)
5574             remap_array(inst->tex_offsets[j], array_remap_info, has_indirect_access);
5575
5576          for (unsigned j = 0; j < num_inst_dst_regs(inst); j++) {
5577             remap_array(inst->dst[j], array_remap_info, has_indirect_access);
5578          }
5579          remap_array(inst->resource, array_remap_info, has_indirect_access);
5580       }
5581    }
5582
5583    ralloc_free(has_indirect_access);
5584    this->next_temp += array_offset;
5585    next_array = n_remaining_arrays;
5586 }
5587
5588 /* Merges temporary registers together where possible to reduce the number of
5589  * registers needed to run a program.
5590  *
5591  * Produces optimal code only after copy propagation and dead code elimination
5592  * have been run. */
5593 void
5594 glsl_to_tgsi_visitor::merge_registers(void)
5595 {
5596    struct array_live_range *arr_live_ranges = NULL;
5597
5598    struct register_live_range *reg_live_ranges =
5599          rzalloc_array(mem_ctx, struct register_live_range, this->next_temp);
5600
5601    if (this->next_array > 0) {
5602       arr_live_ranges = new array_live_range[this->next_array];
5603       for (unsigned i = 0; i < this->next_array; ++i)
5604          arr_live_ranges[i] = array_live_range(i+1, this->array_sizes[i]);
5605    }
5606
5607
5608    if (get_temp_registers_required_live_ranges(reg_live_ranges, &this->instructions,
5609                                                this->next_temp, reg_live_ranges,
5610                                                this->next_array, arr_live_ranges)) {
5611       struct rename_reg_pair *renames =
5612             rzalloc_array(reg_live_ranges, struct rename_reg_pair, this->next_temp);
5613       get_temp_registers_remapping(reg_live_ranges, this->next_temp,
5614                                    reg_live_ranges, renames);
5615       rename_temp_registers(renames);
5616
5617       this->next_array =  merge_arrays(this->next_array, this->array_sizes,
5618                                        &this->instructions, arr_live_ranges);
5619    }
5620
5621    if (arr_live_ranges)
5622       delete[] arr_live_ranges;
5623
5624    ralloc_free(reg_live_ranges);
5625 }
5626
5627 /* Reassign indices to temporary registers by reusing unused indices created
5628  * by optimization passes. */
5629 void
5630 glsl_to_tgsi_visitor::renumber_registers(void)
5631 {
5632    int i = 0;
5633    int new_index = 0;
5634    int *first_writes = ralloc_array(mem_ctx, int, this->next_temp);
5635    struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
5636
5637    for (i = 0; i < this->next_temp; i++) {
5638       first_writes[i] = -1;
5639    }
5640    get_first_temp_write(first_writes);
5641
5642    for (i = 0; i < this->next_temp; i++) {
5643       if (first_writes[i] < 0) continue;
5644       if (i != new_index) {
5645          renames[i].new_reg = new_index;
5646          renames[i].valid = true;
5647       }
5648       new_index++;
5649    }
5650
5651    rename_temp_registers(renames);
5652    this->next_temp = new_index;
5653    ralloc_free(renames);
5654    ralloc_free(first_writes);
5655 }
5656
5657 #ifndef NDEBUG
5658 void glsl_to_tgsi_visitor::print_stats()
5659 {
5660    int narray_registers = 0;
5661    for (unsigned i = 0; i < this->next_array; ++i)
5662       narray_registers += this->array_sizes[i];
5663
5664    int ninstructions = 0;
5665    foreach_in_list(glsl_to_tgsi_instruction, inst, &instructions) {
5666       ++ninstructions;
5667    }
5668
5669    simple_mtx_lock(&print_stats_mutex);
5670    stats_log << next_array << ", "
5671              << next_temp << ", "
5672              << narray_registers << ", "
5673              << next_temp + narray_registers << ", "
5674              << ninstructions << "\n";
5675    simple_mtx_unlock(&print_stats_mutex);
5676 }
5677 #endif
5678 /* ------------------------- TGSI conversion stuff -------------------------- */
5679
/**
 * Intermediate state used during shader translation.
 *
 * Bundles the ureg program that is being built with the ureg register
 * handles and declaration tables that belong to one translation.
 * NOTE(review): the per-field notes below are derived from the field names
 * and types only; confirm against the code that fills this struct.
 */
struct st_translate {
   /* ureg program that declarations are made on (e.g. ureg_DECL_immediate) */
   struct ureg_program *ureg;

   /* temporaries; temps_size is presumably the allocated length of temps */
   unsigned temps_size;
   struct ureg_dst *temps;

   /* per-register-file tables of ureg handles, with their counts where the
    * table is allocated dynamically
    */
   struct ureg_dst *arrays;
   unsigned num_temp_arrays;
   struct ureg_src *constants;
   int num_constants;
   struct ureg_src *immediates;
   int num_immediates;
   struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS];
   struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
   struct ureg_dst address[3];
   struct ureg_src samplers[PIPE_MAX_SAMPLERS];
   struct ureg_src buffers[PIPE_MAX_SHADER_BUFFERS];
   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
   struct ureg_src systemValues[SYSTEM_VALUE_MAX];
   struct ureg_src hw_atomics[PIPE_MAX_HW_ATOMIC_BUFFERS];
   struct ureg_src shared_memory;

   /* presumably the sizes of the temporary arrays — TODO confirm */
   unsigned *array_sizes;

   /* input/output declarations together with their element counts */
   struct inout_decl *input_decls;
   unsigned num_input_decls;
   struct inout_decl *output_decls;
   unsigned num_output_decls;

   /* byte-sized lookup tables for inputs and outputs; NOTE(review): the
    * direction of the mapping is not visible here
    */
   const ubyte *inputMapping;
   const ubyte *outputMapping;

   enum pipe_shader_type procType;  /**< PIPE_SHADER_VERTEX/FRAGMENT */
   bool need_uarl;
};
5716
5717 /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
5718 enum tgsi_semantic
5719 _mesa_sysval_to_semantic(unsigned sysval)
5720 {
5721    switch (sysval) {
5722    /* Vertex shader */
5723    case SYSTEM_VALUE_VERTEX_ID:
5724       return TGSI_SEMANTIC_VERTEXID;
5725    case SYSTEM_VALUE_INSTANCE_ID:
5726       return TGSI_SEMANTIC_INSTANCEID;
5727    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
5728       return TGSI_SEMANTIC_VERTEXID_NOBASE;
5729    case SYSTEM_VALUE_BASE_VERTEX:
5730       return TGSI_SEMANTIC_BASEVERTEX;
5731    case SYSTEM_VALUE_BASE_INSTANCE:
5732       return TGSI_SEMANTIC_BASEINSTANCE;
5733    case SYSTEM_VALUE_DRAW_ID:
5734       return TGSI_SEMANTIC_DRAWID;
5735
5736    /* Geometry shader */
5737    case SYSTEM_VALUE_INVOCATION_ID:
5738       return TGSI_SEMANTIC_INVOCATIONID;
5739
5740    /* Fragment shader */
5741    case SYSTEM_VALUE_FRAG_COORD:
5742       return TGSI_SEMANTIC_POSITION;
5743    case SYSTEM_VALUE_FRONT_FACE:
5744       return TGSI_SEMANTIC_FACE;
5745    case SYSTEM_VALUE_SAMPLE_ID:
5746       return TGSI_SEMANTIC_SAMPLEID;
5747    case SYSTEM_VALUE_SAMPLE_POS:
5748       return TGSI_SEMANTIC_SAMPLEPOS;
5749    case SYSTEM_VALUE_SAMPLE_MASK_IN:
5750       return TGSI_SEMANTIC_SAMPLEMASK;
5751    case SYSTEM_VALUE_HELPER_INVOCATION:
5752       return TGSI_SEMANTIC_HELPER_INVOCATION;
5753
5754    /* Tessellation shader */
5755    case SYSTEM_VALUE_TESS_COORD:
5756       return TGSI_SEMANTIC_TESSCOORD;
5757    case SYSTEM_VALUE_VERTICES_IN:
5758       return TGSI_SEMANTIC_VERTICESIN;
5759    case SYSTEM_VALUE_PRIMITIVE_ID:
5760       return TGSI_SEMANTIC_PRIMID;
5761    case SYSTEM_VALUE_TESS_LEVEL_OUTER:
5762       return TGSI_SEMANTIC_TESSOUTER;
5763    case SYSTEM_VALUE_TESS_LEVEL_INNER:
5764       return TGSI_SEMANTIC_TESSINNER;
5765
5766    /* Compute shader */
5767    case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
5768       return TGSI_SEMANTIC_THREAD_ID;
5769    case SYSTEM_VALUE_WORK_GROUP_ID:
5770       return TGSI_SEMANTIC_BLOCK_ID;
5771    case SYSTEM_VALUE_NUM_WORK_GROUPS:
5772       return TGSI_SEMANTIC_GRID_SIZE;
5773    case SYSTEM_VALUE_LOCAL_GROUP_SIZE:
5774       return TGSI_SEMANTIC_BLOCK_SIZE;
5775
5776    /* ARB_shader_ballot */
5777    case SYSTEM_VALUE_SUBGROUP_SIZE:
5778       return TGSI_SEMANTIC_SUBGROUP_SIZE;
5779    case SYSTEM_VALUE_SUBGROUP_INVOCATION:
5780       return TGSI_SEMANTIC_SUBGROUP_INVOCATION;
5781    case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
5782       return TGSI_SEMANTIC_SUBGROUP_EQ_MASK;
5783    case SYSTEM_VALUE_SUBGROUP_GE_MASK:
5784       return TGSI_SEMANTIC_SUBGROUP_GE_MASK;
5785    case SYSTEM_VALUE_SUBGROUP_GT_MASK:
5786       return TGSI_SEMANTIC_SUBGROUP_GT_MASK;
5787    case SYSTEM_VALUE_SUBGROUP_LE_MASK:
5788       return TGSI_SEMANTIC_SUBGROUP_LE_MASK;
5789    case SYSTEM_VALUE_SUBGROUP_LT_MASK:
5790       return TGSI_SEMANTIC_SUBGROUP_LT_MASK;
5791
5792    /* Unhandled */
5793    case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX:
5794    case SYSTEM_VALUE_GLOBAL_INVOCATION_ID:
5795    case SYSTEM_VALUE_VERTEX_CNT:
5796    case SYSTEM_VALUE_VARYING_COORD:
5797    default:
5798       assert(!"Unexpected SYSTEM_VALUE_ enum");
5799       return TGSI_SEMANTIC_COUNT;
5800    }
5801 }
5802
5803 /**
5804  * Map a glsl_to_tgsi constant/immediate to a TGSI immediate.
5805  */
5806 static struct ureg_src
5807 emit_immediate(struct st_translate *t,
5808                gl_constant_value values[4],
5809                GLenum type, int size)
5810 {
5811    struct ureg_program *ureg = t->ureg;
5812
5813    switch (type) {
5814    case GL_FLOAT:
5815       return ureg_DECL_immediate(ureg, &values[0].f, size);
5816    case GL_DOUBLE:
5817       return ureg_DECL_immediate_f64(ureg, (double *)&values[0].f, size);
5818    case GL_INT64_ARB:
5819       return ureg_DECL_immediate_int64(ureg, (int64_t *)&values[0].f, size);
5820    case GL_UNSIGNED_INT64_ARB:
5821       return ureg_DECL_immediate_uint64(ureg, (uint64_t *)&values[0].f, size);
5822    case GL_INT:
5823       return ureg_DECL_immediate_int(ureg, &values[0].i, size);
5824    case GL_UNSIGNED_INT:
5825    case GL_BOOL:
5826       return ureg_DECL_immediate_uint(ureg, &values[0].u, size);
5827    default:
5828       assert(!"should not get here - type must be float, int, uint, or bool");
5829       return ureg_src_undef();
5830    }
5831 }
5832
5833 /**
5834  * Map a glsl_to_tgsi dst register to a TGSI ureg_dst register.
5835  */
5836 static struct ureg_dst
5837 dst_register(struct st_translate *t, gl_register_file file, unsigned index,
5838              unsigned array_id)
5839 {
5840    unsigned array;
5841
5842    switch (file) {
5843    case PROGRAM_UNDEFINED:
5844       return ureg_dst_undef();
5845
5846    case PROGRAM_TEMPORARY:
5847       /* Allocate space for temporaries on demand. */
5848       if (index >= t->temps_size) {
5849          const int inc = align(index - t->temps_size + 1, 4096);
5850
5851          t->temps = (struct ureg_dst*)
5852                     realloc(t->temps,
5853                             (t->temps_size + inc) * sizeof(struct ureg_dst));
5854          if (!t->temps)
5855             return ureg_dst_undef();
5856
5857          memset(t->temps + t->temps_size, 0, inc * sizeof(struct ureg_dst));
5858          t->temps_size += inc;
5859       }
5860
5861       if (ureg_dst_is_undef(t->temps[index]))
5862          t->temps[index] = ureg_DECL_local_temporary(t->ureg);
5863
5864       return t->temps[index];
5865
5866    case PROGRAM_ARRAY:
5867       assert(array_id && array_id <= t->num_temp_arrays);
5868       array = array_id - 1;
5869
5870       if (ureg_dst_is_undef(t->arrays[array]))
5871          t->arrays[array] = ureg_DECL_array_temporary(
5872             t->ureg, t->array_sizes[array], TRUE);
5873
5874       return ureg_dst_array_offset(t->arrays[array], index);
5875
5876    case PROGRAM_OUTPUT:
5877       if (!array_id) {
5878          if (t->procType == PIPE_SHADER_FRAGMENT)
5879             assert(index < 2 * FRAG_RESULT_MAX);
5880          else if (t->procType == PIPE_SHADER_TESS_CTRL ||
5881                   t->procType == PIPE_SHADER_TESS_EVAL)
5882             assert(index < VARYING_SLOT_TESS_MAX);
5883          else
5884             assert(index < VARYING_SLOT_MAX);
5885
5886          assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs));
5887          assert(t->outputs[t->outputMapping[index]].File != TGSI_FILE_NULL);
5888          return t->outputs[t->outputMapping[index]];
5889       }
5890       else {
5891          struct inout_decl *decl =
5892             find_inout_array(t->output_decls,
5893                              t->num_output_decls, array_id);
5894          unsigned mesa_index = decl->mesa_index;
5895          int slot = t->outputMapping[mesa_index];
5896
5897          assert(slot != -1 && t->outputs[slot].File == TGSI_FILE_OUTPUT);
5898
5899          struct ureg_dst dst = t->outputs[slot];
5900          dst.ArrayID = array_id;
5901          return ureg_dst_array_offset(dst, index - mesa_index);
5902       }
5903
5904    case PROGRAM_ADDRESS:
5905       return t->address[index];
5906
5907    default:
5908       assert(!"unknown dst register file");
5909       return ureg_dst_undef();
5910    }
5911 }
5912
5913 static struct ureg_src
5914 translate_src(struct st_translate *t, const st_src_reg *src_reg);
5915
5916 static struct ureg_src
5917 translate_addr(struct st_translate *t, const st_src_reg *reladdr,
5918                unsigned addr_index)
5919 {
5920    if (t->need_uarl || !reladdr->is_legal_tgsi_address_operand())
5921       return ureg_src(t->address[addr_index]);
5922
5923    return translate_src(t, reladdr);
5924 }
5925
5926 /**
5927  * Create a TGSI ureg_dst register from an st_dst_reg.
5928  */
5929 static struct ureg_dst
5930 translate_dst(struct st_translate *t,
5931               const st_dst_reg *dst_reg,
5932               bool saturate)
5933 {
5934    struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index,
5935                                       dst_reg->array_id);
5936
5937    if (dst.File == TGSI_FILE_NULL)
5938       return dst;
5939
5940    dst = ureg_writemask(dst, dst_reg->writemask);
5941
5942    if (saturate)
5943       dst = ureg_saturate(dst);
5944
5945    if (dst_reg->reladdr != NULL) {
5946       assert(dst_reg->file != PROGRAM_TEMPORARY);
5947       dst = ureg_dst_indirect(dst, translate_addr(t, dst_reg->reladdr, 0));
5948    }
5949
5950    if (dst_reg->has_index2) {
5951       if (dst_reg->reladdr2)
5952          dst = ureg_dst_dimension_indirect(dst,
5953                                            translate_addr(t, dst_reg->reladdr2, 1),
5954                                            dst_reg->index2D);
5955       else
5956          dst = ureg_dst_dimension(dst, dst_reg->index2D);
5957    }
5958
5959    return dst;
5960 }
5961
/**
 * Create a TGSI ureg_src register from an st_src_reg.
 *
 * The base register is selected from the translation state according to
 * src_reg->file.  The 2D index, swizzle, absolute value, negation and
 * relative addressing recorded in \p src_reg are then applied, in that
 * order.
 *
 * \param t        translation state
 * \param src_reg  source operand to translate
 *
 * \return the translated operand, or ureg_src_undef() for an unknown
 *         register file (after asserting).
 */
static struct ureg_src
translate_src(struct st_translate *t, const st_src_reg *src_reg)
{
   struct ureg_src src;
   int index = src_reg->index;
   int double_reg2 = src_reg->double_reg2 ? 1 : 0;

   switch (src_reg->file) {
   case PROGRAM_UNDEFINED:
      /* Undefined sources read as an all-zero immediate. */
      src = ureg_imm4f(t->ureg, 0, 0, 0, 0);
      break;

   case PROGRAM_TEMPORARY:
   case PROGRAM_ARRAY:
      /* Reuse the destination lookup, which declares the register on
       * first use.
       */
      src = ureg_src(dst_register(t, src_reg->file, src_reg->index,
                                  src_reg->array_id));
      break;

   case PROGRAM_OUTPUT: {
      struct ureg_dst dst = dst_register(t, src_reg->file, src_reg->index,
                                         src_reg->array_id);
      assert(dst.WriteMask != 0);
      /* Start the swizzle at the lowest channel set in the output's write
       * mask; later channels are clamped to W (3).
       */
      unsigned shift = ffs(dst.WriteMask) - 1;
      src = ureg_swizzle(ureg_src(dst),
                         shift,
                         MIN2(shift + 1, 3),
                         MIN2(shift + 2, 3),
                         MIN2(shift + 3, 3));
      break;
   }

   case PROGRAM_UNIFORM:
      assert(src_reg->index >= 0);
      /* Indices past the declared constants read as zero. */
      src = src_reg->index < t->num_constants ?
               t->constants[src_reg->index] : ureg_imm4f(t->ureg, 0, 0, 0, 0);
      break;
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:       /* ie, immediate */
      /* With a 2D index the constant register is built directly from the
       * index; otherwise it comes from t->constants, and out-of-range
       * indices read as zero.
       */
      if (src_reg->has_index2)
         src = ureg_src_register(TGSI_FILE_CONSTANT, src_reg->index);
      else
         src = src_reg->index >= 0 && src_reg->index < t->num_constants ?
                  t->constants[src_reg->index] : ureg_imm4f(t->ureg, 0, 0, 0, 0);
      break;

   case PROGRAM_IMMEDIATE:
      assert(src_reg->index >= 0 && src_reg->index < t->num_immediates);
      src = t->immediates[src_reg->index];
      break;

   case PROGRAM_INPUT:
      /* GLSL inputs are 64-bit containers, so we have to
       * map back to the original index and add the offset after
       * mapping. */
      index -= double_reg2;
      if (!src_reg->array_id) {
         assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
         assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
         src = t->inputs[t->inputMapping[index] + double_reg2];
      }
      else {
         /* Arrayed input: address it relative to the first slot of the
          * declaration that owns this array id.
          */
         struct inout_decl *decl = find_inout_array(t->input_decls,
                                                    t->num_input_decls,
                                                    src_reg->array_id);
         unsigned mesa_index = decl->mesa_index;
         int slot = t->inputMapping[mesa_index];

         assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);

         src = t->inputs[slot];
         src.ArrayID = src_reg->array_id;
         src = ureg_src_array_offset(src, index + double_reg2 - mesa_index);
      }
      break;

   case PROGRAM_ADDRESS:
      src = ureg_src(t->address[src_reg->index]);
      break;

   case PROGRAM_SYSTEM_VALUE:
      assert(src_reg->index < (int) ARRAY_SIZE(t->systemValues));
      src = t->systemValues[src_reg->index];
      break;

   case PROGRAM_HW_ATOMIC:
      src = ureg_src_array_register(TGSI_FILE_HW_ATOMIC, src_reg->index,
                                    src_reg->array_id);
      break;

   default:
      assert(!"unknown src register file");
      return ureg_src_undef();
   }

   if (src_reg->has_index2) {
      /* 2D indexes occur with geometry shader inputs (attrib, vertex)
       * and UBO constant buffers (buffer, position).
       */
      if (src_reg->reladdr2)
         src = ureg_src_dimension_indirect(src,
                                           translate_addr(t, src_reg->reladdr2, 1),
                                           src_reg->index2D);
      else
         src = ureg_src_dimension(src, src_reg->index2D);
   }

   /* Each swizzle selector is masked down to a channel number 0..3. */
   src = ureg_swizzle(src,
                      GET_SWZ(src_reg->swizzle, 0) & 0x3,
                      GET_SWZ(src_reg->swizzle, 1) & 0x3,
                      GET_SWZ(src_reg->swizzle, 2) & 0x3,
                      GET_SWZ(src_reg->swizzle, 3) & 0x3);

   if (src_reg->abs)
      src = ureg_abs(src);

   /* Negation is applied only when the low four negate bits equal
    * NEGATE_XYZW; any other negate value is ignored here.
    */
   if ((src_reg->negate & 0xf) == NEGATE_XYZW)
      src = ureg_negate(src);

   if (src_reg->reladdr != NULL) {
      assert(src_reg->file != PROGRAM_TEMPORARY);
      src = ureg_src_indirect(src, translate_addr(t, src_reg->reladdr, 0));
   }

   return src;
}
6090
6091 static struct tgsi_texture_offset
6092 translate_tex_offset(struct st_translate *t,
6093                      const st_src_reg *in_offset)
6094 {
6095    struct tgsi_texture_offset offset;
6096    struct ureg_src src = translate_src(t, in_offset);
6097
6098    offset.File = src.File;
6099    offset.Index = src.Index;
6100    offset.SwizzleX = src.SwizzleX;
6101    offset.SwizzleY = src.SwizzleY;
6102    offset.SwizzleZ = src.SwizzleZ;
6103    offset.Padding = 0;
6104
6105    assert(!src.Indirect);
6106    assert(!src.DimIndirect);
6107    assert(!src.Dimension);
6108    assert(!src.Absolute); /* those shouldn't be used with integers anyway */
6109    assert(!src.Negate);
6110
6111    return offset;
6112 }
6113
/**
 * Emit the TGSI instruction for one glsl_to_tgsi_instruction.
 *
 * All destination and source operands are translated first.  The opcode
 * then selects how the instruction is emitted: control-flow opcodes and the
 * default case go through ureg_insn(), texture opcodes through
 * ureg_tex_insn(), and resource/atomic/store opcodes through
 * ureg_memory_insn().
 *
 * \param t     translation state (the instruction is emitted on t->ureg)
 * \param inst  the instruction to translate
 */
static void
compile_tgsi_instruction(struct st_translate *t,
                         const glsl_to_tgsi_instruction *inst)
{
   struct ureg_program *ureg = t->ureg;
   int i;
   struct ureg_dst dst[2];
   struct ureg_src src[4];
   struct tgsi_texture_offset texoffsets[MAX_GLSL_TEXTURE_OFFSET];

   int num_dst;
   int num_src;
   enum tgsi_texture_type tex_target = TGSI_TEXTURE_BUFFER;

   num_dst = num_inst_dst_regs(inst);
   num_src = num_inst_src_regs(inst);

   /* NOTE(review): dst[] and src[] are fixed-size; this relies on
    * num_inst_dst_regs()/num_inst_src_regs() leaving room for the extra
    * resource operand added below - confirm against those helpers.
    */
   for (i = 0; i < num_dst; i++)
      dst[i] = translate_dst(t,
                             &inst->dst[i],
                             inst->saturate);

   for (i = 0; i < num_src; i++)
      src[i] = translate_src(t, &inst->src[i]);

   switch (inst->op) {
   case TGSI_OPCODE_BGNLOOP:
   case TGSI_OPCODE_ELSE:
   case TGSI_OPCODE_ENDLOOP:
   case TGSI_OPCODE_IF:
   case TGSI_OPCODE_UIF:
      /* Control flow: no destinations are passed on. */
      assert(num_dst == 0);
      ureg_insn(ureg, inst->op, NULL, 0, src, num_src, inst->precise);
      return;

   case TGSI_OPCODE_TEX:
   case TGSI_OPCODE_TEX_LZ:
   case TGSI_OPCODE_TXB:
   case TGSI_OPCODE_TXD:
   case TGSI_OPCODE_TXL:
   case TGSI_OPCODE_TXP:
   case TGSI_OPCODE_TXQ:
   case TGSI_OPCODE_TXQS:
   case TGSI_OPCODE_TXF:
   case TGSI_OPCODE_TXF_LZ:
   case TGSI_OPCODE_TEX2:
   case TGSI_OPCODE_TXB2:
   case TGSI_OPCODE_TXL2:
   case TGSI_OPCODE_TG4:
   case TGSI_OPCODE_LODQ:
   case TGSI_OPCODE_SAMP2HND:
      /* The sampler (or bindless handle) is appended as the last source. */
      if (inst->resource.file == PROGRAM_SAMPLER) {
         src[num_src] = t->samplers[inst->resource.index];
      } else {
         /* Bindless samplers. */
         src[num_src] = translate_src(t, &inst->resource);
      }
      assert(src[num_src].File != TGSI_FILE_NULL);
      if (inst->resource.reladdr)
         src[num_src] =
            ureg_src_indirect(src[num_src],
                              translate_addr(t, inst->resource.reladdr, 2));
      num_src++;
      for (i = 0; i < (int)inst->tex_offset_num_offset; i++) {
         texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
      }
      tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);

      ureg_tex_insn(ureg,
                    inst->op,
                    dst, num_dst,
                    tex_target,
                    st_translate_texture_type(inst->tex_type),
                    texoffsets, inst->tex_offset_num_offset,
                    src, num_src);
      return;

   case TGSI_OPCODE_RESQ:
   case TGSI_OPCODE_LOAD:
   case TGSI_OPCODE_ATOMUADD:
   case TGSI_OPCODE_ATOMXCHG:
   case TGSI_OPCODE_ATOMCAS:
   case TGSI_OPCODE_ATOMAND:
   case TGSI_OPCODE_ATOMOR:
   case TGSI_OPCODE_ATOMXOR:
   case TGSI_OPCODE_ATOMUMIN:
   case TGSI_OPCODE_ATOMUMAX:
   case TGSI_OPCODE_ATOMIMIN:
   case TGSI_OPCODE_ATOMIMAX:
   case TGSI_OPCODE_IMG2HND:
      /* Shift the translated sources up by one so the resource can occupy
       * src[0].  Iterating from the top avoids overwriting entries that
       * have not been moved yet.
       */
      for (i = num_src - 1; i >= 0; i--)
         src[i + 1] = src[i];
      num_src++;
      if (inst->resource.file == PROGRAM_MEMORY) {
         src[0] = t->shared_memory;
      } else if (inst->resource.file == PROGRAM_BUFFER) {
         src[0] = t->buffers[inst->resource.index];
      } else if (inst->resource.file == PROGRAM_HW_ATOMIC) {
         src[0] = translate_src(t, &inst->resource);
      } else if (inst->resource.file == PROGRAM_CONSTANT) {
         assert(inst->resource.has_index2);
         src[0] = ureg_src_register(TGSI_FILE_CONSTBUF, inst->resource.index);
      } else {
         assert(inst->resource.file != PROGRAM_UNDEFINED);
         if (inst->resource.file == PROGRAM_IMAGE) {
            src[0] = t->images[inst->resource.index];
         } else {
            /* Bindless images. */
            src[0] = translate_src(t, &inst->resource);
         }
         /* Only this image path computes a texture target; the other
          * resource kinds keep the TGSI_TEXTURE_BUFFER default.
          */
         tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
      }
      if (inst->resource.reladdr)
         src[0] = ureg_src_indirect(src[0],
                                    translate_addr(t, inst->resource.reladdr, 2));
      assert(src[0].File != TGSI_FILE_NULL);
      ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
                       inst->buffer_access,
                       tex_target, inst->image_format);
      break;

   case TGSI_OPCODE_STORE:
      /* For stores the resource is the destination; dst[0] as translated
       * above is replaced, and only its write mask is taken from inst->dst[0].
       */
      if (inst->resource.file == PROGRAM_MEMORY) {
         dst[0] = ureg_dst(t->shared_memory);
      } else if (inst->resource.file == PROGRAM_BUFFER) {
         dst[0] = ureg_dst(t->buffers[inst->resource.index]);
      } else {
         if (inst->resource.file == PROGRAM_IMAGE) {
            dst[0] = ureg_dst(t->images[inst->resource.index]);
         } else {
            /* Bindless images. */
            dst[0] = ureg_dst(translate_src(t, &inst->resource));
         }
         tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
      }
      dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask);
      if (inst->resource.reladdr)
         dst[0] = ureg_dst_indirect(dst[0],
                                    translate_addr(t, inst->resource.reladdr, 2));
      assert(dst[0].File != TGSI_FILE_NULL);
      ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
                       inst->buffer_access,
                       tex_target, inst->image_format);
      break;

   default:
      /* Everything else maps one-to-one onto a plain TGSI instruction. */
      ureg_insn(ureg,
                inst->op,
                dst, num_dst,
                src, num_src, inst->precise);
      break;
   }
}
6267
6268 /**
6269  * Emit the TGSI instructions for inverting and adjusting WPOS.
6270  * This code is unavoidable because it also depends on whether
6271  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
6272  */
6273 static void
6274 emit_wpos_adjustment(struct gl_context *ctx,
6275                      struct st_translate *t,
6276                      int wpos_transform_const,
6277                      boolean invert,
6278                      GLfloat adjX, GLfloat adjY[2])
6279 {
6280    struct ureg_program *ureg = t->ureg;
6281
6282    assert(wpos_transform_const >= 0);
6283
6284    /* Fragment program uses fragment position input.
6285     * Need to replace instances of INPUT[WPOS] with temp T
6286     * where T = INPUT[WPOS] is inverted by Y.
6287     */
6288    struct ureg_src wpostrans = ureg_DECL_constant(ureg, wpos_transform_const);
6289    struct ureg_dst wpos_temp = ureg_DECL_temporary(ureg);
6290    struct ureg_src *wpos =
6291       ctx->Const.GLSLFragCoordIsSysVal ?
6292          &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
6293          &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
6294    struct ureg_src wpos_input = *wpos;
6295
6296    /* First, apply the coordinate shift: */
6297    if (adjX || adjY[0] || adjY[1]) {
6298       if (adjY[0] != adjY[1]) {
6299          /* Adjust the y coordinate by adjY[1] or adjY[0] respectively
6300           * depending on whether inversion is actually going to be applied
6301           * or not, which is determined by testing against the inversion
6302           * state variable used below, which will be either +1 or -1.
6303           */
6304          struct ureg_dst adj_temp = ureg_DECL_local_temporary(ureg);
6305
6306          ureg_CMP(ureg, adj_temp,
6307                   ureg_scalar(wpostrans, invert ? 2 : 0),
6308                   ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f),
6309                   ureg_imm4f(ureg, adjX, adjY[1], 0.0f, 0.0f));
6310          ureg_ADD(ureg, wpos_temp, wpos_input, ureg_src(adj_temp));
6311       } else {
6312          ureg_ADD(ureg, wpos_temp, wpos_input,
6313                   ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f));
6314       }
6315       wpos_input = ureg_src(wpos_temp);
6316    } else {
6317       /* MOV wpos_temp, input[wpos]
6318        */
6319       ureg_MOV(ureg, wpos_temp, wpos_input);
6320    }
6321
6322    /* Now the conditional y flip: STATE_FB_WPOS_Y_TRANSFORM.xy/zw will be
6323     * inversion/identity, or the other way around if we're drawing to an FBO.
6324     */
6325    if (invert) {
6326       /* MAD wpos_temp.y, wpos_input, wpostrans.xxxx, wpostrans.yyyy
6327        */
6328       ureg_MAD(ureg,
6329                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y),
6330                wpos_input,
6331                ureg_scalar(wpostrans, 0),
6332                ureg_scalar(wpostrans, 1));
6333    } else {
6334       /* MAD wpos_temp.y, wpos_input, wpostrans.zzzz, wpostrans.wwww
6335        */
6336       ureg_MAD(ureg,
6337                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y),
6338                wpos_input,
6339                ureg_scalar(wpostrans, 2),
6340                ureg_scalar(wpostrans, 3));
6341    }
6342
6343    /* Use wpos_temp as position input from here on:
6344     */
6345    *wpos = ureg_src(wpos_temp);
6346 }
6347
6348
/**
 * Emit fragment position/coordinate code.
 *
 * Compares the origin and pixel-center conventions requested by the
 * fragment program with what the pipe driver reports through get_param(),
 * sets the matching TGSI_PROPERTY_FS_COORD_* properties on \p ureg, and
 * calls emit_wpos_adjustment() with the resulting bias and inversion flag.
 *
 * \param st                    state tracker context (driver screen and GL
 *                              context are taken from it)
 * \param t                     translation state
 * \param program               fragment program; its OriginUpperLeft and
 *                              PixelCenterInteger fields are read
 * \param ureg                  TGSI program receiving the properties
 * \param wpos_transform_const  constant index forwarded to
 *                              emit_wpos_adjustment()
 */
static void
emit_wpos(struct st_context *st,
          struct st_translate *t,
          const struct gl_program *program,
          struct ureg_program *ureg,
          int wpos_transform_const)
{
   struct pipe_screen *pscreen = st->pipe->screen;
   GLfloat adjX = 0.0f;
   GLfloat adjY[2] = { 0.0f, 0.0f };
   boolean invert = FALSE;

   /* Query the pixel center conventions supported by the pipe driver and set
    * adjX, adjY to help out if it cannot handle the requested one internally.
    *
    * The bias of the y-coordinate depends on whether y-inversion takes place
    * (adjY[1]) or not (adjY[0]), which is in turn dependent on whether we are
    * drawing to an FBO (causes additional inversion), and whether the pipe
    * driver origin and the requested origin differ (the latter condition is
    * stored in the 'invert' variable).
    *
    * For height = 100 (i = integer, h = half-integer, l = lower, u = upper):
    *
    * center shift only:
    * i -> h: +0.5
    * h -> i: -0.5
    *
    * inversion only:
    * l,i -> u,i: ( 0.0 + 1.0) * -1 + 100 = 99
    * l,h -> u,h: ( 0.5 + 0.0) * -1 + 100 = 99.5
    * u,i -> l,i: (99.0 + 1.0) * -1 + 100 = 0
    * u,h -> l,h: (99.5 + 0.0) * -1 + 100 = 0.5
    *
    * inversion and center shift:
    * l,i -> u,h: ( 0.0 + 0.5) * -1 + 100 = 99.5
    * l,h -> u,i: ( 0.5 + 0.5) * -1 + 100 = 99
    * u,i -> l,h: (99.0 + 0.5) * -1 + 100 = 0.5
    * u,h -> l,i: (99.5 + 0.5) * -1 + 100 = 0
    */
   if (program->OriginUpperLeft) {
      /* Fragment shader wants origin in upper-left */
      if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT)) {
         /* the driver supports upper-left origin */
      }
      else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT)) {
         /* the driver supports lower-left origin, need to invert Y */
         ureg_property(ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
         invert = TRUE;
      }
      else
         assert(0);
   }
   else {
      /* Fragment shader wants origin in lower-left */
      if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT))
         /* the driver supports lower-left origin */
         ureg_property(ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
      else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT))
         /* the driver supports upper-left origin, need to invert Y */
         invert = TRUE;
      else
         assert(0);
   }

   if (program->PixelCenterInteger) {
      /* Fragment shader wants pixel center integer */
      if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
         /* the driver supports pixel center integer */
         /* Y bias used when the flip takes place; see the "inversion only"
          * rows for integer centers in the table above.
          */
         adjY[1] = 1.0f;
         ureg_property(ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
      }
      else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
         /* the driver supports pixel center half integer, need to bias X,Y */
         adjX = -0.5f;
         adjY[0] = -0.5f;
         adjY[1] = 0.5f;
      }
      else
         assert(0);
   }
   else {
      /* Fragment shader wants pixel center half integer */
      if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
         /* the driver supports pixel center half integer */
      }
      else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
         /* the driver supports pixel center integer, need to bias X,Y */
         adjX = adjY[0] = adjY[1] = 0.5f;
         ureg_property(ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
      }
      else
         assert(0);
   }

   /* we invert after adjustment so that we avoid the MOV to temporary,
    * and reuse the adjustment ADD instead */
   emit_wpos_adjustment(st->ctx, t, wpos_transform_const, invert, adjX, adjY);
}
6454
6455 /**
6456  * OpenGL's fragment gl_FrontFace input is 1 for front-facing, 0 for back.
6457  * TGSI uses +1 for front, -1 for back.
6458  * This function converts the TGSI value to the GL value.  Simply clamping/
6459  * saturating the value to [0,1] does the job.
6460  */
6461 static void
6462 emit_face_var(struct gl_context *ctx, struct st_translate *t)
6463 {
6464    struct ureg_program *ureg = t->ureg;
6465    struct ureg_dst face_temp = ureg_DECL_temporary(ureg);
6466    struct ureg_src face_input = t->inputs[t->inputMapping[VARYING_SLOT_FACE]];
6467
6468    if (ctx->Const.NativeIntegers) {
6469       ureg_FSGE(ureg, face_temp, face_input, ureg_imm1f(ureg, 0));
6470    }
6471    else {
6472       /* MOV_SAT face_temp, input[face] */
6473       ureg_MOV(ureg, ureg_saturate(face_temp), face_input);
6474    }
6475
6476    /* Use face_temp as face input from here on: */
6477    t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
6478 }
6479
6480 static void
6481 emit_compute_block_size(const struct gl_program *prog,
6482                         struct ureg_program *ureg) {
6483    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH,
6484                  prog->info.cs.local_size[0]);
6485    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT,
6486                  prog->info.cs.local_size[1]);
6487    ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH,
6488                  prog->info.cs.local_size[2]);
6489 }
6490
6491 struct sort_inout_decls {
6492    bool operator()(const struct inout_decl &a, const struct inout_decl &b) const {
6493       return mapping[a.mesa_index] < mapping[b.mesa_index];
6494    }
6495
6496    const ubyte *mapping;
6497 };
6498
6499 /* Sort the given array of decls by the corresponding slot (TGSI file index).
6500  *
6501  * This is for the benefit of older drivers which are broken when the
6502  * declarations aren't sorted in this way.
6503  */
6504 static void
6505 sort_inout_decls_by_slot(struct inout_decl *decls,
6506                          unsigned count,
6507                          const ubyte mapping[])
6508 {
6509    sort_inout_decls sorter;
6510    sorter.mapping = mapping;
6511    std::sort(decls, decls + count, sorter);
6512 }
6513
6514 static enum tgsi_interpolate_mode
6515 st_translate_interp(enum glsl_interp_mode glsl_qual, GLuint varying)
6516 {
6517    switch (glsl_qual) {
6518    case INTERP_MODE_NONE:
6519       if (varying == VARYING_SLOT_COL0 || varying == VARYING_SLOT_COL1)
6520          return TGSI_INTERPOLATE_COLOR;
6521       return TGSI_INTERPOLATE_PERSPECTIVE;
6522    case INTERP_MODE_SMOOTH:
6523       return TGSI_INTERPOLATE_PERSPECTIVE;
6524    case INTERP_MODE_FLAT:
6525       return TGSI_INTERPOLATE_CONSTANT;
6526    case INTERP_MODE_NOPERSPECTIVE:
6527       return TGSI_INTERPOLATE_LINEAR;
6528    default:
6529       assert(0 && "unexpected interp mode in st_translate_interp()");
6530       return TGSI_INTERPOLATE_PERSPECTIVE;
6531    }
6532 }
6533
6534 /**
6535  * Translate intermediate IR (glsl_to_tgsi_instruction) to TGSI format.
6536  * \param program  the program to translate
6537  * \param numInputs  number of input registers used
6538  * \param inputMapping  maps Mesa fragment program inputs to TGSI generic
6539  *                      input indexes
6540  * \param inputSemanticName  the TGSI_SEMANTIC flag for each input
6541  * \param inputSemanticIndex  the semantic index (ex: which texcoord) for
6542  *                            each input
6543  * \param interpMode  the TGSI_INTERPOLATE_LINEAR/PERSP mode for each input
6544  * \param numOutputs  number of output registers used
6545  * \param outputMapping  maps Mesa fragment program outputs to TGSI
6546  *                       generic outputs
6547  * \param outputSemanticName  the TGSI_SEMANTIC flag for each output
6548  * \param outputSemanticIndex  the semantic index (ex: which texcoord) for
6549  *                             each output
6550  *
6551  * \return  PIPE_OK or PIPE_ERROR_OUT_OF_MEMORY
6552  */
6553 extern "C" enum pipe_error
6554 st_translate_program(
6555    struct gl_context *ctx,
6556    enum pipe_shader_type procType,
6557    struct ureg_program *ureg,
6558    glsl_to_tgsi_visitor *program,
6559    const struct gl_program *proginfo,
6560    GLuint numInputs,
6561    const ubyte inputMapping[],
6562    const ubyte inputSlotToAttr[],
6563    const ubyte inputSemanticName[],
6564    const ubyte inputSemanticIndex[],
6565    const ubyte interpMode[],
6566    GLuint numOutputs,
6567    const ubyte outputMapping[],
6568    const ubyte outputSemanticName[],
6569    const ubyte outputSemanticIndex[])
6570 {
6571    struct pipe_screen *screen = st_context(ctx)->pipe->screen;
6572    struct st_translate *t;
6573    unsigned i;
6574    struct gl_program_constants *frag_const =
6575       &ctx->Const.Program[MESA_SHADER_FRAGMENT];
6576    enum pipe_error ret = PIPE_OK;
6577
        /* Overview: allocate the scratch st_translate state, declare inputs,
         * outputs, system values, constants, immediates, samplers, buffers and
         * images on `ureg`, then emit every instruction of `program`.  Every
         * exit path goes through the `out` label, which frees the scratch
         * state again.
         */
6578    assert(numInputs <= ARRAY_SIZE(t->inputs));
6579    assert(numOutputs <= ARRAY_SIZE(t->outputs));
6580
6581    ASSERT_BITFIELD_SIZE(st_src_reg, type, GLSL_TYPE_ERROR);
6582    ASSERT_BITFIELD_SIZE(st_dst_reg, type, GLSL_TYPE_ERROR);
6583    ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, tex_type, GLSL_TYPE_ERROR);
6584    ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, image_format, PIPE_FORMAT_COUNT);
6585    ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, tex_target,
6586                         (gl_texture_index) (NUM_TEXTURE_TARGETS - 1));
6587    ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, image_format,
6588                         (enum pipe_format) (PIPE_FORMAT_COUNT - 1));
6589    ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, op,
6590                         (enum tgsi_opcode) (TGSI_OPCODE_LAST - 1));
6591
6592    t = CALLOC_STRUCT(st_translate);
6593    if (!t) {
6594       ret = PIPE_ERROR_OUT_OF_MEMORY;
6595       goto out;
6596    }
6597
6598    t->procType = procType;
6599    t->need_uarl = !screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
6600    t->inputMapping = inputMapping;
6601    t->outputMapping = outputMapping;
6602    t->ureg = ureg;
6603    t->num_temp_arrays = program->next_array;
6604    if (t->num_temp_arrays) {
6605       t->arrays = (struct ureg_dst*)
6606                   calloc(t->num_temp_arrays, sizeof(t->arrays[0]));
           /* Without this check a failed allocation would leave t->arrays
            * NULL while t->num_temp_arrays is non-zero.
            */
           if (!t->arrays) {
              ret = PIPE_ERROR_OUT_OF_MEMORY;
              goto out;
           }
        }
6607
6608    /*
6609     * Declare input attributes.
6610     */
6611    switch (procType) {
6612    case PIPE_SHADER_FRAGMENT:
6613    case PIPE_SHADER_GEOMETRY:
6614    case PIPE_SHADER_TESS_EVAL:
6615    case PIPE_SHADER_TESS_CTRL:
6616       sort_inout_decls_by_slot(program->inputs, program->num_inputs, inputMapping);
6617
6618       for (i = 0; i < program->num_inputs; ++i) {
6619          struct inout_decl *decl = &program->inputs[i];
6620          unsigned slot = inputMapping[decl->mesa_index];
6621          struct ureg_src src;
6622          ubyte tgsi_usage_mask = decl->usage_mask;
6623
              /* For 64-bit base types the usage mask is widened:
               * 1 becomes XY, 2 becomes ZW, anything else XYZW.
               */
6624          if (glsl_base_type_is_64bit(decl->base_type)) {
6625             if (tgsi_usage_mask == 1)
6626                tgsi_usage_mask = TGSI_WRITEMASK_XY;
6627             else if (tgsi_usage_mask == 2)
6628                tgsi_usage_mask = TGSI_WRITEMASK_ZW;
6629             else
6630                tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
6631          }
6632
6633          enum tgsi_interpolate_mode interp_mode = TGSI_INTERPOLATE_CONSTANT;
6634          enum tgsi_interpolate_loc interp_location = TGSI_INTERPOLATE_LOC_CENTER;
6635          if (procType == PIPE_SHADER_FRAGMENT) {
6636             assert(interpMode);
                 /* TGSI_INTERPOLATE_COUNT in interpMode[] acts as "not
                  * specified" here: the mode is then derived from the
                  * declaration via st_translate_interp().
                  */
6637             interp_mode = interpMode[slot] != TGSI_INTERPOLATE_COUNT ?
6638                (enum tgsi_interpolate_mode) interpMode[slot] :
6639                st_translate_interp(decl->interp, inputSlotToAttr[slot]);
6640
6641             interp_location = (enum tgsi_interpolate_loc) decl->interp_loc;
6642          }
6643
6644          src = ureg_DECL_fs_input_cyl_centroid_layout(ureg,
6645                   (enum tgsi_semantic) inputSemanticName[slot],
6646                   inputSemanticIndex[slot],
6647                   interp_mode, 0, interp_location, slot, tgsi_usage_mask,
6648                   decl->array_id, decl->size);
6649
              /* Fill every slot covered by this declaration, skipping slots
               * that already hold an input register.
               */
6650          for (unsigned j = 0; j < decl->size; ++j) {
6651             if (t->inputs[slot + j].File != TGSI_FILE_INPUT) {
6652                /* The ArrayID is set up in dst_register */
6653                t->inputs[slot + j] = src;
6654                t->inputs[slot + j].ArrayID = 0;
6655                t->inputs[slot + j].Index += j;
6656             }
6657          }
6658       }
6659       break;
6660    case PIPE_SHADER_VERTEX:
6661       for (i = 0; i < numInputs; i++) {
6662          t->inputs[i] = ureg_DECL_vs_input(ureg, i);
6663       }
6664       break;
6665    case PIPE_SHADER_COMPUTE:
6666       break;
6667    default:
6668       assert(0);
6669    }
6670
6671    /*
6672     * Declare output attributes.
6673     */
6674    switch (procType) {
6675    case PIPE_SHADER_FRAGMENT:
6676    case PIPE_SHADER_COMPUTE:
6677       break;
6678    case PIPE_SHADER_GEOMETRY:
6679    case PIPE_SHADER_TESS_EVAL:
6680    case PIPE_SHADER_TESS_CTRL:
6681    case PIPE_SHADER_VERTEX:
6682       sort_inout_decls_by_slot(program->outputs, program->num_outputs, outputMapping);
6683
6684       for (i = 0; i < program->num_outputs; ++i) {
6685          struct inout_decl *decl = &program->outputs[i];
6686          unsigned slot = outputMapping[decl->mesa_index];
6687          struct ureg_dst dst;
6688          ubyte tgsi_usage_mask = decl->usage_mask;
6689
              /* Same 64-bit mask widening as for the inputs above. */
6690          if (glsl_base_type_is_64bit(decl->base_type)) {
6691             if (tgsi_usage_mask == 1)
6692                tgsi_usage_mask = TGSI_WRITEMASK_XY;
6693             else if (tgsi_usage_mask == 2)
6694                tgsi_usage_mask = TGSI_WRITEMASK_ZW;
6695             else
6696                tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
6697          }
6698
6699          dst = ureg_DECL_output_layout(ureg,
6700                      (enum tgsi_semantic) outputSemanticName[slot],
6701                      outputSemanticIndex[slot],
6702                      decl->gs_out_streams,
6703                      slot, tgsi_usage_mask, decl->array_id, decl->size, decl->invariant);
6704          dst.Invariant = decl->invariant;
6705          for (unsigned j = 0; j < decl->size; ++j) {
6706             if (t->outputs[slot + j].File != TGSI_FILE_OUTPUT) {
6707                /* The ArrayID is set up in dst_register */
6708                t->outputs[slot + j] = dst;
6709                t->outputs[slot + j].ArrayID = 0;
6710                t->outputs[slot + j].Index += j;
6711                t->outputs[slot + j].Invariant = decl->invariant;
6712             }
6713          }
6714       }
6715       break;
6716    default:
6717       assert(0);
6718    }
6719
6720    if (procType == PIPE_SHADER_FRAGMENT) {
6721       if (program->shader->Program->info.fs.early_fragment_tests ||
6722           program->shader->Program->info.fs.post_depth_coverage) {
6723          ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1);
6724
6725          if (program->shader->Program->info.fs.post_depth_coverage)
6726             ureg_property(ureg, TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE, 1);
6727       }
6728
6729       if (proginfo->info.inputs_read & VARYING_BIT_POS) {
6730           /* Must do this after setting up t->inputs. */
6731           emit_wpos(st_context(ctx), t, proginfo, ureg,
6732                     program->wpos_transform_const);
6733       }
6734
6735       if (proginfo->info.inputs_read & VARYING_BIT_FACE)
6736          emit_face_var(ctx, t);
6737
           /* Fragment outputs are declared here, one per output semantic,
            * rather than in the generic output switch above.
            */
6738       for (i = 0; i < numOutputs; i++) {
6739          switch (outputSemanticName[i]) {
6740          case TGSI_SEMANTIC_POSITION:
6741             t->outputs[i] = ureg_DECL_output(ureg,
6742                                              TGSI_SEMANTIC_POSITION, /* Z/Depth */
6743                                              outputSemanticIndex[i]);
6744             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Z);
6745             break;
6746          case TGSI_SEMANTIC_STENCIL:
6747             t->outputs[i] = ureg_DECL_output(ureg,
6748                                              TGSI_SEMANTIC_STENCIL, /* Stencil */
6749                                              outputSemanticIndex[i]);
6750             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Y);
6751             break;
6752          case TGSI_SEMANTIC_COLOR:
6753             t->outputs[i] = ureg_DECL_output(ureg,
6754                                              TGSI_SEMANTIC_COLOR,
6755                                              outputSemanticIndex[i]);
6756             break;
6757          case TGSI_SEMANTIC_SAMPLEMASK:
6758             t->outputs[i] = ureg_DECL_output(ureg,
6759                                              TGSI_SEMANTIC_SAMPLEMASK,
6760                                              outputSemanticIndex[i]);
6761             /* TODO: If we ever support more than 32 samples, this will have
6762              * to become an array.
6763              */
6764             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
6765             break;
6766          default:
6767             assert(!"fragment shader outputs must be POSITION/STENCIL/COLOR");
6768             ret = PIPE_ERROR_BAD_INPUT;
6769             goto out;
6770          }
6771       }
6772    }
6773    else if (procType == PIPE_SHADER_VERTEX) {
6774       for (i = 0; i < numOutputs; i++) {
6775          if (outputSemanticName[i] == TGSI_SEMANTIC_FOG) {
6776             /* force register to contain a fog coordinate in the form (F, 0, 0, 1). */
6777             ureg_MOV(ureg,
6778                      ureg_writemask(t->outputs[i], TGSI_WRITEMASK_YZW),
6779                      ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
6780             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X);
6781          }
6782       }
6783    }
6784
6785    if (procType == PIPE_SHADER_COMPUTE) {
6786       emit_compute_block_size(proginfo, ureg);
6787    }
6788
6789    /* Declare address register.
6790     */
6791    if (program->num_address_regs > 0) {
6792       assert(program->num_address_regs <= 3);
6793       for (int i = 0; i < program->num_address_regs; i++)
6794          t->address[i] = ureg_DECL_address(ureg);
6795    }
6796
6797    /* Declare misc input registers
6798     */
6799    {
6800       GLbitfield64 sysInputs = proginfo->info.system_values_read;
6801
           /* Each handled bit is cleared below, so the loop ends once all
            * system values that were read have been declared.
            */
6802       for (i = 0; sysInputs; i++) {
6803          if (sysInputs & (1ull << i)) {
6804             enum tgsi_semantic semName = _mesa_sysval_to_semantic(i);
6805
6806             t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
6807
6808             if (semName == TGSI_SEMANTIC_INSTANCEID ||
6809                 semName == TGSI_SEMANTIC_VERTEXID) {
6810                /* From Gallium perspective, these system values are always
6811                 * integer, and require native integer support.  However, if
6812                 * native integer is supported on the vertex stage but not the
6813                 * pixel stage (e.g, i915g + draw), Mesa will generate IR that
6814                 * assumes these system values are floats. To resolve the
6815                 * inconsistency, we insert a U2F.
6816                 */
6817                struct st_context *st = st_context(ctx);
6818                struct pipe_screen *pscreen = st->pipe->screen;
6819                assert(procType == PIPE_SHADER_VERTEX);
6820                assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
6821                (void) pscreen;
6822                if (!ctx->Const.NativeIntegers) {
6823                   struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
6824                   ureg_U2F(t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X),
6825                            t->systemValues[i]);
6826                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
6827                }
6828             }
6829
6830             if (procType == PIPE_SHADER_FRAGMENT &&
6831                 semName == TGSI_SEMANTIC_POSITION)
6832                emit_wpos(st_context(ctx), t, proginfo, ureg,
6833                          program->wpos_transform_const);
6834
6835             sysInputs &= ~(1ull << i);
6836          }
6837       }
6838    }
6839
6840    t->array_sizes = program->array_sizes;
6841    t->input_decls = program->inputs;
6842    t->num_input_decls = program->num_inputs;
6843    t->output_decls = program->outputs;
6844    t->num_output_decls = program->num_outputs;
6845
6846    /* Emit constants and uniforms.  TGSI uses a single index space for these,
6847     * so we put all the translated regs in t->constants.
6848     */
6849    if (proginfo->Parameters) {
6850       t->constants = (struct ureg_src *)
6851          calloc(proginfo->Parameters->NumParameters, sizeof(t->constants[0]));
6852       if (t->constants == NULL) {
6853          ret = PIPE_ERROR_OUT_OF_MEMORY;
6854          goto out;
6855       }
6856       t->num_constants = proginfo->Parameters->NumParameters;
6857
6858       for (i = 0; i < proginfo->Parameters->NumParameters; i++) {
6859          unsigned pvo = proginfo->Parameters->ParameterValueOffset[i];
6860
6861          switch (proginfo->Parameters->Parameters[i].Type) {
6862          case PROGRAM_STATE_VAR:
6863          case PROGRAM_UNIFORM:
6864             t->constants[i] = ureg_DECL_constant(ureg, i);
6865             break;
6866
6867          /* Emit immediates for PROGRAM_CONSTANT only when there's no indirect
6868           * addressing of the const buffer.
6869           * FIXME: Be smarter and recognize param arrays:
6870           * indirect addressing is only valid within the referenced
6871           * array.
6872           */
6873          case PROGRAM_CONSTANT:
6874             if (program->indirect_addr_consts)
6875                t->constants[i] = ureg_DECL_constant(ureg, i);
6876             else
6877                t->constants[i] = emit_immediate(t,
6878                                                 proginfo->Parameters->ParameterValues + pvo,
6879                                                 proginfo->Parameters->Parameters[i].DataType,
6880                                                 4);
6881             break;
6882          default:
6883             break;
6884          }
6885       }
6886    }
6887
        /* Declare one 2D constant range per UBO; the range covers the buffer
         * size rounded up to whole 16-byte vectors, and UBO i is declared
         * with second index i + 1.
         */
6888    for (i = 0; i < proginfo->info.num_ubos; i++) {
6889       unsigned size = proginfo->sh.UniformBlocks[i]->UniformBufferSize;
6890       unsigned num_const_vecs = (size + 15) / 16;
6891       unsigned first, last;
6892       assert(num_const_vecs > 0);
6893       first = 0;
6894       last = num_const_vecs > 0 ? num_const_vecs - 1 : 0;
6895       ureg_DECL_constant2D(t->ureg, first, last, i + 1);
6896    }
6897
6898    /* Emit immediate values.
6899     */
6900    t->immediates = (struct ureg_src *)
6901       calloc(program->num_immediates, sizeof(struct ureg_src));
6902    if (t->immediates == NULL) {
6903       ret = PIPE_ERROR_OUT_OF_MEMORY;
6904       goto out;
6905    }
6906    t->num_immediates = program->num_immediates;
6907
6908    i = 0;
6909    foreach_in_list(immediate_storage, imm, &program->immediates) {
6910       assert(i < program->num_immediates);
6911       t->immediates[i++] = emit_immediate(t, imm->values, imm->type, imm->size32);
6912    }
6913    assert(i == program->num_immediates);
6914
6915    /* texture samplers */
6916    for (i = 0; i < frag_const->MaxTextureImageUnits; i++) {
6917       if (program->samplers_used & (1u << i)) {
6918          enum tgsi_return_type type =
6919             st_translate_texture_type(program->sampler_types[i]);
6920
6921          t->samplers[i] = ureg_DECL_sampler(ureg, i);
6922
6923          ureg_DECL_sampler_view(ureg, i, program->sampler_targets[i],
6924                                 type, type, type, type);
6925       }
6926    }
6927
6928    /* Declare atomic and shader storage buffers. */
6929    {
6930       struct gl_program *prog = program->prog;
6931
6932       if (!st_context(ctx)->has_hw_atomics) {
6933          for (i = 0; i < prog->info.num_abos; i++) {
6934             unsigned index = prog->sh.AtomicBuffers[i]->Binding;
6935             assert(index < frag_const->MaxAtomicBuffers);
6936             t->buffers[index] = ureg_DECL_buffer(ureg, index, true);
6937          }
6938       } else {
6939          for (i = 0; i < program->num_atomics; i++) {
6940             struct hwatomic_decl *ainfo = &program->atomic_info[i];
6941             gl_uniform_storage *uni_storage = &prog->sh.data->UniformStorage[ainfo->location];
6942             int base = uni_storage->offset / ATOMIC_COUNTER_SIZE;
6943             ureg_DECL_hw_atomic(ureg, base, base + ainfo->size - 1, ainfo->binding,
6944                                 ainfo->array_id);
6945          }
6946       }
6947
6948       assert(prog->info.num_ssbos <= frag_const->MaxShaderStorageBlocks);
6949       for (i = 0; i < prog->info.num_ssbos; i++) {
6950          unsigned index = i;
              /* Without HW atomics the atomic buffers occupy the first
               * MaxAtomicBuffers buffer slots, so SSBOs are placed after them.
               */
6951          if (!st_context(ctx)->has_hw_atomics)
6952             index += frag_const->MaxAtomicBuffers;
6953
6954          t->buffers[index] = ureg_DECL_buffer(ureg, index, false);
6955       }
6956    }
6957
6958    if (program->use_shared_memory)
6959       t->shared_memory = ureg_DECL_memory(ureg, TGSI_MEMORY_TYPE_SHARED);
6960
6961    for (i = 0; i < program->shader->Program->info.num_images; i++) {
           /* Unsigned shift, matching the samplers_used test above. */
6962       if (program->images_used & (1u << i)) {
6963          t->images[i] = ureg_DECL_image(ureg, i,
6964                                         program->image_targets[i],
6965                                         program->image_formats[i],
6966                                         program->image_wr[i],
6967                                         false);
6968       }
6969    }
6970
6971    /* Emit each instruction in turn:
6972     */
6973    foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions)
6974       compile_tgsi_instruction(t, inst);
6975
6976    /* Set the next shader stage hint for VS and TES. */
6977    switch (procType) {
6978    case PIPE_SHADER_VERTEX:
6979    case PIPE_SHADER_TESS_EVAL:
6980       if (program->shader_program->SeparateShader)
6981          break;
6982
           /* The first linked stage after this one, up to and including the
            * fragment stage, is reported as the next processor.
            */
6983       for (i = program->shader->Stage+1; i <= MESA_SHADER_FRAGMENT; i++) {
6984          if (program->shader_program->_LinkedShaders[i]) {
6985             ureg_set_next_shader_processor(
6986                   ureg, pipe_shader_type_from_mesa((gl_shader_stage)i));
6987             break;
6988          }
6989       }
6990       break;
6991    default:
6992       ; /* nothing - silence compiler warning */
6993    }
6994
6995 out:
        /* Release the scratch translation state (also reached on error). */
6996    if (t) {
6997       free(t->arrays);
6998       free(t->temps);
6999       free(t->constants);
7000       t->num_constants = 0;
7001       free(t->immediates);
7002       t->num_immediates = 0;
7003       FREE(t);
7004    }
7005
7006    return ret;
7007 }
7008 /* ----------------------------- End TGSI code ------------------------------ */
7009
7010
7011 /**
7012  * Convert a shader's GLSL IR into a Mesa gl_program, although without
7013  * generating Mesa IR.
      *
      * A glsl_to_tgsi_visitor is created, run over the shader's IR, optimized,
      * and finally stored in the stage-specific program struct
      * (its glsl_to_tgsi field).  The shader's GLSL IR is freed along the way.
      *
      * \param ctx             GL context (compiler options, constants, screen)
      * \param shader_program  the program object the linked shader belongs to
      * \param shader          the linked shader stage to convert; its
      *                        Program gets a fresh parameter list and its ir
      *                        is freed and set to NULL
      *
      * \return  shader->Program on success; NULL if the program's LinkStatus
      *          is false after uniform storage association (the visitor is
      *          freed and the shader->Program reference dropped), or if the
      *          shader stage is not one of the handled stages.
7014  */
7015 static struct gl_program *
7016 get_mesa_program_tgsi(struct gl_context *ctx,
7017                       struct gl_shader_program *shader_program,
7018                       struct gl_linked_shader *shader)
7019 {
7020    glsl_to_tgsi_visitor* v;
7021    struct gl_program *prog;
7022    struct gl_shader_compiler_options *options =
7023          &ctx->Const.ShaderCompilerOptions[shader->Stage];
7024    struct pipe_screen *pscreen = ctx->st->pipe->screen;
7025    enum pipe_shader_type ptarget = pipe_shader_type_from_mesa(shader->Stage);
7026    unsigned skip_merge_registers;
7027
7028    validate_ir_tree(shader->ir);
7029
7030    prog = shader->Program;
7031
        /* Set up the visitor with the context, program and shader it works on. */
7032    prog->Parameters = _mesa_new_parameter_list();
7033    v = new glsl_to_tgsi_visitor();
7034    v->ctx = ctx;
7035    v->prog = prog;
7036    v->shader_program = shader_program;
7037    v->shader = shader;
7038    v->options = options;
7039    v->native_integers = ctx->Const.NativeIntegers;
7040
        /* Cache the driver capabilities the visitor consults. */
7041    v->have_sqrt = pscreen->get_shader_param(pscreen, ptarget,
7042                                             PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
7043    v->have_fma = pscreen->get_shader_param(pscreen, ptarget,
7044                                            PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED);
7045    v->has_tex_txf_lz = pscreen->get_param(pscreen,
7046                                           PIPE_CAP_TGSI_TEX_TXF_LZ);
7047    v->need_uarl = !pscreen->get_param(pscreen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
7048
7049    v->variables = _mesa_hash_table_create(v->mem_ctx, _mesa_hash_pointer,
7050                                           _mesa_key_pointer_equal);
7051    skip_merge_registers =
7052       pscreen->get_shader_param(pscreen, ptarget,
7053                                 PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS);
7054
7055    _mesa_generate_parameters_list_for_uniforms(ctx, shader_program, shader,
7056                                                prog->Parameters);
7057
7058    /* Remove reads from output registers. */
7059    if (!pscreen->get_param(pscreen, PIPE_CAP_TGSI_CAN_READ_OUTPUTS))
7060       lower_output_reads(shader->Stage, shader->ir);
7061
7062    /* Emit intermediate IR for main(). */
7063    visit_exec_list(shader->ir, v);
7064
7065 #if 0
7066    /* Print out some information (for debugging purposes) used by the
7067     * optimization passes. */
7068    {
7069       int i;
7070       int *first_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
7071       int *first_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
7072       int *last_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
7073       int *last_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
7074
7075       for (i = 0; i < v->next_temp; i++) {
7076          first_writes[i] = -1;
7077          first_reads[i] = -1;
7078          last_writes[i] = -1;
7079          last_reads[i] = -1;
7080       }
7081       v->get_first_temp_read(first_reads);
7082       v->get_last_temp_read_first_temp_write(last_reads, first_writes);
7083       v->get_last_temp_write(last_writes);
7084       for (i = 0; i < v->next_temp; i++)
7085          printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, first_reads[i],
7086                 first_writes[i],
7087                 last_reads[i],
7088                 last_writes[i]);
7089       ralloc_free(first_writes);
7090       ralloc_free(first_reads);
7091       ralloc_free(last_writes);
7092       ralloc_free(last_reads);
7093    }
7094 #endif
7095
7096    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
7097    v->simplify_cmp();
7098    v->copy_propagate();
7099
        /* Repeat dead code elimination until it reports no further progress. */
7100    while (v->eliminate_dead_code());
7101
7102    v->merge_two_dsts();
7103
        /* The array splitting and register merging passes are skipped when
         * the driver reports PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS.
         */
7104    if (!skip_merge_registers) {
7105       v->split_arrays();
7106       v->copy_propagate();
7107       while (v->eliminate_dead_code());
7108
7109       v->merge_registers();
7110       v->copy_propagate();
7111       while (v->eliminate_dead_code());
7112    }
7113
7114    v->renumber_registers();
7115
7116    /* Write the END instruction. */
7117    v->emit_asm(NULL, TGSI_OPCODE_END);
7118
        /* Optional dump of the linked GLSL IR, enabled by the GLSL_DUMP flag. */
7119    if (ctx->_Shader->Flags & GLSL_DUMP) {
7120       _mesa_log("\n");
7121       _mesa_log("GLSL IR for linked %s program %d:\n",
7122              _mesa_shader_stage_to_string(shader->Stage),
7123              shader_program->Name);
7124       _mesa_print_ir(_mesa_get_log_file(), shader->ir, NULL);
7125       _mesa_log("\n\n");
7126    }
7127
        /* These steps still read shader->ir, which is freed right below. */
7128    do_set_program_inouts(shader->ir, prog, shader->Stage);
7129    _mesa_copy_linked_program_data(shader_program, shader);
7130    shrink_array_declarations(v->inputs, v->num_inputs,
7131                              &prog->info.inputs_read,
7132                              prog->DualSlotInputs,
7133                              &prog->info.patch_inputs_read);
7134    shrink_array_declarations(v->outputs, v->num_outputs,
7135                              &prog->info.outputs_written, 0ULL,
7136                              &prog->info.patch_outputs_written);
7137    count_resources(v, prog);
7138
7139    /* The GLSL IR won't be needed anymore. */
7140    ralloc_free(shader->ir);
7141    shader->ir = NULL;
7142
7143    /* This must be done before the uniform storage is associated. */
7144    if (shader->Stage == MESA_SHADER_FRAGMENT &&
7145        (prog->info.inputs_read & VARYING_BIT_POS ||
7146         prog->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD))) {
7147       static const gl_state_index16 wposTransformState[STATE_LENGTH] = {
7148          STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
7149       };
7150
7151       v->wpos_transform_const = _mesa_add_state_reference(prog->Parameters,
7152                                                           wposTransformState);
7153    }
7154
7155    /* Avoid reallocation of the program parameter list, because the uniform
7156     * storage is only associated with the original parameter list.
7157     * This should be enough for Bitmap and DrawPixels constants.
7158     */
7159    _mesa_reserve_parameter_storage(prog->Parameters, 8);
7160
7161    /* This has to be done last.  Any operation that can cause
7162     * prog->ParameterValues to get reallocated (e.g., anything that adds a
7163     * program constant) has to happen before creating this linkage.
7164     */
7165    _mesa_associate_uniform_storage(ctx, shader_program, prog, true);
        /* LinkStatus is re-checked here (presumably the association step can
         * flag a failure - confirm); on failure release the visitor and drop
         * the shader's program reference.
         */
7166    if (!shader_program->data->LinkStatus) {
7167       free_glsl_to_tgsi_visitor(v);
7168       _mesa_reference_program(ctx, &shader->Program, NULL);
7169       return NULL;
7170    }
7171
7172    struct st_vertex_program *stvp;
7173    struct st_fragment_program *stfp;
7174    struct st_common_program *stp;
7175    struct st_compute_program *stcp;
7176
        /* Hand the visitor over to the stage-specific program struct. */
7177    switch (shader->Stage) {
7178    case MESA_SHADER_VERTEX:
7179       stvp = (struct st_vertex_program *)prog;
7180       stvp->glsl_to_tgsi = v;
7181       break;
7182    case MESA_SHADER_FRAGMENT:
7183       stfp = (struct st_fragment_program *)prog;
7184       stfp->glsl_to_tgsi = v;
7185       break;
7186    case MESA_SHADER_TESS_CTRL:
7187    case MESA_SHADER_TESS_EVAL:
7188    case MESA_SHADER_GEOMETRY:
7189       stp = st_common_program(prog);
7190       stp->glsl_to_tgsi = v;
7191       break;
7192    case MESA_SHADER_COMPUTE:
7193       stcp = (struct st_compute_program *)prog;
7194       stcp->glsl_to_tgsi = v;
7195       break;
7196    default:
           /* NOTE(review): the visitor is not freed on this path. */
7197       assert(!"should not be reached");
7198       return NULL;
7199    }
7200
7201    PRINT_STATS(v->print_stats());
7202
7203    return prog;
7204 }
7205
7206 /* Scans IR for non-intrinsic calls and (optionally) return statements. */
7207 class ir_control_flow_info_visitor : public ir_hierarchical_visitor {
7208 private:
7209    const struct gl_shader_compiler_options *options;
7210 public:
7211    /* Set once an unsupported construct has been encountered. */
7212    bool unsupported;
7213
7214    ir_control_flow_info_visitor(const struct gl_shader_compiler_options *options)
7215       : options(options), unsupported(false)
7216    {
7217    }
7218
7219    /* Only main() is descended into (same as glsl_to_tgsi). */
7220    virtual ir_visitor_status visit_enter(ir_function *ir)
7221    {
7222       const bool is_main = strcmp(ir->name, "main") == 0;
7223       return is_main ? visit_continue : visit_continue_with_parent;
7224    }
7225
7226    /* A call to anything that is not an intrinsic is a real function call. */
7227    virtual ir_visitor_status visit_enter(ir_call *ir)
7228    {
7229       if (ir->callee->is_intrinsic())
7230          return visit_continue;
7231
7232       unsupported = true;
7233       return visit_stop;
7234    }
7235
7236    /* A return statement is unsupported when EmitNoMainReturn is set. */
7237    virtual ir_visitor_status visit_enter(ir_return *ir)
7238    {
7239       if (!options->EmitNoMainReturn)
7240          return visit_continue;
7241
7242       unsupported = true;
7243       return visit_stop;
7244    }
7245 };
7246
7247 static bool
7248 has_unsupported_control_flow(exec_list *ir,
7249                              const struct gl_shader_compiler_options *options)
7250 {
        /* Walk the IR list with a fresh control-flow visitor and report
         * whether it flagged anything as unsupported.
         */
7251    ir_control_flow_info_visitor cf_scan(options);
7252
7253    visit_list_elements(&cf_scan, ir);
7254    return cf_scan.unsupported;
     }
7255
7256 extern "C" {
7257
7258 /**
7259  * Link a shader.
7260  * Called via ctx->Driver.LinkShader()
7261  * This actually involves converting GLSL IR into an intermediate TGSI-like IR
7262  * with code lowering and other optimizations.
7263  */
7264 GLboolean
7265 st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
7266 {
7267    struct pipe_screen *pscreen = ctx->st->pipe->screen;
7268
7269    enum pipe_shader_ir preferred_ir = (enum pipe_shader_ir)
7270       pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX,
7271                                 PIPE_SHADER_CAP_PREFERRED_IR);
7272    bool use_nir = preferred_ir == PIPE_SHADER_IR_NIR;
7273
7274    /* Return early if we are loading the shader from on-disk cache */
7275    if (st_load_ir_from_disk_cache(ctx, prog, use_nir)) {
7276       return GL_TRUE;
7277    }
7278
7279    assert(prog->data->LinkStatus);
7280
7281    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
7282       if (prog->_LinkedShaders[i] == NULL)
7283          continue;
7284
7285       struct gl_linked_shader *shader = prog->_LinkedShaders[i];
7286       exec_list *ir = shader->ir;
7287       gl_shader_stage stage = shader->Stage;
7288       const struct gl_shader_compiler_options *options =
7289             &ctx->Const.ShaderCompilerOptions[stage];
7290       enum pipe_shader_type ptarget = pipe_shader_type_from_mesa(stage);
7291       bool have_dround = pscreen->get_shader_param(pscreen, ptarget,
7292                                                    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED);
7293       bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget,
7294                                                    PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED);
7295       bool have_ldexp = pscreen->get_shader_param(pscreen, ptarget,
7296                                                   PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED);
7297       unsigned if_threshold = pscreen->get_shader_param(pscreen, ptarget,
7298                                                         PIPE_SHADER_CAP_LOWER_IF_THRESHOLD);
7299
7300       /* If there are forms of indirect addressing that the driver
7301        * cannot handle, perform the lowering pass.
7302        */
7303       if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput ||
7304           options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) {
7305          lower_variable_index_to_cond_assign(stage, ir,
7306                                              options->EmitNoIndirectInput,
7307                                              options->EmitNoIndirectOutput,
7308                                              options->EmitNoIndirectTemp,
7309                                              options->EmitNoIndirectUniform);
7310       }
7311
7312       if (!pscreen->get_param(pscreen, PIPE_CAP_INT64_DIVMOD))
7313          lower_64bit_integer_instructions(ir, DIV64 | MOD64);
7314
7315       if (ctx->Extensions.ARB_shading_language_packing) {
7316          unsigned lower_inst = LOWER_PACK_SNORM_2x16 |
7317                                LOWER_UNPACK_SNORM_2x16 |
7318                                LOWER_PACK_UNORM_2x16 |
7319                                LOWER_UNPACK_UNORM_2x16 |
7320                                LOWER_PACK_SNORM_4x8 |
7321                                LOWER_UNPACK_SNORM_4x8 |
7322                                LOWER_UNPACK_UNORM_4x8 |
7323                                LOWER_PACK_UNORM_4x8;
7324
7325          if (ctx->Extensions.ARB_gpu_shader5)
7326             lower_inst |= LOWER_PACK_USE_BFI |
7327                           LOWER_PACK_USE_BFE;
7328          if (!ctx->st->has_half_float_packing)
7329             lower_inst |= LOWER_PACK_HALF_2x16 |
7330                           LOWER_UNPACK_HALF_2x16;
7331
7332          lower_packing_builtins(ir, lower_inst);
7333       }
7334
7335       if (!pscreen->get_param(pscreen, PIPE_CAP_TEXTURE_GATHER_OFFSETS))
7336          lower_offset_arrays(ir);
7337       do_mat_op_to_vec(ir);
7338
7339       if (stage == MESA_SHADER_FRAGMENT)
7340          lower_blend_equation_advanced(
7341             shader, ctx->Extensions.KHR_blend_equation_advanced_coherent);
7342
7343       lower_instructions(ir,
7344                          MOD_TO_FLOOR |
7345                          FDIV_TO_MUL_RCP |
7346                          EXP_TO_EXP2 |
7347                          LOG_TO_LOG2 |
7348                          (have_ldexp ? 0 : LDEXP_TO_ARITH) |
7349                          (have_dfrexp ? 0 : DFREXP_DLDEXP_TO_ARITH) |
7350                          CARRY_TO_ARITH |
7351                          BORROW_TO_ARITH |
7352                          (have_dround ? 0 : DOPS_TO_DFRAC) |
7353                          (options->EmitNoPow ? POW_TO_EXP2 : 0) |
7354                          (!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0) |
7355                          (options->EmitNoSat ? SAT_TO_CLAMP : 0) |
7356                          (ctx->Const.ForceGLSLAbsSqrt ? SQRT_TO_ABS_SQRT : 0) |
7357                          /* Assume that if ARB_gpu_shader5 is not supported
7358                           * then all of the extended integer functions need
7359                           * lowering.  It may be necessary to add some caps
7360                           * for individual instructions.
7361                           */
7362                          (!ctx->Extensions.ARB_gpu_shader5
7363                           ? BIT_COUNT_TO_MATH |
7364                             EXTRACT_TO_SHIFTS |
7365                             INSERT_TO_SHIFTS |
7366                             REVERSE_TO_SHIFTS |
7367                             FIND_LSB_TO_FLOAT_CAST |
7368                             FIND_MSB_TO_FLOAT_CAST |
7369                             IMUL_HIGH_TO_MUL
7370                           : 0));
7371
7372       do_vec_index_to_cond_assign(ir);
7373       lower_vector_insert(ir, true);
7374       lower_quadop_vector(ir, false);
7375       lower_noise(ir);
7376       if (options->MaxIfDepth == 0) {
7377          lower_discard(ir);
7378       }
7379
7380       if (ctx->Const.GLSLOptimizeConservatively) {
7381          /* Do it once and repeat only if there's unsupported control flow. */
7382          do {
7383             do_common_optimization(ir, true, true, options,
7384                                    ctx->Const.NativeIntegers);
7385             lower_if_to_cond_assign((gl_shader_stage)i, ir,
7386                                     options->MaxIfDepth, if_threshold);
7387          } while (has_unsupported_control_flow(ir, options));
7388       } else {
7389          /* Repeat it until it stops making changes. */
7390          bool progress;
7391          do {
7392             progress = do_common_optimization(ir, true, true, options,
7393                                               ctx->Const.NativeIntegers);
7394             progress |= lower_if_to_cond_assign((gl_shader_stage)i, ir,
7395                                                 options->MaxIfDepth, if_threshold);
7396          } while (progress);
7397       }
7398
7399       /* Do this again to lower ir_binop_vector_extract introduced
7400        * by optimization passes.
7401        */
7402       do_vec_index_to_cond_assign(ir);
7403
7404       validate_ir_tree(ir);
7405    }
7406
7407    build_program_resource_list(ctx, prog);
7408
7409    if (use_nir)
7410       return st_link_nir(ctx, prog);
7411
7412    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
7413       struct gl_linked_shader *shader = prog->_LinkedShaders[i];
7414       if (shader == NULL)
7415          continue;
7416
7417       struct gl_program *linked_prog =
7418          get_mesa_program_tgsi(ctx, prog, shader);
7419       st_set_prog_affected_state_flags(linked_prog);
7420
7421       if (linked_prog) {
7422          if (!ctx->Driver.ProgramStringNotify(ctx,
7423                                               _mesa_shader_stage_to_program(i),
7424                                               linked_prog)) {
7425             _mesa_reference_program(ctx, &shader->Program, NULL);
7426             return GL_FALSE;
7427          }
7428       }
7429    }
7430
7431    return GL_TRUE;
7432 }
7433
7434 void
7435 st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
7436                                 const ubyte outputMapping[],
7437                                 struct pipe_stream_output_info *so)
7438 {
7439    if (!glsl_to_tgsi->shader_program->last_vert_prog)
7440       return;
7441
7442    struct gl_transform_feedback_info *info =
7443       glsl_to_tgsi->shader_program->last_vert_prog->sh.LinkedTransformFeedback;
7444    st_translate_stream_output_info2(info, outputMapping, so);
7445 }
7446
7447 void
7448 st_translate_stream_output_info2(struct gl_transform_feedback_info *info,
7449                                 const ubyte outputMapping[],
7450                                 struct pipe_stream_output_info *so)
7451 {
7452    unsigned i;
7453
7454    for (i = 0; i < info->NumOutputs; i++) {
7455       so->output[i].register_index =
7456          outputMapping[info->Outputs[i].OutputRegister];
7457       so->output[i].start_component = info->Outputs[i].ComponentOffset;
7458       so->output[i].num_components = info->Outputs[i].NumComponents;
7459       so->output[i].output_buffer = info->Outputs[i].OutputBuffer;
7460       so->output[i].dst_offset = info->Outputs[i].DstOffset;
7461       so->output[i].stream = info->Outputs[i].StreamId;
7462    }
7463
7464    for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
7465       so->stride[i] = info->Buffers[i].Stride;
7466    }
7467    so->num_outputs = info->NumOutputs;
7468 }
7469
7470 } /* extern "C" */