src/mesa/state_tracker/st_glsl_to_tgsi.cpp

   1 /*
   2  * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
   3  * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
   4  * Copyright © 2010 Intel Corporation
   5  * Copyright © 2011 Bryan Cain
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the next
  15  * paragraph) shall be included in all copies or substantial portions of the
  16  * Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24  * DEALINGS IN THE SOFTWARE.
  25  */
  26
  27 /**
 * \file st_glsl_to_tgsi.cpp
  29  *
  30  * Translate GLSL IR to TGSI.
  31  */
  32
  33 #include <stdio.h>
  34 #include "main/compiler.h"
  35 #include "ir.h"
  36 #include "ir_visitor.h"
  37 #include "ir_print_visitor.h"
  38 #include "ir_expression_flattening.h"
  39 #include "glsl_types.h"
  40 #include "glsl_parser_extras.h"
  41 #include "../glsl/program.h"
  42 #include "ir_optimization.h"
  43 #include "ast.h"
  44
  45 extern "C" {
  46 #include "main/mtypes.h"
  47 #include "main/shaderapi.h"
  48 #include "main/shaderobj.h"
  49 #include "main/uniforms.h"
  50 #include "program/hash_table.h"
  51 #include "program/prog_instruction.h"
  52 #include "program/prog_optimize.h"
  53 #include "program/prog_print.h"
  54 #include "program/program.h"
  55 #include "program/prog_uniform.h"
  56 #include "program/prog_parameter.h"
  57 #include "program/sampler.h"
  58
  59 #include "pipe/p_compiler.h"
  60 #include "pipe/p_context.h"
  61 #include "pipe/p_screen.h"
  62 #include "pipe/p_shader_tokens.h"
  63 #include "pipe/p_state.h"
  64 #include "util/u_math.h"
  65 #include "tgsi/tgsi_ureg.h"
  66 #include "tgsi/tgsi_info.h"
  67 #include "st_context.h"
  68 #include "st_program.h"
  69 #include "st_glsl_to_tgsi.h"
  70 #include "st_mesa_to_tgsi.h"
  71 }
  72
/* Register-file value this translator uses for immediates; it aliases
 * PROGRAM_FILE_MAX rather than adding a new enumerator.
 */
#define PROGRAM_IMMEDIATE PROGRAM_FILE_MAX
/* Bitmask with one bit (1 << file) set for each of the listed
 * parameter/constant/uniform register files.
 */
#define PROGRAM_ANY_CONST ((1 << PROGRAM_LOCAL_PARAM) |  \
                           (1 << PROGRAM_ENV_PARAM) |    \
                           (1 << PROGRAM_STATE_VAR) |    \
                           (1 << PROGRAM_NAMED_PARAM) |  \
                           (1 << PROGRAM_CONSTANT) |     \
                           (1 << PROGRAM_UNIFORM))

/* NOTE(review): presumably the upper bound on temporary registers tracked by
 * the optimization passes -- confirm against its users further down the file.
 */
#define MAX_TEMPS         4096

/* Size of glsl_to_tgsi_instruction::tex_offsets. */
/* will be 4 for GLSL 4.00 */
#define MAX_GLSL_TEXTURE_OFFSET 1
  85
  86 class st_src_reg;
  87 class st_dst_reg;
  88
  89 static int swizzle_for_size(int size);
  90
  91 /**
  92  * This struct is a corresponding struct to TGSI ureg_src.
  93  */
  94 class st_src_reg {
  95 public:
  96    st_src_reg(gl_register_file file, int index, const glsl_type *type)
  97    {
  98       this->file = file;
  99       this->index = index;
 100       if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
 101          this->swizzle = swizzle_for_size(type->vector_elements);
 102       else
 103          this->swizzle = SWIZZLE_XYZW;
 104       this->negate = 0;
 105       this->type = type ? type->base_type : GLSL_TYPE_ERROR;
 106       this->reladdr = NULL;
 107    }
 108
 109    st_src_reg(gl_register_file file, int index, int type)
 110    {
 111       this->type = type;
 112       this->file = file;
 113       this->index = index;
 114       this->swizzle = SWIZZLE_XYZW;
 115       this->negate = 0;
 116       this->reladdr = NULL;
 117    }
 118
 119    st_src_reg()
 120    {
 121       this->type = GLSL_TYPE_ERROR;
 122       this->file = PROGRAM_UNDEFINED;
 123       this->index = 0;
 124       this->swizzle = 0;
 125       this->negate = 0;
 126       this->reladdr = NULL;
 127    }
 128
 129    explicit st_src_reg(st_dst_reg reg);
 130
 131    gl_register_file file; /**< PROGRAM_* from Mesa */
 132    int index; /**< temporary index, VERT_ATTRIB_*, FRAG_ATTRIB_*, etc. */
 133    GLuint swizzle; /**< SWIZZLE_XYZWONEZERO swizzles from Mesa. */
 134    int negate; /**< NEGATE_XYZW mask from mesa */
 135    int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
 136    /** Register index should be offset by the integer in this reg. */
 137    st_src_reg *reladdr;
 138 };
 139
 140 class st_dst_reg {
 141 public:
 142    st_dst_reg(gl_register_file file, int writemask, int type)
 143    {
 144       this->file = file;
 145       this->index = 0;
 146       this->writemask = writemask;
 147       this->cond_mask = COND_TR;
 148       this->reladdr = NULL;
 149       this->type = type;
 150    }
 151
 152    st_dst_reg()
 153    {
 154       this->type = GLSL_TYPE_ERROR;
 155       this->file = PROGRAM_UNDEFINED;
 156       this->index = 0;
 157       this->writemask = 0;
 158       this->cond_mask = COND_TR;
 159       this->reladdr = NULL;
 160    }
 161
 162    explicit st_dst_reg(st_src_reg reg);
 163
 164    gl_register_file file; /**< PROGRAM_* from Mesa */
 165    int index; /**< temporary index, VERT_ATTRIB_*, FRAG_ATTRIB_*, etc. */
 166    int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
 167    GLuint cond_mask:4;
 168    int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
 169    /** Register index should be offset by the integer in this reg. */
 170    st_src_reg *reladdr;
 171 };
 172
 173 st_src_reg::st_src_reg(st_dst_reg reg)
 174 {
 175    this->type = reg.type;
 176    this->file = reg.file;
 177    this->index = reg.index;
 178    this->swizzle = SWIZZLE_XYZW;
 179    this->negate = 0;
 180    this->reladdr = reg.reladdr;
 181 }
 182
 183 st_dst_reg::st_dst_reg(st_src_reg reg)
 184 {
 185    this->type = reg.type;
 186    this->file = reg.file;
 187    this->index = reg.index;
 188    this->writemask = WRITEMASK_XYZW;
 189    this->cond_mask = COND_TR;
 190    this->reladdr = reg.reladdr;
 191 }
 192
/**
 * One instruction in the visitor's intermediate instruction list: an opcode,
 * one destination, up to three sources, and texture/function bookkeeping.
 */
class glsl_to_tgsi_instruction : public exec_node {
public:
   /* Callers of this ralloc-based new need not call delete. It's
    * easier to just ralloc_free 'ctx' (or any of its ancestors). */
   static void* operator new(size_t size, void *ctx)
   {
      void *node;

      /* NOTE(review): rzalloc_size presumably returns zeroed memory, so
       * fields not set by emit() start out as 0 -- confirm in ralloc.
       */
      node = rzalloc_size(ctx, size);
      assert(node != NULL);

      return node;
   }

   unsigned op; /**< opcode; emit() stores the value returned by get_opcode() */
   st_dst_reg dst;
   st_src_reg src[3];
   /** Pointer to the ir source this tree came from for debugging */
   ir_instruction *ir;
   GLboolean cond_update;
   bool saturate;
   int sampler; /**< sampler index */
   int tex_target; /**< One of TEXTURE_*_INDEX */
   GLboolean tex_shadow;
   struct tgsi_texture_offset tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
   /* NOTE(review): presumably the number of valid entries in tex_offsets --
    * confirm against the texture visit code.
    */
   unsigned tex_offset_num_offset;
   int dead_mask; /**< Used in dead code elimination */

   class function_entry *function; /* Set on TGSI_OPCODE_CAL or TGSI_OPCODE_BGNSUB */
};
 223
 224 class variable_storage : public exec_node {
 225 public:
 226    variable_storage(ir_variable *var, gl_register_file file, int index)
 227       : file(file), index(index), var(var)
 228    {
 229       /* empty */
 230    }
 231
 232    gl_register_file file;
 233    int index;
 234    ir_variable *var; /* variable that maps to this, if any */
 235 };
 236
 237 class immediate_storage : public exec_node {
 238 public:
 239    immediate_storage(gl_constant_value *values, int size, int type)
 240    {
 241       memcpy(this->values, values, size * sizeof(gl_constant_value));
 242       this->size = size;
 243       this->type = type;
 244    }
 245
 246    gl_constant_value values[4];
 247    int size; /**< Number of components (1-4) */
 248    int type; /**< GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
 249 };
 250
/**
 * List node describing one function signature known to the visitor (see
 * glsl_to_tgsi_visitor::function_signatures).
 */
class function_entry : public exec_node {
public:
   /** The IR signature this entry stands for. */
   ir_function_signature *sig;

   /**
    * identifier of this function signature used by the program.
    *
    * At the point that TGSI instructions for function calls are
    * generated, we don't know the address of the first instruction of
    * the function body.  So we make the BranchTarget that is called a
    * small integer and rewrite them during set_branchtargets().
    */
   int sig_id;

   /**
    * Pointer to first instruction of the function body.
    *
    * Set during function body emits after main() is processed.
    */
   glsl_to_tgsi_instruction *bgn_inst;

   /**
    * Index of the first instruction of the function body in actual TGSI.
    *
    * Set after conversion from glsl_to_tgsi_instruction to TGSI.
    */
   int inst;

   /** Storage for the return value. */
   st_src_reg return_reg;
};
 282
/**
 * IR visitor that walks GLSL IR and accumulates a list of
 * glsl_to_tgsi_instruction, together with the variable, immediate and
 * function-signature lists declared below.
 */
class glsl_to_tgsi_visitor : public ir_visitor {
public:
   glsl_to_tgsi_visitor();
   ~glsl_to_tgsi_visitor();

   function_entry *current_function;

   struct gl_context *ctx;
   struct gl_program *prog;
   struct gl_shader_program *shader_program;
   struct gl_shader_compiler_options *options;

   /** Index handed out by the next get_temp() call. */
   int next_temp;

   /** Set to 1 by emit() once a TGSI_OPCODE_ARL has been emitted. */
   int num_address_regs;
   int samplers_used;
   /** Set by emit() when a temporary is accessed through a relative address. */
   bool indirect_addr_temps;
   /** Set by emit() when a parameter/constant/uniform file is accessed
    *  through a relative address. */
   bool indirect_addr_consts;

   int glsl_version;
   /** When true, get_opcode() may pick integer opcodes and emit() follows
    *  float SET instructions with an F2I conversion. */
   bool native_integers;

   variable_storage *find_variable_storage(ir_variable *var);

   int add_constant(gl_register_file file, gl_constant_value values[4],
                    int size, int datatype, GLuint *swizzle_out);

   function_entry *get_function_signature(ir_function_signature *sig);

   st_src_reg get_temp(const glsl_type *type);
   void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);

   st_src_reg st_src_reg_for_float(float val);
   st_src_reg st_src_reg_for_int(int val);
   st_src_reg st_src_reg_for_type(int type, int val);

   /**
    * \name Visit methods
    *
    * As typical for the visitor pattern, there must be one \c visit method for
    * each concrete subclass of \c ir_instruction.  Virtual base classes within
    * the hierarchy should not have \c visit methods.
    */
   /*@{*/
   virtual void visit(ir_variable *);
   virtual void visit(ir_loop *);
   virtual void visit(ir_loop_jump *);
   virtual void visit(ir_function_signature *);
   virtual void visit(ir_function *);
   virtual void visit(ir_expression *);
   virtual void visit(ir_swizzle *);
   virtual void visit(ir_dereference_variable  *);
   virtual void visit(ir_dereference_array *);
   virtual void visit(ir_dereference_record *);
   virtual void visit(ir_assignment *);
   virtual void visit(ir_constant *);
   virtual void visit(ir_call *);
   virtual void visit(ir_return *);
   virtual void visit(ir_discard *);
   virtual void visit(ir_texture *);
   virtual void visit(ir_if *);
   /*@}*/

   st_src_reg result;

   /** List of variable_storage */
   exec_list variables;

   /** List of immediate_storage */
   exec_list immediates;
   /** Incremented by add_constant() for each immediate appended. */
   int num_immediates;

   /** List of function_entry */
   exec_list function_signatures;
   int next_signature_id;

   /** List of glsl_to_tgsi_instruction */
   exec_list instructions;

   /* The emit() overloads with fewer operands forward to the three-source
    * version, filling the missing operands with undefined registers.
    */
   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op);

   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
                                st_dst_reg dst, st_src_reg src0);

   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
                                st_dst_reg dst, st_src_reg src0, st_src_reg src1);

   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
                                st_dst_reg dst,
                                st_src_reg src0, st_src_reg src1, st_src_reg src2);

   /** Pick the float/int/uint variant of \c op from the source types. */
   unsigned get_opcode(ir_instruction *ir, unsigned op,
                    st_dst_reg dst,
                    st_src_reg src0, st_src_reg src1);

   /**
    * Emit the correct dot-product instruction for the type of arguments
    */
   glsl_to_tgsi_instruction *emit_dp(ir_instruction *ir,
                                     st_dst_reg dst,
                                     st_src_reg src0,
                                     st_src_reg src1,
                                     unsigned elements);

   void emit_scalar(ir_instruction *ir, unsigned op,
                    st_dst_reg dst, st_src_reg src0);

   void emit_scalar(ir_instruction *ir, unsigned op,
                    st_dst_reg dst, st_src_reg src0, st_src_reg src1);

   void try_emit_float_set(ir_instruction *ir, unsigned op, st_dst_reg dst);

   void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);

   void emit_scs(ir_instruction *ir, unsigned op,
                 st_dst_reg dst, const st_src_reg &src);

   bool try_emit_mad(ir_expression *ir,
              int mul_operand);
   bool try_emit_mad_for_and_not(ir_expression *ir,
              int mul_operand);
   bool try_emit_sat(ir_expression *ir);

   void emit_swz(ir_expression *ir);

   bool process_move_condition(ir_rvalue *ir);

   void remove_output_reads(gl_register_file type);
   void simplify_cmp(void);

   void rename_temp_register(int index, int new_index);
   int get_first_temp_read(int index);
   int get_first_temp_write(int index);
   int get_last_temp_read(int index);
   int get_last_temp_write(int index);

   void copy_propagate(void);
   void eliminate_dead_code(void);
   int eliminate_dead_code_advanced(void);
   void merge_registers(void);
   void renumber_registers(void);

   /** ralloc context passed to new(mem_ctx) for instructions and immediates. */
   void *mem_ctx;
};
 427
/* Placeholder source used by the emit() overloads for operands an
 * instruction does not have.
 */
static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);

/* Placeholder destination used by emit(ir, op) for instructions without one.
 * NOTE(review): the second constructor argument is a writemask, but
 * SWIZZLE_NOOP is a swizzle value -- confirm nothing reads this writemask.
 */
static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);

/* Destination passed to emit_arl() when loading a relative address: the X
 * channel of the PROGRAM_ADDRESS file.
 */
static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT);
 433
 434 static void
 435 fail_link(struct gl_shader_program *prog, const char *fmt, ...) PRINTFLIKE(2, 3);
 436
 437 static void
 438 fail_link(struct gl_shader_program *prog, const char *fmt, ...)
 439 {
 440    va_list args;
 441    va_start(args, fmt);
 442    ralloc_vasprintf_append(&prog->InfoLog, fmt, args);
 443    va_end(args);
 444
 445    prog->LinkStatus = GL_FALSE;
 446 }
 447
 448 static int
 449 swizzle_for_size(int size)
 450 {
 451    int size_swizzles[4] = {
 452       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
 453       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
 454       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
 455       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
 456    };
 457
 458    assert((size >= 1) && (size <= 4));
 459    return size_swizzles[size - 1];
 460 }
 461
 462 static bool
 463 is_tex_instruction(unsigned opcode)
 464 {
 465    const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
 466    return info->is_tex;
 467 }
 468
 469 static unsigned
 470 num_inst_dst_regs(unsigned opcode)
 471 {
 472    const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
 473    return info->num_dst;
 474 }
 475
 476 static unsigned
 477 num_inst_src_regs(unsigned opcode)
 478 {
 479    const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
 480    return info->is_tex ? info->num_src - 1 : info->num_src;
 481 }
 482
 483 glsl_to_tgsi_instruction *
 484 glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
 485                          st_dst_reg dst,
 486                          st_src_reg src0, st_src_reg src1, st_src_reg src2)
 487 {
 488    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
 489    int num_reladdr = 0, i;
 490
 491    op = get_opcode(ir, op, dst, src0, src1);
 492
 493    /* If we have to do relative addressing, we want to load the ARL
 494     * reg directly for one of the regs, and preload the other reladdr
 495     * sources into temps.
 496     */
 497    num_reladdr += dst.reladdr != NULL;
 498    num_reladdr += src0.reladdr != NULL;
 499    num_reladdr += src1.reladdr != NULL;
 500    num_reladdr += src2.reladdr != NULL;
 501
 502    reladdr_to_temp(ir, &src2, &num_reladdr);
 503    reladdr_to_temp(ir, &src1, &num_reladdr);
 504    reladdr_to_temp(ir, &src0, &num_reladdr);
 505
 506    if (dst.reladdr) {
 507       emit_arl(ir, address_reg, *dst.reladdr);
 508       num_reladdr--;
 509    }
 510    assert(num_reladdr == 0);
 511
 512    inst->op = op;
 513    inst->dst = dst;
 514    inst->src[0] = src0;
 515    inst->src[1] = src1;
 516    inst->src[2] = src2;
 517    inst->ir = ir;
 518    inst->dead_mask = 0;
 519
 520    inst->function = NULL;
 521
 522    if (op == TGSI_OPCODE_ARL)
 523       this->num_address_regs = 1;
 524
 525    /* Update indirect addressing status used by TGSI */
 526    if (dst.reladdr) {
 527       switch(dst.file) {
 528       case PROGRAM_TEMPORARY:
 529          this->indirect_addr_temps = true;
 530          break;
 531       case PROGRAM_LOCAL_PARAM:
 532       case PROGRAM_ENV_PARAM:
 533       case PROGRAM_STATE_VAR:
 534       case PROGRAM_NAMED_PARAM:
 535       case PROGRAM_CONSTANT:
 536       case PROGRAM_UNIFORM:
 537          this->indirect_addr_consts = true;
 538          break;
 539       case PROGRAM_IMMEDIATE:
 540          assert(!"immediates should not have indirect addressing");
 541          break;
 542       default:
 543          break;
 544       }
 545    }
 546    else {
 547       for (i=0; i<3; i++) {
 548          if(inst->src[i].reladdr) {
 549             switch(inst->src[i].file) {
 550             case PROGRAM_TEMPORARY:
 551                this->indirect_addr_temps = true;
 552                break;
 553             case PROGRAM_LOCAL_PARAM:
 554             case PROGRAM_ENV_PARAM:
 555             case PROGRAM_STATE_VAR:
 556             case PROGRAM_NAMED_PARAM:
 557             case PROGRAM_CONSTANT:
 558             case PROGRAM_UNIFORM:
 559                this->indirect_addr_consts = true;
 560                break;
 561             case PROGRAM_IMMEDIATE:
 562                assert(!"immediates should not have indirect addressing");
 563                break;
 564             default:
 565                break;
 566             }
 567          }
 568       }
 569    }
 570
 571    this->instructions.push_tail(inst);
 572
 573    if (native_integers)
 574       try_emit_float_set(ir, op, dst);
 575
 576    return inst;
 577 }
 578
 579
/**
 * Two-source convenience overload: forwards to the three-source emit() with
 * an undefined third source.
 */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
                         st_dst_reg dst, st_src_reg src0, st_src_reg src1)
{
   return emit(ir, op, dst, src0, src1, undef_src);
}
 586
/**
 * One-source convenience overload: forwards to the three-source emit() with
 * undefined second and third sources.  Asserts that the destination enables
 * at least one channel.
 */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
                         st_dst_reg dst, st_src_reg src0)
{
   assert(dst.writemask != 0);
   return emit(ir, op, dst, src0, undef_src, undef_src);
}
 594
/**
 * Operand-less convenience overload: forwards to the three-source emit() with
 * an undefined destination and undefined sources.
 */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op)
{
   return emit(ir, op, undef_dst, undef_src, undef_src, undef_src);
}
 600
 601  /**
 602  * Emits the code to convert the result of float SET instructions to integers.
 603  */
 604 void
 605 glsl_to_tgsi_visitor::try_emit_float_set(ir_instruction *ir, unsigned op,
 606                          st_dst_reg dst)
 607 {
 608    if ((op == TGSI_OPCODE_SEQ ||
 609         op == TGSI_OPCODE_SNE ||
 610         op == TGSI_OPCODE_SGE ||
 611         op == TGSI_OPCODE_SLT))
 612    {
 613       st_src_reg src = st_src_reg(dst);
 614       src.negate = ~src.negate;
 615       dst.type = GLSL_TYPE_FLOAT;
 616       emit(ir, TGSI_OPCODE_F2I, dst, src);
 617    }
 618 }
 619
/**
 * Determines whether to use an integer, unsigned integer, or float opcode
 * based on the operands and input opcode, and returns the chosen opcode.
 *
 * Nothing is emitted here; \c ir and \c dst are not used.  The operation
 * type is float if either source is float or native integers are disabled;
 * otherwise it is the type of \c src0, with bool treated as int.
 */
unsigned
glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
                         st_dst_reg dst,
                         st_src_reg src0, st_src_reg src1)
{
   int type = GLSL_TYPE_FLOAT;

   if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
      type = GLSL_TYPE_FLOAT;
   else if (native_integers)
      type = src0.type == GLSL_TYPE_BOOL ? GLSL_TYPE_INT : src0.type;

/* case4(c, f, i, u): when op is TGSI_OPCODE_c, replace it with the float (f),
 * signed (i) or unsigned (u) opcode according to the operation type.
 * Note that these macros are not #undef'd after the function.
 */
#define case4(c, f, i, u) \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_INT) op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) op = TGSI_OPCODE_##u; \
      else op = TGSI_OPCODE_##f; \
      break;
/* case3: keyed on the float opcode, with distinct signed/unsigned variants. */
#define case3(f, i, u)  case4(f, f, i, u)
/* case2fi: keyed on the float opcode; signed and unsigned share one opcode. */
#define case2fi(f, i)   case4(f, f, i, i)
/* case2iu: keyed on the signed opcode; there is no float variant, so a float
 * operation type yields TGSI_OPCODE_LAST and trips the assert below.
 */
#define case2iu(i, u)   case4(i, LAST, i, u)

   switch(op) {
      case2fi(ADD, UADD);
      case2fi(MUL, UMUL);
      case2fi(MAD, UMAD);
      case3(DIV, IDIV, UDIV);
      case3(MAX, IMAX, UMAX);
      case3(MIN, IMIN, UMIN);
      case2iu(MOD, UMOD);

      case2fi(SEQ, USEQ);
      case2fi(SNE, USNE);
      case3(SGE, ISGE, USGE);
      case3(SLT, ISLT, USLT);

      case2iu(ISHR, USHR);

      /* Opcodes not listed above are returned unchanged. */
      default: break;
   }

   assert(op != TGSI_OPCODE_LAST);
   return op;
}
 668
 669 glsl_to_tgsi_instruction *
 670 glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
 671                             st_dst_reg dst, st_src_reg src0, st_src_reg src1,
 672                             unsigned elements)
 673 {
 674    static const unsigned dot_opcodes[] = {
 675       TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
 676    };
 677
 678    return emit(ir, dot_opcodes[elements - 2], dst, src0, src1);
 679 }
 680
 681 /**
 682  * Emits TGSI scalar opcodes to produce unique answers across channels.
 683  *
 684  * Some TGSI opcodes are scalar-only, like ARB_fp/vp.  The src X
 685  * channel determines the result across all channels.  So to do a vec4
 686  * of this operation, we want to emit a scalar per source channel used
 687  * to produce dest channels.
 688  */
 689 void
 690 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
 691                                 st_dst_reg dst,
 692                                 st_src_reg orig_src0, st_src_reg orig_src1)
 693 {
 694    int i, j;
 695    int done_mask = ~dst.writemask;
 696
 697    /* TGSI RCP is a scalar operation splatting results to all channels,
 698     * like ARB_fp/vp.  So emit as many RCPs as necessary to cover our
 699     * dst channels.
 700     */
 701    for (i = 0; i < 4; i++) {
 702       GLuint this_mask = (1 << i);
 703       glsl_to_tgsi_instruction *inst;
 704       st_src_reg src0 = orig_src0;
 705       st_src_reg src1 = orig_src1;
 706
 707       if (done_mask & this_mask)
 708          continue;
 709
 710       GLuint src0_swiz = GET_SWZ(src0.swizzle, i);
 711       GLuint src1_swiz = GET_SWZ(src1.swizzle, i);
 712       for (j = i + 1; j < 4; j++) {
 713          /* If there is another enabled component in the destination that is
 714           * derived from the same inputs, generate its value on this pass as
 715           * well.
 716           */
 717          if (!(done_mask & (1 << j)) &&
 718              GET_SWZ(src0.swizzle, j) == src0_swiz &&
 719              GET_SWZ(src1.swizzle, j) == src1_swiz) {
 720             this_mask |= (1 << j);
 721          }
 722       }
 723       src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
 724                                    src0_swiz, src0_swiz);
 725       src1.swizzle = MAKE_SWIZZLE4(src1_swiz, src1_swiz,
 726                                   src1_swiz, src1_swiz);
 727
 728       inst = emit(ir, op, dst, src0, src1);
 729       inst->dst.writemask = this_mask;
 730       done_mask |= this_mask;
 731    }
 732 }
 733
 734 void
 735 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
 736                                 st_dst_reg dst, st_src_reg src0)
 737 {
 738    st_src_reg undef = undef_src;
 739
 740    undef.swizzle = SWIZZLE_XXXX;
 741
 742    emit_scalar(ir, op, dst, src0, undef);
 743 }
 744
 745 void
 746 glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
 747                                 st_dst_reg dst, st_src_reg src0)
 748 {
 749    st_src_reg tmp = get_temp(glsl_type::float_type);
 750
 751    if (src0.type == GLSL_TYPE_INT)
 752       emit(NULL, TGSI_OPCODE_I2F, st_dst_reg(tmp), src0);
 753    else if (src0.type == GLSL_TYPE_UINT)
 754       emit(NULL, TGSI_OPCODE_U2F, st_dst_reg(tmp), src0);
 755    else
 756       tmp = src0;
 757
 758    emit(NULL, TGSI_OPCODE_ARL, dst, tmp);
 759 }
 760
 761 /**
 762  * Emit an TGSI_OPCODE_SCS instruction
 763  *
 764  * The \c SCS opcode functions a bit differently than the other TGSI opcodes.
 765  * Instead of splatting its result across all four components of the
 766  * destination, it writes one value to the \c x component and another value to
 767  * the \c y component.
 768  *
 769  * \param ir        IR instruction being processed
 770  * \param op        Either \c TGSI_OPCODE_SIN or \c TGSI_OPCODE_COS depending
 771  *                  on which value is desired.
 772  * \param dst       Destination register
 773  * \param src       Source register
 774  */
 775 void
 776 glsl_to_tgsi_visitor::emit_scs(ir_instruction *ir, unsigned op,
 777                              st_dst_reg dst,
 778                              const st_src_reg &src)
 779 {
 780    /* Vertex programs cannot use the SCS opcode.
 781     */
 782    if (this->prog->Target == GL_VERTEX_PROGRAM_ARB) {
 783       emit_scalar(ir, op, dst, src);
 784       return;
 785    }
 786
 787    const unsigned component = (op == TGSI_OPCODE_SIN) ? 0 : 1;
 788    const unsigned scs_mask = (1U << component);
 789    int done_mask = ~dst.writemask;
 790    st_src_reg tmp;
 791
 792    assert(op == TGSI_OPCODE_SIN || op == TGSI_OPCODE_COS);
 793
 794    /* If there are compnents in the destination that differ from the component
 795     * that will be written by the SCS instrution, we'll need a temporary.
 796     */
 797    if (scs_mask != unsigned(dst.writemask)) {
 798       tmp = get_temp(glsl_type::vec4_type);
 799    }
 800
 801    for (unsigned i = 0; i < 4; i++) {
 802       unsigned this_mask = (1U << i);
 803       st_src_reg src0 = src;
 804
 805       if ((done_mask & this_mask) != 0)
 806          continue;
 807
 808       /* The source swizzle specified which component of the source generates
 809        * sine / cosine for the current component in the destination.  The SCS
 810        * instruction requires that this value be swizzle to the X component.
 811        * Replace the current swizzle with a swizzle that puts the source in
 812        * the X component.
 813        */
 814       unsigned src0_swiz = GET_SWZ(src.swizzle, i);
 815
 816       src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
 817                                    src0_swiz, src0_swiz);
 818       for (unsigned j = i + 1; j < 4; j++) {
 819          /* If there is another enabled component in the destination that is
 820           * derived from the same inputs, generate its value on this pass as
 821           * well.
 822           */
 823          if (!(done_mask & (1 << j)) &&
 824              GET_SWZ(src0.swizzle, j) == src0_swiz) {
 825             this_mask |= (1 << j);
 826          }
 827       }
 828
 829       if (this_mask != scs_mask) {
 830          glsl_to_tgsi_instruction *inst;
 831          st_dst_reg tmp_dst = st_dst_reg(tmp);
 832
 833          /* Emit the SCS instruction.
 834           */
 835          inst = emit(ir, TGSI_OPCODE_SCS, tmp_dst, src0);
 836          inst->dst.writemask = scs_mask;
 837
 838          /* Move the result of the SCS instruction to the desired location in
 839           * the destination.
 840           */
 841          tmp.swizzle = MAKE_SWIZZLE4(component, component,
 842                                      component, component);
 843          inst = emit(ir, TGSI_OPCODE_SCS, dst, tmp);
 844          inst->dst.writemask = this_mask;
 845       } else {
 846          /* Emit the SCS instruction to write directly to the destination.
 847           */
 848          glsl_to_tgsi_instruction *inst = emit(ir, TGSI_OPCODE_SCS, dst, src0);
 849          inst->dst.writemask = scs_mask;
 850       }
 851
 852       done_mask |= this_mask;
 853    }
 854 }
 855
 856 int
 857 glsl_to_tgsi_visitor::add_constant(gl_register_file file,
 858                              gl_constant_value values[4], int size, int datatype,
 859                              GLuint *swizzle_out)
 860 {
 861    if (file == PROGRAM_CONSTANT) {
 862       return _mesa_add_typed_unnamed_constant(this->prog->Parameters, values,
 863                                               size, datatype, swizzle_out);
 864    } else {
 865       int index = 0;
 866       immediate_storage *entry;
 867       assert(file == PROGRAM_IMMEDIATE);
 868
 869       /* Search immediate storage to see if we already have an identical
 870        * immediate that we can use instead of adding a duplicate entry.
 871        */
 872       foreach_iter(exec_list_iterator, iter, this->immediates) {
 873          entry = (immediate_storage *)iter.get();
 874
 875          if (entry->size == size &&
 876              entry->type == datatype &&
 877              !memcmp(entry->values, values, size * sizeof(gl_constant_value))) {
 878              return index;
 879          }
 880          index++;
 881       }
 882
 883       /* Add this immediate to the list. */
 884       entry = new(mem_ctx) immediate_storage(values, size, datatype);
 885       this->immediates.push_tail(entry);
 886       this->num_immediates++;
 887       return index;
 888    }
 889 }
 890
 891 st_src_reg
 892 glsl_to_tgsi_visitor::st_src_reg_for_float(float val)
 893 {
 894    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_FLOAT);
 895    union gl_constant_value uval;
 896
 897    uval.f = val;
 898    src.index = add_constant(src.file, &uval, 1, GL_FLOAT, &src.swizzle);
 899
 900    return src;
 901 }
 902
 903 st_src_reg
 904 glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
 905 {
 906    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT);
 907    union gl_constant_value uval;
 908
 909    assert(native_integers);
 910
 911    uval.i = val;
 912    src.index = add_constant(src.file, &uval, 1, GL_INT, &src.swizzle);
 913
 914    return src;
 915 }
 916
 917 st_src_reg
 918 glsl_to_tgsi_visitor::st_src_reg_for_type(int type, int val)
 919 {
 920    if (native_integers)
 921       return type == GLSL_TYPE_FLOAT ? st_src_reg_for_float(val) :
 922                                        st_src_reg_for_int(val);
 923    else
 924       return st_src_reg_for_float(val);
 925 }
 926
 927 static int
 928 type_size(const struct glsl_type *type)
 929 {
 930    unsigned int i;
 931    int size;
 932
 933    switch (type->base_type) {
 934    case GLSL_TYPE_UINT:
 935    case GLSL_TYPE_INT:
 936    case GLSL_TYPE_FLOAT:
 937    case GLSL_TYPE_BOOL:
 938       if (type->is_matrix()) {
 939          return type->matrix_columns;
 940       } else {
 941          /* Regardless of size of vector, it gets a vec4. This is bad
 942           * packing for things like floats, but otherwise arrays become a
 943           * mess.  Hopefully a later pass over the code can pack scalars
 944           * down if appropriate.
 945           */
 946          return 1;
 947       }
 948    case GLSL_TYPE_ARRAY:
 949       assert(type->length > 0);
 950       return type_size(type->fields.array) * type->length;
 951    case GLSL_TYPE_STRUCT:
 952       size = 0;
 953       for (i = 0; i < type->length; i++) {
 954          size += type_size(type->fields.structure[i].type);
 955       }
 956       return size;
 957    case GLSL_TYPE_SAMPLER:
 958       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 959        * at link time.
 960        */
 961       return 1;
 962    default:
 963       assert(0);
 964       return 0;
 965    }
 966 }
 967
 968 /**
 969  * In the initial pass of codegen, we assign temporary numbers to
 970  * intermediate results.  (not SSA -- variable assignments will reuse
 971  * storage).
 972  */
 973 st_src_reg
 974 glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
 975 {
 976    st_src_reg src;
 977
 978    src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
 979    src.file = PROGRAM_TEMPORARY;
 980    src.index = next_temp;
 981    src.reladdr = NULL;
 982    next_temp += type_size(type);
 983
 984    if (type->is_array() || type->is_record()) {
 985       src.swizzle = SWIZZLE_NOOP;
 986    } else {
 987       src.swizzle = swizzle_for_size(type->vector_elements);
 988    }
 989    src.negate = 0;
 990
 991    return src;
 992 }
 993
 994 variable_storage *
 995 glsl_to_tgsi_visitor::find_variable_storage(ir_variable *var)
 996 {
 997
 998    variable_storage *entry;
 999
1000    foreach_iter(exec_list_iterator, iter, this->variables) {
1001       entry = (variable_storage *)iter.get();
1002
1003       if (entry->var == var)
1004          return entry;
1005    }
1006
1007    return NULL;
1008 }
1009
1010 void
1011 glsl_to_tgsi_visitor::visit(ir_variable *ir)
1012 {
1013    if (strcmp(ir->name, "gl_FragCoord") == 0) {
1014       struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;
1015
1016       fp->OriginUpperLeft = ir->origin_upper_left;
1017       fp->PixelCenterInteger = ir->pixel_center_integer;
1018
1019    } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
1020       struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;
1021       switch (ir->depth_layout) {
1022       case ir_depth_layout_none:
1023          fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_NONE;
1024          break;
1025       case ir_depth_layout_any:
1026          fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_ANY;
1027          break;
1028       case ir_depth_layout_greater:
1029          fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_GREATER;
1030          break;
1031       case ir_depth_layout_less:
1032          fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_LESS;
1033          break;
1034       case ir_depth_layout_unchanged:
1035          fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_UNCHANGED;
1036          break;
1037       default:
1038          assert(0);
1039          break;
1040       }
1041    }
1042
1043    if (ir->mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
1044       unsigned int i;
1045       const ir_state_slot *const slots = ir->state_slots;
1046       assert(ir->state_slots != NULL);
1047
1048       /* Check if this statevar's setup in the STATE file exactly
1049        * matches how we'll want to reference it as a
1050        * struct/array/whatever.  If not, then we need to move it into
1051        * temporary storage and hope that it'll get copy-propagated
1052        * out.
1053        */
1054       for (i = 0; i < ir->num_state_slots; i++) {
1055          if (slots[i].swizzle != SWIZZLE_XYZW) {
1056             break;
1057          }
1058       }
1059
1060       variable_storage *storage;
1061       st_dst_reg dst;
1062       if (i == ir->num_state_slots) {
1063          /* We'll set the index later. */
1064          storage = new(mem_ctx) variable_storage(ir, PROGRAM_STATE_VAR, -1);
1065          this->variables.push_tail(storage);
1066
1067          dst = undef_dst;
1068       } else {
1069          /* The variable_storage constructor allocates slots based on the size
1070           * of the type.  However, this had better match the number of state
1071           * elements that we're going to copy into the new temporary.
1072           */
1073          assert((int) ir->num_state_slots == type_size(ir->type));
1074
1075          storage = new(mem_ctx) variable_storage(ir, PROGRAM_TEMPORARY,
1076                                                  this->next_temp);
1077          this->variables.push_tail(storage);
1078          this->next_temp += type_size(ir->type);
1079
1080          dst = st_dst_reg(st_src_reg(PROGRAM_TEMPORARY, storage->index,
1081                native_integers ? ir->type->base_type : GLSL_TYPE_FLOAT));
1082       }
1083
1084
1085       for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1086          int index = _mesa_add_state_reference(this->prog->Parameters,
1087                                                (gl_state_index *)slots[i].tokens);
1088
1089          if (storage->file == PROGRAM_STATE_VAR) {
1090             if (storage->index == -1) {
1091                storage->index = index;
1092             } else {
1093                assert(index == storage->index + (int)i);
1094             }
1095          } else {
1096             st_src_reg src(PROGRAM_STATE_VAR, index,
1097                   native_integers ? ir->type->base_type : GLSL_TYPE_FLOAT);
1098             src.swizzle = slots[i].swizzle;
1099             emit(ir, TGSI_OPCODE_MOV, dst, src);
1100             /* even a float takes up a whole vec4 reg in a struct/array. */
1101             dst.index++;
1102          }
1103       }
1104
1105       if (storage->file == PROGRAM_TEMPORARY &&
1106           dst.index != storage->index + (int) ir->num_state_slots) {
1107          fail_link(this->shader_program,
1108                    "failed to load builtin uniform `%s'  (%d/%d regs loaded)\n",
1109                    ir->name, dst.index - storage->index,
1110                    type_size(ir->type));
1111       }
1112    }
1113 }
1114
1115 void
1116 glsl_to_tgsi_visitor::visit(ir_loop *ir)
1117 {
1118    ir_dereference_variable *counter = NULL;
1119
1120    if (ir->counter != NULL)
1121       counter = new(ir) ir_dereference_variable(ir->counter);
1122
1123    if (ir->from != NULL) {
1124       assert(ir->counter != NULL);
1125
1126       ir_assignment *a = new(ir) ir_assignment(counter, ir->from, NULL);
1127
1128       a->accept(this);
1129       delete a;
1130    }
1131
1132    emit(NULL, TGSI_OPCODE_BGNLOOP);
1133
1134    if (ir->to) {
1135       ir_expression *e =
1136          new(ir) ir_expression(ir->cmp, glsl_type::bool_type,
1137                                counter, ir->to);
1138       ir_if *if_stmt =  new(ir) ir_if(e);
1139
1140       ir_loop_jump *brk = new(ir) ir_loop_jump(ir_loop_jump::jump_break);
1141
1142       if_stmt->then_instructions.push_tail(brk);
1143
1144       if_stmt->accept(this);
1145
1146       delete if_stmt;
1147       delete e;
1148       delete brk;
1149    }
1150
1151    visit_exec_list(&ir->body_instructions, this);
1152
1153    if (ir->increment) {
1154       ir_expression *e =
1155          new(ir) ir_expression(ir_binop_add, counter->type,
1156                                counter, ir->increment);
1157
1158       ir_assignment *a = new(ir) ir_assignment(counter, e, NULL);
1159
1160       a->accept(this);
1161       delete a;
1162       delete e;
1163    }
1164
1165    emit(NULL, TGSI_OPCODE_ENDLOOP);
1166 }
1167
1168 void
1169 glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
1170 {
1171    switch (ir->mode) {
1172    case ir_loop_jump::jump_break:
1173       emit(NULL, TGSI_OPCODE_BRK);
1174       break;
1175    case ir_loop_jump::jump_continue:
1176       emit(NULL, TGSI_OPCODE_CONT);
1177       break;
1178    }
1179 }
1180
1181
1182 void
1183 glsl_to_tgsi_visitor::visit(ir_function_signature *ir)
1184 {
1185    assert(0);
1186    (void)ir;
1187 }
1188
1189 void
1190 glsl_to_tgsi_visitor::visit(ir_function *ir)
1191 {
1192    /* Ignore function bodies other than main() -- we shouldn't see calls to
1193     * them since they should all be inlined before we get to glsl_to_tgsi.
1194     */
1195    if (strcmp(ir->name, "main") == 0) {
1196       const ir_function_signature *sig;
1197       exec_list empty;
1198
1199       sig = ir->matching_signature(&empty);
1200
1201       assert(sig);
1202
1203       foreach_iter(exec_list_iterator, iter, sig->body) {
1204          ir_instruction *ir = (ir_instruction *)iter.get();
1205
1206          ir->accept(this);
1207       }
1208    }
1209 }
1210
1211 bool
1212 glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
1213 {
1214    int nonmul_operand = 1 - mul_operand;
1215    st_src_reg a, b, c;
1216    st_dst_reg result_dst;
1217
1218    ir_expression *expr = ir->operands[mul_operand]->as_expression();
1219    if (!expr || expr->operation != ir_binop_mul)
1220       return false;
1221
1222    expr->operands[0]->accept(this);
1223    a = this->result;
1224    expr->operands[1]->accept(this);
1225    b = this->result;
1226    ir->operands[nonmul_operand]->accept(this);
1227    c = this->result;
1228
1229    this->result = get_temp(ir->type);
1230    result_dst = st_dst_reg(this->result);
1231    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1232    emit(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
1233
1234    return true;
1235 }
1236
1237 /**
1238  * Emit MAD(a, -b, a) instead of AND(a, NOT(b))
1239  *
1240  * The logic values are 1.0 for true and 0.0 for false.  Logical-and is
1241  * implemented using multiplication, and logical-or is implemented using
1242  * addition.  Logical-not can be implemented as (true - x), or (1.0 - x).
1243  * As result, the logical expression (a & !b) can be rewritten as:
1244  *
1245  *     - a * !b
1246  *     - a * (1 - b)
1247  *     - (a * 1) - (a * b)
1248  *     - a + -(a * b)
1249  *     - a + (a * -b)
1250  *
1251  * This final expression can be implemented as a single MAD(a, -b, a)
1252  * instruction.
1253  */
1254 bool
1255 glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir, int try_operand)
1256 {
1257    const int other_operand = 1 - try_operand;
1258    st_src_reg a, b;
1259
1260    ir_expression *expr = ir->operands[try_operand]->as_expression();
1261    if (!expr || expr->operation != ir_unop_logic_not)
1262       return false;
1263
1264    ir->operands[other_operand]->accept(this);
1265    a = this->result;
1266    expr->operands[0]->accept(this);
1267    b = this->result;
1268
1269    b.negate = ~b.negate;
1270
1271    this->result = get_temp(ir->type);
1272    emit(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
1273
1274    return true;
1275 }
1276
1277 bool
1278 glsl_to_tgsi_visitor::try_emit_sat(ir_expression *ir)
1279 {
1280    /* Saturates were only introduced to vertex programs in
1281     * NV_vertex_program3, so don't give them to drivers in the VP.
1282     */
1283    if (this->prog->Target == GL_VERTEX_PROGRAM_ARB)
1284       return false;
1285
1286    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1287    if (!sat_src)
1288       return false;
1289
1290    sat_src->accept(this);
1291    st_src_reg src = this->result;
1292
1293    /* If we generated an expression instruction into a temporary in
1294     * processing the saturate's operand, apply the saturate to that
1295     * instruction.  Otherwise, generate a MOV to do the saturate.
1296     *
1297     * Note that we have to be careful to only do this optimization if
1298     * the instruction in question was what generated src->result.  For
1299     * example, ir_dereference_array might generate a MUL instruction
1300     * to create the reladdr, and return us a src reg using that
1301     * reladdr.  That MUL result is not the value we're trying to
1302     * saturate.
1303     */
1304    ir_expression *sat_src_expr = sat_src->as_expression();
1305    if (sat_src_expr && (sat_src_expr->operation == ir_binop_mul ||
1306                         sat_src_expr->operation == ir_binop_add ||
1307                         sat_src_expr->operation == ir_binop_dot)) {
1308       glsl_to_tgsi_instruction *new_inst;
1309       new_inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
1310       new_inst->saturate = true;
1311    } else {
1312       this->result = get_temp(ir->type);
1313       st_dst_reg result_dst = st_dst_reg(this->result);
1314       result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1315       glsl_to_tgsi_instruction *inst;
1316       inst = emit(ir, TGSI_OPCODE_MOV, result_dst, src);
1317       inst->saturate = true;
1318    }
1319
1320    return true;
1321 }
1322
1323 void
1324 glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
1325                                     st_src_reg *reg, int *num_reladdr)
1326 {
1327    if (!reg->reladdr)
1328       return;
1329
1330    emit_arl(ir, address_reg, *reg->reladdr);
1331
1332    if (*num_reladdr != 1) {
1333       st_src_reg temp = get_temp(glsl_type::vec4_type);
1334
1335       emit(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
1336       *reg = temp;
1337    }
1338
1339    (*num_reladdr)--;
1340 }
1341
1342 void
1343 glsl_to_tgsi_visitor::visit(ir_expression *ir)
1344 {
1345    unsigned int operand;
1346    st_src_reg op[Elements(ir->operands)];
1347    st_src_reg result_src;
1348    st_dst_reg result_dst;
1349
1350    /* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
1351     */
1352    if (ir->operation == ir_binop_add) {
1353       if (try_emit_mad(ir, 1))
1354          return;
1355       if (try_emit_mad(ir, 0))
1356          return;
1357    }
1358
1359    /* Quick peephole: Emit OPCODE_MAD(-a, -b, a) instead of AND(a, NOT(b))
1360     */
1361    if (ir->operation == ir_binop_logic_and) {
1362       if (try_emit_mad_for_and_not(ir, 1))
1363          return;
1364       if (try_emit_mad_for_and_not(ir, 0))
1365          return;
1366    }
1367
1368    if (try_emit_sat(ir))
1369       return;
1370
1371    if (ir->operation == ir_quadop_vector)
1372       assert(!"ir_quadop_vector should have been lowered");
1373
1374    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1375       this->result.file = PROGRAM_UNDEFINED;
1376       ir->operands[operand]->accept(this);
1377       if (this->result.file == PROGRAM_UNDEFINED) {
1378          ir_print_visitor v;
1379          printf("Failed to get tree for expression operand:\n");
1380          ir->operands[operand]->accept(&v);
1381          exit(1);
1382       }
1383       op[operand] = this->result;
1384
1385       /* Matrix expression operands should have been broken down to vector
1386        * operations already.
1387        */
1388       assert(!ir->operands[operand]->type->is_matrix());
1389    }
1390
1391    int vector_elements = ir->operands[0]->type->vector_elements;
1392    if (ir->operands[1]) {
1393       vector_elements = MAX2(vector_elements,
1394                              ir->operands[1]->type->vector_elements);
1395    }
1396
1397    this->result.file = PROGRAM_UNDEFINED;
1398
1399    /* Storage for our result.  Ideally for an assignment we'd be using
1400     * the actual storage for the result here, instead.
1401     */
1402    result_src = get_temp(ir->type);
1403    /* convenience for the emit functions below. */
1404    result_dst = st_dst_reg(result_src);
1405    /* Limit writes to the channels that will be used by result_src later.
1406     * This does limit this temp's use as a temporary for multi-instruction
1407     * sequences.
1408     */
1409    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1410
1411    switch (ir->operation) {
1412    case ir_unop_logic_not:
1413       if (result_dst.type != GLSL_TYPE_FLOAT)
1414          emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1415       else {
1416          /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
1417           * older GPUs implement SEQ using multiple instructions (i915 uses two
1418           * SGE instructions and a MUL instruction).  Since our logic values are
1419           * 0.0 and 1.0, 1-x also implements !x.
1420           */
1421          op[0].negate = ~op[0].negate;
1422          emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
1423       }
1424       break;
1425    case ir_unop_neg:
1426       assert(result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_INT);
1427       if (result_dst.type == GLSL_TYPE_INT)
1428          emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
1429       else {
1430          op[0].negate = ~op[0].negate;
1431          result_src = op[0];
1432       }
1433       break;
1434    case ir_unop_abs:
1435       assert(result_dst.type == GLSL_TYPE_FLOAT);
1436       emit(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
1437       break;
1438    case ir_unop_sign:
1439       emit(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
1440       break;
1441    case ir_unop_rcp:
1442       emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
1443       break;
1444
1445    case ir_unop_exp2:
1446       emit_scalar(ir, TGSI_OPCODE_EX2, result_dst, op[0]);
1447       break;
1448    case ir_unop_exp:
1449    case ir_unop_log:
1450       assert(!"not reached: should be handled by ir_explog_to_explog2");
1451       break;
1452    case ir_unop_log2:
1453       emit_scalar(ir, TGSI_OPCODE_LG2, result_dst, op[0]);
1454       break;
1455    case ir_unop_sin:
1456       emit_scalar(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1457       break;
1458    case ir_unop_cos:
1459       emit_scalar(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1460       break;
1461    case ir_unop_sin_reduced:
1462       emit_scs(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1463       break;
1464    case ir_unop_cos_reduced:
1465       emit_scs(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1466       break;
1467
1468    case ir_unop_dFdx:
1469       emit(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
1470       break;
1471    case ir_unop_dFdy:
1472       op[0].negate = ~op[0].negate;
1473       emit(ir, TGSI_OPCODE_DDY, result_dst, op[0]);
1474       break;
1475
1476    case ir_unop_noise: {
1477       /* At some point, a motivated person could add a better
1478        * implementation of noise.  Currently not even the nvidia
1479        * binary drivers do anything more than this.  In any case, the
1480        * place to do this is in the GL state tracker, not the poor
1481        * driver.
1482        */
1483       emit(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
1484       break;
1485    }
1486
1487    case ir_binop_add:
1488       emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1489       break;
1490    case ir_binop_sub:
1491       emit(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
1492       break;
1493
1494    case ir_binop_mul:
1495       emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1496       break;
1497    case ir_binop_div:
1498       if (result_dst.type == GLSL_TYPE_FLOAT)
1499          assert(!"not reached: should be handled by ir_div_to_mul_rcp");
1500       else
1501          emit(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
1502       break;
1503    case ir_binop_mod:
1504       if (result_dst.type == GLSL_TYPE_FLOAT)
1505          assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1506       else
1507          emit(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
1508       break;
1509
1510    case ir_binop_less:
1511       emit(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
1512       break;
1513    case ir_binop_greater:
1514       emit(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
1515       break;
1516    case ir_binop_lequal:
1517       emit(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
1518       break;
1519    case ir_binop_gequal:
1520       emit(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
1521       break;
1522    case ir_binop_equal:
1523       emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1524       break;
1525    case ir_binop_nequal:
1526       emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1527       break;
1528    case ir_binop_all_equal:
1529       /* "==" operator producing a scalar boolean. */
1530       if (ir->operands[0]->type->is_vector() ||
1531           ir->operands[1]->type->is_vector()) {
1532          st_src_reg temp = get_temp(native_integers ?
1533                glsl_type::get_instance(ir->operands[0]->type->base_type, 4, 1) :
1534                glsl_type::vec4_type);
1535          assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
1536          emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1537
1538          /* After the dot-product, the value will be an integer on the
1539           * range [0,4].  Zero becomes 1.0, and positive values become zero.
1540           */
1541          emit_dp(ir, result_dst, temp, temp, vector_elements);
1542
1543          if (result_dst.type == GLSL_TYPE_FLOAT) {
1544             /* Negating the result of the dot-product gives values on the range
1545              * [-4, 0].  Zero becomes 1.0, and negative values become zero.
1546              * This is achieved using SGE.
1547              */
1548             st_src_reg sge_src = result_src;
1549             sge_src.negate = ~sge_src.negate;
1550             emit(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
1551          } else {
1552             /* The TGSI negate flag doesn't work for integers, so use SEQ 0
1553              * instead.
1554              */
1555             emit(ir, TGSI_OPCODE_SEQ, result_dst, result_src, st_src_reg_for_int(0));
1556          }
1557       } else {
1558          emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1559       }
1560       break;
1561    case ir_binop_any_nequal:
1562       /* "!=" operator producing a scalar boolean. */
1563       if (ir->operands[0]->type->is_vector() ||
1564           ir->operands[1]->type->is_vector()) {
1565          st_src_reg temp = get_temp(native_integers ?
1566                glsl_type::get_instance(ir->operands[0]->type->base_type, 4, 1) :
1567                glsl_type::vec4_type);
1568          assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
1569          emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1570
1571          /* After the dot-product, the value will be an integer on the
1572           * range [0,4].  Zero stays zero, and positive values become 1.0.
1573           */
1574          glsl_to_tgsi_instruction *const dp =
1575                emit_dp(ir, result_dst, temp, temp, vector_elements);
1576          if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
1577              result_dst.type == GLSL_TYPE_FLOAT) {
1578             /* The clamping to [0,1] can be done for free in the fragment
1579              * shader with a saturate.
1580              */
1581             dp->saturate = true;
1582          } else if (result_dst.type == GLSL_TYPE_FLOAT) {
1583             /* Negating the result of the dot-product gives values on the range
1584              * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
1585              * achieved using SLT.
1586              */
1587             st_src_reg slt_src = result_src;
1588             slt_src.negate = ~slt_src.negate;
1589             emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
1590          } else {
1591             emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
1592          }
1593       } else {
1594          emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1595       }
1596       break;
1597
1598    case ir_unop_any: {
1599       assert(ir->operands[0]->type->is_vector());
1600
1601       /* After the dot-product, the value will be an integer on the
1602        * range [0,4].  Zero stays zero, and positive values become 1.0.
1603        */
1604       glsl_to_tgsi_instruction *const dp =
1605          emit_dp(ir, result_dst, op[0], op[0],
1606                  ir->operands[0]->type->vector_elements);
1607       if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
1608           result_dst.type == GLSL_TYPE_FLOAT) {
1609               /* The clamping to [0,1] can be done for free in the fragment
1610                * shader with a saturate.
1611                */
1612               dp->saturate = true;
1613       } else if (result_dst.type == GLSL_TYPE_FLOAT) {
1614               /* Negating the result of the dot-product gives values on the range
1615                * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
1616                * is achieved using SLT.
1617                */
1618               st_src_reg slt_src = result_src;
1619               slt_src.negate = ~slt_src.negate;
1620               emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
1621       }
1622       else {
1623          /* Use SNE 0 if integers are being used as boolean values. */
1624          emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
1625       }
1626       break;
1627    }
1628
1629    case ir_binop_logic_xor:
1630       if (native_integers)
1631          emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1632       else
1633          emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1634       break;
1635
1636    case ir_binop_logic_or: {
1637       if (native_integers) {
1638          /* If integers are used as booleans, we can use an actual "or"
1639           * instruction.
1640           */
1641          assert(native_integers);
1642          emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1643       } else {
1644          /* After the addition, the value will be an integer on the
1645           * range [0,2].  Zero stays zero, and positive values become 1.0.
1646           */
1647          glsl_to_tgsi_instruction *add =
1648             emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1649          if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1650             /* The clamping to [0,1] can be done for free in the fragment
1651              * shader with a saturate if floats are being used as boolean values.
1652              */
1653             add->saturate = true;
1654          } else {
1655             /* Negating the result of the addition gives values on the range
1656              * [-2, 0].  Zero stays zero, and negative values become 1.0.  This
1657              * is achieved using SLT.
1658              */
1659             st_src_reg slt_src = result_src;
1660             slt_src.negate = ~slt_src.negate;
1661             emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
1662          }
1663       }
1664       break;
1665    }
1666
1667    case ir_binop_logic_and:
1668       /* If native integers are disabled, the bool args are stored as float 0.0
1669        * or 1.0, so "mul" gives us "and".  If they're enabled, just use the
1670        * actual AND opcode.
1671        */
1672       if (native_integers)
1673          emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1674       else
1675          emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1676       break;
1677
1678    case ir_binop_dot:
1679       assert(ir->operands[0]->type->is_vector());
1680       assert(ir->operands[0]->type == ir->operands[1]->type);
1681       emit_dp(ir, result_dst, op[0], op[1],
1682               ir->operands[0]->type->vector_elements);
1683       break;
1684
1685    case ir_unop_sqrt:
1686       /* sqrt(x) = x * rsq(x). */
1687       emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
1688       emit(ir, TGSI_OPCODE_MUL, result_dst, result_src, op[0]);
1689       /* For incoming channels <= 0, set the result to 0. */
1690       op[0].negate = ~op[0].negate;
1691       emit(ir, TGSI_OPCODE_CMP, result_dst,
1692                           op[0], result_src, st_src_reg_for_float(0.0));
1693       break;
1694    case ir_unop_rsq:
1695       emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
1696       break;
1697    case ir_unop_i2f:
1698       if (native_integers) {
1699          emit(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
1700          break;
1701       }
1702       /* fallthrough to next case otherwise */
1703    case ir_unop_b2f:
1704       if (native_integers) {
1705          emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
1706          break;
1707       }
1708       /* fallthrough to next case otherwise */
1709    case ir_unop_i2u:
1710    case ir_unop_u2i:
1711       /* Converting between signed and unsigned integers is a no-op. */
1712       result_src = op[0];
1713       break;
1714    case ir_unop_b2i:
1715       if (native_integers) {
1716          /* Booleans are stored as integers using ~0 for true and 0 for false.
1717           * GLSL requires that int(bool) return 1 for true and 0 for false.
1718           * This conversion is done with AND, but it could be done with NEG.
1719           */
1720          emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
1721       } else {
1722          /* Booleans and integers are both stored as floats when native
1723           * integers are disabled.
1724           */
1725          result_src = op[0];
1726       }
1727       break;
1728    case ir_unop_f2i:
1729       if (native_integers)
1730          emit(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
1731       else
1732          emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1733       break;
1734    case ir_unop_f2b:
1735       emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
1736       break;
1737    case ir_unop_i2b:
1738       if (native_integers)
1739          emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
1740       else
1741          emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
1742       break;
1743    case ir_unop_trunc:
1744       emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1745       break;
1746    case ir_unop_ceil:
1747       op[0].negate = ~op[0].negate;
1748       emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
1749       result_src.negate = ~result_src.negate;
1750       break;
1751    case ir_unop_floor:
1752       emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
1753       break;
1754    case ir_unop_fract:
1755       emit(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
1756       break;
1757
1758    case ir_binop_min:
1759       emit(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
1760       break;
1761    case ir_binop_max:
1762       emit(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
1763       break;
1764    case ir_binop_pow:
1765       emit_scalar(ir, TGSI_OPCODE_POW, result_dst, op[0], op[1]);
1766       break;
1767
1768    case ir_unop_bit_not:
1769       if (native_integers) {
1770          emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1771          break;
1772       }
1773    case ir_unop_u2f:
1774       if (native_integers) {
1775          emit(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
1776          break;
1777       }
1778    case ir_binop_lshift:
1779       if (native_integers) {
1780          emit(ir, TGSI_OPCODE_SHL, result_dst, op[0]);
1781          break;
1782       }
1783    case ir_binop_rshift:
1784       if (native_integers) {
1785          emit(ir, TGSI_OPCODE_ISHR, result_dst, op[0]);
1786          break;
1787       }
1788    case ir_binop_bit_and:
1789       if (native_integers) {
1790          emit(ir, TGSI_OPCODE_AND, result_dst, op[0]);
1791          break;
1792       }
1793    case ir_binop_bit_xor:
1794       if (native_integers) {
1795          emit(ir, TGSI_OPCODE_XOR, result_dst, op[0]);
1796          break;
1797       }
1798    case ir_binop_bit_or:
1799       if (native_integers) {
1800          emit(ir, TGSI_OPCODE_OR, result_dst, op[0]);
1801          break;
1802       }
1803    case ir_unop_round_even:
1804       assert(!"GLSL 1.30 features unsupported");
1805       break;
1806
1807    case ir_quadop_vector:
1808       /* This operation should have already been handled.
1809        */
1810       assert(!"Should not get here.");
1811       break;
1812    }
1813
1814    this->result = result_src;
1815 }
1816
1817
1818 void
1819 glsl_to_tgsi_visitor::visit(ir_swizzle *ir)
1820 {
1821    st_src_reg src;
1822    int i;
1823    int swizzle[4];
1824
1825    /* Note that this is only swizzles in expressions, not those on the left
1826     * hand side of an assignment, which do write masking.  See ir_assignment
1827     * for that.
1828     */
1829
1830    ir->val->accept(this);
1831    src = this->result;
1832    assert(src.file != PROGRAM_UNDEFINED);
1833
1834    for (i = 0; i < 4; i++) {
1835       if (i < ir->type->vector_elements) {
1836          switch (i) {
1837          case 0:
1838             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.x);
1839             break;
1840          case 1:
1841             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.y);
1842             break;
1843          case 2:
1844             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.z);
1845             break;
1846          case 3:
1847             swizzle[i] = GET_SWZ(src.swizzle, ir->mask.w);
1848             break;
1849          }
1850       } else {
1851          /* If the type is smaller than a vec4, replicate the last
1852           * channel out.
1853           */
1854          swizzle[i] = swizzle[ir->type->vector_elements - 1];
1855       }
1856    }
1857
1858    src.swizzle = MAKE_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1859
1860    this->result = src;
1861 }
1862
1863 void
1864 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
1865 {
1866    variable_storage *entry = find_variable_storage(ir->var);
1867    ir_variable *var = ir->var;
1868
1869    if (!entry) {
1870       switch (var->mode) {
1871       case ir_var_uniform:
1872          entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
1873                                                var->location);
1874          this->variables.push_tail(entry);
1875          break;
1876       case ir_var_in:
1877       case ir_var_inout:
1878          /* The linker assigns locations for varyings and attributes,
1879           * including deprecated builtins (like gl_Color), user-assign
1880           * generic attributes (glBindVertexLocation), and
1881           * user-defined varyings.
1882           *
1883           * FINISHME: We would hit this path for function arguments.  Fix!
1884           */
1885          assert(var->location != -1);
1886          entry = new(mem_ctx) variable_storage(var,
1887                                                PROGRAM_INPUT,
1888                                                var->location);
1889          if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
1890              var->location >= VERT_ATTRIB_GENERIC0) {
1891             _mesa_add_attribute(this->prog->Attributes,
1892                                 var->name,
1893                                 _mesa_sizeof_glsl_type(var->type->gl_type),
1894                                 var->type->gl_type,
1895                                 var->location - VERT_ATTRIB_GENERIC0);
1896          }
1897          break;
1898       case ir_var_out:
1899          assert(var->location != -1);
1900          entry = new(mem_ctx) variable_storage(var,
1901                                                PROGRAM_OUTPUT,
1902                                                var->location);
1903          break;
1904       case ir_var_system_value:
1905          entry = new(mem_ctx) variable_storage(var,
1906                                                PROGRAM_SYSTEM_VALUE,
1907                                                var->location);
1908          break;
1909       case ir_var_auto:
1910       case ir_var_temporary:
1911          entry = new(mem_ctx) variable_storage(var, PROGRAM_TEMPORARY,
1912                                                this->next_temp);
1913          this->variables.push_tail(entry);
1914
1915          next_temp += type_size(var->type);
1916          break;
1917       }
1918
1919       if (!entry) {
1920          printf("Failed to make storage for %s\n", var->name);
1921          exit(1);
1922       }
1923    }
1924
1925    this->result = st_src_reg(entry->file, entry->index, var->type);
1926    if (!native_integers)
1927       this->result.type = GLSL_TYPE_FLOAT;
1928 }
1929
1930 void
1931 glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
1932 {
1933    ir_constant *index;
1934    st_src_reg src;
1935    int element_size = type_size(ir->type);
1936
1937    index = ir->array_index->constant_expression_value();
1938
1939    ir->array->accept(this);
1940    src = this->result;
1941
1942    if (index) {
1943       src.index += index->value.i[0] * element_size;
1944    } else {
1945       /* Variable index array dereference.  It eats the "vec4" of the
1946        * base of the array and an index that offsets the TGSI register
1947        * index.
1948        */
1949       ir->array_index->accept(this);
1950
1951       st_src_reg index_reg;
1952
1953       if (element_size == 1) {
1954          index_reg = this->result;
1955       } else {
1956          index_reg = get_temp(native_integers ?
1957                               glsl_type::int_type : glsl_type::float_type);
1958
1959          emit(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
1960               this->result, st_src_reg_for_type(index_reg.type, element_size));
1961       }
1962
1963       /* If there was already a relative address register involved, add the
1964        * new and the old together to get the new offset.
1965        */
1966       if (src.reladdr != NULL) {
1967          st_src_reg accum_reg = get_temp(native_integers ?
1968                                 glsl_type::int_type : glsl_type::float_type);
1969
1970          emit(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
1971               index_reg, *src.reladdr);
1972
1973          index_reg = accum_reg;
1974       }
1975
1976       src.reladdr = ralloc(mem_ctx, st_src_reg);
1977       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1978    }
1979
1980    /* If the type is smaller than a vec4, replicate the last channel out. */
1981    if (ir->type->is_scalar() || ir->type->is_vector())
1982       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1983    else
1984       src.swizzle = SWIZZLE_NOOP;
1985
1986    this->result = src;
1987 }
1988
1989 void
1990 glsl_to_tgsi_visitor::visit(ir_dereference_record *ir)
1991 {
1992    unsigned int i;
1993    const glsl_type *struct_type = ir->record->type;
1994    int offset = 0;
1995
1996    ir->record->accept(this);
1997
1998    for (i = 0; i < struct_type->length; i++) {
1999       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2000          break;
2001       offset += type_size(struct_type->fields.structure[i].type);
2002    }
2003
2004    /* If the type is smaller than a vec4, replicate the last channel out. */
2005    if (ir->type->is_scalar() || ir->type->is_vector())
2006       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2007    else
2008       this->result.swizzle = SWIZZLE_NOOP;
2009
2010    this->result.index += offset;
2011 }
2012
2013 /**
2014  * We want to be careful in assignment setup to hit the actual storage
2015  * instead of potentially using a temporary like we might with the
2016  * ir_dereference handler.
2017  */
2018 static st_dst_reg
2019 get_assignment_lhs(ir_dereference *ir, glsl_to_tgsi_visitor *v)
2020 {
2021    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2022     * access of a vector, it must be separated into a series conditional moves
2023     * before reaching this point (see ir_vec_index_to_cond_assign).
2024     */
2025    assert(ir->as_dereference());
2026    ir_dereference_array *deref_array = ir->as_dereference_array();
2027    if (deref_array) {
2028       assert(!deref_array->array->type->is_vector());
2029    }
2030
2031    /* Use the rvalue deref handler for the most part.  We'll ignore
2032     * swizzles in it and write swizzles using writemask, though.
2033     */
2034    ir->accept(v);
2035    return st_dst_reg(v->result);
2036 }
2037
2038 /**
2039  * Process the condition of a conditional assignment
2040  *
2041  * Examines the condition of a conditional assignment to generate the optimal
2042  * first operand of a \c CMP instruction.  If the condition is a relational
2043  * operator with 0 (e.g., \c ir_binop_less), the value being compared will be
2044  * used as the source for the \c CMP instruction.  Otherwise the comparison
2045  * is processed to a boolean result, and the boolean result is used as the
2046  * operand to the CMP instruction.
2047  */
2048 bool
2049 glsl_to_tgsi_visitor::process_move_condition(ir_rvalue *ir)
2050 {
2051    ir_rvalue *src_ir = ir;
2052    bool negate = true;
2053    bool switch_order = false;
2054
2055    ir_expression *const expr = ir->as_expression();
2056    if ((expr != NULL) && (expr->get_num_operands() == 2)) {
2057       bool zero_on_left = false;
2058
2059       if (expr->operands[0]->is_zero()) {
2060          src_ir = expr->operands[1];
2061          zero_on_left = true;
2062       } else if (expr->operands[1]->is_zero()) {
2063          src_ir = expr->operands[0];
2064          zero_on_left = false;
2065       }
2066
2067       /*      a is -  0  +            -  0  +
2068        * (a <  0)  T  F  F  ( a < 0)  T  F  F
2069        * (0 <  a)  F  F  T  (-a < 0)  F  F  T
2070        * (a <= 0)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
2071        * (0 <= a)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
2072        * (a >  0)  F  F  T  (-a < 0)  F  F  T
2073        * (0 >  a)  T  F  F  ( a < 0)  T  F  F
2074        * (a >= 0)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
2075        * (0 >= a)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
2076        *
2077        * Note that exchanging the order of 0 and 'a' in the comparison simply
2078        * means that the value of 'a' should be negated.
2079        */
2080       if (src_ir != ir) {
2081          switch (expr->operation) {
2082          case ir_binop_less:
2083             switch_order = false;
2084             negate = zero_on_left;
2085             break;
2086
2087          case ir_binop_greater:
2088             switch_order = false;
2089             negate = !zero_on_left;
2090             break;
2091
2092          case ir_binop_lequal:
2093             switch_order = true;
2094             negate = !zero_on_left;
2095             break;
2096
2097          case ir_binop_gequal:
2098             switch_order = true;
2099             negate = zero_on_left;
2100             break;
2101
2102          default:
2103             /* This isn't the right kind of comparison afterall, so make sure
2104              * the whole condition is visited.
2105              */
2106             src_ir = ir;
2107             break;
2108          }
2109       }
2110    }
2111
2112    src_ir->accept(this);
2113
2114    /* We use the TGSI_OPCODE_CMP (a < 0 ? b : c) for conditional moves, and the
2115     * condition we produced is 0.0 or 1.0.  By flipping the sign, we can
2116     * choose which value TGSI_OPCODE_CMP produces without an extra instruction
2117     * computing the condition.
2118     */
2119    if (negate)
2120       this->result.negate = ~this->result.negate;
2121
2122    return switch_order;
2123 }
2124
2125 void
2126 glsl_to_tgsi_visitor::visit(ir_assignment *ir)
2127 {
2128    st_dst_reg l;
2129    st_src_reg r;
2130    int i;
2131
2132    ir->rhs->accept(this);
2133    r = this->result;
2134
2135    l = get_assignment_lhs(ir->lhs, this);
2136
2137    /* FINISHME: This should really set to the correct maximal writemask for each
2138     * FINISHME: component written (in the loops below).  This case can only
2139     * FINISHME: occur for matrices, arrays, and structures.
2140     */
2141    if (ir->write_mask == 0) {
2142       assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
2143       l.writemask = WRITEMASK_XYZW;
2144    } else if (ir->lhs->type->is_scalar() &&
2145               ir->lhs->variable_referenced()->mode == ir_var_out) {
2146       /* FINISHME: This hack makes writing to gl_FragDepth, which lives in the
2147        * FINISHME: W component of fragment shader output zero, work correctly.
2148        */
2149       l.writemask = WRITEMASK_XYZW;
2150    } else {
2151       int swizzles[4];
2152       int first_enabled_chan = 0;
2153       int rhs_chan = 0;
2154
2155       l.writemask = ir->write_mask;
2156
2157       for (int i = 0; i < 4; i++) {
2158          if (l.writemask & (1 << i)) {
2159             first_enabled_chan = GET_SWZ(r.swizzle, i);
2160             break;
2161          }
2162       }
2163
2164       /* Swizzle a small RHS vector into the channels being written.
2165        *
2166        * glsl ir treats write_mask as dictating how many channels are
2167        * present on the RHS while TGSI treats write_mask as just
2168        * showing which channels of the vec4 RHS get written.
2169        */
2170       for (int i = 0; i < 4; i++) {
2171          if (l.writemask & (1 << i))
2172             swizzles[i] = GET_SWZ(r.swizzle, rhs_chan++);
2173          else
2174             swizzles[i] = first_enabled_chan;
2175       }
2176       r.swizzle = MAKE_SWIZZLE4(swizzles[0], swizzles[1],
2177                                 swizzles[2], swizzles[3]);
2178    }
2179
2180    assert(l.file != PROGRAM_UNDEFINED);
2181    assert(r.file != PROGRAM_UNDEFINED);
2182
2183    if (ir->condition) {
2184       const bool switch_order = this->process_move_condition(ir->condition);
2185       st_src_reg condition = this->result;
2186
2187       for (i = 0; i < type_size(ir->lhs->type); i++) {
2188          st_src_reg l_src = st_src_reg(l);
2189          st_src_reg condition_temp = condition;
2190          l_src.swizzle = swizzle_for_size(ir->lhs->type->vector_elements);
2191
2192          if (native_integers) {
2193             /* This is necessary because TGSI's CMP instruction expects the
2194              * condition to be a float, and we store booleans as integers.
2195              * If TGSI had a UCMP instruction or similar, this extra
2196              * instruction would not be necessary.
2197              */
2198             condition_temp = get_temp(glsl_type::vec4_type);
2199             condition.negate = 0;
2200             emit(ir, TGSI_OPCODE_I2F, st_dst_reg(condition_temp), condition);
2201             condition_temp.swizzle = condition.swizzle;
2202          }
2203
2204          if (switch_order) {
2205             emit(ir, TGSI_OPCODE_CMP, l, condition_temp, l_src, r);
2206          } else {
2207             emit(ir, TGSI_OPCODE_CMP, l, condition_temp, r, l_src);
2208          }
2209
2210          l.index++;
2211          r.index++;
2212       }
2213    } else if (ir->rhs->as_expression() &&
2214               this->instructions.get_tail() &&
2215               ir->rhs == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->ir &&
2216               type_size(ir->lhs->type) == 1 &&
2217               l.writemask == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->dst.writemask) {
2218       /* To avoid emitting an extra MOV when assigning an expression to a
2219        * variable, emit the last instruction of the expression again, but
2220        * replace the destination register with the target of the assignment.
2221        * Dead code elimination will remove the original instruction.
2222        */
2223       glsl_to_tgsi_instruction *inst, *new_inst;
2224       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
2225       new_inst = emit(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
2226       new_inst->saturate = inst->saturate;
2227       inst->dead_mask = inst->dst.writemask;
2228    } else {
2229       for (i = 0; i < type_size(ir->lhs->type); i++) {
2230          emit(ir, TGSI_OPCODE_MOV, l, r);
2231          l.index++;
2232          r.index++;
2233       }
2234    }
2235 }
2236
2237
2238 void
2239 glsl_to_tgsi_visitor::visit(ir_constant *ir)
2240 {
2241    st_src_reg src;
2242    GLfloat stack_vals[4] = { 0 };
2243    gl_constant_value *values = (gl_constant_value *) stack_vals;
2244    GLenum gl_type = GL_NONE;
2245    unsigned int i;
2246    static int in_array = 0;
2247    gl_register_file file = in_array ? PROGRAM_CONSTANT : PROGRAM_IMMEDIATE;
2248
2249    /* Unfortunately, 4 floats is all we can get into
2250     * _mesa_add_typed_unnamed_constant.  So, make a temp to store an
2251     * aggregate constant and move each constant value into it.  If we
2252     * get lucky, copy propagation will eliminate the extra moves.
2253     */
2254    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2255       st_src_reg temp_base = get_temp(ir->type);
2256       st_dst_reg temp = st_dst_reg(temp_base);
2257
2258       foreach_iter(exec_list_iterator, iter, ir->components) {
2259          ir_constant *field_value = (ir_constant *)iter.get();
2260          int size = type_size(field_value->type);
2261
2262          assert(size > 0);
2263
2264          field_value->accept(this);
2265          src = this->result;
2266
2267          for (i = 0; i < (unsigned int)size; i++) {
2268             emit(ir, TGSI_OPCODE_MOV, temp, src);
2269
2270             src.index++;
2271             temp.index++;
2272          }
2273       }
2274       this->result = temp_base;
2275       return;
2276    }
2277
2278    if (ir->type->is_array()) {
2279       st_src_reg temp_base = get_temp(ir->type);
2280       st_dst_reg temp = st_dst_reg(temp_base);
2281       int size = type_size(ir->type->fields.array);
2282
2283       assert(size > 0);
2284       in_array++;
2285
2286       for (i = 0; i < ir->type->length; i++) {
2287          ir->array_elements[i]->accept(this);
2288          src = this->result;
2289          for (int j = 0; j < size; j++) {
2290             emit(ir, TGSI_OPCODE_MOV, temp, src);
2291
2292             src.index++;
2293             temp.index++;
2294          }
2295       }
2296       this->result = temp_base;
2297       in_array--;
2298       return;
2299    }
2300
2301    if (ir->type->is_matrix()) {
2302       st_src_reg mat = get_temp(ir->type);
2303       st_dst_reg mat_column = st_dst_reg(mat);
2304
2305       for (i = 0; i < ir->type->matrix_columns; i++) {
2306          assert(ir->type->base_type == GLSL_TYPE_FLOAT);
2307          values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
2308
2309          src = st_src_reg(file, -1, ir->type->base_type);
2310          src.index = add_constant(file,
2311                                   values,
2312                                   ir->type->vector_elements,
2313                                   GL_FLOAT,
2314                                   &src.swizzle);
2315          emit(ir, TGSI_OPCODE_MOV, mat_column, src);
2316
2317          mat_column.index++;
2318       }
2319
2320       this->result = mat;
2321       return;
2322    }
2323
2324    switch (ir->type->base_type) {
2325    case GLSL_TYPE_FLOAT:
2326       gl_type = GL_FLOAT;
2327       for (i = 0; i < ir->type->vector_elements; i++) {
2328          values[i].f = ir->value.f[i];
2329       }
2330       break;
2331    case GLSL_TYPE_UINT:
2332       gl_type = native_integers ? GL_UNSIGNED_INT : GL_FLOAT;
2333       for (i = 0; i < ir->type->vector_elements; i++) {
2334          if (native_integers)
2335             values[i].u = ir->value.u[i];
2336          else
2337             values[i].f = ir->value.u[i];
2338       }
2339       break;
2340    case GLSL_TYPE_INT:
2341       gl_type = native_integers ? GL_INT : GL_FLOAT;
2342       for (i = 0; i < ir->type->vector_elements; i++) {
2343          if (native_integers)
2344             values[i].i = ir->value.i[i];
2345          else
2346             values[i].f = ir->value.i[i];
2347       }
2348       break;
2349    case GLSL_TYPE_BOOL:
2350       gl_type = native_integers ? GL_BOOL : GL_FLOAT;
2351       for (i = 0; i < ir->type->vector_elements; i++) {
2352          if (native_integers)
2353             values[i].b = ir->value.b[i];
2354          else
2355             values[i].f = ir->value.b[i];
2356       }
2357       break;
2358    default:
2359       assert(!"Non-float/uint/int/bool constant");
2360    }
2361
2362    this->result = st_src_reg(file, -1, ir->type);
2363    this->result.index = add_constant(file,
2364                                      values,
2365                                      ir->type->vector_elements,
2366                                      gl_type,
2367                                      &this->result.swizzle);
2368 }
2369
2370 function_entry *
2371 glsl_to_tgsi_visitor::get_function_signature(ir_function_signature *sig)
2372 {
2373    function_entry *entry;
2374
2375    foreach_iter(exec_list_iterator, iter, this->function_signatures) {
2376       entry = (function_entry *)iter.get();
2377
2378       if (entry->sig == sig)
2379          return entry;
2380    }
2381
2382    entry = ralloc(mem_ctx, function_entry);
2383    entry->sig = sig;
2384    entry->sig_id = this->next_signature_id++;
2385    entry->bgn_inst = NULL;
2386
2387    /* Allocate storage for all the parameters. */
2388    foreach_iter(exec_list_iterator, iter, sig->parameters) {
2389       ir_variable *param = (ir_variable *)iter.get();
2390       variable_storage *storage;
2391
2392       storage = find_variable_storage(param);
2393       assert(!storage);
2394
2395       storage = new(mem_ctx) variable_storage(param, PROGRAM_TEMPORARY,
2396                                               this->next_temp);
2397       this->variables.push_tail(storage);
2398
2399       this->next_temp += type_size(param->type);
2400    }
2401
2402    if (!sig->return_type->is_void()) {
2403       entry->return_reg = get_temp(sig->return_type);
2404    } else {
2405       entry->return_reg = undef_src;
2406    }
2407
2408    this->function_signatures.push_tail(entry);
2409    return entry;
2410 }
2411
2412 void
2413 glsl_to_tgsi_visitor::visit(ir_call *ir)
2414 {
2415    glsl_to_tgsi_instruction *call_inst;
2416    ir_function_signature *sig = ir->get_callee();
2417    function_entry *entry = get_function_signature(sig);
2418    int i;
2419
2420    /* Process in parameters. */
2421    exec_list_iterator sig_iter = sig->parameters.iterator();
2422    foreach_iter(exec_list_iterator, iter, *ir) {
2423       ir_rvalue *param_rval = (ir_rvalue *)iter.get();
2424       ir_variable *param = (ir_variable *)sig_iter.get();
2425
2426       if (param->mode == ir_var_in ||
2427           param->mode == ir_var_inout) {
2428          variable_storage *storage = find_variable_storage(param);
2429          assert(storage);
2430
2431          param_rval->accept(this);
2432          st_src_reg r = this->result;
2433
2434          st_dst_reg l;
2435          l.file = storage->file;
2436          l.index = storage->index;
2437          l.reladdr = NULL;
2438          l.writemask = WRITEMASK_XYZW;
2439          l.cond_mask = COND_TR;
2440
2441          for (i = 0; i < type_size(param->type); i++) {
2442             emit(ir, TGSI_OPCODE_MOV, l, r);
2443             l.index++;
2444             r.index++;
2445          }
2446       }
2447
2448       sig_iter.next();
2449    }
2450    assert(!sig_iter.has_next());
2451
2452    /* Emit call instruction */
2453    call_inst = emit(ir, TGSI_OPCODE_CAL);
2454    call_inst->function = entry;
2455
2456    /* Process out parameters. */
2457    sig_iter = sig->parameters.iterator();
2458    foreach_iter(exec_list_iterator, iter, *ir) {
2459       ir_rvalue *param_rval = (ir_rvalue *)iter.get();
2460       ir_variable *param = (ir_variable *)sig_iter.get();
2461
2462       if (param->mode == ir_var_out ||
2463           param->mode == ir_var_inout) {
2464          variable_storage *storage = find_variable_storage(param);
2465          assert(storage);
2466
2467          st_src_reg r;
2468          r.file = storage->file;
2469          r.index = storage->index;
2470          r.reladdr = NULL;
2471          r.swizzle = SWIZZLE_NOOP;
2472          r.negate = 0;
2473
2474          param_rval->accept(this);
2475          st_dst_reg l = st_dst_reg(this->result);
2476
2477          for (i = 0; i < type_size(param->type); i++) {
2478             emit(ir, TGSI_OPCODE_MOV, l, r);
2479             l.index++;
2480             r.index++;
2481          }
2482       }
2483
2484       sig_iter.next();
2485    }
2486    assert(!sig_iter.has_next());
2487
2488    /* Process return value. */
2489    this->result = entry->return_reg;
2490 }
2491
2492 void
2493 glsl_to_tgsi_visitor::visit(ir_texture *ir)
2494 {
2495    st_src_reg result_src, coord, lod_info, projector, dx, dy, offset;
2496    st_dst_reg result_dst, coord_dst;
2497    glsl_to_tgsi_instruction *inst = NULL;
2498    unsigned opcode = TGSI_OPCODE_NOP;
2499
2500    if (ir->coordinate) {
2501       ir->coordinate->accept(this);
2502
2503       /* Put our coords in a temp.  We'll need to modify them for shadow,
2504        * projection, or LOD, so the only case we'd use it as is is if
2505        * we're doing plain old texturing.  The optimization passes on
2506        * glsl_to_tgsi_visitor should handle cleaning up our mess in that case.
2507        */
2508       coord = get_temp(glsl_type::vec4_type);
2509       coord_dst = st_dst_reg(coord);
2510       emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
2511    }
2512
2513    if (ir->projector) {
2514       ir->projector->accept(this);
2515       projector = this->result;
2516    }
2517
2518    /* Storage for our result.  Ideally for an assignment we'd be using
2519     * the actual storage for the result here, instead.
2520     */
2521    result_src = get_temp(glsl_type::vec4_type);
2522    result_dst = st_dst_reg(result_src);
2523
2524    switch (ir->op) {
2525    case ir_tex:
2526       opcode = TGSI_OPCODE_TEX;
2527       break;
2528    case ir_txb:
2529       opcode = TGSI_OPCODE_TXB;
2530       ir->lod_info.bias->accept(this);
2531       lod_info = this->result;
2532       break;
2533    case ir_txl:
2534       opcode = TGSI_OPCODE_TXL;
2535       ir->lod_info.lod->accept(this);
2536       lod_info = this->result;
2537       break;
2538    case ir_txd:
2539       opcode = TGSI_OPCODE_TXD;
2540       ir->lod_info.grad.dPdx->accept(this);
2541       dx = this->result;
2542       ir->lod_info.grad.dPdy->accept(this);
2543       dy = this->result;
2544       break;
2545    case ir_txs:
2546       opcode = TGSI_OPCODE_TXQ;
2547       ir->lod_info.lod->accept(this);
2548       lod_info = this->result;
2549       break;
2550    case ir_txf:
2551       opcode = TGSI_OPCODE_TXF;
2552       ir->lod_info.lod->accept(this);
2553       lod_info = this->result;
2554       if (ir->offset) {
2555          ir->offset->accept(this);
2556          offset = this->result;
2557       }
2558       break;
2559    }
2560
2561    const glsl_type *sampler_type = ir->sampler->type;
2562
2563    if (ir->projector) {
2564       if (opcode == TGSI_OPCODE_TEX) {
2565          /* Slot the projector in as the last component of the coord. */
2566          coord_dst.writemask = WRITEMASK_W;
2567          emit(ir, TGSI_OPCODE_MOV, coord_dst, projector);
2568          coord_dst.writemask = WRITEMASK_XYZW;
2569          opcode = TGSI_OPCODE_TXP;
2570       } else {
2571          st_src_reg coord_w = coord;
2572          coord_w.swizzle = SWIZZLE_WWWW;
2573
2574          /* For the other TEX opcodes there's no projective version
2575           * since the last slot is taken up by LOD info.  Do the
2576           * projective divide now.
2577           */
2578          coord_dst.writemask = WRITEMASK_W;
2579          emit(ir, TGSI_OPCODE_RCP, coord_dst, projector);
2580
2581          /* In the case where we have to project the coordinates "by hand,"
2582           * the shadow comparator value must also be projected.
2583           */
2584          st_src_reg tmp_src = coord;
2585          if (ir->shadow_comparitor) {
2586             /* Slot the shadow value in as the second to last component of the
2587              * coord.
2588              */
2589             ir->shadow_comparitor->accept(this);
2590
2591             tmp_src = get_temp(glsl_type::vec4_type);
2592             st_dst_reg tmp_dst = st_dst_reg(tmp_src);
2593
2594             /* Projective division not allowed for array samplers. */
2595             assert(!sampler_type->sampler_array);
2596
2597             tmp_dst.writemask = WRITEMASK_Z;
2598             emit(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);
2599
2600             tmp_dst.writemask = WRITEMASK_XY;
2601             emit(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
2602          }
2603
2604          coord_dst.writemask = WRITEMASK_XYZ;
2605          emit(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);
2606
2607          coord_dst.writemask = WRITEMASK_XYZW;
2608          coord.swizzle = SWIZZLE_XYZW;
2609       }
2610    }
2611
2612    /* If projection is done and the opcode is not TGSI_OPCODE_TXP, then the shadow
2613     * comparator was put in the correct place (and projected) by the code,
2614     * above, that handles by-hand projection.
2615     */
2616    if (ir->shadow_comparitor && (!ir->projector || opcode == TGSI_OPCODE_TXP)) {
2617       /* Slot the shadow value in as the second to last component of the
2618        * coord.
2619        */
2620       ir->shadow_comparitor->accept(this);
2621
2622       /* XXX This will need to be updated for cubemap array samplers. */
2623       if (sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_2D &&
2624           sampler_type->sampler_array) {
2625          coord_dst.writemask = WRITEMASK_W;
2626       } else {
2627          coord_dst.writemask = WRITEMASK_Z;
2628       }
2629
2630       emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
2631       coord_dst.writemask = WRITEMASK_XYZW;
2632    }
2633
2634    if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB ||
2635        opcode == TGSI_OPCODE_TXF) {
2636       /* TGSI stores LOD or LOD bias in the last channel of the coords. */
2637       coord_dst.writemask = WRITEMASK_W;
2638       emit(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
2639       coord_dst.writemask = WRITEMASK_XYZW;
2640    }
2641
2642    if (opcode == TGSI_OPCODE_TXD)
2643       inst = emit(ir, opcode, result_dst, coord, dx, dy);
2644    else if (opcode == TGSI_OPCODE_TXQ)
2645       inst = emit(ir, opcode, result_dst, lod_info);
2646    else if (opcode == TGSI_OPCODE_TXF) {
2647       inst = emit(ir, opcode, result_dst, coord);
2648    } else
2649       inst = emit(ir, opcode, result_dst, coord);
2650
2651    if (ir->shadow_comparitor)
2652       inst->tex_shadow = GL_TRUE;
2653
2654    inst->sampler = _mesa_get_sampler_uniform_value(ir->sampler,
2655                                                    this->shader_program,
2656                                                    this->prog);
2657
2658    if (ir->offset) {
2659        inst->tex_offset_num_offset = 1;
2660        inst->tex_offsets[0].Index = offset.index;
2661        inst->tex_offsets[0].File = offset.file;
2662        inst->tex_offsets[0].SwizzleX = GET_SWZ(offset.swizzle, 0);
2663        inst->tex_offsets[0].SwizzleY = GET_SWZ(offset.swizzle, 1);
2664        inst->tex_offsets[0].SwizzleZ = GET_SWZ(offset.swizzle, 2);
2665    }
2666
2667    switch (sampler_type->sampler_dimensionality) {
2668    case GLSL_SAMPLER_DIM_1D:
2669       inst->tex_target = (sampler_type->sampler_array)
2670          ? TEXTURE_1D_ARRAY_INDEX : TEXTURE_1D_INDEX;
2671       break;
2672    case GLSL_SAMPLER_DIM_2D:
2673       inst->tex_target = (sampler_type->sampler_array)
2674          ? TEXTURE_2D_ARRAY_INDEX : TEXTURE_2D_INDEX;
2675       break;
2676    case GLSL_SAMPLER_DIM_3D:
2677       inst->tex_target = TEXTURE_3D_INDEX;
2678       break;
2679    case GLSL_SAMPLER_DIM_CUBE:
2680       inst->tex_target = TEXTURE_CUBE_INDEX;
2681       break;
2682    case GLSL_SAMPLER_DIM_RECT:
2683       inst->tex_target = TEXTURE_RECT_INDEX;
2684       break;
2685    case GLSL_SAMPLER_DIM_BUF:
2686       assert(!"FINISHME: Implement ARB_texture_buffer_object");
2687       break;
2688    default:
2689       assert(!"Should not get here.");
2690    }
2691
2692    this->result = result_src;
2693 }
2694
2695 void
2696 glsl_to_tgsi_visitor::visit(ir_return *ir)
2697 {
2698    if (ir->get_value()) {
2699       st_dst_reg l;
2700       int i;
2701
2702       assert(current_function);
2703
2704       ir->get_value()->accept(this);
2705       st_src_reg r = this->result;
2706
2707       l = st_dst_reg(current_function->return_reg);
2708
2709       for (i = 0; i < type_size(current_function->sig->return_type); i++) {
2710          emit(ir, TGSI_OPCODE_MOV, l, r);
2711          l.index++;
2712          r.index++;
2713       }
2714    }
2715
2716    emit(ir, TGSI_OPCODE_RET);
2717 }
2718
2719 void
2720 glsl_to_tgsi_visitor::visit(ir_discard *ir)
2721 {
2722    struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;
2723
2724    if (ir->condition) {
2725       ir->condition->accept(this);
2726       this->result.negate = ~this->result.negate;
2727       emit(ir, TGSI_OPCODE_KIL, undef_dst, this->result);
2728    } else {
2729       emit(ir, TGSI_OPCODE_KILP);
2730    }
2731
2732    fp->UsesKill = GL_TRUE;
2733 }
2734
2735 void
2736 glsl_to_tgsi_visitor::visit(ir_if *ir)
2737 {
2738    glsl_to_tgsi_instruction *cond_inst, *if_inst;
2739    glsl_to_tgsi_instruction *prev_inst;
2740
2741    prev_inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
2742
2743    ir->condition->accept(this);
2744    assert(this->result.file != PROGRAM_UNDEFINED);
2745
2746    if (this->options->EmitCondCodes) {
2747       cond_inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
2748
2749       /* See if we actually generated any instruction for generating
2750        * the condition.  If not, then cook up a move to a temp so we
2751        * have something to set cond_update on.
2752        */
2753       if (cond_inst == prev_inst) {
2754          st_src_reg temp = get_temp(glsl_type::bool_type);
2755          cond_inst = emit(ir->condition, TGSI_OPCODE_MOV, st_dst_reg(temp), result);
2756       }
2757       cond_inst->cond_update = GL_TRUE;
2758
2759       if_inst = emit(ir->condition, TGSI_OPCODE_IF);
2760       if_inst->dst.cond_mask = COND_NE;
2761    } else {
2762       if_inst = emit(ir->condition, TGSI_OPCODE_IF, undef_dst, this->result);
2763    }
2764
2765    this->instructions.push_tail(if_inst);
2766
2767    visit_exec_list(&ir->then_instructions, this);
2768
2769    if (!ir->else_instructions.is_empty()) {
2770       emit(ir->condition, TGSI_OPCODE_ELSE);
2771       visit_exec_list(&ir->else_instructions, this);
2772    }
2773
2774    if_inst = emit(ir->condition, TGSI_OPCODE_ENDIF);
2775 }
2776
2777 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
2778 {
2779    result.file = PROGRAM_UNDEFINED;
2780    next_temp = 1;
2781    next_signature_id = 1;
2782    num_immediates = 0;
2783    current_function = NULL;
2784    num_address_regs = 0;
2785    indirect_addr_temps = false;
2786    indirect_addr_consts = false;
2787    mem_ctx = ralloc_context(NULL);
2788 }
2789
/**
 * Release the ralloc context the constructor created, and with it
 * everything that was allocated out of that context.
 */
glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
{
   ralloc_free(mem_ctx);
}
2794
/**
 * C-linkage entry point that destroys a visitor with C++ delete, so the
 * destructor runs.
 */
extern "C" void free_glsl_to_tgsi_visitor(glsl_to_tgsi_visitor *v)
{
   delete v;
}
2799
2800
2801 /**
2802  * Count resources used by the given gpu program (number of texture
2803  * samplers, etc).
2804  */
2805 static void
2806 count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
2807 {
2808    v->samplers_used = 0;
2809
2810    foreach_iter(exec_list_iterator, iter, v->instructions) {
2811       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
2812
2813       if (is_tex_instruction(inst->op)) {
2814          v->samplers_used |= 1 << inst->sampler;
2815
2816          prog->SamplerTargets[inst->sampler] =
2817             (gl_texture_index)inst->tex_target;
2818          if (inst->tex_shadow) {
2819             prog->ShadowSamplers |= 1 << inst->sampler;
2820          }
2821       }
2822    }
2823
2824    prog->SamplersUsed = v->samplers_used;
2825    _mesa_update_shader_textures_used(prog);
2826 }
2827
2828
2829 /**
2830  * Check if the given vertex/fragment/shader program is within the
2831  * resource limits of the context (number of texture units, etc).
2832  * If any of those checks fail, record a linker error.
2833  *
2834  * XXX more checks are needed...
2835  */
2836 static void
2837 check_resources(const struct gl_context *ctx,
2838                 struct gl_shader_program *shader_program,
2839                 glsl_to_tgsi_visitor *prog,
2840                 struct gl_program *proginfo)
2841 {
2842    switch (proginfo->Target) {
2843    case GL_VERTEX_PROGRAM_ARB:
2844       if (_mesa_bitcount(prog->samplers_used) >
2845           ctx->Const.MaxVertexTextureImageUnits) {
2846          fail_link(shader_program, "Too many vertex shader texture samplers");
2847       }
2848       if (proginfo->Parameters->NumParameters > MAX_UNIFORMS) {
2849          fail_link(shader_program, "Too many vertex shader constants");
2850       }
2851       break;
2852    case MESA_GEOMETRY_PROGRAM:
2853       if (_mesa_bitcount(prog->samplers_used) >
2854           ctx->Const.MaxGeometryTextureImageUnits) {
2855          fail_link(shader_program, "Too many geometry shader texture samplers");
2856       }
2857       if (proginfo->Parameters->NumParameters >
2858           MAX_GEOMETRY_UNIFORM_COMPONENTS / 4) {
2859          fail_link(shader_program, "Too many geometry shader constants");
2860       }
2861       break;
2862    case GL_FRAGMENT_PROGRAM_ARB:
2863       if (_mesa_bitcount(prog->samplers_used) >
2864           ctx->Const.MaxTextureImageUnits) {
2865          fail_link(shader_program, "Too many fragment shader texture samplers");
2866       }
2867       if (proginfo->Parameters->NumParameters > MAX_UNIFORMS) {
2868          fail_link(shader_program, "Too many fragment shader constants");
2869       }
2870       break;
2871    default:
2872       _mesa_problem(ctx, "unexpected program type in check_resources()");
2873    }
2874 }
2875
2876
2877
/** Pairs a uniform with the parameter position it must be added at. */
struct uniform_sort {
   struct gl_uniform *u;   /**< the uniform */
   int pos;                /**< its parameter position in the current target */
};

/* The shader_program->Uniforms list is almost sorted in increasing
 * uniform->{Frag,Vert}Pos locations, but not quite when there are
 * uniforms shared between targets.  We need to add parameters in
 * increasing order for the targets.
 *
 * qsort() comparator ordering uniform_sort entries by ascending pos.
 * Returns a negative, zero or positive value.  The result is computed
 * with comparisons rather than a subtraction so that it cannot overflow
 * for any pair of int positions.
 */
static int
sort_uniforms(const void *a, const void *b)
{
   const struct uniform_sort *u1 = (const struct uniform_sort *)a;
   const struct uniform_sort *u2 = (const struct uniform_sort *)b;

   return (u1->pos > u2->pos) - (u1->pos < u2->pos);
}
2896
2897 /* Add the uniforms to the parameters.  The linker chose locations
2898  * in our parameters lists (which weren't created yet), which the
2899  * uniforms code will use to poke values into our parameters list
2900  * when uniforms are updated.
2901  */
2902 static void
2903 add_uniforms_to_parameters_list(struct gl_shader_program *shader_program,
2904                                 struct gl_shader *shader,
2905                                 struct gl_program *prog)
2906 {
2907    unsigned int i;
2908    unsigned int next_sampler = 0, num_uniforms = 0;
2909    struct uniform_sort *sorted_uniforms;
2910
2911    sorted_uniforms = ralloc_array(NULL, struct uniform_sort,
2912                                   shader_program->Uniforms->NumUniforms);
2913
2914    for (i = 0; i < shader_program->Uniforms->NumUniforms; i++) {
2915       struct gl_uniform *uniform = shader_program->Uniforms->Uniforms + i;
2916       int parameter_index = -1;
2917
2918       switch (shader->Type) {
2919       case GL_VERTEX_SHADER:
2920          parameter_index = uniform->VertPos;
2921          break;
2922       case GL_FRAGMENT_SHADER:
2923          parameter_index = uniform->FragPos;
2924          break;
2925       case GL_GEOMETRY_SHADER:
2926          parameter_index = uniform->GeomPos;
2927          break;
2928       }
2929
2930       /* Only add uniforms used in our target. */
2931       if (parameter_index != -1) {
2932          sorted_uniforms[num_uniforms].pos = parameter_index;
2933          sorted_uniforms[num_uniforms].u = uniform;
2934          num_uniforms++;
2935       }
2936    }
2937
2938    qsort(sorted_uniforms, num_uniforms, sizeof(struct uniform_sort),
2939          sort_uniforms);
2940
2941    for (i = 0; i < num_uniforms; i++) {
2942       struct gl_uniform *uniform = sorted_uniforms[i].u;
2943       int parameter_index = sorted_uniforms[i].pos;
2944       const glsl_type *type = uniform->Type;
2945       unsigned int size;
2946
2947       if (type->is_vector() ||
2948           type->is_scalar()) {
2949          size = type->vector_elements;
2950       } else {
2951          size = type_size(type) * 4;
2952       }
2953
2954       gl_register_file file;
2955       if (type->is_sampler() ||
2956           (type->is_array() && type->fields.array->is_sampler())) {
2957          file = PROGRAM_SAMPLER;
2958       } else {
2959          file = PROGRAM_UNIFORM;
2960       }
2961
2962       GLint index = _mesa_lookup_parameter_index(prog->Parameters, -1,
2963                                                  uniform->Name);
2964
2965       if (index < 0) {
2966          index = _mesa_add_parameter(prog->Parameters, file,
2967                                      uniform->Name, size, type->gl_type,
2968                                      NULL, NULL, 0x0);
2969
2970          /* Sampler uniform values are stored in prog->SamplerUnits,
2971           * and the entry in that array is selected by this index we
2972           * store in ParameterValues[].
2973           */
2974          if (file == PROGRAM_SAMPLER) {
2975             for (unsigned int j = 0; j < size / 4; j++)
2976                prog->Parameters->ParameterValues[index + j][0].f = next_sampler++;
2977          }
2978
2979          /* The location chosen in the Parameters list here (returned
2980           * from _mesa_add_uniform) has to match what the linker chose.
2981           */
2982          if (index != parameter_index) {
2983             fail_link(shader_program, "Allocation of uniform `%s' to target "
2984                       "failed (%d vs %d)\n",
2985                       uniform->Name, index, parameter_index);
2986          }
2987       }
2988    }
2989
2990    ralloc_free(sorted_uniforms);
2991 }
2992
2993 static void
2994 set_uniform_initializer(struct gl_context *ctx, void *mem_ctx,
2995                         struct gl_shader_program *shader_program,
2996                         const char *name, const glsl_type *type,
2997                         ir_constant *val)
2998 {
2999    if (type->is_record()) {
3000       ir_constant *field_constant;
3001
3002       field_constant = (ir_constant *)val->components.get_head();
3003
3004       for (unsigned int i = 0; i < type->length; i++) {
3005          const glsl_type *field_type = type->fields.structure[i].type;
3006          const char *field_name = ralloc_asprintf(mem_ctx, "%s.%s", name,
3007                                             type->fields.structure[i].name);
3008          set_uniform_initializer(ctx, mem_ctx, shader_program, field_name,
3009                                  field_type, field_constant);
3010          field_constant = (ir_constant *)field_constant->next;
3011       }
3012       return;
3013    }
3014
3015    int loc = _mesa_get_uniform_location(ctx, shader_program, name);
3016
3017    if (loc == -1) {
3018       fail_link(shader_program,
3019                 "Couldn't find uniform for initializer %s\n", name);
3020       return;
3021    }
3022
3023    for (unsigned int i = 0; i < (type->is_array() ? type->length : 1); i++) {
3024       ir_constant *element;
3025       const glsl_type *element_type;
3026       if (type->is_array()) {
3027          element = val->array_elements[i];
3028          element_type = type->fields.array;
3029       } else {
3030          element = val;
3031          element_type = type;
3032       }
3033
3034       void *values;
3035
3036       if (element_type->base_type == GLSL_TYPE_BOOL) {
3037          int *conv = ralloc_array(mem_ctx, int, element_type->components());
3038          for (unsigned int j = 0; j < element_type->components(); j++) {
3039             conv[j] = element->value.b[j];
3040          }
3041          values = (void *)conv;
3042          element_type = glsl_type::get_instance(GLSL_TYPE_INT,
3043                                                 element_type->vector_elements,
3044                                                 1);
3045       } else {
3046          values = &element->value;
3047       }
3048
3049       if (element_type->is_matrix()) {
3050          _mesa_uniform_matrix(ctx, shader_program,
3051                               element_type->matrix_columns,
3052                               element_type->vector_elements,
3053                               loc, 1, GL_FALSE, (GLfloat *)values);
3054          loc += element_type->matrix_columns;
3055       } else {
3056          _mesa_uniform(ctx, shader_program, loc, element_type->matrix_columns,
3057                        values, element_type->gl_type);
3058          loc += type_size(element_type);
3059       }
3060    }
3061 }
3062
/*
 * Scan/rewrite program to remove reads of custom (output) registers.
 * The passed type has to be either PROGRAM_OUTPUT or PROGRAM_VARYING
 * (for vertex shaders).
 * In GLSL shaders, varying vars can be read and written.
 * On some hardware, trying to read an output register causes trouble.
 * So, rewrite the program to use a temporary register in this case.
 *
 * The rewrite works in three steps: every source operand in the given
 * register file is redirected to a free temporary (one temporary per
 * register index), every write to such a register is redirected to the
 * same temporary, and finally one MOV per redirected register is emitted
 * to copy the temporary into the real register.
 *
 * Based on _mesa_remove_output_reads from programopt.c.
 */
void
glsl_to_tgsi_visitor::remove_output_reads(gl_register_file type)
{
   GLuint i;
   /* outputMap[n] is the temporary standing in for register n of the
    * given file, or -1 when register n is never read.
    */
   GLint outputMap[VERT_RESULT_MAX];
   /* Data type of the first read of register n; only valid where
    * outputMap[n] != -1.
    */
   GLint outputTypes[VERT_RESULT_MAX];
   GLuint numVaryingReads = 0;
   GLboolean usedTemps[MAX_TEMPS];
   GLuint firstTemp = 0;

   _mesa_find_used_registers(prog, PROGRAM_TEMPORARY,
                             usedTemps, MAX_TEMPS);

   assert(type == PROGRAM_VARYING || type == PROGRAM_OUTPUT);
   assert(prog->Target == GL_VERTEX_PROGRAM_ARB || type != PROGRAM_VARYING);

   for (i = 0; i < VERT_RESULT_MAX; i++)
      outputMap[i] = -1;

   /* look for instructions which read from varying vars */
   foreach_iter(exec_list_iterator, iter, this->instructions) {
      glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
      const GLuint numSrc = num_inst_src_regs(inst->op);
      GLuint j;
      for (j = 0; j < numSrc; j++) {
         if (inst->src[j].file == type) {
            /* replace the read with a temp reg */
            /* NOTE(review): var indexes outputMap/outputTypes without a
             * bounds check; presumably always < VERT_RESULT_MAX - confirm.
             */
            const GLuint var = inst->src[j].index;
            if (outputMap[var] == -1) {
               numVaryingReads++;
               /* NOTE(review): the result of _mesa_find_free_register is
                * not checked here; if it can report "none free" with a
                * negative value, that value is used as a register index
                * below - confirm.
                */
               outputMap[var] = _mesa_find_free_register(usedTemps,
                                                         MAX_TEMPS,
                                                         firstTemp);
               outputTypes[var] = inst->src[j].type;
               /* Continue the next search after the temp just taken. */
               firstTemp = outputMap[var] + 1;
            }
            inst->src[j].file = PROGRAM_TEMPORARY;
            inst->src[j].index = outputMap[var];
         }
      }
   }

   if (numVaryingReads == 0)
      return; /* nothing to be done */

   /* look for instructions which write to the varying vars identified above */
   foreach_iter(exec_list_iterator, iter, this->instructions) {
      glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
      if (inst->dst.file == type && outputMap[inst->dst.index] >= 0) {
         /* change inst to write to the temp reg, instead of the varying */
         inst->dst.file = PROGRAM_TEMPORARY;
         inst->dst.index = outputMap[inst->dst.index];
      }
   }

   /* insert new MOV instructions at the end */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (outputMap[i] >= 0) {
         /* MOV VAR[i], TEMP[tmp]; */
         st_src_reg src = st_src_reg(PROGRAM_TEMPORARY, outputMap[i], outputTypes[i]);
         st_dst_reg dst = st_dst_reg(type, WRITEMASK_XYZW, outputTypes[i]);
         dst.index = i;
         this->emit(NULL, TGSI_OPCODE_MOV, dst, src);
      }
   }
}
3139
3140 /**
3141  * Returns the mask of channels (bitmask of WRITEMASK_X,Y,Z,W) which
3142  * are read from the given src in this instruction
3143  */
3144 static int
3145 get_src_arg_mask(st_dst_reg dst, st_src_reg src)
3146 {
3147    int read_mask = 0, comp;
3148
3149    /* Now, given the src swizzle and the written channels, find which
3150     * components are actually read
3151     */
3152    for (comp = 0; comp < 4; ++comp) {
3153       const unsigned coord = GET_SWZ(src.swizzle, comp);
3154       ASSERT(coord < 4);
3155       if (dst.writemask & (1 << comp) && coord <= SWIZZLE_W)
3156          read_mask |= 1 << coord;
3157    }
3158
3159    return read_mask;
3160 }
3161
3162 /**
3163  * This pass replaces CMP T0, T1 T2 T0 with MOV T0, T2 when the CMP
3164  * instruction is the first instruction to write to register T0.  There are
3165  * several lowering passes done in GLSL IR (e.g. branches and
3166  * relative addressing) that create a large number of conditional assignments
3167  * that ir_to_mesa converts to CMP instructions like the one mentioned above.
3168  *
3169  * Here is why this conversion is safe:
3170  * CMP T0, T1 T2 T0 can be expanded to:
3171  * if (T1 < 0.0)
3172  *      MOV T0, T2;
3173  * else
3174  *      MOV T0, T0;
3175  *
3176  * If (T1 < 0.0) evaluates to true then our replacement MOV T0, T2 is the same
3177  * as the original program.  If (T1 < 0.0) evaluates to false, executing
3178  * MOV T0, T0 will store a garbage value in T0 since T0 is uninitialized.
3179  * Therefore, it doesn't matter that we are replacing MOV T0, T0 with MOV T0, T2
3180  * because any instruction that was going to read from T0 after this was going
3181  * to read a garbage value anyway.
3182  */
3183 void
3184 glsl_to_tgsi_visitor::simplify_cmp(void)
3185 {
3186    unsigned tempWrites[MAX_TEMPS];
3187    unsigned outputWrites[MAX_PROGRAM_OUTPUTS];
3188
3189    memset(tempWrites, 0, sizeof(tempWrites));
3190    memset(outputWrites, 0, sizeof(outputWrites));
3191
3192    foreach_iter(exec_list_iterator, iter, this->instructions) {
3193       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3194       unsigned prevWriteMask = 0;
3195
3196       /* Give up if we encounter relative addressing or flow control. */
3197       if (inst->dst.reladdr ||
3198           tgsi_get_opcode_info(inst->op)->is_branch ||
3199           inst->op == TGSI_OPCODE_BGNSUB ||
3200           inst->op == TGSI_OPCODE_CONT ||
3201           inst->op == TGSI_OPCODE_END ||
3202           inst->op == TGSI_OPCODE_ENDSUB ||
3203           inst->op == TGSI_OPCODE_RET) {
3204          return;
3205       }
3206
3207       if (inst->dst.file == PROGRAM_OUTPUT) {
3208          assert(inst->dst.index < MAX_PROGRAM_OUTPUTS);
3209          prevWriteMask = outputWrites[inst->dst.index];
3210          outputWrites[inst->dst.index] |= inst->dst.writemask;
3211       } else if (inst->dst.file == PROGRAM_TEMPORARY) {
3212          assert(inst->dst.index < MAX_TEMPS);
3213          prevWriteMask = tempWrites[inst->dst.index];
3214          tempWrites[inst->dst.index] |= inst->dst.writemask;
3215       }
3216
3217       /* For a CMP to be considered a conditional write, the destination
3218        * register and source register two must be the same. */
3219       if (inst->op == TGSI_OPCODE_CMP
3220           && !(inst->dst.writemask & prevWriteMask)
3221           && inst->src[2].file == inst->dst.file
3222           && inst->src[2].index == inst->dst.index
3223           && inst->dst.writemask == get_src_arg_mask(inst->dst, inst->src[2])) {
3224
3225          inst->op = TGSI_OPCODE_MOV;
3226          inst->src[0] = inst->src[1];
3227       }
3228    }
3229 }
3230
3231 /* Replaces all references to a temporary register index with another index. */
3232 void
3233 glsl_to_tgsi_visitor::rename_temp_register(int index, int new_index)
3234 {
3235    foreach_iter(exec_list_iterator, iter, this->instructions) {
3236       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3237       unsigned j;
3238
3239       for (j=0; j < num_inst_src_regs(inst->op); j++) {
3240          if (inst->src[j].file == PROGRAM_TEMPORARY &&
3241              inst->src[j].index == index) {
3242             inst->src[j].index = new_index;
3243          }
3244       }
3245
3246       if (inst->dst.file == PROGRAM_TEMPORARY && inst->dst.index == index) {
3247          inst->dst.index = new_index;
3248       }
3249    }
3250 }
3251
3252 int
3253 glsl_to_tgsi_visitor::get_first_temp_read(int index)
3254 {
3255    int depth = 0; /* loop depth */
3256    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
3257    unsigned i = 0, j;
3258
3259    foreach_iter(exec_list_iterator, iter, this->instructions) {
3260       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3261
3262       for (j=0; j < num_inst_src_regs(inst->op); j++) {
3263          if (inst->src[j].file == PROGRAM_TEMPORARY &&
3264              inst->src[j].index == index) {
3265             return (depth == 0) ? i : loop_start;
3266          }
3267       }
3268
3269       if (inst->op == TGSI_OPCODE_BGNLOOP) {
3270          if(depth++ == 0)
3271             loop_start = i;
3272       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
3273          if (--depth == 0)
3274             loop_start = -1;
3275       }
3276       assert(depth >= 0);
3277
3278       i++;
3279    }
3280
3281    return -1;
3282 }
3283
3284 int
3285 glsl_to_tgsi_visitor::get_first_temp_write(int index)
3286 {
3287    int depth = 0; /* loop depth */
3288    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
3289    int i = 0;
3290
3291    foreach_iter(exec_list_iterator, iter, this->instructions) {
3292       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3293
3294       if (inst->dst.file == PROGRAM_TEMPORARY && inst->dst.index == index) {
3295          return (depth == 0) ? i : loop_start;
3296       }
3297
3298       if (inst->op == TGSI_OPCODE_BGNLOOP) {
3299          if(depth++ == 0)
3300             loop_start = i;
3301       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
3302          if (--depth == 0)
3303             loop_start = -1;
3304       }
3305       assert(depth >= 0);
3306
3307       i++;
3308    }
3309
3310    return -1;
3311 }
3312
3313 int
3314 glsl_to_tgsi_visitor::get_last_temp_read(int index)
3315 {
3316    int depth = 0; /* loop depth */
3317    int last = -1; /* index of last instruction that reads the temporary */
3318    unsigned i = 0, j;
3319
3320    foreach_iter(exec_list_iterator, iter, this->instructions) {
3321       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3322
3323       for (j=0; j < num_inst_src_regs(inst->op); j++) {
3324          if (inst->src[j].file == PROGRAM_TEMPORARY &&
3325              inst->src[j].index == index) {
3326             last = (depth == 0) ? i : -2;
3327          }
3328       }
3329
3330       if (inst->op == TGSI_OPCODE_BGNLOOP)
3331          depth++;
3332       else if (inst->op == TGSI_OPCODE_ENDLOOP)
3333          if (--depth == 0 && last == -2)
3334             last = i;
3335       assert(depth >= 0);
3336
3337       i++;
3338    }
3339
3340    assert(last >= -1);
3341    return last;
3342 }
3343
3344 int
3345 glsl_to_tgsi_visitor::get_last_temp_write(int index)
3346 {
3347    int depth = 0; /* loop depth */
3348    int last = -1; /* index of last instruction that writes to the temporary */
3349    int i = 0;
3350
3351    foreach_iter(exec_list_iterator, iter, this->instructions) {
3352       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3353
3354       if (inst->dst.file == PROGRAM_TEMPORARY && inst->dst.index == index)
3355          last = (depth == 0) ? i : -2;
3356
3357       if (inst->op == TGSI_OPCODE_BGNLOOP)
3358          depth++;
3359       else if (inst->op == TGSI_OPCODE_ENDLOOP)
3360          if (--depth == 0 && last == -2)
3361             last = i;
3362       assert(depth >= 0);
3363
3364       i++;
3365    }
3366
3367    assert(last >= -1);
3368    return last;
3369 }
3370
/*
 * On a basic block basis, tracks available PROGRAM_TEMPORARY register
 * channels for copy propagation and updates following instructions to
 * use the original versions.
 *
 * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
 * will occur.  As an example, a TXP production before this pass:
 *
 * 0: MOV TEMP[1], INPUT[4].xyyy;
 * 1: MOV TEMP[1].w, INPUT[4].wwww;
 * 2: TXP TEMP[2], TEMP[1], texture[0], 2D;
 *
 * and after:
 *
 * 0: MOV TEMP[1], INPUT[4].xyyy;
 * 1: MOV TEMP[1].w, INPUT[4].wwww;
 * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
 *
 * which allows for dead code elimination on TEMP[1]'s writes.
 *
 * Bookkeeping: acp ("available copies") has one slot per temporary
 * channel, indexed as 4 * temp_index + channel.  A slot holds the MOV
 * instruction whose copy into that channel is still valid, or NULL.
 * acp_level records the IF nesting level at which each slot was filled.
 */
void
glsl_to_tgsi_visitor::copy_propagate(void)
{
   glsl_to_tgsi_instruction **acp = rzalloc_array(mem_ctx,
                                                    glsl_to_tgsi_instruction *,
                                                    this->next_temp * 4);
   int *acp_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
   /* Current IF nesting depth. */
   int level = 0;

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();

      assert(inst->dst.file != PROGRAM_TEMPORARY
             || inst->dst.index < this->next_temp);

      /* First, do any copy propagation possible into the src regs. */
      for (int r = 0; r < 3; r++) {
         glsl_to_tgsi_instruction *first = NULL;
         bool good = true;
         int acp_base = inst->src[r].index * 4;

         /* Only direct reads of temporaries are candidates. */
         if (inst->src[r].file != PROGRAM_TEMPORARY ||
             inst->src[r].reladdr)
            continue;

         /* See if we can find entries in the ACP consisting of MOVs
          * from the same src register for all the swizzled channels
          * of this src register reference.
          */
         for (int i = 0; i < 4; i++) {
            int src_chan = GET_SWZ(inst->src[r].swizzle, i);
            glsl_to_tgsi_instruction *copy_chan = acp[acp_base + src_chan];

            if (!copy_chan) {
               good = false;
               break;
            }

            assert(acp_level[acp_base + src_chan] <= level);

            if (!first) {
               first = copy_chan;
            } else {
               /* Every channel must come from the same register. */
               if (first->src[0].file != copy_chan->src[0].file ||
                   first->src[0].index != copy_chan->src[0].index) {
                  good = false;
                  break;
               }
            }
         }

         if (good) {
            /* We've now validated that we can copy-propagate to
             * replace this src register reference.  Do it.
             */
            inst->src[r].file = first->src[0].file;
            inst->src[r].index = first->src[0].index;

            /* Compose the swizzles: for each channel, take the component
             * that the recorded MOV read for the temp channel we used.
             * inst->src[r].swizzle still holds the old swizzle inside
             * this loop; it is replaced only afterwards.
             */
            int swizzle = 0;
            for (int i = 0; i < 4; i++) {
               int src_chan = GET_SWZ(inst->src[r].swizzle, i);
               glsl_to_tgsi_instruction *copy_inst = acp[acp_base + src_chan];
               swizzle |= (GET_SWZ(copy_inst->src[0].swizzle, src_chan) <<
                           (3 * i));
            }
            inst->src[r].swizzle = swizzle;
         }
      }

      /* Second, invalidate ACP entries according to what this
       * instruction does.
       */
      switch (inst->op) {
      case TGSI_OPCODE_BGNLOOP:
      case TGSI_OPCODE_ENDLOOP:
         /* End of a basic block, clear the ACP entirely. */
         memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
         break;

      case TGSI_OPCODE_IF:
         ++level;
         break;

      case TGSI_OPCODE_ENDIF:
      case TGSI_OPCODE_ELSE:
         /* Clear all channels written inside the block from the ACP, but
          * leaving those that were not touched.
          */
         for (int r = 0; r < this->next_temp; r++) {
            for (int c = 0; c < 4; c++) {
               if (!acp[4 * r + c])
                  continue;

               /* Recorded at the current (or a deeper) IF level. */
               if (acp_level[4 * r + c] >= level)
                  acp[4 * r + c] = NULL;
            }
         }
         if (inst->op == TGSI_OPCODE_ENDIF)
            --level;
         break;

      default:
         /* Continuing the block, clear any written channels from
          * the ACP.
          */
         if (inst->dst.file == PROGRAM_TEMPORARY && inst->dst.reladdr) {
            /* Any temporary might be written, so no copy propagation
             * across this instruction.
             */
            memset(acp, 0, sizeof(*acp) * this->next_temp * 4);
         } else if (inst->dst.file == PROGRAM_OUTPUT &&
                    inst->dst.reladdr) {
            /* Any output might be written, so no copy propagation
             * from outputs across this instruction.
             */
            for (int r = 0; r < this->next_temp; r++) {
               for (int c = 0; c < 4; c++) {
                  if (!acp[4 * r + c])
                     continue;

                  if (acp[4 * r + c]->src[0].file == PROGRAM_OUTPUT)
                     acp[4 * r + c] = NULL;
               }
            }
         } else if (inst->dst.file == PROGRAM_TEMPORARY ||
                    inst->dst.file == PROGRAM_OUTPUT) {
            /* Clear where it's used as dst. */
            if (inst->dst.file == PROGRAM_TEMPORARY) {
               for (int c = 0; c < 4; c++) {
                  if (inst->dst.writemask & (1 << c)) {
                     acp[4 * inst->dst.index + c] = NULL;
                  }
               }
            }

            /* Clear where it's used as src. */
            for (int r = 0; r < this->next_temp; r++) {
               for (int c = 0; c < 4; c++) {
                  if (!acp[4 * r + c])
                     continue;

                  /* Component of the MOV's source that fed channel c. */
                  int src_chan = GET_SWZ(acp[4 * r + c]->src[0].swizzle, c);

                  if (acp[4 * r + c]->src[0].file == inst->dst.file &&
                      acp[4 * r + c]->src[0].index == inst->dst.index &&
                      inst->dst.writemask & (1 << src_chan))
                  {
                     acp[4 * r + c] = NULL;
                  }
               }
            }
         }
         break;
      }

      /* If this is a copy, add it to the ACP. */
      /* Only plain MOVs qualify: no saturate, no negate and no relative
       * addressing on either side.
       */
      if (inst->op == TGSI_OPCODE_MOV &&
          inst->dst.file == PROGRAM_TEMPORARY &&
          !inst->dst.reladdr &&
          !inst->saturate &&
          !inst->src[0].reladdr &&
          !inst->src[0].negate) {
         for (int i = 0; i < 4; i++) {
            if (inst->dst.writemask & (1 << i)) {
               acp[4 * inst->dst.index + i] = inst;
               acp_level[4 * inst->dst.index + i] = level;
            }
         }
      }
   }

   ralloc_free(acp_level);
   ralloc_free(acp);
}
3562
3563 /*
3564  * Tracks available PROGRAM_TEMPORARY registers for dead code elimination.
3565  *
3566  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
3567  * will occur.  As an example, a TXP production after copy propagation but
3568  * before this pass:
3569  *
3570  * 0: MOV TEMP[1], INPUT[4].xyyy;
3571  * 1: MOV TEMP[1].w, INPUT[4].wwww;
3572  * 2: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
3573  *
3574  * and after this pass:
3575  *
3576  * 0: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
3577  *
3578  * FIXME: assumes that all functions are inlined (no support for BGNSUB/ENDSUB)
3579  * FIXME: doesn't eliminate all dead code inside of loops; it steps around them
3580  */
3581 void
3582 glsl_to_tgsi_visitor::eliminate_dead_code(void)
3583 {
3584    int i;
3585
3586    for (i=0; i < this->next_temp; i++) {
3587       int last_read = get_last_temp_read(i);
3588       int j = 0;
3589
3590       foreach_iter(exec_list_iterator, iter, this->instructions) {
3591          glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3592
3593          if (inst->dst.file == PROGRAM_TEMPORARY && inst->dst.index == i &&
3594              j > last_read)
3595          {
3596             iter.remove();
3597             delete inst;
3598          }
3599
3600          j++;
3601       }
3602    }
3603 }
3604
3605 /*
3606  * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for dead
3607  * code elimination.  This is less primitive than eliminate_dead_code(), as it
3608  * is per-channel and can detect consecutive writes without a read between them
3609  * as dead code.  However, there is some dead code that can be eliminated by
3610  * eliminate_dead_code() but not this function - for example, this function
3611  * cannot eliminate an instruction writing to a register that is never read and
3612  * is the only instruction writing to that register.
3613  *
3614  * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
3615  * will occur.
3616  */
3617 int
3618 glsl_to_tgsi_visitor::eliminate_dead_code_advanced(void)
3619 {
3620    glsl_to_tgsi_instruction **writes = rzalloc_array(mem_ctx,
3621                                                      glsl_to_tgsi_instruction *,
3622                                                      this->next_temp * 4);
3623    int *write_level = rzalloc_array(mem_ctx, int, this->next_temp * 4);
3624    int level = 0;
3625    int removed = 0;
3626
3627    foreach_iter(exec_list_iterator, iter, this->instructions) {
3628       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3629
3630       assert(inst->dst.file != PROGRAM_TEMPORARY
3631              || inst->dst.index < this->next_temp);
3632
3633       switch (inst->op) {
3634       case TGSI_OPCODE_BGNLOOP:
3635       case TGSI_OPCODE_ENDLOOP:
3636          /* End of a basic block, clear the write array entirely.
3637           * FIXME: This keeps us from killing dead code when the writes are
3638           * on either side of a loop, even when the register isn't touched
3639           * inside the loop.
3640           */
3641          memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
3642          break;
3643
3644       case TGSI_OPCODE_ENDIF:
3645          --level;
3646          break;
3647
3648       case TGSI_OPCODE_ELSE:
3649          /* Clear all channels written inside the preceding if block from the
3650           * write array, but leave those that were not touched.
3651           *
3652           * FIXME: This destroys opportunities to remove dead code inside of
3653           * IF blocks that are followed by an ELSE block.
3654           */
3655          for (int r = 0; r < this->next_temp; r++) {
3656             for (int c = 0; c < 4; c++) {
3657                if (!writes[4 * r + c])
3658                          continue;
3659
3660                if (write_level[4 * r + c] >= level)
3661                          writes[4 * r + c] = NULL;
3662             }
3663          }
3664          break;
3665
3666       case TGSI_OPCODE_IF:
3667          ++level;
3668          /* fallthrough to default case to mark the condition as read */
3669
3670       default:
3671          /* Continuing the block, clear any channels from the write array that
3672           * are read by this instruction.
3673           */
3674          for (unsigned i = 0; i < Elements(inst->src); i++) {
3675             if (inst->src[i].file == PROGRAM_TEMPORARY && inst->src[i].reladdr){
3676                /* Any temporary might be read, so no dead code elimination
3677                 * across this instruction.
3678                 */
3679                memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
3680             } else if (inst->src[i].file == PROGRAM_TEMPORARY) {
3681                /* Clear where it's used as src. */
3682                int src_chans = 1 << GET_SWZ(inst->src[i].swizzle, 0);
3683                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 1);
3684                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 2);
3685                src_chans |= 1 << GET_SWZ(inst->src[i].swizzle, 3);
3686
3687                for (int c = 0; c < 4; c++) {
3688                    if (src_chans & (1 << c)) {
3689                       writes[4 * inst->src[i].index + c] = NULL;
3690                    }
3691                }
3692             }
3693          }
3694          break;
3695       }
3696
3697       /* If this instruction writes to a temporary, add it to the write array.
3698        * If there is already an instruction in the write array for one or more
3699        * of the channels, flag that channel write as dead.
3700        */
3701       if (inst->dst.file == PROGRAM_TEMPORARY &&
3702           !inst->dst.reladdr &&
3703           !inst->saturate) {
3704          for (int c = 0; c < 4; c++) {
3705             if (inst->dst.writemask & (1 << c)) {
3706                if (writes[4 * inst->dst.index + c]) {
3707                   if (write_level[4 * inst->dst.index + c] < level)
3708                      continue;
3709                   else
3710                      writes[4 * inst->dst.index + c]->dead_mask |= (1 << c);
3711                }
3712                writes[4 * inst->dst.index + c] = inst;
3713                write_level[4 * inst->dst.index + c] = level;
3714             }
3715          }
3716       }
3717    }
3718
3719    /* Anything still in the write array at this point is dead code. */
3720    for (int r = 0; r < this->next_temp; r++) {
3721       for (int c = 0; c < 4; c++) {
3722          glsl_to_tgsi_instruction *inst = writes[4 * r + c];
3723          if (inst)
3724             inst->dead_mask |= (1 << c);
3725       }
3726    }
3727
3728    /* Now actually remove the instructions that are completely dead and update
3729     * the writemask of other instructions with dead channels.
3730     */
3731    foreach_iter(exec_list_iterator, iter, this->instructions) {
3732       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3733
3734       if (!inst->dead_mask || !inst->dst.writemask)
3735          continue;
3736       else if (inst->dead_mask == inst->dst.writemask) {
3737          iter.remove();
3738          delete inst;
3739          removed++;
3740       } else
3741          inst->dst.writemask &= ~(inst->dead_mask);
3742    }
3743
3744    ralloc_free(write_level);
3745    ralloc_free(writes);
3746
3747    return removed;
3748 }
3749
3750 /* Merges temporary registers together where possible to reduce the number of
3751  * registers needed to run a program.
3752  *
3753  * Produces optimal code only after copy propagation and dead code elimination
3754  * have been run. */
3755 void
3756 glsl_to_tgsi_visitor::merge_registers(void)
3757 {
3758    int *last_reads = rzalloc_array(mem_ctx, int, this->next_temp);
3759    int *first_writes = rzalloc_array(mem_ctx, int, this->next_temp);
3760    int i, j;
3761
3762    /* Read the indices of the last read and first write to each temp register
3763     * into an array so that we don't have to traverse the instruction list as
3764     * much. */
3765    for (i=0; i < this->next_temp; i++) {
3766       last_reads[i] = get_last_temp_read(i);
3767       first_writes[i] = get_first_temp_write(i);
3768    }
3769
3770    /* Start looking for registers with non-overlapping usages that can be
3771     * merged together. */
3772    for (i=0; i < this->next_temp; i++) {
3773       /* Don't touch unused registers. */
3774       if (last_reads[i] < 0 || first_writes[i] < 0) continue;
3775
3776       for (j=0; j < this->next_temp; j++) {
3777          /* Don't touch unused registers. */
3778          if (last_reads[j] < 0 || first_writes[j] < 0) continue;
3779
3780          /* We can merge the two registers if the first write to j is after or
3781           * in the same instruction as the last read from i.  Note that the
3782           * register at index i will always be used earlier or at the same time
3783           * as the register at index j. */
3784          if (first_writes[i] <= first_writes[j] &&
3785              last_reads[i] <= first_writes[j])
3786          {
3787             rename_temp_register(j, i); /* Replace all references to j with i.*/
3788
3789             /* Update the first_writes and last_reads arrays with the new
3790              * values for the merged register index, and mark the newly unused
3791              * register index as such. */
3792             last_reads[i] = last_reads[j];
3793             first_writes[j] = -1;
3794             last_reads[j] = -1;
3795          }
3796       }
3797    }
3798
3799    ralloc_free(last_reads);
3800    ralloc_free(first_writes);
3801 }
3802
3803 /* Reassign indices to temporary registers by reusing unused indices created
3804  * by optimization passes. */
3805 void
3806 glsl_to_tgsi_visitor::renumber_registers(void)
3807 {
3808    int i = 0;
3809    int new_index = 0;
3810
3811    for (i=0; i < this->next_temp; i++) {
3812       if (get_first_temp_read(i) < 0) continue;
3813       if (i != new_index)
3814          rename_temp_register(i, new_index);
3815       new_index++;
3816    }
3817
3818    this->next_temp = new_index;
3819 }
3820
3821 /**
3822  * Returns a fragment program which implements the current pixel transfer ops.
3823  * Based on get_pixel_transfer_program in st_atom_pixeltransfer.c.
3824  */
3825 extern "C" void
3826 get_pixel_transfer_visitor(struct st_fragment_program *fp,
3827                            glsl_to_tgsi_visitor *original,
3828                            int scale_and_bias, int pixel_maps)
3829 {
3830    glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor();
3831    struct st_context *st = st_context(original->ctx);
3832    struct gl_program *prog = &fp->Base.Base;
3833    struct gl_program_parameter_list *params = _mesa_new_parameter_list();
3834    st_src_reg coord, src0;
3835    st_dst_reg dst0;
3836    glsl_to_tgsi_instruction *inst;
3837
3838    /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
3839    v->ctx = original->ctx;
3840    v->prog = prog;
3841    v->glsl_version = original->glsl_version;
3842    v->native_integers = original->native_integers;
3843    v->options = original->options;
3844    v->next_temp = original->next_temp;
3845    v->num_address_regs = original->num_address_regs;
3846    v->samplers_used = prog->SamplersUsed = original->samplers_used;
3847    v->indirect_addr_temps = original->indirect_addr_temps;
3848    v->indirect_addr_consts = original->indirect_addr_consts;
3849    memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
3850
3851    /*
3852     * Get initial pixel color from the texture.
3853     * TEX colorTemp, fragment.texcoord[0], texture[0], 2D;
3854     */
3855    coord = st_src_reg(PROGRAM_INPUT, FRAG_ATTRIB_TEX0, glsl_type::vec2_type);
3856    src0 = v->get_temp(glsl_type::vec4_type);
3857    dst0 = st_dst_reg(src0);
3858    inst = v->emit(NULL, TGSI_OPCODE_TEX, dst0, coord);
3859    inst->sampler = 0;
3860    inst->tex_target = TEXTURE_2D_INDEX;
3861
3862    prog->InputsRead |= (1 << FRAG_ATTRIB_TEX0);
3863    prog->SamplersUsed |= (1 << 0); /* mark sampler 0 as used */
3864    v->samplers_used |= (1 << 0);
3865
3866    if (scale_and_bias) {
3867       static const gl_state_index scale_state[STATE_LENGTH] =
3868          { STATE_INTERNAL, STATE_PT_SCALE,
3869            (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 };
3870       static const gl_state_index bias_state[STATE_LENGTH] =
3871          { STATE_INTERNAL, STATE_PT_BIAS,
3872            (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 };
3873       GLint scale_p, bias_p;
3874       st_src_reg scale, bias;
3875
3876       scale_p = _mesa_add_state_reference(params, scale_state);
3877       bias_p = _mesa_add_state_reference(params, bias_state);
3878
3879       /* MAD colorTemp, colorTemp, scale, bias; */
3880       scale = st_src_reg(PROGRAM_STATE_VAR, scale_p, GLSL_TYPE_FLOAT);
3881       bias = st_src_reg(PROGRAM_STATE_VAR, bias_p, GLSL_TYPE_FLOAT);
3882       inst = v->emit(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias);
3883    }
3884
3885    if (pixel_maps) {
3886       st_src_reg temp = v->get_temp(glsl_type::vec4_type);
3887       st_dst_reg temp_dst = st_dst_reg(temp);
3888
3889       assert(st->pixel_xfer.pixelmap_texture);
3890
3891       /* With a little effort, we can do four pixel map look-ups with
3892        * two TEX instructions:
3893        */
3894
3895       /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */
3896       temp_dst.writemask = WRITEMASK_XY; /* write R,G */
3897       inst = v->emit(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
3898       inst->sampler = 1;
3899       inst->tex_target = TEXTURE_2D_INDEX;
3900
3901       /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */
3902       src0.swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W);
3903       temp_dst.writemask = WRITEMASK_ZW; /* write B,A */
3904       inst = v->emit(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
3905       inst->sampler = 1;
3906       inst->tex_target = TEXTURE_2D_INDEX;
3907
3908       prog->SamplersUsed |= (1 << 1); /* mark sampler 1 as used */
3909       v->samplers_used |= (1 << 1);
3910
3911       /* MOV colorTemp, temp; */
3912       inst = v->emit(NULL, TGSI_OPCODE_MOV, dst0, temp);
3913    }
3914
3915    /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
3916     * new visitor. */
3917    foreach_iter(exec_list_iterator, iter, original->instructions) {
3918       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
3919       st_src_reg src_regs[3];
3920
3921       if (inst->dst.file == PROGRAM_OUTPUT)
3922          prog->OutputsWritten |= BITFIELD64_BIT(inst->dst.index);
3923
3924       for (int i=0; i<3; i++) {
3925          src_regs[i] = inst->src[i];
3926          if (src_regs[i].file == PROGRAM_INPUT &&
3927              src_regs[i].index == FRAG_ATTRIB_COL0)
3928          {
3929             src_regs[i].file = PROGRAM_TEMPORARY;
3930             src_regs[i].index = src0.index;
3931          }
3932          else if (src_regs[i].file == PROGRAM_INPUT)
3933             prog->InputsRead |= (1 << src_regs[i].index);
3934       }
3935
3936       v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
3937    }
3938
3939    /* Make modifications to fragment program info. */
3940    prog->Parameters = _mesa_combine_parameter_lists(params,
3941                                                     original->prog->Parameters);
3942    prog->Attributes = _mesa_clone_parameter_list(original->prog->Attributes);
3943    prog->Varying = _mesa_clone_parameter_list(original->prog->Varying);
3944    _mesa_free_parameter_list(params);
3945    count_resources(v, prog);
3946    fp->glsl_to_tgsi = v;
3947 }
3948
3949 /**
3950  * Make fragment program for glBitmap:
3951  *   Sample the texture and kill the fragment if the bit is 0.
3952  * This program will be combined with the user's fragment program.
3953  *
3954  * Based on make_bitmap_fragment_program in st_cb_bitmap.c.
3955  */
3956 extern "C" void
3957 get_bitmap_visitor(struct st_fragment_program *fp,
3958                    glsl_to_tgsi_visitor *original, int samplerIndex)
3959 {
3960    glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor();
3961    struct st_context *st = st_context(original->ctx);
3962    struct gl_program *prog = &fp->Base.Base;
3963    st_src_reg coord, src0;
3964    st_dst_reg dst0;
3965    glsl_to_tgsi_instruction *inst;
3966
3967    /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
3968    v->ctx = original->ctx;
3969    v->prog = prog;
3970    v->glsl_version = original->glsl_version;
3971    v->native_integers = original->native_integers;
3972    v->options = original->options;
3973    v->next_temp = original->next_temp;
3974    v->num_address_regs = original->num_address_regs;
3975    v->samplers_used = prog->SamplersUsed = original->samplers_used;
3976    v->indirect_addr_temps = original->indirect_addr_temps;
3977    v->indirect_addr_consts = original->indirect_addr_consts;
3978    memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
3979
3980    /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
3981    coord = st_src_reg(PROGRAM_INPUT, FRAG_ATTRIB_TEX0, glsl_type::vec2_type);
3982    src0 = v->get_temp(glsl_type::vec4_type);
3983    dst0 = st_dst_reg(src0);
3984    inst = v->emit(NULL, TGSI_OPCODE_TEX, dst0, coord);
3985    inst->sampler = samplerIndex;
3986    inst->tex_target = TEXTURE_2D_INDEX;
3987
3988    prog->InputsRead |= (1 << FRAG_ATTRIB_TEX0);
3989    prog->SamplersUsed |= (1 << samplerIndex); /* mark sampler as used */
3990    v->samplers_used |= (1 << samplerIndex);
3991
3992    /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */
3993    src0.negate = NEGATE_XYZW;
3994    if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM)
3995       src0.swizzle = SWIZZLE_XXXX;
3996    inst = v->emit(NULL, TGSI_OPCODE_KIL, undef_dst, src0);
3997
3998    /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
3999     * new visitor. */
4000    foreach_iter(exec_list_iterator, iter, original->instructions) {
4001       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
4002       st_src_reg src_regs[3];
4003
4004       if (inst->dst.file == PROGRAM_OUTPUT)
4005          prog->OutputsWritten |= BITFIELD64_BIT(inst->dst.index);
4006
4007       for (int i=0; i<3; i++) {
4008          src_regs[i] = inst->src[i];
4009          if (src_regs[i].file == PROGRAM_INPUT)
4010             prog->InputsRead |= (1 << src_regs[i].index);
4011       }
4012
4013       v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
4014    }
4015
4016    /* Make modifications to fragment program info. */
4017    prog->Parameters = _mesa_clone_parameter_list(original->prog->Parameters);
4018    prog->Attributes = _mesa_clone_parameter_list(original->prog->Attributes);
4019    prog->Varying = _mesa_clone_parameter_list(original->prog->Varying);
4020    count_resources(v, prog);
4021    fp->glsl_to_tgsi = v;
4022 }
4023
/* ------------------------- TGSI conversion stuff -------------------------- */

/**
 * One branch recorded by get_label() while instructions are being emitted.
 */
struct label {
   unsigned branch_target;   /**< target passed to get_label() */
   unsigned token;           /**< slot whose address get_label() returns */
};
4029
4030 /**
4031  * Intermediate state used during shader translation.
4032  */
4033 struct st_translate {
4034    struct ureg_program *ureg;
4035
4036    struct ureg_dst temps[MAX_TEMPS];
4037    struct ureg_src *constants;
4038    struct ureg_src *immediates;
4039    struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS];
4040    struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
4041    struct ureg_dst address[1];
4042    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
4043    struct ureg_src systemValues[SYSTEM_VALUE_MAX];
4044
4045    /* Extra info for handling point size clamping in vertex shader */
4046    struct ureg_dst pointSizeResult; /**< Actual point size output register */
4047    struct ureg_src pointSizeConst;  /**< Point size range constant register */
4048    GLint pointSizeOutIndex;         /**< Temp point size output register */
4049    GLboolean prevInstWrotePointSize;
4050
4051    const GLuint *inputMapping;
4052    const GLuint *outputMapping;
4053
4054    /* For every instruction that contains a label (eg CALL), keep
4055     * details so that we can go back afterwards and emit the correct
4056     * tgsi instruction number for each label.
4057     */
4058    struct label *labels;
4059    unsigned labels_size;
4060    unsigned labels_count;
4061
4062    /* Keep a record of the tgsi instruction number that each mesa
4063     * instruction starts at, will be used to fix up labels after
4064     * translation.
4065     */
4066    unsigned *insn;
4067    unsigned insn_size;
4068    unsigned insn_count;
4069
4070    unsigned procType;  /**< TGSI_PROCESSOR_VERTEX/FRAGMENT */
4071
4072    boolean error;
4073 };
4074
4075 /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
4076 static unsigned mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
4077    TGSI_SEMANTIC_FACE,
4078    TGSI_SEMANTIC_INSTANCEID
4079 };
4080
4081 /**
4082  * Make note of a branch to a label in the TGSI code.
4083  * After we've emitted all instructions, we'll go over the list
4084  * of labels built here and patch the TGSI code with the actual
4085  * location of each label.
4086  */
4087 static unsigned *get_label(struct st_translate *t, unsigned branch_target)
4088 {
4089    unsigned i;
4090
4091    if (t->labels_count + 1 >= t->labels_size) {
4092       t->labels_size = 1 << (util_logbase2(t->labels_size) + 1);
4093       t->labels = (struct label *)realloc(t->labels,
4094                                           t->labels_size * sizeof(struct label));
4095       if (t->labels == NULL) {
4096          static unsigned dummy;
4097          t->error = TRUE;
4098          return &dummy;
4099       }
4100    }
4101
4102    i = t->labels_count++;
4103    t->labels[i].branch_target = branch_target;
4104    return &t->labels[i].token;
4105 }
4106
4107 /**
4108  * Called prior to emitting the TGSI code for each instruction.
4109  * Allocate additional space for instructions if needed.
4110  * Update the insn[] array so the next glsl_to_tgsi_instruction points to
4111  * the next TGSI instruction.
4112  */
4113 static void set_insn_start(struct st_translate *t, unsigned start)
4114 {
4115    if (t->insn_count + 1 >= t->insn_size) {
4116       t->insn_size = 1 << (util_logbase2(t->insn_size) + 1);
4117       t->insn = (unsigned *)realloc(t->insn, t->insn_size * sizeof(t->insn[0]));
4118       if (t->insn == NULL) {
4119          t->error = TRUE;
4120          return;
4121       }
4122    }
4123
4124    t->insn[t->insn_count++] = start;
4125 }
4126
4127 /**
4128  * Map a glsl_to_tgsi constant/immediate to a TGSI immediate.
4129  */
4130 static struct ureg_src
4131 emit_immediate(struct st_translate *t,
4132                gl_constant_value values[4],
4133                int type, int size)
4134 {
4135    struct ureg_program *ureg = t->ureg;
4136
4137    switch(type)
4138    {
4139    case GL_FLOAT:
4140       return ureg_DECL_immediate(ureg, &values[0].f, size);
4141    case GL_INT:
4142       return ureg_DECL_immediate_int(ureg, &values[0].i, size);
4143    case GL_UNSIGNED_INT:
4144    case GL_BOOL:
4145       return ureg_DECL_immediate_uint(ureg, &values[0].u, size);
4146    default:
4147       assert(!"should not get here - type must be float, int, uint, or bool");
4148       return ureg_src_undef();
4149    }
4150 }
4151
4152 /**
4153  * Map a glsl_to_tgsi dst register to a TGSI ureg_dst register.
4154  */
4155 static struct ureg_dst
4156 dst_register(struct st_translate *t,
4157              gl_register_file file,
4158              GLuint index)
4159 {
4160    switch(file) {
4161    case PROGRAM_UNDEFINED:
4162       return ureg_dst_undef();
4163
4164    case PROGRAM_TEMPORARY:
4165       if (ureg_dst_is_undef(t->temps[index]))
4166          t->temps[index] = ureg_DECL_temporary(t->ureg);
4167
4168       return t->temps[index];
4169
4170    case PROGRAM_OUTPUT:
4171       if (t->procType == TGSI_PROCESSOR_VERTEX && index == VERT_RESULT_PSIZ)
4172          t->prevInstWrotePointSize = GL_TRUE;
4173
4174       if (t->procType == TGSI_PROCESSOR_VERTEX)
4175          assert(index < VERT_RESULT_MAX);
4176       else if (t->procType == TGSI_PROCESSOR_FRAGMENT)
4177          assert(index < FRAG_RESULT_MAX);
4178       else
4179          assert(index < GEOM_RESULT_MAX);
4180
4181       assert(t->outputMapping[index] < Elements(t->outputs));
4182
4183       return t->outputs[t->outputMapping[index]];
4184
4185    case PROGRAM_ADDRESS:
4186       return t->address[index];
4187
4188    default:
4189       assert(!"unknown dst register file");
4190       return ureg_dst_undef();
4191    }
4192 }
4193
4194 /**
4195  * Map a glsl_to_tgsi src register to a TGSI ureg_src register.
4196  */
4197 static struct ureg_src
4198 src_register(struct st_translate *t,
4199              gl_register_file file,
4200              GLuint index)
4201 {
4202    switch(file) {
4203    case PROGRAM_UNDEFINED:
4204       return ureg_src_undef();
4205
4206    case PROGRAM_TEMPORARY:
4207       assert(index >= 0);
4208       assert(index < Elements(t->temps));
4209       if (ureg_dst_is_undef(t->temps[index]))
4210          t->temps[index] = ureg_DECL_temporary(t->ureg);
4211       return ureg_src(t->temps[index]);
4212
4213    case PROGRAM_NAMED_PARAM:
4214    case PROGRAM_ENV_PARAM:
4215    case PROGRAM_LOCAL_PARAM:
4216    case PROGRAM_UNIFORM:
4217       assert(index >= 0);
4218       return t->constants[index];
4219    case PROGRAM_STATE_VAR:
4220    case PROGRAM_CONSTANT:       /* ie, immediate */
4221       if (index < 0)
4222          return ureg_DECL_constant(t->ureg, 0);
4223       else
4224          return t->constants[index];
4225
4226    case PROGRAM_IMMEDIATE:
4227       return t->immediates[index];
4228
4229    case PROGRAM_INPUT:
4230       assert(t->inputMapping[index] < Elements(t->inputs));
4231       return t->inputs[t->inputMapping[index]];
4232
4233    case PROGRAM_OUTPUT:
4234       assert(t->outputMapping[index] < Elements(t->outputs));
4235       return ureg_src(t->outputs[t->outputMapping[index]]); /* not needed? */
4236
4237    case PROGRAM_ADDRESS:
4238       return ureg_src(t->address[index]);
4239
4240    case PROGRAM_SYSTEM_VALUE:
4241       assert(index < Elements(t->systemValues));
4242       return t->systemValues[index];
4243
4244    default:
4245       assert(!"unknown src register file");
4246       return ureg_src_undef();
4247    }
4248 }
4249
4250 /**
4251  * Create a TGSI ureg_dst register from an st_dst_reg.
4252  */
4253 static struct ureg_dst
4254 translate_dst(struct st_translate *t,
4255               const st_dst_reg *dst_reg,
4256               bool saturate)
4257 {
4258    struct ureg_dst dst = dst_register(t,
4259                                       dst_reg->file,
4260                                       dst_reg->index);
4261
4262    dst = ureg_writemask(dst, dst_reg->writemask);
4263
4264    if (saturate)
4265       dst = ureg_saturate(dst);
4266
4267    if (dst_reg->reladdr != NULL)
4268       dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
4269
4270    return dst;
4271 }
4272
4273 /**
4274  * Create a TGSI ureg_src register from an st_src_reg.
4275  */
4276 static struct ureg_src
4277 translate_src(struct st_translate *t, const st_src_reg *src_reg)
4278 {
4279    struct ureg_src src = src_register(t, src_reg->file, src_reg->index);
4280
4281    src = ureg_swizzle(src,
4282                       GET_SWZ(src_reg->swizzle, 0) & 0x3,
4283                       GET_SWZ(src_reg->swizzle, 1) & 0x3,
4284                       GET_SWZ(src_reg->swizzle, 2) & 0x3,
4285                       GET_SWZ(src_reg->swizzle, 3) & 0x3);
4286
4287    if ((src_reg->negate & 0xf) == NEGATE_XYZW)
4288       src = ureg_negate(src);
4289
4290    if (src_reg->reladdr != NULL) {
4291       /* Normally ureg_src_indirect() would be used here, but a stupid compiler
4292        * bug in g++ makes ureg_src_indirect (an inline C function) erroneously
4293        * set the bit for src.Negate.  So we have to do the operation manually
4294        * here to work around the compiler's problems. */
4295       /*src = ureg_src_indirect(src, ureg_src(t->address[0]));*/
4296       struct ureg_src addr = ureg_src(t->address[0]);
4297       src.Indirect = 1;
4298       src.IndirectFile = addr.File;
4299       src.IndirectIndex = addr.Index;
4300       src.IndirectSwizzle = addr.SwizzleX;
4301
4302       if (src_reg->file != PROGRAM_INPUT &&
4303           src_reg->file != PROGRAM_OUTPUT) {
4304          /* If src_reg->index was negative, it was set to zero in
4305           * src_register().  Reassign it now.  But don't do this
4306           * for input/output regs since they get remapped while
4307           * const buffers don't.
4308           */
4309          src.Index = src_reg->index;
4310       }
4311    }
4312
4313    return src;
4314 }
4315
4316 static struct tgsi_texture_offset
4317 translate_tex_offset(struct st_translate *t,
4318                      const struct tgsi_texture_offset *in_offset)
4319 {
4320    struct tgsi_texture_offset offset;
4321
4322    assert(in_offset->File == PROGRAM_IMMEDIATE);
4323
4324    offset.File = TGSI_FILE_IMMEDIATE;
4325    offset.Index = in_offset->Index;
4326    offset.SwizzleX = in_offset->SwizzleX;
4327    offset.SwizzleY = in_offset->SwizzleY;
4328    offset.SwizzleZ = in_offset->SwizzleZ;
4329
4330    return offset;
4331 }
4332
4333 static void
4334 compile_tgsi_instruction(struct st_translate *t,
4335                          const glsl_to_tgsi_instruction *inst)
4336 {
4337    struct ureg_program *ureg = t->ureg;
4338    GLuint i;
4339    struct ureg_dst dst[1];
4340    struct ureg_src src[4];
4341    struct tgsi_texture_offset texoffsets[MAX_GLSL_TEXTURE_OFFSET];
4342
4343    unsigned num_dst;
4344    unsigned num_src;
4345
4346    num_dst = num_inst_dst_regs(inst->op);
4347    num_src = num_inst_src_regs(inst->op);
4348
4349    if (num_dst)
4350       dst[0] = translate_dst(t,
4351                              &inst->dst,
4352                              inst->saturate);
4353
4354    for (i = 0; i < num_src; i++)
4355       src[i] = translate_src(t, &inst->src[i]);
4356
4357    switch(inst->op) {
4358    case TGSI_OPCODE_BGNLOOP:
4359    case TGSI_OPCODE_CAL:
4360    case TGSI_OPCODE_ELSE:
4361    case TGSI_OPCODE_ENDLOOP:
4362    case TGSI_OPCODE_IF:
4363       assert(num_dst == 0);
4364       ureg_label_insn(ureg,
4365                       inst->op,
4366                       src, num_src,
4367                       get_label(t,
4368                                 inst->op == TGSI_OPCODE_CAL ? inst->function->sig_id : 0));
4369       return;
4370
4371    case TGSI_OPCODE_TEX:
4372    case TGSI_OPCODE_TXB:
4373    case TGSI_OPCODE_TXD:
4374    case TGSI_OPCODE_TXL:
4375    case TGSI_OPCODE_TXP:
4376    case TGSI_OPCODE_TXQ:
4377    case TGSI_OPCODE_TXF:
4378       src[num_src++] = t->samplers[inst->sampler];
4379       for (i = 0; i < inst->tex_offset_num_offset; i++) {
4380          texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
4381       }
4382       ureg_tex_insn(ureg,
4383                     inst->op,
4384                     dst, num_dst,
4385                     translate_texture_target(inst->tex_target, inst->tex_shadow),
4386                     texoffsets, inst->tex_offset_num_offset,
4387                     src, num_src);
4388       return;
4389
4390    case TGSI_OPCODE_SCS:
4391       dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XY);
4392       ureg_insn(ureg, inst->op, dst, num_dst, src, num_src);
4393       break;
4394
4395    default:
4396       ureg_insn(ureg,
4397                 inst->op,
4398                 dst, num_dst,
4399                 src, num_src);
4400       break;
4401    }
4402 }
4403
4404 /**
4405  * Emit the TGSI instructions to adjust the WPOS pixel center convention
4406  * Basically, add (adjX, adjY) to the fragment position.
4407  */
4408 static void
4409 emit_adjusted_wpos(struct st_translate *t,
4410                    const struct gl_program *program,
4411                    float adjX, float adjY)
4412 {
4413    struct ureg_program *ureg = t->ureg;
4414    struct ureg_dst wpos_temp = ureg_DECL_temporary(ureg);
4415    struct ureg_src wpos_input = t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]];
4416
4417    /* Note that we bias X and Y and pass Z and W through unchanged.
4418     * The shader might also use gl_FragCoord.w and .z.
4419     */
4420    ureg_ADD(ureg, wpos_temp, wpos_input,
4421             ureg_imm4f(ureg, adjX, adjY, 0.0f, 0.0f));
4422
4423    t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]] = ureg_src(wpos_temp);
4424 }
4425
4426
4427 /**
4428  * Emit the TGSI instructions for inverting the WPOS y coordinate.
4429  * This code is unavoidable because it also depends on whether
4430  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
4431  */
4432 static void
4433 emit_wpos_inversion(struct st_translate *t,
4434                     const struct gl_program *program,
4435                     bool invert)
4436 {
4437    struct ureg_program *ureg = t->ureg;
4438
4439    /* Fragment program uses fragment position input.
4440     * Need to replace instances of INPUT[WPOS] with temp T
4441     * where T = INPUT[WPOS] by y is inverted.
4442     */
4443    static const gl_state_index wposTransformState[STATE_LENGTH]
4444       = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM,
4445           (gl_state_index)0, (gl_state_index)0, (gl_state_index)0 };
4446
4447    /* XXX: note we are modifying the incoming shader here!  Need to
4448     * do this before emitting the constant decls below, or this
4449     * will be missed:
4450     */
4451    unsigned wposTransConst = _mesa_add_state_reference(program->Parameters,
4452                                                        wposTransformState);
4453
4454    struct ureg_src wpostrans = ureg_DECL_constant(ureg, wposTransConst);
4455    struct ureg_dst wpos_temp;
4456    struct ureg_src wpos_input = t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]];
4457
4458    /* MOV wpos_temp, input[wpos]
4459     */
4460    if (wpos_input.File == TGSI_FILE_TEMPORARY)
4461       wpos_temp = ureg_dst(wpos_input);
4462    else {
4463       wpos_temp = ureg_DECL_temporary(ureg);
4464       ureg_MOV(ureg, wpos_temp, wpos_input);
4465    }
4466
4467    if (invert) {
4468       /* MAD wpos_temp.y, wpos_input, wpostrans.xxxx, wpostrans.yyyy
4469        */
4470       ureg_MAD(ureg,
4471                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y),
4472                wpos_input,
4473                ureg_scalar(wpostrans, 0),
4474                ureg_scalar(wpostrans, 1));
4475    } else {
4476       /* MAD wpos_temp.y, wpos_input, wpostrans.zzzz, wpostrans.wwww
4477        */
4478       ureg_MAD(ureg,
4479                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y),
4480                wpos_input,
4481                ureg_scalar(wpostrans, 2),
4482                ureg_scalar(wpostrans, 3));
4483    }
4484
4485    /* Use wpos_temp as position input from here on:
4486     */
4487    t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]] = ureg_src(wpos_temp);
4488 }
4489
4490
4491 /**
4492  * Emit fragment position/ooordinate code.
4493  */
4494 static void
4495 emit_wpos(struct st_context *st,
4496           struct st_translate *t,
4497           const struct gl_program *program,
4498           struct ureg_program *ureg)
4499 {
4500    const struct gl_fragment_program *fp =
4501       (const struct gl_fragment_program *) program;
4502    struct pipe_screen *pscreen = st->pipe->screen;
4503    boolean invert = FALSE;
4504
4505    if (fp->OriginUpperLeft) {
4506       /* Fragment shader wants origin in upper-left */
4507       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT)) {
4508          /* the driver supports upper-left origin */
4509       }
4510       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT)) {
4511          /* the driver supports lower-left origin, need to invert Y */
4512          ureg_property_fs_coord_origin(ureg, TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
4513          invert = TRUE;
4514       }
4515       else
4516          assert(0);
4517    }
4518    else {
4519       /* Fragment shader wants origin in lower-left */
4520       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT))
4521          /* the driver supports lower-left origin */
4522          ureg_property_fs_coord_origin(ureg, TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
4523       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT))
4524          /* the driver supports upper-left origin, need to invert Y */
4525          invert = TRUE;
4526       else
4527          assert(0);
4528    }
4529
4530    if (fp->PixelCenterInteger) {
4531       /* Fragment shader wants pixel center integer */
4532       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER))
4533          /* the driver supports pixel center integer */
4534          ureg_property_fs_coord_pixel_center(ureg, TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
4535       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER))
4536          /* the driver supports pixel center half integer, need to bias X,Y */
4537          emit_adjusted_wpos(t, program, 0.5f, invert ? 0.5f : -0.5f);
4538       else
4539          assert(0);
4540    }
4541    else {
4542       /* Fragment shader wants pixel center half integer */
4543       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
4544          /* the driver supports pixel center half integer */
4545       }
4546       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
4547          /* the driver supports pixel center integer, need to bias X,Y */
4548          ureg_property_fs_coord_pixel_center(ureg, TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
4549          emit_adjusted_wpos(t, program, 0.5f, invert ? -0.5f : 0.5f);
4550       }
4551       else
4552          assert(0);
4553    }
4554
4555    /* we invert after adjustment so that we avoid the MOV to temporary,
4556     * and reuse the adjustment ADD instead */
4557    emit_wpos_inversion(t, program, invert);
4558 }
4559
4560 /**
4561  * OpenGL's fragment gl_FrontFace input is 1 for front-facing, 0 for back.
4562  * TGSI uses +1 for front, -1 for back.
4563  * This function converts the TGSI value to the GL value.  Simply clamping/
4564  * saturating the value to [0,1] does the job.
4565  */
4566 static void
4567 emit_face_var(struct st_translate *t)
4568 {
4569    struct ureg_program *ureg = t->ureg;
4570    struct ureg_dst face_temp = ureg_DECL_temporary(ureg);
4571    struct ureg_src face_input = t->inputs[t->inputMapping[FRAG_ATTRIB_FACE]];
4572
4573    /* MOV_SAT face_temp, input[face] */
4574    face_temp = ureg_saturate(face_temp);
4575    ureg_MOV(ureg, face_temp, face_input);
4576
4577    /* Use face_temp as face input from here on: */
4578    t->inputs[t->inputMapping[FRAG_ATTRIB_FACE]] = ureg_src(face_temp);
4579 }
4580
4581 static void
4582 emit_edgeflags(struct st_translate *t)
4583 {
4584    struct ureg_program *ureg = t->ureg;
4585    struct ureg_dst edge_dst = t->outputs[t->outputMapping[VERT_RESULT_EDGE]];
4586    struct ureg_src edge_src = t->inputs[t->inputMapping[VERT_ATTRIB_EDGEFLAG]];
4587
4588    ureg_MOV(ureg, edge_dst, edge_src);
4589 }
4590
4591 /**
4592  * Translate intermediate IR (glsl_to_tgsi_instruction) to TGSI format.
4593  * \param program  the program to translate
4594  * \param numInputs  number of input registers used
4595  * \param inputMapping  maps Mesa fragment program inputs to TGSI generic
4596  *                      input indexes
4597  * \param inputSemanticName  the TGSI_SEMANTIC flag for each input
4598  * \param inputSemanticIndex  the semantic index (ex: which texcoord) for
4599  *                            each input
4600  * \param interpMode  the TGSI_INTERPOLATE_LINEAR/PERSP mode for each input
4601  * \param numOutputs  number of output registers used
4602  * \param outputMapping  maps Mesa fragment program outputs to TGSI
4603  *                       generic outputs
4604  * \param outputSemanticName  the TGSI_SEMANTIC flag for each output
4605  * \param outputSemanticIndex  the semantic index (ex: which texcoord) for
4606  *                             each output
4607  *
4608  * \return  PIPE_OK or PIPE_ERROR_OUT_OF_MEMORY
4609  */
4610 extern "C" enum pipe_error
4611 st_translate_program(
4612    struct gl_context *ctx,
4613    uint procType,
4614    struct ureg_program *ureg,
4615    glsl_to_tgsi_visitor *program,
4616    const struct gl_program *proginfo,
4617    GLuint numInputs,
4618    const GLuint inputMapping[],
4619    const ubyte inputSemanticName[],
4620    const ubyte inputSemanticIndex[],
4621    const GLuint interpMode[],
4622    GLuint numOutputs,
4623    const GLuint outputMapping[],
4624    const ubyte outputSemanticName[],
4625    const ubyte outputSemanticIndex[],
4626    boolean passthrough_edgeflags)
4627 {
4628    struct st_translate translate, *t;
4629    unsigned i;
4630    enum pipe_error ret = PIPE_OK;
4631
4632    assert(numInputs <= Elements(t->inputs));
4633    assert(numOutputs <= Elements(t->outputs));
4634
4635    t = &translate;
4636    memset(t, 0, sizeof *t);
4637
4638    t->procType = procType;
4639    t->inputMapping = inputMapping;
4640    t->outputMapping = outputMapping;
4641    t->ureg = ureg;
4642    t->pointSizeOutIndex = -1;
4643    t->prevInstWrotePointSize = GL_FALSE;
4644
4645    /*
4646     * Declare input attributes.
4647     */
4648    if (procType == TGSI_PROCESSOR_FRAGMENT) {
4649       for (i = 0; i < numInputs; i++) {
4650          t->inputs[i] = ureg_DECL_fs_input(ureg,
4651                                            inputSemanticName[i],
4652                                            inputSemanticIndex[i],
4653                                            interpMode[i]);
4654       }
4655
4656       if (proginfo->InputsRead & FRAG_BIT_WPOS) {
4657          /* Must do this after setting up t->inputs, and before
4658           * emitting constant references, below:
4659           */
4660           emit_wpos(st_context(ctx), t, proginfo, ureg);
4661       }
4662
4663       if (proginfo->InputsRead & FRAG_BIT_FACE)
4664          emit_face_var(t);
4665
4666       /*
4667        * Declare output attributes.
4668        */
4669       for (i = 0; i < numOutputs; i++) {
4670          switch (outputSemanticName[i]) {
4671          case TGSI_SEMANTIC_POSITION:
4672             t->outputs[i] = ureg_DECL_output(ureg,
4673                                              TGSI_SEMANTIC_POSITION, /* Z/Depth */
4674                                              outputSemanticIndex[i]);
4675             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Z);
4676             break;
4677          case TGSI_SEMANTIC_STENCIL:
4678             t->outputs[i] = ureg_DECL_output(ureg,
4679                                              TGSI_SEMANTIC_STENCIL, /* Stencil */
4680                                              outputSemanticIndex[i]);
4681             t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_Y);
4682             break;
4683          case TGSI_SEMANTIC_COLOR:
4684             t->outputs[i] = ureg_DECL_output(ureg,
4685                                              TGSI_SEMANTIC_COLOR,
4686                                              outputSemanticIndex[i]);
4687             break;
4688          default:
4689             assert(!"fragment shader outputs must be POSITION/STENCIL/COLOR");
4690             return PIPE_ERROR_BAD_INPUT;
4691          }
4692       }
4693    }
4694    else if (procType == TGSI_PROCESSOR_GEOMETRY) {
4695       for (i = 0; i < numInputs; i++) {
4696          t->inputs[i] = ureg_DECL_gs_input(ureg,
4697                                            i,
4698                                            inputSemanticName[i],
4699                                            inputSemanticIndex[i]);
4700       }
4701
4702       for (i = 0; i < numOutputs; i++) {
4703          t->outputs[i] = ureg_DECL_output(ureg,
4704                                           outputSemanticName[i],
4705                                           outputSemanticIndex[i]);
4706       }
4707    }
4708    else {
4709       assert(procType == TGSI_PROCESSOR_VERTEX);
4710
4711       for (i = 0; i < numInputs; i++) {
4712          t->inputs[i] = ureg_DECL_vs_input(ureg, i);
4713       }
4714
4715       for (i = 0; i < numOutputs; i++) {
4716          t->outputs[i] = ureg_DECL_output(ureg,
4717                                           outputSemanticName[i],
4718                                           outputSemanticIndex[i]);
4719          if ((outputSemanticName[i] == TGSI_SEMANTIC_PSIZE) && proginfo->Id) {
4720             /* Writing to the point size result register requires special
4721              * handling to implement clamping.
4722              */
4723             static const gl_state_index pointSizeClampState[STATE_LENGTH]
4724                = { STATE_INTERNAL, STATE_POINT_SIZE_IMPL_CLAMP, (gl_state_index)0, (gl_state_index)0, (gl_state_index)0 };
4725                /* XXX: note we are modifying the incoming shader here!  Need to
4726                * do this before emitting the constant decls below, or this
4727                * will be missed.
4728                */
4729             unsigned pointSizeClampConst =
4730                _mesa_add_state_reference(proginfo->Parameters,
4731                                          pointSizeClampState);
4732             struct ureg_dst psizregtemp = ureg_DECL_temporary(ureg);
4733             t->pointSizeConst = ureg_DECL_constant(ureg, pointSizeClampConst);
4734             t->pointSizeResult = t->outputs[i];
4735             t->pointSizeOutIndex = i;
4736             t->outputs[i] = psizregtemp;
4737          }
4738       }
4739       if (passthrough_edgeflags)
4740          emit_edgeflags(t);
4741    }
4742
4743    /* Declare address register.
4744     */
4745    if (program->num_address_regs > 0) {
4746       assert(program->num_address_regs == 1);
4747       t->address[0] = ureg_DECL_address(ureg);
4748    }
4749
4750    /* Declare misc input registers
4751     */
4752    {
4753       GLbitfield sysInputs = proginfo->SystemValuesRead;
4754       unsigned numSys = 0;
4755       for (i = 0; sysInputs; i++) {
4756          if (sysInputs & (1 << i)) {
4757             unsigned semName = mesa_sysval_to_semantic[i];
4758             t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
4759             numSys++;
4760             sysInputs &= ~(1 << i);
4761          }
4762       }
4763    }
4764
4765    if (program->indirect_addr_temps) {
4766       /* If temps are accessed with indirect addressing, declare temporaries
4767        * in sequential order.  Else, we declare them on demand elsewhere.
4768        * (Note: the number of temporaries is equal to program->next_temp)
4769        */
4770       for (i = 0; i < (unsigned)program->next_temp; i++) {
4771          /* XXX use TGSI_FILE_TEMPORARY_ARRAY when it's supported by ureg */
4772          t->temps[i] = ureg_DECL_temporary(t->ureg);
4773       }
4774    }
4775
4776    /* Emit constants and uniforms.  TGSI uses a single index space for these,
4777     * so we put all the translated regs in t->constants.
4778     */
4779    if (proginfo->Parameters) {
4780       t->constants = (struct ureg_src *)CALLOC(proginfo->Parameters->NumParameters * sizeof(t->constants[0]));
4781       if (t->constants == NULL) {
4782          ret = PIPE_ERROR_OUT_OF_MEMORY;
4783          goto out;
4784       }
4785
4786       for (i = 0; i < proginfo->Parameters->NumParameters; i++) {
4787          switch (proginfo->Parameters->Parameters[i].Type) {
4788          case PROGRAM_ENV_PARAM:
4789          case PROGRAM_LOCAL_PARAM:
4790          case PROGRAM_STATE_VAR:
4791          case PROGRAM_NAMED_PARAM:
4792          case PROGRAM_UNIFORM:
4793             t->constants[i] = ureg_DECL_constant(ureg, i);
4794             break;
4795
4796          /* Emit immediates for PROGRAM_CONSTANT only when there's no indirect
4797           * addressing of the const buffer.
4798           * FIXME: Be smarter and recognize param arrays:
4799           * indirect addressing is only valid within the referenced
4800           * array.
4801           */
4802          case PROGRAM_CONSTANT:
4803             if (program->indirect_addr_consts)
4804                t->constants[i] = ureg_DECL_constant(ureg, i);
4805             else
4806                t->constants[i] = emit_immediate(t,
4807                                                 proginfo->Parameters->ParameterValues[i],
4808                                                 proginfo->Parameters->Parameters[i].DataType,
4809                                                 4);
4810             break;
4811          default:
4812             break;
4813          }
4814       }
4815    }
4816
4817    /* Emit immediate values.
4818     */
4819    t->immediates = (struct ureg_src *)CALLOC(program->num_immediates * sizeof(struct ureg_src));
4820    if (t->immediates == NULL) {
4821       ret = PIPE_ERROR_OUT_OF_MEMORY;
4822       goto out;
4823    }
4824    i = 0;
4825    foreach_iter(exec_list_iterator, iter, program->immediates) {
4826       immediate_storage *imm = (immediate_storage *)iter.get();
4827       t->immediates[i++] = emit_immediate(t, imm->values, imm->type, imm->size);
4828    }
4829
4830    /* texture samplers */
4831    for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
4832       if (program->samplers_used & (1 << i)) {
4833          t->samplers[i] = ureg_DECL_sampler(ureg, i);
4834       }
4835    }
4836
4837    /* Emit each instruction in turn:
4838     */
4839    foreach_iter(exec_list_iterator, iter, program->instructions) {
4840       set_insn_start(t, ureg_get_instruction_number(ureg));
4841       compile_tgsi_instruction(t, (glsl_to_tgsi_instruction *)iter.get());
4842
4843       if (t->prevInstWrotePointSize && proginfo->Id) {
4844          /* The previous instruction wrote to the (fake) vertex point size
4845           * result register.  Now we need to clamp that value to the min/max
4846           * point size range, putting the result into the real point size
4847           * register.
4848           * Note that we can't do this easily at the end of program due to
4849           * possible early return.
4850           */
4851          set_insn_start(t, ureg_get_instruction_number(ureg));
4852          ureg_MAX(t->ureg,
4853                   ureg_writemask(t->outputs[t->pointSizeOutIndex], WRITEMASK_X),
4854                   ureg_src(t->outputs[t->pointSizeOutIndex]),
4855                   ureg_swizzle(t->pointSizeConst, 1,1,1,1));
4856          ureg_MIN(t->ureg, ureg_writemask(t->pointSizeResult, WRITEMASK_X),
4857                   ureg_src(t->outputs[t->pointSizeOutIndex]),
4858                   ureg_swizzle(t->pointSizeConst, 2,2,2,2));
4859       }
4860       t->prevInstWrotePointSize = GL_FALSE;
4861    }
4862
4863    /* Fix up all emitted labels:
4864     */
4865    for (i = 0; i < t->labels_count; i++) {
4866       ureg_fixup_label(ureg, t->labels[i].token,
4867                        t->insn[t->labels[i].branch_target]);
4868    }
4869
4870 out:
4871    FREE(t->insn);
4872    FREE(t->labels);
4873    FREE(t->constants);
4874    FREE(t->immediates);
4875
4876    if (t->error) {
4877       debug_printf("%s: translate error flag set\n", __FUNCTION__);
4878    }
4879
4880    return ret;
4881 }
4882 /* ----------------------------- End TGSI code ------------------------------ */
4883
4884 /**
4885  * Convert a shader's GLSL IR into a Mesa gl_program, although without
4886  * generating Mesa IR.
4887  */
4888 static struct gl_program *
4889 get_mesa_program(struct gl_context *ctx,
4890                  struct gl_shader_program *shader_program,
4891                  struct gl_shader *shader)
4892 {
4893    glsl_to_tgsi_visitor* v = new glsl_to_tgsi_visitor();
4894    struct gl_program *prog;
4895    GLenum target;
4896    const char *target_string;
4897    bool progress;
4898    struct gl_shader_compiler_options *options =
4899          &ctx->ShaderCompilerOptions[_mesa_shader_type_to_index(shader->Type)];
4900
4901    switch (shader->Type) {
4902    case GL_VERTEX_SHADER:
4903       target = GL_VERTEX_PROGRAM_ARB;
4904       target_string = "vertex";
4905       break;
4906    case GL_FRAGMENT_SHADER:
4907       target = GL_FRAGMENT_PROGRAM_ARB;
4908       target_string = "fragment";
4909       break;
4910    case GL_GEOMETRY_SHADER:
4911       target = GL_GEOMETRY_PROGRAM_NV;
4912       target_string = "geometry";
4913       break;
4914    default:
4915       assert(!"should not be reached");
4916       return NULL;
4917    }
4918
4919    validate_ir_tree(shader->ir);
4920
4921    prog = ctx->Driver.NewProgram(ctx, target, shader_program->Name);
4922    if (!prog)
4923       return NULL;
4924    prog->Parameters = _mesa_new_parameter_list();
4925    prog->Varying = _mesa_new_parameter_list();
4926    prog->Attributes = _mesa_new_parameter_list();
4927    v->ctx = ctx;
4928    v->prog = prog;
4929    v->shader_program = shader_program;
4930    v->options = options;
4931    v->glsl_version = ctx->Const.GLSLVersion;
4932    v->native_integers = ctx->Const.NativeIntegers;
4933
4934    add_uniforms_to_parameters_list(shader_program, shader, prog);
4935
4936    /* Emit intermediate IR for main(). */
4937    visit_exec_list(shader->ir, v);
4938
4939    /* Now emit bodies for any functions that were used. */
4940    do {
4941       progress = GL_FALSE;
4942
4943       foreach_iter(exec_list_iterator, iter, v->function_signatures) {
4944          function_entry *entry = (function_entry *)iter.get();
4945
4946          if (!entry->bgn_inst) {
4947             v->current_function = entry;
4948
4949             entry->bgn_inst = v->emit(NULL, TGSI_OPCODE_BGNSUB);
4950             entry->bgn_inst->function = entry;
4951
4952             visit_exec_list(&entry->sig->body, v);
4953
4954             glsl_to_tgsi_instruction *last;
4955             last = (glsl_to_tgsi_instruction *)v->instructions.get_tail();
4956             if (last->op != TGSI_OPCODE_RET)
4957                v->emit(NULL, TGSI_OPCODE_RET);
4958
4959             glsl_to_tgsi_instruction *end;
4960             end = v->emit(NULL, TGSI_OPCODE_ENDSUB);
4961             end->function = entry;
4962
4963             progress = GL_TRUE;
4964          }
4965       }
4966    } while (progress);
4967
4968 #if 0
4969    /* Print out some information (for debugging purposes) used by the
4970     * optimization passes. */
4971    for (i=0; i < v->next_temp; i++) {
4972       int fr = v->get_first_temp_read(i);
4973       int fw = v->get_first_temp_write(i);
4974       int lr = v->get_last_temp_read(i);
4975       int lw = v->get_last_temp_write(i);
4976
4977       printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, fr, fw, lr, lw);
4978       assert(fw <= fr);
4979    }
4980 #endif
4981
4982    /* Remove reads to output registers, and to varyings in vertex shaders. */
4983    v->remove_output_reads(PROGRAM_OUTPUT);
4984    if (target == GL_VERTEX_PROGRAM_ARB)
4985       v->remove_output_reads(PROGRAM_VARYING);
4986
4987    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
4988    v->simplify_cmp();
4989    v->copy_propagate();
4990    while (v->eliminate_dead_code_advanced());
4991
4992    /* FIXME: These passes to optimize temporary registers don't work when there
4993     * is indirect addressing of the temporary register space.  We need proper
4994     * array support so that we don't have to give up these passes in every
4995     * shader that uses arrays.
4996     */
4997    if (!v->indirect_addr_temps) {
4998       v->eliminate_dead_code();
4999       v->merge_registers();
5000       v->renumber_registers();
5001    }
5002
5003    /* Write the END instruction. */
5004    v->emit(NULL, TGSI_OPCODE_END);
5005
5006    if (ctx->Shader.Flags & GLSL_DUMP) {
5007       printf("\n");
5008       printf("GLSL IR for linked %s program %d:\n", target_string,
5009              shader_program->Name);
5010       _mesa_print_ir(shader->ir, NULL);
5011       printf("\n");
5012       printf("\n");
5013    }
5014
5015    prog->Instructions = NULL;
5016    prog->NumInstructions = 0;
5017
5018    do_set_program_inouts(shader->ir, prog);
5019    count_resources(v, prog);
5020
5021    check_resources(ctx, shader_program, v, prog);
5022
5023    _mesa_reference_program(ctx, &shader->Program, prog);
5024
5025    struct st_vertex_program *stvp;
5026    struct st_fragment_program *stfp;
5027    struct st_geometry_program *stgp;
5028
5029    switch (shader->Type) {
5030    case GL_VERTEX_SHADER:
5031       stvp = (struct st_vertex_program *)prog;
5032       stvp->glsl_to_tgsi = v;
5033       break;
5034    case GL_FRAGMENT_SHADER:
5035       stfp = (struct st_fragment_program *)prog;
5036       stfp->glsl_to_tgsi = v;
5037       break;
5038    case GL_GEOMETRY_SHADER:
5039       stgp = (struct st_geometry_program *)prog;
5040       stgp->glsl_to_tgsi = v;
5041       break;
5042    default:
5043       assert(!"should not be reached");
5044       return NULL;
5045    }
5046
5047    return prog;
5048 }
5049
5050 extern "C" {
5051
5052 struct gl_shader *
5053 st_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
5054 {
5055    struct gl_shader *shader;
5056    assert(type == GL_FRAGMENT_SHADER || type == GL_VERTEX_SHADER ||
5057           type == GL_GEOMETRY_SHADER_ARB);
5058    shader = rzalloc(NULL, struct gl_shader);
5059    if (shader) {
5060       shader->Type = type;
5061       shader->Name = name;
5062       _mesa_init_shader(ctx, shader);
5063    }
5064    return shader;
5065 }
5066
5067 struct gl_shader_program *
5068 st_new_shader_program(struct gl_context *ctx, GLuint name)
5069 {
5070    struct gl_shader_program *shProg;
5071    shProg = rzalloc(NULL, struct gl_shader_program);
5072    if (shProg) {
5073       shProg->Name = name;
5074       _mesa_init_shader_program(ctx, shProg);
5075    }
5076    return shProg;
5077 }
5078
5079 /**
5080  * Link a shader.
5081  * Called via ctx->Driver.LinkShader()
5082  * This actually involves converting GLSL IR into an intermediate TGSI-like IR
5083  * with code lowering and other optimizations.
5084  */
5085 GLboolean
5086 st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
5087 {
5088    assert(prog->LinkStatus);
5089
5090    for (unsigned i = 0; i < MESA_SHADER_TYPES; i++) {
5091       if (prog->_LinkedShaders[i] == NULL)
5092          continue;
5093
5094       bool progress;
5095       exec_list *ir = prog->_LinkedShaders[i]->ir;
5096       const struct gl_shader_compiler_options *options =
5097             &ctx->ShaderCompilerOptions[_mesa_shader_type_to_index(prog->_LinkedShaders[i]->Type)];
5098
5099       do {
5100          progress = false;
5101
5102          /* Lowering */
5103          do_mat_op_to_vec(ir);
5104          lower_instructions(ir, (MOD_TO_FRACT | DIV_TO_MUL_RCP | EXP_TO_EXP2
5105                                  | LOG_TO_LOG2 | INT_DIV_TO_MUL_RCP
5106                                  | ((options->EmitNoPow) ? POW_TO_EXP2 : 0)));
5107
5108          progress = do_lower_jumps(ir, true, true, options->EmitNoMainReturn, options->EmitNoCont, options->EmitNoLoops) || progress;
5109
5110          progress = do_common_optimization(ir, true, options->MaxUnrollIterations) || progress;
5111
5112          progress = lower_quadop_vector(ir, false) || progress;
5113
5114          if (options->MaxIfDepth == 0)
5115             progress = lower_discard(ir) || progress;
5116
5117          progress = lower_if_to_cond_assign(ir, options->MaxIfDepth) || progress;
5118
5119          if (options->EmitNoNoise)
5120             progress = lower_noise(ir) || progress;
5121
5122          /* If there are forms of indirect addressing that the driver
5123           * cannot handle, perform the lowering pass.
5124           */
5125          if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput
5126              || options->EmitNoIndirectTemp || options->EmitNoIndirectUniform)
5127            progress =
5128              lower_variable_index_to_cond_assign(ir,
5129                                                  options->EmitNoIndirectInput,
5130                                                  options->EmitNoIndirectOutput,
5131                                                  options->EmitNoIndirectTemp,
5132                                                  options->EmitNoIndirectUniform)
5133              || progress;
5134
5135          progress = do_vec_index_to_cond_assign(ir) || progress;
5136       } while (progress);
5137
5138       validate_ir_tree(ir);
5139    }
5140
5141    for (unsigned i = 0; i < MESA_SHADER_TYPES; i++) {
5142       struct gl_program *linked_prog;
5143
5144       if (prog->_LinkedShaders[i] == NULL)
5145          continue;
5146
5147       linked_prog = get_mesa_program(ctx, prog, prog->_LinkedShaders[i]);
5148
5149       if (linked_prog) {
5150          bool ok = true;
5151
5152          switch (prog->_LinkedShaders[i]->Type) {
5153          case GL_VERTEX_SHADER:
5154             _mesa_reference_vertprog(ctx, &prog->VertexProgram,
5155                                      (struct gl_vertex_program *)linked_prog);
5156             ok = ctx->Driver.ProgramStringNotify(ctx, GL_VERTEX_PROGRAM_ARB,
5157                                                  linked_prog);
5158             if (!ok) {
5159                _mesa_reference_vertprog(ctx, &prog->VertexProgram, NULL);
5160             }
5161             break;
5162          case GL_FRAGMENT_SHADER:
5163             _mesa_reference_fragprog(ctx, &prog->FragmentProgram,
5164                                      (struct gl_fragment_program *)linked_prog);
5165             ok = ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_PROGRAM_ARB,
5166                                                  linked_prog);
5167             if (!ok) {
5168                _mesa_reference_fragprog(ctx, &prog->FragmentProgram, NULL);
5169             }
5170             break;
5171          case GL_GEOMETRY_SHADER:
5172             _mesa_reference_geomprog(ctx, &prog->GeometryProgram,
5173                                      (struct gl_geometry_program *)linked_prog);
5174             ok = ctx->Driver.ProgramStringNotify(ctx, GL_GEOMETRY_PROGRAM_NV,
5175                                                  linked_prog);
5176             if (!ok) {
5177                _mesa_reference_geomprog(ctx, &prog->GeometryProgram, NULL);
5178             }
5179             break;
5180          }
5181          if (!ok) {
5182             _mesa_reference_program(ctx, &prog->_LinkedShaders[i]->Program, NULL);
5183             _mesa_reference_program(ctx, &linked_prog, NULL);
5184             return GL_FALSE;
5185          }
5186       }
5187
5188       _mesa_reference_program(ctx, &linked_prog, NULL);
5189    }
5190
5191    return GL_TRUE;
5192 }
5193
5194 } /* extern "C" */