st/mesa: increase size of gl_register_file bitfields
[mesa.git] / src / mesa / state_tracker / st_glsl_to_tgsi.cpp
1 /*
2 * Copyright (C) 2005-2007 Brian Paul All Rights Reserved.
3 * Copyright (C) 2008 VMware, Inc. All Rights Reserved.
4 * Copyright © 2010 Intel Corporation
5 * Copyright © 2011 Bryan Cain
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 * DEALINGS IN THE SOFTWARE.
25 */
26
27 /**
28 * \file st_glsl_to_tgsi.cpp
29 *
30 * Translate GLSL IR to TGSI.
31 */
32
33 #include "st_glsl_to_tgsi.h"
34
35 #include "compiler/glsl/glsl_parser_extras.h"
36 #include "compiler/glsl/ir_optimization.h"
37 #include "compiler/glsl/program.h"
38
39 #include "main/errors.h"
40 #include "main/shaderobj.h"
41 #include "main/uniforms.h"
42 #include "main/shaderapi.h"
43 #include "main/shaderimage.h"
44 #include "program/prog_instruction.h"
45
46 #include "pipe/p_context.h"
47 #include "pipe/p_screen.h"
48 #include "tgsi/tgsi_ureg.h"
49 #include "tgsi/tgsi_info.h"
50 #include "util/u_math.h"
51 #include "util/u_memory.h"
52 #include "st_program.h"
53 #include "st_mesa_to_tgsi.h"
54 #include "st_format.h"
55 #include "st_glsl_to_tgsi_temprename.h"
56
57 #include "util/hash_table.h"
58 #include <algorithm>
59
60 #define PROGRAM_ANY_CONST ((1 << PROGRAM_STATE_VAR) | \
61 (1 << PROGRAM_CONSTANT) | \
62 (1 << PROGRAM_UNIFORM))
63
64 #define MAX_GLSL_TEXTURE_OFFSET 4
65
66 #ifndef NDEBUG
67 #include "util/u_atomic.h"
68 #include "util/simple_mtx.h"
69 #include <fstream>
70 #include <ios>
71
72 /* Prepare to make it possible to specify log file */
73 static std::ofstream stats_log;
74
75 /* Helper function to check whether we want to write some statistics
76 * of the shader conversion.
77 */
78
79 static simple_mtx_t print_stats_mutex = _SIMPLE_MTX_INITIALIZER_NP;
80
81 static inline bool print_stats_enabled ()
82 {
83 static int stats_enabled = 0;
84
85 if (!stats_enabled) {
86 simple_mtx_lock(&print_stats_mutex);
87 if (!stats_enabled) {
88 const char *stats_filename = getenv("GLSL_TO_TGSI_PRINT_STATS");
89 if (stats_filename) {
90 bool write_header = std::ifstream(stats_filename).fail();
91 stats_log.open(stats_filename, std::ios_base::out | std::ios_base::app);
92 stats_enabled = stats_log.good() ? 1 : -1;
93 if (write_header)
94 stats_log << "arrays,temps,temps in arrays,total,instructions\n";
95 } else {
96 stats_enabled = -1;
97 }
98 }
99 simple_mtx_unlock(&print_stats_mutex);
100 }
101 return stats_enabled > 0;
102 }
/* Evaluate X only when statistics output is enabled.
 *
 * The enable test sits inside the do/while so the expansion never leaves a
 * bare `if` behind that could capture an `else` written after the call
 * site.  The trailing semicolon is kept on purpose: call sites compile
 * whether or not they add their own `;`, exactly as before.
 */
#define PRINT_STATS(X) do { if (print_stats_enabled()) { X; } } while (false);
104 #else
105 #define PRINT_STATS(X)
106 #endif
107
108
109 static unsigned is_precise(const ir_variable *ir)
110 {
111 if (!ir)
112 return 0;
113 return ir->data.precise || ir->data.invariant;
114 }
115
116 class variable_storage {
117 DECLARE_RZALLOC_CXX_OPERATORS(variable_storage)
118
119 public:
120 variable_storage(ir_variable *var, gl_register_file file, int index,
121 unsigned array_id = 0)
122 : file(file), index(index), component(0), var(var), array_id(array_id)
123 {
124 assert(file != PROGRAM_ARRAY || array_id != 0);
125 }
126
127 gl_register_file file;
128 int index;
129
130 /* Explicit component location. This is given in terms of the GLSL-style
131 * swizzles where each double is a single component, i.e. for 64-bit types
132 * it can only be 0 or 1.
133 */
134 int component;
135 ir_variable *var; /* variable that maps to this, if any */
136 unsigned array_id;
137 };
138
139 class immediate_storage : public exec_node {
140 public:
141 immediate_storage(gl_constant_value *values, int size32, GLenum type)
142 {
143 memcpy(this->values, values, size32 * sizeof(gl_constant_value));
144 this->size32 = size32;
145 this->type = type;
146 }
147
148 /* doubles are stored across 2 gl_constant_values */
149 gl_constant_value values[4];
150 int size32; /**< Number of 32-bit components (1-4) */
151 GLenum type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
152 };
153
154 static const st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
155 static const st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
156
/* Description of one shader input or output declaration.  The visitor keeps
 * arrays of these (inputs[] / outputs[]), and find_inout_array() searches
 * them by array_id.
 */
157 struct inout_decl {
158 unsigned mesa_index;
159 unsigned array_id; /* TGSI ArrayID; 1-based: 0 means not an array */
160 unsigned size;
161 unsigned interp_loc;
162 unsigned gs_out_streams;
163 enum glsl_interp_mode interp;
/* Element base type; find_array_type() returns this for a matching array. */
164 enum glsl_base_type base_type;
165 ubyte usage_mask; /* GLSL-style usage-mask, i.e. single bit per double */
166 bool invariant;
167 };
168
169 static struct inout_decl *
170 find_inout_array(struct inout_decl *decls, unsigned count, unsigned array_id)
171 {
172 assert(array_id != 0);
173
174 for (unsigned i = 0; i < count; i++) {
175 struct inout_decl *decl = &decls[i];
176
177 if (array_id == decl->array_id) {
178 return decl;
179 }
180 }
181
182 return NULL;
183 }
184
185 static enum glsl_base_type
186 find_array_type(struct inout_decl *decls, unsigned count, unsigned array_id)
187 {
188 if (!array_id)
189 return GLSL_TYPE_ERROR;
190 struct inout_decl *decl = find_inout_array(decls, count, array_id);
191 if (decl)
192 return decl->base_type;
193 return GLSL_TYPE_ERROR;
194 }
195
/* Record for one hardware atomic counter declaration; the visitor stores
 * these in atomic_info[].
 */
196 struct hwatomic_decl {
197 unsigned location;
198 unsigned binding;
199 unsigned size;
200 unsigned array_id;
201 };
202
/**
 * IR visitor that translates GLSL IR into a list of
 * glsl_to_tgsi_instruction objects (see the file header: "Translate GLSL IR
 * to TGSI").  Instructions are appended to \c instructions by emit_asm();
 * immediates are collected in \c immediates by add_constant().
 */
203 struct glsl_to_tgsi_visitor : public ir_visitor {
204 public:
205 glsl_to_tgsi_visitor();
206 ~glsl_to_tgsi_visitor();
207
208 struct gl_context *ctx;
209 struct gl_program *prog;
210 struct gl_shader_program *shader_program;
211 struct gl_linked_shader *shader;
212 struct gl_shader_compiler_options *options;
213
214 int next_temp;
215
216 unsigned *array_sizes;
217 unsigned max_num_arrays;
218 unsigned next_array;
219
/* Input/output declarations; emit_asm() searches outputs[] by array id
 * through find_array_type().
 */
220 struct inout_decl inputs[4 * PIPE_MAX_SHADER_INPUTS];
221 unsigned num_inputs;
222 unsigned num_input_arrays;
223 struct inout_decl outputs[4 * PIPE_MAX_SHADER_OUTPUTS];
224 unsigned num_outputs;
225 unsigned num_output_arrays;
226
227 struct hwatomic_decl atomic_info[PIPE_MAX_HW_ATOMIC_BUFFERS];
228 unsigned num_atomics;
229 unsigned num_atomic_arrays;
/* Raised by emit_arl() to one past the highest address register written. */
230 int num_address_regs;
231 uint32_t samplers_used;
232 glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
233 enum tgsi_texture_type sampler_targets[PIPE_MAX_SAMPLERS];
234 int images_used;
235 enum tgsi_texture_type image_targets[PIPE_MAX_SHADER_IMAGES];
236 enum pipe_format image_formats[PIPE_MAX_SHADER_IMAGES];
237 bool image_wr[PIPE_MAX_SHADER_IMAGES];
/* Set by emit_asm() when a state-var/constant/uniform operand is addressed
 * indirectly.
 */
238 bool indirect_addr_consts;
239 int wpos_transform_const;
240
/* Consulted by get_opcode() and st_src_reg_for_type() to choose between
 * integer and float forms.
 */
241 bool native_integers;
242 bool have_sqrt;
243 bool have_fma;
244 bool use_shared_memory;
245 bool has_tex_txf_lz;
/* Copied into every instruction created by emit_asm(). */
246 bool precise;
/* When set, emit_arl() never skips the UARL for integer address sources. */
247 bool need_uarl;
248 bool tg4_component_in_swizzle;
249
250 variable_storage *find_variable_storage(ir_variable *var);
251
/* Adds a value to the parameter list (PROGRAM_CONSTANT) or to the immediate
 * list (PROGRAM_IMMEDIATE) and returns its index.
 */
252 int add_constant(gl_register_file file, gl_constant_value values[8],
253 int size, GLenum datatype, uint16_t *swizzle_out);
254
255 st_src_reg get_temp(const glsl_type *type);
256 void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);
257
/* Helpers that return a PROGRAM_IMMEDIATE source holding the given value. */
258 st_src_reg st_src_reg_for_double(double val);
259 st_src_reg st_src_reg_for_float(float val);
260 st_src_reg st_src_reg_for_int(int val);
261 st_src_reg st_src_reg_for_int64(int64_t val);
262 st_src_reg st_src_reg_for_type(enum glsl_base_type type, int val);
263
264 /**
265 * \name Visit methods
266 *
267 * As typical for the visitor pattern, there must be one \c visit method for
268 * each concrete subclass of \c ir_instruction. Virtual base classes within
269 * the hierarchy should not have \c visit methods.
270 */
271 /*@{*/
272 virtual void visit(ir_variable *);
273 virtual void visit(ir_loop *);
274 virtual void visit(ir_loop_jump *);
275 virtual void visit(ir_function_signature *);
276 virtual void visit(ir_function *);
277 virtual void visit(ir_expression *);
278 virtual void visit(ir_swizzle *);
279 virtual void visit(ir_dereference_variable *);
280 virtual void visit(ir_dereference_array *);
281 virtual void visit(ir_dereference_record *);
282 virtual void visit(ir_assignment *);
283 virtual void visit(ir_constant *);
284 virtual void visit(ir_call *);
285 virtual void visit(ir_return *);
286 virtual void visit(ir_discard *);
287 virtual void visit(ir_demote *);
288 virtual void visit(ir_texture *);
289 virtual void visit(ir_if *);
290 virtual void visit(ir_emit_vertex *);
291 virtual void visit(ir_end_primitive *);
292 virtual void visit(ir_barrier *);
293 /*@}*/
294
295 void ATTRIBUTE_NOINLINE visit_expression(ir_expression *, st_src_reg *);
296
297 void visit_atomic_counter_intrinsic(ir_call *);
298 void visit_ssbo_intrinsic(ir_call *);
299 void visit_membar_intrinsic(ir_call *);
300 void visit_shared_intrinsic(ir_call *);
301 void visit_image_intrinsic(ir_call *);
302 void visit_generic_intrinsic(ir_call *, enum tgsi_opcode op);
303
304 st_src_reg result;
305
306 /** List of variable_storage */
307 struct hash_table *variables;
308
309 /** List of immediate_storage */
310 exec_list immediates;
311 unsigned num_immediates;
312
313 /** List of glsl_to_tgsi_instruction */
314 exec_list instructions;
315
/* Single-destination form; forwards to the two-destination overload with
 * undef_dst as the second destination.
 */
316 glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, enum tgsi_opcode op,
317 st_dst_reg dst = undef_dst,
318 st_src_reg src0 = undef_src,
319 st_src_reg src1 = undef_src,
320 st_src_reg src2 = undef_src,
321 st_src_reg src3 = undef_src);
322
323 glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, enum tgsi_opcode op,
324 st_dst_reg dst, st_dst_reg dst1,
325 st_src_reg src0 = undef_src,
326 st_src_reg src1 = undef_src,
327 st_src_reg src2 = undef_src,
328 st_src_reg src3 = undef_src);
329
/* Maps a type-agnostic opcode to the variant matching the operand types. */
330 enum tgsi_opcode get_opcode(enum tgsi_opcode op,
331 st_dst_reg dst,
332 st_src_reg src0, st_src_reg src1);
333
334 /**
335 * Emit the correct dot-product instruction for the type of arguments
336 */
337 glsl_to_tgsi_instruction *emit_dp(ir_instruction *ir,
338 st_dst_reg dst,
339 st_src_reg src0,
340 st_src_reg src1,
341 unsigned elements);
342
343 void emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
344 st_dst_reg dst, st_src_reg src0);
345
346 void emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
347 st_dst_reg dst, st_src_reg src0, st_src_reg src1);
348
349 void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);
350
351 void get_deref_offsets(ir_dereference *ir,
352 unsigned *array_size,
353 unsigned *base,
354 uint16_t *index,
355 st_src_reg *reladdr,
356 bool opaque);
357 void calc_deref_offsets(ir_dereference *tail,
358 unsigned *array_elements,
359 uint16_t *index,
360 st_src_reg *indirect,
361 unsigned *location);
362 st_src_reg canonicalize_gather_offset(st_src_reg offset);
363 bool handle_bound_deref(ir_dereference *ir);
364
365 bool try_emit_mad(ir_expression *ir,
366 int mul_operand);
367 bool try_emit_mad_for_and_not(ir_expression *ir,
368 int mul_operand);
369
370 void emit_swz(ir_expression *ir);
371
372 bool process_move_condition(ir_rvalue *ir);
373
374 void simplify_cmp(void);
375
376 void rename_temp_registers(struct rename_reg_pair *renames);
377 void get_first_temp_read(int *first_reads);
378 void get_first_temp_write(int *first_writes);
379 void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
380 void get_last_temp_write(int *last_writes);
381
382 void copy_propagate(void);
383 int eliminate_dead_code(void);
384
385 void split_arrays(void);
386 void merge_two_dsts(void);
387 void merge_registers(void);
388 void renumber_registers(void);
389
390 void emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
391 st_dst_reg *l, st_src_reg *r,
392 st_src_reg *cond, bool cond_swap);
393
394 void print_stats();
395
/* Allocation context handed to placement new for instructions and
 * immediates.
 */
396 void *mem_ctx;
397 };
398
399 static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X,
400 GLSL_TYPE_FLOAT, 0);
401 static st_dst_reg address_reg2 = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X,
402 GLSL_TYPE_FLOAT, 1);
403 static st_dst_reg sampler_reladdr = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X,
404 GLSL_TYPE_FLOAT, 2);
405
406 static void
407 fail_link(struct gl_shader_program *prog, const char *fmt, ...)
408 PRINTFLIKE(2, 3);
409
410 static void
411 fail_link(struct gl_shader_program *prog, const char *fmt, ...)
412 {
413 va_list args;
414 va_start(args, fmt);
415 ralloc_vasprintf_append(&prog->data->InfoLog, fmt, args);
416 va_end(args);
417
418 prog->data->LinkStatus = LINKING_FAILURE;
419 }
420
421 int
422 swizzle_for_size(int size)
423 {
424 static const int size_swizzles[4] = {
425 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
426 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
427 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
428 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
429 };
430
431 assert((size >= 1) && (size <= 4));
432 return size_swizzles[size - 1];
433 }
434
435
/**
 * Create an instruction with up to two destinations and four sources and
 * append it to this->instructions.
 *
 * The opcode is first passed through get_opcode().  Relative-address
 * operands are handled before the instruction is filled in.  When either
 * destination or src0 is a 64-bit type, the instruction is split into one
 * copy per written component, with writemasks, swizzles and indices
 * rewritten as described in the comment inside the function.
 *
 * \return the instruction that was appended last (for a split 64-bit
 *         operation this is the final copy)
 */
436 glsl_to_tgsi_instruction *
437 glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, enum tgsi_opcode op,
438 st_dst_reg dst, st_dst_reg dst1,
439 st_src_reg src0, st_src_reg src1,
440 st_src_reg src2, st_src_reg src3)
441 {
442 glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
443 int num_reladdr = 0, i, j;
444 bool dst_is_64bit[2];
445
446 op = get_opcode(op, dst, src0, src1);
447
448 /* If we have to do relative addressing, we want to load the ARL
449 * reg directly for one of the regs, and preload the other reladdr
450 * sources into temps.
451 */
/* Each operand counts once, whether it uses reladdr, reladdr2 or both. */
452 num_reladdr += dst.reladdr != NULL || dst.reladdr2;
453 assert(!dst1.reladdr); /* should be lowered in earlier passes */
454 num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
455 num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
456 num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
457 num_reladdr += src3.reladdr != NULL || src3.reladdr2 != NULL;
458
459 reladdr_to_temp(ir, &src3, &num_reladdr);
460 reladdr_to_temp(ir, &src2, &num_reladdr);
461 reladdr_to_temp(ir, &src1, &num_reladdr);
462 reladdr_to_temp(ir, &src0, &num_reladdr);
463
464 if (dst.reladdr || dst.reladdr2) {
465 if (dst.reladdr)
466 emit_arl(ir, address_reg, *dst.reladdr);
467 if (dst.reladdr2)
468 emit_arl(ir, address_reg2, *dst.reladdr2);
469 num_reladdr--;
470 }
471
/* Every counted relative address must have been accounted for by now. */
472 assert(num_reladdr == 0);
473
474 /* inst->op has only 8 bits. */
475 STATIC_ASSERT(TGSI_OPCODE_LAST <= 255);
476
477 inst->op = op;
478 inst->precise = this->precise;
479 inst->info = tgsi_get_opcode_info(op);
480 inst->dst[0] = dst;
481 inst->dst[1] = dst1;
482 inst->src[0] = src0;
483 inst->src[1] = src1;
484 inst->src[2] = src2;
485 inst->src[3] = src3;
486 inst->is_64bit_expanded = false;
487 inst->ir = ir;
488 inst->dead_mask = 0;
489 inst->tex_offsets = NULL;
490 inst->tex_offset_num_offset = 0;
491 inst->saturate = 0;
492 inst->tex_shadow = 0;
493 /* default to float, for paths where this is not initialized
494 * (since 0==UINT which is likely wrong):
495 */
496 inst->tex_type = GLSL_TYPE_FLOAT;
497
498 /* Update indirect addressing status used by TGSI */
499 if (dst.reladdr || dst.reladdr2) {
500 switch (dst.file) {
501 case PROGRAM_STATE_VAR:
502 case PROGRAM_CONSTANT:
503 case PROGRAM_UNIFORM:
504 this->indirect_addr_consts = true;
505 break;
506 case PROGRAM_IMMEDIATE:
507 assert(!"immediates should not have indirect addressing");
508 break;
509 default:
510 break;
511 }
512 }
513 else {
/* Only reladdr (not reladdr2) of the sources is examined here. */
514 for (i = 0; i < 4; i++) {
515 if (inst->src[i].reladdr) {
516 switch (inst->src[i].file) {
517 case PROGRAM_STATE_VAR:
518 case PROGRAM_CONSTANT:
519 case PROGRAM_UNIFORM:
520 this->indirect_addr_consts = true;
521 break;
522 case PROGRAM_IMMEDIATE:
523 assert(!"immediates should not have indirect addressing");
524 break;
525 default:
526 break;
527 }
528 }
529 }
530 }
531
532 /*
533 * This section contains the double processing.
534 * GLSL just represents doubles as single channel values,
535 * however most HW and TGSI represent doubles as pairs of register channels.
536 *
537 * so we have to fixup destination writemask/index and src swizzle/indexes.
538 * dest writemasks need to translate from single channel write mask
539 * to a dual-channel writemask, but also need to modify the index,
540 * if we are touching the Z,W fields in the pre-translated writemask.
541 *
542 * src channels have similar index modifications along with swizzle
543 * changes so we pick the XY, ZW pairs from the correct index.
544 *
545 * GLSL [0].x -> TGSI [0].xy
546 * GLSL [0].y -> TGSI [0].zw
547 * GLSL [0].z -> TGSI [1].xy
548 * GLSL [0].w -> TGSI [1].zw
549 */
/* A PROGRAM_OUTPUT destination typed as an array takes its 64-bit-ness from
 * the declared base type of that output array.
 */
550 for (j = 0; j < 2; j++) {
551 dst_is_64bit[j] = glsl_base_type_is_64bit(inst->dst[j].type);
552 if (!dst_is_64bit[j] && inst->dst[j].file == PROGRAM_OUTPUT &&
553 inst->dst[j].type == GLSL_TYPE_ARRAY) {
554 enum glsl_base_type type = find_array_type(this->outputs,
555 this->num_outputs,
556 inst->dst[j].array_id);
557 if (glsl_base_type_is_64bit(type))
558 dst_is_64bit[j] = true;
559 }
560 }
561
562 if (dst_is_64bit[0] || dst_is_64bit[1] ||
563 glsl_base_type_is_64bit(inst->src[0].type)) {
564 glsl_to_tgsi_instruction *dinst = NULL;
565 int initial_src_swz[4], initial_src_idx[4];
566 int initial_dst_idx[2], initial_dst_writemask[2];
567 /* select the writemask for dst0 or dst1 */
568 unsigned writemask = inst->dst[1].file == PROGRAM_UNDEFINED
569 ? inst->dst[0].writemask : inst->dst[1].writemask;
570
571 /* copy out the writemask, index and swizzles for all src/dsts. */
572 for (j = 0; j < 2; j++) {
573 initial_dst_writemask[j] = inst->dst[j].writemask;
574 initial_dst_idx[j] = inst->dst[j].index;
575 }
576
577 for (j = 0; j < 4; j++) {
578 initial_src_swz[j] = inst->src[j].swizzle;
579 initial_src_idx[j] = inst->src[j].index;
580 }
581
582 /*
583 * scan all the components in the dst writemask
584 * generate an instruction for each of them if required.
585 */
/* addr is default-constructed; the PROGRAM_UNDEFINED test below relies on
 * that to compute the advanced address only once.
 */
586 st_src_reg addr;
587 while (writemask) {
588
/* NOTE: this i shadows the function-level i declared at the top. */
589 int i = u_bit_scan(&writemask);
590
591 /* before emitting the instruction, see if we have to adjust
592 * load / store address */
593 if (i > 1 && (inst->op == TGSI_OPCODE_LOAD ||
594 inst->op == TGSI_OPCODE_STORE) &&
595 addr.file == PROGRAM_UNDEFINED) {
596 /* We have to advance the buffer address by 16 */
597 addr = get_temp(glsl_type::uint_type);
598 emit_asm(ir, TGSI_OPCODE_UADD, st_dst_reg(addr),
599 inst->src[0], st_src_reg_for_int(16));
600 }
601
602 /* first time use previous instruction */
603 if (dinst == NULL) {
604 dinst = inst;
605 } else {
606 /* create a new instructions for subsequent attempts */
607 dinst = new(mem_ctx) glsl_to_tgsi_instruction();
608 *dinst = *inst;
/* The copy also copied the list links; clear them before push_tail(). */
609 dinst->next = NULL;
610 dinst->prev = NULL;
611 }
612 this->instructions.push_tail(dinst);
613 dinst->is_64bit_expanded = true;
614
615 /* modify the destination if we are splitting */
616 for (j = 0; j < 2; j++) {
617 if (dst_is_64bit[j]) {
618 dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
619 dinst->dst[j].index = initial_dst_idx[j];
620 if (i > 1) {
621 if (dinst->op == TGSI_OPCODE_LOAD ||
622 dinst->op == TGSI_OPCODE_STORE)
623 dinst->src[0] = addr;
624 if (dinst->op != TGSI_OPCODE_STORE)
625 dinst->dst[j].index++;
626 }
627 } else {
628 /* if we aren't writing to a double, just get the bit of the
629 * initial writemask for this channel
630 */
631 dinst->dst[j].writemask = initial_dst_writemask[j] & (1 << i);
632 }
633 }
634
635 /* modify the src registers */
636 for (j = 0; j < 4; j++) {
637 int swz = GET_SWZ(initial_src_swz[j], i);
638
639 if (glsl_base_type_is_64bit(dinst->src[j].type)) {
640 dinst->src[j].index = initial_src_idx[j];
/* Components 2 and 3 of a 64-bit source live in the next register. */
641 if (swz > 1) {
642 dinst->src[j].double_reg2 = true;
643 dinst->src[j].index++;
644 }
645
646 if (swz & 1)
647 dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W,
648 SWIZZLE_Z, SWIZZLE_W);
649 else
650 dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
651 SWIZZLE_X, SWIZZLE_Y);
652
653 } else {
654 /* some opcodes are special case in what they use as sources
655 * - [FUI]2D/[UI]2I64 is a float/[u]int src0, (D)LDEXP is
656 * integer src1
657 */
658 if (op == TGSI_OPCODE_F2D || op == TGSI_OPCODE_U2D ||
659 op == TGSI_OPCODE_I2D ||
660 op == TGSI_OPCODE_I2I64 || op == TGSI_OPCODE_U2I64 ||
661 op == TGSI_OPCODE_DLDEXP || op == TGSI_OPCODE_LDEXP ||
662 (op == TGSI_OPCODE_UCMP && dst_is_64bit[0])) {
663 dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
664 }
665 }
666 }
667 }
668 inst = dinst;
669 } else {
670 this->instructions.push_tail(inst);
671 }
672
673
674 return inst;
675 }
676
677 glsl_to_tgsi_instruction *
678 glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, enum tgsi_opcode op,
679 st_dst_reg dst,
680 st_src_reg src0, st_src_reg src1,
681 st_src_reg src2, st_src_reg src3)
682 {
683 return emit_asm(ir, op, dst, undef_dst, src0, src1, src2, src3);
684 }
685
686 /**
687 * Determines whether to use an integer, unsigned integer, float, double or
688 * 64-bit integer opcode based on the operands and input opcode, and returns
689 * the selected opcode.  Nothing is emitted here; emit_asm() calls this first.
 *
 * TGSI_OPCODE_MOV is returned unchanged.  For resource instructions the
 * type is taken from src1; otherwise it is derived from src0 and src1.
 * The \c dst argument is not read by this function.
 */
690 enum tgsi_opcode
691 glsl_to_tgsi_visitor::get_opcode(enum tgsi_opcode op,
692 st_dst_reg dst,
693 st_src_reg src0, st_src_reg src1)
694 {
695 enum glsl_base_type type = GLSL_TYPE_FLOAT;
696
697 if (op == TGSI_OPCODE_MOV)
698 return op;
699
700 assert(src0.type != GLSL_TYPE_ARRAY);
701 assert(src0.type != GLSL_TYPE_STRUCT);
702 assert(src1.type != GLSL_TYPE_ARRAY);
703 assert(src1.type != GLSL_TYPE_STRUCT);
704
/* Pick the operation type.  The order of the tests gives the priority:
 * int64, then uint64, double and float; with native integers the type of
 * src0 is used, with bool treated as int.
 */
705 if (is_resource_instruction(op))
706 type = src1.type;
707 else if (src0.type == GLSL_TYPE_INT64 || src1.type == GLSL_TYPE_INT64)
708 type = GLSL_TYPE_INT64;
709 else if (src0.type == GLSL_TYPE_UINT64 || src1.type == GLSL_TYPE_UINT64)
710 type = GLSL_TYPE_UINT64;
711 else if (src0.type == GLSL_TYPE_DOUBLE || src1.type == GLSL_TYPE_DOUBLE)
712 type = GLSL_TYPE_DOUBLE;
713 else if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
714 type = GLSL_TYPE_FLOAT;
715 else if (native_integers)
716 type = src0.type == GLSL_TYPE_BOOL ? GLSL_TYPE_INT : src0.type;
717
/* case7: one switch case that replaces opcode c with the variant for the
 * selected type; any type other than uint64/int64/double/int/uint selects
 * the float variant f.
 */
718 #define case7(c, f, i, u, d, i64, ui64) \
719 case TGSI_OPCODE_##c: \
720 if (type == GLSL_TYPE_UINT64) \
721 op = TGSI_OPCODE_##ui64; \
722 else if (type == GLSL_TYPE_INT64) \
723 op = TGSI_OPCODE_##i64; \
724 else if (type == GLSL_TYPE_DOUBLE) \
725 op = TGSI_OPCODE_##d; \
726 else if (type == GLSL_TYPE_INT) \
727 op = TGSI_OPCODE_##i; \
728 else if (type == GLSL_TYPE_UINT) \
729 op = TGSI_OPCODE_##u; \
730 else \
731 op = TGSI_OPCODE_##f; \
732 break;
733
/* casecomp: like case7, but for comparisons.  GLSL_TYPE_SUBROUTINE selects
 * the signed-integer variant, and the float variant f is only chosen when
 * native_integers is set; otherwise the original opcode c is kept.
 */
734 #define casecomp(c, f, i, u, d, i64, ui64) \
735 case TGSI_OPCODE_##c: \
736 if (type == GLSL_TYPE_INT64) \
737 op = TGSI_OPCODE_##i64; \
738 else if (type == GLSL_TYPE_UINT64) \
739 op = TGSI_OPCODE_##ui64; \
740 else if (type == GLSL_TYPE_DOUBLE) \
741 op = TGSI_OPCODE_##d; \
742 else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE) \
743 op = TGSI_OPCODE_##i; \
744 else if (type == GLSL_TYPE_UINT) \
745 op = TGSI_OPCODE_##u; \
746 else if (native_integers) \
747 op = TGSI_OPCODE_##f; \
748 else \
749 op = TGSI_OPCODE_##c; \
750 break;
751
752 switch (op) {
753 /* Some instructions are initially selected without considering the type.
754 * This fixes the type:
755 *
756 * INIT FLOAT SINT UINT DOUBLE SINT64 UINT64
757 */
/* A LAST entry marks a combination with no opcode; the assert after the
 * switch catches it.
 */
758 case7(ADD, ADD, UADD, UADD, DADD, U64ADD, U64ADD);
759 case7(CEIL, CEIL, LAST, LAST, DCEIL, LAST, LAST);
760 case7(DIV, DIV, IDIV, UDIV, DDIV, I64DIV, U64DIV);
761 case7(FMA, FMA, UMAD, UMAD, DFMA, LAST, LAST);
762 case7(FLR, FLR, LAST, LAST, DFLR, LAST, LAST);
763 case7(FRC, FRC, LAST, LAST, DFRAC, LAST, LAST);
764 case7(MUL, MUL, UMUL, UMUL, DMUL, U64MUL, U64MUL);
765 case7(MAD, MAD, UMAD, UMAD, DMAD, LAST, LAST);
766 case7(MAX, MAX, IMAX, UMAX, DMAX, I64MAX, U64MAX);
767 case7(MIN, MIN, IMIN, UMIN, DMIN, I64MIN, U64MIN);
768 case7(RCP, RCP, LAST, LAST, DRCP, LAST, LAST);
769 case7(ROUND, ROUND,LAST, LAST, DROUND, LAST, LAST);
770 case7(RSQ, RSQ, LAST, LAST, DRSQ, LAST, LAST);
771 case7(SQRT, SQRT, LAST, LAST, DSQRT, LAST, LAST);
772 case7(SSG, SSG, ISSG, ISSG, DSSG, I64SSG, I64SSG);
773 case7(TRUNC, TRUNC,LAST, LAST, DTRUNC, LAST, LAST);
774
775 case7(MOD, LAST, MOD, UMOD, LAST, I64MOD, U64MOD);
776 case7(SHL, LAST, SHL, SHL, LAST, U64SHL, U64SHL);
777 case7(IBFE, LAST, IBFE, UBFE, LAST, LAST, LAST);
778 case7(IMSB, LAST, IMSB, UMSB, LAST, LAST, LAST);
779 case7(IMUL_HI, LAST, IMUL_HI, UMUL_HI, LAST, LAST, LAST);
780 case7(ISHR, LAST, ISHR, USHR, LAST, I64SHR, U64SHR);
781 case7(ATOMIMAX,LAST, ATOMIMAX,ATOMUMAX,LAST, LAST, LAST);
782 case7(ATOMIMIN,LAST, ATOMIMIN,ATOMUMIN,LAST, LAST, LAST);
783 case7(ATOMUADD,ATOMFADD,ATOMUADD,ATOMUADD,LAST, LAST, LAST);
784
785 casecomp(SEQ, FSEQ, USEQ, USEQ, DSEQ, U64SEQ, U64SEQ);
786 casecomp(SNE, FSNE, USNE, USNE, DSNE, U64SNE, U64SNE);
787 casecomp(SGE, FSGE, ISGE, USGE, DSGE, I64SGE, U64SGE);
788 casecomp(SLT, FSLT, ISLT, USLT, DSLT, I64SLT, U64SLT);
789
790 default:
791 break;
792 }
793
794 assert(op != TGSI_OPCODE_LAST);
795 return op;
796 }
797
798 glsl_to_tgsi_instruction *
799 glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
800 st_dst_reg dst, st_src_reg src0, st_src_reg src1,
801 unsigned elements)
802 {
803 static const enum tgsi_opcode dot_opcodes[] = {
804 TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
805 };
806
807 return emit_asm(ir, dot_opcodes[elements - 2], dst, src0, src1);
808 }
809
810 /**
811 * Emits TGSI scalar opcodes to produce unique answers across channels.
812 *
813 * Some TGSI opcodes are scalar-only, like ARB_fp/vp. The src X
814 * channel determines the result across all channels. So to do a vec4
815 * of this operation, we want to emit a scalar per source channel used
816 * to produce dest channels.
817 */
818 void
819 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
820 st_dst_reg dst,
821 st_src_reg orig_src0, st_src_reg orig_src1)
822 {
823 int i, j;
824 int done_mask = ~dst.writemask;
825
826 /* TGSI RCP is a scalar operation splatting results to all channels,
827 * like ARB_fp/vp. So emit as many RCPs as necessary to cover our
828 * dst channels.
829 */
830 for (i = 0; i < 4; i++) {
831 GLuint this_mask = (1 << i);
832 st_src_reg src0 = orig_src0;
833 st_src_reg src1 = orig_src1;
834
835 if (done_mask & this_mask)
836 continue;
837
838 GLuint src0_swiz = GET_SWZ(src0.swizzle, i);
839 GLuint src1_swiz = GET_SWZ(src1.swizzle, i);
840 for (j = i + 1; j < 4; j++) {
841 /* If there is another enabled component in the destination that is
842 * derived from the same inputs, generate its value on this pass as
843 * well.
844 */
845 if (!(done_mask & (1 << j)) &&
846 GET_SWZ(src0.swizzle, j) == src0_swiz &&
847 GET_SWZ(src1.swizzle, j) == src1_swiz) {
848 this_mask |= (1 << j);
849 }
850 }
851 src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
852 src0_swiz, src0_swiz);
853 src1.swizzle = MAKE_SWIZZLE4(src1_swiz, src1_swiz,
854 src1_swiz, src1_swiz);
855
856 dst.writemask = this_mask;
857 emit_asm(ir, op, dst, src0, src1);
858 done_mask |= this_mask;
859 }
860 }
861
862 void
863 glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, enum tgsi_opcode op,
864 st_dst_reg dst, st_src_reg src0)
865 {
866 st_src_reg undef = undef_src;
867
868 undef.swizzle = SWIZZLE_XXXX;
869
870 emit_scalar(ir, op, dst, src0, undef);
871 }
872
873 void
874 glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
875 st_dst_reg dst, st_src_reg src0)
876 {
877 enum tgsi_opcode op = TGSI_OPCODE_ARL;
878
879 if (src0.type == GLSL_TYPE_INT || src0.type == GLSL_TYPE_UINT) {
880 if (!this->need_uarl && src0.is_legal_tgsi_address_operand())
881 return;
882
883 op = TGSI_OPCODE_UARL;
884 }
885
886 assert(dst.file == PROGRAM_ADDRESS);
887 if (dst.index >= this->num_address_regs)
888 this->num_address_regs = dst.index + 1;
889
890 emit_asm(NULL, op, dst, src0);
891 }
892
893 int
894 glsl_to_tgsi_visitor::add_constant(gl_register_file file,
895 gl_constant_value values[8], int size,
896 GLenum datatype,
897 uint16_t *swizzle_out)
898 {
899 if (file == PROGRAM_CONSTANT) {
900 GLuint swizzle = swizzle_out ? *swizzle_out : 0;
901 int result = _mesa_add_typed_unnamed_constant(this->prog->Parameters,
902 values, size, datatype,
903 &swizzle);
904 if (swizzle_out)
905 *swizzle_out = swizzle;
906 return result;
907 }
908
909 assert(file == PROGRAM_IMMEDIATE);
910
911 int index = 0;
912 immediate_storage *entry;
913 int size32 = size * ((datatype == GL_DOUBLE ||
914 datatype == GL_INT64_ARB ||
915 datatype == GL_UNSIGNED_INT64_ARB) ? 2 : 1);
916 int i;
917
918 /* Search immediate storage to see if we already have an identical
919 * immediate that we can use instead of adding a duplicate entry.
920 */
921 foreach_in_list(immediate_storage, entry, &this->immediates) {
922 immediate_storage *tmp = entry;
923
924 for (i = 0; i * 4 < size32; i++) {
925 int slot_size = MIN2(size32 - (i * 4), 4);
926 if (tmp->type != datatype || tmp->size32 != slot_size)
927 break;
928 if (memcmp(tmp->values, &values[i * 4],
929 slot_size * sizeof(gl_constant_value)))
930 break;
931
932 /* Everything matches, keep going until the full size is matched */
933 tmp = (immediate_storage *)tmp->next;
934 }
935
936 /* The full value matched */
937 if (i * 4 >= size32)
938 return index;
939
940 index++;
941 }
942
943 for (i = 0; i * 4 < size32; i++) {
944 int slot_size = MIN2(size32 - (i * 4), 4);
945 /* Add this immediate to the list. */
946 entry = new(mem_ctx) immediate_storage(&values[i * 4],
947 slot_size, datatype);
948 this->immediates.push_tail(entry);
949 this->num_immediates++;
950 }
951 return index;
952 }
953
954 st_src_reg
955 glsl_to_tgsi_visitor::st_src_reg_for_float(float val)
956 {
957 st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_FLOAT);
958 union gl_constant_value uval;
959
960 uval.f = val;
961 src.index = add_constant(src.file, &uval, 1, GL_FLOAT, &src.swizzle);
962
963 return src;
964 }
965
966 st_src_reg
967 glsl_to_tgsi_visitor::st_src_reg_for_double(double val)
968 {
969 st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_DOUBLE);
970 union gl_constant_value uval[2];
971
972 memcpy(uval, &val, sizeof(uval));
973 src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
974 src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
975 return src;
976 }
977
978 st_src_reg
979 glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
980 {
981 st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT);
982 union gl_constant_value uval;
983
984 assert(native_integers);
985
986 uval.i = val;
987 src.index = add_constant(src.file, &uval, 1, GL_INT, &src.swizzle);
988
989 return src;
990 }
991
992 st_src_reg
993 glsl_to_tgsi_visitor::st_src_reg_for_int64(int64_t val)
994 {
995 st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT64);
996 union gl_constant_value uval[2];
997
998 memcpy(uval, &val, sizeof(uval));
999 src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
1000 src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
1001
1002 return src;
1003 }
1004
1005 st_src_reg
1006 glsl_to_tgsi_visitor::st_src_reg_for_type(enum glsl_base_type type, int val)
1007 {
1008 if (native_integers)
1009 return type == GLSL_TYPE_FLOAT ? st_src_reg_for_float(val) :
1010 st_src_reg_for_int(val);
1011 else
1012 return st_src_reg_for_float(val);
1013 }
1014
1015 static int
1016 attrib_type_size(const struct glsl_type *type, bool is_vs_input)
1017 {
1018 return type->count_attribute_slots(is_vs_input);
1019 }
1020
1021 static int
1022 type_size(const struct glsl_type *type)
1023 {
1024 return type->count_attribute_slots(false);
1025 }
1026
1027 static void
1028 add_buffer_to_load_and_stores(glsl_to_tgsi_instruction *inst, st_src_reg *buf,
1029 exec_list *instructions, ir_constant *access)
1030 {
1031 /**
1032 * emit_asm() might have actually split the op into pieces, e.g. for
1033 * double stores. We have to go back and fix up all the generated ops.
1034 */
1035 enum tgsi_opcode op = inst->op;
1036 do {
1037 inst->resource = *buf;
1038 if (access)
1039 inst->buffer_access = access->value.u[0];
1040
1041 if (inst == instructions->get_head_raw())
1042 break;
1043 inst = (glsl_to_tgsi_instruction *)inst->get_prev();
1044
1045 if (inst->op == TGSI_OPCODE_UADD) {
1046 if (inst == instructions->get_head_raw())
1047 break;
1048 inst = (glsl_to_tgsi_instruction *)inst->get_prev();
1049 }
1050 } while (inst->op == op && inst->resource.file == PROGRAM_UNDEFINED);
1051 }
1052
1053 /**
1054 * If the given GLSL type is an array or matrix or a structure containing
1055 * an array/matrix member, return true. Else return false.
1056 *
1057 * This is used to determine which kind of temp storage (PROGRAM_TEMPORARY
1058 * or PROGRAM_ARRAY) should be used for variables of this type. Anytime
1059 * we have an array that might be indexed with a variable, we need to use
1060 * the later storage type.
1061 */
1062 static bool
1063 type_has_array_or_matrix(const glsl_type *type)
1064 {
1065 if (type->is_array() || type->is_matrix())
1066 return true;
1067
1068 if (type->is_struct()) {
1069 for (unsigned i = 0; i < type->length; i++) {
1070 if (type_has_array_or_matrix(type->fields.structure[i].type)) {
1071 return true;
1072 }
1073 }
1074 }
1075
1076 return false;
1077 }
1078
1079
1080 /**
1081 * In the initial pass of codegen, we assign temporary numbers to
1082 * intermediate results. (not SSA -- variable assignments will reuse
1083 * storage).
1084 */
1085 st_src_reg
1086 glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
1087 {
1088 st_src_reg src;
1089
1090 src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
1091 src.reladdr = NULL;
1092 src.negate = 0;
1093 src.abs = 0;
1094
1095 if (!options->EmitNoIndirectTemp && type_has_array_or_matrix(type)) {
1096 if (next_array >= max_num_arrays) {
1097 max_num_arrays += 32;
1098 array_sizes = (unsigned*)
1099 realloc(array_sizes, sizeof(array_sizes[0]) * max_num_arrays);
1100 }
1101
1102 src.file = PROGRAM_ARRAY;
1103 src.index = 0;
1104 src.array_id = next_array + 1;
1105 array_sizes[next_array] = type_size(type);
1106 ++next_array;
1107
1108 } else {
1109 src.file = PROGRAM_TEMPORARY;
1110 src.index = next_temp;
1111 next_temp += type_size(type);
1112 }
1113
1114 if (type->is_array() || type->is_struct()) {
1115 src.swizzle = SWIZZLE_NOOP;
1116 } else {
1117 src.swizzle = swizzle_for_size(type->vector_elements);
1118 }
1119
1120 return src;
1121 }
1122
1123 variable_storage *
1124 glsl_to_tgsi_visitor::find_variable_storage(ir_variable *var)
1125 {
1126 struct hash_entry *entry;
1127
1128 entry = _mesa_hash_table_search(this->variables, var);
1129 if (!entry)
1130 return NULL;
1131
1132 return (variable_storage *)entry->data;
1133 }
1134
1135 void
1136 glsl_to_tgsi_visitor::visit(ir_variable *ir)
1137 {
1138 if (ir->data.mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
1139 unsigned int i;
1140 const ir_state_slot *const slots = ir->get_state_slots();
1141 assert(slots != NULL);
1142
1143 /* Check if this statevar's setup in the STATE file exactly
1144 * matches how we'll want to reference it as a
1145 * struct/array/whatever. If not, then we need to move it into
1146 * temporary storage and hope that it'll get copy-propagated
1147 * out.
1148 */
1149 for (i = 0; i < ir->get_num_state_slots(); i++) {
1150 if (slots[i].swizzle != SWIZZLE_XYZW) {
1151 break;
1152 }
1153 }
1154
1155 variable_storage *storage;
1156 st_dst_reg dst;
1157 if (i == ir->get_num_state_slots()) {
1158 /* We'll set the index later. */
1159 storage = new(mem_ctx) variable_storage(ir, PROGRAM_STATE_VAR, -1);
1160
1161 _mesa_hash_table_insert(this->variables, ir, storage);
1162
1163 dst = undef_dst;
1164 } else {
1165 /* The variable_storage constructor allocates slots based on the size
1166 * of the type. However, this had better match the number of state
1167 * elements that we're going to copy into the new temporary.
1168 */
1169 assert((int) ir->get_num_state_slots() == type_size(ir->type));
1170
1171 dst = st_dst_reg(get_temp(ir->type));
1172
1173 storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index,
1174 dst.array_id);
1175
1176 _mesa_hash_table_insert(this->variables, ir, storage);
1177 }
1178
1179
1180 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1181 int index = _mesa_add_state_reference(this->prog->Parameters,
1182 slots[i].tokens);
1183
1184 if (storage->file == PROGRAM_STATE_VAR) {
1185 if (storage->index == -1) {
1186 storage->index = index;
1187 } else {
1188 assert(index == storage->index + (int)i);
1189 }
1190 } else {
1191 /* We use GLSL_TYPE_FLOAT here regardless of the actual type of
1192 * the data being moved since MOV does not care about the type of
1193 * data it is moving, and we don't want to declare registers with
1194 * array or struct types.
1195 */
1196 st_src_reg src(PROGRAM_STATE_VAR, index, GLSL_TYPE_FLOAT);
1197 src.swizzle = slots[i].swizzle;
1198 emit_asm(ir, TGSI_OPCODE_MOV, dst, src);
1199 /* even a float takes up a whole vec4 reg in a struct/array. */
1200 dst.index++;
1201 }
1202 }
1203
1204 if (storage->file == PROGRAM_TEMPORARY &&
1205 dst.index != storage->index + (int) ir->get_num_state_slots()) {
1206 fail_link(this->shader_program,
1207 "failed to load builtin uniform `%s' (%d/%d regs loaded)\n",
1208 ir->name, dst.index - storage->index,
1209 type_size(ir->type));
1210 }
1211 }
1212 }
1213
1214 void
1215 glsl_to_tgsi_visitor::visit(ir_loop *ir)
1216 {
1217 emit_asm(NULL, TGSI_OPCODE_BGNLOOP);
1218
1219 visit_exec_list(&ir->body_instructions, this);
1220
1221 emit_asm(NULL, TGSI_OPCODE_ENDLOOP);
1222 }
1223
1224 void
1225 glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
1226 {
1227 switch (ir->mode) {
1228 case ir_loop_jump::jump_break:
1229 emit_asm(NULL, TGSI_OPCODE_BRK);
1230 break;
1231 case ir_loop_jump::jump_continue:
1232 emit_asm(NULL, TGSI_OPCODE_CONT);
1233 break;
1234 }
1235 }
1236
1237
1238 void
1239 glsl_to_tgsi_visitor::visit(ir_function_signature *ir)
1240 {
1241 assert(0);
1242 (void)ir;
1243 }
1244
1245 void
1246 glsl_to_tgsi_visitor::visit(ir_function *ir)
1247 {
1248 /* Ignore function bodies other than main() -- we shouldn't see calls to
1249 * them since they should all be inlined before we get to glsl_to_tgsi.
1250 */
1251 if (strcmp(ir->name, "main") == 0) {
1252 const ir_function_signature *sig;
1253 exec_list empty;
1254
1255 sig = ir->matching_signature(NULL, &empty, false);
1256
1257 assert(sig);
1258
1259 foreach_in_list(ir_instruction, ir, &sig->body) {
1260 ir->accept(this);
1261 }
1262 }
1263 }
1264
1265 bool
1266 glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
1267 {
1268 int nonmul_operand = 1 - mul_operand;
1269 st_src_reg a, b, c;
1270 st_dst_reg result_dst;
1271
1272 // there is no TGSI opcode for this
1273 if (ir->type->is_integer_64())
1274 return false;
1275
1276 ir_expression *expr = ir->operands[mul_operand]->as_expression();
1277 if (!expr || expr->operation != ir_binop_mul)
1278 return false;
1279
1280 expr->operands[0]->accept(this);
1281 a = this->result;
1282 expr->operands[1]->accept(this);
1283 b = this->result;
1284 ir->operands[nonmul_operand]->accept(this);
1285 c = this->result;
1286
1287 this->result = get_temp(ir->type);
1288 result_dst = st_dst_reg(this->result);
1289 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1290 emit_asm(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
1291
1292 return true;
1293 }
1294
1295 /**
1296 * Emit MAD(a, -b, a) instead of AND(a, NOT(b))
1297 *
1298 * The logic values are 1.0 for true and 0.0 for false. Logical-and is
1299 * implemented using multiplication, and logical-or is implemented using
1300 * addition. Logical-not can be implemented as (true - x), or (1.0 - x).
1301 * As result, the logical expression (a & !b) can be rewritten as:
1302 *
1303 * - a * !b
1304 * - a * (1 - b)
1305 * - (a * 1) - (a * b)
1306 * - a + -(a * b)
1307 * - a + (a * -b)
1308 *
1309 * This final expression can be implemented as a single MAD(a, -b, a)
1310 * instruction.
1311 */
1312 bool
1313 glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir,
1314 int try_operand)
1315 {
1316 const int other_operand = 1 - try_operand;
1317 st_src_reg a, b;
1318
1319 ir_expression *expr = ir->operands[try_operand]->as_expression();
1320 if (!expr || expr->operation != ir_unop_logic_not)
1321 return false;
1322
1323 ir->operands[other_operand]->accept(this);
1324 a = this->result;
1325 expr->operands[0]->accept(this);
1326 b = this->result;
1327
1328 b.negate = ~b.negate;
1329
1330 this->result = get_temp(ir->type);
1331 emit_asm(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
1332
1333 return true;
1334 }
1335
1336 void
1337 glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
1338 st_src_reg *reg, int *num_reladdr)
1339 {
1340 if (!reg->reladdr && !reg->reladdr2)
1341 return;
1342
1343 if (reg->reladdr)
1344 emit_arl(ir, address_reg, *reg->reladdr);
1345 if (reg->reladdr2)
1346 emit_arl(ir, address_reg2, *reg->reladdr2);
1347
1348 if (*num_reladdr != 1) {
1349 st_src_reg temp = get_temp(glsl_type::get_instance(reg->type, 4, 1));
1350
1351 emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
1352 *reg = temp;
1353 }
1354
1355 (*num_reladdr)--;
1356 }
1357
1358 void
1359 glsl_to_tgsi_visitor::visit(ir_expression *ir)
1360 {
1361 st_src_reg op[ARRAY_SIZE(ir->operands)];
1362
1363 /* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
1364 */
1365 if (!this->precise && ir->operation == ir_binop_add) {
1366 if (try_emit_mad(ir, 1))
1367 return;
1368 if (try_emit_mad(ir, 0))
1369 return;
1370 }
1371
1372 /* Quick peephole: Emit OPCODE_MAD(-a, -b, a) instead of AND(a, NOT(b))
1373 */
1374 if (!native_integers && ir->operation == ir_binop_logic_and) {
1375 if (try_emit_mad_for_and_not(ir, 1))
1376 return;
1377 if (try_emit_mad_for_and_not(ir, 0))
1378 return;
1379 }
1380
1381 if (ir->operation == ir_quadop_vector)
1382 assert(!"ir_quadop_vector should have been lowered");
1383
1384 for (unsigned int operand = 0; operand < ir->num_operands; operand++) {
1385 this->result.file = PROGRAM_UNDEFINED;
1386 ir->operands[operand]->accept(this);
1387 if (this->result.file == PROGRAM_UNDEFINED) {
1388 printf("Failed to get tree for expression operand:\n");
1389 ir->operands[operand]->print();
1390 printf("\n");
1391 exit(1);
1392 }
1393 op[operand] = this->result;
1394
1395 /* Matrix expression operands should have been broken down to vector
1396 * operations already.
1397 */
1398 assert(!ir->operands[operand]->type->is_matrix());
1399 }
1400
1401 visit_expression(ir, op);
1402 }
1403
1404 /* The non-recursive part of the expression visitor lives in a separate
1405 * function and should be prevented from being inlined, to avoid a stack
1406 * explosion when deeply nested expressions are visited.
1407 */
1408 void
1409 glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
1410 {
1411 st_src_reg result_src;
1412 st_dst_reg result_dst;
1413
1414 int vector_elements = ir->operands[0]->type->vector_elements;
1415 if (ir->operands[1] &&
1416 ir->operation != ir_binop_interpolate_at_offset &&
1417 ir->operation != ir_binop_interpolate_at_sample) {
1418 st_src_reg *swz_op = NULL;
1419 if (vector_elements > ir->operands[1]->type->vector_elements) {
1420 assert(ir->operands[1]->type->vector_elements == 1);
1421 swz_op = &op[1];
1422 } else if (vector_elements < ir->operands[1]->type->vector_elements) {
1423 assert(ir->operands[0]->type->vector_elements == 1);
1424 swz_op = &op[0];
1425 }
1426 if (swz_op) {
1427 uint16_t swizzle_x = GET_SWZ(swz_op->swizzle, 0);
1428 swz_op->swizzle = MAKE_SWIZZLE4(swizzle_x, swizzle_x,
1429 swizzle_x, swizzle_x);
1430 }
1431 vector_elements = MAX2(vector_elements,
1432 ir->operands[1]->type->vector_elements);
1433 }
1434 if (ir->operands[2] &&
1435 ir->operands[2]->type->vector_elements != vector_elements) {
1436 /* This can happen with ir_triop_lrp, i.e. glsl mix */
1437 assert(ir->operands[2]->type->vector_elements == 1);
1438 uint16_t swizzle_x = GET_SWZ(op[2].swizzle, 0);
1439 op[2].swizzle = MAKE_SWIZZLE4(swizzle_x, swizzle_x,
1440 swizzle_x, swizzle_x);
1441 }
1442
1443 this->result.file = PROGRAM_UNDEFINED;
1444
1445 /* Storage for our result. Ideally for an assignment we'd be using
1446 * the actual storage for the result here, instead.
1447 */
1448 result_src = get_temp(ir->type);
1449 /* convenience for the emit functions below. */
1450 result_dst = st_dst_reg(result_src);
1451 /* Limit writes to the channels that will be used by result_src later.
1452 * This does limit this temp's use as a temporary for multi-instruction
1453 * sequences.
1454 */
1455 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1456
1457 switch (ir->operation) {
1458 case ir_unop_logic_not:
1459 if (result_dst.type != GLSL_TYPE_FLOAT)
1460 emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1461 else {
1462 /* Previously 'SEQ dst, src, 0.0' was used for this. However, many
1463 * older GPUs implement SEQ using multiple instructions (i915 uses two
1464 * SGE instructions and a MUL instruction). Since our logic values are
1465 * 0.0 and 1.0, 1-x also implements !x.
1466 */
1467 op[0].negate = ~op[0].negate;
1468 emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0],
1469 st_src_reg_for_float(1.0));
1470 }
1471 break;
1472 case ir_unop_neg:
1473 if (result_dst.type == GLSL_TYPE_INT64 ||
1474 result_dst.type == GLSL_TYPE_UINT64)
1475 emit_asm(ir, TGSI_OPCODE_I64NEG, result_dst, op[0]);
1476 else if (result_dst.type == GLSL_TYPE_INT ||
1477 result_dst.type == GLSL_TYPE_UINT)
1478 emit_asm(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
1479 else if (result_dst.type == GLSL_TYPE_DOUBLE)
1480 emit_asm(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
1481 else {
1482 op[0].negate = ~op[0].negate;
1483 result_src = op[0];
1484 }
1485 break;
1486 case ir_unop_subroutine_to_int:
1487 emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1488 break;
1489 case ir_unop_abs:
1490 if (result_dst.type == GLSL_TYPE_FLOAT)
1491 emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0].get_abs());
1492 else if (result_dst.type == GLSL_TYPE_DOUBLE)
1493 emit_asm(ir, TGSI_OPCODE_DABS, result_dst, op[0]);
1494 else if (result_dst.type == GLSL_TYPE_INT64 ||
1495 result_dst.type == GLSL_TYPE_UINT64)
1496 emit_asm(ir, TGSI_OPCODE_I64ABS, result_dst, op[0]);
1497 else
1498 emit_asm(ir, TGSI_OPCODE_IABS, result_dst, op[0]);
1499 break;
1500 case ir_unop_sign:
1501 emit_asm(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
1502 break;
1503 case ir_unop_rcp:
1504 emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
1505 break;
1506
1507 case ir_unop_exp2:
1508 emit_scalar(ir, TGSI_OPCODE_EX2, result_dst, op[0]);
1509 break;
1510 case ir_unop_exp:
1511 assert(!"not reached: should be handled by exp_to_exp2");
1512 break;
1513 case ir_unop_log:
1514 assert(!"not reached: should be handled by log_to_log2");
1515 break;
1516 case ir_unop_log2:
1517 emit_scalar(ir, TGSI_OPCODE_LG2, result_dst, op[0]);
1518 break;
1519 case ir_unop_sin:
1520 emit_scalar(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1521 break;
1522 case ir_unop_cos:
1523 emit_scalar(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1524 break;
1525 case ir_unop_saturate: {
1526 glsl_to_tgsi_instruction *inst;
1527 inst = emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1528 inst->saturate = true;
1529 break;
1530 }
1531
1532 case ir_unop_dFdx:
1533 case ir_unop_dFdx_coarse:
1534 emit_asm(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
1535 break;
1536 case ir_unop_dFdx_fine:
1537 emit_asm(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
1538 break;
1539 case ir_unop_dFdy:
1540 case ir_unop_dFdy_coarse:
1541 case ir_unop_dFdy_fine:
1542 {
1543 /* The X component contains 1 or -1 depending on whether the framebuffer
1544 * is a FBO or the window system buffer, respectively.
1545 * It is then multiplied with the source operand of DDY.
1546 */
1547 static const gl_state_index16 transform_y_state[STATE_LENGTH]
1548 = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
1549
1550 unsigned transform_y_index =
1551 _mesa_add_state_reference(this->prog->Parameters,
1552 transform_y_state);
1553
1554 st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
1555 transform_y_index,
1556 glsl_type::vec4_type);
1557 transform_y.swizzle = SWIZZLE_XXXX;
1558
1559 st_src_reg temp = get_temp(glsl_type::vec4_type);
1560
1561 emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
1562 emit_asm(ir, ir->operation == ir_unop_dFdy_fine ?
1563 TGSI_OPCODE_DDY_FINE : TGSI_OPCODE_DDY, result_dst, temp);
1564 break;
1565 }
1566
1567 case ir_unop_frexp_sig:
1568 emit_asm(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
1569 break;
1570
1571 case ir_unop_frexp_exp:
1572 emit_asm(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
1573 break;
1574
1575 case ir_binop_add:
1576 emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1577 break;
1578 case ir_binop_sub:
1579 op[1].negate = ~op[1].negate;
1580 emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1581 break;
1582
1583 case ir_binop_mul:
1584 emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1585 break;
1586 case ir_binop_div:
1587 emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
1588 break;
1589 case ir_binop_mod:
1590 if (result_dst.type == GLSL_TYPE_FLOAT)
1591 assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1592 else
1593 emit_asm(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
1594 break;
1595
1596 case ir_binop_less:
1597 emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
1598 break;
1599 case ir_binop_gequal:
1600 emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
1601 break;
1602 case ir_binop_equal:
1603 emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1604 break;
1605 case ir_binop_nequal:
1606 emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1607 break;
1608 case ir_binop_all_equal:
1609 /* "==" operator producing a scalar boolean. */
1610 if (ir->operands[0]->type->is_vector() ||
1611 ir->operands[1]->type->is_vector()) {
1612 st_src_reg temp = get_temp(native_integers ?
1613 glsl_type::uvec4_type :
1614 glsl_type::vec4_type);
1615
1616 if (native_integers) {
1617 st_dst_reg temp_dst = st_dst_reg(temp);
1618 st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1619
1620 if (ir->operands[0]->type->is_boolean() &&
1621 ir->operands[1]->as_constant() &&
1622 ir->operands[1]->as_constant()->is_one()) {
1623 emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1624 } else {
1625 emit_asm(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
1626 }
1627
1628 /* Emit 1-3 AND operations to combine the SEQ results. */
1629 switch (ir->operands[0]->type->vector_elements) {
1630 case 2:
1631 break;
1632 case 3:
1633 temp_dst.writemask = WRITEMASK_Y;
1634 temp1.swizzle = SWIZZLE_YYYY;
1635 temp2.swizzle = SWIZZLE_ZZZZ;
1636 emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1637 break;
1638 case 4:
1639 temp_dst.writemask = WRITEMASK_X;
1640 temp1.swizzle = SWIZZLE_XXXX;
1641 temp2.swizzle = SWIZZLE_YYYY;
1642 emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1643 temp_dst.writemask = WRITEMASK_Y;
1644 temp1.swizzle = SWIZZLE_ZZZZ;
1645 temp2.swizzle = SWIZZLE_WWWW;
1646 emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
1647 }
1648
1649 temp1.swizzle = SWIZZLE_XXXX;
1650 temp2.swizzle = SWIZZLE_YYYY;
1651 emit_asm(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
1652 } else {
1653 emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1654
1655 /* After the dot-product, the value will be an integer on the
1656 * range [0,4]. Zero becomes 1.0, and positive values become zero.
1657 */
1658 emit_dp(ir, result_dst, temp, temp, vector_elements);
1659
1660 /* Negating the result of the dot-product gives values on the range
1661 * [-4, 0]. Zero becomes 1.0, and negative values become zero.
1662 * This is achieved using SGE.
1663 */
1664 st_src_reg sge_src = result_src;
1665 sge_src.negate = ~sge_src.negate;
1666 emit_asm(ir, TGSI_OPCODE_SGE, result_dst, sge_src,
1667 st_src_reg_for_float(0.0));
1668 }
1669 } else {
1670 emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1671 }
1672 break;
1673 case ir_binop_any_nequal:
1674 /* "!=" operator producing a scalar boolean. */
1675 if (ir->operands[0]->type->is_vector() ||
1676 ir->operands[1]->type->is_vector()) {
1677 st_src_reg temp = get_temp(native_integers ?
1678 glsl_type::uvec4_type :
1679 glsl_type::vec4_type);
1680 if (ir->operands[0]->type->is_boolean() &&
1681 ir->operands[1]->as_constant() &&
1682 ir->operands[1]->as_constant()->is_zero()) {
1683 emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), op[0]);
1684 } else {
1685 emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
1686 }
1687
1688 if (native_integers) {
1689 st_dst_reg temp_dst = st_dst_reg(temp);
1690 st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
1691
1692 /* Emit 1-3 OR operations to combine the SNE results. */
1693 switch (ir->operands[0]->type->vector_elements) {
1694 case 2:
1695 break;
1696 case 3:
1697 temp_dst.writemask = WRITEMASK_Y;
1698 temp1.swizzle = SWIZZLE_YYYY;
1699 temp2.swizzle = SWIZZLE_ZZZZ;
1700 emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1701 break;
1702 case 4:
1703 temp_dst.writemask = WRITEMASK_X;
1704 temp1.swizzle = SWIZZLE_XXXX;
1705 temp2.swizzle = SWIZZLE_YYYY;
1706 emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1707 temp_dst.writemask = WRITEMASK_Y;
1708 temp1.swizzle = SWIZZLE_ZZZZ;
1709 temp2.swizzle = SWIZZLE_WWWW;
1710 emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
1711 }
1712
1713 temp1.swizzle = SWIZZLE_XXXX;
1714 temp2.swizzle = SWIZZLE_YYYY;
1715 emit_asm(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
1716 } else {
1717 /* After the dot-product, the value will be an integer on the
1718 * range [0,4]. Zero stays zero, and positive values become 1.0.
1719 */
1720 glsl_to_tgsi_instruction *const dp =
1721 emit_dp(ir, result_dst, temp, temp, vector_elements);
1722 if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1723 /* The clamping to [0,1] can be done for free in the fragment
1724 * shader with a saturate.
1725 */
1726 dp->saturate = true;
1727 } else {
1728 /* Negating the result of the dot-product gives values on the
1729 * range [-4, 0]. Zero stays zero, and negative values become
1730 * 1.0. This achieved using SLT.
1731 */
1732 st_src_reg slt_src = result_src;
1733 slt_src.negate = ~slt_src.negate;
1734 emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src,
1735 st_src_reg_for_float(0.0));
1736 }
1737 }
1738 } else {
1739 emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1740 }
1741 break;
1742
1743 case ir_binop_logic_xor:
1744 if (native_integers)
1745 emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1746 else
1747 emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1748 break;
1749
1750 case ir_binop_logic_or: {
1751 if (native_integers) {
1752 /* If integers are used as booleans, we can use an actual "or"
1753 * instruction.
1754 */
1755 assert(native_integers);
1756 emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1757 } else {
1758 /* After the addition, the value will be an integer on the
1759 * range [0,2]. Zero stays zero, and positive values become 1.0.
1760 */
1761 glsl_to_tgsi_instruction *add =
1762 emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1763 if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
1764 /* The clamping to [0,1] can be done for free in the fragment
1765 * shader with a saturate if floats are being used as boolean
1766 * values.
1767 */
1768 add->saturate = true;
1769 } else {
1770 /* Negating the result of the addition gives values on the range
1771 * [-2, 0]. Zero stays zero, and negative values become 1.0
1772 * This is achieved using SLT.
1773 */
1774 st_src_reg slt_src = result_src;
1775 slt_src.negate = ~slt_src.negate;
1776 emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src,
1777 st_src_reg_for_float(0.0));
1778 }
1779 }
1780 break;
1781 }
1782
1783 case ir_binop_logic_and:
1784 /* If native integers are disabled, the bool args are stored as float 0.0
1785 * or 1.0, so "mul" gives us "and". If they're enabled, just use the
1786 * actual AND opcode.
1787 */
1788 if (native_integers)
1789 emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1790 else
1791 emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1792 break;
1793
1794 case ir_binop_dot:
1795 assert(ir->operands[0]->type->is_vector());
1796 assert(ir->operands[0]->type == ir->operands[1]->type);
1797 emit_dp(ir, result_dst, op[0], op[1],
1798 ir->operands[0]->type->vector_elements);
1799 break;
1800
1801 case ir_unop_sqrt:
1802 if (have_sqrt) {
1803 emit_scalar(ir, TGSI_OPCODE_SQRT, result_dst, op[0]);
1804 } else {
1805 /* This is the only instruction sequence that makes the game "Risen"
1806 * render correctly. ABS is not required for the game, but since GLSL
1807 * declares negative values as "undefined", allowing us to do whatever
1808 * we want, I choose to use ABS to match DX9 and pre-GLSL RSQ
1809 * behavior.
1810 */
1811 emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0].get_abs());
1812 emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, result_src);
1813 }
1814 break;
1815 case ir_unop_rsq:
1816 emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
1817 break;
1818 case ir_unop_i2f:
1819 if (native_integers) {
1820 emit_asm(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
1821 break;
1822 }
1823 /* fallthrough to next case otherwise */
1824 case ir_unop_b2f:
1825 if (native_integers) {
1826 emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0],
1827 st_src_reg_for_float(1.0));
1828 break;
1829 }
1830 /* fallthrough to next case otherwise */
1831 case ir_unop_i2u:
1832 case ir_unop_u2i:
1833 case ir_unop_i642u64:
1834 case ir_unop_u642i64:
1835 /* Converting between signed and unsigned integers is a no-op. */
1836 result_src = op[0];
1837 result_src.type = result_dst.type;
1838 break;
1839 case ir_unop_b2i:
1840 if (native_integers) {
1841 /* Booleans are stored as integers using ~0 for true and 0 for false.
1842 * GLSL requires that int(bool) return 1 for true and 0 for false.
1843 * This conversion is done with AND, but it could be done with NEG.
1844 */
1845 emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0],
1846 st_src_reg_for_int(1));
1847 } else {
1848 /* Booleans and integers are both stored as floats when native
1849 * integers are disabled.
1850 */
1851 result_src = op[0];
1852 }
1853 break;
1854 case ir_unop_f2i:
1855 if (native_integers)
1856 emit_asm(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
1857 else
1858 emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1859 break;
1860 case ir_unop_f2u:
1861 if (native_integers)
1862 emit_asm(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
1863 else
1864 emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1865 break;
1866 case ir_unop_bitcast_f2i:
1867 case ir_unop_bitcast_f2u:
1868 /* Make sure we don't propagate the negate modifier to integer opcodes. */
1869 if (op[0].negate || op[0].abs)
1870 emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
1871 else
1872 result_src = op[0];
1873 result_src.type = ir->operation == ir_unop_bitcast_f2i ? GLSL_TYPE_INT :
1874 GLSL_TYPE_UINT;
1875 break;
1876 case ir_unop_bitcast_i2f:
1877 case ir_unop_bitcast_u2f:
1878 result_src = op[0];
1879 result_src.type = GLSL_TYPE_FLOAT;
1880 break;
1881 case ir_unop_f2b:
1882 emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0],
1883 st_src_reg_for_float(0.0));
1884 break;
1885 case ir_unop_d2b:
1886 emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0],
1887 st_src_reg_for_double(0.0));
1888 break;
1889 case ir_unop_i2b:
1890 if (native_integers)
1891 emit_asm(ir, TGSI_OPCODE_USNE, result_dst, op[0],
1892 st_src_reg_for_int(0));
1893 else
1894 emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0],
1895 st_src_reg_for_float(0.0));
1896 break;
1897 case ir_unop_bitcast_u642d:
1898 case ir_unop_bitcast_i642d:
1899 result_src = op[0];
1900 result_src.type = GLSL_TYPE_DOUBLE;
1901 break;
1902 case ir_unop_bitcast_d2i64:
1903 result_src = op[0];
1904 result_src.type = GLSL_TYPE_INT64;
1905 break;
1906 case ir_unop_bitcast_d2u64:
1907 result_src = op[0];
1908 result_src.type = GLSL_TYPE_UINT64;
1909 break;
1910 case ir_unop_trunc:
1911 emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
1912 break;
1913 case ir_unop_ceil:
1914 emit_asm(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
1915 break;
1916 case ir_unop_floor:
1917 emit_asm(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
1918 break;
1919 case ir_unop_round_even:
1920 emit_asm(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
1921 break;
1922 case ir_unop_fract:
1923 emit_asm(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
1924 break;
1925
1926 case ir_binop_min:
1927 emit_asm(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
1928 break;
1929 case ir_binop_max:
1930 emit_asm(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
1931 break;
1932 case ir_binop_pow:
1933 emit_scalar(ir, TGSI_OPCODE_POW, result_dst, op[0], op[1]);
1934 break;
1935
1936 case ir_unop_bit_not:
1937 if (native_integers) {
1938 emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1939 break;
1940 }
1941 /* fallthrough */
1942 case ir_unop_u2f:
1943 if (native_integers) {
1944 emit_asm(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
1945 break;
1946 }
1947 /* fallthrough */
1948 case ir_binop_lshift:
1949 case ir_binop_rshift:
1950 if (native_integers) {
1951 enum tgsi_opcode opcode = ir->operation == ir_binop_lshift
1952 ? TGSI_OPCODE_SHL : TGSI_OPCODE_ISHR;
1953 st_src_reg count;
1954
1955 if (glsl_base_type_is_64bit(op[0].type)) {
1956 /* GLSL shift operations have 32-bit shift counts, but TGSI uses
1957 * 64 bits.
1958 */
1959 count = get_temp(glsl_type::u64vec(ir->operands[1]
1960 ->type->components()));
1961 emit_asm(ir, TGSI_OPCODE_U2I64, st_dst_reg(count), op[1]);
1962 } else {
1963 count = op[1];
1964 }
1965
1966 emit_asm(ir, opcode, result_dst, op[0], count);
1967 break;
1968 }
1969 /* fallthrough */
1970 case ir_binop_bit_and:
1971 if (native_integers) {
1972 emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
1973 break;
1974 }
1975 /* fallthrough */
1976 case ir_binop_bit_xor:
1977 if (native_integers) {
1978 emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
1979 break;
1980 }
1981 /* fallthrough */
1982 case ir_binop_bit_or:
1983 if (native_integers) {
1984 emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
1985 break;
1986 }
1987
1988 assert(!"GLSL 1.30 features unsupported");
1989 break;
1990
1991 case ir_binop_ubo_load: {
1992 if (ctx->Const.UseSTD430AsDefaultPacking) {
1993 ir_rvalue *block = ir->operands[0];
1994 ir_rvalue *offset = ir->operands[1];
1995 ir_constant *const_block = block->as_constant();
1996
1997 st_src_reg cbuf(PROGRAM_CONSTANT,
1998 (const_block ? const_block->value.u[0] + 1 : 1),
1999 ir->type->base_type);
2000
2001 cbuf.has_index2 = true;
2002
2003 if (!const_block) {
2004 block->accept(this);
2005 cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
2006 *cbuf.reladdr = this->result;
2007 emit_arl(ir, sampler_reladdr, this->result);
2008 }
2009
2010 /* Calculate the surface offset */
2011 offset->accept(this);
2012 st_src_reg off = this->result;
2013
2014 glsl_to_tgsi_instruction *inst =
2015 emit_asm(ir, TGSI_OPCODE_LOAD, result_dst, off);
2016
2017 if (result_dst.type == GLSL_TYPE_BOOL)
2018 emit_asm(ir, TGSI_OPCODE_USNE, result_dst, st_src_reg(result_dst),
2019 st_src_reg_for_int(0));
2020
2021 add_buffer_to_load_and_stores(inst, &cbuf, &this->instructions,
2022 NULL);
2023 } else {
2024 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
2025 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
2026 unsigned const_offset = const_offset_ir ?
2027 const_offset_ir->value.u[0] : 0;
2028 unsigned const_block = const_uniform_block ?
2029 const_uniform_block->value.u[0] + 1 : 1;
2030 st_src_reg index_reg = get_temp(glsl_type::uint_type);
2031 st_src_reg cbuf;
2032
2033 cbuf.type = ir->type->base_type;
2034 cbuf.file = PROGRAM_CONSTANT;
2035 cbuf.index = 0;
2036 cbuf.reladdr = NULL;
2037 cbuf.negate = 0;
2038 cbuf.abs = 0;
2039 cbuf.index2D = const_block;
2040
2041 assert(ir->type->is_vector() || ir->type->is_scalar());
2042
2043 if (const_offset_ir) {
2044 /* Constant index into constant buffer */
2045 cbuf.reladdr = NULL;
2046 cbuf.index = const_offset / 16;
2047 } else {
2048 ir_expression *offset_expr = ir->operands[1]->as_expression();
2049 st_src_reg offset = op[1];
2050
2051 /* The OpenGL spec is written in such a way that accesses with
2052 * non-constant offset are almost always vec4-aligned. The only
2053 * exception to this are members of structs in arrays of structs:
2054 * each struct in an array of structs is at least vec4-aligned,
2055 * but single-element and [ui]vec2 members of the struct may be at
2056 * an offset that is not a multiple of 16 bytes.
2057 *
2058 * Here, we extract that offset, relying on previous passes to
2059 * always generate offset expressions of the form
2060 * (+ expr constant_offset).
2061 *
2062 * Note that the std430 layout, which allows more cases of
2063 * alignment less than vec4 in arrays, is not supported for
2064 * uniform blocks, so we do not have to deal with it here.
2065 */
2066 if (offset_expr && offset_expr->operation == ir_binop_add) {
2067 const_offset_ir = offset_expr->operands[1]->as_constant();
2068 if (const_offset_ir) {
2069 const_offset = const_offset_ir->value.u[0];
2070 cbuf.index = const_offset / 16;
2071 offset_expr->operands[0]->accept(this);
2072 offset = this->result;
2073 }
2074 }
2075
2076 /* Relative/variable index into constant buffer */
2077 emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), offset,
2078 st_src_reg_for_int(4));
2079 cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
2080 *cbuf.reladdr = index_reg;
2081 }
2082
2083 if (const_uniform_block) {
2084 /* Constant constant buffer */
2085 cbuf.reladdr2 = NULL;
2086 } else {
2087 /* Relative/variable constant buffer */
2088 cbuf.reladdr2 = ralloc(mem_ctx, st_src_reg);
2089 *cbuf.reladdr2 = op[0];
2090 }
2091 cbuf.has_index2 = true;
2092
2093 cbuf.swizzle = swizzle_for_size(ir->type->vector_elements);
2094 if (glsl_base_type_is_64bit(cbuf.type))
2095 cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 8,
2096 const_offset % 16 / 8,
2097 const_offset % 16 / 8,
2098 const_offset % 16 / 8);
2099 else
2100 cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 4,
2101 const_offset % 16 / 4,
2102 const_offset % 16 / 4,
2103 const_offset % 16 / 4);
2104
2105 if (ir->type->is_boolean()) {
2106 emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf,
2107 st_src_reg_for_int(0));
2108 } else {
2109 emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
2110 }
2111 }
2112 break;
2113 }
2114 case ir_triop_lrp:
2115 /* note: we have to reorder the three args here */
2116 emit_asm(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
2117 break;
2118 case ir_triop_csel:
2119 if (this->ctx->Const.NativeIntegers)
2120 emit_asm(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
2121 else {
2122 op[0].negate = ~op[0].negate;
2123 emit_asm(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
2124 }
2125 break;
2126 case ir_triop_bitfield_extract:
2127 emit_asm(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
2128 break;
2129 case ir_quadop_bitfield_insert:
2130 emit_asm(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
2131 break;
2132 case ir_unop_bitfield_reverse:
2133 emit_asm(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
2134 break;
2135 case ir_unop_bit_count:
2136 emit_asm(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
2137 break;
2138 case ir_unop_find_msb:
2139 emit_asm(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
2140 break;
2141 case ir_unop_find_lsb:
2142 emit_asm(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
2143 break;
2144 case ir_binop_imul_high:
2145 emit_asm(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
2146 break;
2147 case ir_triop_fma:
2148 /* In theory, MAD is incorrect here. */
2149 if (have_fma)
2150 emit_asm(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
2151 else
2152 emit_asm(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
2153 break;
2154 case ir_unop_interpolate_at_centroid:
2155 emit_asm(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
2156 break;
2157 case ir_binop_interpolate_at_offset: {
2158 /* The y coordinate needs to be flipped for the default fb */
2159 static const gl_state_index16 transform_y_state[STATE_LENGTH]
2160 = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
2161
2162 unsigned transform_y_index =
2163 _mesa_add_state_reference(this->prog->Parameters,
2164 transform_y_state);
2165
2166 st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
2167 transform_y_index,
2168 glsl_type::vec4_type);
2169 transform_y.swizzle = SWIZZLE_XXXX;
2170
2171 st_src_reg temp = get_temp(glsl_type::vec2_type);
2172 st_dst_reg temp_dst = st_dst_reg(temp);
2173
2174 emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[1]);
2175 temp_dst.writemask = WRITEMASK_Y;
2176 emit_asm(ir, TGSI_OPCODE_MUL, temp_dst, transform_y, op[1]);
2177 emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], temp);
2178 break;
2179 }
2180 case ir_binop_interpolate_at_sample:
2181 emit_asm(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
2182 break;
2183
2184 case ir_unop_d2f:
2185 emit_asm(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
2186 break;
2187 case ir_unop_f2d:
2188 emit_asm(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
2189 break;
2190 case ir_unop_d2i:
2191 emit_asm(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
2192 break;
2193 case ir_unop_i2d:
2194 emit_asm(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
2195 break;
2196 case ir_unop_d2u:
2197 emit_asm(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
2198 break;
2199 case ir_unop_u2d:
2200 emit_asm(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
2201 break;
2202 case ir_unop_unpack_double_2x32:
2203 case ir_unop_pack_double_2x32:
2204 case ir_unop_unpack_int_2x32:
2205 case ir_unop_pack_int_2x32:
2206 case ir_unop_unpack_uint_2x32:
2207 case ir_unop_pack_uint_2x32:
2208 case ir_unop_unpack_sampler_2x32:
2209 case ir_unop_pack_sampler_2x32:
2210 case ir_unop_unpack_image_2x32:
2211 case ir_unop_pack_image_2x32:
2212 emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
2213 break;
2214
2215 case ir_binop_ldexp:
2216 if (ir->operands[0]->type->is_double()) {
2217 emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
2218 } else if (ir->operands[0]->type->is_float()) {
2219 emit_asm(ir, TGSI_OPCODE_LDEXP, result_dst, op[0], op[1]);
2220 } else {
2221 assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
2222 }
2223 break;
2224
2225 case ir_unop_pack_half_2x16:
2226 emit_asm(ir, TGSI_OPCODE_PK2H, result_dst, op[0]);
2227 break;
2228 case ir_unop_unpack_half_2x16:
2229 emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
2230 break;
2231
2232 case ir_unop_get_buffer_size: {
2233 ir_constant *const_offset = ir->operands[0]->as_constant();
2234 st_src_reg buffer(
2235 PROGRAM_BUFFER,
2236 const_offset ? const_offset->value.u[0] : 0,
2237 GLSL_TYPE_UINT);
2238 if (!const_offset) {
2239 buffer.reladdr = ralloc(mem_ctx, st_src_reg);
2240 *buffer.reladdr = op[0];
2241 emit_arl(ir, sampler_reladdr, op[0]);
2242 }
2243 emit_asm(ir, TGSI_OPCODE_RESQ, result_dst)->resource = buffer;
2244 break;
2245 }
2246
2247 case ir_unop_u2i64:
2248 case ir_unop_u2u64:
2249 case ir_unop_b2i64: {
2250 st_src_reg temp = get_temp(glsl_type::uvec4_type);
2251 st_dst_reg temp_dst = st_dst_reg(temp);
2252 unsigned orig_swz = op[0].swizzle;
2253 /*
2254 * To convert unsigned to 64-bit:
2255 * zero Y channel, copy X channel.
2256 */
2257 temp_dst.writemask = WRITEMASK_Y;
2258 if (vector_elements > 1)
2259 temp_dst.writemask |= WRITEMASK_W;
2260 emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
2261 temp_dst.writemask = WRITEMASK_X;
2262 if (vector_elements > 1)
2263 temp_dst.writemask |= WRITEMASK_Z;
2264 op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 0), GET_SWZ(orig_swz, 0),
2265 GET_SWZ(orig_swz, 1), GET_SWZ(orig_swz, 1));
2266 if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
2267 emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2268 else
2269 emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0], st_src_reg_for_int(1));
2270 result_src = temp;
2271 result_src.type = GLSL_TYPE_UINT64;
2272 if (vector_elements > 2) {
2273 /* Subtle: We rely on the fact that get_temp here returns the next
2274 * TGSI temporary register directly after the temp register used for
2275 * the first two components, so that the result gets picked up
2276 * automatically.
2277 */
2278 st_src_reg temp = get_temp(glsl_type::uvec4_type);
2279 st_dst_reg temp_dst = st_dst_reg(temp);
2280 temp_dst.writemask = WRITEMASK_Y;
2281 if (vector_elements > 3)
2282 temp_dst.writemask |= WRITEMASK_W;
2283 emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
2284
2285 temp_dst.writemask = WRITEMASK_X;
2286 if (vector_elements > 3)
2287 temp_dst.writemask |= WRITEMASK_Z;
2288 op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 2),
2289 GET_SWZ(orig_swz, 2),
2290 GET_SWZ(orig_swz, 3),
2291 GET_SWZ(orig_swz, 3));
2292 if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
2293 emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2294 else
2295 emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0],
2296 st_src_reg_for_int(1));
2297 }
2298 break;
2299 }
2300 case ir_unop_i642i:
2301 case ir_unop_u642i:
2302 case ir_unop_u642u:
2303 case ir_unop_i642u: {
2304 st_src_reg temp = get_temp(glsl_type::uvec4_type);
2305 st_dst_reg temp_dst = st_dst_reg(temp);
2306 unsigned orig_swz = op[0].swizzle;
2307 unsigned orig_idx = op[0].index;
2308 int el;
2309 temp_dst.writemask = WRITEMASK_X;
2310
2311 for (el = 0; el < vector_elements; el++) {
2312 unsigned swz = GET_SWZ(orig_swz, el);
2313 if (swz & 1)
2314 op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_Z,
2315 SWIZZLE_Z, SWIZZLE_Z);
2316 else
2317 op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X,
2318 SWIZZLE_X, SWIZZLE_X);
2319 if (swz > 2)
2320 op[0].index = orig_idx + 1;
2321 op[0].type = GLSL_TYPE_UINT;
2322 temp_dst.writemask = WRITEMASK_X << el;
2323 emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
2324 }
2325 result_src = temp;
2326 if (ir->operation == ir_unop_u642u || ir->operation == ir_unop_i642u)
2327 result_src.type = GLSL_TYPE_UINT;
2328 else
2329 result_src.type = GLSL_TYPE_INT;
2330 break;
2331 }
2332 case ir_unop_i642b:
2333 emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0],
2334 st_src_reg_for_int64(0));
2335 break;
2336 case ir_unop_i642f:
2337 emit_asm(ir, TGSI_OPCODE_I642F, result_dst, op[0]);
2338 break;
2339 case ir_unop_u642f:
2340 emit_asm(ir, TGSI_OPCODE_U642F, result_dst, op[0]);
2341 break;
2342 case ir_unop_i642d:
2343 emit_asm(ir, TGSI_OPCODE_I642D, result_dst, op[0]);
2344 break;
2345 case ir_unop_u642d:
2346 emit_asm(ir, TGSI_OPCODE_U642D, result_dst, op[0]);
2347 break;
2348 case ir_unop_i2i64:
2349 emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
2350 break;
2351 case ir_unop_f2i64:
2352 emit_asm(ir, TGSI_OPCODE_F2I64, result_dst, op[0]);
2353 break;
2354 case ir_unop_d2i64:
2355 emit_asm(ir, TGSI_OPCODE_D2I64, result_dst, op[0]);
2356 break;
2357 case ir_unop_i2u64:
2358 emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
2359 break;
2360 case ir_unop_f2u64:
2361 emit_asm(ir, TGSI_OPCODE_F2U64, result_dst, op[0]);
2362 break;
2363 case ir_unop_d2u64:
2364 emit_asm(ir, TGSI_OPCODE_D2U64, result_dst, op[0]);
2365 break;
2366 /* these might be needed */
2367 case ir_unop_pack_snorm_2x16:
2368 case ir_unop_pack_unorm_2x16:
2369 case ir_unop_pack_snorm_4x8:
2370 case ir_unop_pack_unorm_4x8:
2371
2372 case ir_unop_unpack_snorm_2x16:
2373 case ir_unop_unpack_unorm_2x16:
2374 case ir_unop_unpack_snorm_4x8:
2375 case ir_unop_unpack_unorm_4x8:
2376
2377 case ir_quadop_vector:
2378 case ir_binop_vector_extract:
2379 case ir_triop_vector_insert:
2380 case ir_binop_carry:
2381 case ir_binop_borrow:
2382 case ir_unop_ssbo_unsized_array_length:
2383 case ir_unop_atan:
2384 case ir_binop_atan2:
2385 case ir_unop_clz:
2386 case ir_binop_add_sat:
2387 case ir_binop_sub_sat:
2388 case ir_binop_abs_sub:
2389 case ir_binop_avg:
2390 case ir_binop_avg_round:
2391 case ir_binop_mul_32x16:
2392 case ir_unop_f162f:
2393 case ir_unop_f2f16:
2394 case ir_unop_f2fmp:
2395 case ir_unop_f162b:
2396 case ir_unop_b2f16:
2397 case ir_unop_i2i:
2398 case ir_unop_i2imp:
2399 case ir_unop_u2u:
2400 case ir_unop_u2ump:
2401 /* This operation is not supported, or should have already been handled.
2402 */
2403 assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
2404 break;
2405 }
2406
2407 this->result = result_src;
2408 }
2409
2410
2411 void
2412 glsl_to_tgsi_visitor::visit(ir_swizzle *ir)
2413 {
2414 st_src_reg src;
2415 int i;
2416 int swizzle[4] = {0};
2417
2418 /* Note that this is only swizzles in expressions, not those on the left
2419 * hand side of an assignment, which do write masking. See ir_assignment
2420 * for that.
2421 */
2422
2423 ir->val->accept(this);
2424 src = this->result;
2425 assert(src.file != PROGRAM_UNDEFINED);
2426 assert(ir->type->vector_elements > 0);
2427
2428 for (i = 0; i < 4; i++) {
2429 if (i < ir->type->vector_elements) {
2430 switch (i) {
2431 case 0:
2432 swizzle[i] = GET_SWZ(src.swizzle, ir->mask.x);
2433 break;
2434 case 1:
2435 swizzle[i] = GET_SWZ(src.swizzle, ir->mask.y);
2436 break;
2437 case 2:
2438 swizzle[i] = GET_SWZ(src.swizzle, ir->mask.z);
2439 break;
2440 case 3:
2441 swizzle[i] = GET_SWZ(src.swizzle, ir->mask.w);
2442 break;
2443 }
2444 } else {
2445 /* If the type is smaller than a vec4, replicate the last
2446 * channel out.
2447 */
2448 swizzle[i] = swizzle[ir->type->vector_elements - 1];
2449 }
2450 }
2451
2452 src.swizzle = MAKE_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2453
2454 this->result = src;
2455 }
2456
2457 /* Test if the variable is an array. Note that geometry and
2458 * tessellation shader inputs are outputs are always arrays (except
2459 * for patch inputs), so only the array element type is considered.
2460 */
2461 static bool
2462 is_inout_array(unsigned stage, ir_variable *var, bool *remove_array)
2463 {
2464 const glsl_type *type = var->type;
2465
2466 *remove_array = false;
2467
2468 if ((stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
2469 (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out))
2470 return false;
2471
2472 if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) ||
2473 (stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) ||
2474 stage == MESA_SHADER_TESS_CTRL) &&
2475 !var->data.patch) {
2476 if (!var->type->is_array())
2477 return false; /* a system value probably */
2478
2479 type = var->type->fields.array;
2480 *remove_array = true;
2481 }
2482
2483 return type->is_array() || type->is_matrix();
2484 }
2485
2486 static unsigned
2487 st_translate_interp_loc(ir_variable *var)
2488 {
2489 if (var->data.centroid)
2490 return TGSI_INTERPOLATE_LOC_CENTROID;
2491 else if (var->data.sample)
2492 return TGSI_INTERPOLATE_LOC_SAMPLE;
2493 else
2494 return TGSI_INTERPOLATE_LOC_CENTER;
2495 }
2496
2497 void
2498 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
2499 {
2500 variable_storage *entry;
2501 ir_variable *var = ir->var;
2502 bool remove_array;
2503
2504 if (handle_bound_deref(ir->as_dereference()))
2505 return;
2506
2507 entry = find_variable_storage(ir->var);
2508
2509 if (!entry) {
2510 switch (var->data.mode) {
2511 case ir_var_uniform:
2512 entry = new(mem_ctx) variable_storage(var, PROGRAM_UNIFORM,
2513 var->data.param_index);
2514 _mesa_hash_table_insert(this->variables, var, entry);
2515 break;
2516 case ir_var_shader_in: {
2517 /* The linker assigns locations for varyings and attributes,
2518 * including deprecated builtins (like gl_Color), user-assign
2519 * generic attributes (glBindVertexLocation), and
2520 * user-defined varyings.
2521 */
2522 assert(var->data.location != -1);
2523
2524 const glsl_type *type_without_array = var->type->without_array();
2525 struct inout_decl *decl = &inputs[num_inputs];
2526 unsigned component = var->data.location_frac;
2527 unsigned num_components;
2528 num_inputs++;
2529
2530 if (type_without_array->is_64bit())
2531 component = component / 2;
2532 if (type_without_array->vector_elements)
2533 num_components = type_without_array->vector_elements;
2534 else
2535 num_components = 4;
2536
2537 decl->mesa_index = var->data.location;
2538 decl->interp = (glsl_interp_mode) var->data.interpolation;
2539 decl->interp_loc = st_translate_interp_loc(var);
2540 decl->base_type = type_without_array->base_type;
2541 decl->usage_mask = u_bit_consecutive(component, num_components);
2542
2543 if (is_inout_array(shader->Stage, var, &remove_array)) {
2544 decl->array_id = num_input_arrays + 1;
2545 num_input_arrays++;
2546 } else {
2547 decl->array_id = 0;
2548 }
2549
2550 if (remove_array)
2551 decl->size = type_size(var->type->fields.array);
2552 else
2553 decl->size = type_size(var->type);
2554
2555 entry = new(mem_ctx) variable_storage(var,
2556 PROGRAM_INPUT,
2557 decl->mesa_index,
2558 decl->array_id);
2559 entry->component = component;
2560
2561 _mesa_hash_table_insert(this->variables, var, entry);
2562
2563 break;
2564 }
2565 case ir_var_shader_out: {
2566 assert(var->data.location != -1);
2567
2568 const glsl_type *type_without_array = var->type->without_array();
2569 struct inout_decl *decl = &outputs[num_outputs];
2570 unsigned component = var->data.location_frac;
2571 unsigned num_components;
2572 num_outputs++;
2573
2574 decl->invariant = var->data.invariant;
2575
2576 if (type_without_array->is_64bit())
2577 component = component / 2;
2578 if (type_without_array->vector_elements)
2579 num_components = type_without_array->vector_elements;
2580 else
2581 num_components = 4;
2582
2583 decl->mesa_index = var->data.location + FRAG_RESULT_MAX * var->data.index;
2584 decl->base_type = type_without_array->base_type;
2585 decl->usage_mask = u_bit_consecutive(component, num_components);
2586 if (var->data.stream & (1u << 31)) {
2587 decl->gs_out_streams = var->data.stream & ~(1u << 31);
2588 } else {
2589 assert(var->data.stream < 4);
2590 decl->gs_out_streams = 0;
2591 for (unsigned i = 0; i < num_components; ++i)
2592 decl->gs_out_streams |= var->data.stream << (2 * (component + i));
2593 }
2594
2595 if (is_inout_array(shader->Stage, var, &remove_array)) {
2596 decl->array_id = num_output_arrays + 1;
2597 num_output_arrays++;
2598 } else {
2599 decl->array_id = 0;
2600 }
2601
2602 if (remove_array)
2603 decl->size = type_size(var->type->fields.array);
2604 else
2605 decl->size = type_size(var->type);
2606
2607 if (var->data.fb_fetch_output) {
2608 st_dst_reg dst = st_dst_reg(get_temp(var->type));
2609 st_src_reg src = st_src_reg(PROGRAM_OUTPUT, decl->mesa_index,
2610 var->type, component, decl->array_id);
2611 emit_asm(NULL, TGSI_OPCODE_FBFETCH, dst, src);
2612 entry = new(mem_ctx) variable_storage(var, dst.file, dst.index,
2613 dst.array_id);
2614 } else {
2615 entry = new(mem_ctx) variable_storage(var,
2616 PROGRAM_OUTPUT,
2617 decl->mesa_index,
2618 decl->array_id);
2619 }
2620 entry->component = component;
2621
2622 _mesa_hash_table_insert(this->variables, var, entry);
2623
2624 break;
2625 }
2626 case ir_var_system_value:
2627 entry = new(mem_ctx) variable_storage(var,
2628 PROGRAM_SYSTEM_VALUE,
2629 var->data.location);
2630 break;
2631 case ir_var_auto:
2632 case ir_var_temporary:
2633 st_src_reg src = get_temp(var->type);
2634
2635 entry = new(mem_ctx) variable_storage(var, src.file, src.index,
2636 src.array_id);
2637 _mesa_hash_table_insert(this->variables, var, entry);
2638
2639 break;
2640 }
2641
2642 if (!entry) {
2643 printf("Failed to make storage for %s\n", var->name);
2644 exit(1);
2645 }
2646 }
2647
2648 this->result = st_src_reg(entry->file, entry->index, var->type,
2649 entry->component, entry->array_id);
2650 if (this->shader->Stage == MESA_SHADER_VERTEX &&
2651 var->data.mode == ir_var_shader_in &&
2652 var->type->without_array()->is_double())
2653 this->result.is_double_vertex_input = true;
2654 if (!native_integers)
2655 this->result.type = GLSL_TYPE_FLOAT;
2656 }
2657
2658 static void
2659 shrink_array_declarations(struct inout_decl *decls, unsigned count,
2660 GLbitfield64* usage_mask,
2661 GLbitfield64 double_usage_mask,
2662 GLbitfield* patch_usage_mask)
2663 {
2664 unsigned i;
2665 int j;
2666
2667 /* Fix array declarations by removing unused array elements at both ends
2668 * of the arrays. For example, mat4[3] where only mat[1] is used.
2669 */
2670 for (i = 0; i < count; i++) {
2671 struct inout_decl *decl = &decls[i];
2672 if (!decl->array_id)
2673 continue;
2674
2675 /* Shrink the beginning. */
2676 for (j = 0; j < (int)decl->size; j++) {
2677 if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2678 if (*patch_usage_mask &
2679 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2680 break;
2681 }
2682 else {
2683 if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2684 break;
2685 if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2686 break;
2687 }
2688
2689 decl->mesa_index++;
2690 decl->size--;
2691 j--;
2692 }
2693
2694 /* Shrink the end. */
2695 for (j = decl->size-1; j >= 0; j--) {
2696 if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
2697 if (*patch_usage_mask &
2698 BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
2699 break;
2700 }
2701 else {
2702 if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
2703 break;
2704 if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
2705 break;
2706 }
2707
2708 decl->size--;
2709 }
2710
2711 /* When not all entries of an array are accessed, we mark them as used
2712 * here anyway, to ensure that the input/output mapping logic doesn't get
2713 * confused.
2714 *
2715 * TODO This happens when an array isn't used via indirect access, which
2716 * some game ports do (at least eON-based). There is an optimization
2717 * opportunity here by replacing the array declaration with non-array
2718 * declarations of those slots that are actually used.
2719 */
2720 for (j = 1; j < (int)decl->size; ++j) {
2721 if (decl->mesa_index >= VARYING_SLOT_PATCH0)
2722 *patch_usage_mask |= BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j);
2723 else
2724 *usage_mask |= BITFIELD64_BIT(decl->mesa_index + j);
2725 }
2726 }
2727 }
2728
2729
2730 static void
2731 mark_array_io(struct inout_decl *decls, unsigned count,
2732 GLbitfield64* usage_mask,
2733 GLbitfield64 double_usage_mask,
2734 GLbitfield* patch_usage_mask)
2735 {
2736 unsigned i;
2737 int j;
2738
2739 /* Fix array declarations by removing unused array elements at both ends
2740 * of the arrays. For example, mat4[3] where only mat[1] is used.
2741 */
2742 for (i = 0; i < count; i++) {
2743 struct inout_decl *decl = &decls[i];
2744 if (!decl->array_id)
2745 continue;
2746
2747 /* When not all entries of an array are accessed, we mark them as used
2748 * here anyway, to ensure that the input/output mapping logic doesn't get
2749 * confused.
2750 *
2751 * TODO This happens when an array isn't used via indirect access, which
2752 * some game ports do (at least eON-based). There is an optimization
2753 * opportunity here by replacing the array declaration with non-array
2754 * declarations of those slots that are actually used.
2755 */
2756 for (j = 0; j < (int)decl->size; ++j) {
2757 if (decl->mesa_index >= VARYING_SLOT_PATCH0)
2758 *patch_usage_mask |= BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j);
2759 else
2760 *usage_mask |= BITFIELD64_BIT(decl->mesa_index + j);
2761 }
2762 }
2763 }
2764
2765 void
2766 glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
2767 {
2768 ir_constant *index;
2769 st_src_reg src;
2770 bool is_2D = false;
2771 ir_variable *var = ir->variable_referenced();
2772
2773 if (handle_bound_deref(ir->as_dereference()))
2774 return;
2775
2776 /* We only need the logic provided by count_vec4_slots()
2777 * for arrays of structs. Indirect sampler and image indexing is handled
2778 * elsewhere.
2779 */
2780 int element_size = ir->type->without_array()->is_struct() ?
2781 ir->type->count_vec4_slots(false, var->data.bindless) :
2782 type_size(ir->type);
2783
2784 index = ir->array_index->constant_expression_value(ralloc_parent(ir));
2785
2786 ir->array->accept(this);
2787 src = this->result;
2788
2789 if (!src.has_index2) {
2790 switch (this->prog->Target) {
2791 case GL_TESS_CONTROL_PROGRAM_NV:
2792 is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
2793 !ir->variable_referenced()->data.patch;
2794 break;
2795 case GL_TESS_EVALUATION_PROGRAM_NV:
2796 is_2D = src.file == PROGRAM_INPUT &&
2797 !ir->variable_referenced()->data.patch;
2798 break;
2799 case GL_GEOMETRY_PROGRAM_NV:
2800 is_2D = src.file == PROGRAM_INPUT;
2801 break;
2802 }
2803 }
2804
2805 if (is_2D)
2806 element_size = 1;
2807
2808 if (index) {
2809
2810 if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
2811 src.file == PROGRAM_INPUT)
2812 element_size = attrib_type_size(ir->type, true);
2813 if (is_2D) {
2814 src.index2D = index->value.i[0];
2815 src.has_index2 = true;
2816 } else
2817 src.index += index->value.i[0] * element_size;
2818 } else {
2819 /* Variable index array dereference. It eats the "vec4" of the
2820 * base of the array and an index that offsets the TGSI register
2821 * index.
2822 */
2823 ir->array_index->accept(this);
2824
2825 st_src_reg index_reg;
2826
2827 if (element_size == 1) {
2828 index_reg = this->result;
2829 } else {
2830 index_reg = get_temp(native_integers ?
2831 glsl_type::int_type : glsl_type::float_type);
2832
2833 emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
2834 this->result, st_src_reg_for_type(index_reg.type, element_size));
2835 }
2836
2837 /* If there was already a relative address register involved, add the
2838 * new and the old together to get the new offset.
2839 */
2840 if (!is_2D && src.reladdr != NULL) {
2841 st_src_reg accum_reg = get_temp(native_integers ?
2842 glsl_type::int_type : glsl_type::float_type);
2843
2844 emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
2845 index_reg, *src.reladdr);
2846
2847 index_reg = accum_reg;
2848 }
2849
2850 if (is_2D) {
2851 src.reladdr2 = ralloc(mem_ctx, st_src_reg);
2852 *src.reladdr2 = index_reg;
2853 src.index2D = 0;
2854 src.has_index2 = true;
2855 } else {
2856 src.reladdr = ralloc(mem_ctx, st_src_reg);
2857 *src.reladdr = index_reg;
2858 }
2859 }
2860
2861 /* Change the register type to the element type of the array. */
2862 src.type = ir->type->base_type;
2863
2864 this->result = src;
2865 }
2866
2867 void
2868 glsl_to_tgsi_visitor::visit(ir_dereference_record *ir)
2869 {
2870 unsigned int i;
2871 const glsl_type *struct_type = ir->record->type;
2872 ir_variable *var = ir->record->variable_referenced();
2873 int offset = 0;
2874
2875 if (handle_bound_deref(ir->as_dereference()))
2876 return;
2877
2878 ir->record->accept(this);
2879
2880 assert(ir->field_idx >= 0);
2881 assert(var);
2882 for (i = 0; i < struct_type->length; i++) {
2883 if (i == (unsigned) ir->field_idx)
2884 break;
2885 const glsl_type *member_type = struct_type->fields.structure[i].type;
2886 offset += member_type->count_vec4_slots(false, var->data.bindless);
2887 }
2888
2889 /* If the type is smaller than a vec4, replicate the last channel out. */
2890 if (ir->type->is_scalar() || ir->type->is_vector())
2891 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2892 else
2893 this->result.swizzle = SWIZZLE_NOOP;
2894
2895 this->result.index += offset;
2896 this->result.type = ir->type->base_type;
2897 }
2898
2899 /**
2900 * We want to be careful in assignment setup to hit the actual storage
2901 * instead of potentially using a temporary like we might with the
2902 * ir_dereference handler.
2903 */
2904 static st_dst_reg
2905 get_assignment_lhs(ir_dereference *ir, glsl_to_tgsi_visitor *v, int *component)
2906 {
2907 /* The LHS must be a dereference. If the LHS is a variable indexed array
2908 * access of a vector, it must be separated into a series conditional moves
2909 * before reaching this point (see ir_vec_index_to_cond_assign).
2910 */
2911 assert(ir->as_dereference());
2912 ir_dereference_array *deref_array = ir->as_dereference_array();
2913 if (deref_array) {
2914 assert(!deref_array->array->type->is_vector());
2915 }
2916
2917 /* Use the rvalue deref handler for the most part. We write swizzles using
2918 * the writemask, but we do extract the base component for enhanced layouts
2919 * from the source swizzle.
2920 */
2921 ir->accept(v);
2922 *component = GET_SWZ(v->result.swizzle, 0);
2923 return st_dst_reg(v->result);
2924 }
2925
2926 /**
2927 * Process the condition of a conditional assignment
2928 *
2929 * Examines the condition of a conditional assignment to generate the optimal
2930 * first operand of a \c CMP instruction. If the condition is a relational
2931 * operator with 0 (e.g., \c ir_binop_less), the value being compared will be
2932 * used as the source for the \c CMP instruction. Otherwise the comparison
2933 * is processed to a boolean result, and the boolean result is used as the
2934 * operand to the CMP instruction.
2935 */
2936 bool
2937 glsl_to_tgsi_visitor::process_move_condition(ir_rvalue *ir)
2938 {
2939 ir_rvalue *src_ir = ir;
2940 bool negate = true;
2941 bool switch_order = false;
2942
2943 ir_expression *const expr = ir->as_expression();
2944
2945 if (native_integers) {
2946 if ((expr != NULL) && (expr->num_operands == 2)) {
2947 enum glsl_base_type type = expr->operands[0]->type->base_type;
2948 if (type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT ||
2949 type == GLSL_TYPE_BOOL) {
2950 if (expr->operation == ir_binop_equal) {
2951 if (expr->operands[0]->is_zero()) {
2952 src_ir = expr->operands[1];
2953 switch_order = true;
2954 }
2955 else if (expr->operands[1]->is_zero()) {
2956 src_ir = expr->operands[0];
2957 switch_order = true;
2958 }
2959 }
2960 else if (expr->operation == ir_binop_nequal) {
2961 if (expr->operands[0]->is_zero()) {
2962 src_ir = expr->operands[1];
2963 }
2964 else if (expr->operands[1]->is_zero()) {
2965 src_ir = expr->operands[0];
2966 }
2967 }
2968 }
2969 }
2970
2971 src_ir->accept(this);
2972 return switch_order;
2973 }
2974
2975 if ((expr != NULL) && (expr->num_operands == 2)) {
2976 bool zero_on_left = false;
2977
2978 if (expr->operands[0]->is_zero()) {
2979 src_ir = expr->operands[1];
2980 zero_on_left = true;
2981 } else if (expr->operands[1]->is_zero()) {
2982 src_ir = expr->operands[0];
2983 zero_on_left = false;
2984 }
2985
2986 /* a is - 0 + - 0 +
2987 * (a < 0) T F F ( a < 0) T F F
2988 * (0 < a) F F T (-a < 0) F F T
2989 * (a >= 0) F T T ( a < 0) T F F (swap order of other operands)
2990 * (0 >= a) T T F (-a < 0) F F T (swap order of other operands)
2991