src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
132 src0, src1, src2); \
133 }
134
135 ALU1(NOT)
136 ALU1(MOV)
137 ALU1(FRC)
138 ALU1(RNDD)
139 ALU1(RNDE)
140 ALU1(RNDZ)
141 ALU1(F32TO16)
142 ALU1(F16TO32)
143 ALU2(ADD)
144 ALU2(MUL)
145 ALU2(MACH)
146 ALU2(AND)
147 ALU2(OR)
148 ALU2(XOR)
149 ALU2(DP3)
150 ALU2(DP4)
151 ALU2(DPH)
152 ALU2(SHL)
153 ALU2(SHR)
154 ALU2(ASR)
155 ALU3(LRP)
156 ALU1(BFREV)
157 ALU3(BFE)
158 ALU2(BFI1)
159 ALU3(BFI2)
160 ALU1(FBH)
161 ALU1(FBL)
162 ALU1(CBIT)
163 ALU3(MAD)
164 ALU2(ADDC)
165 ALU2(SUBB)
166
167 /** Gen4 predicated IF. */
168 vec4_instruction *
169 vec4_visitor::IF(uint32_t predicate)
170 {
171 vec4_instruction *inst;
172
173 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
174 inst->predicate = predicate;
175
176 return inst;
177 }
178
179 /** Gen6 IF with embedded comparison. */
180 vec4_instruction *
181 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
182 {
183 assert(brw->gen == 6);
184
185 vec4_instruction *inst;
186
187 resolve_ud_negate(&src0);
188 resolve_ud_negate(&src1);
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
191 src0, src1);
192 inst->conditional_mod = condition;
193
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 vec4_instruction *
203 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
204 {
205 vec4_instruction *inst;
206
207 /* original gen4 does type conversion to the destination type
208 * before comparison, producing garbage results for floating
209 * point comparisons.
210 */
211 if (brw->gen == 4) {
212 dst.type = src0.type;
213 if (dst.file == HW_REG)
214 dst.fixed_hw_reg.type = dst.type;
215 }
216
217 resolve_ud_negate(&src0);
218 resolve_ud_negate(&src1);
219
220 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
221 inst->conditional_mod = condition;
222
223 return inst;
224 }
225
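/**
 * Scratch reads and writes go out as messages, so base_mrf and mlen below
 * reserve the MRF range used for the message header and payload.
 */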
226 vec4_instruction *
227 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
228 {
229 vec4_instruction *inst;
230
231 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
232 dst, index);
233 inst->base_mrf = 14;
234 inst->mlen = 2;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
245 dst, src, index);
246 inst->base_mrf = 13;
247 inst->mlen = 3;
248
249 return inst;
250 }
251
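/**
 * Emit a dot product over the first 'elements' components (2, 3, or 4),
 * mapping to DP2, DP3, or DP4 respectively.
 */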
252 void
253 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
254 {
255 static enum opcode dot_opcodes[] = {
256 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
257 };
258
259 emit(dot_opcodes[elements - 2], dst, src0, src1);
260 }
261
262 src_reg
263 vec4_visitor::fix_3src_operand(src_reg src)
264 {
265 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
266 * able to use vertical stride of zero to replicate the vec4 uniform, like
267 *
268 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
269 *
270 * But you can't, since vertical stride is always four in three-source
271 * instructions. Instead, insert a MOV instruction to do the replication so
272 * that the three-source instruction can consume it.
273 */
274
275 /* The MOV is only needed if the source is a uniform or immediate. */
276 if (src.file != UNIFORM && src.file != IMM)
277 return src;
278
279 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
280 expanded.type = src.type;
281 emit(MOV(expanded, src));
282 return src_reg(expanded);
283 }
284
285 src_reg
286 vec4_visitor::fix_math_operand(src_reg src)
287 {
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description.
291 *
292 * Rather than trying to enumerate all these cases, *always* expand the
293 * operand to a temp GRF for gen6.
294 *
295 * For gen7, keep the operand as-is, except if immediate, which gen7 still
296 * can't use.
297 */
298
299 if (brw->gen == 7 && src.file != IMM)
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
307
308 void
309 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
310 {
311 src = fix_math_operand(src);
312
313 if (dst.writemask != WRITEMASK_XYZW) {
314 /* The gen6 math instruction must be align1, so we can't do
315 * writemasks.
316 */
317 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
318
319 emit(opcode, temp_dst, src);
320
321 emit(MOV(dst, src_reg(temp_dst)));
322 } else {
323 emit(opcode, dst, src);
324 }
325 }
326
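/**
 * Pre-gen6, the math instruction takes its operand through the MRF file:
 * base_mrf/mlen reserve m1 as a one-register message payload.
 */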
327 void
328 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
329 {
330 vec4_instruction *inst = emit(opcode, dst, src);
331 inst->base_mrf = 1;
332 inst->mlen = 1;
333 }
334
335 void
336 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
337 {
338 switch (opcode) {
339 case SHADER_OPCODE_RCP:
340 case SHADER_OPCODE_RSQ:
341 case SHADER_OPCODE_SQRT:
342 case SHADER_OPCODE_EXP2:
343 case SHADER_OPCODE_LOG2:
344 case SHADER_OPCODE_SIN:
345 case SHADER_OPCODE_COS:
346 break;
347 default:
348 assert(!"not reached: bad math opcode");
349 return;
350 }
351
352 if (brw->gen >= 6) {
353 return emit_math1_gen6(opcode, dst, src);
354 } else {
355 return emit_math1_gen4(opcode, dst, src);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen6(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 src0 = fix_math_operand(src0);
364 src1 = fix_math_operand(src1);
365
366 if (dst.writemask != WRITEMASK_XYZW) {
367 /* The gen6 math instruction must be align1, so we can't do
368 * writemasks.
369 */
370 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
371 temp_dst.type = dst.type;
372
373 emit(opcode, temp_dst, src0, src1);
374
375 emit(MOV(dst, src_reg(temp_dst)));
376 } else {
377 emit(opcode, dst, src0, src1);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen4(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 vec4_instruction *inst = emit(opcode, dst, src0, src1);
386 inst->base_mrf = 1;
387 inst->mlen = 2;
388 }
389
390 void
391 vec4_visitor::emit_math(enum opcode opcode,
392 dst_reg dst, src_reg src0, src_reg src1)
393 {
394 switch (opcode) {
395 case SHADER_OPCODE_POW:
396 case SHADER_OPCODE_INT_QUOTIENT:
397 case SHADER_OPCODE_INT_REMAINDER:
398 break;
399 default:
400 assert(!"not reached: unsupported binary math opcode");
401 return;
402 }
403
404 if (brw->gen >= 6) {
405 return emit_math2_gen6(opcode, dst, src0, src1);
406 } else {
407 return emit_math2_gen4(opcode, dst, src0, src1);
408 }
409 }
410
411 void
412 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
413 {
414 if (brw->gen < 7)
415 assert(!"ir_unop_pack_half_2x16 should be lowered");
416
417 assert(dst.type == BRW_REGISTER_TYPE_UD);
418 assert(src0.type == BRW_REGISTER_TYPE_F);
419
420 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
421 *
422 * Because this instruction does not have a 16-bit floating-point type,
423 * the destination data type must be Word (W).
424 *
425 * The destination must be DWord-aligned and specify a horizontal stride
426 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
427 * each destination channel and the upper word is not modified.
428 *
429 * The above restriction implies that the f32to16 instruction must use
430 * align1 mode, because only in align1 mode is it possible to specify
431 * horizontal stride. We choose here to defy the hardware docs and emit
432 * align16 instructions.
433 *
434 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
435 * instructions. I was partially successful in that the code passed all
436 * tests. However, the code was dubiously correct and fragile, and the
437 * tests were not harsh enough to probe that frailty. Not trusting the
438 * code, I chose instead to remain in align16 mode in defiance of the hw
439 * docs).
440 *
441 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
442 * simulator, emitting a f32to16 in align16 mode with UD as destination
443 * data type is safe. The behavior differs from that specified in the PRM
444 * in that the upper word of each destination channel is cleared to 0.
445 */
446
447 dst_reg tmp_dst(this, glsl_type::uvec2_type);
448 src_reg tmp_src(tmp_dst);
449
450 #if 0
451 /* Verify the undocumented behavior on which the following instructions
452 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
453 * then the result of the bit-or instruction below will be incorrect.
454 *
455 * You should inspect the disasm output in order to verify that the MOV is
456 * not optimized away.
457 */
458 emit(MOV(tmp_dst, src_reg(0x12345678u)));
459 #endif
460
461 /* Give tmp the form below, where "." means untouched.
462 *
463 * w z y x w z y x
464 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
465 *
466 * That the upper word of each write-channel be 0 is required for the
467 * following bit-shift and bit-or instructions to work. Note that this
468 * relies on the undocumented hardware behavior mentioned above.
469 */
470 tmp_dst.writemask = WRITEMASK_XY;
471 emit(F32TO16(tmp_dst, src0));
472
473 /* Give the write-channels of dst the form:
474 * 0xhhhh0000
475 */
476 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
477 emit(SHL(dst, tmp_src, src_reg(16u)));
478
479 /* Finally, give the write-channels of dst the form of packHalf2x16's
480 * output:
481 * 0xhhhhllll
482 */
483 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
484 emit(OR(dst, src_reg(dst), tmp_src));
485 }
486
487 void
488 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
489 {
490 if (brw->gen < 7)
491 assert(!"ir_unop_unpack_half_2x16 should be lowered");
492
493 assert(dst.type == BRW_REGISTER_TYPE_F);
494 assert(src0.type == BRW_REGISTER_TYPE_UD);
495
496 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
497 *
498 * Because this instruction does not have a 16-bit floating-point type,
499 * the source data type must be Word (W). The destination type must be
500 * F (Float).
501 *
502 * To use W as the source data type, we must adjust horizontal strides,
503 * which is only possible in align1 mode. All my [chadv] attempts at
504 * emitting align1 instructions for unpackHalf2x16 failed to pass the
505 * Piglit tests, so I gave up.
506 *
507 * I've verified that, on gen7 hardware and the simulator, it is safe to
508 * emit f16to32 in align16 mode with UD as source data type.
509 */
510
511 dst_reg tmp_dst(this, glsl_type::uvec2_type);
512 src_reg tmp_src(tmp_dst);
513
514 tmp_dst.writemask = WRITEMASK_X;
515 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
516
517 tmp_dst.writemask = WRITEMASK_Y;
518 emit(SHR(tmp_dst, src0, src_reg(16u)));
519
520 dst.writemask = WRITEMASK_XY;
521 emit(F16TO32(dst, tmp_src));
522 }
523
524 void
525 vec4_visitor::visit_instructions(const exec_list *list)
526 {
527 foreach_list(node, list) {
528 ir_instruction *ir = (ir_instruction *)node;
529
530 base_ir = ir;
531 ir->accept(this);
532 }
533 }
534
535
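/**
 * Returns the number of vec4 slots that 'type' occupies in register or
 * uniform storage.
 */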
536 static int
537 type_size(const struct glsl_type *type)
538 {
539 unsigned int i;
540 int size;
541
542 switch (type->base_type) {
543 case GLSL_TYPE_UINT:
544 case GLSL_TYPE_INT:
545 case GLSL_TYPE_FLOAT:
546 case GLSL_TYPE_BOOL:
547 if (type->is_matrix()) {
548 return type->matrix_columns;
549 } else {
550 /* Regardless of the size of the vector, it gets a vec4. This is bad
551 * packing for things like floats, but otherwise arrays become a
552 * mess. Hopefully a later pass over the code can pack scalars
553 * down if appropriate.
554 */
555 return 1;
556 }
557 case GLSL_TYPE_ARRAY:
558 assert(type->length > 0);
559 return type_size(type->fields.array) * type->length;
560 case GLSL_TYPE_STRUCT:
561 size = 0;
562 for (i = 0; i < type->length; i++) {
563 size += type_size(type->fields.structure[i].type);
564 }
565 return size;
566 case GLSL_TYPE_SAMPLER:
567 /* Samplers take up one slot in UNIFORMS[], but they're baked in
568 * at link time.
569 */
570 return 1;
571 case GLSL_TYPE_ATOMIC_UINT:
572 return 0;
573 case GLSL_TYPE_IMAGE:
574 case GLSL_TYPE_VOID:
575 case GLSL_TYPE_ERROR:
576 case GLSL_TYPE_INTERFACE:
577 assert(0);
578 break;
579 }
580
581 return 0;
582 }
583
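/**
 * Allocate a virtual GRF that is 'size' vec4 registers long, growing the
 * bookkeeping arrays as needed, and return its index.
 */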
584 int
585 vec4_visitor::virtual_grf_alloc(int size)
586 {
587 if (virtual_grf_array_size <= virtual_grf_count) {
588 if (virtual_grf_array_size == 0)
589 virtual_grf_array_size = 16;
590 else
591 virtual_grf_array_size *= 2;
592 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
593 virtual_grf_array_size);
594 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
595 virtual_grf_array_size);
596 }
597 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
598 virtual_grf_reg_count += size;
599 virtual_grf_sizes[virtual_grf_count] = size;
600 return virtual_grf_count++;
601 }
602
603 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
604 {
605 init();
606
607 this->file = GRF;
608 this->reg = v->virtual_grf_alloc(type_size(type));
609
610 if (type->is_array() || type->is_record()) {
611 this->swizzle = BRW_SWIZZLE_NOOP;
612 } else {
613 this->swizzle = swizzle_for_size(type->vector_elements);
614 }
615
616 this->type = brw_type_for_base_type(type);
617 }
618
619 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->writemask = WRITEMASK_XYZW;
628 } else {
629 this->writemask = (1 << type->vector_elements) - 1;
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 /* Our support for uniforms is piggy-backed on the struct
636 * gl_fragment_program, because that's where the values actually
637 * get stored, rather than in some global gl_shader_program uniform
638 * store.
639 */
640 void
641 vec4_visitor::setup_uniform_values(ir_variable *ir)
642 {
643 int namelen = strlen(ir->name);
644
645 /* The data for our (non-builtin) uniforms is stored in a series of
646 * gl_uniform_driver_storage structs for each subcomponent that
647 * glGetUniformLocation() could name. We know it's been set up in the same
648 * order we'd walk the type, so walk the list of storage and find anything
649 * with our name, or the prefix of a component that starts with our name.
650 */
651 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
652 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
653
654 if (strncmp(ir->name, storage->name, namelen) != 0 ||
655 (storage->name[namelen] != 0 &&
656 storage->name[namelen] != '.' &&
657 storage->name[namelen] != '[')) {
658 continue;
659 }
660
661 gl_constant_value *components = storage->storage;
662 unsigned vector_count = (MAX2(storage->array_elements, 1) *
663 storage->type->matrix_columns);
664
665 for (unsigned s = 0; s < vector_count; s++) {
666 uniform_vector_size[uniforms] = storage->type->vector_elements;
667
668 int i;
669 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
670 stage_prog_data->param[uniforms * 4 + i] = &components->f;
671 components++;
672 }
673 for (; i < 4; i++) {
674 static float zero = 0;
675 stage_prog_data->param[uniforms * 4 + i] = &zero;
676 }
677
678 uniforms++;
679 }
680 }
681 }
682
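/**
 * Lay out one vec4 uniform per active user clip plane, pointing its param[]
 * entries at the clip plane values selected from the GL context.
 */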
683 void
684 vec4_visitor::setup_uniform_clipplane_values()
685 {
686 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
687
688 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
689 this->uniform_vector_size[this->uniforms] = 4;
690 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
691 this->userplane[i].type = BRW_REGISTER_TYPE_F;
692 for (int j = 0; j < 4; ++j) {
693 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
694 }
695 ++this->uniforms;
696 }
697 }
698
699 /* Our support for builtin uniforms is even scarier than non-builtin.
700 * It sits on top of the PROG_STATE_VAR parameters that are
701 * automatically updated from GL context state.
702 */
703 void
704 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
705 {
706 const ir_state_slot *const slots = ir->state_slots;
707 assert(ir->state_slots != NULL);
708
709 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
710 /* This state reference has already been setup by ir_to_mesa,
711 * but we'll get the same index back here. We can reference
712 * ParameterValues directly, since unlike brw_fs.cpp, we never
713 * add new state references during compile.
714 */
715 int index = _mesa_add_state_reference(this->prog->Parameters,
716 (gl_state_index *)slots[i].tokens);
717 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
718
719 this->uniform_vector_size[this->uniforms] = 0;
720 /* Add each of the unique swizzled channels of the element.
721 * This will end up matching the size of the glsl_type of this field.
722 */
723 int last_swiz = -1;
724 for (unsigned int j = 0; j < 4; j++) {
725 int swiz = GET_SWZ(slots[i].swizzle, j);
726 last_swiz = swiz;
727
728 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
729 if (swiz <= last_swiz)
730 this->uniform_vector_size[this->uniforms]++;
731 }
732 this->uniforms++;
733 }
734 }
735
736 dst_reg *
737 vec4_visitor::variable_storage(ir_variable *var)
738 {
739 return (dst_reg *)hash_table_find(this->variable_ht, var);
740 }
741
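/**
 * Emit instruction(s) evaluating the boolean rvalue 'ir' into the flag
 * register, and report in '*predicate' how a following instruction should be
 * predicated on the result.
 */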
742 void
743 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
744 {
745 ir_expression *expr = ir->as_expression();
746
747 *predicate = BRW_PREDICATE_NORMAL;
748
749 if (expr) {
750 src_reg op[2];
751 vec4_instruction *inst;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757
758 resolve_ud_negate(&op[i]);
759 }
760
761 switch (expr->operation) {
762 case ir_unop_logic_not:
763 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
764 inst->conditional_mod = BRW_CONDITIONAL_Z;
765 break;
766
767 case ir_binop_logic_xor:
768 inst = emit(XOR(dst_null_d(), op[0], op[1]));
769 inst->conditional_mod = BRW_CONDITIONAL_NZ;
770 break;
771
772 case ir_binop_logic_or:
773 inst = emit(OR(dst_null_d(), op[0], op[1]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 break;
776
777 case ir_binop_logic_and:
778 inst = emit(AND(dst_null_d(), op[0], op[1]));
779 inst->conditional_mod = BRW_CONDITIONAL_NZ;
780 break;
781
782 case ir_unop_f2b:
783 if (brw->gen >= 6) {
784 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
785 } else {
786 inst = emit(MOV(dst_null_f(), op[0]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 }
789 break;
790
791 case ir_unop_i2b:
792 if (brw->gen >= 6) {
793 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
794 } else {
795 inst = emit(MOV(dst_null_d(), op[0]));
796 inst->conditional_mod = BRW_CONDITIONAL_NZ;
797 }
798 break;
799
800 case ir_binop_all_equal:
801 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
802 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
803 break;
804
805 case ir_binop_any_nequal:
806 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
807 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
808 break;
809
810 case ir_unop_any:
811 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
812 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
813 break;
814
815 case ir_binop_greater:
816 case ir_binop_gequal:
817 case ir_binop_less:
818 case ir_binop_lequal:
819 case ir_binop_equal:
820 case ir_binop_nequal:
821 emit(CMP(dst_null_d(), op[0], op[1],
822 brw_conditional_for_comparison(expr->operation)));
823 break;
824
825 default:
826 assert(!"not reached");
827 break;
828 }
829 return;
830 }
831
832 ir->accept(this);
833
834 resolve_ud_negate(&this->result);
835
836 if (brw->gen >= 6) {
837 vec4_instruction *inst = emit(AND(dst_null_d(),
838 this->result, src_reg(1)));
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 } else {
841 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 }
845
846 /**
847 * Emit a gen6 IF statement with the comparison folded into the IF
848 * instruction.
849 */
850 void
851 vec4_visitor::emit_if_gen6(ir_if *ir)
852 {
853 ir_expression *expr = ir->condition->as_expression();
854
855 if (expr) {
856 src_reg op[2];
857 dst_reg temp;
858
859 assert(expr->get_num_operands() <= 2);
860 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
861 expr->operands[i]->accept(this);
862 op[i] = this->result;
863 }
864
865 switch (expr->operation) {
866 case ir_unop_logic_not:
867 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
868 return;
869
870 case ir_binop_logic_xor:
871 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
872 return;
873
874 case ir_binop_logic_or:
875 temp = dst_reg(this, glsl_type::bool_type);
876 emit(OR(temp, op[0], op[1]));
877 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
878 return;
879
880 case ir_binop_logic_and:
881 temp = dst_reg(this, glsl_type::bool_type);
882 emit(AND(temp, op[0], op[1]));
883 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_f2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_unop_i2b:
891 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_greater:
895 case ir_binop_gequal:
896 case ir_binop_less:
897 case ir_binop_lequal:
898 case ir_binop_equal:
899 case ir_binop_nequal:
900 emit(IF(op[0], op[1],
901 brw_conditional_for_comparison(expr->operation)));
902 return;
903
904 case ir_binop_all_equal:
905 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
906 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
907 return;
908
909 case ir_binop_any_nequal:
910 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
911 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
912 return;
913
914 case ir_unop_any:
915 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
917 return;
918
919 default:
920 assert(!"not reached");
921 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
922 return;
923 }
924 return;
925 }
926
927 ir->condition->accept(this);
928
929 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
930 }
931
932 void
933 vec4_visitor::visit(ir_variable *ir)
934 {
935 dst_reg *reg = NULL;
936
937 if (variable_storage(ir))
938 return;
939
940 switch (ir->data.mode) {
941 case ir_var_shader_in:
942 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
943 break;
944
945 case ir_var_shader_out:
946 reg = new(mem_ctx) dst_reg(this, ir->type);
947
948 for (int i = 0; i < type_size(ir->type); i++) {
949 output_reg[ir->data.location + i] = *reg;
950 output_reg[ir->data.location + i].reg_offset = i;
951 output_reg[ir->data.location + i].type =
952 brw_type_for_base_type(ir->type->get_scalar_type());
953 output_reg_annotation[ir->data.location + i] = ir->name;
954 }
955 break;
956
957 case ir_var_auto:
958 case ir_var_temporary:
959 reg = new(mem_ctx) dst_reg(this, ir->type);
960 break;
961
962 case ir_var_uniform:
963 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
964
965 /* Thanks to the lower_ubo_reference pass, we will see only
966 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
967 * variables, so no need for them to be in variable_ht.
968 *
969 * Atomic counters take no uniform storage, no need to do
970 * anything here.
971 */
972 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
973 return;
974
975 /* Track how big the whole uniform variable is, in case we need to put a
976 * copy of its data into pull constants for array access.
977 */
978 this->uniform_size[this->uniforms] = type_size(ir->type);
979
980 if (!strncmp(ir->name, "gl_", 3)) {
981 setup_builtin_uniform_values(ir);
982 } else {
983 setup_uniform_values(ir);
984 }
985 break;
986
987 case ir_var_system_value:
988 reg = make_reg_for_system_value(ir);
989 break;
990
991 default:
992 assert(!"not reached");
993 }
994
995 reg->type = brw_type_for_base_type(ir->type);
996 hash_table_insert(this->variable_ht, reg, ir);
997 }
998
999 void
1000 vec4_visitor::visit(ir_loop *ir)
1001 {
1002 /* We don't want debugging output to print the whole body of the
1003 * loop as the annotation.
1004 */
1005 this->base_ir = NULL;
1006
1007 emit(BRW_OPCODE_DO);
1008
1009 visit_instructions(&ir->body_instructions);
1010
1011 emit(BRW_OPCODE_WHILE);
1012 }
1013
1014 void
1015 vec4_visitor::visit(ir_loop_jump *ir)
1016 {
1017 switch (ir->mode) {
1018 case ir_loop_jump::jump_break:
1019 emit(BRW_OPCODE_BREAK);
1020 break;
1021 case ir_loop_jump::jump_continue:
1022 emit(BRW_OPCODE_CONTINUE);
1023 break;
1024 }
1025 }
1026
1027
1028 void
1029 vec4_visitor::visit(ir_function_signature *ir)
1030 {
1031 assert(0);
1032 (void)ir;
1033 }
1034
1035 void
1036 vec4_visitor::visit(ir_function *ir)
1037 {
1038 /* Ignore function bodies other than main() -- we shouldn't see calls to
1039 * them since they should all be inlined.
1040 */
1041 if (strcmp(ir->name, "main") == 0) {
1042 const ir_function_signature *sig;
1043 exec_list empty;
1044
1045 sig = ir->matching_signature(NULL, &empty);
1046
1047 assert(sig);
1048
1049 visit_instructions(&sig->body);
1050 }
1051 }
1052
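/**
 * If 'ir' is a saturate pattern (a clamp of some value to [0, 1]) recognized
 * by the IR, emit the inner value with a saturating MOV and return true.
 */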
1053 bool
1054 vec4_visitor::try_emit_sat(ir_expression *ir)
1055 {
1056 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1057 if (!sat_src)
1058 return false;
1059
1060 sat_src->accept(this);
1061 src_reg src = this->result;
1062
1063 this->result = src_reg(this, ir->type);
1064 vec4_instruction *inst;
1065 inst = emit(MOV(dst_reg(this->result), src));
1066 inst->saturate = true;
1067
1068 return true;
1069 }
1070
1071 bool
1072 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1073 {
1074 /* 3-src instructions were introduced in gen6. */
1075 if (brw->gen < 6)
1076 return false;
1077
1078 /* MAD can only handle floating-point data. */
1079 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1080 return false;
1081
1082 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1083 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1084
1085 if (!mul || mul->operation != ir_binop_mul)
1086 return false;
1087
1088 nonmul->accept(this);
1089 src_reg src0 = fix_3src_operand(this->result);
1090
1091 mul->operands[0]->accept(this);
1092 src_reg src1 = fix_3src_operand(this->result);
1093
1094 mul->operands[1]->accept(this);
1095 src_reg src2 = fix_3src_operand(this->result);
1096
1097 this->result = src_reg(this, ir->type);
1098 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1099
1100 return true;
1101 }
1102
1103 void
1104 vec4_visitor::emit_bool_comparison(unsigned int op,
1105 dst_reg dst, src_reg src0, src_reg src1)
1106 {
1107 /* original gen4 does destination conversion before comparison. */
1108 if (brw->gen < 5)
1109 dst.type = src0.type;
1110
1111 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1112
1113 dst.type = BRW_REGISTER_TYPE_D;
1114 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1115 }
1116
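/**
 * MIN/MAX: gen6+ can use a single SEL with a conditional mod; gen4/5 need a
 * CMP followed by a predicated SEL.
 */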
1117 void
1118 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1119 src_reg src0, src_reg src1)
1120 {
1121 vec4_instruction *inst;
1122
1123 if (brw->gen >= 6) {
1124 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1125 inst->conditional_mod = conditionalmod;
1126 } else {
1127 emit(CMP(dst, src0, src1, conditionalmod));
1128
1129 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1130 inst->predicate = BRW_PREDICATE_NORMAL;
1131 }
1132 }
1133
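/**
 * True if the rvalue is an int/uint constant whose unsigned value fits in 16
 * bits; such a multiplicand lets integer multiplication below be emitted as
 * a single MUL without the MACH fixup.
 */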
1134 static bool
1135 is_16bit_constant(ir_rvalue *rvalue)
1136 {
1137 ir_constant *constant = rvalue->as_constant();
1138 if (!constant)
1139 return false;
1140
1141 if (constant->type != glsl_type::int_type &&
1142 constant->type != glsl_type::uint_type)
1143 return false;
1144
1145 return constant->value.u[0] < (1 << 16);
1146 }
1147
1148 void
1149 vec4_visitor::visit(ir_expression *ir)
1150 {
1151 unsigned int operand;
1152 src_reg op[Elements(ir->operands)];
1153 src_reg result_src;
1154 dst_reg result_dst;
1155 vec4_instruction *inst;
1156
1157 if (try_emit_sat(ir))
1158 return;
1159
1160 if (ir->operation == ir_binop_add) {
1161 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1162 return;
1163 }
1164
1165 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1166 this->result.file = BAD_FILE;
1167 ir->operands[operand]->accept(this);
1168 if (this->result.file == BAD_FILE) {
1169 printf("Failed to get tree for expression operand:\n");
1170 ir->operands[operand]->print();
1171 exit(1);
1172 }
1173 op[operand] = this->result;
1174
1175 /* Matrix expression operands should have been broken down to vector
1176 * operations already.
1177 */
1178 assert(!ir->operands[operand]->type->is_matrix());
1179 }
1180
1181 int vector_elements = ir->operands[0]->type->vector_elements;
1182 if (ir->operands[1]) {
1183 vector_elements = MAX2(vector_elements,
1184 ir->operands[1]->type->vector_elements);
1185 }
1186
1187 this->result.file = BAD_FILE;
1188
1189 /* Storage for our result. Ideally for an assignment we'd be using
1190 * the actual storage for the result here, instead.
1191 */
1192 result_src = src_reg(this, ir->type);
1193 /* convenience for the emit functions below. */
1194 result_dst = dst_reg(result_src);
1195 /* If nothing special happens, this is the result. */
1196 this->result = result_src;
1197 /* Limit writes to the channels that will be used by result_src later.
1198 * This does limit this temp's use as a temporary for multi-instruction
1199 * sequences.
1200 */
1201 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1202
1203 switch (ir->operation) {
1204 case ir_unop_logic_not:
1205 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1206 * a one's complement of the whole register, not just bit 0.
1207 */
1208 emit(XOR(result_dst, op[0], src_reg(1)));
1209 break;
1210 case ir_unop_neg:
1211 op[0].negate = !op[0].negate;
1212 emit(MOV(result_dst, op[0]));
1213 break;
1214 case ir_unop_abs:
1215 op[0].abs = true;
1216 op[0].negate = false;
1217 emit(MOV(result_dst, op[0]));
1218 break;
1219
1220 case ir_unop_sign:
1221 if (ir->type->is_float()) {
1222 /* AND(val, 0x80000000) gives the sign bit.
1223 *
1224 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1225 * zero.
1226 */
1227 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1228
1229 op[0].type = BRW_REGISTER_TYPE_UD;
1230 result_dst.type = BRW_REGISTER_TYPE_UD;
1231 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1232
1233 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1234 inst->predicate = BRW_PREDICATE_NORMAL;
1235
1236 this->result.type = BRW_REGISTER_TYPE_F;
1237 } else {
1238 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1239 * -> non-negative val generates 0x00000000.
1240 * Predicated OR sets 1 if val is positive.
1241 */
1242 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1243
1244 emit(ASR(result_dst, op[0], src_reg(31)));
1245
1246 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1247 inst->predicate = BRW_PREDICATE_NORMAL;
1248 }
1249 break;
1250
1251 case ir_unop_rcp:
1252 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1253 break;
1254
1255 case ir_unop_exp2:
1256 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1257 break;
1258 case ir_unop_log2:
1259 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1260 break;
1261 case ir_unop_exp:
1262 case ir_unop_log:
1263 assert(!"not reached: should be handled by ir_explog_to_explog2");
1264 break;
1265 case ir_unop_sin:
1266 case ir_unop_sin_reduced:
1267 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1268 break;
1269 case ir_unop_cos:
1270 case ir_unop_cos_reduced:
1271 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1272 break;
1273
1274 case ir_unop_dFdx:
1275 case ir_unop_dFdy:
1276 assert(!"derivatives not valid in vertex shader");
1277 break;
1278
1279 case ir_unop_bitfield_reverse:
1280 emit(BFREV(result_dst, op[0]));
1281 break;
1282 case ir_unop_bit_count:
1283 emit(CBIT(result_dst, op[0]));
1284 break;
1285 case ir_unop_find_msb: {
1286 src_reg temp = src_reg(this, glsl_type::uint_type);
1287
1288 inst = emit(FBH(dst_reg(temp), op[0]));
1289 inst->dst.writemask = WRITEMASK_XYZW;
1290
1291 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1292 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1293 * subtract the result from 31 to convert the MSB count into an LSB count.
1294 */
1295
1296 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1297 temp.swizzle = BRW_SWIZZLE_NOOP;
1298 emit(MOV(result_dst, temp));
1299
1300 src_reg src_tmp = src_reg(result_dst);
1301 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1302
1303 src_tmp.negate = true;
1304 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1305 inst->predicate = BRW_PREDICATE_NORMAL;
1306 break;
1307 }
1308 case ir_unop_find_lsb:
1309 emit(FBL(result_dst, op[0]));
1310 break;
1311
1312 case ir_unop_noise:
1313 assert(!"not reached: should be handled by lower_noise");
1314 break;
1315
1316 case ir_binop_add:
1317 emit(ADD(result_dst, op[0], op[1]));
1318 break;
1319 case ir_binop_sub:
1320 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1321 break;
1322
1323 case ir_binop_mul:
1324 if (brw->gen < 8 && ir->type->is_integer()) {
1325 /* For integer multiplication, the MUL uses the low 16 bits of one of
1326 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1327 * accumulates the contribution of the upper 16 bits of that
1328 * operand. If we can determine that one of the args is in the low
1329 * 16 bits, though, we can just emit a single MUL.
1330 */
1331 if (is_16bit_constant(ir->operands[0])) {
1332 if (brw->gen < 7)
1333 emit(MUL(result_dst, op[0], op[1]));
1334 else
1335 emit(MUL(result_dst, op[1], op[0]));
1336 } else if (is_16bit_constant(ir->operands[1])) {
1337 if (brw->gen < 7)
1338 emit(MUL(result_dst, op[1], op[0]));
1339 else
1340 emit(MUL(result_dst, op[0], op[1]));
1341 } else {
1342 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1343
1344 emit(MUL(acc, op[0], op[1]));
1345 emit(MACH(dst_null_d(), op[0], op[1]));
1346 emit(MOV(result_dst, src_reg(acc)));
1347 }
1348 } else {
1349 emit(MUL(result_dst, op[0], op[1]));
1350 }
1351 break;
1352 case ir_binop_imul_high: {
1353 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1354
1355 emit(MUL(acc, op[0], op[1]));
1356 emit(MACH(result_dst, op[0], op[1]));
1357 break;
1358 }
1359 case ir_binop_div:
1360 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1361 assert(ir->type->is_integer());
1362 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1363 break;
1364 case ir_binop_carry: {
1365 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1366
1367 emit(ADDC(dst_null_ud(), op[0], op[1]));
1368 emit(MOV(result_dst, src_reg(acc)));
1369 break;
1370 }
1371 case ir_binop_borrow: {
1372 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1373
1374 emit(SUBB(dst_null_ud(), op[0], op[1]));
1375 emit(MOV(result_dst, src_reg(acc)));
1376 break;
1377 }
1378 case ir_binop_mod:
1379 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1380 assert(ir->type->is_integer());
1381 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1382 break;
1383
1384 case ir_binop_less:
1385 case ir_binop_greater:
1386 case ir_binop_lequal:
1387 case ir_binop_gequal:
1388 case ir_binop_equal:
1389 case ir_binop_nequal: {
1390 emit(CMP(result_dst, op[0], op[1],
1391 brw_conditional_for_comparison(ir->operation)));
1392 emit(AND(result_dst, result_src, src_reg(0x1)));
1393 break;
1394 }
1395
1396 case ir_binop_all_equal:
1397 /* "==" operator producing a scalar boolean. */
1398 if (ir->operands[0]->type->is_vector() ||
1399 ir->operands[1]->type->is_vector()) {
1400 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1401 emit(MOV(result_dst, src_reg(0)));
1402 inst = emit(MOV(result_dst, src_reg(1)));
1403 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1404 } else {
1405 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1406 emit(AND(result_dst, result_src, src_reg(0x1)));
1407 }
1408 break;
1409 case ir_binop_any_nequal:
1410 /* "!=" operator producing a scalar boolean. */
1411 if (ir->operands[0]->type->is_vector() ||
1412 ir->operands[1]->type->is_vector()) {
1413 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1414
1415 emit(MOV(result_dst, src_reg(0)));
1416 inst = emit(MOV(result_dst, src_reg(1)));
1417 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1418 } else {
1419 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1420 emit(AND(result_dst, result_src, src_reg(0x1)));
1421 }
1422 break;
1423
1424 case ir_unop_any:
1425 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1426 emit(MOV(result_dst, src_reg(0)));
1427
1428 inst = emit(MOV(result_dst, src_reg(1)));
1429 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1430 break;
1431
1432 case ir_binop_logic_xor:
1433 emit(XOR(result_dst, op[0], op[1]));
1434 break;
1435
1436 case ir_binop_logic_or:
1437 emit(OR(result_dst, op[0], op[1]));
1438 break;
1439
1440 case ir_binop_logic_and:
1441 emit(AND(result_dst, op[0], op[1]));
1442 break;
1443
1444 case ir_binop_dot:
1445 assert(ir->operands[0]->type->is_vector());
1446 assert(ir->operands[0]->type == ir->operands[1]->type);
1447 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1448 break;
1449
1450 case ir_unop_sqrt:
1451 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1452 break;
1453 case ir_unop_rsq:
1454 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1455 break;
1456
1457 case ir_unop_bitcast_i2f:
1458 case ir_unop_bitcast_u2f:
1459 this->result = op[0];
1460 this->result.type = BRW_REGISTER_TYPE_F;
1461 break;
1462
1463 case ir_unop_bitcast_f2i:
1464 this->result = op[0];
1465 this->result.type = BRW_REGISTER_TYPE_D;
1466 break;
1467
1468 case ir_unop_bitcast_f2u:
1469 this->result = op[0];
1470 this->result.type = BRW_REGISTER_TYPE_UD;
1471 break;
1472
1473 case ir_unop_i2f:
1474 case ir_unop_i2u:
1475 case ir_unop_u2i:
1476 case ir_unop_u2f:
1477 case ir_unop_b2f:
1478 case ir_unop_b2i:
1479 case ir_unop_f2i:
1480 case ir_unop_f2u:
1481 emit(MOV(result_dst, op[0]));
1482 break;
1483 case ir_unop_f2b:
1484 case ir_unop_i2b: {
1485 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486 emit(AND(result_dst, result_src, src_reg(1)));
1487 break;
1488 }
1489
1490 case ir_unop_trunc:
1491 emit(RNDZ(result_dst, op[0]));
1492 break;
1493 case ir_unop_ceil:
1494 op[0].negate = !op[0].negate;
1495 inst = emit(RNDD(result_dst, op[0]));
1496 this->result.negate = true;
1497 break;
1498 case ir_unop_floor:
1499 inst = emit(RNDD(result_dst, op[0]));
1500 break;
1501 case ir_unop_fract:
1502 inst = emit(FRC(result_dst, op[0]));
1503 break;
1504 case ir_unop_round_even:
1505 emit(RNDE(result_dst, op[0]));
1506 break;
1507
1508 case ir_binop_min:
1509 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1510 break;
1511 case ir_binop_max:
1512 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1513 break;
1514
1515 case ir_binop_pow:
1516 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1517 break;
1518
1519 case ir_unop_bit_not:
1520 inst = emit(NOT(result_dst, op[0]));
1521 break;
1522 case ir_binop_bit_and:
1523 inst = emit(AND(result_dst, op[0], op[1]));
1524 break;
1525 case ir_binop_bit_xor:
1526 inst = emit(XOR(result_dst, op[0], op[1]));
1527 break;
1528 case ir_binop_bit_or:
1529 inst = emit(OR(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_lshift:
1533 inst = emit(SHL(result_dst, op[0], op[1]));
1534 break;
1535
1536 case ir_binop_rshift:
1537 if (ir->type->base_type == GLSL_TYPE_INT)
1538 inst = emit(ASR(result_dst, op[0], op[1]));
1539 else
1540 inst = emit(SHR(result_dst, op[0], op[1]));
1541 break;
1542
1543 case ir_binop_bfm:
1544 emit(BFI1(result_dst, op[0], op[1]));
1545 break;
1546
1547 case ir_binop_ubo_load: {
1548 ir_constant *uniform_block = ir->operands[0]->as_constant();
1549 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1550 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1551 src_reg offset;
1552
1553 /* Now, load the vector from that offset. */
1554 assert(ir->type->is_vector() || ir->type->is_scalar());
1555
1556 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1557 packed_consts.type = result.type;
1558 src_reg surf_index =
1559 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1560 if (const_offset_ir) {
1561 if (brw->gen >= 8) {
1562 /* Store the offset in a GRF so we can send-from-GRF. */
1563 offset = src_reg(this, glsl_type::int_type);
1564 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1565 } else {
1566 /* Immediates are fine on older generations since they'll be moved
1567 * to a (potentially fake) MRF at the generator level.
1568 */
1569 offset = src_reg(const_offset / 16);
1570 }
1571 } else {
1572 offset = src_reg(this, glsl_type::uint_type);
1573 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1574 }
1575
1576 if (brw->gen >= 7) {
1577 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1578 grf_offset.type = offset.type;
1579
1580 emit(MOV(grf_offset, offset));
1581
1582 emit(new(mem_ctx) vec4_instruction(this,
1583 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1584 dst_reg(packed_consts),
1585 surf_index,
1586 src_reg(grf_offset)));
1587 } else {
1588 vec4_instruction *pull =
1589 emit(new(mem_ctx) vec4_instruction(this,
1590 VS_OPCODE_PULL_CONSTANT_LOAD,
1591 dst_reg(packed_consts),
1592 surf_index,
1593 offset));
1594 pull->base_mrf = 14;
1595 pull->mlen = 1;
1596 }
1597
1598 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1599 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1600 const_offset % 16 / 4,
1601 const_offset % 16 / 4,
1602 const_offset % 16 / 4);
1603
1604 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1605 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1606 emit(CMP(result_dst, packed_consts, src_reg(0u),
1607 BRW_CONDITIONAL_NZ));
1608 emit(AND(result_dst, result, src_reg(0x1)));
1609 } else {
1610 emit(MOV(result_dst, packed_consts));
1611 }
1612 break;
1613 }
1614
1615 case ir_binop_vector_extract:
1616 assert(!"should have been lowered by vec_index_to_cond_assign");
1617 break;
1618
1619 case ir_triop_fma:
1620 op[0] = fix_3src_operand(op[0]);
1621 op[1] = fix_3src_operand(op[1]);
1622 op[2] = fix_3src_operand(op[2]);
1623 /* Note that the instruction's argument order is reversed from GLSL
1624 * and the IR.
1625 */
1626 emit(MAD(result_dst, op[2], op[1], op[0]));
1627 break;
1628
1629 case ir_triop_lrp:
1630 op[0] = fix_3src_operand(op[0]);
1631 op[1] = fix_3src_operand(op[1]);
1632 op[2] = fix_3src_operand(op[2]);
1633 /* Note that the instruction's argument order is reversed from GLSL
1634 * and the IR.
1635 */
1636 emit(LRP(result_dst, op[2], op[1], op[0]));
1637 break;
1638
1639 case ir_triop_csel:
1640 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1641 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1642 inst->predicate = BRW_PREDICATE_NORMAL;
1643 break;
1644
1645 case ir_triop_bfi:
1646 op[0] = fix_3src_operand(op[0]);
1647 op[1] = fix_3src_operand(op[1]);
1648 op[2] = fix_3src_operand(op[2]);
1649 emit(BFI2(result_dst, op[0], op[1], op[2]));
1650 break;
1651
1652 case ir_triop_bitfield_extract:
1653 op[0] = fix_3src_operand(op[0]);
1654 op[1] = fix_3src_operand(op[1]);
1655 op[2] = fix_3src_operand(op[2]);
1656 /* Note that the instruction's argument order is reversed from GLSL
1657 * and the IR.
1658 */
1659 emit(BFE(result_dst, op[2], op[1], op[0]));
1660 break;
1661
1662 case ir_triop_vector_insert:
1663 assert(!"should have been lowered by lower_vector_insert");
1664 break;
1665
1666 case ir_quadop_bitfield_insert:
1667 assert(!"not reached: should be handled by "
1668 "bitfield_insert_to_bfm_bfi\n");
1669 break;
1670
1671 case ir_quadop_vector:
1672 assert(!"not reached: should be handled by lower_quadop_vector");
1673 break;
1674
1675 case ir_unop_pack_half_2x16:
1676 emit_pack_half_2x16(result_dst, op[0]);
1677 break;
1678 case ir_unop_unpack_half_2x16:
1679 emit_unpack_half_2x16(result_dst, op[0]);
1680 break;
1681 case ir_unop_pack_snorm_2x16:
1682 case ir_unop_pack_snorm_4x8:
1683 case ir_unop_pack_unorm_2x16:
1684 case ir_unop_pack_unorm_4x8:
1685 case ir_unop_unpack_snorm_2x16:
1686 case ir_unop_unpack_snorm_4x8:
1687 case ir_unop_unpack_unorm_2x16:
1688 case ir_unop_unpack_unorm_4x8:
1689 assert(!"not reached: should be handled by lower_packing_builtins");
1690 break;
1691 case ir_unop_unpack_half_2x16_split_x:
1692 case ir_unop_unpack_half_2x16_split_y:
1693 case ir_binop_pack_half_2x16_split:
1694 assert(!"not reached: should not occur in vertex shader");
1695 break;
1696 case ir_binop_ldexp:
1697 assert(!"not reached: should be handled by ldexp_to_arith()");
1698 break;
1699 }
1700 }
1701
1702
1703 void
1704 vec4_visitor::visit(ir_swizzle *ir)
1705 {
1706 src_reg src;
1707 int i = 0;
1708 int swizzle[4];
1709
1710 /* Note that this is only swizzles in expressions, not those on the left
1711 * hand side of an assignment, which do write masking. See ir_assignment
1712 * for that.
1713 */
1714
1715 ir->val->accept(this);
1716 src = this->result;
1717 assert(src.file != BAD_FILE);
1718
1719 for (i = 0; i < ir->type->vector_elements; i++) {
1720 switch (i) {
1721 case 0:
1722 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1723 break;
1724 case 1:
1725 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1726 break;
1727 case 2:
1728 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1729 break;
1730 case 3:
1731 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1732 break;
1733 }
1734 }
1735 for (; i < 4; i++) {
1736 /* Replicate the last channel out. */
1737 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1738 }
1739
1740 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1741
1742 this->result = src;
1743 }
1744
1745 void
1746 vec4_visitor::visit(ir_dereference_variable *ir)
1747 {
1748 const struct glsl_type *type = ir->type;
1749 dst_reg *reg = variable_storage(ir->var);
1750
1751 if (!reg) {
1752 fail("Failed to find variable storage for %s\n", ir->var->name);
1753 this->result = src_reg(brw_null_reg());
1754 return;
1755 }
1756
1757 this->result = src_reg(*reg);
1758
1759 /* System values get their swizzle from the dst_reg writemask */
1760 if (ir->var->data.mode == ir_var_system_value)
1761 return;
1762
1763 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1764 this->result.swizzle = swizzle_for_size(type->vector_elements);
1765 }
1766
1767
1768 int
1769 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1770 {
1771 /* Under normal circumstances array elements are stored consecutively, so
1772 * the stride is equal to the size of the array element.
1773 */
1774 return type_size(ir->type);
1775 }
1776
1777
1778 void
1779 vec4_visitor::visit(ir_dereference_array *ir)
1780 {
1781 ir_constant *constant_index;
1782 src_reg src;
1783 int array_stride = compute_array_stride(ir);
1784
1785 constant_index = ir->array_index->constant_expression_value();
1786
1787 ir->array->accept(this);
1788 src = this->result;
1789
1790 if (constant_index) {
1791 src.reg_offset += constant_index->value.i[0] * array_stride;
1792 } else {
1793 /* Variable index array dereference. It eats the "vec4" of the
1794 * base of the array and an index that offsets the Mesa register
1795 * index.
1796 */
1797 ir->array_index->accept(this);
1798
1799 src_reg index_reg;
1800
1801 if (array_stride == 1) {
1802 index_reg = this->result;
1803 } else {
1804 index_reg = src_reg(this, glsl_type::int_type);
1805
1806 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1807 }
1808
1809 if (src.reladdr) {
1810 src_reg temp = src_reg(this, glsl_type::int_type);
1811
1812 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1813
1814 index_reg = temp;
1815 }
1816
1817 src.reladdr = ralloc(mem_ctx, src_reg);
1818 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1819 }
1820
1821 /* If the type is smaller than a vec4, replicate the last channel out. */
1822 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1823 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1824 else
1825 src.swizzle = BRW_SWIZZLE_NOOP;
1826 src.type = brw_type_for_base_type(ir->type);
1827
1828 this->result = src;
1829 }
1830
1831 void
1832 vec4_visitor::visit(ir_dereference_record *ir)
1833 {
1834 unsigned int i;
1835 const glsl_type *struct_type = ir->record->type;
1836 int offset = 0;
1837
1838 ir->record->accept(this);
1839
1840 for (i = 0; i < struct_type->length; i++) {
1841 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1842 break;
1843 offset += type_size(struct_type->fields.structure[i].type);
1844 }
1845
1846 /* If the type is smaller than a vec4, replicate the last channel out. */
1847 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1848 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1849 else
1850 this->result.swizzle = BRW_SWIZZLE_NOOP;
1851 this->result.type = brw_type_for_base_type(ir->type);
1852
1853 this->result.reg_offset += offset;
1854 }
1855
1856 /**
1857 * We want to be careful in assignment setup to hit the actual storage
1858 * instead of potentially using a temporary like we might with the
1859 * ir_dereference handler.
1860 */
1861 static dst_reg
1862 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1863 {
1864 /* The LHS must be a dereference. If the LHS is a variable indexed array
1865 * access of a vector, it must be separated into a series conditional moves
1866 * before reaching this point (see ir_vec_index_to_cond_assign).
1867 */
1868 assert(ir->as_dereference());
1869 ir_dereference_array *deref_array = ir->as_dereference_array();
1870 if (deref_array) {
1871 assert(!deref_array->array->type->is_vector());
1872 }
1873
1874 /* Use the rvalue deref handler for the most part. We'll ignore
1875 * swizzles in it and write swizzles using writemask, though.
1876 */
1877 ir->accept(v);
1878 return dst_reg(v->result);
1879 }
1880
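/**
 * Copy an aggregate value (struct, array, or matrix) one vec4 at a time,
 * advancing the reg_offset of both dst and src as it goes; 'predicate'
 * applies to every MOV emitted.
 */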
1881 void
1882 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1883 const struct glsl_type *type, uint32_t predicate)
1884 {
1885 if (type->base_type == GLSL_TYPE_STRUCT) {
1886 for (unsigned int i = 0; i < type->length; i++) {
1887 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1888 }
1889 return;
1890 }
1891
1892 if (type->is_array()) {
1893 for (unsigned int i = 0; i < type->length; i++) {
1894 emit_block_move(dst, src, type->fields.array, predicate);
1895 }
1896 return;
1897 }
1898
1899 if (type->is_matrix()) {
1900 const struct glsl_type *vec_type;
1901
1902 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1903 type->vector_elements, 1);
1904
1905 for (int i = 0; i < type->matrix_columns; i++) {
1906 emit_block_move(dst, src, vec_type, predicate);
1907 }
1908 return;
1909 }
1910
1911 assert(type->is_scalar() || type->is_vector());
1912
1913 dst->type = brw_type_for_base_type(type);
1914 src->type = dst->type;
1915
1916 dst->writemask = (1 << type->vector_elements) - 1;
1917
1918 src->swizzle = swizzle_for_size(type->vector_elements);
1919
1920 vec4_instruction *inst = emit(MOV(*dst, *src));
1921 inst->predicate = predicate;
1922
1923 dst->reg_offset++;
1924 src->reg_offset++;
1925 }
1926
1927
1928 /* If the RHS processing resulted in an instruction generating a
1929 * temporary value, and it would be easy to rewrite the instruction to
1930 * generate its result right into the LHS instead, do so. This ends
1931 * up reliably removing instructions where it can be tricky to do so
1932 * later without real UD chain information.
1933 */
1934 bool
1935 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1936 dst_reg dst,
1937 src_reg src,
1938 vec4_instruction *pre_rhs_inst,
1939 vec4_instruction *last_rhs_inst)
1940 {
1941 /* This could be supported, but it would take more smarts. */
1942 if (ir->condition)
1943 return false;
1944
1945 if (pre_rhs_inst == last_rhs_inst)
1946 return false; /* No instructions generated to work with. */
1947
1948 /* Make sure the last instruction generated our source reg. */
1949 if (src.file != GRF ||
1950 src.file != last_rhs_inst->dst.file ||
1951 src.reg != last_rhs_inst->dst.reg ||
1952 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1953 src.reladdr ||
1954 src.abs ||
1955 src.negate ||
1956 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1957 return false;
1958
1959 /* Check that the last instruction fully initialized the channels
1960 * we want to use, in the order we want to use them. We could
1961 * potentially reswizzle the operands of many instructions so that
1962 * we could handle out of order channels, but don't yet.
1963 */
1964
1965 for (unsigned i = 0; i < 4; i++) {
1966 if (dst.writemask & (1 << i)) {
1967 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1968 return false;
1969
1970 if (BRW_GET_SWZ(src.swizzle, i) != i)
1971 return false;
1972 }
1973 }
1974
1975 /* Success! Rewrite the instruction. */
1976 last_rhs_inst->dst.file = dst.file;
1977 last_rhs_inst->dst.reg = dst.reg;
1978 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1979 last_rhs_inst->dst.reladdr = dst.reladdr;
1980 last_rhs_inst->dst.writemask &= dst.writemask;
1981
1982 return true;
1983 }
1984
1985 void
1986 vec4_visitor::visit(ir_assignment *ir)
1987 {
1988 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1989 uint32_t predicate = BRW_PREDICATE_NONE;
1990
1991 if (!ir->lhs->type->is_scalar() &&
1992 !ir->lhs->type->is_vector()) {
1993 ir->rhs->accept(this);
1994 src_reg src = this->result;
1995
1996 if (ir->condition) {
1997 emit_bool_to_cond_code(ir->condition, &predicate);
1998 }
1999
2000 /* emit_block_move doesn't account for swizzles in the source register.
2001 * This should be ok, since the source register is a structure or an
2002 * array, and those can't be swizzled. But double-check to be sure.
2003 */
2004 assert(src.swizzle ==
2005 (ir->rhs->type->is_matrix()
2006 ? swizzle_for_size(ir->rhs->type->vector_elements)
2007 : BRW_SWIZZLE_NOOP));
2008
2009 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2010 return;
2011 }
2012
2013 /* Now we're down to just a scalar/vector with writemasks. */
2014 int i;
2015
2016 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2017 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2018
2019 ir->rhs->accept(this);
2020
2021 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2022
2023 src_reg src = this->result;
2024
2025 int swizzles[4];
2026 int first_enabled_chan = 0;
2027 int src_chan = 0;
2028
2029 assert(ir->lhs->type->is_vector() ||
2030 ir->lhs->type->is_scalar());
2031 dst.writemask = ir->write_mask;
2032
2033 for (int i = 0; i < 4; i++) {
2034 if (dst.writemask & (1 << i)) {
2035 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2036 break;
2037 }
2038 }
2039
2040 /* Swizzle a small RHS vector into the channels being written.
2041 *
2042 * GLSL IR treats write_mask as dictating how many channels are
2043 * present on the RHS, while in our instructions we need to make
2044 * those channels appear in the slots of the vec4 they're written to.
2045 */
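/* Worked example of the loop below (informal, assuming the usual writemask
 * bit order): for "v.zw = u;" with a vec2 RHS, write_mask is ZW and the RHS
 * arrives swizzled XYYY. The result is a YYXY swizzle, so .z of the
 * destination reads u.x, .w reads u.y, and the unwritten lanes carry a
 * harmless replicated channel.
 */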
2046 for (int i = 0; i < 4; i++) {
2047 if (dst.writemask & (1 << i))
2048 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2049 else
2050 swizzles[i] = first_enabled_chan;
2051 }
2052 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2053 swizzles[2], swizzles[3]);
2054
2055 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2056 return;
2057 }
2058
2059 if (ir->condition) {
2060 emit_bool_to_cond_code(ir->condition, &predicate);
2061 }
2062
2063 for (i = 0; i < type_size(ir->lhs->type); i++) {
2064 vec4_instruction *inst = emit(MOV(dst, src));
2065 inst->predicate = predicate;
2066
2067 dst.reg_offset++;
2068 src.reg_offset++;
2069 }
2070 }
2071
2072 void
2073 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2074 {
2075 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2076 foreach_list(node, &ir->components) {
2077 ir_constant *field_value = (ir_constant *)node;
2078
2079 emit_constant_values(dst, field_value);
2080 }
2081 return;
2082 }
2083
2084 if (ir->type->is_array()) {
2085 for (unsigned int i = 0; i < ir->type->length; i++) {
2086 emit_constant_values(dst, ir->array_elements[i]);
2087 }
2088 return;
2089 }
2090
2091 if (ir->type->is_matrix()) {
2092 for (int i = 0; i < ir->type->matrix_columns; i++) {
2093 float *vec = &ir->value.f[i * ir->type->vector_elements];
2094
2095 for (int j = 0; j < ir->type->vector_elements; j++) {
2096 dst->writemask = 1 << j;
2097 dst->type = BRW_REGISTER_TYPE_F;
2098
2099 emit(MOV(*dst, src_reg(vec[j])));
2100 }
2101 dst->reg_offset++;
2102 }
2103 return;
2104 }
2105
2106 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2107
2108 for (int i = 0; i < ir->type->vector_elements; i++) {
2109 if (!(remaining_writemask & (1 << i)))
2110 continue;
2111
2112 dst->writemask = 1 << i;
2113 dst->type = brw_type_for_base_type(ir->type);
2114
2115 /* Find other components that match the one we're about to
2116 * write. Emits fewer instructions for things like vec4(0.5,
2117 * 1.5, 1.5, 1.5).
2118 */
2119 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2120 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2121 if (ir->value.b[i] == ir->value.b[j])
2122 dst->writemask |= (1 << j);
2123 } else {
2124 /* u, i, and f storage all line up, so no need for a
2125 * switch case for comparing each type.
2126 */
2127 if (ir->value.u[i] == ir->value.u[j])
2128 dst->writemask |= (1 << j);
2129 }
2130 }
2131
2132 switch (ir->type->base_type) {
2133 case GLSL_TYPE_FLOAT:
2134 emit(MOV(*dst, src_reg(ir->value.f[i])));
2135 break;
2136 case GLSL_TYPE_INT:
2137 emit(MOV(*dst, src_reg(ir->value.i[i])));
2138 break;
2139 case GLSL_TYPE_UINT:
2140 emit(MOV(*dst, src_reg(ir->value.u[i])));
2141 break;
2142 case GLSL_TYPE_BOOL:
2143 emit(MOV(*dst, src_reg(ir->value.b[i])));
2144 break;
2145 default:
2146 assert(!"Non-float/uint/int/bool constant");
2147 break;
2148 }
2149
2150 remaining_writemask &= ~dst->writemask;
2151 }
2152 dst->reg_offset++;
2153 }
2154
2155 void
2156 vec4_visitor::visit(ir_constant *ir)
2157 {
2158 dst_reg dst = dst_reg(this, ir->type);
2159 this->result = src_reg(dst);
2160
2161 emit_constant_values(&dst, ir);
2162 }
2163
2164 void
2165 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2166 {
2167 ir_dereference *deref = static_cast<ir_dereference *>(
2168 ir->actual_parameters.get_head());
2169 ir_variable *location = deref->variable_referenced();
2170 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2171 location->data.atomic.buffer_index);
2172
2173 /* Calculate the surface offset */
2174 src_reg offset(this, glsl_type::uint_type);
2175 ir_dereference_array *deref_array = deref->as_dereference_array();
2176 if (deref_array) {
2177 deref_array->array_index->accept(this);
2178
2179 src_reg tmp(this, glsl_type::uint_type);
2180 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2181 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2182 } else {
2183 offset = location->data.atomic.offset;
2184 }
2185
2186 /* Emit the appropriate machine instruction */
2187 const char *callee = ir->callee->function_name();
2188 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2189
2190 if (!strcmp("__intrinsic_atomic_read", callee)) {
2191 emit_untyped_surface_read(surf_index, dst, offset);
2192
2193 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2194 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2195 src_reg(), src_reg());
2196
2197 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2198 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2199 src_reg(), src_reg());
2200 }
2201 }
2202
2203 void
2204 vec4_visitor::visit(ir_call *ir)
2205 {
2206 const char *callee = ir->callee->function_name();
2207
2208 if (!strcmp("__intrinsic_atomic_read", callee) ||
2209 !strcmp("__intrinsic_atomic_increment", callee) ||
2210 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2211 visit_atomic_counter_intrinsic(ir);
2212 } else {
2213 assert(!"Unsupported intrinsic.");
2214 }
2215 }
2216
2217 src_reg
2218 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2219 {
2220 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2221 inst->base_mrf = 2;
2222 inst->mlen = 1;
2223 inst->sampler = sampler;
2224 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2225 inst->dst.writemask = WRITEMASK_XYZW;
2226
2227 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2228 int param_base = inst->base_mrf;
2229 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2230 int zero_mask = 0xf & ~coord_mask;
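/* For instance (informal, with writemask bit 0 meaning X): a 2D multisample
 * coordinate yields coord_mask = 0x3 (XY) and zero_mask = 0xc (ZW), so the
 * two MOVs below load u/v and clear the unused r/lod slots.
 */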
2231
2232 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2233 coordinate));
2234
2235 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2236 src_reg(0)));
2237
2238 emit(inst);
2239 return src_reg(inst->dst);
2240 }
2241
2242 void
2243 vec4_visitor::visit(ir_texture *ir)
2244 {
2245 int sampler =
2246 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2247
2248 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2249 * emitting anything other than setting up the constant result.
2250 */
2251 if (ir->op == ir_tg4) {
2252 ir_constant *chan = ir->lod_info.component->as_constant();
2253 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2254 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2255 dst_reg result(this, ir->type);
2256 this->result = src_reg(result);
2257 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2258 return;
2259 }
2260 }
2261
2262 /* Should be lowered by do_lower_texture_projection */
2263 assert(!ir->projector);
2264
2265 /* Should be lowered */
2266 assert(!ir->offset || !ir->offset->type->is_array());
2267
2268 /* Generate code to compute all the subexpression trees. This has to be
2269 * done before loading any values into MRFs for the sampler message since
2270 * generating these values may involve SEND messages that need the MRFs.
2271 */
2272 src_reg coordinate;
2273 if (ir->coordinate) {
2274 ir->coordinate->accept(this);
2275 coordinate = this->result;
2276 }
2277
2278 src_reg shadow_comparitor;
2279 if (ir->shadow_comparitor) {
2280 ir->shadow_comparitor->accept(this);
2281 shadow_comparitor = this->result;
2282 }
2283
2284 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2285 src_reg offset_value;
2286 if (has_nonconstant_offset) {
2287 ir->offset->accept(this);
2288 offset_value = src_reg(this->result);
2289 }
2290
2291 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2292 src_reg lod, dPdx, dPdy, sample_index, mcs;
2293 switch (ir->op) {
2294 case ir_tex:
2295 lod = src_reg(0.0f);
2296 lod_type = glsl_type::float_type;
2297 break;
2298 case ir_txf:
2299 case ir_txl:
2300 case ir_txs:
2301 ir->lod_info.lod->accept(this);
2302 lod = this->result;
2303 lod_type = ir->lod_info.lod->type;
2304 break;
2305 case ir_query_levels:
2306 lod = src_reg(0);
2307 lod_type = glsl_type::int_type;
2308 break;
2309 case ir_txf_ms:
2310 ir->lod_info.sample_index->accept(this);
2311 sample_index = this->result;
2312 sample_index_type = ir->lod_info.sample_index->type;
2313
2314 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2315 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2316 else
2317 mcs = src_reg(0u);
2318 break;
2319 case ir_txd:
2320 ir->lod_info.grad.dPdx->accept(this);
2321 dPdx = this->result;
2322
2323 ir->lod_info.grad.dPdy->accept(this);
2324 dPdy = this->result;
2325
2326 lod_type = ir->lod_info.grad.dPdx->type;
2327 break;
2328 case ir_txb:
2329 case ir_lod:
2330 case ir_tg4:
2331 break;
2332 }
2333
2334 vec4_instruction *inst = NULL;
2335 switch (ir->op) {
2336 case ir_tex:
2337 case ir_txl:
2338 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2339 break;
2340 case ir_txd:
2341 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2342 break;
2343 case ir_txf:
2344 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2345 break;
2346 case ir_txf_ms:
2347 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2348 break;
2349 case ir_txs:
2350 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2351 break;
2352 case ir_tg4:
2353 if (has_nonconstant_offset)
2354 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2355 else
2356 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2357 break;
2358 case ir_query_levels:
2359 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2360 break;
2361 case ir_txb:
2362 assert(!"TXB is not valid for vertex shaders.");
2363 break;
2364 case ir_lod:
2365 assert(!"LOD is not valid for vertex shaders.");
2366 break;
2367 default:
2368 assert(!"Unrecognized tex op");
2369 }
2370
2371 if (ir->offset != NULL && ir->op != ir_txf)
2372 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2373
2374 /* Stuff the channel select bits in the top of the texture offset */
2375 if (ir->op == ir_tg4)
2376 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2377
2378 /* The message header is necessary for:
2379 * - Gen4 (always)
2380 * - Texel offsets
2381 * - Gather channel selection
2382 * - Sampler indices too large to fit in a 4-bit value.
2383 */
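/* Rough illustration of the accounting that follows: a plain textureLod() on
 * Gen7 with sampler index 0 and no offset needs no header, so header_present
 * is false and mlen ends up at 2 (one MRF for the coordinate, one for the
 * LOD). Sketch only; the exact count depends on the operands handled below.
 */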
2384 inst->header_present =
2385 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2386 sampler >= 16;
2387 inst->base_mrf = 2;
2388 inst->mlen = inst->header_present + 1; /* always at least one */
2389 inst->sampler = sampler;
2390 inst->dst = dst_reg(this, ir->type);
2391 inst->dst.writemask = WRITEMASK_XYZW;
2392 inst->shadow_compare = ir->shadow_comparitor != NULL;
2393
2394 /* MRF for the first parameter */
2395 int param_base = inst->base_mrf + inst->header_present;
2396
2397 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2398 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2399 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2400 } else {
2401 /* Load the coordinate */
2402 /* FINISHME: gl_clamp_mask and saturate */
2403 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2404 int zero_mask = 0xf & ~coord_mask;
2405
2406 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2407 coordinate));
2408
2409 if (zero_mask != 0) {
2410 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2411 src_reg(0)));
2412 }
2413 /* Load the shadow comparitor */
2414 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2415 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2416 WRITEMASK_X),
2417 shadow_comparitor));
2418 inst->mlen++;
2419 }
2420
2421 /* Load the LOD info */
2422 if (ir->op == ir_tex || ir->op == ir_txl) {
2423 int mrf, writemask;
2424 if (brw->gen >= 5) {
2425 mrf = param_base + 1;
2426 if (ir->shadow_comparitor) {
2427 writemask = WRITEMASK_Y;
2428 /* mlen already incremented */
2429 } else {
2430 writemask = WRITEMASK_X;
2431 inst->mlen++;
2432 }
2433 } else /* brw->gen == 4 */ {
2434 mrf = param_base;
2435 writemask = WRITEMASK_W;
2436 }
2437 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2438 } else if (ir->op == ir_txf) {
2439 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2440 } else if (ir->op == ir_txf_ms) {
2441 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2442 sample_index));
2443 if (brw->gen >= 7)
2444 /* MCS data is in the first channel of `mcs`, but we need to get it into
2445 * the .y channel of the second vec4 of params, so replicate .x across
2446 * the whole vec4 and then mask off everything except .y
2447 */
2448 mcs.swizzle = BRW_SWIZZLE_XXXX;
2449 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2450 mcs));
2451 inst->mlen++;
2452 } else if (ir->op == ir_txd) {
2453 const glsl_type *type = lod_type;
2454
2455 if (brw->gen >= 5) {
2456 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2457 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2458 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2459 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2460 inst->mlen++;
2461
2462 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2463 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2464 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2465 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2466 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2467 inst->mlen++;
2468
2469 if (ir->shadow_comparitor) {
2470 emit(MOV(dst_reg(MRF, param_base + 2,
2471 ir->shadow_comparitor->type, WRITEMASK_Z),
2472 shadow_comparitor));
2473 }
2474 }
2475 } else /* brw->gen == 4 */ {
2476 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2477 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2478 inst->mlen += 2;
2479 }
2480 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2481 if (ir->shadow_comparitor) {
2482 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2483 shadow_comparitor));
2484 }
2485
2486 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2487 offset_value));
2488 inst->mlen++;
2489 }
2490 }
2491
2492 emit(inst);
2493
2494 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2495 * faces * layers, but the spec requires layers.
2496 */
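/* Informal example: a samplerCubeArray with 4 layers reports 24 in .z from
 * the hardware (6 faces * 4 layers); the integer divide by 6 below brings
 * that back to the 4 layers GLSL expects.
 */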
2497 if (ir->op == ir_txs) {
2498 glsl_type const *type = ir->sampler->type;
2499 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2500 type->sampler_array) {
2501 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2502 writemask(inst->dst, WRITEMASK_Z),
2503 src_reg(inst->dst), src_reg(6));
2504 }
2505 }
2506
2507 if (brw->gen == 6 && ir->op == ir_tg4) {
2508 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2509 }
2510
2511 swizzle_result(ir, src_reg(inst->dst), sampler);
2512 }
2513
2514 /**
2515 * Apply workarounds for Gen6 gather with UINT/SINT
2516 */
2517 void
2518 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2519 {
2520 if (!wa)
2521 return;
2522
2523 int width = (wa & WA_8BIT) ? 8 : 16;
2524 dst_reg dst_f = dst;
2525 dst_f.type = BRW_REGISTER_TYPE_F;
2526
2527 /* Convert from UNORM to UINT */
2528 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2529 emit(MOV(dst, src_reg(dst_f)));
2530
2531 if (wa & WA_SIGN) {
2532 /* Reinterpret the UINT value as a signed INT value by
2533 * shifting the sign bit into place, then shifting back
2534 * preserving sign.
2535 */
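/* e.g. with an 8-bit format this is a shift left by 24 followed by an
 * arithmetic shift right by 24, so a gathered 0xff comes back as -1
 * (informal sketch of the intent of the two emits below).
 */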
2536 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2537 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2538 }
2539 }
2540
2541 /**
2542 * Set up the gather channel based on the swizzle, for gather4.
2543 */
2544 uint32_t
2545 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2546 {
2547 ir_constant *chan = ir->lod_info.component->as_constant();
2548 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2549 switch (swiz) {
2550 case SWIZZLE_X: return 0;
2551 case SWIZZLE_Y:
2552 /* gather4 sampler is broken for green channel on RG32F --
2553 * we must ask for blue instead.
2554 */
2555 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2556 return 2;
2557 return 1;
2558 case SWIZZLE_Z: return 2;
2559 case SWIZZLE_W: return 3;
2560 default:
2561 assert(!"Not reached"); /* zero, one swizzles handled already */
2562 return 0;
2563 }
2564 }
2565
2566 void
2567 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2568 {
2569 int s = key->tex.swizzles[sampler];
2570
2571 this->result = src_reg(this, ir->type);
2572 dst_reg swizzled_result(this->result);
2573
2574 if (ir->op == ir_query_levels) {
2575 /* # levels is in .w */
2576 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2577 emit(MOV(swizzled_result, orig_val));
2578 return;
2579 }
2580
2581 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2582 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2583 emit(MOV(swizzled_result, orig_val));
2584 return;
2585 }
2586
2587
2588 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2589 int swizzle[4] = {0};
2590
2591 for (int i = 0; i < 4; i++) {
2592 switch (GET_SWZ(s, i)) {
2593 case SWIZZLE_ZERO:
2594 zero_mask |= (1 << i);
2595 break;
2596 case SWIZZLE_ONE:
2597 one_mask |= (1 << i);
2598 break;
2599 default:
2600 copy_mask |= (1 << i);
2601 swizzle[i] = GET_SWZ(s, i);
2602 break;
2603 }
2604 }
2605
2606 if (copy_mask) {
2607 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2608 swizzled_result.writemask = copy_mask;
2609 emit(MOV(swizzled_result, orig_val));
2610 }
2611
2612 if (zero_mask) {
2613 swizzled_result.writemask = zero_mask;
2614 emit(MOV(swizzled_result, src_reg(0.0f)));
2615 }
2616
2617 if (one_mask) {
2618 swizzled_result.writemask = one_mask;
2619 emit(MOV(swizzled_result, src_reg(1.0f)));
2620 }
2621 }
2622
2623 void
2624 vec4_visitor::visit(ir_return *ir)
2625 {
2626 assert(!"not reached");
2627 }
2628
2629 void
2630 vec4_visitor::visit(ir_discard *ir)
2631 {
2632 assert(!"not reached");
2633 }
2634
2635 void
2636 vec4_visitor::visit(ir_if *ir)
2637 {
2638 /* Don't point the annotation at the if statement, because then it plus
2639 * the then and else blocks get printed.
2640 */
2641 this->base_ir = ir->condition;
2642
2643 if (brw->gen == 6) {
2644 emit_if_gen6(ir);
2645 } else {
2646 uint32_t predicate;
2647 emit_bool_to_cond_code(ir->condition, &predicate);
2648 emit(IF(predicate));
2649 }
2650
2651 visit_instructions(&ir->then_instructions);
2652
2653 if (!ir->else_instructions.is_empty()) {
2654 this->base_ir = ir->condition;
2655 emit(BRW_OPCODE_ELSE);
2656
2657 visit_instructions(&ir->else_instructions);
2658 }
2659
2660 this->base_ir = ir->condition;
2661 emit(BRW_OPCODE_ENDIF);
2662 }
2663
2664 void
2665 vec4_visitor::visit(ir_emit_vertex *)
2666 {
2667 assert(!"not reached");
2668 }
2669
2670 void
2671 vec4_visitor::visit(ir_end_primitive *)
2672 {
2673 assert(!"not reached");
2674 }
2675
2676 void
2677 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2678 dst_reg dst, src_reg offset,
2679 src_reg src0, src_reg src1)
2680 {
2681 unsigned mlen = 0;
2682
2683 /* Set the atomic operation offset. */
2684 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2685 mlen++;
2686
2687 /* Set the atomic operation arguments. */
2688 if (src0.file != BAD_FILE) {
2689 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2690 mlen++;
2691 }
2692
2693 if (src1.file != BAD_FILE) {
2694 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2695 mlen++;
2696 }
2697
2698 /* Emit the instruction. Note that this maps to the normal SIMD8
2699 * untyped atomic message on Ivy Bridge, but that's OK because
2700 * unused channels will be masked out.
2701 */
2702 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2703 src_reg(atomic_op), src_reg(surf_index));
2704 inst->base_mrf = 0;
2705 inst->mlen = mlen;
2706 }
2707
2708 void
2709 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2710 src_reg offset)
2711 {
2712 /* Set the surface read offset. */
2713 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2714
2715 /* Emit the instruction. Note that this maps to the normal SIMD8
2716 * untyped surface read message, but that's OK because unused
2717 * channels will be masked out.
2718 */
2719 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2720 dst, src_reg(surf_index));
2721 inst->base_mrf = 0;
2722 inst->mlen = 1;
2723 }
2724
2725 void
2726 vec4_visitor::emit_ndc_computation()
2727 {
2728 /* Get the position */
2729 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2730
2731 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2732 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2733 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2734
2735 current_annotation = "NDC";
2736 dst_reg ndc_w = ndc;
2737 ndc_w.writemask = WRITEMASK_W;
2738 src_reg pos_w = pos;
2739 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2740 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2741
2742 dst_reg ndc_xyz = ndc;
2743 ndc_xyz.writemask = WRITEMASK_XYZ;
2744
2745 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2746 }
2747
2748 void
2749 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2750 {
2751 if (brw->gen < 6 &&
2752 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2753 key->userclip_active || brw->has_negative_rhw_bug)) {
2754 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2755 dst_reg header1_w = header1;
2756 header1_w.writemask = WRITEMASK_W;
2757
2758 emit(MOV(header1, 0u));
2759
2760 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2761 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2762
2763 current_annotation = "Point size";
2764 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2765 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2766 }
2767
2768 if (key->userclip_active) {
2769 current_annotation = "Clipping flags";
2770 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2771 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2772
2773 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2774 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2775 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2776
2777 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2778 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2779 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2780 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2781 }
2782
2783 /* i965 clipping workaround:
2784 * 1) Test for -ve rhw
2785 * 2) If set,
2786 * set ndc = (0,0,0,0)
2787 * set ucp[6] = 1
2788 *
2789 * Later, clipping will detect ucp[6] and ensure the primitive is
2790 * clipped against all fixed planes.
2791 */
2792 if (brw->has_negative_rhw_bug) {
2793 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2794 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2795 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2796 vec4_instruction *inst;
2797 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2798 inst->predicate = BRW_PREDICATE_NORMAL;
2799 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2800 inst->predicate = BRW_PREDICATE_NORMAL;
2801 }
2802
2803 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2804 } else if (brw->gen < 6) {
2805 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2806 } else {
2807 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2808 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2809 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2810 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2811 }
2812 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2813 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2814 src_reg(output_reg[VARYING_SLOT_LAYER])));
2815 }
2816 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2817 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2818 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2819 }
2820 }
2821 }
2822
2823 void
2824 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2825 {
2826 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2827 *
2828 * "If a linked set of shaders forming the vertex stage contains no
2829 * static write to gl_ClipVertex or gl_ClipDistance, but the
2830 * application has requested clipping against user clip planes through
2831 * the API, then the coordinate written to gl_Position is used for
2832 * comparison against the user clip planes."
2833 *
2834 * This function is only called if the shader didn't write to
2835 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2836 * if the user wrote to it; otherwise we use gl_Position.
2837 */
2838 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2839 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2840 clip_vertex = VARYING_SLOT_POS;
2841 }
2842
2843 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2844 ++i) {
2845 reg.writemask = 1 << i;
2846 emit(DP4(reg,
2847 src_reg(output_reg[clip_vertex]),
2848 src_reg(this->userplane[i + offset])));
2849 }
2850 }
2851
2852 void
2853 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2854 {
2855 assert (varying < VARYING_SLOT_MAX);
2856 reg.type = output_reg[varying].type;
2857 current_annotation = output_reg_annotation[varying];
2858 /* Copy the register, saturating if necessary */
2859 vec4_instruction *inst = emit(MOV(reg,
2860 src_reg(output_reg[varying])));
2861 if ((varying == VARYING_SLOT_COL0 ||
2862 varying == VARYING_SLOT_COL1 ||
2863 varying == VARYING_SLOT_BFC0 ||
2864 varying == VARYING_SLOT_BFC1) &&
2865 key->clamp_vertex_color) {
2866 inst->saturate = true;
2867 }
2868 }
2869
2870 void
2871 vec4_visitor::emit_urb_slot(int mrf, int varying)
2872 {
2873 struct brw_reg hw_reg = brw_message_reg(mrf);
2874 dst_reg reg = dst_reg(MRF, mrf);
2875 reg.type = BRW_REGISTER_TYPE_F;
2876
2877 switch (varying) {
2878 case VARYING_SLOT_PSIZ:
2879 /* PSIZ is always in slot 0, and is coupled with other flags. */
2880 current_annotation = "indices, point width, clip flags";
2881 emit_psiz_and_flags(hw_reg);
2882 break;
2883 case BRW_VARYING_SLOT_NDC:
2884 current_annotation = "NDC";
2885 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2886 break;
2887 case VARYING_SLOT_POS:
2888 current_annotation = "gl_Position";
2889 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2890 break;
2891 case VARYING_SLOT_EDGE:
2892 /* This is present when doing unfilled polygons. We're supposed to copy
2893 * the edge flag from the user-provided vertex array
2894 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2895 * of that attribute (starts as 1.0f). This is then used in clipping to
2896 * determine which edges should be drawn as wireframe.
2897 */
2898 current_annotation = "edge flag";
2899 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2900 glsl_type::float_type, WRITEMASK_XYZW))));
2901 break;
2902 case BRW_VARYING_SLOT_PAD:
2903 /* No need to write to this slot */
2904 break;
2905 default:
2906 emit_generic_urb_slot(reg, varying);
2907 break;
2908 }
2909 }
2910
2911 static int
2912 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2913 {
2914 if (brw->gen >= 6) {
2915 /* URB data written (does not include the message header reg) must
2916 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2917 * section 5.4.3.2.2: URB_INTERLEAVED.
2918 *
2919 * URB entries are allocated on a multiple of 1024 bits, so an
2920 * extra 128 bits written here to make the end align to 256 is
2921 * no problem.
2922 */
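/* Rough example: mlen includes the header register, so a header plus three
 * data registers arrives here as mlen == 4; bumping it to 5 leaves an even
 * four data registers, which satisfies the alignment rule above.
 */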
2923 if ((mlen % 2) != 1)
2924 mlen++;
2925 }
2926
2927 return mlen;
2928 }
2929
2930
2931 /**
2932 * Generates the VUE payload plus the necessary URB write instructions to
2933 * output it.
2934 *
2935 * The VUE layout is documented in Volume 2a.
2936 */
2937 void
2938 vec4_visitor::emit_vertex()
2939 {
2940 /* MRF 0 is reserved for the debugger, so start with message header
2941 * in MRF 1.
2942 */
2943 int base_mrf = 1;
2944 int mrf = base_mrf;
2945 /* In the process of generating our URB write message contents, we
2946 * may need to unspill a register or load from an array. Those
2947 * reads would use MRFs 14-15.
2948 */
2949 int max_usable_mrf = 13;
2950
2951 /* The following assertion verifies that max_usable_mrf causes an
2952 * even number of URB write data registers, which will meet gen6's
2953 * requirements for length alignment.
2954 */
2955 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2956
2957 /* First mrf is the g0-based message header containing URB handles and
2958 * such.
2959 */
2960 emit_urb_write_header(mrf++);
2961
2962 if (brw->gen < 6) {
2963 emit_ndc_computation();
2964 }
2965
2966 /* Lower legacy ff and ClipVertex clipping to clip distances */
2967 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2968 current_annotation = "user clip distances";
2969
2970 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2971 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2972
2973 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2974 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2975 }
2976
2977 /* We may need to split this up into several URB writes, so do them in a
2978 * loop.
2979 */
2980 int slot = 0;
2981 bool complete = false;
2982 do {
2983 /* URB offset is in URB row increments, and each of our MRFs is half of
2984 * one of those, since we're doing interleaved writes.
2985 */
2986 int offset = slot / 2;
2987
2988 mrf = base_mrf + 1;
2989 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2990 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2991
2992 /* If this was max_usable_mrf, we can't fit anything more into this
2993 * URB WRITE.
2994 */
2995 if (mrf > max_usable_mrf) {
2996 slot++;
2997 break;
2998 }
2999 }
3000
3001 complete = slot >= prog_data->vue_map.num_slots;
3002 current_annotation = "URB write";
3003 vec4_instruction *inst = emit_urb_write_opcode(complete);
3004 inst->base_mrf = base_mrf;
3005 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3006 inst->offset += offset;
3007 } while(!complete);
3008 }
3009
3010
3011 src_reg
3012 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3013 src_reg *reladdr, int reg_offset)
3014 {
3015 /* Because we store the values to scratch interleaved like our
3016 * vertex data, we need to scale the vec4 index by 2.
3017 */
3018 int message_header_scale = 2;
3019
3020 /* Pre-gen6, the message header uses byte offsets instead of vec4
3021 * (16-byte) offset units.
3022 */
3023 if (brw->gen < 6)
3024 message_header_scale *= 16;
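/* Informal arithmetic: reg_offset 3 is scaled to 6 on Gen6+ (interleaved
 * storage), or to 3 * 2 * 16 = 96 on earlier parts, where the header wants
 * byte offsets.
 */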
3025
3026 if (reladdr) {
3027 src_reg index = src_reg(this, glsl_type::int_type);
3028
3029 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3030 emit_before(inst, MUL(dst_reg(index),
3031 index, src_reg(message_header_scale)));
3032
3033 return index;
3034 } else {
3035 return src_reg(reg_offset * message_header_scale);
3036 }
3037 }
3038
3039 src_reg
3040 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3041 src_reg *reladdr, int reg_offset)
3042 {
3043 if (reladdr) {
3044 src_reg index = src_reg(this, glsl_type::int_type);
3045
3046 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3047
3048 /* Pre-gen6, the message header uses byte offsets instead of vec4
3049 * (16-byte) offset units.
3050 */
3051 if (brw->gen < 6) {
3052 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3053 }
3054
3055 return index;
3056 } else if (brw->gen >= 8) {
3057 /* Store the offset in a GRF so we can send-from-GRF. */
3058 src_reg offset = src_reg(this, glsl_type::int_type);
3059 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3060 return offset;
3061 } else {
3062 int message_header_scale = brw->gen < 6 ? 16 : 1;
3063 return src_reg(reg_offset * message_header_scale);
3064 }
3065 }
3066
3067 /**
3068 * Emits an instruction before @inst to load the value named by @orig_src
3069 * from scratch space at @base_offset to @temp.
3070 *
3071 * @base_offset is measured in 32-byte units (the size of a register).
3072 */
3073 void
3074 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3075 dst_reg temp, src_reg orig_src,
3076 int base_offset)
3077 {
3078 int reg_offset = base_offset + orig_src.reg_offset;
3079 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3080
3081 emit_before(inst, SCRATCH_READ(temp, index));
3082 }
3083
3084 /**
3085 * Emits an instruction after @inst to store the value to be written
3086 * to @orig_dst to scratch space at @base_offset, from @temp.
3087 *
3088 * @base_offset is measured in 32-byte units (the size of a register).
3089 */
3090 void
3091 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3092 {
3093 int reg_offset = base_offset + inst->dst.reg_offset;
3094 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3095
3096 /* Create a temporary register to store *inst's result in.
3097 *
3098 * We have to be careful in MOVing from our temporary result register in
3099 * the scratch write. If we swizzle from channels of the temporary that
3100 * weren't initialized, it will confuse live interval analysis, which will
3101 * make spilling fail to make progress.
3102 */
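/* For instance (a sketch): if inst writes only .yz, first_writemask_chan is
 * 1 and the temporary is read with a YYZY swizzle, so the scratch write
 * never sources an uninitialized channel.
 */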
3103 src_reg temp = src_reg(this, glsl_type::vec4_type);
3104 temp.type = inst->dst.type;
3105 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3106 int swizzles[4];
3107 for (int i = 0; i < 4; i++)
3108 if (inst->dst.writemask & (1 << i))
3109 swizzles[i] = i;
3110 else
3111 swizzles[i] = first_writemask_chan;
3112 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3113 swizzles[2], swizzles[3]);
3114
3115 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3116 inst->dst.writemask));
3117 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3118 write->predicate = inst->predicate;
3119 write->ir = inst->ir;
3120 write->annotation = inst->annotation;
3121 inst->insert_after(write);
3122
3123 inst->dst.file = temp.file;
3124 inst->dst.reg = temp.reg;
3125 inst->dst.reg_offset = temp.reg_offset;
3126 inst->dst.reladdr = NULL;
3127 }
3128
3129 /**
3130 * We can't generally support array access in GRF space, because a
3131 * single instruction's destination can only span 2 contiguous
3132 * registers. So, we send all GRF arrays that get variable index
3133 * access to scratch space.
3134 */
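/* Loose example of the effect: a shader-local "vec4 coeffs[8]" indexed by a
 * loop counter gets assigned a scratch location, each variably indexed read
 * becomes a SCRATCH_READ into a fresh temporary, and each such write goes
 * through a SCRATCH_WRITE. The two passes below do that bookkeeping.
 */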
3135 void
3136 vec4_visitor::move_grf_array_access_to_scratch()
3137 {
3138 int scratch_loc[this->virtual_grf_count];
3139
3140 for (int i = 0; i < this->virtual_grf_count; i++) {
3141 scratch_loc[i] = -1;
3142 }
3143
3144 /* First, calculate the set of virtual GRFs that need to be punted
3145 * to scratch due to having any array access on them, and where in
3146 * scratch.
3147 */
3148 foreach_list(node, &this->instructions) {
3149 vec4_instruction *inst = (vec4_instruction *)node;
3150
3151 if (inst->dst.file == GRF && inst->dst.reladdr &&
3152 scratch_loc[inst->dst.reg] == -1) {
3153 scratch_loc[inst->dst.reg] = c->last_scratch;
3154 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3155 }
3156
3157 for (int i = 0 ; i < 3; i++) {
3158 src_reg *src = &inst->src[i];
3159
3160 if (src->file == GRF && src->reladdr &&
3161 scratch_loc[src->reg] == -1) {
3162 scratch_loc[src->reg] = c->last_scratch;
3163 c->last_scratch += this->virtual_grf_sizes[src->reg];
3164 }
3165 }
3166 }
3167
3168 /* Now, for anything that will be accessed through scratch, rewrite
3169 * it to load/store. Note that this is a _safe list walk, because
3170 * we may generate a new scratch_write instruction after the one
3171 * we're processing.
3172 */
3173 foreach_list_safe(node, &this->instructions) {
3174 vec4_instruction *inst = (vec4_instruction *)node;
3175
3176 /* Set up the annotation tracking for new generated instructions. */
3177 base_ir = inst->ir;
3178 current_annotation = inst->annotation;
3179
3180 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3181 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3182 }
3183
3184 for (int i = 0 ; i < 3; i++) {
3185 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3186 continue;
3187
3188 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3189
3190 emit_scratch_read(inst, temp, inst->src[i],
3191 scratch_loc[inst->src[i].reg]);
3192
3193 inst->src[i].file = temp.file;
3194 inst->src[i].reg = temp.reg;
3195 inst->src[i].reg_offset = temp.reg_offset;
3196 inst->src[i].reladdr = NULL;
3197 }
3198 }
3199 }
3200
3201 /**
3202 * Emits an instruction before @inst to load the value named by @orig_src
3203 * from the pull constant buffer (surface) at @base_offset to @temp.
3204 */
3205 void
3206 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3207 dst_reg temp, src_reg orig_src,
3208 int base_offset)
3209 {
3210 int reg_offset = base_offset + orig_src.reg_offset;
3211 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3212 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3213 vec4_instruction *load;
3214
3215 if (brw->gen >= 7) {
3216 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3217 grf_offset.type = offset.type;
3218 emit_before(inst, MOV(grf_offset, offset));
3219
3220 load = new(mem_ctx) vec4_instruction(this,
3221 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3222 temp, index, src_reg(grf_offset));
3223 } else {
3224 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3225 temp, index, offset);
3226 load->base_mrf = 14;
3227 load->mlen = 1;
3228 }
3229 emit_before(inst, load);
3230 }
3231
3232 /**
3233 * Implements array access of uniforms by inserting a
3234 * PULL_CONSTANT_LOAD instruction.
3235 *
3236 * Unlike temporary GRF array access (which we don't support due to
3237 * the difficulty of doing relative addressing on instruction
3238 * destinations), we could potentially do array access of uniforms
3239 * that were loaded in GRF space as push constants. In real-world
3240 * usage we've seen, though, the arrays being used are always larger
3241 * than we could load as push constants, so just always move all
3242 * uniform array access out to a pull constant buffer.
3243 */
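/* To sketch the effect with a hypothetical shader: for "uniform mat4
 * bones[64]" indexed per vertex, the whole array is appended to pull_param
 * and each variably indexed use is replaced by a PULL_CONSTANT_LOAD into a
 * temporary, as the loop below does.
 */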
3244 void
3245 vec4_visitor::move_uniform_array_access_to_pull_constants()
3246 {
3247 int pull_constant_loc[this->uniforms];
3248
3249 for (int i = 0; i < this->uniforms; i++) {
3250 pull_constant_loc[i] = -1;
3251 }
3252
3253 /* Walk through and find array access of uniforms. Put a copy of that
3254 * uniform in the pull constant buffer.
3255 *
3256 * Note that we don't move constant-indexed accesses to arrays. No
3257 * testing has been done of the performance impact of this choice.
3258 */
3259 foreach_list_safe(node, &this->instructions) {
3260 vec4_instruction *inst = (vec4_instruction *)node;
3261
3262 for (int i = 0 ; i < 3; i++) {
3263 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3264 continue;
3265
3266 int uniform = inst->src[i].reg;
3267
3268 /* If this array isn't already present in the pull constant buffer,
3269 * add it.
3270 */
3271 if (pull_constant_loc[uniform] == -1) {
3272 const float **values = &stage_prog_data->param[uniform * 4];
3273
3274 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3275
3276 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3277 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3278 = values[j];
3279 }
3280 }
3281
3282 /* Set up the annotation tracking for new generated instructions. */
3283 base_ir = inst->ir;
3284 current_annotation = inst->annotation;
3285
3286 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3287
3288 emit_pull_constant_load(inst, temp, inst->src[i],
3289 pull_constant_loc[uniform]);
3290
3291 inst->src[i].file = temp.file;
3292 inst->src[i].reg = temp.reg;
3293 inst->src[i].reg_offset = temp.reg_offset;
3294 inst->src[i].reladdr = NULL;
3295 }
3296 }
3297
3298 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3299 * no need to track them as larger-than-vec4 objects. This will be
3300 * relied on in cutting out unused uniform vectors from push
3301 * constants.
3302 */
3303 split_uniform_registers();
3304 }
3305
3306 void
3307 vec4_visitor::resolve_ud_negate(src_reg *reg)
3308 {
3309 if (reg->type != BRW_REGISTER_TYPE_UD ||
3310 !reg->negate)
3311 return;
3312
3313 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3314 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3315 *reg = temp;
3316 }
3317
3318 vec4_visitor::vec4_visitor(struct brw_context *brw,
3319 struct brw_vec4_compile *c,
3320 struct gl_program *prog,
3321 const struct brw_vec4_prog_key *key,
3322 struct brw_vec4_prog_data *prog_data,
3323 struct gl_shader_program *shader_prog,
3324 struct brw_shader *shader,
3325 void *mem_ctx,
3326 bool debug_flag,
3327 bool no_spills,
3328 shader_time_shader_type st_base,
3329 shader_time_shader_type st_written,
3330 shader_time_shader_type st_reset)
3331 : sanity_param_count(0),
3332 fail_msg(NULL),
3333 first_non_payload_grf(0),
3334 need_all_constants_in_pull_buffer(false),
3335 debug_flag(debug_flag),
3336 no_spills(no_spills),
3337 st_base(st_base),
3338 st_written(st_written),
3339 st_reset(st_reset)
3340 {
3341 this->brw = brw;
3342 this->ctx = &brw->ctx;
3343 this->shader_prog = shader_prog;
3344 this->shader = shader;
3345
3346 this->mem_ctx = mem_ctx;
3347 this->failed = false;
3348
3349 this->base_ir = NULL;
3350 this->current_annotation = NULL;
3351 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3352
3353 this->c = c;
3354 this->prog = prog;
3355 this->key = key;
3356 this->prog_data = prog_data;
3357 this->stage_prog_data = &prog_data->base;
3358
3359 this->variable_ht = hash_table_ctor(0,
3360 hash_table_pointer_hash,
3361 hash_table_pointer_compare);
3362
3363 this->virtual_grf_start = NULL;
3364 this->virtual_grf_end = NULL;
3365 this->virtual_grf_sizes = NULL;
3366 this->virtual_grf_count = 0;
3367 this->virtual_grf_reg_map = NULL;
3368 this->virtual_grf_reg_count = 0;
3369 this->virtual_grf_array_size = 0;
3370 this->live_intervals_valid = false;
3371
3372 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3373
3374 this->uniforms = 0;
3375 }
3376
3377 vec4_visitor::~vec4_visitor()
3378 {
3379 hash_table_dtor(this->variable_ht);
3380 }
3381
3382
3383 void
3384 vec4_visitor::fail(const char *format, ...)
3385 {
3386 va_list va;
3387 char *msg;
3388
3389 if (failed)
3390 return;
3391
3392 failed = true;
3393
3394 va_start(va, format);
3395 msg = ralloc_vasprintf(mem_ctx, format, va);
3396 va_end(va);
3397 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3398
3399 this->fail_msg = msg;
3400
3401 if (debug_flag) {
3402 fprintf(stderr, "%s", msg);
3403 }
3404 }
3405
3406 } /* namespace brw */