i965/vec4: Allow CSE on uniform-vec4 expansion MOVs.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
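/* The ALU1/ALU2/ALU3 macros build (but do not emit) a vec4_instruction for the
 * corresponding one-, two- or three-source opcode; ALU2_ACC additionally marks
 * the instruction as writing the accumulator.
 */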
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* original gen4 does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
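/* Gen4-style scratch reads are a two-register message: a header at m[base_mrf]
 * plus the per-channel offsets at m[base_mrf + 1]; the write variant below adds
 * a third register for the data payload.
 */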
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
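/* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0, so the type-converting MOV above leaves
 * <0, 8, 16, 24> in the uvec4 shift register.
 */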
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
518 void
519 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
520 {
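/* packUnorm4x8: clamp each component to [0, 1], scale by 255, round to even,
 * convert to integer, then pack the low byte of each channel.
 */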
521 dst_reg saturated(this, glsl_type::vec4_type);
522 vec4_instruction *inst = emit(MOV(saturated, src0));
523 inst->saturate = true;
524
525 dst_reg scaled(this, glsl_type::vec4_type);
526 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
527
528 dst_reg rounded(this, glsl_type::vec4_type);
529 emit(RNDE(rounded, src_reg(scaled)));
530
531 dst_reg u(this, glsl_type::uvec4_type);
532 emit(MOV(u, src_reg(rounded)));
533
534 src_reg bytes(u);
535 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
536 }
537
538 void
539 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
540 {
541 dst_reg max(this, glsl_type::vec4_type);
542 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
543
544 dst_reg min(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
546
547 dst_reg scaled(this, glsl_type::vec4_type);
548 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
549
550 dst_reg rounded(this, glsl_type::vec4_type);
551 emit(RNDE(rounded, src_reg(scaled)));
552
553 dst_reg i(this, glsl_type::ivec4_type);
554 emit(MOV(i, src_reg(rounded)));
555
556 src_reg bytes(i);
557 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
558 }
559
560 void
561 vec4_visitor::visit_instructions(const exec_list *list)
562 {
563 foreach_in_list(ir_instruction, ir, list) {
564 base_ir = ir;
565 ir->accept(this);
566 }
567 }
568
569
570 static int
571 type_size(const struct glsl_type *type)
572 {
573 unsigned int i;
574 int size;
575
576 switch (type->base_type) {
577 case GLSL_TYPE_UINT:
578 case GLSL_TYPE_INT:
579 case GLSL_TYPE_FLOAT:
580 case GLSL_TYPE_BOOL:
581 if (type->is_matrix()) {
582 return type->matrix_columns;
583 } else {
584 /* Regardless of size of vector, it gets a vec4. This is bad
585 * packing for things like floats, but otherwise arrays become a
586 * mess. Hopefully a later pass over the code can pack scalars
587 * down if appropriate.
588 */
589 return 1;
590 }
591 case GLSL_TYPE_ARRAY:
592 assert(type->length > 0);
593 return type_size(type->fields.array) * type->length;
594 case GLSL_TYPE_STRUCT:
595 size = 0;
596 for (i = 0; i < type->length; i++) {
597 size += type_size(type->fields.structure[i].type);
598 }
599 return size;
600 case GLSL_TYPE_SAMPLER:
601 /* Samplers take up no register space, since they're baked in at
602 * link time.
603 */
604 return 0;
605 case GLSL_TYPE_ATOMIC_UINT:
606 return 0;
607 case GLSL_TYPE_IMAGE:
608 case GLSL_TYPE_VOID:
609 case GLSL_TYPE_ERROR:
610 case GLSL_TYPE_INTERFACE:
611 unreachable("not reached");
612 }
613
614 return 0;
615 }
616
617 int
618 vec4_visitor::virtual_grf_alloc(int size)
619 {
620 if (virtual_grf_array_size <= virtual_grf_count) {
621 if (virtual_grf_array_size == 0)
622 virtual_grf_array_size = 16;
623 else
624 virtual_grf_array_size *= 2;
625 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
626 virtual_grf_array_size);
627 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
628 virtual_grf_array_size);
629 }
630 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
631 virtual_grf_reg_count += size;
632 virtual_grf_sizes[virtual_grf_count] = size;
633 return virtual_grf_count++;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->virtual_grf_alloc(type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->virtual_grf_alloc(type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->virtual_grf_alloc(type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 /* Our support for uniforms is piggy-backed on the struct
683 * gl_fragment_program, because that's where the values actually
684 * get stored, rather than in some global gl_shader_program uniform
685 * store.
686 */
687 void
688 vec4_visitor::setup_uniform_values(ir_variable *ir)
689 {
690 int namelen = strlen(ir->name);
691
692 /* The data for our (non-builtin) uniforms is stored in a series of
693 * gl_uniform_driver_storage structs for each subcomponent that
694 * glGetUniformLocation() could name. We know it's been set up in the same
695 * order we'd walk the type, so walk the list of storage and find anything
696 * with our name, or the prefix of a component that starts with our name.
697 */
698 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
699 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
700
701 if (strncmp(ir->name, storage->name, namelen) != 0 ||
702 (storage->name[namelen] != 0 &&
703 storage->name[namelen] != '.' &&
704 storage->name[namelen] != '[')) {
705 continue;
706 }
707
708 gl_constant_value *components = storage->storage;
709 unsigned vector_count = (MAX2(storage->array_elements, 1) *
710 storage->type->matrix_columns);
711
712 for (unsigned s = 0; s < vector_count; s++) {
713 assert(uniforms < uniform_array_size);
714 uniform_vector_size[uniforms] = storage->type->vector_elements;
715
716 int i;
717 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
718 stage_prog_data->param[uniforms * 4 + i] = components;
719 components++;
720 }
721 for (; i < 4; i++) {
722 static gl_constant_value zero = { 0.0 };
723 stage_prog_data->param[uniforms * 4 + i] = &zero;
724 }
725
726 uniforms++;
727 }
728 }
729 }
730
731 void
732 vec4_visitor::setup_uniform_clipplane_values()
733 {
734 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
735
736 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 4;
739 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
740 this->userplane[i].type = BRW_REGISTER_TYPE_F;
741 for (int j = 0; j < 4; ++j) {
742 stage_prog_data->param[this->uniforms * 4 + j] =
743 (gl_constant_value *) &clip_planes[i][j];
744 }
745 ++this->uniforms;
746 }
747 }
748
749 /* Our support for builtin uniforms is even scarier than non-builtin.
750 * It sits on top of the PROG_STATE_VAR parameters that are
751 * automatically updated from GL context state.
752 */
753 void
754 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
755 {
756 const ir_state_slot *const slots = ir->get_state_slots();
757 assert(slots != NULL);
758
759 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
760 /* This state reference has already been setup by ir_to_mesa,
761 * but we'll get the same index back here. We can reference
762 * ParameterValues directly, since unlike brw_fs.cpp, we never
763 * add new state references during compile.
764 */
765 int index = _mesa_add_state_reference(this->prog->Parameters,
766 (gl_state_index *)slots[i].tokens);
767 gl_constant_value *values =
768 &this->prog->Parameters->ParameterValues[index][0];
769
770 assert(this->uniforms < uniform_array_size);
771 this->uniform_vector_size[this->uniforms] = 0;
772 /* Add each of the unique swizzled channels of the element.
773 * This will end up matching the size of the glsl_type of this field.
774 */
775 int last_swiz = -1;
776 for (unsigned int j = 0; j < 4; j++) {
777 int swiz = GET_SWZ(slots[i].swizzle, j);
778 last_swiz = swiz;
779
780 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
781 assert(this->uniforms < uniform_array_size);
782 if (swiz <= last_swiz)
783 this->uniform_vector_size[this->uniforms]++;
784 }
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 inst = emit(XOR(dst_null_d(), op[0], op[1]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_or:
827 inst = emit(OR(dst_null_d(), op[0], op[1]));
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 break;
835
836 case ir_unop_f2b:
837 if (brw->gen >= 6) {
838 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
839 } else {
840 inst = emit(MOV(dst_null_f(), op[0]));
841 inst->conditional_mod = BRW_CONDITIONAL_NZ;
842 }
843 break;
844
845 case ir_unop_i2b:
846 if (brw->gen >= 6) {
847 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
848 } else {
849 inst = emit(MOV(dst_null_d(), op[0]));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 break;
853
854 case ir_binop_all_equal:
855 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
856 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
857 break;
858
859 case ir_binop_any_nequal:
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
861 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
862 break;
863
864 case ir_unop_any:
865 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
867 break;
868
869 case ir_binop_greater:
870 case ir_binop_gequal:
871 case ir_binop_less:
872 case ir_binop_lequal:
873 case ir_binop_equal:
874 case ir_binop_nequal:
875 emit(CMP(dst_null_d(), op[0], op[1],
876 brw_conditional_for_comparison(expr->operation)));
877 break;
878
879 case ir_triop_csel: {
880 /* Expand the boolean condition into the flag register. */
881 inst = emit(MOV(dst_null_d(), op[0]));
882 inst->conditional_mod = BRW_CONDITIONAL_NZ;
883
884 /* Select which boolean to return. */
885 dst_reg temp(this, expr->operands[1]->type);
886 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
887 inst->predicate = BRW_PREDICATE_NORMAL;
888
889 /* Expand the result to a condition code. */
890 inst = emit(MOV(dst_null_d(), src_reg(temp)));
891 inst->conditional_mod = BRW_CONDITIONAL_NZ;
892 break;
893 }
894
895 default:
896 unreachable("not reached");
897 }
898 return;
899 }
900
901 ir->accept(this);
902
903 resolve_ud_negate(&this->result);
904
905 if (brw->gen >= 6) {
906 vec4_instruction *inst = emit(AND(dst_null_d(),
907 this->result, src_reg(1)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 } else {
910 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 }
913 }
914
915 /**
916 * Emit a gen6 IF statement with the comparison folded into the IF
917 * instruction.
918 */
919 void
920 vec4_visitor::emit_if_gen6(ir_if *ir)
921 {
922 ir_expression *expr = ir->condition->as_expression();
923
924 if (expr && expr->operation != ir_binop_ubo_load) {
925 src_reg op[3];
926 dst_reg temp;
927
928 assert(expr->get_num_operands() <= 3);
929 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
930 expr->operands[i]->accept(this);
931 op[i] = this->result;
932 }
933
934 switch (expr->operation) {
935 case ir_unop_logic_not:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
937 return;
938
939 case ir_binop_logic_xor:
940 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
941 return;
942
943 case ir_binop_logic_or:
944 temp = dst_reg(this, glsl_type::bool_type);
945 emit(OR(temp, op[0], op[1]));
946 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
947 return;
948
949 case ir_binop_logic_and:
950 temp = dst_reg(this, glsl_type::bool_type);
951 emit(AND(temp, op[0], op[1]));
952 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_unop_f2b:
956 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_unop_i2b:
960 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_greater:
964 case ir_binop_gequal:
965 case ir_binop_less:
966 case ir_binop_lequal:
967 case ir_binop_equal:
968 case ir_binop_nequal:
969 emit(IF(op[0], op[1],
970 brw_conditional_for_comparison(expr->operation)));
971 return;
972
973 case ir_binop_all_equal:
974 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
975 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
976 return;
977
978 case ir_binop_any_nequal:
979 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
980 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
981 return;
982
983 case ir_unop_any:
984 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
985 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
986 return;
987
988 case ir_triop_csel: {
989 /* Expand the boolean condition into the flag register. */
990 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
991 inst->conditional_mod = BRW_CONDITIONAL_NZ;
992
993 /* Select which boolean to return. */
994 dst_reg temp(this, expr->operands[1]->type);
995 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
996 inst->predicate = BRW_PREDICATE_NORMAL;
997
998 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
999 return;
1000 }
1001
1002 default:
1003 unreachable("not reached");
1004 }
1005 return;
1006 }
1007
1008 ir->condition->accept(this);
1009
1010 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1011 }
1012
1013 void
1014 vec4_visitor::visit(ir_variable *ir)
1015 {
1016 dst_reg *reg = NULL;
1017
1018 if (variable_storage(ir))
1019 return;
1020
1021 switch (ir->data.mode) {
1022 case ir_var_shader_in:
1023 assert(ir->data.location != -1);
1024 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1025 break;
1026
1027 case ir_var_shader_out:
1028 assert(ir->data.location != -1);
1029 reg = new(mem_ctx) dst_reg(this, ir->type);
1030
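/* Outputs that span several vec4 slots (arrays, matrices, structs) get one
 * output_reg entry per slot.
 */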
1031 for (int i = 0; i < type_size(ir->type); i++) {
1032 output_reg[ir->data.location + i] = *reg;
1033 output_reg[ir->data.location + i].reg_offset = i;
1034 output_reg[ir->data.location + i].type =
1035 brw_type_for_base_type(ir->type->get_scalar_type());
1036 output_reg_annotation[ir->data.location + i] = ir->name;
1037 }
1038 break;
1039
1040 case ir_var_auto:
1041 case ir_var_temporary:
1042 reg = new(mem_ctx) dst_reg(this, ir->type);
1043 break;
1044
1045 case ir_var_uniform:
1046 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1047
1048 /* Thanks to the lower_ubo_reference pass, we will see only
1049 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1050 * variables, so no need for them to be in variable_ht.
1051 *
1052 * Some uniforms, such as samplers and atomic counters, have no actual
1053 * storage, so we should ignore them.
1054 */
1055 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1056 return;
1057
1058 /* Track how big the whole uniform variable is, in case we need to put a
1059 * copy of its data into pull constants for array access.
1060 */
1061 assert(this->uniforms < uniform_array_size);
1062 this->uniform_size[this->uniforms] = type_size(ir->type);
1063
1064 if (!strncmp(ir->name, "gl_", 3)) {
1065 setup_builtin_uniform_values(ir);
1066 } else {
1067 setup_uniform_values(ir);
1068 }
1069 break;
1070
1071 case ir_var_system_value:
1072 reg = make_reg_for_system_value(ir);
1073 break;
1074
1075 default:
1076 unreachable("not reached");
1077 }
1078
1079 reg->type = brw_type_for_base_type(ir->type);
1080 hash_table_insert(this->variable_ht, reg, ir);
1081 }
1082
1083 void
1084 vec4_visitor::visit(ir_loop *ir)
1085 {
1086 /* We don't want debugging output to print the whole body of the
1087 * loop as the annotation.
1088 */
1089 this->base_ir = NULL;
1090
1091 emit(BRW_OPCODE_DO);
1092
1093 visit_instructions(&ir->body_instructions);
1094
1095 emit(BRW_OPCODE_WHILE);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop_jump *ir)
1100 {
1101 switch (ir->mode) {
1102 case ir_loop_jump::jump_break:
1103 emit(BRW_OPCODE_BREAK);
1104 break;
1105 case ir_loop_jump::jump_continue:
1106 emit(BRW_OPCODE_CONTINUE);
1107 break;
1108 }
1109 }
1110
1111
1112 void
1113 vec4_visitor::visit(ir_function_signature *)
1114 {
1115 unreachable("not reached");
1116 }
1117
1118 void
1119 vec4_visitor::visit(ir_function *ir)
1120 {
1121 /* Ignore function bodies other than main() -- we shouldn't see calls to
1122 * them since they should all be inlined.
1123 */
1124 if (strcmp(ir->name, "main") == 0) {
1125 const ir_function_signature *sig;
1126 exec_list empty;
1127
1128 sig = ir->matching_signature(NULL, &empty, false);
1129
1130 assert(sig);
1131
1132 visit_instructions(&sig->body);
1133 }
1134 }
1135
1136 bool
1137 vec4_visitor::try_emit_mad(ir_expression *ir)
1138 {
1139 /* 3-src instructions were introduced in gen6. */
1140 if (brw->gen < 6)
1141 return false;
1142
1143 /* MAD can only handle floating-point data. */
1144 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1145 return false;
1146
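/* MAD needs exactly one multiply operand; accept the mul on either side of
 * the add.
 */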
1147 ir_rvalue *nonmul = ir->operands[1];
1148 ir_expression *mul = ir->operands[0]->as_expression();
1149
1150 if (!mul || mul->operation != ir_binop_mul) {
1151 nonmul = ir->operands[0];
1152 mul = ir->operands[1]->as_expression();
1153
1154 if (!mul || mul->operation != ir_binop_mul)
1155 return false;
1156 }
1157
1158 nonmul->accept(this);
1159 src_reg src0 = fix_3src_operand(this->result);
1160
1161 mul->operands[0]->accept(this);
1162 src_reg src1 = fix_3src_operand(this->result);
1163
1164 mul->operands[1]->accept(this);
1165 src_reg src2 = fix_3src_operand(this->result);
1166
1167 this->result = src_reg(this, ir->type);
1168 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1169
1170 return true;
1171 }
1172
1173 bool
1174 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1175 {
1176 /* This optimization relies on CMP setting the destination to 0 when
1177 * false. Early hardware only sets the least significant bit, and
1178 * leaves the other bits undefined. So we can't use it.
1179 */
1180 if (brw->gen < 6)
1181 return false;
1182
1183 ir_expression *const cmp = ir->operands[0]->as_expression();
1184
1185 if (cmp == NULL)
1186 return false;
1187
1188 switch (cmp->operation) {
1189 case ir_binop_less:
1190 case ir_binop_greater:
1191 case ir_binop_lequal:
1192 case ir_binop_gequal:
1193 case ir_binop_equal:
1194 case ir_binop_nequal:
1195 break;
1196
1197 default:
1198 return false;
1199 }
1200
1201 cmp->operands[0]->accept(this);
1202 const src_reg cmp_src0 = this->result;
1203
1204 cmp->operands[1]->accept(this);
1205 const src_reg cmp_src1 = this->result;
1206
1207 this->result = src_reg(this, ir->type);
1208
1209 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1210 brw_conditional_for_comparison(cmp->operation)));
1211
1212 /* If the comparison is false, this->result will just happen to be zero.
1213 */
1214 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1215 this->result, src_reg(1.0f));
1216 inst->predicate = BRW_PREDICATE_NORMAL;
1217 inst->predicate_inverse = true;
1218
1219 return true;
1220 }
1221
1222 void
1223 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1224 src_reg src0, src_reg src1)
1225 {
1226 vec4_instruction *inst;
1227
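/* Gen6+ can fold the comparison into a SEL with a conditional modifier;
 * older parts need an explicit CMP followed by a predicated SEL.
 */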
1228 if (brw->gen >= 6) {
1229 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1230 inst->conditional_mod = conditionalmod;
1231 } else {
1232 emit(CMP(dst, src0, src1, conditionalmod));
1233
1234 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1235 inst->predicate = BRW_PREDICATE_NORMAL;
1236 }
1237 }
1238
1239 void
1240 vec4_visitor::emit_lrp(const dst_reg &dst,
1241 const src_reg &x, const src_reg &y, const src_reg &a)
1242 {
1243 if (brw->gen >= 6) {
1244 /* Note that the instruction's argument order is reversed from GLSL
1245 * and the IR.
1246 */
1247 emit(LRP(dst,
1248 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1249 } else {
1250 /* Earlier generations don't support three source operations, so we
1251 * need to emit x*(1-a) + y*a.
1252 */
1253 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1254 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1255 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1256 y_times_a.writemask = dst.writemask;
1257 one_minus_a.writemask = dst.writemask;
1258 x_times_one_minus_a.writemask = dst.writemask;
1259
1260 emit(MUL(y_times_a, y, a));
1261 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1262 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1263 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1264 }
1265 }
1266
1267 void
1268 vec4_visitor::visit(ir_expression *ir)
1269 {
1270 unsigned int operand;
1271 src_reg op[Elements(ir->operands)];
1272 vec4_instruction *inst;
1273
1274 if (ir->operation == ir_binop_add) {
1275 if (try_emit_mad(ir))
1276 return;
1277 }
1278
1279 if (ir->operation == ir_unop_b2f) {
1280 if (try_emit_b2f_of_compare(ir))
1281 return;
1282 }
1283
1284 /* Storage for our result. Ideally for an assignment we'd be using
1285 * the actual storage for the result here, instead.
1286 */
1287 dst_reg result_dst(this, ir->type);
1288 src_reg result_src(result_dst);
1289
1290 if (ir->operation == ir_triop_csel) {
1291 ir->operands[1]->accept(this);
1292 op[1] = this->result;
1293 ir->operands[2]->accept(this);
1294 op[2] = this->result;
1295
1296 enum brw_predicate predicate;
1297 emit_bool_to_cond_code(ir->operands[0], &predicate);
1298 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1299 inst->predicate = predicate;
1300 this->result = result_src;
1301 return;
1302 }
1303
1304 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1305 this->result.file = BAD_FILE;
1306 ir->operands[operand]->accept(this);
1307 if (this->result.file == BAD_FILE) {
1308 fprintf(stderr, "Failed to get tree for expression operand:\n");
1309 ir->operands[operand]->fprint(stderr);
1310 exit(1);
1311 }
1312 op[operand] = this->result;
1313
1314 /* Matrix expression operands should have been broken down to vector
1315 * operations already.
1316 */
1317 assert(!ir->operands[operand]->type->is_matrix());
1318 }
1319
1320 /* If nothing special happens, this is the result. */
1321 this->result = result_src;
1322
1323 switch (ir->operation) {
1324 case ir_unop_logic_not:
1325 if (ctx->Const.UniformBooleanTrue != 1) {
1326 emit(NOT(result_dst, op[0]));
1327 } else {
1328 emit(XOR(result_dst, op[0], src_reg(1u)));
1329 }
1330 break;
1331 case ir_unop_neg:
1332 op[0].negate = !op[0].negate;
1333 emit(MOV(result_dst, op[0]));
1334 break;
1335 case ir_unop_abs:
1336 op[0].abs = true;
1337 op[0].negate = false;
1338 emit(MOV(result_dst, op[0]));
1339 break;
1340
1341 case ir_unop_sign:
1342 if (ir->type->is_float()) {
1343 /* AND(val, 0x80000000) gives the sign bit.
1344 *
1345 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1346 * zero.
1347 */
1348 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1349
1350 op[0].type = BRW_REGISTER_TYPE_UD;
1351 result_dst.type = BRW_REGISTER_TYPE_UD;
1352 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1353
1354 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1355 inst->predicate = BRW_PREDICATE_NORMAL;
1356
1357 this->result.type = BRW_REGISTER_TYPE_F;
1358 } else {
1359 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1360 * -> non-negative val generates 0x00000000.
1361 * Predicated OR sets 1 if val is positive.
1362 */
1363 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1364
1365 emit(ASR(result_dst, op[0], src_reg(31)));
1366
1367 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1368 inst->predicate = BRW_PREDICATE_NORMAL;
1369 }
1370 break;
1371
1372 case ir_unop_rcp:
1373 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1374 break;
1375
1376 case ir_unop_exp2:
1377 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1378 break;
1379 case ir_unop_log2:
1380 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1381 break;
1382 case ir_unop_exp:
1383 case ir_unop_log:
1384 unreachable("not reached: should be handled by ir_explog_to_explog2");
1385 case ir_unop_sin:
1386 case ir_unop_sin_reduced:
1387 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1388 break;
1389 case ir_unop_cos:
1390 case ir_unop_cos_reduced:
1391 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1392 break;
1393
1394 case ir_unop_dFdx:
1395 case ir_unop_dFdx_coarse:
1396 case ir_unop_dFdx_fine:
1397 case ir_unop_dFdy:
1398 case ir_unop_dFdy_coarse:
1399 case ir_unop_dFdy_fine:
1400 unreachable("derivatives not valid in vertex shader");
1401
1402 case ir_unop_bitfield_reverse:
1403 emit(BFREV(result_dst, op[0]));
1404 break;
1405 case ir_unop_bit_count:
1406 emit(CBIT(result_dst, op[0]));
1407 break;
1408 case ir_unop_find_msb: {
1409 src_reg temp = src_reg(this, glsl_type::uint_type);
1410
1411 inst = emit(FBH(dst_reg(temp), op[0]));
1412 inst->dst.writemask = WRITEMASK_XYZW;
1413
1414 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1415 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1416 * subtract the result from 31 to convert the MSB count into an LSB count.
1417 */
1418
1419 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1420 temp.swizzle = BRW_SWIZZLE_NOOP;
1421 emit(MOV(result_dst, temp));
1422
1423 src_reg src_tmp = src_reg(result_dst);
1424 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1425
1426 src_tmp.negate = true;
1427 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1428 inst->predicate = BRW_PREDICATE_NORMAL;
1429 break;
1430 }
1431 case ir_unop_find_lsb:
1432 emit(FBL(result_dst, op[0]));
1433 break;
1434 case ir_unop_saturate:
1435 inst = emit(MOV(result_dst, op[0]));
1436 inst->saturate = true;
1437 break;
1438
1439 case ir_unop_noise:
1440 unreachable("not reached: should be handled by lower_noise");
1441
1442 case ir_binop_add:
1443 emit(ADD(result_dst, op[0], op[1]));
1444 break;
1445 case ir_binop_sub:
1446 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1447
1448 case ir_binop_mul:
1449 if (brw->gen < 8 && ir->type->is_integer()) {
1450 /* For integer multiplication, the MUL uses the low 16 bits of one of
1451 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1452 * accumulates in the contribution of the upper 16 bits of that
1453 * operand. If we can determine that one of the args is in the low
1454 * 16 bits, though, we can just emit a single MUL.
1455 */
1456 if (ir->operands[0]->is_uint16_constant()) {
1457 if (brw->gen < 7)
1458 emit(MUL(result_dst, op[0], op[1]));
1459 else
1460 emit(MUL(result_dst, op[1], op[0]));
1461 } else if (ir->operands[1]->is_uint16_constant()) {
1462 if (brw->gen < 7)
1463 emit(MUL(result_dst, op[1], op[0]));
1464 else
1465 emit(MUL(result_dst, op[0], op[1]));
1466 } else {
1467 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1468
1469 emit(MUL(acc, op[0], op[1]));
1470 emit(MACH(dst_null_d(), op[0], op[1]));
1471 emit(MOV(result_dst, src_reg(acc)));
1472 }
1473 } else {
1474 emit(MUL(result_dst, op[0], op[1]));
1475 }
1476 break;
1477 case ir_binop_imul_high: {
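/* MUL primes the accumulator with the product; MACH then yields the high
 * 32 bits of the 64-bit result.
 */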
1478 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1479
1480 emit(MUL(acc, op[0], op[1]));
1481 emit(MACH(result_dst, op[0], op[1]));
1482 break;
1483 }
1484 case ir_binop_div:
1485 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1486 assert(ir->type->is_integer());
1487 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1488 break;
1489 case ir_binop_carry: {
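/* ADDC writes the per-channel carry bits of the unsigned add into the
 * accumulator; copy them out with a MOV.
 */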
1490 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1491
1492 emit(ADDC(dst_null_ud(), op[0], op[1]));
1493 emit(MOV(result_dst, src_reg(acc)));
1494 break;
1495 }
1496 case ir_binop_borrow: {
1497 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1498
1499 emit(SUBB(dst_null_ud(), op[0], op[1]));
1500 emit(MOV(result_dst, src_reg(acc)));
1501 break;
1502 }
1503 case ir_binop_mod:
1504 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1505 assert(ir->type->is_integer());
1506 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1507 break;
1508
1509 case ir_binop_less:
1510 case ir_binop_greater:
1511 case ir_binop_lequal:
1512 case ir_binop_gequal:
1513 case ir_binop_equal:
1514 case ir_binop_nequal: {
1515 emit(CMP(result_dst, op[0], op[1],
1516 brw_conditional_for_comparison(ir->operation)));
1517 if (ctx->Const.UniformBooleanTrue == 1) {
1518 emit(AND(result_dst, result_src, src_reg(1u)));
1519 }
1520 break;
1521 }
1522
1523 case ir_binop_all_equal:
1524 /* "==" operator producing a scalar boolean. */
1525 if (ir->operands[0]->type->is_vector() ||
1526 ir->operands[1]->type->is_vector()) {
1527 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1528 emit(MOV(result_dst, src_reg(0)));
1529 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1530 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1531 } else {
1532 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1533 if (ctx->Const.UniformBooleanTrue == 1) {
1534 emit(AND(result_dst, result_src, src_reg(1u)));
1535 }
1536 }
1537 break;
1538 case ir_binop_any_nequal:
1539 /* "!=" operator producing a scalar boolean. */
1540 if (ir->operands[0]->type->is_vector() ||
1541 ir->operands[1]->type->is_vector()) {
1542 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1543
1544 emit(MOV(result_dst, src_reg(0)));
1545 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1546 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1547 } else {
1548 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1549 if (ctx->Const.UniformBooleanTrue == 1) {
1550 emit(AND(result_dst, result_src, src_reg(1u)));
1551 }
1552 }
1553 break;
1554
1555 case ir_unop_any:
1556 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1557 emit(MOV(result_dst, src_reg(0)));
1558
1559 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1560 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1561 break;
1562
1563 case ir_binop_logic_xor:
1564 emit(XOR(result_dst, op[0], op[1]));
1565 break;
1566
1567 case ir_binop_logic_or:
1568 emit(OR(result_dst, op[0], op[1]));
1569 break;
1570
1571 case ir_binop_logic_and:
1572 emit(AND(result_dst, op[0], op[1]));
1573 break;
1574
1575 case ir_binop_dot:
1576 assert(ir->operands[0]->type->is_vector());
1577 assert(ir->operands[0]->type == ir->operands[1]->type);
1578 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1579 break;
1580
1581 case ir_unop_sqrt:
1582 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1583 break;
1584 case ir_unop_rsq:
1585 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1586 break;
1587
1588 case ir_unop_bitcast_i2f:
1589 case ir_unop_bitcast_u2f:
1590 this->result = op[0];
1591 this->result.type = BRW_REGISTER_TYPE_F;
1592 break;
1593
1594 case ir_unop_bitcast_f2i:
1595 this->result = op[0];
1596 this->result.type = BRW_REGISTER_TYPE_D;
1597 break;
1598
1599 case ir_unop_bitcast_f2u:
1600 this->result = op[0];
1601 this->result.type = BRW_REGISTER_TYPE_UD;
1602 break;
1603
1604 case ir_unop_i2f:
1605 case ir_unop_i2u:
1606 case ir_unop_u2i:
1607 case ir_unop_u2f:
1608 case ir_unop_f2i:
1609 case ir_unop_f2u:
1610 emit(MOV(result_dst, op[0]));
1611 break;
1612 case ir_unop_b2i:
1613 if (ctx->Const.UniformBooleanTrue != 1) {
1614 emit(AND(result_dst, op[0], src_reg(1u)));
1615 } else {
1616 emit(MOV(result_dst, op[0]));
1617 }
1618 break;
1619 case ir_unop_b2f:
1620 if (ctx->Const.UniformBooleanTrue != 1) {
1621 op[0].type = BRW_REGISTER_TYPE_UD;
1622 result_dst.type = BRW_REGISTER_TYPE_UD;
1623 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1624 result_dst.type = BRW_REGISTER_TYPE_F;
1625 } else {
1626 emit(MOV(result_dst, op[0]));
1627 }
1628 break;
1629 case ir_unop_f2b:
1630 case ir_unop_i2b:
1631 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1632 if (ctx->Const.UniformBooleanTrue == 1) {
1633 emit(AND(result_dst, result_src, src_reg(1u)));
1634 }
1635 break;
1636
1637 case ir_unop_trunc:
1638 emit(RNDZ(result_dst, op[0]));
1639 break;
1640 case ir_unop_ceil:
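/* ceil(x) == -floor(-x): negate the operand, round down with RNDD, then
 * negate the result via the source modifier.
 */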
1641 op[0].negate = !op[0].negate;
1642 inst = emit(RNDD(result_dst, op[0]));
1643 this->result.negate = true;
1644 break;
1645 case ir_unop_floor:
1646 inst = emit(RNDD(result_dst, op[0]));
1647 break;
1648 case ir_unop_fract:
1649 inst = emit(FRC(result_dst, op[0]));
1650 break;
1651 case ir_unop_round_even:
1652 emit(RNDE(result_dst, op[0]));
1653 break;
1654
1655 case ir_binop_min:
1656 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1657 break;
1658 case ir_binop_max:
1659 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1660 break;
1661
1662 case ir_binop_pow:
1663 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1664 break;
1665
1666 case ir_unop_bit_not:
1667 inst = emit(NOT(result_dst, op[0]));
1668 break;
1669 case ir_binop_bit_and:
1670 inst = emit(AND(result_dst, op[0], op[1]));
1671 break;
1672 case ir_binop_bit_xor:
1673 inst = emit(XOR(result_dst, op[0], op[1]));
1674 break;
1675 case ir_binop_bit_or:
1676 inst = emit(OR(result_dst, op[0], op[1]));
1677 break;
1678
1679 case ir_binop_lshift:
1680 inst = emit(SHL(result_dst, op[0], op[1]));
1681 break;
1682
1683 case ir_binop_rshift:
1684 if (ir->type->base_type == GLSL_TYPE_INT)
1685 inst = emit(ASR(result_dst, op[0], op[1]));
1686 else
1687 inst = emit(SHR(result_dst, op[0], op[1]));
1688 break;
1689
1690 case ir_binop_bfm:
1691 emit(BFI1(result_dst, op[0], op[1]));
1692 break;
1693
1694 case ir_binop_ubo_load: {
1695 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1696 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1697 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1698 src_reg offset;
1699
1700 /* Now, load the vector from that offset. */
1701 assert(ir->type->is_vector() || ir->type->is_scalar());
1702
1703 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1704 packed_consts.type = result.type;
1705 src_reg surf_index;
1706
1707 if (const_uniform_block) {
1708 /* The block index is a constant, so just emit the binding table entry
1709 * as an immediate.
1710 */
1711 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1712 const_uniform_block->value.u[0]);
1713 } else {
1714 /* The block index is not a constant. Evaluate the index expression
1715 * per-channel and add the base UBO index; the generator will select
1716 * a value from any live channel.
1717 */
1718 surf_index = src_reg(this, glsl_type::uint_type);
1719 emit(ADD(dst_reg(surf_index), op[0],
1720 src_reg(prog_data->base.binding_table.ubo_start)));
1721
1722 /* Assume this may touch any UBO. It would be nice to provide
1723 * a tighter bound, but the array information is already lowered away.
1724 */
1725 brw_mark_surface_used(&prog_data->base,
1726 prog_data->base.binding_table.ubo_start +
1727 shader_prog->NumUniformBlocks - 1);
1728 }
1729
1730 if (const_offset_ir) {
1731 if (brw->gen >= 8) {
1732 /* Store the offset in a GRF so we can send-from-GRF. */
1733 offset = src_reg(this, glsl_type::int_type);
1734 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1735 } else {
1736 /* Immediates are fine on older generations since they'll be moved
1737 * to a (potentially fake) MRF at the generator level.
1738 */
1739 offset = src_reg(const_offset / 16);
1740 }
1741 } else {
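/* Dynamic offset: shift the byte offset right by 4 to get the 16-byte (vec4)
 * units used by the constant-offset path above.
 */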
1742 offset = src_reg(this, glsl_type::uint_type);
1743 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1744 }
1745
1746 if (brw->gen >= 7) {
1747 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1748 grf_offset.type = offset.type;
1749
1750 emit(MOV(grf_offset, offset));
1751
1752 emit(new(mem_ctx) vec4_instruction(this,
1753 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1754 dst_reg(packed_consts),
1755 surf_index,
1756 src_reg(grf_offset)));
1757 } else {
1758 vec4_instruction *pull =
1759 emit(new(mem_ctx) vec4_instruction(this,
1760 VS_OPCODE_PULL_CONSTANT_LOAD,
1761 dst_reg(packed_consts),
1762 surf_index,
1763 offset));
1764 pull->base_mrf = 14;
1765 pull->mlen = 1;
1766 }
1767
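/* Select the loaded components: start from the size-based swizzle, then bias
 * every channel by the dword position of const_offset within its 16-byte block.
 */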
1768 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1769 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1770 const_offset % 16 / 4,
1771 const_offset % 16 / 4,
1772 const_offset % 16 / 4);
1773
1774 /* UBO bools are any nonzero int. We need to convert them to use the
1775 * value of true stored in ctx->Const.UniformBooleanTrue.
1776 */
1777 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1778 emit(CMP(result_dst, packed_consts, src_reg(0u),
1779 BRW_CONDITIONAL_NZ));
1780 if (ctx->Const.UniformBooleanTrue == 1) {
1781 emit(AND(result_dst, result, src_reg(1u)));
1782 }
1783 } else {
1784 emit(MOV(result_dst, packed_consts));
1785 }
1786 break;
1787 }
1788
1789 case ir_binop_vector_extract:
1790 unreachable("should have been lowered by vec_index_to_cond_assign");
1791
1792 case ir_triop_fma:
1793 op[0] = fix_3src_operand(op[0]);
1794 op[1] = fix_3src_operand(op[1]);
1795 op[2] = fix_3src_operand(op[2]);
1796 /* Note that the instruction's argument order is reversed from GLSL
1797 * and the IR.
1798 */
1799 emit(MAD(result_dst, op[2], op[1], op[0]));
1800 break;
1801
1802 case ir_triop_lrp:
1803 emit_lrp(result_dst, op[0], op[1], op[2]);
1804 break;
1805
1806 case ir_triop_csel:
1807 unreachable("already handled above");
1808 break;
1809
1810 case ir_triop_bfi:
1811 op[0] = fix_3src_operand(op[0]);
1812 op[1] = fix_3src_operand(op[1]);
1813 op[2] = fix_3src_operand(op[2]);
1814 emit(BFI2(result_dst, op[0], op[1], op[2]));
1815 break;
1816
1817 case ir_triop_bitfield_extract:
1818 op[0] = fix_3src_operand(op[0]);
1819 op[1] = fix_3src_operand(op[1]);
1820 op[2] = fix_3src_operand(op[2]);
1821 /* Note that the instruction's argument order is reversed from GLSL
1822 * and the IR.
1823 */
1824 emit(BFE(result_dst, op[2], op[1], op[0]));
1825 break;
1826
1827 case ir_triop_vector_insert:
1828 unreachable("should have been lowered by lower_vector_insert");
1829
1830 case ir_quadop_bitfield_insert:
1831 unreachable("not reached: should be handled by "
1832 "bitfield_insert_to_bfm_bfi\n");
1833
1834 case ir_quadop_vector:
1835 unreachable("not reached: should be handled by lower_quadop_vector");
1836
1837 case ir_unop_pack_half_2x16:
1838 emit_pack_half_2x16(result_dst, op[0]);
1839 break;
1840 case ir_unop_unpack_half_2x16:
1841 emit_unpack_half_2x16(result_dst, op[0]);
1842 break;
1843 case ir_unop_unpack_unorm_4x8:
1844 emit_unpack_unorm_4x8(result_dst, op[0]);
1845 break;
1846 case ir_unop_unpack_snorm_4x8:
1847 emit_unpack_snorm_4x8(result_dst, op[0]);
1848 break;
1849 case ir_unop_pack_unorm_4x8:
1850 emit_pack_unorm_4x8(result_dst, op[0]);
1851 break;
1852 case ir_unop_pack_snorm_4x8:
1853 emit_pack_snorm_4x8(result_dst, op[0]);
1854 break;
1855 case ir_unop_pack_snorm_2x16:
1856 case ir_unop_pack_unorm_2x16:
1857 case ir_unop_unpack_snorm_2x16:
1858 case ir_unop_unpack_unorm_2x16:
1859 unreachable("not reached: should be handled by lower_packing_builtins");
1860 case ir_unop_unpack_half_2x16_split_x:
1861 case ir_unop_unpack_half_2x16_split_y:
1862 case ir_binop_pack_half_2x16_split:
1863 case ir_unop_interpolate_at_centroid:
1864 case ir_binop_interpolate_at_sample:
1865 case ir_binop_interpolate_at_offset:
1866 unreachable("not reached: should not occur in vertex shader");
1867 case ir_binop_ldexp:
1868 unreachable("not reached: should be handled by ldexp_to_arith()");
1869 }
1870 }
1871
1872
1873 void
1874 vec4_visitor::visit(ir_swizzle *ir)
1875 {
1876 src_reg src;
1877 int i = 0;
1878 int swizzle[4];
1879
1880 /* Note that this is only swizzles in expressions, not those on the left
1881 * hand side of an assignment, which do write masking. See ir_assignment
1882 * for that.
1883 */
1884
1885 ir->val->accept(this);
1886 src = this->result;
1887 assert(src.file != BAD_FILE);
1888
1889 for (i = 0; i < ir->type->vector_elements; i++) {
1890 switch (i) {
1891 case 0:
1892 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1893 break;
1894 case 1:
1895 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1896 break;
1897 case 2:
1898 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1899 break;
1900 case 3:
1901 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1902 break;
1903 }
1904 }
1905 for (; i < 4; i++) {
1906 /* Replicate the last channel out. */
1907 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1908 }
1909
1910 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1911
1912 this->result = src;
1913 }
1914
1915 void
1916 vec4_visitor::visit(ir_dereference_variable *ir)
1917 {
1918 const struct glsl_type *type = ir->type;
1919 dst_reg *reg = variable_storage(ir->var);
1920
1921 if (!reg) {
1922 fail("Failed to find variable storage for %s\n", ir->var->name);
1923 this->result = src_reg(brw_null_reg());
1924 return;
1925 }
1926
1927 this->result = src_reg(*reg);
1928
1929 /* System values get their swizzle from the dst_reg writemask */
1930 if (ir->var->data.mode == ir_var_system_value)
1931 return;
1932
1933 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1934 this->result.swizzle = swizzle_for_size(type->vector_elements);
1935 }
1936
1937
1938 int
1939 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1940 {
1941 /* Under normal circumstances array elements are stored consecutively, so
1942 * the stride is equal to the size of the array element.
1943 */
1944 return type_size(ir->type);
1945 }
1946
1947
1948 void
1949 vec4_visitor::visit(ir_dereference_array *ir)
1950 {
1951 ir_constant *constant_index;
1952 src_reg src;
1953 int array_stride = compute_array_stride(ir);
1954
1955 constant_index = ir->array_index->constant_expression_value();
1956
1957 ir->array->accept(this);
1958 src = this->result;
1959
1960 if (constant_index) {
1961 src.reg_offset += constant_index->value.i[0] * array_stride;
1962 } else {
1963 /* Variable index array dereference. It eats the "vec4" of the
1964 * base of the array and an index that offsets the Mesa register
1965 * index.
1966 */
1967 ir->array_index->accept(this);
1968
1969 src_reg index_reg;
1970
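/* Example (illustrative): an array of mat4 has array_stride 4 (four
 * registers per element), so the GLSL index is multiplied by 4 below
 * before being used as a relative address; a vec4 array has stride 1
 * and needs no MUL.
 */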
1971 if (array_stride == 1) {
1972 index_reg = this->result;
1973 } else {
1974 index_reg = src_reg(this, glsl_type::int_type);
1975
1976 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1977 }
1978
1979 if (src.reladdr) {
1980 src_reg temp = src_reg(this, glsl_type::int_type);
1981
1982 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1983
1984 index_reg = temp;
1985 }
1986
1987 src.reladdr = ralloc(mem_ctx, src_reg);
1988 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1989 }
1990
1991 /* If the type is smaller than a vec4, replicate the last channel out. */
1992 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1993 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1994 else
1995 src.swizzle = BRW_SWIZZLE_NOOP;
1996 src.type = brw_type_for_base_type(ir->type);
1997
1998 this->result = src;
1999 }
2000
2001 void
2002 vec4_visitor::visit(ir_dereference_record *ir)
2003 {
2004 unsigned int i;
2005 const glsl_type *struct_type = ir->record->type;
2006 int offset = 0;
2007
2008 ir->record->accept(this);
2009
2010 for (i = 0; i < struct_type->length; i++) {
2011 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2012 break;
2013 offset += type_size(struct_type->fields.structure[i].type);
2014 }
2015
2016 /* If the type is smaller than a vec4, replicate the last channel out. */
2017 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2018 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2019 else
2020 this->result.swizzle = BRW_SWIZZLE_NOOP;
2021 this->result.type = brw_type_for_base_type(ir->type);
2022
2023 this->result.reg_offset += offset;
2024 }
2025
2026 /**
2027 * We want to be careful in assignment setup to hit the actual storage
2028 * instead of potentially using a temporary like we might with the
2029 * ir_dereference handler.
2030 */
2031 static dst_reg
2032 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2033 {
2034 /* The LHS must be a dereference. If the LHS is a variable indexed array
2035 * access of a vector, it must be separated into a series of conditional moves
2036 * before reaching this point (see ir_vec_index_to_cond_assign).
2037 */
2038 assert(ir->as_dereference());
2039 ir_dereference_array *deref_array = ir->as_dereference_array();
2040 if (deref_array) {
2041 assert(!deref_array->array->type->is_vector());
2042 }
2043
2044 /* Use the rvalue deref handler for the most part. We'll ignore
2045 * swizzles in it and write swizzles using writemask, though.
2046 */
2047 ir->accept(v);
2048 return dst_reg(v->result);
2049 }
2050
2051 void
2052 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2053 const struct glsl_type *type,
2054 enum brw_predicate predicate)
2055 {
2056 if (type->base_type == GLSL_TYPE_STRUCT) {
2057 for (unsigned int i = 0; i < type->length; i++) {
2058 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2059 }
2060 return;
2061 }
2062
2063 if (type->is_array()) {
2064 for (unsigned int i = 0; i < type->length; i++) {
2065 emit_block_move(dst, src, type->fields.array, predicate);
2066 }
2067 return;
2068 }
2069
2070 if (type->is_matrix()) {
2071 const struct glsl_type *vec_type;
2072
2073 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2074 type->vector_elements, 1);
2075
2076 for (int i = 0; i < type->matrix_columns; i++) {
2077 emit_block_move(dst, src, vec_type, predicate);
2078 }
2079 return;
2080 }
2081
2082 assert(type->is_scalar() || type->is_vector());
2083
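/* Example (illustrative): a vec3 gets writemask XYZ ((1 << 3) - 1) and a
 * swizzle that replicates the last channel (XYZZ), so the unused fourth
 * source channel is never referenced.
 */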
2084 dst->type = brw_type_for_base_type(type);
2085 src->type = dst->type;
2086
2087 dst->writemask = (1 << type->vector_elements) - 1;
2088
2089 src->swizzle = swizzle_for_size(type->vector_elements);
2090
2091 vec4_instruction *inst = emit(MOV(*dst, *src));
2092 inst->predicate = predicate;
2093
2094 dst->reg_offset++;
2095 src->reg_offset++;
2096 }
2097
2098
2099 /* If the RHS processing resulted in an instruction generating a
2100 * temporary value, and it would be easy to rewrite the instruction to
2101 * generate its result right into the LHS instead, do so. This ends
2102 * up reliably removing instructions where it can be tricky to do so
2103 * later without real UD chain information.
2104 */
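/* For example, for a plain `v = a + b;` the ADD emitted while visiting the
 * RHS initially writes a temporary; when the rewrite succeeds, that ADD is
 * retargeted at v's storage and visit(ir_assignment) skips the copy MOV.
 */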
2105 bool
2106 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2107 dst_reg dst,
2108 src_reg src,
2109 vec4_instruction *pre_rhs_inst,
2110 vec4_instruction *last_rhs_inst)
2111 {
2112 /* This could be supported, but it would take more smarts. */
2113 if (ir->condition)
2114 return false;
2115
2116 if (pre_rhs_inst == last_rhs_inst)
2117 return false; /* No instructions generated to work with. */
2118
2119 /* Make sure the last instruction generated our source reg. */
2120 if (src.file != GRF ||
2121 src.file != last_rhs_inst->dst.file ||
2122 src.reg != last_rhs_inst->dst.reg ||
2123 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2124 src.reladdr ||
2125 src.abs ||
2126 src.negate ||
2127 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2128 return false;
2129
2130 /* Check that the last instruction fully initialized the channels
2131 * we want to use, in the order we want to use them. We could
2132 * potentially reswizzle the operands of many instructions so that
2133 * we could handle out of order channels, but don't yet.
2134 */
2135
2136 for (unsigned i = 0; i < 4; i++) {
2137 if (dst.writemask & (1 << i)) {
2138 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2139 return false;
2140
2141 if (BRW_GET_SWZ(src.swizzle, i) != i)
2142 return false;
2143 }
2144 }
2145
2146 /* Success! Rewrite the instruction. */
2147 last_rhs_inst->dst.file = dst.file;
2148 last_rhs_inst->dst.reg = dst.reg;
2149 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2150 last_rhs_inst->dst.reladdr = dst.reladdr;
2151 last_rhs_inst->dst.writemask &= dst.writemask;
2152
2153 return true;
2154 }
2155
2156 void
2157 vec4_visitor::visit(ir_assignment *ir)
2158 {
2159 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2160 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2161
2162 if (!ir->lhs->type->is_scalar() &&
2163 !ir->lhs->type->is_vector()) {
2164 ir->rhs->accept(this);
2165 src_reg src = this->result;
2166
2167 if (ir->condition) {
2168 emit_bool_to_cond_code(ir->condition, &predicate);
2169 }
2170
2171 /* emit_block_move doesn't account for swizzles in the source register.
2172 * This should be ok, since the source register is a structure or an
2173 * array, and those can't be swizzled. But double-check to be sure.
2174 */
2175 assert(src.swizzle ==
2176 (ir->rhs->type->is_matrix()
2177 ? swizzle_for_size(ir->rhs->type->vector_elements)
2178 : BRW_SWIZZLE_NOOP));
2179
2180 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2181 return;
2182 }
2183
2184 /* Now we're down to just a scalar/vector with writemasks. */
2185 int i;
2186
2187 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2188 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2189
2190 ir->rhs->accept(this);
2191
2192 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2193
2194 src_reg src = this->result;
2195
2196 int swizzles[4];
2197 int first_enabled_chan = 0;
2198 int src_chan = 0;
2199
2200 assert(ir->lhs->type->is_vector() ||
2201 ir->lhs->type->is_scalar());
2202 dst.writemask = ir->write_mask;
2203
2204 for (int i = 0; i < 4; i++) {
2205 if (dst.writemask & (1 << i)) {
2206 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2207 break;
2208 }
2209 }
2210
2211 /* Swizzle a small RHS vector into the channels being written.
2212 *
2213 * GLSL IR treats write_mask as dictating how many channels are
2214 * present on the RHS, while in our instructions we need to make
2215 * those channels appear in the slots of the vec4 they're written to.
2216 */
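/* Example (illustrative): for `v.yz = u.xy` the RHS arrives swizzled XYYY
 * and write_mask is YZ; the loop builds YXYY, so dst.y reads src.x, dst.z
 * reads src.y, and the unwritten channels point at a defined component.
 */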
2217 for (int i = 0; i < 4; i++) {
2218 if (dst.writemask & (1 << i))
2219 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2220 else
2221 swizzles[i] = first_enabled_chan;
2222 }
2223 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2224 swizzles[2], swizzles[3]);
2225
2226 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2227 return;
2228 }
2229
2230 if (ir->condition) {
2231 emit_bool_to_cond_code(ir->condition, &predicate);
2232 }
2233
2234 for (i = 0; i < type_size(ir->lhs->type); i++) {
2235 vec4_instruction *inst = emit(MOV(dst, src));
2236 inst->predicate = predicate;
2237
2238 dst.reg_offset++;
2239 src.reg_offset++;
2240 }
2241 }
2242
2243 void
2244 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2245 {
2246 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2247 foreach_in_list(ir_constant, field_value, &ir->components) {
2248 emit_constant_values(dst, field_value);
2249 }
2250 return;
2251 }
2252
2253 if (ir->type->is_array()) {
2254 for (unsigned int i = 0; i < ir->type->length; i++) {
2255 emit_constant_values(dst, ir->array_elements[i]);
2256 }
2257 return;
2258 }
2259
2260 if (ir->type->is_matrix()) {
2261 for (int i = 0; i < ir->type->matrix_columns; i++) {
2262 float *vec = &ir->value.f[i * ir->type->vector_elements];
2263
2264 for (int j = 0; j < ir->type->vector_elements; j++) {
2265 dst->writemask = 1 << j;
2266 dst->type = BRW_REGISTER_TYPE_F;
2267
2268 emit(MOV(*dst, src_reg(vec[j])));
2269 }
2270 dst->reg_offset++;
2271 }
2272 return;
2273 }
2274
2275 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2276
2277 for (int i = 0; i < ir->type->vector_elements; i++) {
2278 if (!(remaining_writemask & (1 << i)))
2279 continue;
2280
2281 dst->writemask = 1 << i;
2282 dst->type = brw_type_for_base_type(ir->type);
2283
2284 /* Find other components that match the one we're about to
2285 * write. Emits fewer instructions for things like vec4(0.5,
2286 * 1.5, 1.5, 1.5).
2287 */
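/* In that vec4(0.5, 1.5, 1.5, 1.5) example, the first pass writes .x with
 * 0.5 and the second writes .yzw with 1.5, so two MOVs replace four.
 */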
2288 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2289 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2290 if (ir->value.b[i] == ir->value.b[j])
2291 dst->writemask |= (1 << j);
2292 } else {
2293 /* u, i, and f storage all line up, so no need for a
2294 * switch case for comparing each type.
2295 */
2296 if (ir->value.u[i] == ir->value.u[j])
2297 dst->writemask |= (1 << j);
2298 }
2299 }
2300
2301 switch (ir->type->base_type) {
2302 case GLSL_TYPE_FLOAT:
2303 emit(MOV(*dst, src_reg(ir->value.f[i])));
2304 break;
2305 case GLSL_TYPE_INT:
2306 emit(MOV(*dst, src_reg(ir->value.i[i])));
2307 break;
2308 case GLSL_TYPE_UINT:
2309 emit(MOV(*dst, src_reg(ir->value.u[i])));
2310 break;
2311 case GLSL_TYPE_BOOL:
2312 emit(MOV(*dst,
2313 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2314 : 0u)));
2315 break;
2316 default:
2317 unreachable("Non-float/uint/int/bool constant");
2318 }
2319
2320 remaining_writemask &= ~dst->writemask;
2321 }
2322 dst->reg_offset++;
2323 }
2324
2325 void
2326 vec4_visitor::visit(ir_constant *ir)
2327 {
2328 dst_reg dst = dst_reg(this, ir->type);
2329 this->result = src_reg(dst);
2330
2331 emit_constant_values(&dst, ir);
2332 }
2333
2334 void
2335 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2336 {
2337 ir_dereference *deref = static_cast<ir_dereference *>(
2338 ir->actual_parameters.get_head());
2339 ir_variable *location = deref->variable_referenced();
2340 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2341 location->data.binding);
2342
2343 /* Calculate the surface offset */
2344 src_reg offset(this, glsl_type::uint_type);
2345 ir_dereference_array *deref_array = deref->as_dereference_array();
2346 if (deref_array) {
2347 deref_array->array_index->accept(this);
2348
2349 src_reg tmp(this, glsl_type::uint_type);
2350 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2351 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2352 } else {
2353 offset = location->data.atomic.offset;
2354 }
2355
2356 /* Emit the appropriate machine instruction */
2357 const char *callee = ir->callee->function_name();
2358 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2359
2360 if (!strcmp("__intrinsic_atomic_read", callee)) {
2361 emit_untyped_surface_read(surf_index, dst, offset);
2362
2363 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2364 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2365 src_reg(), src_reg());
2366
2367 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2368 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2369 src_reg(), src_reg());
2370 }
2371 }
2372
2373 void
2374 vec4_visitor::visit(ir_call *ir)
2375 {
2376 const char *callee = ir->callee->function_name();
2377
2378 if (!strcmp("__intrinsic_atomic_read", callee) ||
2379 !strcmp("__intrinsic_atomic_increment", callee) ||
2380 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2381 visit_atomic_counter_intrinsic(ir);
2382 } else {
2383 unreachable("Unsupported intrinsic.");
2384 }
2385 }
2386
2387 src_reg
2388 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2389 {
2390 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2391 inst->base_mrf = 2;
2392 inst->mlen = 1;
2393 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2394 inst->dst.writemask = WRITEMASK_XYZW;
2395
2396 inst->src[1] = sampler;
2397
2398 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2399 int param_base = inst->base_mrf;
2400 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2401 int zero_mask = 0xf & ~coord_mask;
2402
2403 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2404 coordinate));
2405
2406 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2407 src_reg(0)));
2408
2409 emit(inst);
2410 return src_reg(inst->dst);
2411 }
2412
2413 static bool
2414 is_high_sampler(struct brw_context *brw, src_reg sampler)
2415 {
2416 if (brw->gen < 8 && !brw->is_haswell)
2417 return false;
2418
2419 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2420 }
2421
2422 void
2423 vec4_visitor::visit(ir_texture *ir)
2424 {
2425 uint32_t sampler =
2426 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2427
2428 ir_rvalue *nonconst_sampler_index =
2429 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2430
2431 /* Handle non-constant sampler array indexing */
2432 src_reg sampler_reg;
2433 if (nonconst_sampler_index) {
2434 /* The highest sampler which may be used by this operation is
2435 * the last element of the array. Mark it here, because the generator
2436 * doesn't have enough information to determine the bound.
2437 */
2438 uint32_t array_size = ir->sampler->as_dereference_array()
2439 ->array->type->array_size();
2440
2441 uint32_t max_used = sampler + array_size - 1;
2442 if (ir->op == ir_tg4 && brw->gen < 8) {
2443 max_used += prog_data->base.binding_table.gather_texture_start;
2444 } else {
2445 max_used += prog_data->base.binding_table.texture_start;
2446 }
2447
2448 brw_mark_surface_used(&prog_data->base, max_used);
2449
2450 /* Emit code to evaluate the actual indexing expression */
2451 nonconst_sampler_index->accept(this);
2452 dst_reg temp(this, glsl_type::uint_type);
2453 emit(ADD(temp, this->result, src_reg(sampler)))
2454 ->force_writemask_all = true;
2455 sampler_reg = src_reg(temp);
2456 } else {
2457 /* Single sampler, or constant array index; the indexing expression
2458 * is just an immediate.
2459 */
2460 sampler_reg = src_reg(sampler);
2461 }
2462
2463 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2464 * emitting anything other than setting up the constant result.
2465 */
2466 if (ir->op == ir_tg4) {
2467 ir_constant *chan = ir->lod_info.component->as_constant();
2468 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2469 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2470 dst_reg result(this, ir->type);
2471 this->result = src_reg(result);
2472 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2473 return;
2474 }
2475 }
2476
2477 /* Should be lowered by do_lower_texture_projection */
2478 assert(!ir->projector);
2479
2480 /* Should be lowered */
2481 assert(!ir->offset || !ir->offset->type->is_array());
2482
2483 /* Generate code to compute all the subexpression trees. This has to be
2484 * done before loading any values into MRFs for the sampler message since
2485 * generating these values may involve SEND messages that need the MRFs.
2486 */
2487 src_reg coordinate;
2488 if (ir->coordinate) {
2489 ir->coordinate->accept(this);
2490 coordinate = this->result;
2491 }
2492
2493 src_reg shadow_comparitor;
2494 if (ir->shadow_comparitor) {
2495 ir->shadow_comparitor->accept(this);
2496 shadow_comparitor = this->result;
2497 }
2498
2499 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2500 src_reg offset_value;
2501 if (has_nonconstant_offset) {
2502 ir->offset->accept(this);
2503 offset_value = src_reg(this->result);
2504 }
2505
2506 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2507 src_reg lod, dPdx, dPdy, sample_index, mcs;
2508 switch (ir->op) {
2509 case ir_tex:
2510 lod = src_reg(0.0f);
2511 lod_type = glsl_type::float_type;
2512 break;
2513 case ir_txf:
2514 case ir_txl:
2515 case ir_txs:
2516 ir->lod_info.lod->accept(this);
2517 lod = this->result;
2518 lod_type = ir->lod_info.lod->type;
2519 break;
2520 case ir_query_levels:
2521 lod = src_reg(0);
2522 lod_type = glsl_type::int_type;
2523 break;
2524 case ir_txf_ms:
2525 ir->lod_info.sample_index->accept(this);
2526 sample_index = this->result;
2527 sample_index_type = ir->lod_info.sample_index->type;
2528
2529 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2530 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2531 else
2532 mcs = src_reg(0u);
2533 break;
2534 case ir_txd:
2535 ir->lod_info.grad.dPdx->accept(this);
2536 dPdx = this->result;
2537
2538 ir->lod_info.grad.dPdy->accept(this);
2539 dPdy = this->result;
2540
2541 lod_type = ir->lod_info.grad.dPdx->type;
2542 break;
2543 case ir_txb:
2544 case ir_lod:
2545 case ir_tg4:
2546 break;
2547 }
2548
2549 enum opcode opcode;
2550 switch (ir->op) {
2551 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2552 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2553 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2554 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2555 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2556 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2557 case ir_tg4: opcode = has_nonconstant_offset
2558 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2559 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2560 case ir_txb:
2561 unreachable("TXB is not valid for vertex shaders.");
2562 case ir_lod:
2563 unreachable("LOD is not valid for vertex shaders.");
2564 default:
2565 unreachable("Unrecognized tex op");
2566 }
2567
2568 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2569
2570 if (ir->offset != NULL && !has_nonconstant_offset) {
2571 inst->offset =
2572 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2573 ir->offset->type->vector_elements);
2574 }
2575
2576 /* Stuff the channel select bits in the top of the texture offset */
2577 if (ir->op == ir_tg4)
2578 inst->offset |= gather_channel(ir, sampler) << 16;
2579
2580 /* The message header is necessary for:
2581 * - Gen4 (always)
2582 * - Texel offsets
2583 * - Gather channel selection
2584 * - Sampler indices too large to fit in a 4-bit value.
2585 */
2586 inst->header_present =
2587 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2588 is_high_sampler(brw, sampler_reg);
2589 inst->base_mrf = 2;
2590 inst->mlen = inst->header_present + 1; /* always at least one */
2591 inst->dst = dst_reg(this, ir->type);
2592 inst->dst.writemask = WRITEMASK_XYZW;
2593 inst->shadow_compare = ir->shadow_comparitor != NULL;
2594
2595 inst->src[1] = sampler_reg;
2596
2597 /* MRF for the first parameter */
2598 int param_base = inst->base_mrf + inst->header_present;
2599
2600 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2601 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2602 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2603 } else {
2604 /* Load the coordinate */
2605 /* FINISHME: gl_clamp_mask and saturate */
2606 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2607 int zero_mask = 0xf & ~coord_mask;
2608
2609 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2610 coordinate));
2611
2612 if (zero_mask != 0) {
2613 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2614 src_reg(0)));
2615 }
2616 /* Load the shadow comparitor */
2617 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2618 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2619 WRITEMASK_X),
2620 shadow_comparitor));
2621 inst->mlen++;
2622 }
2623
2624 /* Load the LOD info */
2625 if (ir->op == ir_tex || ir->op == ir_txl) {
2626 int mrf, writemask;
2627 if (brw->gen >= 5) {
2628 mrf = param_base + 1;
2629 if (ir->shadow_comparitor) {
2630 writemask = WRITEMASK_Y;
2631 /* mlen already incremented */
2632 } else {
2633 writemask = WRITEMASK_X;
2634 inst->mlen++;
2635 }
2636 } else /* brw->gen == 4 */ {
2637 mrf = param_base;
2638 writemask = WRITEMASK_W;
2639 }
2640 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2641 } else if (ir->op == ir_txf) {
2642 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2643 } else if (ir->op == ir_txf_ms) {
2644 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2645 sample_index));
2646 if (brw->gen >= 7) {
2647 /* MCS data is in the first channel of `mcs`, but we need to get it into
2648 * the .y channel of the second vec4 of params, so replicate .x across
2649 * the whole vec4 and then mask off everything except .y
2650 */
2651 mcs.swizzle = BRW_SWIZZLE_XXXX;
2652 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2653 mcs));
2654 }
2655 inst->mlen++;
2656 } else if (ir->op == ir_txd) {
2657 const glsl_type *type = lod_type;
2658
2659 if (brw->gen >= 5) {
2660 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2661 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2663 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2664 inst->mlen++;
2665
2666 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2667 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2668 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2669 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2670 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2671 inst->mlen++;
2672
2673 if (ir->shadow_comparitor) {
2674 emit(MOV(dst_reg(MRF, param_base + 2,
2675 ir->shadow_comparitor->type, WRITEMASK_Z),
2676 shadow_comparitor));
2677 }
2678 }
2679 } else /* brw->gen == 4 */ {
2680 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2681 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2682 inst->mlen += 2;
2683 }
2684 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2685 if (ir->shadow_comparitor) {
2686 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2687 shadow_comparitor));
2688 }
2689
2690 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2691 offset_value));
2692 inst->mlen++;
2693 }
2694 }
2695
2696 emit(inst);
2697
2698 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2699 * faces * layers, but the spec requires just layers.
2700 */
2701 if (ir->op == ir_txs) {
2702 glsl_type const *type = ir->sampler->type;
2703 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2704 type->sampler_array) {
2705 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2706 writemask(inst->dst, WRITEMASK_Z),
2707 src_reg(inst->dst), src_reg(6));
2708 }
2709 }
2710
2711 if (brw->gen == 6 && ir->op == ir_tg4) {
2712 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2713 }
2714
2715 swizzle_result(ir, src_reg(inst->dst), sampler);
2716 }
2717
2718 /**
2719 * Apply workarounds for Gen6 gather with UINT/SINT
2720 */
2721 void
2722 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2723 {
2724 if (!wa)
2725 return;
2726
2727 int width = (wa & WA_8BIT) ? 8 : 16;
2728 dst_reg dst_f = dst;
2729 dst_f.type = BRW_REGISTER_TYPE_F;
2730
2731 /* Convert from UNORM to UINT */
2732 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2733 emit(MOV(dst, src_reg(dst_f)));
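/* With an 8-bit surface the scale above is (1 << 8) - 1 = 255.0f, turning
 * the 0..1 UNORM result back into the 0..255 integer the gather should
 * have returned.
 */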
2734
2735 if (wa & WA_SIGN) {
2736 /* Reinterpret the UINT value as a signed INT value by
2737 * shifting the sign bit into place, then shifting back
2738 * preserving sign.
2739 */
2740 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2741 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2742 }
2743 }
2744
2745 /**
2746 * Set up the gather channel based on the swizzle, for gather4.
2747 */
2748 uint32_t
2749 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2750 {
2751 ir_constant *chan = ir->lod_info.component->as_constant();
2752 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2753 switch (swiz) {
2754 case SWIZZLE_X: return 0;
2755 case SWIZZLE_Y:
2756 /* gather4 sampler is broken for green channel on RG32F --
2757 * we must ask for blue instead.
2758 */
2759 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2760 return 2;
2761 return 1;
2762 case SWIZZLE_Z: return 2;
2763 case SWIZZLE_W: return 3;
2764 default:
2765 unreachable("Not reached"); /* zero, one swizzles handled already */
2766 }
2767 }
2768
2769 void
2770 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2771 {
2772 int s = key->tex.swizzles[sampler];
2773
2774 this->result = src_reg(this, ir->type);
2775 dst_reg swizzled_result(this->result);
2776
2777 if (ir->op == ir_query_levels) {
2778 /* # levels is in .w */
2779 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2780 emit(MOV(swizzled_result, orig_val));
2781 return;
2782 }
2783
2784 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2785 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2786 emit(MOV(swizzled_result, orig_val));
2787 return;
2788 }
2789
2790
2791 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2792 int swizzle[4] = {0};
2793
2794 for (int i = 0; i < 4; i++) {
2795 switch (GET_SWZ(s, i)) {
2796 case SWIZZLE_ZERO:
2797 zero_mask |= (1 << i);
2798 break;
2799 case SWIZZLE_ONE:
2800 one_mask |= (1 << i);
2801 break;
2802 default:
2803 copy_mask |= (1 << i);
2804 swizzle[i] = GET_SWZ(s, i);
2805 break;
2806 }
2807 }
2808
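/* Example (illustrative): a swizzle of (R, R, R, ONE) yields copy_mask XYZ
 * with the source read as XXXX, plus one_mask W, so one data MOV and one
 * immediate MOV cover all four channels.
 */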
2809 if (copy_mask) {
2810 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2811 swizzled_result.writemask = copy_mask;
2812 emit(MOV(swizzled_result, orig_val));
2813 }
2814
2815 if (zero_mask) {
2816 swizzled_result.writemask = zero_mask;
2817 emit(MOV(swizzled_result, src_reg(0.0f)));
2818 }
2819
2820 if (one_mask) {
2821 swizzled_result.writemask = one_mask;
2822 emit(MOV(swizzled_result, src_reg(1.0f)));
2823 }
2824 }
2825
2826 void
2827 vec4_visitor::visit(ir_return *)
2828 {
2829 unreachable("not reached");
2830 }
2831
2832 void
2833 vec4_visitor::visit(ir_discard *)
2834 {
2835 unreachable("not reached");
2836 }
2837
2838 void
2839 vec4_visitor::visit(ir_if *ir)
2840 {
2841 /* Don't point the annotation at the if statement, because then it, plus
2842 * the then and else blocks, all get printed.
2843 */
2844 this->base_ir = ir->condition;
2845
2846 if (brw->gen == 6) {
2847 emit_if_gen6(ir);
2848 } else {
2849 enum brw_predicate predicate;
2850 emit_bool_to_cond_code(ir->condition, &predicate);
2851 emit(IF(predicate));
2852 }
2853
2854 visit_instructions(&ir->then_instructions);
2855
2856 if (!ir->else_instructions.is_empty()) {
2857 this->base_ir = ir->condition;
2858 emit(BRW_OPCODE_ELSE);
2859
2860 visit_instructions(&ir->else_instructions);
2861 }
2862
2863 this->base_ir = ir->condition;
2864 emit(BRW_OPCODE_ENDIF);
2865 }
2866
2867 void
2868 vec4_visitor::visit(ir_emit_vertex *)
2869 {
2870 unreachable("not reached");
2871 }
2872
2873 void
2874 vec4_visitor::visit(ir_end_primitive *)
2875 {
2876 unreachable("not reached");
2877 }
2878
2879 void
2880 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2881 dst_reg dst, src_reg offset,
2882 src_reg src0, src_reg src1)
2883 {
2884 unsigned mlen = 0;
2885
2886 /* Set the atomic operation offset. */
2887 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2888 mlen++;
2889
2890 /* Set the atomic operation arguments. */
2891 if (src0.file != BAD_FILE) {
2892 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2893 mlen++;
2894 }
2895
2896 if (src1.file != BAD_FILE) {
2897 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2898 mlen++;
2899 }
2900
2901 /* Emit the instruction. Note that this maps to the normal SIMD8
2902 * untyped atomic message on Ivy Bridge, but that's OK because
2903 * unused channels will be masked out.
2904 */
2905 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2906 src_reg(atomic_op), src_reg(surf_index));
2907 inst->base_mrf = 0;
2908 inst->mlen = mlen;
2909 }
2910
2911 void
2912 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2913 src_reg offset)
2914 {
2915 /* Set the surface read offset. */
2916 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2917
2918 /* Emit the instruction. Note that this maps to the normal SIMD8
2919 * untyped surface read message, but that's OK because unused
2920 * channels will be masked out.
2921 */
2922 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2923 dst, src_reg(surf_index));
2924 inst->base_mrf = 0;
2925 inst->mlen = 1;
2926 }
2927
2928 void
2929 vec4_visitor::emit_ndc_computation()
2930 {
2931 /* Get the position */
2932 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2933
2934 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2935 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2936 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2937
2938 current_annotation = "NDC";
2939 dst_reg ndc_w = ndc;
2940 ndc_w.writemask = WRITEMASK_W;
2941 src_reg pos_w = pos;
2942 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2943 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2944
2945 dst_reg ndc_xyz = ndc;
2946 ndc_xyz.writemask = WRITEMASK_XYZ;
2947
2948 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2949 }
2950
2951 void
2952 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2953 {
2954 if (brw->gen < 6 &&
2955 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2956 key->userclip_active || brw->has_negative_rhw_bug)) {
2957 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2958 dst_reg header1_w = header1;
2959 header1_w.writemask = WRITEMASK_W;
2960
2961 emit(MOV(header1, 0u));
2962
2963 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2964 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2965
2966 current_annotation = "Point size";
2967 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2968 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2969 }
2970
2971 if (key->userclip_active) {
2972 current_annotation = "Clipping flags";
2973 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2974 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2975
2976 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2977 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2978 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2979
2980 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2981 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2982 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2983 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2984 }
2985
2986 /* i965 clipping workaround:
2987 * 1) Test for -ve rhw
2988 * 2) If set,
2989 * set ndc = (0,0,0,0)
2990 * set ucp[6] = 1
2991 *
2992 * Later, clipping will detect ucp[6] and ensure the primitive is
2993 * clipped against all fixed planes.
2994 */
2995 if (brw->has_negative_rhw_bug) {
2996 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2997 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2998 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2999 vec4_instruction *inst;
3000 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3001 inst->predicate = BRW_PREDICATE_NORMAL;
3002 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3003 inst->predicate = BRW_PREDICATE_NORMAL;
3004 }
3005
3006 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3007 } else if (brw->gen < 6) {
3008 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3009 } else {
3010 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3011 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3012 dst_reg reg_w = reg;
3013 reg_w.writemask = WRITEMASK_W;
3014 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3015 }
3016 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3017 dst_reg reg_y = reg;
3018 reg_y.writemask = WRITEMASK_Y;
3019 reg_y.type = BRW_REGISTER_TYPE_D;
3020 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3021 }
3022 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3023 dst_reg reg_z = reg;
3024 reg_z.writemask = WRITEMASK_Z;
3025 reg_z.type = BRW_REGISTER_TYPE_D;
3026 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3027 }
3028 }
3029 }
3030
3031 void
3032 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3033 {
3034 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3035 *
3036 * "If a linked set of shaders forming the vertex stage contains no
3037 * static write to gl_ClipVertex or gl_ClipDistance, but the
3038 * application has requested clipping against user clip planes through
3039 * the API, then the coordinate written to gl_Position is used for
3040 * comparison against the user clip planes."
3041 *
3042 * This function is only called if the shader didn't write to
3043 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3044 * if the user wrote to it; otherwise we use gl_Position.
3045 */
3046 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3047 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3048 clip_vertex = VARYING_SLOT_POS;
3049 }
3050
3051 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3052 ++i) {
3053 reg.writemask = 1 << i;
3054 emit(DP4(reg,
3055 src_reg(output_reg[clip_vertex]),
3056 src_reg(this->userplane[i + offset])));
3057 }
3058 }
3059
3060 vec4_instruction *
3061 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3062 {
3063 assert (varying < VARYING_SLOT_MAX);
3064 reg.type = output_reg[varying].type;
3065 current_annotation = output_reg_annotation[varying];
3066 /* Copy the register, saturating if necessary */
3067 return emit(MOV(reg, src_reg(output_reg[varying])));
3068 }
3069
3070 void
3071 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3072 {
3073 reg.type = BRW_REGISTER_TYPE_F;
3074
3075 switch (varying) {
3076 case VARYING_SLOT_PSIZ:
3077 {
3078 /* PSIZ is always in slot 0, and is coupled with other flags. */
3079 current_annotation = "indices, point width, clip flags";
3080 emit_psiz_and_flags(reg);
3081 break;
3082 }
3083 case BRW_VARYING_SLOT_NDC:
3084 current_annotation = "NDC";
3085 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3086 break;
3087 case VARYING_SLOT_POS:
3088 current_annotation = "gl_Position";
3089 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3090 break;
3091 case VARYING_SLOT_EDGE:
3092 /* This is present when doing unfilled polygons. We're supposed to copy
3093 * the edge flag from the user-provided vertex array
3094 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3095 * of that attribute (starts as 1.0f). This is then used in clipping to
3096 * determine which edges should be drawn as wireframe.
3097 */
3098 current_annotation = "edge flag";
3099 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3100 glsl_type::float_type, WRITEMASK_XYZW))));
3101 break;
3102 case BRW_VARYING_SLOT_PAD:
3103 /* No need to write to this slot */
3104 break;
3105 case VARYING_SLOT_COL0:
3106 case VARYING_SLOT_COL1:
3107 case VARYING_SLOT_BFC0:
3108 case VARYING_SLOT_BFC1: {
3109 /* These built-in varyings are only supported in compatibility mode,
3110 * and we only support GS in core profile. So, this must be a vertex
3111 * shader.
3112 */
3113 assert(stage == MESA_SHADER_VERTEX);
3114 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3115 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3116 inst->saturate = true;
3117 break;
3118 }
3119
3120 default:
3121 emit_generic_urb_slot(reg, varying);
3122 break;
3123 }
3124 }
3125
3126 static int
3127 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3128 {
3129 if (brw->gen >= 6) {
3130 /* URB data written (does not include the message header reg) must
3131 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3132 * section 5.4.3.2.2: URB_INTERLEAVED.
3133 *
3134 * URB entries are allocated on a multiple of 1024 bits, so an
3135 * extra 128 bits written here to make the end align to 256 is
3136 * no problem.
3137 */
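/* mlen includes the header register, so it must end up odd: e.g. a header
 * plus five data registers (mlen 6) is padded to mlen 7 so the data
 * portion stays a multiple of two registers.
 */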
3138 if ((mlen % 2) != 1)
3139 mlen++;
3140 }
3141
3142 return mlen;
3143 }
3144
3145
3146 /**
3147 * Generates the VUE payload plus the necessary URB write instructions to
3148 * output it.
3149 *
3150 * The VUE layout is documented in Volume 2a.
3151 */
3152 void
3153 vec4_visitor::emit_vertex()
3154 {
3155 /* MRF 0 is reserved for the debugger, so start with message header
3156 * in MRF 1.
3157 */
3158 int base_mrf = 1;
3159 int mrf = base_mrf;
3160 /* In the process of generating our URB write message contents, we
3161 * may need to unspill a register or load from an array. Those
3162 * reads would use MRFs 14-15.
3163 */
3164 int max_usable_mrf = 13;
3165
3166 /* The following assertion verifies that max_usable_mrf causes an
3167 * even number of URB write registers, which will meet gen6's
3168 * requirements for length alignment.
3169 */
3170 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3171
3172 /* First mrf is the g0-based message header containing URB handles and
3173 * such.
3174 */
3175 emit_urb_write_header(mrf++);
3176
3177 if (brw->gen < 6) {
3178 emit_ndc_computation();
3179 }
3180
3181 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3182 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3183 current_annotation = "user clip distances";
3184
3185 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3186 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3187
3188 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3189 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3190 }
3191
3192 /* We may need to split this up into several URB writes, so do them in a
3193 * loop.
3194 */
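/* For example (illustrative), a VUE map with 18 slots goes out as two
 * writes: slots 0-11 fill MRFs 2-13 in the first message, then slots
 * 12-17 are sent in a second message at URB row offset 12 / 2 = 6.
 */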
3195 int slot = 0;
3196 bool complete = false;
3197 do {
3198 /* URB offset is in URB row increments, and each of our MRFs is half of
3199 * one of those, since we're doing interleaved writes.
3200 */
3201 int offset = slot / 2;
3202
3203 mrf = base_mrf + 1;
3204 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3205 emit_urb_slot(dst_reg(MRF, mrf++),
3206 prog_data->vue_map.slot_to_varying[slot]);
3207
3208 /* If this was max_usable_mrf, we can't fit anything more into this
3209 * URB WRITE.
3210 */
3211 if (mrf > max_usable_mrf) {
3212 slot++;
3213 break;
3214 }
3215 }
3216
3217 complete = slot >= prog_data->vue_map.num_slots;
3218 current_annotation = "URB write";
3219 vec4_instruction *inst = emit_urb_write_opcode(complete);
3220 inst->base_mrf = base_mrf;
3221 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3222 inst->offset += offset;
3223 } while(!complete);
3224 }
3225
3226
3227 src_reg
3228 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3229 src_reg *reladdr, int reg_offset)
3230 {
3231 /* Because we store the values to scratch interleaved like our
3232 * vertex data, we need to scale the vec4 index by 2.
3233 */
3234 int message_header_scale = 2;
3235
3236 /* Pre-gen6, the message header uses byte offsets instead of vec4
3237 * (16-byte) offset units.
3238 */
3239 if (brw->gen < 6)
3240 message_header_scale *= 16;
3241
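/* Example (illustrative): reg_offset 3 becomes 6 on gen6+ (the vec4 index
 * doubled for the interleaved layout) and 3 * 2 * 16 = 96 bytes on older
 * parts.
 */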
3242 if (reladdr) {
3243 src_reg index = src_reg(this, glsl_type::int_type);
3244
3245 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3246 src_reg(reg_offset)));
3247 emit_before(block, inst, MUL(dst_reg(index), index,
3248 src_reg(message_header_scale)));
3249
3250 return index;
3251 } else {
3252 return src_reg(reg_offset * message_header_scale);
3253 }
3254 }
3255
3256 src_reg
3257 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3258 src_reg *reladdr, int reg_offset)
3259 {
3260 if (reladdr) {
3261 src_reg index = src_reg(this, glsl_type::int_type);
3262
3263 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3264 src_reg(reg_offset)));
3265
3266 /* Pre-gen6, the message header uses byte offsets instead of vec4
3267 * (16-byte) offset units.
3268 */
3269 if (brw->gen < 6) {
3270 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3271 }
3272
3273 return index;
3274 } else if (brw->gen >= 8) {
3275 /* Store the offset in a GRF so we can send-from-GRF. */
3276 src_reg offset = src_reg(this, glsl_type::int_type);
3277 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3278 return offset;
3279 } else {
3280 int message_header_scale = brw->gen < 6 ? 16 : 1;
3281 return src_reg(reg_offset * message_header_scale);
3282 }
3283 }
3284
3285 /**
3286 * Emits an instruction before @inst to load the value named by @orig_src
3287 * from scratch space at @base_offset to @temp.
3288 *
3289 * @base_offset is measured in 32-byte units (the size of a register).
3290 */
3291 void
3292 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3293 dst_reg temp, src_reg orig_src,
3294 int base_offset)
3295 {
3296 int reg_offset = base_offset + orig_src.reg_offset;
3297 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3298 reg_offset);
3299
3300 emit_before(block, inst, SCRATCH_READ(temp, index));
3301 }
3302
3303 /**
3304 * Emits an instruction after @inst to store the value to be written
3305 * to @orig_dst to scratch space at @base_offset, from @temp.
3306 *
3307 * @base_offset is measured in 32-byte units (the size of a register).
3308 */
3309 void
3310 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3311 int base_offset)
3312 {
3313 int reg_offset = base_offset + inst->dst.reg_offset;
3314 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3315 reg_offset);
3316
3317 /* Create a temporary register to store *inst's result in.
3318 *
3319 * We have to be careful in MOVing from our temporary result register in
3320 * the scratch write. If we swizzle from channels of the temporary that
3321 * weren't initialized, it will confuse live interval analysis, which will
3322 * make spilling fail to make progress.
3323 */
3324 src_reg temp = src_reg(this, glsl_type::vec4_type);
3325 temp.type = inst->dst.type;
3326 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3327 int swizzles[4];
3328 for (int i = 0; i < 4; i++)
3329 if (inst->dst.writemask & (1 << i))
3330 swizzles[i] = i;
3331 else
3332 swizzles[i] = first_writemask_chan;
3333 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3334 swizzles[2], swizzles[3]);
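/* Example (illustrative): a .xz write reads the temporary with swizzle
 * XXZX, so no channel left uninitialized by the rewritten instruction is
 * ever referenced.
 */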
3335
3336 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3337 inst->dst.writemask));
3338 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3339 write->predicate = inst->predicate;
3340 write->ir = inst->ir;
3341 write->annotation = inst->annotation;
3342 inst->insert_after(block, write);
3343
3344 inst->dst.file = temp.file;
3345 inst->dst.reg = temp.reg;
3346 inst->dst.reg_offset = temp.reg_offset;
3347 inst->dst.reladdr = NULL;
3348 }
3349
3350 /**
3351 * We can't generally support array access in GRF space, because a
3352 * single instruction's destination can only span 2 contiguous
3353 * registers. So, we send all GRF arrays that get variable index
3354 * access to scratch space.
3355 */
3356 void
3357 vec4_visitor::move_grf_array_access_to_scratch()
3358 {
3359 int scratch_loc[this->virtual_grf_count];
3360 memset(scratch_loc, -1, sizeof(scratch_loc));
3361
3362 /* First, calculate the set of virtual GRFs that need to be punted
3363 * to scratch due to having any array access on them, and where in
3364 * scratch.
3365 */
3366 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3367 if (inst->dst.file == GRF && inst->dst.reladdr &&
3368 scratch_loc[inst->dst.reg] == -1) {
3369 scratch_loc[inst->dst.reg] = c->last_scratch;
3370 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3371 }
3372
3373 for (int i = 0 ; i < 3; i++) {
3374 src_reg *src = &inst->src[i];
3375
3376 if (src->file == GRF && src->reladdr &&
3377 scratch_loc[src->reg] == -1) {
3378 scratch_loc[src->reg] = c->last_scratch;
3379 c->last_scratch += this->virtual_grf_sizes[src->reg];
3380 }
3381 }
3382 }
3383
3384 /* Now, for anything that will be accessed through scratch, rewrite
3385 * it to load/store. Note that this is a _safe list walk, because
3386 * we may generate a new scratch_write instruction after the one
3387 * we're processing.
3388 */
3389 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3390 /* Set up the annotation tracking for newly generated instructions. */
3391 base_ir = inst->ir;
3392 current_annotation = inst->annotation;
3393
3394 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3395 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3396 }
3397
3398 for (int i = 0 ; i < 3; i++) {
3399 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3400 continue;
3401
3402 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3403
3404 emit_scratch_read(block, inst, temp, inst->src[i],
3405 scratch_loc[inst->src[i].reg]);
3406
3407 inst->src[i].file = temp.file;
3408 inst->src[i].reg = temp.reg;
3409 inst->src[i].reg_offset = temp.reg_offset;
3410 inst->src[i].reladdr = NULL;
3411 }
3412 }
3413 }
3414
3415 /**
3416 * Emits an instruction before @inst to load the value named by @orig_src
3417 * from the pull constant buffer (surface) at @base_offset to @temp.
3418 */
3419 void
3420 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3421 dst_reg temp, src_reg orig_src,
3422 int base_offset)
3423 {
3424 int reg_offset = base_offset + orig_src.reg_offset;
3425 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3426 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3427 reg_offset);
3428 vec4_instruction *load;
3429
3430 if (brw->gen >= 7) {
3431 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3432 grf_offset.type = offset.type;
3433 emit_before(block, inst, MOV(grf_offset, offset));
3434
3435 load = new(mem_ctx) vec4_instruction(this,
3436 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3437 temp, index, src_reg(grf_offset));
3438 } else {
3439 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3440 temp, index, offset);
3441 load->base_mrf = 14;
3442 load->mlen = 1;
3443 }
3444 emit_before(block, inst, load);
3445 }
3446
3447 /**
3448 * Implements array access of uniforms by inserting a
3449 * PULL_CONSTANT_LOAD instruction.
3450 *
3451 * Unlike temporary GRF array access (which we don't support due to
3452 * the difficulty of doing relative addressing on instruction
3453 * destinations), we could potentially do array access of uniforms
3454 * that were loaded in GRF space as push constants. In real-world
3455 * usage we've seen, though, the arrays being used are always larger
3456 * than we could load as push constants, so just always move all
3457 * uniform array access out to a pull constant buffer.
3458 */
3459 void
3460 vec4_visitor::move_uniform_array_access_to_pull_constants()
3461 {
3462 int pull_constant_loc[this->uniforms];
3463 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3464 bool nested_reladdr;
3465
3466 /* Walk through and find array access of uniforms. Put a copy of that
3467 * uniform in the pull constant buffer.
3468 *
3469 * Note that we don't move constant-indexed accesses to arrays. No
3470 * testing has been done of the performance impact of this choice.
3471 */
3472 do {
3473 nested_reladdr = false;
3474
3475 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3476 for (int i = 0 ; i < 3; i++) {
3477 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3478 continue;
3479
3480 int uniform = inst->src[i].reg;
3481
3482 if (inst->src[i].reladdr->reladdr)
3483 nested_reladdr = true; /* will need another pass */
3484
3485 /* If this array isn't already present in the pull constant buffer,
3486 * add it.
3487 */
3488 if (pull_constant_loc[uniform] == -1) {
3489 const gl_constant_value **values =
3490 &stage_prog_data->param[uniform * 4];
3491
3492 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3493
3494 assert(uniform < uniform_array_size);
3495 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3496 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3497 = values[j];
3498 }
3499 }
3500
3501 /* Set up the annotation tracking for newly generated instructions. */
3502 base_ir = inst->ir;
3503 current_annotation = inst->annotation;
3504
3505 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3506
3507 emit_pull_constant_load(block, inst, temp, inst->src[i],
3508 pull_constant_loc[uniform]);
3509
3510 inst->src[i].file = temp.file;
3511 inst->src[i].reg = temp.reg;
3512 inst->src[i].reg_offset = temp.reg_offset;
3513 inst->src[i].reladdr = NULL;
3514 }
3515 }
3516 } while (nested_reladdr);
3517
3518 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3519 * no need to track them as larger-than-vec4 objects. This will be
3520 * relied on in cutting out unused uniform vectors from push
3521 * constants.
3522 */
3523 split_uniform_registers();
3524 }
3525
3526 void
3527 vec4_visitor::resolve_ud_negate(src_reg *reg)
3528 {
3529 if (reg->type != BRW_REGISTER_TYPE_UD ||
3530 !reg->negate)
3531 return;
3532
3533 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3534 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3535 *reg = temp;
3536 }
3537
3538 vec4_visitor::vec4_visitor(struct brw_context *brw,
3539 struct brw_vec4_compile *c,
3540 struct gl_program *prog,
3541 const struct brw_vec4_prog_key *key,
3542 struct brw_vec4_prog_data *prog_data,
3543 struct gl_shader_program *shader_prog,
3544 gl_shader_stage stage,
3545 void *mem_ctx,
3546 bool debug_flag,
3547 bool no_spills,
3548 shader_time_shader_type st_base,
3549 shader_time_shader_type st_written,
3550 shader_time_shader_type st_reset)
3551 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3552 c(c),
3553 key(key),
3554 prog_data(prog_data),
3555 sanity_param_count(0),
3556 fail_msg(NULL),
3557 first_non_payload_grf(0),
3558 need_all_constants_in_pull_buffer(false),
3559 debug_flag(debug_flag),
3560 no_spills(no_spills),
3561 st_base(st_base),
3562 st_written(st_written),
3563 st_reset(st_reset)
3564 {
3565 this->mem_ctx = mem_ctx;
3566 this->failed = false;
3567
3568 this->base_ir = NULL;
3569 this->current_annotation = NULL;
3570 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3571
3572 this->variable_ht = hash_table_ctor(0,
3573 hash_table_pointer_hash,
3574 hash_table_pointer_compare);
3575
3576 this->virtual_grf_start = NULL;
3577 this->virtual_grf_end = NULL;
3578 this->virtual_grf_sizes = NULL;
3579 this->virtual_grf_count = 0;
3580 this->virtual_grf_reg_map = NULL;
3581 this->virtual_grf_reg_count = 0;
3582 this->virtual_grf_array_size = 0;
3583 this->live_intervals = NULL;
3584
3585 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3586
3587 this->uniforms = 0;
3588
3589 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3590 * at least one. See setup_uniforms() in brw_vec4.cpp.
3591 */
3592 this->uniform_array_size = 1;
3593 if (prog_data) {
3594 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3595 }
3596
3597 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3598 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3599 }
3600
3601 vec4_visitor::~vec4_visitor()
3602 {
3603 hash_table_dtor(this->variable_ht);
3604 }
3605
3606
3607 void
3608 vec4_visitor::fail(const char *format, ...)
3609 {
3610 va_list va;
3611 char *msg;
3612
3613 if (failed)
3614 return;
3615
3616 failed = true;
3617
3618 va_start(va, format);
3619 msg = ralloc_vasprintf(mem_ctx, format, va);
3620 va_end(va);
3621 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3622
3623 this->fail_msg = msg;
3624
3625 if (debug_flag) {
3626 fprintf(stderr, "%s", msg);
3627 }
3628 }
3629
3630 } /* namespace brw */