1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
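/**
 * Insert new_inst into the instruction stream immediately before inst,
 * inheriting inst's IR pointer and annotation for debug output.
 */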
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
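/* Like SCRATCH_READ, but the fixed MRF payload is one register longer
 * because it also carries the data being written.
 */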
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
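/* elements must be 2, 3 or 4, selecting DP2, DP3 or DP4 respectively. */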
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
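/* Pre-gen6 math is a send to the shared math unit, so the operands are
 * passed through MRFs: one message register per source present.
 */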
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
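/* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV turns them into the
 * integer shift counts <0, 8, 16, 24>.
 */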
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
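/**
 * Returns the size of the given GLSL type in whole vec4 registers, the
 * allocation granularity of the vec4 backend.
 */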
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
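/* Append the user clip planes as vec4 push constants, recording in
 * userplane[] which uniform slot each plane occupies.
 */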
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759 this->uniform_vector_size[this->uniforms] = 0;
760 /* Add each of the unique swizzled channels of the element.
761 * This will end up matching the size of the glsl_type of this field.
762 */
763 int last_swiz = -1;
764 for (unsigned int j = 0; j < 4; j++) {
765 int swiz = GET_SWZ(slots[i].swizzle, j);
766 last_swiz = swiz;
767
768 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
769 assert(this->uniforms < uniform_array_size);
770 if (swiz <= last_swiz)
771 this->uniform_vector_size[this->uniforms]++;
772 }
773 this->uniforms++;
774 }
775 }
776
777 dst_reg *
778 vec4_visitor::variable_storage(ir_variable *var)
779 {
780 return (dst_reg *)hash_table_find(this->variable_ht, var);
781 }
782
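/**
 * Evaluate a boolean rvalue and leave its result in the flag register,
 * storing in *predicate the predication mode the consumer should use
 * (BRW_PREDICATE_NORMAL, or the ALL4H/ANY4H variants for the vector
 * comparison operations).
 */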
783 void
784 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
785 enum brw_predicate *predicate)
786 {
787 ir_expression *expr = ir->as_expression();
788
789 *predicate = BRW_PREDICATE_NORMAL;
790
791 if (expr && expr->operation != ir_binop_ubo_load) {
792 src_reg op[3];
793 vec4_instruction *inst;
794
795 assert(expr->get_num_operands() <= 3);
796 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
797 expr->operands[i]->accept(this);
798 op[i] = this->result;
799
800 resolve_ud_negate(&op[i]);
801 }
802
803 switch (expr->operation) {
804 case ir_unop_logic_not:
805 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
806 inst->conditional_mod = BRW_CONDITIONAL_Z;
807 break;
808
809 case ir_binop_logic_xor:
810 if (brw->gen <= 5) {
811 src_reg temp = src_reg(this, ir->type);
812 emit(XOR(dst_reg(temp), op[0], op[1]));
813 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
814 } else {
815 inst = emit(XOR(dst_null_d(), op[0], op[1]));
816 }
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 break;
819
820 case ir_binop_logic_or:
821 if (brw->gen <= 5) {
822 src_reg temp = src_reg(this, ir->type);
823 emit(OR(dst_reg(temp), op[0], op[1]));
824 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
825 } else {
826 inst = emit(OR(dst_null_d(), op[0], op[1]));
827 }
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 if (brw->gen <= 5) {
833 src_reg temp = src_reg(this, ir->type);
834 emit(AND(dst_reg(temp), op[0], op[1]));
835 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
836 } else {
837 inst = emit(AND(dst_null_d(), op[0], op[1]));
838 }
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 break;
841
842 case ir_unop_f2b:
843 if (brw->gen >= 6) {
844 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
845 } else {
846 inst = emit(MOV(dst_null_f(), op[0]));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 }
849 break;
850
851 case ir_unop_i2b:
852 if (brw->gen >= 6) {
853 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
854 } else {
855 inst = emit(MOV(dst_null_d(), op[0]));
856 inst->conditional_mod = BRW_CONDITIONAL_NZ;
857 }
858 break;
859
860 case ir_binop_all_equal:
861 if (brw->gen <= 5) {
862 resolve_bool_comparison(expr->operands[0], &op[0]);
863 resolve_bool_comparison(expr->operands[1], &op[1]);
864 }
865 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
866 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
867 break;
868
869 case ir_binop_any_nequal:
870 if (brw->gen <= 5) {
871 resolve_bool_comparison(expr->operands[0], &op[0]);
872 resolve_bool_comparison(expr->operands[1], &op[1]);
873 }
874 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
876 break;
877
878 case ir_unop_any:
879 if (brw->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 }
882 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
883 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
884 break;
885
886 case ir_binop_greater:
887 case ir_binop_gequal:
888 case ir_binop_less:
889 case ir_binop_lequal:
890 case ir_binop_equal:
891 case ir_binop_nequal:
892 if (brw->gen <= 5) {
893 resolve_bool_comparison(expr->operands[0], &op[0]);
894 resolve_bool_comparison(expr->operands[1], &op[1]);
895 }
896 emit(CMP(dst_null_d(), op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 break;
899
900 case ir_triop_csel: {
901 /* Expand the boolean condition into the flag register. */
902 inst = emit(MOV(dst_null_d(), op[0]));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904
905 /* Select which boolean to return. */
906 dst_reg temp(this, expr->operands[1]->type);
907 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
908 inst->predicate = BRW_PREDICATE_NORMAL;
909
910 /* Expand the result to a condition code. */
911 inst = emit(MOV(dst_null_d(), src_reg(temp)));
912 inst->conditional_mod = BRW_CONDITIONAL_NZ;
913 break;
914 }
915
916 default:
917 unreachable("not reached");
918 }
919 return;
920 }
921
922 ir->accept(this);
923
924 resolve_ud_negate(&this->result);
925
926 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
927 inst->conditional_mod = BRW_CONDITIONAL_NZ;
928 }
929
930 /**
931 * Emit a gen6 IF statement with the comparison folded into the IF
932 * instruction.
933 */
934 void
935 vec4_visitor::emit_if_gen6(ir_if *ir)
936 {
937 ir_expression *expr = ir->condition->as_expression();
938
939 if (expr && expr->operation != ir_binop_ubo_load) {
940 src_reg op[3];
941 dst_reg temp;
942
943 assert(expr->get_num_operands() <= 3);
944 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
945 expr->operands[i]->accept(this);
946 op[i] = this->result;
947 }
948
949 switch (expr->operation) {
950 case ir_unop_logic_not:
951 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
952 return;
953
954 case ir_binop_logic_xor:
955 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
956 return;
957
958 case ir_binop_logic_or:
959 temp = dst_reg(this, glsl_type::bool_type);
960 emit(OR(temp, op[0], op[1]));
961 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
962 return;
963
964 case ir_binop_logic_and:
965 temp = dst_reg(this, glsl_type::bool_type);
966 emit(AND(temp, op[0], op[1]));
967 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_f2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_unop_i2b:
975 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
976 return;
977
978 case ir_binop_greater:
979 case ir_binop_gequal:
980 case ir_binop_less:
981 case ir_binop_lequal:
982 case ir_binop_equal:
983 case ir_binop_nequal:
984 emit(IF(op[0], op[1],
985 brw_conditional_for_comparison(expr->operation)));
986 return;
987
988 case ir_binop_all_equal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
990 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
991 return;
992
993 case ir_binop_any_nequal:
994 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_unop_any:
999 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1001 return;
1002
1003 case ir_triop_csel: {
1004 /* Expand the boolean condition into the flag register. */
1005 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1006 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1007
1008 /* Select which boolean to return. */
1009 dst_reg temp(this, expr->operands[1]->type);
1010 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1011 inst->predicate = BRW_PREDICATE_NORMAL;
1012
1013 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1014 return;
1015 }
1016
1017 default:
1018 unreachable("not reached");
1019 }
1020 return;
1021 }
1022
1023 ir->condition->accept(this);
1024
1025 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_variable *ir)
1030 {
1031 dst_reg *reg = NULL;
1032
1033 if (variable_storage(ir))
1034 return;
1035
1036 switch (ir->data.mode) {
1037 case ir_var_shader_in:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1040 break;
1041
1042 case ir_var_shader_out:
1043 assert(ir->data.location != -1);
1044 reg = new(mem_ctx) dst_reg(this, ir->type);
1045
1046 for (int i = 0; i < type_size(ir->type); i++) {
1047 output_reg[ir->data.location + i] = *reg;
1048 output_reg[ir->data.location + i].reg_offset = i;
1049 output_reg[ir->data.location + i].type =
1050 brw_type_for_base_type(ir->type->get_scalar_type());
1051 output_reg_annotation[ir->data.location + i] = ir->name;
1052 }
1053 break;
1054
1055 case ir_var_auto:
1056 case ir_var_temporary:
1057 reg = new(mem_ctx) dst_reg(this, ir->type);
1058 break;
1059
1060 case ir_var_uniform:
1061 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1062
1063 /* Thanks to the lower_ubo_reference pass, we will see only
1064 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1065 * variables, so no need for them to be in variable_ht.
1066 *
1067 * Some uniforms, such as samplers and atomic counters, have no actual
1068 * storage, so we should ignore them.
1069 */
1070 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1071 return;
1072
1073 /* Track how big the whole uniform variable is, in case we need to put a
1074 * copy of its data into pull constants for array access.
1075 */
1076 assert(this->uniforms < uniform_array_size);
1077 this->uniform_size[this->uniforms] = type_size(ir->type);
1078
1079 if (!strncmp(ir->name, "gl_", 3)) {
1080 setup_builtin_uniform_values(ir);
1081 } else {
1082 setup_uniform_values(ir);
1083 }
1084 break;
1085
1086 case ir_var_system_value:
1087 reg = make_reg_for_system_value(ir);
1088 break;
1089
1090 default:
1091 unreachable("not reached");
1092 }
1093
1094 reg->type = brw_type_for_base_type(ir->type);
1095 hash_table_insert(this->variable_ht, reg, ir);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop *ir)
1100 {
1101 /* We don't want debugging output to print the whole body of the
1102 * loop as the annotation.
1103 */
1104 this->base_ir = NULL;
1105
1106 emit(BRW_OPCODE_DO);
1107
1108 visit_instructions(&ir->body_instructions);
1109
1110 emit(BRW_OPCODE_WHILE);
1111 }
1112
1113 void
1114 vec4_visitor::visit(ir_loop_jump *ir)
1115 {
1116 switch (ir->mode) {
1117 case ir_loop_jump::jump_break:
1118 emit(BRW_OPCODE_BREAK);
1119 break;
1120 case ir_loop_jump::jump_continue:
1121 emit(BRW_OPCODE_CONTINUE);
1122 break;
1123 }
1124 }
1125
1126
1127 void
1128 vec4_visitor::visit(ir_function_signature *)
1129 {
1130 unreachable("not reached");
1131 }
1132
1133 void
1134 vec4_visitor::visit(ir_function *ir)
1135 {
1136 /* Ignore function bodies other than main() -- we shouldn't see calls to
1137 * them since they should all be inlined.
1138 */
1139 if (strcmp(ir->name, "main") == 0) {
1140 const ir_function_signature *sig;
1141 exec_list empty;
1142
1143 sig = ir->matching_signature(NULL, &empty, false);
1144
1145 assert(sig);
1146
1147 visit_instructions(&sig->body);
1148 }
1149 }
1150
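/**
 * Try to match an add of a multiply (in either operand order, with an
 * optional negate or abs on the multiply) and emit it as a single MAD.
 * Returns false if the pattern doesn't match or MAD can't be used.
 */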
1151 bool
1152 vec4_visitor::try_emit_mad(ir_expression *ir)
1153 {
1154 /* 3-src instructions were introduced in gen6. */
1155 if (brw->gen < 6)
1156 return false;
1157
1158 /* MAD can only handle floating-point data. */
1159 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1160 return false;
1161
1162 ir_rvalue *nonmul;
1163 ir_expression *mul;
1164 bool mul_negate, mul_abs;
1165
1166 for (int i = 0; i < 2; i++) {
1167 mul_negate = false;
1168 mul_abs = false;
1169
1170 mul = ir->operands[i]->as_expression();
1171 nonmul = ir->operands[1 - i];
1172
1173 if (mul && mul->operation == ir_unop_abs) {
1174 mul = mul->operands[0]->as_expression();
1175 mul_abs = true;
1176 } else if (mul && mul->operation == ir_unop_neg) {
1177 mul = mul->operands[0]->as_expression();
1178 mul_negate = true;
1179 }
1180
1181 if (mul && mul->operation == ir_binop_mul)
1182 break;
1183 }
1184
1185 if (!mul || mul->operation != ir_binop_mul)
1186 return false;
1187
1188 nonmul->accept(this);
1189 src_reg src0 = fix_3src_operand(this->result);
1190
1191 mul->operands[0]->accept(this);
1192 src_reg src1 = fix_3src_operand(this->result);
1193 src1.negate ^= mul_negate;
1194 src1.abs = mul_abs;
1195 if (mul_abs)
1196 src1.negate = false;
1197
1198 mul->operands[1]->accept(this);
1199 src_reg src2 = fix_3src_operand(this->result);
1200 src2.abs = mul_abs;
1201 if (mul_abs)
1202 src2.negate = false;
1203
1204 this->result = src_reg(this, ir->type);
1205 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1206
1207 return true;
1208 }
1209
1210 bool
1211 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1212 {
1213 /* This optimization relies on CMP setting the destination to 0 when
1214 * false. Early hardware only sets the least significant bit, and
1215 * leaves the other bits undefined. So we can't use it.
1216 */
1217 if (brw->gen < 6)
1218 return false;
1219
1220 ir_expression *const cmp = ir->operands[0]->as_expression();
1221
1222 if (cmp == NULL)
1223 return false;
1224
1225 switch (cmp->operation) {
1226 case ir_binop_less:
1227 case ir_binop_greater:
1228 case ir_binop_lequal:
1229 case ir_binop_gequal:
1230 case ir_binop_equal:
1231 case ir_binop_nequal:
1232 break;
1233
1234 default:
1235 return false;
1236 }
1237
1238 cmp->operands[0]->accept(this);
1239 const src_reg cmp_src0 = this->result;
1240
1241 cmp->operands[1]->accept(this);
1242 const src_reg cmp_src1 = this->result;
1243
1244 this->result = src_reg(this, ir->type);
1245
1246 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1247 brw_conditional_for_comparison(cmp->operation)));
1248
1249 /* If the comparison is false, this->result will just happen to be zero.
1250 */
1251 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1252 this->result, src_reg(1.0f));
1253 inst->predicate = BRW_PREDICATE_NORMAL;
1254 inst->predicate_inverse = true;
1255
1256 return true;
1257 }
1258
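/**
 * Emit a min (BRW_CONDITIONAL_L) or max (BRW_CONDITIONAL_GE) of src0 and
 * src1 into dst.
 */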
1259 void
1260 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1261 src_reg src0, src_reg src1)
1262 {
1263 vec4_instruction *inst;
1264
1265 if (brw->gen >= 6) {
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->conditional_mod = conditionalmod;
1268 } else {
1269 emit(CMP(dst, src0, src1, conditionalmod));
1270
1271 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1272 inst->predicate = BRW_PREDICATE_NORMAL;
1273 }
1274 }
1275
1276 void
1277 vec4_visitor::emit_lrp(const dst_reg &dst,
1278 const src_reg &x, const src_reg &y, const src_reg &a)
1279 {
1280 if (brw->gen >= 6) {
1281 /* Note that the instruction's argument order is reversed from GLSL
1282 * and the IR.
1283 */
1284 emit(LRP(dst,
1285 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1286 } else {
1287 /* Earlier generations don't support three source operations, so we
1288 * need to emit x*(1-a) + y*a.
1289 */
1290 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1291 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1292 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1293 y_times_a.writemask = dst.writemask;
1294 one_minus_a.writemask = dst.writemask;
1295 x_times_one_minus_a.writemask = dst.writemask;
1296
1297 emit(MUL(y_times_a, y, a));
1298 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1299 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1300 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1301 }
1302 }
1303
1304 void
1305 vec4_visitor::visit(ir_expression *ir)
1306 {
1307 unsigned int operand;
1308 src_reg op[ARRAY_SIZE(ir->operands)];
1309 vec4_instruction *inst;
1310
1311 if (ir->operation == ir_binop_add) {
1312 if (try_emit_mad(ir))
1313 return;
1314 }
1315
1316 if (ir->operation == ir_unop_b2f) {
1317 if (try_emit_b2f_of_compare(ir))
1318 return;
1319 }
1320
1321 /* Storage for our result. Ideally for an assignment we'd be using
1322 * the actual storage for the result here, instead.
1323 */
1324 dst_reg result_dst(this, ir->type);
1325 src_reg result_src(result_dst);
1326
1327 if (ir->operation == ir_triop_csel) {
1328 ir->operands[1]->accept(this);
1329 op[1] = this->result;
1330 ir->operands[2]->accept(this);
1331 op[2] = this->result;
1332
1333 enum brw_predicate predicate;
1334 emit_bool_to_cond_code(ir->operands[0], &predicate);
1335 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1336 inst->predicate = predicate;
1337 this->result = result_src;
1338 return;
1339 }
1340
1341 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1342 this->result.file = BAD_FILE;
1343 ir->operands[operand]->accept(this);
1344 if (this->result.file == BAD_FILE) {
1345 fprintf(stderr, "Failed to get tree for expression operand:\n");
1346 ir->operands[operand]->fprint(stderr);
1347 exit(1);
1348 }
1349 op[operand] = this->result;
1350
1351 /* Matrix expression operands should have been broken down to vector
1352 * operations already.
1353 */
1354 assert(!ir->operands[operand]->type->is_matrix());
1355 }
1356
1357 /* If nothing special happens, this is the result. */
1358 this->result = result_src;
1359
1360 switch (ir->operation) {
1361 case ir_unop_logic_not:
1362 emit(NOT(result_dst, op[0]));
1363 break;
1364 case ir_unop_neg:
1365 op[0].negate = !op[0].negate;
1366 emit(MOV(result_dst, op[0]));
1367 break;
1368 case ir_unop_abs:
1369 op[0].abs = true;
1370 op[0].negate = false;
1371 emit(MOV(result_dst, op[0]));
1372 break;
1373
1374 case ir_unop_sign:
1375 if (ir->type->is_float()) {
1376 /* AND(val, 0x80000000) gives the sign bit.
1377 *
1378 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1379 * zero.
1380 */
1381 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1382
1383 op[0].type = BRW_REGISTER_TYPE_UD;
1384 result_dst.type = BRW_REGISTER_TYPE_UD;
1385 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1386
1387 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1388 inst->predicate = BRW_PREDICATE_NORMAL;
1389
1390 this->result.type = BRW_REGISTER_TYPE_F;
1391 } else {
1392 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1393 * -> non-negative val generates 0x00000000.
1394 * Predicated OR sets 1 if val is positive.
1395 */
1396 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1397
1398 emit(ASR(result_dst, op[0], src_reg(31)));
1399
1400 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1401 inst->predicate = BRW_PREDICATE_NORMAL;
1402 }
1403 break;
1404
1405 case ir_unop_rcp:
1406 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1407 break;
1408
1409 case ir_unop_exp2:
1410 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1411 break;
1412 case ir_unop_log2:
1413 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1414 break;
1415 case ir_unop_exp:
1416 case ir_unop_log:
1417 unreachable("not reached: should be handled by ir_explog_to_explog2");
1418 case ir_unop_sin:
1419 case ir_unop_sin_reduced:
1420 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1421 break;
1422 case ir_unop_cos:
1423 case ir_unop_cos_reduced:
1424 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1425 break;
1426
1427 case ir_unop_dFdx:
1428 case ir_unop_dFdx_coarse:
1429 case ir_unop_dFdx_fine:
1430 case ir_unop_dFdy:
1431 case ir_unop_dFdy_coarse:
1432 case ir_unop_dFdy_fine:
1433 unreachable("derivatives not valid in vertex shader");
1434
1435 case ir_unop_bitfield_reverse:
1436 emit(BFREV(result_dst, op[0]));
1437 break;
1438 case ir_unop_bit_count:
1439 emit(CBIT(result_dst, op[0]));
1440 break;
1441 case ir_unop_find_msb: {
1442 src_reg temp = src_reg(this, glsl_type::uint_type);
1443
1444 inst = emit(FBH(dst_reg(temp), op[0]));
1445 inst->dst.writemask = WRITEMASK_XYZW;
1446
1447 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1448 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1449 * subtract the result from 31 to convert the MSB count into an LSB count.
1450 */
1451
1452 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1453 temp.swizzle = BRW_SWIZZLE_NOOP;
1454 emit(MOV(result_dst, temp));
1455
1456 src_reg src_tmp = src_reg(result_dst);
1457 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1458
1459 src_tmp.negate = true;
1460 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1461 inst->predicate = BRW_PREDICATE_NORMAL;
1462 break;
1463 }
1464 case ir_unop_find_lsb:
1465 emit(FBL(result_dst, op[0]));
1466 break;
1467 case ir_unop_saturate:
1468 inst = emit(MOV(result_dst, op[0]));
1469 inst->saturate = true;
1470 break;
1471
1472 case ir_unop_noise:
1473 unreachable("not reached: should be handled by lower_noise");
1474
1475 case ir_binop_add:
1476 emit(ADD(result_dst, op[0], op[1]));
1477 break;
1478 case ir_binop_sub:
1479 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1480
1481 case ir_binop_mul:
1482 if (brw->gen < 8 && ir->type->is_integer()) {
1483 /* For integer multiplication, the MUL uses the low 16 bits of one of
1484 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1485 * accumulates in the contribution of the upper 16 bits of that
1486 * operand. If we can determine that one of the args is in the low
1487 * 16 bits, though, we can just emit a single MUL.
1488 */
1489 if (ir->operands[0]->is_uint16_constant()) {
1490 if (brw->gen < 7)
1491 emit(MUL(result_dst, op[0], op[1]));
1492 else
1493 emit(MUL(result_dst, op[1], op[0]));
1494 } else if (ir->operands[1]->is_uint16_constant()) {
1495 if (brw->gen < 7)
1496 emit(MUL(result_dst, op[1], op[0]));
1497 else
1498 emit(MUL(result_dst, op[0], op[1]));
1499 } else {
1500 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1501
1502 emit(MUL(acc, op[0], op[1]));
1503 emit(MACH(dst_null_d(), op[0], op[1]));
1504 emit(MOV(result_dst, src_reg(acc)));
1505 }
1506 } else {
1507 emit(MUL(result_dst, op[0], op[1]));
1508 }
1509 break;
1510 case ir_binop_imul_high: {
1511 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1512
1513 emit(MUL(acc, op[0], op[1]));
1514 emit(MACH(result_dst, op[0], op[1]));
1515 break;
1516 }
1517 case ir_binop_div:
1518 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1519 assert(ir->type->is_integer());
1520 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1521 break;
1522 case ir_binop_carry: {
1523 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1524
1525 emit(ADDC(dst_null_ud(), op[0], op[1]));
1526 emit(MOV(result_dst, src_reg(acc)));
1527 break;
1528 }
1529 case ir_binop_borrow: {
1530 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1531
1532 emit(SUBB(dst_null_ud(), op[0], op[1]));
1533 emit(MOV(result_dst, src_reg(acc)));
1534 break;
1535 }
1536 case ir_binop_mod:
1537 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1538 assert(ir->type->is_integer());
1539 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1540 break;
1541
1542 case ir_binop_less:
1543 case ir_binop_greater:
1544 case ir_binop_lequal:
1545 case ir_binop_gequal:
1546 case ir_binop_equal:
1547 case ir_binop_nequal: {
1548 if (brw->gen <= 5) {
1549 resolve_bool_comparison(ir->operands[0], &op[0]);
1550 resolve_bool_comparison(ir->operands[1], &op[1]);
1551 }
1552 emit(CMP(result_dst, op[0], op[1],
1553 brw_conditional_for_comparison(ir->operation)));
1554 break;
1555 }
1556
1557 case ir_binop_all_equal:
1558 if (brw->gen <= 5) {
1559 resolve_bool_comparison(ir->operands[0], &op[0]);
1560 resolve_bool_comparison(ir->operands[1], &op[1]);
1561 }
1562
1563 /* "==" operator producing a scalar boolean. */
1564 if (ir->operands[0]->type->is_vector() ||
1565 ir->operands[1]->type->is_vector()) {
1566 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1567 emit(MOV(result_dst, src_reg(0)));
1568 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1569 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1570 } else {
1571 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1572 }
1573 break;
1574 case ir_binop_any_nequal:
1575 if (brw->gen <= 5) {
1576 resolve_bool_comparison(ir->operands[0], &op[0]);
1577 resolve_bool_comparison(ir->operands[1], &op[1]);
1578 }
1579
1580 /* "!=" operator producing a scalar boolean. */
1581 if (ir->operands[0]->type->is_vector() ||
1582 ir->operands[1]->type->is_vector()) {
1583 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1584
1585 emit(MOV(result_dst, src_reg(0)));
1586 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1587 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1588 } else {
1589 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1590 }
1591 break;
1592
1593 case ir_unop_any:
1594 if (brw->gen <= 5) {
1595 resolve_bool_comparison(ir->operands[0], &op[0]);
1596 }
1597 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1598 emit(MOV(result_dst, src_reg(0)));
1599
1600 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1601 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1602 break;
1603
1604 case ir_binop_logic_xor:
1605 emit(XOR(result_dst, op[0], op[1]));
1606 break;
1607
1608 case ir_binop_logic_or:
1609 emit(OR(result_dst, op[0], op[1]));
1610 break;
1611
1612 case ir_binop_logic_and:
1613 emit(AND(result_dst, op[0], op[1]));
1614 break;
1615
1616 case ir_binop_dot:
1617 assert(ir->operands[0]->type->is_vector());
1618 assert(ir->operands[0]->type == ir->operands[1]->type);
1619 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1620 break;
1621
1622 case ir_unop_sqrt:
1623 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1624 break;
1625 case ir_unop_rsq:
1626 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1627 break;
1628
1629 case ir_unop_bitcast_i2f:
1630 case ir_unop_bitcast_u2f:
1631 this->result = op[0];
1632 this->result.type = BRW_REGISTER_TYPE_F;
1633 break;
1634
1635 case ir_unop_bitcast_f2i:
1636 this->result = op[0];
1637 this->result.type = BRW_REGISTER_TYPE_D;
1638 break;
1639
1640 case ir_unop_bitcast_f2u:
1641 this->result = op[0];
1642 this->result.type = BRW_REGISTER_TYPE_UD;
1643 break;
1644
1645 case ir_unop_i2f:
1646 case ir_unop_i2u:
1647 case ir_unop_u2i:
1648 case ir_unop_u2f:
1649 case ir_unop_f2i:
1650 case ir_unop_f2u:
1651 emit(MOV(result_dst, op[0]));
1652 break;
1653 case ir_unop_b2i:
1654 emit(AND(result_dst, op[0], src_reg(1)));
1655 break;
1656 case ir_unop_b2f:
1657 if (brw->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 }
1660 op[0].type = BRW_REGISTER_TYPE_D;
1661 result_dst.type = BRW_REGISTER_TYPE_D;
1662 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1663 result_dst.type = BRW_REGISTER_TYPE_F;
1664 break;
1665 case ir_unop_f2b:
1666 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1667 break;
1668 case ir_unop_i2b:
1669 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1670 break;
1671
1672 case ir_unop_trunc:
1673 emit(RNDZ(result_dst, op[0]));
1674 break;
1675 case ir_unop_ceil: {
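/* Implement ceil(x) as -RNDD(-x), i.e. -floor(-x). */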
1676 src_reg tmp = src_reg(this, ir->type);
1677 op[0].negate = !op[0].negate;
1678 emit(RNDD(dst_reg(tmp), op[0]));
1679 tmp.negate = true;
1680 emit(MOV(result_dst, tmp));
1681 }
1682 break;
1683 case ir_unop_floor:
1684 inst = emit(RNDD(result_dst, op[0]));
1685 break;
1686 case ir_unop_fract:
1687 inst = emit(FRC(result_dst, op[0]));
1688 break;
1689 case ir_unop_round_even:
1690 emit(RNDE(result_dst, op[0]));
1691 break;
1692
1693 case ir_binop_min:
1694 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1695 break;
1696 case ir_binop_max:
1697 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1698 break;
1699
1700 case ir_binop_pow:
1701 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1702 break;
1703
1704 case ir_unop_bit_not:
1705 inst = emit(NOT(result_dst, op[0]));
1706 break;
1707 case ir_binop_bit_and:
1708 inst = emit(AND(result_dst, op[0], op[1]));
1709 break;
1710 case ir_binop_bit_xor:
1711 inst = emit(XOR(result_dst, op[0], op[1]));
1712 break;
1713 case ir_binop_bit_or:
1714 inst = emit(OR(result_dst, op[0], op[1]));
1715 break;
1716
1717 case ir_binop_lshift:
1718 inst = emit(SHL(result_dst, op[0], op[1]));
1719 break;
1720
1721 case ir_binop_rshift:
1722 if (ir->type->base_type == GLSL_TYPE_INT)
1723 inst = emit(ASR(result_dst, op[0], op[1]));
1724 else
1725 inst = emit(SHR(result_dst, op[0], op[1]));
1726 break;
1727
1728 case ir_binop_bfm:
1729 emit(BFI1(result_dst, op[0], op[1]));
1730 break;
1731
1732 case ir_binop_ubo_load: {
1733 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1734 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1735 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1736 src_reg offset;
1737
1738 /* Now, load the vector from that offset. */
1739 assert(ir->type->is_vector() || ir->type->is_scalar());
1740
1741 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1742 packed_consts.type = result.type;
1743 src_reg surf_index;
1744
1745 if (const_uniform_block) {
1746 /* The block index is a constant, so just emit the binding table entry
1747 * as an immediate.
1748 */
1749 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1750 const_uniform_block->value.u[0]);
1751 } else {
1752 /* The block index is not a constant. Evaluate the index expression
1753 * per-channel and add the base UBO index; the generator will select
1754 * a value from any live channel.
1755 */
1756 surf_index = src_reg(this, glsl_type::uint_type);
1757 emit(ADD(dst_reg(surf_index), op[0],
1758 src_reg(prog_data->base.binding_table.ubo_start)));
1759
1760 /* Assume this may touch any UBO. It would be nice to provide
1761 * a tighter bound, but the array information is already lowered away.
1762 */
1763 brw_mark_surface_used(&prog_data->base,
1764 prog_data->base.binding_table.ubo_start +
1765 shader_prog->NumUniformBlocks - 1);
1766 }
1767
1768 if (const_offset_ir) {
1769 if (brw->gen >= 8) {
1770 /* Store the offset in a GRF so we can send-from-GRF. */
1771 offset = src_reg(this, glsl_type::int_type);
1772 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1773 } else {
1774 /* Immediates are fine on older generations since they'll be moved
1775 * to a (potentially fake) MRF at the generator level.
1776 */
1777 offset = src_reg(const_offset / 16);
1778 }
1779 } else {
1780 offset = src_reg(this, glsl_type::uint_type);
1781 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1782 }
1783
1784 if (brw->gen >= 7) {
1785 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1786
1787 /* We have to use a message header on Skylake to get SIMD4x2 mode.
1788 * Reserve space for the register.
1789 */
1790 if (brw->gen >= 9) {
1791 grf_offset.reg_offset++;
1792 alloc.sizes[grf_offset.reg] = 2;
1793 }
1794
1795 grf_offset.type = offset.type;
1796
1797 emit(MOV(grf_offset, offset));
1798
1799 vec4_instruction *pull =
1800 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1801 dst_reg(packed_consts),
1802 surf_index,
1803 src_reg(grf_offset)));
1804 pull->mlen = 1;
1805 } else {
1806 vec4_instruction *pull =
1807 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1808 dst_reg(packed_consts),
1809 surf_index,
1810 offset));
1811 pull->base_mrf = 14;
1812 pull->mlen = 1;
1813 }
1814
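/* The pull load fetched a whole 16-byte-aligned vec4, so swizzle every
 * enabled component to start at the dword the constant actually occupies
 * within that vec4 (const_offset % 16 / 4).
 */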
1815 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1816 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1817 const_offset % 16 / 4,
1818 const_offset % 16 / 4,
1819 const_offset % 16 / 4);
1820
1821 /* UBO bools are any nonzero int. We need to convert them to use the
1822 * value of true stored in ctx->Const.UniformBooleanTrue.
1823 */
1824 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1825 emit(CMP(result_dst, packed_consts, src_reg(0u),
1826 BRW_CONDITIONAL_NZ));
1827 } else {
1828 emit(MOV(result_dst, packed_consts));
1829 }
1830 break;
1831 }
1832
1833 case ir_binop_vector_extract:
1834 unreachable("should have been lowered by vec_index_to_cond_assign");
1835
1836 case ir_triop_fma:
1837 op[0] = fix_3src_operand(op[0]);
1838 op[1] = fix_3src_operand(op[1]);
1839 op[2] = fix_3src_operand(op[2]);
1840 /* Note that the instruction's argument order is reversed from GLSL
1841 * and the IR.
1842 */
1843 emit(MAD(result_dst, op[2], op[1], op[0]));
1844 break;
1845
1846 case ir_triop_lrp:
1847 emit_lrp(result_dst, op[0], op[1], op[2]);
1848 break;
1849
1850 case ir_triop_csel:
1851 unreachable("already handled above");
1852 break;
1853
1854 case ir_triop_bfi:
1855 op[0] = fix_3src_operand(op[0]);
1856 op[1] = fix_3src_operand(op[1]);
1857 op[2] = fix_3src_operand(op[2]);
1858 emit(BFI2(result_dst, op[0], op[1], op[2]));
1859 break;
1860
1861 case ir_triop_bitfield_extract:
1862 op[0] = fix_3src_operand(op[0]);
1863 op[1] = fix_3src_operand(op[1]);
1864 op[2] = fix_3src_operand(op[2]);
1865 /* Note that the instruction's argument order is reversed from GLSL
1866 * and the IR.
1867 */
1868 emit(BFE(result_dst, op[2], op[1], op[0]));
1869 break;
1870
1871 case ir_triop_vector_insert:
1872 unreachable("should have been lowered by lower_vector_insert");
1873
1874 case ir_quadop_bitfield_insert:
1875 unreachable("not reached: should be handled by "
1876 "bitfield_insert_to_bfm_bfi\n");
1877
1878 case ir_quadop_vector:
1879 unreachable("not reached: should be handled by lower_quadop_vector");
1880
1881 case ir_unop_pack_half_2x16:
1882 emit_pack_half_2x16(result_dst, op[0]);
1883 break;
1884 case ir_unop_unpack_half_2x16:
1885 emit_unpack_half_2x16(result_dst, op[0]);
1886 break;
1887 case ir_unop_unpack_unorm_4x8:
1888 emit_unpack_unorm_4x8(result_dst, op[0]);
1889 break;
1890 case ir_unop_unpack_snorm_4x8:
1891 emit_unpack_snorm_4x8(result_dst, op[0]);
1892 break;
1893 case ir_unop_pack_unorm_4x8:
1894 emit_pack_unorm_4x8(result_dst, op[0]);
1895 break;
1896 case ir_unop_pack_snorm_4x8:
1897 emit_pack_snorm_4x8(result_dst, op[0]);
1898 break;
1899 case ir_unop_pack_snorm_2x16:
1900 case ir_unop_pack_unorm_2x16:
1901 case ir_unop_unpack_snorm_2x16:
1902 case ir_unop_unpack_unorm_2x16:
1903 unreachable("not reached: should be handled by lower_packing_builtins");
1904 case ir_unop_unpack_half_2x16_split_x:
1905 case ir_unop_unpack_half_2x16_split_y:
1906 case ir_binop_pack_half_2x16_split:
1907 case ir_unop_interpolate_at_centroid:
1908 case ir_binop_interpolate_at_sample:
1909 case ir_binop_interpolate_at_offset:
1910 unreachable("not reached: should not occur in vertex shader");
1911 case ir_binop_ldexp:
1912 unreachable("not reached: should be handled by ldexp_to_arith()");
1913 case ir_unop_d2f:
1914 case ir_unop_f2d:
1915 case ir_unop_d2i:
1916 case ir_unop_i2d:
1917 case ir_unop_d2u:
1918 case ir_unop_u2d:
1919 case ir_unop_d2b:
1920 case ir_unop_pack_double_2x32:
1921 case ir_unop_unpack_double_2x32:
1922 case ir_unop_frexp_sig:
1923 case ir_unop_frexp_exp:
1924 unreachable("fp64 todo");
1925 }
1926 }
1927
1928
1929 void
1930 vec4_visitor::visit(ir_swizzle *ir)
1931 {
1932 src_reg src;
1933 int i = 0;
1934 int swizzle[4];
1935
1936 /* Note that this handles only swizzles in expressions, not those on the
1937 * left-hand side of an assignment, which use write masking instead. See
1938 * ir_assignment for that.
1939 */
1940
1941 ir->val->accept(this);
1942 src = this->result;
1943 assert(src.file != BAD_FILE);
1944
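/* Compose the IR swizzle with whatever swizzle the source value already
 * carries, one channel at a time.
 */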
1945 for (i = 0; i < ir->type->vector_elements; i++) {
1946 switch (i) {
1947 case 0:
1948 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1949 break;
1950 case 1:
1951 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1952 break;
1953 case 2:
1954 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1955 break;
1956 case 3:
1957 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1958 break;
1959 }
1960 }
1961 for (; i < 4; i++) {
1962 /* Replicate the last channel out. */
1963 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1964 }
1965
1966 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1967
1968 this->result = src;
1969 }
1970
1971 void
1972 vec4_visitor::visit(ir_dereference_variable *ir)
1973 {
1974 const struct glsl_type *type = ir->type;
1975 dst_reg *reg = variable_storage(ir->var);
1976
1977 if (!reg) {
1978 fail("Failed to find variable storage for %s\n", ir->var->name);
1979 this->result = src_reg(brw_null_reg());
1980 return;
1981 }
1982
1983 this->result = src_reg(*reg);
1984
1985 /* System values get their swizzle from the dst_reg writemask */
1986 if (ir->var->data.mode == ir_var_system_value)
1987 return;
1988
1989 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1990 this->result.swizzle = swizzle_for_size(type->vector_elements);
1991 }
1992
1993
1994 int
1995 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1996 {
1997 /* Under normal circumstances array elements are stored consecutively, so
1998 * the stride is equal to the size of the array element.
1999 */
2000 return type_size(ir->type);
2001 }
2002
2003
2004 void
2005 vec4_visitor::visit(ir_dereference_array *ir)
2006 {
2007 ir_constant *constant_index;
2008 src_reg src;
2009 int array_stride = compute_array_stride(ir);
2010
2011 constant_index = ir->array_index->constant_expression_value();
2012
2013 ir->array->accept(this);
2014 src = this->result;
2015
2016 if (constant_index) {
2017 src.reg_offset += constant_index->value.i[0] * array_stride;
2018 } else {
2019 /* Variable index array dereference. It eats the "vec4" of the
2020 * base of the array and an index that offsets the Mesa register
2021 * index.
2022 */
2023 ir->array_index->accept(this);
2024
2025 src_reg index_reg;
2026
2027 if (array_stride == 1) {
2028 index_reg = this->result;
2029 } else {
2030 index_reg = src_reg(this, glsl_type::int_type);
2031
2032 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2033 }
2034
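/* If the array base itself was variably indexed, fold its existing reladdr
 * into the index so the source carries a single combined relative offset.
 */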
2035 if (src.reladdr) {
2036 src_reg temp = src_reg(this, glsl_type::int_type);
2037
2038 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2039
2040 index_reg = temp;
2041 }
2042
2043 src.reladdr = ralloc(mem_ctx, src_reg);
2044 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2045 }
2046
2047 /* If the type is smaller than a vec4, replicate the last channel out. */
2048 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2049 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2050 else
2051 src.swizzle = BRW_SWIZZLE_NOOP;
2052 src.type = brw_type_for_base_type(ir->type);
2053
2054 this->result = src;
2055 }
2056
2057 void
2058 vec4_visitor::visit(ir_dereference_record *ir)
2059 {
2060 unsigned int i;
2061 const glsl_type *struct_type = ir->record->type;
2062 int offset = 0;
2063
2064 ir->record->accept(this);
2065
2066 for (i = 0; i < struct_type->length; i++) {
2067 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2068 break;
2069 offset += type_size(struct_type->fields.structure[i].type);
2070 }
2071
2072 /* If the type is smaller than a vec4, replicate the last channel out. */
2073 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2074 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2075 else
2076 this->result.swizzle = BRW_SWIZZLE_NOOP;
2077 this->result.type = brw_type_for_base_type(ir->type);
2078
2079 this->result.reg_offset += offset;
2080 }
2081
2082 /**
2083 * We want to be careful in assignment setup to hit the actual storage
2084 * instead of potentially using a temporary like we might with the
2085 * ir_dereference handler.
2086 */
2087 static dst_reg
2088 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2089 {
2090 /* The LHS must be a dereference. If the LHS is a variable indexed array
2091 * access of a vector, it must be separated into a series of conditional moves
2092 * before reaching this point (see ir_vec_index_to_cond_assign).
2093 */
2094 assert(ir->as_dereference());
2095 ir_dereference_array *deref_array = ir->as_dereference_array();
2096 if (deref_array) {
2097 assert(!deref_array->array->type->is_vector());
2098 }
2099
2100 /* Use the rvalue deref handler for the most part. We'll ignore any
2101 * swizzles in it, though, and express LHS swizzles via the writemask.
2102 */
2103 ir->accept(v);
2104 return dst_reg(v->result);
2105 }
2106
2107 void
2108 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2109 const struct glsl_type *type,
2110 enum brw_predicate predicate)
2111 {
2112 if (type->base_type == GLSL_TYPE_STRUCT) {
2113 for (unsigned int i = 0; i < type->length; i++) {
2114 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2115 }
2116 return;
2117 }
2118
2119 if (type->is_array()) {
2120 for (unsigned int i = 0; i < type->length; i++) {
2121 emit_block_move(dst, src, type->fields.array, predicate);
2122 }
2123 return;
2124 }
2125
2126 if (type->is_matrix()) {
2127 const struct glsl_type *vec_type;
2128
2129 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2130 type->vector_elements, 1);
2131
2132 for (int i = 0; i < type->matrix_columns; i++) {
2133 emit_block_move(dst, src, vec_type, predicate);
2134 }
2135 return;
2136 }
2137
2138 assert(type->is_scalar() || type->is_vector());
2139
2140 dst->type = brw_type_for_base_type(type);
2141 src->type = dst->type;
2142
2143 dst->writemask = (1 << type->vector_elements) - 1;
2144
2145 src->swizzle = swizzle_for_size(type->vector_elements);
2146
2147 vec4_instruction *inst = emit(MOV(*dst, *src));
2148 inst->predicate = predicate;
2149
2150 dst->reg_offset++;
2151 src->reg_offset++;
2152 }
2153
2154
2155 /* If the RHS processing resulted in an instruction generating a
2156 * temporary value, and it would be easy to rewrite the instruction to
2157 * generate its result right into the LHS instead, do so. This ends
2158 * up reliably removing instructions where it can be tricky to do so
2159 * later without real UD chain information.
2160 */
2161 bool
2162 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2163 dst_reg dst,
2164 src_reg src,
2165 vec4_instruction *pre_rhs_inst,
2166 vec4_instruction *last_rhs_inst)
2167 {
2168 /* This could be supported, but it would take more smarts. */
2169 if (ir->condition)
2170 return false;
2171
2172 if (pre_rhs_inst == last_rhs_inst)
2173 return false; /* No instructions generated to work with. */
2174
2175 /* Make sure the last instruction generated our source reg. */
2176 if (src.file != GRF ||
2177 src.file != last_rhs_inst->dst.file ||
2178 src.reg != last_rhs_inst->dst.reg ||
2179 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2180 src.reladdr ||
2181 src.abs ||
2182 src.negate ||
2183 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2184 return false;
2185
2186 /* Check that the last instruction fully initialized the channels
2187 * we want to use, in the order we want to use them. We could
2188 * potentially reswizzle the operands of many instructions so that
2189 * we could handle out-of-order channels, but we don't do that yet.
2190 */
2191
2192 for (unsigned i = 0; i < 4; i++) {
2193 if (dst.writemask & (1 << i)) {
2194 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2195 return false;
2196
2197 if (BRW_GET_SWZ(src.swizzle, i) != i)
2198 return false;
2199 }
2200 }
2201
2202 /* Success! Rewrite the instruction. */
2203 last_rhs_inst->dst.file = dst.file;
2204 last_rhs_inst->dst.reg = dst.reg;
2205 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2206 last_rhs_inst->dst.reladdr = dst.reladdr;
2207 last_rhs_inst->dst.writemask &= dst.writemask;
2208
2209 return true;
2210 }
2211
2212 void
2213 vec4_visitor::visit(ir_assignment *ir)
2214 {
2215 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2216 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2217
2218 if (!ir->lhs->type->is_scalar() &&
2219 !ir->lhs->type->is_vector()) {
2220 ir->rhs->accept(this);
2221 src_reg src = this->result;
2222
2223 if (ir->condition) {
2224 emit_bool_to_cond_code(ir->condition, &predicate);
2225 }
2226
2227 /* emit_block_move doesn't account for swizzles in the source register.
2228 * This should be ok, since the source register is a structure or an
2229 * array, and those can't be swizzled. But double-check to be sure.
2230 */
2231 assert(src.swizzle ==
2232 (ir->rhs->type->is_matrix()
2233 ? swizzle_for_size(ir->rhs->type->vector_elements)
2234 : BRW_SWIZZLE_NOOP));
2235
2236 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2237 return;
2238 }
2239
2240 /* Now we're down to just a scalar/vector with writemasks. */
2241 int i;
2242
2243 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2244 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2245
2246 ir->rhs->accept(this);
2247
2248 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2249
2250 src_reg src = this->result;
2251
2252 int swizzles[4];
2253 int first_enabled_chan = 0;
2254 int src_chan = 0;
2255
2256 assert(ir->lhs->type->is_vector() ||
2257 ir->lhs->type->is_scalar());
2258 dst.writemask = ir->write_mask;
2259
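/* Note the swizzle component of the first channel actually written; the
 * unwritten channels are filled with it below so they only ever reference
 * channels the RHS really produced.
 */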
2260 for (int i = 0; i < 4; i++) {
2261 if (dst.writemask & (1 << i)) {
2262 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2263 break;
2264 }
2265 }
2266
2267 /* Swizzle a small RHS vector into the channels being written.
2268 *
2269 * GLSL IR treats write_mask as dictating how many channels are
2270 * present on the RHS, while in our instructions we need to make
2271 * those channels appear in the slots of the vec4 they're written to.
2272 */
2273 for (int i = 0; i < 4; i++) {
2274 if (dst.writemask & (1 << i))
2275 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2276 else
2277 swizzles[i] = first_enabled_chan;
2278 }
2279 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2280 swizzles[2], swizzles[3]);
2281
2282 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2283 return;
2284 }
2285
2286 if (ir->condition) {
2287 emit_bool_to_cond_code(ir->condition, &predicate);
2288 }
2289
2290 for (i = 0; i < type_size(ir->lhs->type); i++) {
2291 vec4_instruction *inst = emit(MOV(dst, src));
2292 inst->predicate = predicate;
2293
2294 dst.reg_offset++;
2295 src.reg_offset++;
2296 }
2297 }
2298
2299 void
2300 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2301 {
2302 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2303 foreach_in_list(ir_constant, field_value, &ir->components) {
2304 emit_constant_values(dst, field_value);
2305 }
2306 return;
2307 }
2308
2309 if (ir->type->is_array()) {
2310 for (unsigned int i = 0; i < ir->type->length; i++) {
2311 emit_constant_values(dst, ir->array_elements[i]);
2312 }
2313 return;
2314 }
2315
2316 if (ir->type->is_matrix()) {
2317 for (int i = 0; i < ir->type->matrix_columns; i++) {
2318 float *vec = &ir->value.f[i * ir->type->vector_elements];
2319
2320 for (int j = 0; j < ir->type->vector_elements; j++) {
2321 dst->writemask = 1 << j;
2322 dst->type = BRW_REGISTER_TYPE_F;
2323
2324 emit(MOV(*dst, src_reg(vec[j])));
2325 }
2326 dst->reg_offset++;
2327 }
2328 return;
2329 }
2330
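/* Scalar/vector constants: emit one MOV per distinct component value, using
 * the writemask to cover all components that share that value.
 */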
2331 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2332
2333 for (int i = 0; i < ir->type->vector_elements; i++) {
2334 if (!(remaining_writemask & (1 << i)))
2335 continue;
2336
2337 dst->writemask = 1 << i;
2338 dst->type = brw_type_for_base_type(ir->type);
2339
2340 /* Find other components that match the one we're about to
2341 * write. Emits fewer instructions for things like vec4(0.5,
2342 * 1.5, 1.5, 1.5).
2343 */
2344 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2345 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2346 if (ir->value.b[i] == ir->value.b[j])
2347 dst->writemask |= (1 << j);
2348 } else {
2349 /* u, i, and f storage all line up, so no need for a
2350 * switch case for comparing each type.
2351 */
2352 if (ir->value.u[i] == ir->value.u[j])
2353 dst->writemask |= (1 << j);
2354 }
2355 }
2356
2357 switch (ir->type->base_type) {
2358 case GLSL_TYPE_FLOAT:
2359 emit(MOV(*dst, src_reg(ir->value.f[i])));
2360 break;
2361 case GLSL_TYPE_INT:
2362 emit(MOV(*dst, src_reg(ir->value.i[i])));
2363 break;
2364 case GLSL_TYPE_UINT:
2365 emit(MOV(*dst, src_reg(ir->value.u[i])));
2366 break;
2367 case GLSL_TYPE_BOOL:
2368 emit(MOV(*dst,
2369 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2370 : 0)));
2371 break;
2372 default:
2373 unreachable("Non-float/uint/int/bool constant");
2374 }
2375
2376 remaining_writemask &= ~dst->writemask;
2377 }
2378 dst->reg_offset++;
2379 }
2380
2381 void
2382 vec4_visitor::visit(ir_constant *ir)
2383 {
2384 dst_reg dst = dst_reg(this, ir->type);
2385 this->result = src_reg(dst);
2386
2387 emit_constant_values(&dst, ir);
2388 }
2389
2390 void
2391 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2392 {
2393 ir_dereference *deref = static_cast<ir_dereference *>(
2394 ir->actual_parameters.get_head());
2395 ir_variable *location = deref->variable_referenced();
2396 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2397 location->data.binding);
2398
2399 /* Calculate the surface offset */
2400 src_reg offset(this, glsl_type::uint_type);
2401 ir_dereference_array *deref_array = deref->as_dereference_array();
2402 if (deref_array) {
2403 deref_array->array_index->accept(this);
2404
2405 src_reg tmp(this, glsl_type::uint_type);
2406 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2407 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2408 } else {
2409 offset = location->data.atomic.offset;
2410 }
2411
2412 /* Emit the appropriate machine instruction */
2413 const char *callee = ir->callee->function_name();
2414 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2415
2416 if (!strcmp("__intrinsic_atomic_read", callee)) {
2417 emit_untyped_surface_read(surf_index, dst, offset);
2418
2419 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2420 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2421 src_reg(), src_reg());
2422
2423 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2424 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2425 src_reg(), src_reg());
2426 }
2427 }
2428
2429 void
2430 vec4_visitor::visit(ir_call *ir)
2431 {
2432 const char *callee = ir->callee->function_name();
2433
2434 if (!strcmp("__intrinsic_atomic_read", callee) ||
2435 !strcmp("__intrinsic_atomic_increment", callee) ||
2436 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2437 visit_atomic_counter_intrinsic(ir);
2438 } else {
2439 unreachable("Unsupported intrinsic.");
2440 }
2441 }
2442
2443 src_reg
2444 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2445 {
2446 vec4_instruction *inst =
2447 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2448 dst_reg(this, glsl_type::uvec4_type));
2449 inst->base_mrf = 2;
2450 inst->mlen = 1;
2451 inst->src[1] = sampler;
2452
2453 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2454 int param_base = inst->base_mrf;
2455 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2456 int zero_mask = 0xf & ~coord_mask;
2457
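/* Write the coordinate into the low channels of the payload register and
 * zero out the remaining channels.
 */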
2458 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2459 coordinate));
2460
2461 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2462 src_reg(0)));
2463
2464 emit(inst);
2465 return src_reg(inst->dst);
2466 }
2467
2468 static bool
2469 is_high_sampler(struct brw_context *brw, src_reg sampler)
2470 {
2471 if (brw->gen < 8 && !brw->is_haswell)
2472 return false;
2473
2474 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2475 }
2476
2477 void
2478 vec4_visitor::visit(ir_texture *ir)
2479 {
2480 uint32_t sampler =
2481 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2482
2483 ir_rvalue *nonconst_sampler_index =
2484 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2485
2486 /* Handle non-constant sampler array indexing */
2487 src_reg sampler_reg;
2488 if (nonconst_sampler_index) {
2489 /* The highest sampler which may be used by this operation is
2490 * the last element of the array. Mark it here, because the generator
2491 * doesn't have enough information to determine the bound.
2492 */
2493 uint32_t array_size = ir->sampler->as_dereference_array()
2494 ->array->type->array_size();
2495
2496 uint32_t max_used = sampler + array_size - 1;
2497 if (ir->op == ir_tg4 && brw->gen < 8) {
2498 max_used += prog_data->base.binding_table.gather_texture_start;
2499 } else {
2500 max_used += prog_data->base.binding_table.texture_start;
2501 }
2502
2503 brw_mark_surface_used(&prog_data->base, max_used);
2504
2505 /* Emit code to evaluate the actual indexing expression */
2506 nonconst_sampler_index->accept(this);
2507 dst_reg temp(this, glsl_type::uint_type);
2508 emit(ADD(temp, this->result, src_reg(sampler)))
2509 ->force_writemask_all = true;
2510 sampler_reg = src_reg(temp);
2511 } else {
2512 /* Single sampler, or constant array index; the indexing expression
2513 * is just an immediate.
2514 */
2515 sampler_reg = src_reg(sampler);
2516 }
2517
2518 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2519 * emitting anything other than setting up the constant result.
2520 */
2521 if (ir->op == ir_tg4) {
2522 ir_constant *chan = ir->lod_info.component->as_constant();
2523 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2524 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2525 dst_reg result(this, ir->type);
2526 this->result = src_reg(result);
2527 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2528 return;
2529 }
2530 }
2531
2532 /* Should be lowered by do_lower_texture_projection */
2533 assert(!ir->projector);
2534
2535 /* Should be lowered */
2536 assert(!ir->offset || !ir->offset->type->is_array());
2537
2538 /* Generate code to compute all the subexpression trees. This has to be
2539 * done before loading any values into MRFs for the sampler message since
2540 * generating these values may involve SEND messages that need the MRFs.
2541 */
2542 src_reg coordinate;
2543 if (ir->coordinate) {
2544 ir->coordinate->accept(this);
2545 coordinate = this->result;
2546 }
2547
2548 src_reg shadow_comparitor;
2549 if (ir->shadow_comparitor) {
2550 ir->shadow_comparitor->accept(this);
2551 shadow_comparitor = this->result;
2552 }
2553
2554 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2555 src_reg offset_value;
2556 if (has_nonconstant_offset) {
2557 ir->offset->accept(this);
2558 offset_value = src_reg(this->result);
2559 }
2560
2561 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2562 src_reg lod, dPdx, dPdy, sample_index, mcs;
2563 switch (ir->op) {
2564 case ir_tex:
2565 lod = src_reg(0.0f);
2566 lod_type = glsl_type::float_type;
2567 break;
2568 case ir_txf:
2569 case ir_txl:
2570 case ir_txs:
2571 ir->lod_info.lod->accept(this);
2572 lod = this->result;
2573 lod_type = ir->lod_info.lod->type;
2574 break;
2575 case ir_query_levels:
2576 lod = src_reg(0);
2577 lod_type = glsl_type::int_type;
2578 break;
2579 case ir_txf_ms:
2580 ir->lod_info.sample_index->accept(this);
2581 sample_index = this->result;
2582 sample_index_type = ir->lod_info.sample_index->type;
2583
2584 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2585 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2586 else
2587 mcs = src_reg(0u);
2588 break;
2589 case ir_txd:
2590 ir->lod_info.grad.dPdx->accept(this);
2591 dPdx = this->result;
2592
2593 ir->lod_info.grad.dPdy->accept(this);
2594 dPdy = this->result;
2595
2596 lod_type = ir->lod_info.grad.dPdx->type;
2597 break;
2598 case ir_txb:
2599 case ir_lod:
2600 case ir_tg4:
2601 break;
2602 }
2603
2604 enum opcode opcode;
2605 switch (ir->op) {
2606 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2607 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2608 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2609 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2610 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2611 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2612 case ir_tg4: opcode = has_nonconstant_offset
2613 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2614 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2615 case ir_txb:
2616 unreachable("TXB is not valid for vertex shaders.");
2617 case ir_lod:
2618 unreachable("LOD is not valid for vertex shaders.");
2619 default:
2620 unreachable("Unrecognized tex op");
2621 }
2622
2623 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2624 opcode, dst_reg(this, ir->type));
2625
2626 if (ir->offset != NULL && !has_nonconstant_offset) {
2627 inst->offset =
2628 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2629 ir->offset->type->vector_elements);
2630 }
2631
2632 /* Stuff the channel select bits in the top of the texture offset */
2633 if (ir->op == ir_tg4)
2634 inst->offset |= gather_channel(ir, sampler) << 16;
2635
2636 /* The message header is necessary for:
2637 * - Gen4 (always)
2638 * - Gen9+ for selecting SIMD4x2
2639 * - Texel offsets
2640 * - Gather channel selection
2641 * - Sampler indices too large to fit in a 4-bit value.
2642 */
2643 inst->header_present =
2644 brw->gen < 5 || brw->gen >= 9 ||
2645 inst->offset != 0 || ir->op == ir_tg4 ||
2646 is_high_sampler(brw, sampler_reg);
2647 inst->base_mrf = 2;
2648 inst->mlen = inst->header_present + 1; /* always at least one */
2649 inst->dst.writemask = WRITEMASK_XYZW;
2650 inst->shadow_compare = ir->shadow_comparitor != NULL;
2651
2652 inst->src[1] = sampler_reg;
2653
2654 /* MRF for the first parameter */
2655 int param_base = inst->base_mrf + inst->header_present;
2656
2657 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2658 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2659 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2660 } else {
2661 /* Load the coordinate */
2662 /* FINISHME: gl_clamp_mask and saturate */
2663 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2664 int zero_mask = 0xf & ~coord_mask;
2665
2666 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2667 coordinate));
2668
2669 if (zero_mask != 0) {
2670 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2671 src_reg(0)));
2672 }
2673 /* Load the shadow comparitor */
2674 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2675 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2676 WRITEMASK_X),
2677 shadow_comparitor));
2678 inst->mlen++;
2679 }
2680
2681 /* Load the LOD info */
2682 if (ir->op == ir_tex || ir->op == ir_txl) {
2683 int mrf, writemask;
2684 if (brw->gen >= 5) {
2685 mrf = param_base + 1;
2686 if (ir->shadow_comparitor) {
2687 writemask = WRITEMASK_Y;
2688 /* mlen already incremented */
2689 } else {
2690 writemask = WRITEMASK_X;
2691 inst->mlen++;
2692 }
2693 } else /* brw->gen == 4 */ {
2694 mrf = param_base;
2695 writemask = WRITEMASK_W;
2696 }
2697 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2698 } else if (ir->op == ir_txf) {
2699 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2700 } else if (ir->op == ir_txf_ms) {
2701 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2702 sample_index));
2703 if (brw->gen >= 7) {
2704 /* MCS data is in the first channel of `mcs`, but we need to get it into
2705 * the .y channel of the second vec4 of params, so replicate .x across
2706 * the whole vec4 and then mask off everything except .y
2707 */
2708 mcs.swizzle = BRW_SWIZZLE_XXXX;
2709 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2710 mcs));
2711 }
2712 inst->mlen++;
2713 } else if (ir->op == ir_txd) {
2714 const glsl_type *type = lod_type;
2715
2716 if (brw->gen >= 5) {
2717 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2718 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2719 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2720 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2721 inst->mlen++;
2722
2723 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2724 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2725 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2726 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2727 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2728 inst->mlen++;
2729
2730 if (ir->shadow_comparitor) {
2731 emit(MOV(dst_reg(MRF, param_base + 2,
2732 ir->shadow_comparitor->type, WRITEMASK_Z),
2733 shadow_comparitor));
2734 }
2735 }
2736 } else /* brw->gen == 4 */ {
2737 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2738 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2739 inst->mlen += 2;
2740 }
2741 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2742 if (ir->shadow_comparitor) {
2743 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2744 shadow_comparitor));
2745 }
2746
2747 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2748 offset_value));
2749 inst->mlen++;
2750 }
2751 }
2752
2753 emit(inst);
2754
2755 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2756 * faces * layers, but the spec requires just layers.
2757 */
2758 if (ir->op == ir_txs) {
2759 glsl_type const *type = ir->sampler->type;
2760 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2761 type->sampler_array) {
2762 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2763 writemask(inst->dst, WRITEMASK_Z),
2764 src_reg(inst->dst), src_reg(6));
2765 }
2766 }
2767
2768 if (brw->gen == 6 && ir->op == ir_tg4) {
2769 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2770 }
2771
2772 swizzle_result(ir, src_reg(inst->dst), sampler);
2773 }
2774
2775 /**
2776 * Apply workarounds for Gen6 gather with UINT/SINT
2777 */
2778 void
2779 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2780 {
2781 if (!wa)
2782 return;
2783
2784 int width = (wa & WA_8BIT) ? 8 : 16;
2785 dst_reg dst_f = dst;
2786 dst_f.type = BRW_REGISTER_TYPE_F;
2787
2788 /* Convert from UNORM to UINT */
2789 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2790 emit(MOV(dst, src_reg(dst_f)));
2791
2792 if (wa & WA_SIGN) {
2793 /* Reinterpret the UINT value as a signed INT value by
2794 * shifting the sign bit into place, then shifting back
2795 * preserving sign.
2796 */
2797 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2798 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2799 }
2800 }
2801
2802 /**
2803 * Set up the gather channel based on the swizzle, for gather4.
2804 */
2805 uint32_t
2806 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2807 {
2808 ir_constant *chan = ir->lod_info.component->as_constant();
2809 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2810 switch (swiz) {
2811 case SWIZZLE_X: return 0;
2812 case SWIZZLE_Y:
2813 /* gather4 sampler is broken for green channel on RG32F --
2814 * we must ask for blue instead.
2815 */
2816 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2817 return 2;
2818 return 1;
2819 case SWIZZLE_Z: return 2;
2820 case SWIZZLE_W: return 3;
2821 default:
2822 unreachable("Not reached"); /* zero, one swizzles handled already */
2823 }
2824 }
2825
2826 void
2827 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2828 {
2829 int s = key->tex.swizzles[sampler];
2830
2831 this->result = src_reg(this, ir->type);
2832 dst_reg swizzled_result(this->result);
2833
2834 if (ir->op == ir_query_levels) {
2835 /* # levels is in .w */
2836 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2837 emit(MOV(swizzled_result, orig_val));
2838 return;
2839 }
2840
2841 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2842 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2843 emit(MOV(swizzled_result, orig_val));
2844 return;
2845 }
2846
2847
2848 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2849 int swizzle[4] = {0};
2850
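/* Partition the GL texture swizzle into channels copied from the sampler
 * result and channels forced to constant zero or one.
 */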
2851 for (int i = 0; i < 4; i++) {
2852 switch (GET_SWZ(s, i)) {
2853 case SWIZZLE_ZERO:
2854 zero_mask |= (1 << i);
2855 break;
2856 case SWIZZLE_ONE:
2857 one_mask |= (1 << i);
2858 break;
2859 default:
2860 copy_mask |= (1 << i);
2861 swizzle[i] = GET_SWZ(s, i);
2862 break;
2863 }
2864 }
2865
2866 if (copy_mask) {
2867 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2868 swizzled_result.writemask = copy_mask;
2869 emit(MOV(swizzled_result, orig_val));
2870 }
2871
2872 if (zero_mask) {
2873 swizzled_result.writemask = zero_mask;
2874 emit(MOV(swizzled_result, src_reg(0.0f)));
2875 }
2876
2877 if (one_mask) {
2878 swizzled_result.writemask = one_mask;
2879 emit(MOV(swizzled_result, src_reg(1.0f)));
2880 }
2881 }
2882
2883 void
2884 vec4_visitor::visit(ir_return *)
2885 {
2886 unreachable("not reached");
2887 }
2888
2889 void
2890 vec4_visitor::visit(ir_discard *)
2891 {
2892 unreachable("not reached");
2893 }
2894
2895 void
2896 vec4_visitor::visit(ir_if *ir)
2897 {
2898 /* Don't point the annotation at the if statement, because then it plus
2899 * the then and else blocks get printed.
2900 */
2901 this->base_ir = ir->condition;
2902
2903 if (brw->gen == 6) {
2904 emit_if_gen6(ir);
2905 } else {
2906 enum brw_predicate predicate;
2907 emit_bool_to_cond_code(ir->condition, &predicate);
2908 emit(IF(predicate));
2909 }
2910
2911 visit_instructions(&ir->then_instructions);
2912
2913 if (!ir->else_instructions.is_empty()) {
2914 this->base_ir = ir->condition;
2915 emit(BRW_OPCODE_ELSE);
2916
2917 visit_instructions(&ir->else_instructions);
2918 }
2919
2920 this->base_ir = ir->condition;
2921 emit(BRW_OPCODE_ENDIF);
2922 }
2923
2924 void
2925 vec4_visitor::visit(ir_emit_vertex *)
2926 {
2927 unreachable("not reached");
2928 }
2929
2930 void
2931 vec4_visitor::visit(ir_end_primitive *)
2932 {
2933 unreachable("not reached");
2934 }
2935
2936 void
2937 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2938 dst_reg dst, src_reg offset,
2939 src_reg src0, src_reg src1)
2940 {
2941 unsigned mlen = 0;
2942
2943 /* Set the atomic operation offset. */
2944 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2945 mlen++;
2946
2947 /* Set the atomic operation arguments. */
2948 if (src0.file != BAD_FILE) {
2949 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2950 mlen++;
2951 }
2952
2953 if (src1.file != BAD_FILE) {
2954 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2955 mlen++;
2956 }
2957
2958 /* Emit the instruction. Note that this maps to the normal SIMD8
2959 * untyped atomic message on Ivy Bridge, but that's OK because
2960 * unused channels will be masked out.
2961 */
2962 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2963 src_reg(atomic_op), src_reg(surf_index));
2964 inst->base_mrf = 0;
2965 inst->mlen = mlen;
2966 }
2967
2968 void
2969 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2970 src_reg offset)
2971 {
2972 /* Set the surface read offset. */
2973 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2974
2975 /* Emit the instruction. Note that this maps to the normal SIMD8
2976 * untyped surface read message, but that's OK because unused
2977 * channels will be masked out.
2978 */
2979 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2980 dst, src_reg(surf_index));
2981 inst->base_mrf = 0;
2982 inst->mlen = 1;
2983 }
2984
2985 void
2986 vec4_visitor::emit_ndc_computation()
2987 {
2988 /* Get the position */
2989 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2990
2991 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2992 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2993 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2994
2995 current_annotation = "NDC";
2996 dst_reg ndc_w = ndc;
2997 ndc_w.writemask = WRITEMASK_W;
2998 src_reg pos_w = pos;
2999 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3000 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3001
3002 dst_reg ndc_xyz = ndc;
3003 ndc_xyz.writemask = WRITEMASK_XYZ;
3004
3005 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3006 }
3007
3008 void
3009 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3010 {
3011 if (brw->gen < 6 &&
3012 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3013 key->userclip_active || brw->has_negative_rhw_bug)) {
3014 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3015 dst_reg header1_w = header1;
3016 header1_w.writemask = WRITEMASK_W;
3017
3018 emit(MOV(header1, 0u));
3019
3020 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3021 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3022
3023 current_annotation = "Point size";
3024 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3025 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3026 }
3027
3028 if (key->userclip_active) {
3029 current_annotation = "Clipping flags";
3030 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3031 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3032
3033 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3034 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3035 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3036
3037 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3038 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3039 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3040 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3041 }
3042
3043 /* i965 clipping workaround:
3044 * 1) Test for -ve rhw
3045 * 2) If set,
3046 * set ndc = (0,0,0,0)
3047 * set ucp[6] = 1
3048 *
3049 * Later, clipping will detect ucp[6] and ensure the primitive is
3050 * clipped against all fixed planes.
3051 */
3052 if (brw->has_negative_rhw_bug) {
3053 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3054 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3055 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3056 vec4_instruction *inst;
3057 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3058 inst->predicate = BRW_PREDICATE_NORMAL;
3059 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3060 inst->predicate = BRW_PREDICATE_NORMAL;
3061 }
3062
3063 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3064 } else if (brw->gen < 6) {
3065 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3066 } else {
3067 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3068 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3069 dst_reg reg_w = reg;
3070 reg_w.writemask = WRITEMASK_W;
3071 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3072 }
3073 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3074 dst_reg reg_y = reg;
3075 reg_y.writemask = WRITEMASK_Y;
3076 reg_y.type = BRW_REGISTER_TYPE_D;
3077 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3078 }
3079 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3080 dst_reg reg_z = reg;
3081 reg_z.writemask = WRITEMASK_Z;
3082 reg_z.type = BRW_REGISTER_TYPE_D;
3083 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3084 }
3085 }
3086 }
3087
3088 void
3089 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3090 {
3091 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3092 *
3093 * "If a linked set of shaders forming the vertex stage contains no
3094 * static write to gl_ClipVertex or gl_ClipDistance, but the
3095 * application has requested clipping against user clip planes through
3096 * the API, then the coordinate written to gl_Position is used for
3097 * comparison against the user clip planes."
3098 *
3099 * This function is only called if the shader didn't write to
3100 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3101 * if the user wrote to it; otherwise we use gl_Position.
3102 */
3103 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3104 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3105 clip_vertex = VARYING_SLOT_POS;
3106 }
3107
3108 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3109 ++i) {
3110 reg.writemask = 1 << i;
3111 emit(DP4(reg,
3112 src_reg(output_reg[clip_vertex]),
3113 src_reg(this->userplane[i + offset])));
3114 }
3115 }
3116
3117 vec4_instruction *
3118 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3119 {
3120 assert(varying < VARYING_SLOT_MAX);
3121 reg.type = output_reg[varying].type;
3122 current_annotation = output_reg_annotation[varying];
3123 /* Copy the register, saturating if necessary */
3124 return emit(MOV(reg, src_reg(output_reg[varying])));
3125 }
3126
3127 void
3128 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3129 {
3130 reg.type = BRW_REGISTER_TYPE_F;
3131
3132 switch (varying) {
3133 case VARYING_SLOT_PSIZ:
3134 {
3135 /* PSIZ is always in slot 0, and is coupled with other flags. */
3136 current_annotation = "indices, point width, clip flags";
3137 emit_psiz_and_flags(reg);
3138 break;
3139 }
3140 case BRW_VARYING_SLOT_NDC:
3141 current_annotation = "NDC";
3142 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3143 break;
3144 case VARYING_SLOT_POS:
3145 current_annotation = "gl_Position";
3146 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3147 break;
3148 case VARYING_SLOT_EDGE:
3149 /* This is present when doing unfilled polygons. We're supposed to copy
3150 * the edge flag from the user-provided vertex array
3151 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3152 * of that attribute (starts as 1.0f). This is then used in clipping to
3153 * determine which edges should be drawn as wireframe.
3154 */
3155 current_annotation = "edge flag";
3156 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3157 glsl_type::float_type, WRITEMASK_XYZW))));
3158 break;
3159 case BRW_VARYING_SLOT_PAD:
3160 /* No need to write to this slot */
3161 break;
3162 case VARYING_SLOT_COL0:
3163 case VARYING_SLOT_COL1:
3164 case VARYING_SLOT_BFC0:
3165 case VARYING_SLOT_BFC1: {
3166 /* These built-in varyings are only supported in compatibility mode,
3167 * and we only support GS in core profile. So, this must be a vertex
3168 * shader.
3169 */
3170 assert(stage == MESA_SHADER_VERTEX);
3171 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3172 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3173 inst->saturate = true;
3174 break;
3175 }
3176
3177 default:
3178 emit_generic_urb_slot(reg, varying);
3179 break;
3180 }
3181 }
3182
3183 static int
3184 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3185 {
3186 if (brw->gen >= 6) {
3187 /* URB data written (does not include the message header reg) must
3188 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3189 * section 5.4.3.2.2: URB_INTERLEAVED.
3190 *
3191 * URB entries are allocated on a multiple of 1024 bits, so an
3192 * extra 128 bits written here to make the end align to 256 is
3193 * no problem.
3194 */
3195 if ((mlen % 2) != 1)
3196 mlen++;
3197 }
3198
3199 return mlen;
3200 }
3201
3202
3203 /**
3204 * Generates the VUE payload plus the necessary URB write instructions to
3205 * output it.
3206 *
3207 * The VUE layout is documented in Volume 2a.
3208 */
3209 void
3210 vec4_visitor::emit_vertex()
3211 {
3212 /* MRF 0 is reserved for the debugger, so start with message header
3213 * in MRF 1.
3214 */
3215 int base_mrf = 1;
3216 int mrf = base_mrf;
3217 /* In the process of generating our URB write message contents, we
3218 * may need to unspill a register or load from an array. Those
3219 * reads would use MRFs 14-15.
3220 */
3221 int max_usable_mrf = 13;
3222
3223 /* The following assertion verifies that max_usable_mrf causes an
3224 * even-numbered amount of URB write data, which will meet gen6's
3225 * requirements for length alignment.
3226 */
3227 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3228
3229 /* First mrf is the g0-based message header containing URB handles and
3230 * such.
3231 */
3232 emit_urb_write_header(mrf++);
3233
3234 if (brw->gen < 6) {
3235 emit_ndc_computation();
3236 }
3237
3238 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3239 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3240 current_annotation = "user clip distances";
3241
3242 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3243 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3244
3245 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3246 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3247 }
3248
3249 /* We may need to split this up into several URB writes, so do them in a
3250 * loop.
3251 */
3252 int slot = 0;
3253 bool complete = false;
3254 do {
3255 /* URB offset is in URB row increments, and each of our MRFs is half of
3256 * one of those, since we're doing interleaved writes.
3257 */
3258 int offset = slot / 2;
3259
3260 mrf = base_mrf + 1;
3261 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3262 emit_urb_slot(dst_reg(MRF, mrf++),
3263 prog_data->vue_map.slot_to_varying[slot]);
3264
3265 /* If this was max_usable_mrf, we can't fit anything more into this
3266 * URB WRITE.
3267 */
3268 if (mrf > max_usable_mrf) {
3269 slot++;
3270 break;
3271 }
3272 }
3273
3274 complete = slot >= prog_data->vue_map.num_slots;
3275 current_annotation = "URB write";
3276 vec4_instruction *inst = emit_urb_write_opcode(complete);
3277 inst->base_mrf = base_mrf;
3278 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3279 inst->offset += offset;
3280 } while(!complete);
3281 }
3282
3283
3284 src_reg
3285 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3286 src_reg *reladdr, int reg_offset)
3287 {
3288 /* Because we store the values to scratch interleaved like our
3289 * vertex data, we need to scale the vec4 index by 2.
3290 */
3291 int message_header_scale = 2;
3292
3293 /* Pre-gen6, the message header uses byte offsets instead of vec4
3294 * (16-byte) offset units.
3295 */
3296 if (brw->gen < 6)
3297 message_header_scale *= 16;
3298
3299 if (reladdr) {
3300 src_reg index = src_reg(this, glsl_type::int_type);
3301
3302 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3303 src_reg(reg_offset)));
3304 emit_before(block, inst, MUL(dst_reg(index), index,
3305 src_reg(message_header_scale)));
3306
3307 return index;
3308 } else {
3309 return src_reg(reg_offset * message_header_scale);
3310 }
3311 }
3312
3313 src_reg
3314 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3315 src_reg *reladdr, int reg_offset)
3316 {
3317 if (reladdr) {
3318 src_reg index = src_reg(this, glsl_type::int_type);
3319
3320 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3321 src_reg(reg_offset)));
3322
3323 /* Pre-gen6, the message header uses byte offsets instead of vec4
3324 * (16-byte) offset units.
3325 */
3326 if (brw->gen < 6) {
3327 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3328 }
3329
3330 return index;
3331 } else if (brw->gen >= 8) {
3332 /* Store the offset in a GRF so we can send-from-GRF. */
3333 src_reg offset = src_reg(this, glsl_type::int_type);
3334 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3335 return offset;
3336 } else {
3337 int message_header_scale = brw->gen < 6 ? 16 : 1;
3338 return src_reg(reg_offset * message_header_scale);
3339 }
3340 }
3341
3342 /**
3343 * Emits an instruction before @inst to load the value named by @orig_src
3344 * from scratch space at @base_offset to @temp.
3345 *
3346 * @base_offset is measured in 32-byte units (the size of a register).
3347 */
3348 void
3349 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3350 dst_reg temp, src_reg orig_src,
3351 int base_offset)
3352 {
3353 int reg_offset = base_offset + orig_src.reg_offset;
3354 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3355 reg_offset);
3356
3357 emit_before(block, inst, SCRATCH_READ(temp, index));
3358 }
3359
3360 /**
3361 * Emits an instruction after @inst to store the value to be written
3362 * to @orig_dst to scratch space at @base_offset, from @temp.
3363 *
3364 * @base_offset is measured in 32-byte units (the size of a register).
3365 */
3366 void
3367 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3368 int base_offset)
3369 {
3370 int reg_offset = base_offset + inst->dst.reg_offset;
3371 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3372 reg_offset);
3373
3374 /* Create a temporary register to store *inst's result in.
3375 *
3376 * We have to be careful in MOVing from our temporary result register in
3377 * the scratch write. If we swizzle from channels of the temporary that
3378 * weren't initialized, it will confuse live interval analysis, which will
3379 * make spilling fail to make progress.
3380 */
3381 src_reg temp = src_reg(this, glsl_type::vec4_type);
3382 temp.type = inst->dst.type;
3383 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3384 int swizzles[4];
3385 for (int i = 0; i < 4; i++)
3386 if (inst->dst.writemask & (1 << i))
3387 swizzles[i] = i;
3388 else
3389 swizzles[i] = first_writemask_chan;
3390 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3391 swizzles[2], swizzles[3]);
3392
3393 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3394 inst->dst.writemask));
3395 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3396 write->predicate = inst->predicate;
3397 write->ir = inst->ir;
3398 write->annotation = inst->annotation;
3399 inst->insert_after(block, write);
3400
3401 inst->dst.file = temp.file;
3402 inst->dst.reg = temp.reg;
3403 inst->dst.reg_offset = temp.reg_offset;
3404 inst->dst.reladdr = NULL;
3405 }
3406
3407 /**
3408 * We can't generally support array access in GRF space, because a
3409 * single instruction's destination can only span 2 contiguous
3410 * registers. So, we send all GRF arrays that get variable index
3411 * access to scratch space.
3412 */
3413 void
3414 vec4_visitor::move_grf_array_access_to_scratch()
3415 {
3416 int scratch_loc[this->alloc.count];
3417 memset(scratch_loc, -1, sizeof(scratch_loc));
3418
3419 /* First, calculate the set of virtual GRFs that need to be punted
3420 * to scratch due to having any array access on them, and where in
3421 * scratch.
3422 */
3423 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3424 if (inst->dst.file == GRF && inst->dst.reladdr &&
3425 scratch_loc[inst->dst.reg] == -1) {
3426 scratch_loc[inst->dst.reg] = c->last_scratch;
3427 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3428 }
3429
3430 for (int i = 0 ; i < 3; i++) {
3431 src_reg *src = &inst->src[i];
3432
3433 if (src->file == GRF && src->reladdr &&
3434 scratch_loc[src->reg] == -1) {
3435 scratch_loc[src->reg] = c->last_scratch;
3436 c->last_scratch += this->alloc.sizes[src->reg];
3437 }
3438 }
3439 }
3440
3441 /* Now, for anything that will be accessed through scratch, rewrite
3442 * it to load/store. Note that this is a _safe list walk, because
3443 * we may generate a new scratch_write instruction after the one
3444 * we're processing.
3445 */
3446 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3447 /* Set up the annotation tracking for new generated instructions. */
3448 base_ir = inst->ir;
3449 current_annotation = inst->annotation;
3450
3451 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3452 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3453 }
3454
3455 for (int i = 0 ; i < 3; i++) {
3456 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3457 continue;
3458
3459 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3460
3461 emit_scratch_read(block, inst, temp, inst->src[i],
3462 scratch_loc[inst->src[i].reg]);
3463
3464 inst->src[i].file = temp.file;
3465 inst->src[i].reg = temp.reg;
3466 inst->src[i].reg_offset = temp.reg_offset;
3467 inst->src[i].reladdr = NULL;
3468 }
3469 }
3470 }
3471
3472 /**
3473 * Emits an instruction before @inst to load the value named by @orig_src
3474 * from the pull constant buffer (surface) at @base_offset to @temp.
3475 */
3476 void
3477 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3478 dst_reg temp, src_reg orig_src,
3479 int base_offset)
3480 {
3481 int reg_offset = base_offset + orig_src.reg_offset;
3482 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3483 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3484 reg_offset);
3485 vec4_instruction *load;
3486
3487 if (brw->gen >= 7) {
3488 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3489
3490 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3491 * Reserve space for the register.
3492 */
3493 if (brw->gen >= 9) {
3494 grf_offset.reg_offset++;
3495 alloc.sizes[grf_offset.reg] = 2;
3496 }
3497
3498 grf_offset.type = offset.type;
3499 emit_before(block, inst, MOV(grf_offset, offset));
3500
3501 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3502 temp, index, src_reg(grf_offset));
3503 load->mlen = 1;
3504 } else {
3505 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3506 temp, index, offset);
3507 load->base_mrf = 14;
3508 load->mlen = 1;
3509 }
3510 emit_before(block, inst, load);
3511 }
3512
3513 /**
3514 * Implements array access of uniforms by inserting a
3515 * PULL_CONSTANT_LOAD instruction.
3516 *
3517 * Unlike temporary GRF array access (where we don't support it due to
3518 * the difficulty of doing relative addressing on instruction
3519 * destinations), we could potentially do array access of uniforms
3520 * that were loaded in GRF space as push constants. In real-world
3521 * usage we've seen, though, the arrays being used are always larger
3522 * than we could load as push constants, so just always move all
3523 * uniform array access out to a pull constant buffer.
3524 */
3525 void
3526 vec4_visitor::move_uniform_array_access_to_pull_constants()
3527 {
3528 int pull_constant_loc[this->uniforms];
3529 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3530 bool nested_reladdr;
3531
3532 /* Walk through and find array access of uniforms. Put a copy of that
3533 * uniform in the pull constant buffer.
3534 *
3535 * Note that we don't move constant-indexed accesses to arrays. No
3536 * testing has been done of the performance impact of this choice.
3537 */
3538 do {
3539 nested_reladdr = false;
3540
3541 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3542 for (int i = 0 ; i < 3; i++) {
3543 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3544 continue;
3545
3546 int uniform = inst->src[i].reg;
3547
3548 if (inst->src[i].reladdr->reladdr)
3549 nested_reladdr = true; /* will need another pass */
3550
3551 /* If this array isn't already present in the pull constant buffer,
3552 * add it.
3553 */
3554 if (pull_constant_loc[uniform] == -1) {
3555 const gl_constant_value **values =
3556 &stage_prog_data->param[uniform * 4];
3557
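/* pull_param is filled in scalar (dword) units while pull_constant_loc
 * tracks vec4 slots, hence the division by 4.
 */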
3558 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3559
3560 assert(uniform < uniform_array_size);
3561 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3562 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3563 = values[j];
3564 }
3565 }
3566
3567 /* Set up the annotation tracking for new generated instructions. */
3568 base_ir = inst->ir;
3569 current_annotation = inst->annotation;
3570
3571 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3572
3573 emit_pull_constant_load(block, inst, temp, inst->src[i],
3574 pull_constant_loc[uniform]);
3575
3576 inst->src[i].file = temp.file;
3577 inst->src[i].reg = temp.reg;
3578 inst->src[i].reg_offset = temp.reg_offset;
3579 inst->src[i].reladdr = NULL;
3580 }
3581 }
3582 } while (nested_reladdr);
3583
3584 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3585 * no need to track them as larger-than-vec4 objects. This will be
3586 * relied on in cutting out unused uniform vectors from push
3587 * constants.
3588 */
3589 split_uniform_registers();
3590 }
3591
3592 void
3593 vec4_visitor::resolve_ud_negate(src_reg *reg)
3594 {
3595 if (reg->type != BRW_REGISTER_TYPE_UD ||
3596 !reg->negate)
3597 return;
3598
3599 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3600 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3601 *reg = temp;
3602 }
3603
3604 /**
3605 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3606 *
3607 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3608 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3609 */
3610 void
3611 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3612 {
3613 assert(brw->gen <= 5);
3614
3615 if (!rvalue->type->is_boolean())
3616 return;
3617
3618 src_reg and_result = src_reg(this, rvalue->type);
3619 src_reg neg_result = src_reg(this, rvalue->type);
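/* AND with 1 keeps only the defined LSB; negating the resulting 0 or 1 then
 * yields the required 0 or ~0.
 */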
3620 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3621 emit(MOV(dst_reg(neg_result), negate(and_result)));
3622 *reg = neg_result;
3623 }
3624
3625 vec4_visitor::vec4_visitor(struct brw_context *brw,
3626 struct brw_vec4_compile *c,
3627 struct gl_program *prog,
3628 const struct brw_vue_prog_key *key,
3629 struct brw_vue_prog_data *prog_data,
3630 struct gl_shader_program *shader_prog,
3631 gl_shader_stage stage,
3632 void *mem_ctx,
3633 bool no_spills,
3634 shader_time_shader_type st_base,
3635 shader_time_shader_type st_written,
3636 shader_time_shader_type st_reset)
3637 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3638 c(c),
3639 key(key),
3640 prog_data(prog_data),
3641 sanity_param_count(0),
3642 fail_msg(NULL),
3643 first_non_payload_grf(0),
3644 need_all_constants_in_pull_buffer(false),
3645 no_spills(no_spills),
3646 st_base(st_base),
3647 st_written(st_written),
3648 st_reset(st_reset)
3649 {
3650 this->mem_ctx = mem_ctx;
3651 this->failed = false;
3652
3653 this->base_ir = NULL;
3654 this->current_annotation = NULL;
3655 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3656
3657 this->variable_ht = hash_table_ctor(0,
3658 hash_table_pointer_hash,
3659 hash_table_pointer_compare);
3660
3661 this->virtual_grf_start = NULL;
3662 this->virtual_grf_end = NULL;
3663 this->live_intervals = NULL;
3664
3665 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3666
3667 this->uniforms = 0;
3668
3669 /* Initialize uniform_array_size to at least 1 because the pre-gen6 VS
3670 * requires at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3671 */
3672 this->uniform_array_size = 1;
3673 if (prog_data) {
3674 this->uniform_array_size =
3675 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3676 }
3677
3678 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3679 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3680 }
3681
3682 vec4_visitor::~vec4_visitor()
3683 {
3684 hash_table_dtor(this->variable_ht);
3685 }
3686
3687
3688 void
3689 vec4_visitor::fail(const char *format, ...)
3690 {
3691 va_list va;
3692 char *msg;
3693
3694 if (failed)
3695 return;
3696
3697 failed = true;
3698
3699 va_start(va, format);
3700 msg = ralloc_vasprintf(mem_ctx, format, va);
3701 va_end(va);
3702 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3703
3704 this->fail_msg = msg;
3705
3706 if (debug_enabled) {
3707 fprintf(stderr, "%s", msg);
3708 }
3709 }
3710
3711 } /* namespace brw */