i965: Rename setup_vector_uniform_values to setup_vec4_uniform_value
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(const src_reg &src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
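/* Concretely: a three-source operand that is a uniform with, say, an .xyzw
 * swizzle is expanded by VEC4_OPCODE_UNPACK_UNIFORM into a fresh vec4
 * temporary and that temporary is returned instead; a single-component
 * swizzle like .xxxx passes through unchanged, and an immediate is always
 * expanded the same way.
 */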
314
315 src_reg
316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
317 {
318 if (!src.abs && !src.negate)
319 return src;
320
321 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
322 resolved.type = src.type;
323 emit(MOV(resolved, src));
324
325 return src_reg(resolved);
326 }
327
328 src_reg
329 vec4_visitor::fix_math_operand(const src_reg &src)
330 {
331 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
332 return src;
333
334 /* The gen6 math instruction ignores the source modifiers --
335 * swizzle, abs, negate, and at least some parts of the register
336 * region description.
337 *
338 * Rather than trying to enumerate all these cases, *always* expand the
339 * operand to a temp GRF for gen6.
340 *
341 * For gen7, keep the operand as-is, except if immediate, which gen7 still
342 * can't use.
343 */
344
345 if (devinfo->gen == 7 && src.file != IMM)
346 return src;
347
348 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
349 expanded.type = src.type;
350 emit(MOV(expanded, src));
351 return src_reg(expanded);
352 }
353
354 vec4_instruction *
355 vec4_visitor::emit_math(enum opcode opcode,
356 const dst_reg &dst,
357 const src_reg &src0, const src_reg &src1)
358 {
359 vec4_instruction *math =
360 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
361
362 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
363 /* MATH on Gen6 must be align1, so we can't do writemasks. */
364 math->dst = dst_reg(this, glsl_type::vec4_type);
365 math->dst.type = dst.type;
366 math = emit(MOV(dst, src_reg(math->dst)));
367 } else if (devinfo->gen < 6) {
368 math->base_mrf = 1;
369 math->mlen = src1.file == BAD_FILE ? 1 : 2;
370 }
371
372 return math;
373 }
374
375 void
376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
377 {
378 if (devinfo->gen < 7) {
379 unreachable("ir_unop_pack_half_2x16 should be lowered");
380 }
381
382 assert(dst.type == BRW_REGISTER_TYPE_UD);
383 assert(src0.type == BRW_REGISTER_TYPE_F);
384
385 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
386 *
387 * Because this instruction does not have a 16-bit floating-point type,
388 * the destination data type must be Word (W).
389 *
390 * The destination must be DWord-aligned and specify a horizontal stride
391 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
392 * each destination channel and the upper word is not modified.
393 *
394 * The above restriction implies that the f32to16 instruction must use
395 * align1 mode, because only in align1 mode is it possible to specify
396 * horizontal stride. We choose here to defy the hardware docs and emit
397 * align16 instructions.
398 *
399 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
400 * instructions. I was partially successful in that the code passed all
401 * tests. However, the code was dubiously correct and fragile, and the
402 * tests were not harsh enough to probe that frailty. Not trusting the
403 * code, I chose instead to remain in align16 mode in defiance of the hw
404 * docs).
405 *
406 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
407 * simulator, emitting a f32to16 in align16 mode with UD as destination
408 * data type is safe. The behavior differs from that specified in the PRM
409 * in that the upper word of each destination channel is cleared to 0.
410 */
411
412 dst_reg tmp_dst(this, glsl_type::uvec2_type);
413 src_reg tmp_src(tmp_dst);
414
415 #if 0
416 /* Verify the undocumented behavior on which the following instructions
417 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
418 * then the result of the bit-or instruction below will be incorrect.
419 *
420 * You should inspect the disasm output in order to verify that the MOV is
421 * not optimized away.
422 */
423 emit(MOV(tmp_dst, src_reg(0x12345678u)));
424 #endif
425
426 /* Give tmp the form below, where "." means untouched.
427 *
428 * w z y x w z y x
429 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
430 *
431     * The upper word of each write-channel must be 0 for the following
432     * bit-shift and bit-or instructions to work. Note that this relies on
433     * the undocumented hardware behavior mentioned above.
434 */
435 tmp_dst.writemask = WRITEMASK_XY;
436 emit(F32TO16(tmp_dst, src0));
437
438 /* Give the write-channels of dst the form:
439 * 0xhhhh0000
440 */
441 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
442 emit(SHL(dst, tmp_src, src_reg(16u)));
443
444 /* Finally, give the write-channels of dst the form of packHalf2x16's
445 * output:
446 * 0xhhhhllll
447 */
448 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
449 emit(OR(dst, src_reg(dst), tmp_src));
450 }
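/* Worked example (assuming IEEE half-float encodings): packing (1.0, -2.0)
 * puts 0x3c00 in tmp.x and 0xc000 in tmp.y, the SHL of tmp.yyyy gives
 * 0xc0000000, and the final OR with tmp.xxxx leaves 0xc0003c00 in every
 * enabled channel of dst.
 */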
451
452 void
453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
454 {
455 if (devinfo->gen < 7) {
456 unreachable("ir_unop_unpack_half_2x16 should be lowered");
457 }
458
459 assert(dst.type == BRW_REGISTER_TYPE_F);
460 assert(src0.type == BRW_REGISTER_TYPE_UD);
461
462 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
463 *
464 * Because this instruction does not have a 16-bit floating-point type,
465 * the source data type must be Word (W). The destination type must be
466 * F (Float).
467 *
468 * To use W as the source data type, we must adjust horizontal strides,
469 * which is only possible in align1 mode. All my [chadv] attempts at
470 * emitting align1 instructions for unpackHalf2x16 failed to pass the
471 * Piglit tests, so I gave up.
472 *
473 * I've verified that, on gen7 hardware and the simulator, it is safe to
474 * emit f16to32 in align16 mode with UD as source data type.
475 */
476
477 dst_reg tmp_dst(this, glsl_type::uvec2_type);
478 src_reg tmp_src(tmp_dst);
479
480 tmp_dst.writemask = WRITEMASK_X;
481 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
482
483 tmp_dst.writemask = WRITEMASK_Y;
484 emit(SHR(tmp_dst, src0, src_reg(16u)));
485
486 dst.writemask = WRITEMASK_XY;
487 emit(F16TO32(dst, tmp_src));
488 }
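/* E.g. unpacking 0xc0003c00u: the AND leaves 0x3c00 in tmp.x, the SHR leaves
 * 0xc000 in tmp.y, and F16TO32 then writes (1.0, -2.0) to dst.xy (again
 * assuming IEEE half-float encodings).
 */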
489
490 void
491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
492 {
493 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
494 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
495 * is not suitable to generate the shift values, but we can use the packed
496 * vector float and a type-converting MOV.
497 */
498 dst_reg shift(this, glsl_type::uvec4_type);
499 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
500
501 dst_reg shifted(this, glsl_type::uvec4_type);
502 src0.swizzle = BRW_SWIZZLE_XXXX;
503 emit(SHR(shifted, src0, src_reg(shift)));
504
505 shifted.type = BRW_REGISTER_TYPE_UB;
506 dst_reg f(this, glsl_type::vec4_type);
507 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
508
509 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
510 }
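/* The shift constant above is a packed vector-float (VF) immediate: with the
 * usual 8-bit restricted-float encoding, 0x00/0x60/0x70/0x78 decode to
 * 0.0/8.0/16.0/24.0, and the type-converting MOV turns those into the integer
 * shift counts <0, 8, 16, 24>. (The packed *integer* immediates only carry
 * 4-bit elements, so they cannot encode 16 or 24.) As a worked example,
 * src0 = 0xff804000u shifts down to bytes <0x00, 0x40, 0x80, 0xff>, which the
 * final MUL maps to roughly <0.0, 0.25, 0.5, 1.0>.
 */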
511
512 void
513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
514 {
515 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
516 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
517 * is not suitable to generate the shift values, but we can use the packed
518 * vector float and a type-converting MOV.
519 */
520 dst_reg shift(this, glsl_type::uvec4_type);
521 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
522
523 dst_reg shifted(this, glsl_type::uvec4_type);
524 src0.swizzle = BRW_SWIZZLE_XXXX;
525 emit(SHR(shifted, src0, src_reg(shift)));
526
527 shifted.type = BRW_REGISTER_TYPE_B;
528 dst_reg f(this, glsl_type::vec4_type);
529 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
533
534 dst_reg max(this, glsl_type::vec4_type);
535 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
536 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
537 }
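/* The final min/max pair is the clamp required by GLSL's unpackSnorm4x8():
 * the byte 0x80 decodes to -128/127 (about -1.008) and has to be clamped
 * back up to -1.0.
 */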
538
539 void
540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
541 {
542 dst_reg saturated(this, glsl_type::vec4_type);
543 vec4_instruction *inst = emit(MOV(saturated, src0));
544 inst->saturate = true;
545
546 dst_reg scaled(this, glsl_type::vec4_type);
547 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
548
549 dst_reg rounded(this, glsl_type::vec4_type);
550 emit(RNDE(rounded, src_reg(scaled)));
551
552 dst_reg u(this, glsl_type::uvec4_type);
553 emit(MOV(u, src_reg(rounded)));
554
555 src_reg bytes(u);
556 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
557 }
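/* E.g. packUnorm4x8(vec4(0.0, 0.5, 1.0, 2.0)): the 2.0 saturates to 1.0, the
 * scale gives <0, 127.5, 255, 255>, round-to-even gives <0, 128, 255, 255>,
 * and the bytes pack as 0xffff8000 with x in the least significant byte.
 */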
558
559 void
560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
561 {
562 dst_reg max(this, glsl_type::vec4_type);
563 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
564
565 dst_reg min(this, glsl_type::vec4_type);
566 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
567
568 dst_reg scaled(this, glsl_type::vec4_type);
569 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
570
571 dst_reg rounded(this, glsl_type::vec4_type);
572 emit(RNDE(rounded, src_reg(scaled)));
573
574 dst_reg i(this, glsl_type::ivec4_type);
575 emit(MOV(i, src_reg(rounded)));
576
577 src_reg bytes(i);
578 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
579 }
580
581 void
582 vec4_visitor::visit_instructions(const exec_list *list)
583 {
584 foreach_in_list(ir_instruction, ir, list) {
585 base_ir = ir;
586 ir->accept(this);
587 }
588 }
589
590 /**
591 * Returns the minimum number of vec4 elements needed to pack a type.
592 *
593 * For simple types, it will return 1 (a single vec4); for matrices, the
594 * number of columns; for array and struct, the sum of the vec4_size of
595 * each of its elements; and for sampler and atomic, zero.
596 *
597 * This method is useful to calculate how much register space is needed to
598 * store a particular type.
599 */
600 int
601 vec4_visitor::type_size(const struct glsl_type *type)
602 {
603 unsigned int i;
604 int size;
605
606 switch (type->base_type) {
607 case GLSL_TYPE_UINT:
608 case GLSL_TYPE_INT:
609 case GLSL_TYPE_FLOAT:
610 case GLSL_TYPE_BOOL:
611 if (type->is_matrix()) {
612 return type->matrix_columns;
613 } else {
614 /* Regardless of size of vector, it gets a vec4. This is bad
615 * packing for things like floats, but otherwise arrays become a
616 * mess. Hopefully a later pass over the code can pack scalars
617 * down if appropriate.
618 */
619 return 1;
620 }
621 case GLSL_TYPE_ARRAY:
622 assert(type->length > 0);
623 return type_size(type->fields.array) * type->length;
624 case GLSL_TYPE_STRUCT:
625 size = 0;
626 for (i = 0; i < type->length; i++) {
627 size += type_size(type->fields.structure[i].type);
628 }
629 return size;
630 case GLSL_TYPE_SUBROUTINE:
631 return 1;
632
633 case GLSL_TYPE_SAMPLER:
634 /* Samplers take up no register space, since they're baked in at
635 * link time.
636 */
637 return 0;
638 case GLSL_TYPE_ATOMIC_UINT:
639 return 0;
640 case GLSL_TYPE_IMAGE:
641 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
642 case GLSL_TYPE_VOID:
643 case GLSL_TYPE_DOUBLE:
644 case GLSL_TYPE_ERROR:
645 case GLSL_TYPE_INTERFACE:
646 unreachable("not reached");
647 }
648
649 return 0;
650 }
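/* A few examples of the counting above: a float or a vec3 occupies one vec4
 * slot, a mat3 occupies 3 (one per column), float[4] occupies 4, and
 * struct { vec3 a; float b; } occupies 2 -- scalars are not packed together.
 */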
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
653 {
654 init();
655
656 this->file = GRF;
657 this->reg = v->alloc.allocate(v->type_size(type));
658
659 if (type->is_array() || type->is_record()) {
660 this->swizzle = BRW_SWIZZLE_NOOP;
661 } else {
662 this->swizzle = brw_swizzle_for_size(type->vector_elements);
663 }
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
669 {
670 assert(size > 0);
671
672 init();
673
674 this->file = GRF;
675 this->reg = v->alloc.allocate(v->type_size(type) * size);
676
677 this->swizzle = BRW_SWIZZLE_NOOP;
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
683 {
684 init();
685
686 this->file = GRF;
687 this->reg = v->alloc.allocate(v->type_size(type));
688
689 if (type->is_array() || type->is_record()) {
690 this->writemask = WRITEMASK_XYZW;
691 } else {
692 this->writemask = (1 << type->vector_elements) - 1;
693 }
694
695 this->type = brw_type_for_base_type(type);
696 }
697
698 void
699 vec4_visitor::setup_vec4_uniform_value(const gl_constant_value *values,
700 unsigned n)
701 {
702 static const gl_constant_value zero = { 0 };
703
704 for (unsigned i = 0; i < n; ++i)
705 stage_prog_data->param[4 * uniforms + i] = &values[i];
706
707 for (unsigned i = n; i < 4; ++i)
708 stage_prog_data->param[4 * uniforms + i] = &zero;
709
710 uniform_vector_size[uniforms++] = n;
711 }
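/* For example, registering a vec3 as uniform slot u points param[4*u+0..2] at
 * its three components, points the padding slot param[4*u+3] at the shared
 * zero constant, and records uniform_vector_size[u] = 3.
 */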
712
713 /* Our support for uniforms is piggy-backed on the struct
714 * gl_fragment_program, because that's where the values actually
715 * get stored, rather than in some global gl_shader_program uniform
716 * store.
717 */
718 void
719 vec4_visitor::setup_uniform_values(ir_variable *ir)
720 {
721 int namelen = strlen(ir->name);
722
723 /* The data for our (non-builtin) uniforms is stored in a series of
724 * gl_uniform_driver_storage structs for each subcomponent that
725 * glGetUniformLocation() could name. We know it's been set up in the same
726 * order we'd walk the type, so walk the list of storage and find anything
727 * with our name, or the prefix of a component that starts with our name.
728 */
729 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
730 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
731
732 if (storage->builtin)
733 continue;
734
735 if (strncmp(ir->name, storage->name, namelen) != 0 ||
736 (storage->name[namelen] != 0 &&
737 storage->name[namelen] != '.' &&
738 storage->name[namelen] != '[')) {
739 continue;
740 }
741
742 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
743 storage->type->matrix_columns);
744 const unsigned vector_size = storage->type->vector_elements;
745
746 for (unsigned s = 0; s < vector_count; s++)
747 setup_vec4_uniform_value(&storage->storage[s * vector_size],
748 vector_size);
749 }
750 }
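/* For example, a uniform declared as "mat3 m[2]" has array_elements == 2 and
 * matrix_columns == 3, so the loop above registers six vec4 slots, each with
 * a vector size of 3 (one slot per column of each array element).
 */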
751
752 void
753 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
754 {
755 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
756 assert(this->uniforms < uniform_array_size);
757 this->uniform_vector_size[this->uniforms] = 4;
758 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
759 this->userplane[i].type = BRW_REGISTER_TYPE_F;
760 for (int j = 0; j < 4; ++j) {
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 (gl_constant_value *) &clip_planes[i][j];
763 }
764 ++this->uniforms;
765 }
766 }
767
768 /* Our support for builtin uniforms is even scarier than non-builtin.
769 * It sits on top of the PROG_STATE_VAR parameters that are
770 * automatically updated from GL context state.
771 */
772 void
773 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
774 {
775 const ir_state_slot *const slots = ir->get_state_slots();
776 assert(slots != NULL);
777
778 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
779       /* This state reference has already been set up by ir_to_mesa,
780 * but we'll get the same index back here. We can reference
781 * ParameterValues directly, since unlike brw_fs.cpp, we never
782 * add new state references during compile.
783 */
784 int index = _mesa_add_state_reference(this->prog->Parameters,
785 (gl_state_index *)slots[i].tokens);
786 gl_constant_value *values =
787 &this->prog->Parameters->ParameterValues[index][0];
788
789 assert(this->uniforms < uniform_array_size);
790
791 for (unsigned j = 0; j < 4; j++)
792 stage_prog_data->param[this->uniforms * 4 + j] =
793 &values[GET_SWZ(slots[i].swizzle, j)];
794
795 this->uniform_vector_size[this->uniforms] =
796 (ir->type->is_scalar() || ir->type->is_vector() ||
797 ir->type->is_matrix() ? ir->type->vector_elements : 4);
798
799 this->uniforms++;
800 }
801 }
802
803 dst_reg *
804 vec4_visitor::variable_storage(ir_variable *var)
805 {
806 return (dst_reg *)hash_table_find(this->variable_ht, var);
807 }
808
809 void
810 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
811 enum brw_predicate *predicate)
812 {
813 ir_expression *expr = ir->as_expression();
814
815 *predicate = BRW_PREDICATE_NORMAL;
816
817 if (expr && expr->operation != ir_binop_ubo_load) {
818 src_reg op[3];
819 vec4_instruction *inst;
820
821 assert(expr->get_num_operands() <= 3);
822 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
823 expr->operands[i]->accept(this);
824 op[i] = this->result;
825
826 resolve_ud_negate(&op[i]);
827 }
828
829 switch (expr->operation) {
830 case ir_unop_logic_not:
831 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
832 inst->conditional_mod = BRW_CONDITIONAL_Z;
833 break;
834
835 case ir_binop_logic_xor:
836 if (devinfo->gen <= 5) {
837 src_reg temp = src_reg(this, ir->type);
838 emit(XOR(dst_reg(temp), op[0], op[1]));
839 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
840 } else {
841 inst = emit(XOR(dst_null_d(), op[0], op[1]));
842 }
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 break;
845
846 case ir_binop_logic_or:
847 if (devinfo->gen <= 5) {
848 src_reg temp = src_reg(this, ir->type);
849 emit(OR(dst_reg(temp), op[0], op[1]));
850 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
851 } else {
852 inst = emit(OR(dst_null_d(), op[0], op[1]));
853 }
854 inst->conditional_mod = BRW_CONDITIONAL_NZ;
855 break;
856
857 case ir_binop_logic_and:
858 if (devinfo->gen <= 5) {
859 src_reg temp = src_reg(this, ir->type);
860 emit(AND(dst_reg(temp), op[0], op[1]));
861 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
862 } else {
863 inst = emit(AND(dst_null_d(), op[0], op[1]));
864 }
865 inst->conditional_mod = BRW_CONDITIONAL_NZ;
866 break;
867
868 case ir_unop_f2b:
869 if (devinfo->gen >= 6) {
870 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
871 } else {
872 inst = emit(MOV(dst_null_f(), op[0]));
873 inst->conditional_mod = BRW_CONDITIONAL_NZ;
874 }
875 break;
876
877 case ir_unop_i2b:
878 if (devinfo->gen >= 6) {
879 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
880 } else {
881 inst = emit(MOV(dst_null_d(), op[0]));
882 inst->conditional_mod = BRW_CONDITIONAL_NZ;
883 }
884 break;
885
886 case ir_binop_all_equal:
887 if (devinfo->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
892 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
893 break;
894
895 case ir_binop_any_nequal:
896 if (devinfo->gen <= 5) {
897 resolve_bool_comparison(expr->operands[0], &op[0]);
898 resolve_bool_comparison(expr->operands[1], &op[1]);
899 }
900 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
901 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
902 break;
903
904 case ir_unop_any:
905 if (devinfo->gen <= 5) {
906 resolve_bool_comparison(expr->operands[0], &op[0]);
907 }
908 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
909 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
910 break;
911
912 case ir_binop_greater:
913 case ir_binop_gequal:
914 case ir_binop_less:
915 case ir_binop_lequal:
916 case ir_binop_equal:
917 case ir_binop_nequal:
918 if (devinfo->gen <= 5) {
919 resolve_bool_comparison(expr->operands[0], &op[0]);
920 resolve_bool_comparison(expr->operands[1], &op[1]);
921 }
922 emit(CMP(dst_null_d(), op[0], op[1],
923 brw_conditional_for_comparison(expr->operation)));
924 break;
925
926 case ir_triop_csel: {
927 /* Expand the boolean condition into the flag register. */
928 inst = emit(MOV(dst_null_d(), op[0]));
929 inst->conditional_mod = BRW_CONDITIONAL_NZ;
930
931 /* Select which boolean to return. */
932 dst_reg temp(this, expr->operands[1]->type);
933 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
934 inst->predicate = BRW_PREDICATE_NORMAL;
935
936 /* Expand the result to a condition code. */
937 inst = emit(MOV(dst_null_d(), src_reg(temp)));
938 inst->conditional_mod = BRW_CONDITIONAL_NZ;
939 break;
940 }
941
942 default:
943 unreachable("not reached");
944 }
945 return;
946 }
947
948 ir->accept(this);
949
950 resolve_ud_negate(&this->result);
951
952 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
953 inst->conditional_mod = BRW_CONDITIONAL_NZ;
954 }
955
956 /**
957 * Emit a gen6 IF statement with the comparison folded into the IF
958 * instruction.
959 */
960 void
961 vec4_visitor::emit_if_gen6(ir_if *ir)
962 {
963 ir_expression *expr = ir->condition->as_expression();
964
965 if (expr && expr->operation != ir_binop_ubo_load) {
966 src_reg op[3];
967 dst_reg temp;
968
969 assert(expr->get_num_operands() <= 3);
970 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
971 expr->operands[i]->accept(this);
972 op[i] = this->result;
973 }
974
975 switch (expr->operation) {
976 case ir_unop_logic_not:
977 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
978 return;
979
980 case ir_binop_logic_xor:
981 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
982 return;
983
984 case ir_binop_logic_or:
985 temp = dst_reg(this, glsl_type::bool_type);
986 emit(OR(temp, op[0], op[1]));
987 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_binop_logic_and:
991 temp = dst_reg(this, glsl_type::bool_type);
992 emit(AND(temp, op[0], op[1]));
993 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
994 return;
995
996 case ir_unop_f2b:
997 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
998 return;
999
1000 case ir_unop_i2b:
1001 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1002 return;
1003
1004 case ir_binop_greater:
1005 case ir_binop_gequal:
1006 case ir_binop_less:
1007 case ir_binop_lequal:
1008 case ir_binop_equal:
1009 case ir_binop_nequal:
1010 emit(IF(op[0], op[1],
1011 brw_conditional_for_comparison(expr->operation)));
1012 return;
1013
1014 case ir_binop_all_equal:
1015 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1016 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1017 return;
1018
1019 case ir_binop_any_nequal:
1020 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1021 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1022 return;
1023
1024 case ir_unop_any:
1025 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1026 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1027 return;
1028
1029 case ir_triop_csel: {
1030 /* Expand the boolean condition into the flag register. */
1031 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1032 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1033
1034 /* Select which boolean to return. */
1035 dst_reg temp(this, expr->operands[1]->type);
1036 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1037 inst->predicate = BRW_PREDICATE_NORMAL;
1038
1039 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1040 return;
1041 }
1042
1043 default:
1044 unreachable("not reached");
1045 }
1046 return;
1047 }
1048
1049 ir->condition->accept(this);
1050
1051 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1052 }
1053
1054 void
1055 vec4_visitor::visit(ir_variable *ir)
1056 {
1057 dst_reg *reg = NULL;
1058
1059 if (variable_storage(ir))
1060 return;
1061
1062 switch (ir->data.mode) {
1063 case ir_var_shader_in:
1064 assert(ir->data.location != -1);
1065 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1066 break;
1067
1068 case ir_var_shader_out:
1069 assert(ir->data.location != -1);
1070 reg = new(mem_ctx) dst_reg(this, ir->type);
1071
1072 for (int i = 0; i < type_size(ir->type); i++) {
1073 output_reg[ir->data.location + i] = *reg;
1074 output_reg[ir->data.location + i].reg_offset = i;
1075 output_reg_annotation[ir->data.location + i] = ir->name;
1076 }
1077 break;
1078
1079 case ir_var_auto:
1080 case ir_var_temporary:
1081 reg = new(mem_ctx) dst_reg(this, ir->type);
1082 break;
1083
1084 case ir_var_uniform:
1085 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1086
1087 /* Thanks to the lower_ubo_reference pass, we will see only
1088 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1089 * variables, so no need for them to be in variable_ht.
1090 *
1091 * Some uniforms, such as samplers and atomic counters, have no actual
1092 * storage, so we should ignore them.
1093 */
1094 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1095 return;
1096
1097 /* Track how big the whole uniform variable is, in case we need to put a
1098 * copy of its data into pull constants for array access.
1099 */
1100 assert(this->uniforms < uniform_array_size);
1101 this->uniform_size[this->uniforms] = type_size(ir->type);
1102
1103 if (!strncmp(ir->name, "gl_", 3)) {
1104 setup_builtin_uniform_values(ir);
1105 } else {
1106 setup_uniform_values(ir);
1107 }
1108 break;
1109
1110 case ir_var_system_value:
1111 reg = make_reg_for_system_value(ir->data.location, ir->type);
1112 break;
1113
1114 default:
1115 unreachable("not reached");
1116 }
1117
1118 reg->type = brw_type_for_base_type(ir->type);
1119 hash_table_insert(this->variable_ht, reg, ir);
1120 }
1121
1122 void
1123 vec4_visitor::visit(ir_loop *ir)
1124 {
1125 /* We don't want debugging output to print the whole body of the
1126 * loop as the annotation.
1127 */
1128 this->base_ir = NULL;
1129
1130 emit(BRW_OPCODE_DO);
1131
1132 visit_instructions(&ir->body_instructions);
1133
1134 emit(BRW_OPCODE_WHILE);
1135 }
1136
1137 void
1138 vec4_visitor::visit(ir_loop_jump *ir)
1139 {
1140 switch (ir->mode) {
1141 case ir_loop_jump::jump_break:
1142 emit(BRW_OPCODE_BREAK);
1143 break;
1144 case ir_loop_jump::jump_continue:
1145 emit(BRW_OPCODE_CONTINUE);
1146 break;
1147 }
1148 }
1149
1150
1151 void
1152 vec4_visitor::visit(ir_function_signature *)
1153 {
1154 unreachable("not reached");
1155 }
1156
1157 void
1158 vec4_visitor::visit(ir_function *ir)
1159 {
1160 /* Ignore function bodies other than main() -- we shouldn't see calls to
1161 * them since they should all be inlined.
1162 */
1163 if (strcmp(ir->name, "main") == 0) {
1164 const ir_function_signature *sig;
1165 exec_list empty;
1166
1167 sig = ir->matching_signature(NULL, &empty, false);
1168
1169 assert(sig);
1170
1171 visit_instructions(&sig->body);
1172 }
1173 }
1174
1175 bool
1176 vec4_visitor::try_emit_mad(ir_expression *ir)
1177 {
1178 /* 3-src instructions were introduced in gen6. */
1179 if (devinfo->gen < 6)
1180 return false;
1181
1182 /* MAD can only handle floating-point data. */
1183 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1184 return false;
1185
1186 ir_rvalue *nonmul;
1187 ir_expression *mul;
1188 bool mul_negate, mul_abs;
1189
1190 for (int i = 0; i < 2; i++) {
1191 mul_negate = false;
1192 mul_abs = false;
1193
1194 mul = ir->operands[i]->as_expression();
1195 nonmul = ir->operands[1 - i];
1196
1197 if (mul && mul->operation == ir_unop_abs) {
1198 mul = mul->operands[0]->as_expression();
1199 mul_abs = true;
1200 } else if (mul && mul->operation == ir_unop_neg) {
1201 mul = mul->operands[0]->as_expression();
1202 mul_negate = true;
1203 }
1204
1205 if (mul && mul->operation == ir_binop_mul)
1206 break;
1207 }
1208
1209 if (!mul || mul->operation != ir_binop_mul)
1210 return false;
1211
1212 nonmul->accept(this);
1213 src_reg src0 = fix_3src_operand(this->result);
1214
1215 mul->operands[0]->accept(this);
1216 src_reg src1 = fix_3src_operand(this->result);
1217 src1.negate ^= mul_negate;
1218 src1.abs = mul_abs;
1219 if (mul_abs)
1220 src1.negate = false;
1221
1222 mul->operands[1]->accept(this);
1223 src_reg src2 = fix_3src_operand(this->result);
1224 src2.abs = mul_abs;
1225 if (mul_abs)
1226 src2.negate = false;
1227
1228 this->result = src_reg(this, ir->type);
1229 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1230
1231 return true;
1232 }
1233
1234 bool
1235 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1236 {
1237 /* This optimization relies on CMP setting the destination to 0 when
1238 * false. Early hardware only sets the least significant bit, and
1239 * leaves the other bits undefined. So we can't use it.
1240 */
1241 if (devinfo->gen < 6)
1242 return false;
1243
1244 ir_expression *const cmp = ir->operands[0]->as_expression();
1245
1246 if (cmp == NULL)
1247 return false;
1248
1249 switch (cmp->operation) {
1250 case ir_binop_less:
1251 case ir_binop_greater:
1252 case ir_binop_lequal:
1253 case ir_binop_gequal:
1254 case ir_binop_equal:
1255 case ir_binop_nequal:
1256 break;
1257
1258 default:
1259 return false;
1260 }
1261
1262 cmp->operands[0]->accept(this);
1263 const src_reg cmp_src0 = this->result;
1264
1265 cmp->operands[1]->accept(this);
1266 const src_reg cmp_src1 = this->result;
1267
1268 this->result = src_reg(this, ir->type);
1269
1270 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1271 brw_conditional_for_comparison(cmp->operation)));
1272
1273 /* If the comparison is false, this->result will just happen to be zero.
1274 */
1275 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1276 this->result, src_reg(1.0f));
1277 inst->predicate = BRW_PREDICATE_NORMAL;
1278 inst->predicate_inverse = true;
1279
1280 return true;
1281 }
1282
1283 vec4_instruction *
1284 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1285 src_reg src0, src_reg src1)
1286 {
1287 vec4_instruction *inst;
1288
1289 if (devinfo->gen >= 6) {
1290 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1291 inst->conditional_mod = conditionalmod;
1292 } else {
1293 emit(CMP(dst, src0, src1, conditionalmod));
1294
1295 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1296 inst->predicate = BRW_PREDICATE_NORMAL;
1297 }
1298
1299 return inst;
1300 }
1301
1302 vec4_instruction *
1303 vec4_visitor::emit_lrp(const dst_reg &dst,
1304 const src_reg &x, const src_reg &y, const src_reg &a)
1305 {
1306 if (devinfo->gen >= 6) {
1307 /* Note that the instruction's argument order is reversed from GLSL
1308 * and the IR.
1309 */
1310 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1311 fix_3src_operand(x)));
1312 } else {
1313 /* Earlier generations don't support three source operations, so we
1314 * need to emit x*(1-a) + y*a.
1315 */
1316 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1317 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1318 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1319 y_times_a.writemask = dst.writemask;
1320 one_minus_a.writemask = dst.writemask;
1321 x_times_one_minus_a.writemask = dst.writemask;
1322
1323 emit(MUL(y_times_a, y, a));
1324 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1325 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1326 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1327 }
1328 }
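/* On the gen6+ path the operands are passed as (a, y, x) because LRP computes
 * src0 * src1 + (1 - src0) * src2; with src0 = a that is a*y + (1-a)*x, which
 * matches GLSL's mix(x, y, a). The pre-gen6 path spells out the same
 * arithmetic explicitly with MUL/ADD.
 */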
1329
1330 /**
1331 * Emits the instructions needed to perform a pull constant load. before_block
1332  * and before_inst can be NULL, in which case the instruction will be appended
1333 * to the end of the instruction list.
1334 */
1335 void
1336 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1337 src_reg surf_index,
1338 src_reg offset_reg,
1339 bblock_t *before_block,
1340 vec4_instruction *before_inst)
1341 {
1342 assert((before_inst == NULL && before_block == NULL) ||
1343 (before_inst && before_block));
1344
1345 vec4_instruction *pull;
1346
1347 if (devinfo->gen >= 9) {
1348 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1349 src_reg header(this, glsl_type::uvec4_type, 2);
1350
1351 pull = new(mem_ctx)
1352 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1353 dst_reg(header));
1354
1355 if (before_inst)
1356 emit_before(before_block, before_inst, pull);
1357 else
1358 emit(pull);
1359
1360 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1361 offset_reg.type);
1362 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1363
1364 if (before_inst)
1365 emit_before(before_block, before_inst, pull);
1366 else
1367 emit(pull);
1368
1369 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1370 dst,
1371 surf_index,
1372 header);
1373 pull->mlen = 2;
1374 pull->header_size = 1;
1375 } else if (devinfo->gen >= 7) {
1376 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1377
1378 grf_offset.type = offset_reg.type;
1379
1380 pull = MOV(grf_offset, offset_reg);
1381
1382 if (before_inst)
1383 emit_before(before_block, before_inst, pull);
1384 else
1385 emit(pull);
1386
1387 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1388 dst,
1389 surf_index,
1390 src_reg(grf_offset));
1391 pull->mlen = 1;
1392 } else {
1393 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1394 dst,
1395 surf_index,
1396 offset_reg);
1397 pull->base_mrf = 14;
1398 pull->mlen = 1;
1399 }
1400
1401 if (before_inst)
1402 emit_before(before_block, before_inst, pull);
1403 else
1404 emit(pull);
1405 }
1406
1407 src_reg
1408 vec4_visitor::emit_uniformize(const src_reg &src)
1409 {
1410 const src_reg chan_index(this, glsl_type::uint_type);
1411 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1412 src.type);
1413
1414 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1415 ->force_writemask_all = true;
1416 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1417 ->force_writemask_all = true;
1418
1419 return src_reg(dst);
1420 }
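/* This FIND_LIVE_CHANNEL + BROADCAST pair is used below for non-constant UBO
 * block indices: the pull-constant send needs a single binding-table index,
 * so the index computed per-channel is first copied from one live channel to
 * all of them.
 */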
1421
1422 void
1423 vec4_visitor::visit(ir_expression *ir)
1424 {
1425 unsigned int operand;
1426 src_reg op[ARRAY_SIZE(ir->operands)];
1427 vec4_instruction *inst;
1428
1429 if (ir->operation == ir_binop_add) {
1430 if (try_emit_mad(ir))
1431 return;
1432 }
1433
1434 if (ir->operation == ir_unop_b2f) {
1435 if (try_emit_b2f_of_compare(ir))
1436 return;
1437 }
1438
1439 /* Storage for our result. Ideally for an assignment we'd be using
1440 * the actual storage for the result here, instead.
1441 */
1442 dst_reg result_dst(this, ir->type);
1443 src_reg result_src(result_dst);
1444
1445 if (ir->operation == ir_triop_csel) {
1446 ir->operands[1]->accept(this);
1447 op[1] = this->result;
1448 ir->operands[2]->accept(this);
1449 op[2] = this->result;
1450
1451 enum brw_predicate predicate;
1452 emit_bool_to_cond_code(ir->operands[0], &predicate);
1453 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1454 inst->predicate = predicate;
1455 this->result = result_src;
1456 return;
1457 }
1458
1459 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1460 this->result.file = BAD_FILE;
1461 ir->operands[operand]->accept(this);
1462 if (this->result.file == BAD_FILE) {
1463 fprintf(stderr, "Failed to get tree for expression operand:\n");
1464 ir->operands[operand]->fprint(stderr);
1465 exit(1);
1466 }
1467 op[operand] = this->result;
1468
1469 /* Matrix expression operands should have been broken down to vector
1470 * operations already.
1471 */
1472 assert(!ir->operands[operand]->type->is_matrix());
1473 }
1474
1475 /* If nothing special happens, this is the result. */
1476 this->result = result_src;
1477
1478 switch (ir->operation) {
1479 case ir_unop_logic_not:
1480 emit(NOT(result_dst, op[0]));
1481 break;
1482 case ir_unop_neg:
1483 op[0].negate = !op[0].negate;
1484 emit(MOV(result_dst, op[0]));
1485 break;
1486 case ir_unop_abs:
1487 op[0].abs = true;
1488 op[0].negate = false;
1489 emit(MOV(result_dst, op[0]));
1490 break;
1491
1492 case ir_unop_sign:
1493 if (ir->type->is_float()) {
1494 /* AND(val, 0x80000000) gives the sign bit.
1495 *
1496 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1497 * zero.
1498 */
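         /* Worked example: for op[0] = -3.5f (0xc0600000) the AND keeps only
          * the sign bit, 0x80000000, and the predicated OR with 0x3f800000
          * (1.0f) then yields 0xbf800000, i.e. -1.0f.
          */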
1499 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1500
1501 op[0].type = BRW_REGISTER_TYPE_UD;
1502 result_dst.type = BRW_REGISTER_TYPE_UD;
1503 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1504
1505 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1506 inst->predicate = BRW_PREDICATE_NORMAL;
1507
1508 this->result.type = BRW_REGISTER_TYPE_F;
1509 } else {
1510 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1511 * -> non-negative val generates 0x00000000.
1512 * Predicated OR sets 1 if val is positive.
1513 */
1514 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1515
1516 emit(ASR(result_dst, op[0], src_reg(31)));
1517
1518 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1519 inst->predicate = BRW_PREDICATE_NORMAL;
1520 }
1521 break;
1522
1523 case ir_unop_rcp:
1524 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1525 break;
1526
1527 case ir_unop_exp2:
1528 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1529 break;
1530 case ir_unop_log2:
1531 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1532 break;
1533 case ir_unop_exp:
1534 case ir_unop_log:
1535 unreachable("not reached: should be handled by ir_explog_to_explog2");
1536 case ir_unop_sin:
1537 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1538 break;
1539 case ir_unop_cos:
1540 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1541 break;
1542
1543 case ir_unop_dFdx:
1544 case ir_unop_dFdx_coarse:
1545 case ir_unop_dFdx_fine:
1546 case ir_unop_dFdy:
1547 case ir_unop_dFdy_coarse:
1548 case ir_unop_dFdy_fine:
1549 unreachable("derivatives not valid in vertex shader");
1550
1551 case ir_unop_bitfield_reverse:
1552 emit(BFREV(result_dst, op[0]));
1553 break;
1554 case ir_unop_bit_count:
1555 emit(CBIT(result_dst, op[0]));
1556 break;
1557 case ir_unop_find_msb: {
1558 src_reg temp = src_reg(this, glsl_type::uint_type);
1559
1560 inst = emit(FBH(dst_reg(temp), op[0]));
1561 inst->dst.writemask = WRITEMASK_XYZW;
1562
1563 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1564 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1565 * subtract the result from 31 to convert the MSB count into an LSB count.
1566 */
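      /* For example, FBH(0x40000000) returns 1 (the number of leading zeros),
       * and 31 - 1 = 30 is exactly what findMSB() has to return.
       */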
1567
1568 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1569 temp.swizzle = BRW_SWIZZLE_NOOP;
1570 emit(MOV(result_dst, temp));
1571
1572 src_reg src_tmp = src_reg(result_dst);
1573 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1574
1575 src_tmp.negate = true;
1576 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1577 inst->predicate = BRW_PREDICATE_NORMAL;
1578 break;
1579 }
1580 case ir_unop_find_lsb:
1581 emit(FBL(result_dst, op[0]));
1582 break;
1583 case ir_unop_saturate:
1584 inst = emit(MOV(result_dst, op[0]));
1585 inst->saturate = true;
1586 break;
1587
1588 case ir_unop_noise:
1589 unreachable("not reached: should be handled by lower_noise");
1590
1591 case ir_unop_subroutine_to_int:
1592 emit(MOV(result_dst, op[0]));
1593 break;
1594
1595 case ir_binop_add:
1596 emit(ADD(result_dst, op[0], op[1]));
1597 break;
1598 case ir_binop_sub:
1599 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1600
1601 case ir_binop_mul:
1602 if (devinfo->gen < 8 && ir->type->is_integer()) {
1603 /* For integer multiplication, the MUL uses the low 16 bits of one of
1604 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1605          * then adds in the contribution of the upper 16 bits of that
1606 * operand. If we can determine that one of the args is in the low
1607 * 16 bits, though, we can just emit a single MUL.
1608 */
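         /* A small constant operand (e.g. an array stride below 65536) takes
          * the single-MUL path below and skips the MACH/accumulator round
          * trip entirely.
          */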
1609 if (ir->operands[0]->is_uint16_constant()) {
1610 if (devinfo->gen < 7)
1611 emit(MUL(result_dst, op[0], op[1]));
1612 else
1613 emit(MUL(result_dst, op[1], op[0]));
1614 } else if (ir->operands[1]->is_uint16_constant()) {
1615 if (devinfo->gen < 7)
1616 emit(MUL(result_dst, op[1], op[0]));
1617 else
1618 emit(MUL(result_dst, op[0], op[1]));
1619 } else {
1620 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1621
1622 emit(MUL(acc, op[0], op[1]));
1623 emit(MACH(dst_null_d(), op[0], op[1]));
1624 emit(MOV(result_dst, src_reg(acc)));
1625 }
1626 } else {
1627 emit(MUL(result_dst, op[0], op[1]));
1628 }
1629 break;
1630 case ir_binop_imul_high: {
1631 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1632
1633 emit(MUL(acc, op[0], op[1]));
1634 emit(MACH(result_dst, op[0], op[1]));
1635 break;
1636 }
1637 case ir_binop_div:
1638 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1639 assert(ir->type->is_integer());
1640 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1641 break;
1642
1643 case ir_binop_carry:
1644 unreachable("Should have been lowered by carry_to_arith().");
1645
1646 case ir_binop_borrow:
1647 unreachable("Should have been lowered by borrow_to_arith().");
1648
1649 case ir_binop_mod:
1650 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1651 assert(ir->type->is_integer());
1652 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1653 break;
1654
1655 case ir_binop_less:
1656 case ir_binop_greater:
1657 case ir_binop_lequal:
1658 case ir_binop_gequal:
1659 case ir_binop_equal:
1660 case ir_binop_nequal: {
1661 if (devinfo->gen <= 5) {
1662 resolve_bool_comparison(ir->operands[0], &op[0]);
1663 resolve_bool_comparison(ir->operands[1], &op[1]);
1664 }
1665 emit(CMP(result_dst, op[0], op[1],
1666 brw_conditional_for_comparison(ir->operation)));
1667 break;
1668 }
1669
1670 case ir_binop_all_equal:
1671 if (devinfo->gen <= 5) {
1672 resolve_bool_comparison(ir->operands[0], &op[0]);
1673 resolve_bool_comparison(ir->operands[1], &op[1]);
1674 }
1675
1676 /* "==" operator producing a scalar boolean. */
1677 if (ir->operands[0]->type->is_vector() ||
1678 ir->operands[1]->type->is_vector()) {
1679 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1680 emit(MOV(result_dst, src_reg(0)));
1681 inst = emit(MOV(result_dst, src_reg(~0)));
1682 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1683 } else {
1684 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1685 }
1686 break;
1687 case ir_binop_any_nequal:
1688 if (devinfo->gen <= 5) {
1689 resolve_bool_comparison(ir->operands[0], &op[0]);
1690 resolve_bool_comparison(ir->operands[1], &op[1]);
1691 }
1692
1693 /* "!=" operator producing a scalar boolean. */
1694 if (ir->operands[0]->type->is_vector() ||
1695 ir->operands[1]->type->is_vector()) {
1696 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1697
1698 emit(MOV(result_dst, src_reg(0)));
1699 inst = emit(MOV(result_dst, src_reg(~0)));
1700 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1701 } else {
1702 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1703 }
1704 break;
1705
1706 case ir_unop_any:
1707 if (devinfo->gen <= 5) {
1708 resolve_bool_comparison(ir->operands[0], &op[0]);
1709 }
1710 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1711 emit(MOV(result_dst, src_reg(0)));
1712
1713 inst = emit(MOV(result_dst, src_reg(~0)));
1714 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1715 break;
1716
1717 case ir_binop_logic_xor:
1718 emit(XOR(result_dst, op[0], op[1]));
1719 break;
1720
1721 case ir_binop_logic_or:
1722 emit(OR(result_dst, op[0], op[1]));
1723 break;
1724
1725 case ir_binop_logic_and:
1726 emit(AND(result_dst, op[0], op[1]));
1727 break;
1728
1729 case ir_binop_dot:
1730 assert(ir->operands[0]->type->is_vector());
1731 assert(ir->operands[0]->type == ir->operands[1]->type);
1732 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1733 break;
1734
1735 case ir_unop_sqrt:
1736 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1737 break;
1738 case ir_unop_rsq:
1739 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1740 break;
1741
1742 case ir_unop_bitcast_i2f:
1743 case ir_unop_bitcast_u2f:
1744 this->result = op[0];
1745 this->result.type = BRW_REGISTER_TYPE_F;
1746 break;
1747
1748 case ir_unop_bitcast_f2i:
1749 this->result = op[0];
1750 this->result.type = BRW_REGISTER_TYPE_D;
1751 break;
1752
1753 case ir_unop_bitcast_f2u:
1754 this->result = op[0];
1755 this->result.type = BRW_REGISTER_TYPE_UD;
1756 break;
1757
1758 case ir_unop_i2f:
1759 case ir_unop_i2u:
1760 case ir_unop_u2i:
1761 case ir_unop_u2f:
1762 case ir_unop_f2i:
1763 case ir_unop_f2u:
1764 emit(MOV(result_dst, op[0]));
1765 break;
1766 case ir_unop_b2i:
1767 case ir_unop_b2f:
1768 if (devinfo->gen <= 5) {
1769 resolve_bool_comparison(ir->operands[0], &op[0]);
1770 }
1771 emit(MOV(result_dst, negate(op[0])));
1772 break;
1773 case ir_unop_f2b:
1774 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1775 break;
1776 case ir_unop_i2b:
1777 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1778 break;
1779
1780 case ir_unop_trunc:
1781 emit(RNDZ(result_dst, op[0]));
1782 break;
1783 case ir_unop_ceil: {
1784 src_reg tmp = src_reg(this, ir->type);
1785 op[0].negate = !op[0].negate;
1786 emit(RNDD(dst_reg(tmp), op[0]));
1787 tmp.negate = true;
1788 emit(MOV(result_dst, tmp));
1789 }
1790 break;
1791 case ir_unop_floor:
1792 inst = emit(RNDD(result_dst, op[0]));
1793 break;
1794 case ir_unop_fract:
1795 inst = emit(FRC(result_dst, op[0]));
1796 break;
1797 case ir_unop_round_even:
1798 emit(RNDE(result_dst, op[0]));
1799 break;
1800
1801 case ir_binop_min:
1802 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1803 break;
1804 case ir_binop_max:
1805 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1806 break;
1807
1808 case ir_binop_pow:
1809 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1810 break;
1811
1812 case ir_unop_bit_not:
1813 inst = emit(NOT(result_dst, op[0]));
1814 break;
1815 case ir_binop_bit_and:
1816 inst = emit(AND(result_dst, op[0], op[1]));
1817 break;
1818 case ir_binop_bit_xor:
1819 inst = emit(XOR(result_dst, op[0], op[1]));
1820 break;
1821 case ir_binop_bit_or:
1822 inst = emit(OR(result_dst, op[0], op[1]));
1823 break;
1824
1825 case ir_binop_lshift:
1826 inst = emit(SHL(result_dst, op[0], op[1]));
1827 break;
1828
1829 case ir_binop_rshift:
1830 if (ir->type->base_type == GLSL_TYPE_INT)
1831 inst = emit(ASR(result_dst, op[0], op[1]));
1832 else
1833 inst = emit(SHR(result_dst, op[0], op[1]));
1834 break;
1835
1836 case ir_binop_bfm:
1837 emit(BFI1(result_dst, op[0], op[1]));
1838 break;
1839
1840 case ir_binop_ubo_load: {
1841 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1842 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1843 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1844 src_reg offset;
1845
1846 /* Now, load the vector from that offset. */
1847 assert(ir->type->is_vector() || ir->type->is_scalar());
1848
1849 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1850 packed_consts.type = result.type;
1851 src_reg surf_index;
1852
1853 if (const_uniform_block) {
1854 /* The block index is a constant, so just emit the binding table entry
1855 * as an immediate.
1856 */
1857 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1858 const_uniform_block->value.u[0]);
1859 } else {
1860 /* The block index is not a constant. Evaluate the index expression
1861 * per-channel and add the base UBO index; we have to select a value
1862 * from any live channel.
1863 */
1864 surf_index = src_reg(this, glsl_type::uint_type);
1865 emit(ADD(dst_reg(surf_index), op[0],
1866 src_reg(prog_data->base.binding_table.ubo_start)));
1867 surf_index = emit_uniformize(surf_index);
1868
1869 /* Assume this may touch any UBO. It would be nice to provide
1870 * a tighter bound, but the array information is already lowered away.
1871 */
1872 brw_mark_surface_used(&prog_data->base,
1873 prog_data->base.binding_table.ubo_start +
1874 shader_prog->NumUniformBlocks - 1);
1875 }
1876
1877 if (const_offset_ir) {
1878 if (devinfo->gen >= 8) {
1879 /* Store the offset in a GRF so we can send-from-GRF. */
1880 offset = src_reg(this, glsl_type::int_type);
1881 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1882 } else {
1883 /* Immediates are fine on older generations since they'll be moved
1884 * to a (potentially fake) MRF at the generator level.
1885 */
1886 offset = src_reg(const_offset / 16);
1887 }
1888 } else {
1889 offset = src_reg(this, glsl_type::uint_type);
1890 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1891 }
1892
1893 emit_pull_constant_load_reg(dst_reg(packed_consts),
1894 surf_index,
1895 offset,
1896 NULL, NULL /* before_block/inst */);
1897
1898 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1899 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1900 const_offset % 16 / 4,
1901 const_offset % 16 / 4,
1902 const_offset % 16 / 4);
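      /* For a non-constant offset, const_offset is 0 and this extra swizzle is
       * a no-op.  For a constant offset, the value starts at byte
       * (const_offset % 16) within the loaded vec4: e.g. a float at byte
       * offset 8 lives in channel 8 % 16 / 4 == 2, so every channel of the
       * swizzle is shifted to read .z.
       */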
1903
1904 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1905 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1906 emit(CMP(result_dst, packed_consts, src_reg(0u),
1907 BRW_CONDITIONAL_NZ));
1908 } else {
1909 emit(MOV(result_dst, packed_consts));
1910 }
1911 break;
1912 }
1913
1914 case ir_binop_vector_extract:
1915 unreachable("should have been lowered by vec_index_to_cond_assign");
1916
1917 case ir_triop_fma:
1918 op[0] = fix_3src_operand(op[0]);
1919 op[1] = fix_3src_operand(op[1]);
1920 op[2] = fix_3src_operand(op[2]);
1921 /* Note that the instruction's argument order is reversed from GLSL
1922 * and the IR.
1923 */
1924 emit(MAD(result_dst, op[2], op[1], op[0]));
1925 break;
1926
1927 case ir_triop_lrp:
1928 emit_lrp(result_dst, op[0], op[1], op[2]);
1929 break;
1930
1931 case ir_triop_csel:
1932 unreachable("already handled above");
1933 break;
1934
1935 case ir_triop_bfi:
1936 op[0] = fix_3src_operand(op[0]);
1937 op[1] = fix_3src_operand(op[1]);
1938 op[2] = fix_3src_operand(op[2]);
1939 emit(BFI2(result_dst, op[0], op[1], op[2]));
1940 break;
1941
1942 case ir_triop_bitfield_extract:
1943 op[0] = fix_3src_operand(op[0]);
1944 op[1] = fix_3src_operand(op[1]);
1945 op[2] = fix_3src_operand(op[2]);
1946 /* Note that the instruction's argument order is reversed from GLSL
1947 * and the IR.
1948 */
1949 emit(BFE(result_dst, op[2], op[1], op[0]));
1950 break;
1951
1952 case ir_triop_vector_insert:
1953 unreachable("should have been lowered by lower_vector_insert");
1954
1955 case ir_quadop_bitfield_insert:
1956 unreachable("not reached: should be handled by "
1957 "bitfield_insert_to_bfm_bfi\n");
1958
1959 case ir_quadop_vector:
1960 unreachable("not reached: should be handled by lower_quadop_vector");
1961
1962 case ir_unop_pack_half_2x16:
1963 emit_pack_half_2x16(result_dst, op[0]);
1964 break;
1965 case ir_unop_unpack_half_2x16:
1966 emit_unpack_half_2x16(result_dst, op[0]);
1967 break;
1968 case ir_unop_unpack_unorm_4x8:
1969 emit_unpack_unorm_4x8(result_dst, op[0]);
1970 break;
1971 case ir_unop_unpack_snorm_4x8:
1972 emit_unpack_snorm_4x8(result_dst, op[0]);
1973 break;
1974 case ir_unop_pack_unorm_4x8:
1975 emit_pack_unorm_4x8(result_dst, op[0]);
1976 break;
1977 case ir_unop_pack_snorm_4x8:
1978 emit_pack_snorm_4x8(result_dst, op[0]);
1979 break;
1980 case ir_unop_pack_snorm_2x16:
1981 case ir_unop_pack_unorm_2x16:
1982 case ir_unop_unpack_snorm_2x16:
1983 case ir_unop_unpack_unorm_2x16:
1984 unreachable("not reached: should be handled by lower_packing_builtins");
1985 case ir_unop_unpack_half_2x16_split_x:
1986 case ir_unop_unpack_half_2x16_split_y:
1987 case ir_binop_pack_half_2x16_split:
1988 case ir_unop_interpolate_at_centroid:
1989 case ir_binop_interpolate_at_sample:
1990 case ir_binop_interpolate_at_offset:
1991 unreachable("not reached: should not occur in vertex shader");
1992 case ir_binop_ldexp:
1993 unreachable("not reached: should be handled by ldexp_to_arith()");
1994 case ir_unop_d2f:
1995 case ir_unop_f2d:
1996 case ir_unop_d2i:
1997 case ir_unop_i2d:
1998 case ir_unop_d2u:
1999 case ir_unop_u2d:
2000 case ir_unop_d2b:
2001 case ir_unop_pack_double_2x32:
2002 case ir_unop_unpack_double_2x32:
2003 case ir_unop_frexp_sig:
2004 case ir_unop_frexp_exp:
2005 unreachable("fp64 todo");
2006 }
2007 }
2008
2009
2010 void
2011 vec4_visitor::visit(ir_swizzle *ir)
2012 {
2013 /* Note that this is only swizzles in expressions, not those on the left
2014 * hand side of an assignment, which do write masking. See ir_assignment
2015 * for that.
2016 */
2017 const unsigned swz = brw_compose_swizzle(
2018 brw_swizzle_for_size(ir->type->vector_elements),
2019 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2020
2021 ir->val->accept(this);
2022 this->result = swizzle(this->result, swz);
2023 }
2024
2025 void
2026 vec4_visitor::visit(ir_dereference_variable *ir)
2027 {
2028 const struct glsl_type *type = ir->type;
2029 dst_reg *reg = variable_storage(ir->var);
2030
2031 if (!reg) {
2032 fail("Failed to find variable storage for %s\n", ir->var->name);
2033 this->result = src_reg(brw_null_reg());
2034 return;
2035 }
2036
2037 this->result = src_reg(*reg);
2038
2039 /* System values get their swizzle from the dst_reg writemask */
2040 if (ir->var->data.mode == ir_var_system_value)
2041 return;
2042
2043 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2044 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2045 }
2046
2047
2048 int
2049 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2050 {
2051 /* Under normal circumstances array elements are stored consecutively, so
2052 * the stride is equal to the size of the array element.
2053 */
2054 return type_size(ir->type);
2055 }
2056
2057
2058 void
2059 vec4_visitor::visit(ir_dereference_array *ir)
2060 {
2061 ir_constant *constant_index;
2062 src_reg src;
2063 int array_stride = compute_array_stride(ir);
2064
2065 constant_index = ir->array_index->constant_expression_value();
2066
2067 ir->array->accept(this);
2068 src = this->result;
2069
2070 if (constant_index) {
2071 src.reg_offset += constant_index->value.i[0] * array_stride;
2072 } else {
2073 /* Variable index array dereference. The base of the array is the
2074 * register computed above; the index, scaled by the array stride (in
2075 * register units), is attached to it as a relative address (reladdr).
2076 */
2077 ir->array_index->accept(this);
2078
2079 src_reg index_reg;
2080
2081 if (array_stride == 1) {
2082 index_reg = this->result;
2083 } else {
2084 index_reg = src_reg(this, glsl_type::int_type);
2085
2086 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2087 }
2088
2089 if (src.reladdr) {
2090 src_reg temp = src_reg(this, glsl_type::int_type);
2091
2092 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2093
2094 index_reg = temp;
2095 }
2096
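      /* Later passes (move_grf_array_access_to_scratch and
       * move_uniform_array_access_to_pull_constants) lower these reladdr
       * accesses into scratch or pull-constant loads.
       */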
2097 src.reladdr = ralloc(mem_ctx, src_reg);
2098 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2099 }
2100
2101 /* If the type is smaller than a vec4, replicate the last channel out. */
2102 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2103 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2104 else
2105 src.swizzle = BRW_SWIZZLE_NOOP;
2106 src.type = brw_type_for_base_type(ir->type);
2107
2108 this->result = src;
2109 }
2110
2111 void
2112 vec4_visitor::visit(ir_dereference_record *ir)
2113 {
2114 unsigned int i;
2115 const glsl_type *struct_type = ir->record->type;
2116 int offset = 0;
2117
2118 ir->record->accept(this);
2119
2120 for (i = 0; i < struct_type->length; i++) {
2121 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2122 break;
2123 offset += type_size(struct_type->fields.structure[i].type);
2124 }
2125
2126 /* If the type is smaller than a vec4, replicate the last channel out. */
2127 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2128 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2129 else
2130 this->result.swizzle = BRW_SWIZZLE_NOOP;
2131 this->result.type = brw_type_for_base_type(ir->type);
2132
2133 this->result.reg_offset += offset;
2134 }
2135
2136 /**
2137 * We want to be careful in assignment setup to hit the actual storage
2138 * instead of potentially using a temporary like we might with the
2139 * ir_dereference handler.
2140 */
2141 static dst_reg
2142 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2143 {
2144 /* The LHS must be a dereference. If the LHS is a variable indexed array
2145 * access of a vector, it must be separated into a series of conditional moves
2146 * before reaching this point (see ir_vec_index_to_cond_assign).
2147 */
2148 assert(ir->as_dereference());
2149 ir_dereference_array *deref_array = ir->as_dereference_array();
2150 if (deref_array) {
2151 assert(!deref_array->array->type->is_vector());
2152 }
2153
2154 /* Use the rvalue deref handler for the most part. We'll ignore
2155 * swizzles in it and write swizzles using writemask, though.
2156 */
2157 ir->accept(v);
2158 return dst_reg(v->result);
2159 }
2160
2161 void
2162 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2163 const struct glsl_type *type,
2164 enum brw_predicate predicate)
2165 {
2166 if (type->base_type == GLSL_TYPE_STRUCT) {
2167 for (unsigned int i = 0; i < type->length; i++) {
2168 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2169 }
2170 return;
2171 }
2172
2173 if (type->is_array()) {
2174 for (unsigned int i = 0; i < type->length; i++) {
2175 emit_block_move(dst, src, type->fields.array, predicate);
2176 }
2177 return;
2178 }
2179
2180 if (type->is_matrix()) {
2181 const struct glsl_type *vec_type;
2182
2183 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2184 type->vector_elements, 1);
2185
2186 for (int i = 0; i < type->matrix_columns; i++) {
2187 emit_block_move(dst, src, vec_type, predicate);
2188 }
2189 return;
2190 }
2191
2192 assert(type->is_scalar() || type->is_vector());
2193
2194 dst->type = brw_type_for_base_type(type);
2195 src->type = dst->type;
2196
2197 dst->writemask = (1 << type->vector_elements) - 1;
2198
2199 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2200
2201 vec4_instruction *inst = emit(MOV(*dst, *src));
2202 inst->predicate = predicate;
2203
2204 dst->reg_offset++;
2205 src->reg_offset++;
2206 }
2207
2208
2209 /* If the RHS processing resulted in an instruction generating a
2210 * temporary value, and it would be easy to rewrite the instruction to
2211 * generate its result right into the LHS instead, do so. This ends
2212 * up reliably removing instructions where it can be tricky to do so
2213 * later without real UD chain information.
2214 */
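/* For example, if the RHS produced "ADD tmp, a, b", the ADD is retargeted to
 * write the LHS directly and the trailing MOV from tmp is never emitted.
 */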
2215 bool
2216 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2217 dst_reg dst,
2218 src_reg src,
2219 vec4_instruction *pre_rhs_inst,
2220 vec4_instruction *last_rhs_inst)
2221 {
2222 /* This could be supported, but it would take more smarts. */
2223 if (ir->condition)
2224 return false;
2225
2226 if (pre_rhs_inst == last_rhs_inst)
2227 return false; /* No instructions generated to work with. */
2228
2229 /* Make sure the last instruction generated our source reg. */
2230 if (src.file != GRF ||
2231 src.file != last_rhs_inst->dst.file ||
2232 src.reg != last_rhs_inst->dst.reg ||
2233 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2234 src.reladdr ||
2235 src.abs ||
2236 src.negate ||
2237 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2238 return false;
2239
2240 /* Check that the last instruction fully initialized the channels
2241 * we want to use, in the order we want to use them. We could
2242 * potentially reswizzle the operands of many instructions so that
2243 * we could handle out of order channels, but don't yet.
2244 */
2245
2246 for (unsigned i = 0; i < 4; i++) {
2247 if (dst.writemask & (1 << i)) {
2248 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2249 return false;
2250
2251 if (BRW_GET_SWZ(src.swizzle, i) != i)
2252 return false;
2253 }
2254 }
2255
2256 /* Success! Rewrite the instruction. */
2257 last_rhs_inst->dst.file = dst.file;
2258 last_rhs_inst->dst.reg = dst.reg;
2259 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2260 last_rhs_inst->dst.reladdr = dst.reladdr;
2261 last_rhs_inst->dst.writemask &= dst.writemask;
2262
2263 return true;
2264 }
2265
2266 void
2267 vec4_visitor::visit(ir_assignment *ir)
2268 {
2269 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2270 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2271
2272 if (!ir->lhs->type->is_scalar() &&
2273 !ir->lhs->type->is_vector()) {
2274 ir->rhs->accept(this);
2275 src_reg src = this->result;
2276
2277 if (ir->condition) {
2278 emit_bool_to_cond_code(ir->condition, &predicate);
2279 }
2280
2281 /* emit_block_move doesn't account for swizzles in the source register.
2282 * This should be ok, since the source register is a structure or an
2283 * array, and those can't be swizzled. But double-check to be sure.
2284 */
2285 assert(src.swizzle ==
2286 (ir->rhs->type->is_matrix()
2287 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2288 : BRW_SWIZZLE_NOOP));
2289
2290 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2291 return;
2292 }
2293
2294 /* Now we're down to just a scalar/vector with writemasks. */
2295 int i;
2296
2297 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2298 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2299
2300 ir->rhs->accept(this);
2301
2302 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2303
2304 int swizzles[4];
2305 int src_chan = 0;
2306
2307 assert(ir->lhs->type->is_vector() ||
2308 ir->lhs->type->is_scalar());
2309 dst.writemask = ir->write_mask;
2310
2311 /* Swizzle a small RHS vector into the channels being written.
2312 *
2313 * glsl ir treats write_mask as dictating how many channels are
2314 * present on the RHS while in our instructions we need to make
2315 * those channels appear in the slots of the vec4 they're written to.
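 *
 * For example, a write mask of .yw produces swizzles {0, 0, 0, 1}, so the
 * RHS's .x and .y are routed into the .y and .w slots of the destination.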
2316 */
2317 for (int i = 0; i < 4; i++)
2318 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2319
2320 src_reg src = swizzle(this->result,
2321 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2322 swizzles[2], swizzles[3]));
2323
2324 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2325 return;
2326 }
2327
2328 if (ir->condition) {
2329 emit_bool_to_cond_code(ir->condition, &predicate);
2330 }
2331
2332 for (i = 0; i < type_size(ir->lhs->type); i++) {
2333 vec4_instruction *inst = emit(MOV(dst, src));
2334 inst->predicate = predicate;
2335
2336 dst.reg_offset++;
2337 src.reg_offset++;
2338 }
2339 }
2340
2341 void
2342 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2343 {
2344 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2345 foreach_in_list(ir_constant, field_value, &ir->components) {
2346 emit_constant_values(dst, field_value);
2347 }
2348 return;
2349 }
2350
2351 if (ir->type->is_array()) {
2352 for (unsigned int i = 0; i < ir->type->length; i++) {
2353 emit_constant_values(dst, ir->array_elements[i]);
2354 }
2355 return;
2356 }
2357
2358 if (ir->type->is_matrix()) {
2359 for (int i = 0; i < ir->type->matrix_columns; i++) {
2360 float *vec = &ir->value.f[i * ir->type->vector_elements];
2361
2362 for (int j = 0; j < ir->type->vector_elements; j++) {
2363 dst->writemask = 1 << j;
2364 dst->type = BRW_REGISTER_TYPE_F;
2365
2366 emit(MOV(*dst, src_reg(vec[j])));
2367 }
2368 dst->reg_offset++;
2369 }
2370 return;
2371 }
2372
2373 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2374
2375 for (int i = 0; i < ir->type->vector_elements; i++) {
2376 if (!(remaining_writemask & (1 << i)))
2377 continue;
2378
2379 dst->writemask = 1 << i;
2380 dst->type = brw_type_for_base_type(ir->type);
2381
2382 /* Find other components that match the one we're about to
2383 * write. Emits fewer instructions for things like vec4(0.5,
2384 * 1.5, 1.5, 1.5).
2385 */
2386 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2387 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2388 if (ir->value.b[i] == ir->value.b[j])
2389 dst->writemask |= (1 << j);
2390 } else {
2391 /* u, i, and f storage all line up, so no need for a
2392 * switch case for comparing each type.
2393 */
2394 if (ir->value.u[i] == ir->value.u[j])
2395 dst->writemask |= (1 << j);
2396 }
2397 }
2398
2399 switch (ir->type->base_type) {
2400 case GLSL_TYPE_FLOAT:
2401 emit(MOV(*dst, src_reg(ir->value.f[i])));
2402 break;
2403 case GLSL_TYPE_INT:
2404 emit(MOV(*dst, src_reg(ir->value.i[i])));
2405 break;
2406 case GLSL_TYPE_UINT:
2407 emit(MOV(*dst, src_reg(ir->value.u[i])));
2408 break;
2409 case GLSL_TYPE_BOOL:
2410 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2411 break;
2412 default:
2413 unreachable("Non-float/uint/int/bool constant");
2414 }
2415
2416 remaining_writemask &= ~dst->writemask;
2417 }
2418 dst->reg_offset++;
2419 }
2420
2421 void
2422 vec4_visitor::visit(ir_constant *ir)
2423 {
2424 dst_reg dst = dst_reg(this, ir->type);
2425 this->result = src_reg(dst);
2426
2427 emit_constant_values(&dst, ir);
2428 }
2429
2430 void
2431 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2432 {
2433 ir_dereference *deref = static_cast<ir_dereference *>(
2434 ir->actual_parameters.get_head());
2435 ir_variable *location = deref->variable_referenced();
2436 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2437 location->data.binding);
2438
2439 /* Calculate the surface offset */
2440 src_reg offset(this, glsl_type::uint_type);
2441 ir_dereference_array *deref_array = deref->as_dereference_array();
2442 if (deref_array) {
2443 deref_array->array_index->accept(this);
2444
2445 src_reg tmp(this, glsl_type::uint_type);
2446 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2447 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2448 } else {
2449 offset = location->data.atomic.offset;
2450 }
2451
2452 /* Emit the appropriate machine instruction */
2453 const char *callee = ir->callee->function_name();
2454 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2455
2456 if (!strcmp("__intrinsic_atomic_read", callee)) {
2457 emit_untyped_surface_read(surf_index, dst, offset);
2458
2459 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2460 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2461 src_reg(), src_reg());
2462
2463 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2464 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2465 src_reg(), src_reg());
2466 }
2467
2468 brw_mark_surface_used(stage_prog_data, surf_index);
2469 }
2470
2471 void
2472 vec4_visitor::visit(ir_call *ir)
2473 {
2474 const char *callee = ir->callee->function_name();
2475
2476 if (!strcmp("__intrinsic_atomic_read", callee) ||
2477 !strcmp("__intrinsic_atomic_increment", callee) ||
2478 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2479 visit_atomic_counter_intrinsic(ir);
2480 } else {
2481 unreachable("Unsupported intrinsic.");
2482 }
2483 }
2484
2485 src_reg
2486 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2487 src_reg coordinate, src_reg sampler)
2488 {
2489 vec4_instruction *inst =
2490 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2491 dst_reg(this, glsl_type::uvec4_type));
2492 inst->base_mrf = 2;
2493 inst->src[1] = sampler;
2494
2495 int param_base;
2496
2497 if (devinfo->gen >= 9) {
2498 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2499 vec4_instruction *header_inst = new(mem_ctx)
2500 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2501 dst_reg(MRF, inst->base_mrf));
2502
2503 emit(header_inst);
2504
2505 inst->mlen = 2;
2506 inst->header_size = 1;
2507 param_base = inst->base_mrf + 1;
2508 } else {
2509 inst->mlen = 1;
2510 param_base = inst->base_mrf;
2511 }
2512
2513 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2514 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2515 int zero_mask = 0xf & ~coord_mask;
2516
2517 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2518 coordinate));
2519
2520 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2521 src_reg(0)));
2522
2523 emit(inst);
2524 return src_reg(inst->dst);
2525 }
2526
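/* Sampler indices that don't fit in the 4-bit field of the sampler message
 * descriptor (>= 16), or that aren't compile-time immediates, have to be
 * routed through the message header.  Hardware before Haswell never takes
 * this path.
 */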
2527 bool
2528 vec4_visitor::is_high_sampler(src_reg sampler)
2529 {
2530 if (devinfo->gen < 8 && !devinfo->is_haswell)
2531 return false;
2532
2533 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2534 }
2535
2536 void
2537 vec4_visitor::emit_texture(ir_texture_opcode op,
2538 dst_reg dest,
2539 const glsl_type *dest_type,
2540 src_reg coordinate,
2541 int coord_components,
2542 src_reg shadow_comparitor,
2543 src_reg lod, src_reg lod2,
2544 src_reg sample_index,
2545 uint32_t constant_offset,
2546 src_reg offset_value,
2547 src_reg mcs,
2548 bool is_cube_array,
2549 uint32_t sampler,
2550 src_reg sampler_reg)
2551 {
2552 enum opcode opcode;
2553 switch (op) {
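   /* Vertex shaders have no implicit derivatives, so a plain texture()
    * (ir_tex) is sampled as TXL with an explicit LOD of zero, which the
    * caller sets up.
    */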
2554 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2555 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2556 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2557 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2558 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2559 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2560 case ir_tg4: opcode = offset_value.file != BAD_FILE
2561 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2562 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2563 case ir_txb:
2564 unreachable("TXB is not valid for vertex shaders.");
2565 case ir_lod:
2566 unreachable("LOD is not valid for vertex shaders.");
2567 default:
2568 unreachable("Unrecognized tex op");
2569 }
2570
2571 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2572 opcode, dst_reg(this, dest_type));
2573
2574 inst->offset = constant_offset;
2575
2576 /* The message header is necessary for:
2577 * - Gen4 (always)
2578 * - Gen9+ for selecting SIMD4x2
2579 * - Texel offsets
2580 * - Gather channel selection
2581 * - Sampler indices too large to fit in a 4-bit value.
2582 */
2583 inst->header_size =
2584 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2585 inst->offset != 0 || op == ir_tg4 ||
2586 is_high_sampler(sampler_reg)) ? 1 : 0;
2587 inst->base_mrf = 2;
2588 inst->mlen = inst->header_size + 1; /* always at least one */
2589 inst->dst.writemask = WRITEMASK_XYZW;
2590 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2591
2592 inst->src[1] = sampler_reg;
2593
2594 /* MRF for the first parameter */
2595 int param_base = inst->base_mrf + inst->header_size;
2596
2597 if (op == ir_txs || op == ir_query_levels) {
2598 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2599 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2600 } else {
2601 /* Load the coordinate */
2602 /* FINISHME: gl_clamp_mask and saturate */
2603 int coord_mask = (1 << coord_components) - 1;
2604 int zero_mask = 0xf & ~coord_mask;
2605
2606 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2607 coordinate));
2608
2609 if (zero_mask != 0) {
2610 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2611 src_reg(0)));
2612 }
2613 /* Load the shadow comparitor */
2614 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2615 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2616 WRITEMASK_X),
2617 shadow_comparitor));
2618 inst->mlen++;
2619 }
2620
2621 /* Load the LOD info */
2622 if (op == ir_tex || op == ir_txl) {
2623 int mrf, writemask;
2624 if (devinfo->gen >= 5) {
2625 mrf = param_base + 1;
2626 if (shadow_comparitor.file != BAD_FILE) {
2627 writemask = WRITEMASK_Y;
2628 /* mlen already incremented */
2629 } else {
2630 writemask = WRITEMASK_X;
2631 inst->mlen++;
2632 }
2633 } else /* devinfo->gen == 4 */ {
2634 mrf = param_base;
2635 writemask = WRITEMASK_W;
2636 }
2637 lod.swizzle = BRW_SWIZZLE_XXXX;
2638 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2639 } else if (op == ir_txf) {
2640 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2641 } else if (op == ir_txf_ms) {
2642 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2643 sample_index));
2644 if (devinfo->gen >= 7) {
2645 /* MCS data is in the first channel of `mcs`, but we need to get it into
2646 * the .y channel of the second vec4 of params, so replicate .x across
2647 * the whole vec4 and then mask off everything except .y
2648 */
2649 mcs.swizzle = BRW_SWIZZLE_XXXX;
2650 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2651 mcs));
2652 }
2653 inst->mlen++;
2654 } else if (op == ir_txd) {
2655 const brw_reg_type type = lod.type;
2656
2657 if (devinfo->gen >= 5) {
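         /* Interleave the two gradient vectors into a single MRF:
          * .xz receives dPdx.xy and .yw receives dPdy.xy.
          */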
2658 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2659 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2660 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2661 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2662 inst->mlen++;
2663
2664 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2665 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2666 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2667 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2668 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2669 inst->mlen++;
2670
2671 if (shadow_comparitor.file != BAD_FILE) {
2672 emit(MOV(dst_reg(MRF, param_base + 2,
2673 shadow_comparitor.type, WRITEMASK_Z),
2674 shadow_comparitor));
2675 }
2676 }
2677 } else /* devinfo->gen == 4 */ {
2678 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2679 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2680 inst->mlen += 2;
2681 }
2682 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2683 if (shadow_comparitor.file != BAD_FILE) {
2684 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2685 shadow_comparitor));
2686 }
2687
2688 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2689 offset_value));
2690 inst->mlen++;
2691 }
2692 }
2693
2694 emit(inst);
2695
2696 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2697 * faces * layers, but the spec requires just layers.
2698 */
2699 if (op == ir_txs && is_cube_array) {
2700 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2701 writemask(inst->dst, WRITEMASK_Z),
2702 src_reg(inst->dst), src_reg(6));
2703 }
2704
2705 if (devinfo->gen == 6 && op == ir_tg4) {
2706 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2707 }
2708
2709 swizzle_result(op, dest,
2710 src_reg(inst->dst), sampler, dest_type);
2711 }
2712
2713 void
2714 vec4_visitor::visit(ir_texture *ir)
2715 {
2716 uint32_t sampler =
2717 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2718
2719 ir_rvalue *nonconst_sampler_index =
2720 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2721
2722 /* Handle non-constant sampler array indexing */
2723 src_reg sampler_reg;
2724 if (nonconst_sampler_index) {
2725 /* The highest sampler which may be used by this operation is
2726 * the last element of the array. Mark it here, because the generator
2727 * doesn't have enough information to determine the bound.
2728 */
2729 uint32_t array_size = ir->sampler->as_dereference_array()
2730 ->array->type->array_size();
2731
2732 uint32_t max_used = sampler + array_size - 1;
2733 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2734 max_used += prog_data->base.binding_table.gather_texture_start;
2735 } else {
2736 max_used += prog_data->base.binding_table.texture_start;
2737 }
2738
2739 brw_mark_surface_used(&prog_data->base, max_used);
2740
2741 /* Emit code to evaluate the actual indexing expression */
2742 nonconst_sampler_index->accept(this);
2743 src_reg temp(this, glsl_type::uint_type);
2744 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2745 sampler_reg = emit_uniformize(temp);
2746 } else {
2747 /* Single sampler, or constant array index; the indexing expression
2748 * is just an immediate.
2749 */
2750 sampler_reg = src_reg(sampler);
2751 }
2752
2753 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2754 * emitting anything other than setting up the constant result.
2755 */
2756 if (ir->op == ir_tg4) {
2757 ir_constant *chan = ir->lod_info.component->as_constant();
2758 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2759 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2760 dst_reg result(this, ir->type);
2761 this->result = src_reg(result);
2762 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2763 return;
2764 }
2765 }
2766
2767 /* Should be lowered by do_lower_texture_projection */
2768 assert(!ir->projector);
2769
2770 /* Should be lowered */
2771 assert(!ir->offset || !ir->offset->type->is_array());
2772
2773 /* Generate code to compute all the subexpression trees. This has to be
2774 * done before loading any values into MRFs for the sampler message since
2775 * generating these values may involve SEND messages that need the MRFs.
2776 */
2777 src_reg coordinate;
2778 int coord_components = 0;
2779 if (ir->coordinate) {
2780 coord_components = ir->coordinate->type->vector_elements;
2781 ir->coordinate->accept(this);
2782 coordinate = this->result;
2783 }
2784
2785 src_reg shadow_comparitor;
2786 if (ir->shadow_comparitor) {
2787 ir->shadow_comparitor->accept(this);
2788 shadow_comparitor = this->result;
2789 }
2790
2791 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2792 src_reg offset_value;
2793 if (has_nonconstant_offset) {
2794 ir->offset->accept(this);
2795 offset_value = src_reg(this->result);
2796 }
2797
2798 src_reg lod, lod2, sample_index, mcs;
2799 switch (ir->op) {
2800 case ir_tex:
2801 lod = src_reg(0.0f);
2802 break;
2803 case ir_txf:
2804 case ir_txl:
2805 case ir_txs:
2806 ir->lod_info.lod->accept(this);
2807 lod = this->result;
2808 break;
2809 case ir_query_levels:
2810 lod = src_reg(0);
2811 break;
2812 case ir_txf_ms:
2813 ir->lod_info.sample_index->accept(this);
2814 sample_index = this->result;
2815
2816 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2817 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2818 else
2819 mcs = src_reg(0u);
2820 break;
2821 case ir_txd:
2822 ir->lod_info.grad.dPdx->accept(this);
2823 lod = this->result;
2824
2825 ir->lod_info.grad.dPdy->accept(this);
2826 lod2 = this->result;
2827 break;
2828 case ir_txb:
2829 case ir_lod:
2830 case ir_tg4:
2831 break;
2832 }
2833
2834 uint32_t constant_offset = 0;
2835 if (ir->offset != NULL && !has_nonconstant_offset) {
2836 constant_offset =
2837 brw_texture_offset(ir->offset->as_constant()->value.i,
2838 ir->offset->type->vector_elements);
2839 }
2840
2841 /* Stuff the channel select bits in the top of the texture offset */
2842 if (ir->op == ir_tg4)
2843 constant_offset |=
2844 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2845 sampler) << 16;
2846
2847 glsl_type const *type = ir->sampler->type;
2848 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2849 type->sampler_array;
2850
2851 this->result = src_reg(this, ir->type);
2852 dst_reg dest = dst_reg(this->result);
2853
2854 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2855 shadow_comparitor,
2856 lod, lod2, sample_index,
2857 constant_offset, offset_value,
2858 mcs, is_cube_array, sampler, sampler_reg);
2859 }
2860
2861 /**
2862 * Apply workarounds for Gen6 gather with UINT/SINT
2863 */
2864 void
2865 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2866 {
2867 if (!wa)
2868 return;
2869
2870 int width = (wa & WA_8BIT) ? 8 : 16;
2871 dst_reg dst_f = dst;
2872 dst_f.type = BRW_REGISTER_TYPE_F;
2873
2874 /* Convert from UNORM to UINT */
2875 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2876 emit(MOV(dst, src_reg(dst_f)));
2877
2878 if (wa & WA_SIGN) {
2879 /* Reinterpret the UINT value as a signed INT value by
2880 * shifting the sign bit into place, then shifting back
2881 * preserving sign.
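 *
 * E.g. with width == 8, SHL by 24 followed by ASR by 24 sign-extends the
 * low byte.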
2882 */
2883 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2884 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2885 }
2886 }
2887
2888 /**
2889 * Set up the gather channel based on the swizzle, for gather4.
2890 */
2891 uint32_t
2892 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2893 {
2894 int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2895 switch (swiz) {
2896 case SWIZZLE_X: return 0;
2897 case SWIZZLE_Y:
2898 /* gather4 sampler is broken for green channel on RG32F --
2899 * we must ask for blue instead.
2900 */
2901 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2902 return 2;
2903 return 1;
2904 case SWIZZLE_Z: return 2;
2905 case SWIZZLE_W: return 3;
2906 default:
2907 unreachable("Not reached"); /* zero, one swizzles handled already */
2908 }
2909 }
2910
2911 void
2912 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2913 src_reg orig_val, uint32_t sampler,
2914 const glsl_type *dest_type)
2915 {
2916 int s = key->tex.swizzles[sampler];
2917
2918 dst_reg swizzled_result = dest;
2919
2920 if (op == ir_query_levels) {
2921 /* # levels is in .w */
2922 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2923 emit(MOV(swizzled_result, orig_val));
2924 return;
2925 }
2926
2927 if (op == ir_txs || dest_type == glsl_type::float_type
2928 || s == SWIZZLE_NOOP || op == ir_tg4) {
2929 emit(MOV(swizzled_result, orig_val));
2930 return;
2931 }
2932
2933
2934 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2935 int swizzle[4] = {0};
2936
2937 for (int i = 0; i < 4; i++) {
2938 switch (GET_SWZ(s, i)) {
2939 case SWIZZLE_ZERO:
2940 zero_mask |= (1 << i);
2941 break;
2942 case SWIZZLE_ONE:
2943 one_mask |= (1 << i);
2944 break;
2945 default:
2946 copy_mask |= (1 << i);
2947 swizzle[i] = GET_SWZ(s, i);
2948 break;
2949 }
2950 }
2951
2952 if (copy_mask) {
2953 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2954 swizzled_result.writemask = copy_mask;
2955 emit(MOV(swizzled_result, orig_val));
2956 }
2957
2958 if (zero_mask) {
2959 swizzled_result.writemask = zero_mask;
2960 emit(MOV(swizzled_result, src_reg(0.0f)));
2961 }
2962
2963 if (one_mask) {
2964 swizzled_result.writemask = one_mask;
2965 emit(MOV(swizzled_result, src_reg(1.0f)));
2966 }
2967 }
2968
2969 void
2970 vec4_visitor::visit(ir_return *)
2971 {
2972 unreachable("not reached");
2973 }
2974
2975 void
2976 vec4_visitor::visit(ir_discard *)
2977 {
2978 unreachable("not reached");
2979 }
2980
2981 void
2982 vec4_visitor::visit(ir_if *ir)
2983 {
2984 /* Don't point the annotation at the if statement, because then it plus
2985 * the then and else blocks get printed.
2986 */
2987 this->base_ir = ir->condition;
2988
2989 if (devinfo->gen == 6) {
2990 emit_if_gen6(ir);
2991 } else {
2992 enum brw_predicate predicate;
2993 emit_bool_to_cond_code(ir->condition, &predicate);
2994 emit(IF(predicate));
2995 }
2996
2997 visit_instructions(&ir->then_instructions);
2998
2999 if (!ir->else_instructions.is_empty()) {
3000 this->base_ir = ir->condition;
3001 emit(BRW_OPCODE_ELSE);
3002
3003 visit_instructions(&ir->else_instructions);
3004 }
3005
3006 this->base_ir = ir->condition;
3007 emit(BRW_OPCODE_ENDIF);
3008 }
3009
3010 void
3011 vec4_visitor::gs_emit_vertex(int stream_id)
3012 {
3013 unreachable("not reached");
3014 }
3015
3016 void
3017 vec4_visitor::visit(ir_emit_vertex *)
3018 {
3019 unreachable("not reached");
3020 }
3021
3022 void
3023 vec4_visitor::gs_end_primitive()
3024 {
3025 unreachable("not reached");
3026 }
3027
3028
3029 void
3030 vec4_visitor::visit(ir_end_primitive *)
3031 {
3032 unreachable("not reached");
3033 }
3034
3035 void
3036 vec4_visitor::visit(ir_barrier *)
3037 {
3038 unreachable("not reached");
3039 }
3040
3041 void
3042 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3043 dst_reg dst, src_reg offset,
3044 src_reg src0, src_reg src1)
3045 {
3046 unsigned mlen = 0;
3047
3048 /* Set the atomic operation offset. */
3049 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3050 mlen++;
3051
3052 /* Set the atomic operation arguments. */
3053 if (src0.file != BAD_FILE) {
3054 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3055 mlen++;
3056 }
3057
3058 if (src1.file != BAD_FILE) {
3059 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3060 mlen++;
3061 }
3062
3063 /* Emit the instruction. Note that this maps to the normal SIMD8
3064 * untyped atomic message on Ivy Bridge, but that's OK because
3065 * unused channels will be masked out.
3066 */
3067 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3068 brw_message_reg(0),
3069 src_reg(surf_index), src_reg(atomic_op));
3070 inst->mlen = mlen;
3071 }
3072
3073 void
3074 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3075 src_reg offset)
3076 {
3077 /* Set the surface read offset. */
3078 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3079
3080 /* Emit the instruction. Note that this maps to the normal SIMD8
3081 * untyped surface read message, but that's OK because unused
3082 * channels will be masked out.
3083 */
3084 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3085 brw_message_reg(0),
3086 src_reg(surf_index), src_reg(1));
3087 inst->mlen = 1;
3088 }
3089
3090 void
3091 vec4_visitor::emit_ndc_computation()
3092 {
3093 /* Get the position */
3094 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3095
3096 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3097 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3098 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3099
3100 current_annotation = "NDC";
3101 dst_reg ndc_w = ndc;
3102 ndc_w.writemask = WRITEMASK_W;
3103 src_reg pos_w = pos;
3104 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3105 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3106
3107 dst_reg ndc_xyz = ndc;
3108 ndc_xyz.writemask = WRITEMASK_XYZ;
3109
3110 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3111 }
3112
3113 void
3114 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3115 {
3116 if (devinfo->gen < 6 &&
3117 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3118 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3119 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3120 dst_reg header1_w = header1;
3121 header1_w.writemask = WRITEMASK_W;
3122
3123 emit(MOV(header1, 0u));
3124
3125 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3126 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3127
3128 current_annotation = "Point size";
3129 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3130 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3131 }
3132
3133 if (key->userclip_active) {
3134 current_annotation = "Clipping flags";
3135 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3136 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3137
3138 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3139 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3140 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3141
3142 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3143 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3144 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3145 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3146 }
3147
3148 /* i965 clipping workaround:
3149 * 1) Test for -ve rhw
3150 * 2) If set,
3151 * set ndc = (0,0,0,0)
3152 * set ucp[6] = 1
3153 *
3154 * Later, clipping will detect ucp[6] and ensure the primitive is
3155 * clipped against all fixed planes.
3156 */
3157 if (devinfo->has_negative_rhw_bug) {
3158 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3159 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3160 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3161 vec4_instruction *inst;
3162 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3163 inst->predicate = BRW_PREDICATE_NORMAL;
3164 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3165 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3166 inst->predicate = BRW_PREDICATE_NORMAL;
3167 }
3168
3169 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3170 } else if (devinfo->gen < 6) {
3171 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3172 } else {
3173 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3174 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3175 dst_reg reg_w = reg;
3176 reg_w.writemask = WRITEMASK_W;
3177 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3178 reg_as_src.type = reg_w.type;
3179 reg_as_src.swizzle = brw_swizzle_for_size(1);
3180 emit(MOV(reg_w, reg_as_src));
3181 }
3182 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3183 dst_reg reg_y = reg;
3184 reg_y.writemask = WRITEMASK_Y;
3185 reg_y.type = BRW_REGISTER_TYPE_D;
3186 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3187 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3188 }
3189 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3190 dst_reg reg_z = reg;
3191 reg_z.writemask = WRITEMASK_Z;
3192 reg_z.type = BRW_REGISTER_TYPE_D;
3193 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3194 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3195 }
3196 }
3197 }
3198
3199 void
3200 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3201 {
3202 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3203 *
3204 * "If a linked set of shaders forming the vertex stage contains no
3205 * static write to gl_ClipVertex or gl_ClipDistance, but the
3206 * application has requested clipping against user clip planes through
3207 * the API, then the coordinate written to gl_Position is used for
3208 * comparison against the user clip planes."
3209 *
3210 * This function is only called if the shader didn't write to
3211 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3212 * if the user wrote to it; otherwise we use gl_Position.
3213 */
3214 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3215 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3216 clip_vertex = VARYING_SLOT_POS;
3217 }
3218
3219 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3220 ++i) {
3221 reg.writemask = 1 << i;
3222 emit(DP4(reg,
3223 src_reg(output_reg[clip_vertex]),
3224 src_reg(this->userplane[i + offset])));
3225 }
3226 }
3227
3228 vec4_instruction *
3229 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3230 {
3231 assert(varying < VARYING_SLOT_MAX);
3232 assert(output_reg[varying].type == reg.type);
3233 current_annotation = output_reg_annotation[varying];
3234 /* Copy the register, saturating if necessary */
3235 return emit(MOV(reg, src_reg(output_reg[varying])));
3236 }
3237
3238 void
3239 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3240 {
3241 reg.type = BRW_REGISTER_TYPE_F;
3242 output_reg[varying].type = reg.type;
3243
3244 switch (varying) {
3245 case VARYING_SLOT_PSIZ:
3246 {
3247 /* PSIZ is always in slot 0, and is coupled with other flags. */
3248 current_annotation = "indices, point width, clip flags";
3249 emit_psiz_and_flags(reg);
3250 break;
3251 }
3252 case BRW_VARYING_SLOT_NDC:
3253 current_annotation = "NDC";
3254 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3255 break;
3256 case VARYING_SLOT_POS:
3257 current_annotation = "gl_Position";
3258 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3259 break;
3260 case VARYING_SLOT_EDGE:
3261 /* This is present when doing unfilled polygons. We're supposed to copy
3262 * the edge flag from the user-provided vertex array
3263 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3264 * of that attribute (starts as 1.0f). This is then used in clipping to
3265 * determine which edges should be drawn as wireframe.
3266 */
3267 current_annotation = "edge flag";
3268 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3269 glsl_type::float_type, WRITEMASK_XYZW))));
3270 break;
3271 case BRW_VARYING_SLOT_PAD:
3272 /* No need to write to this slot */
3273 break;
3274 case VARYING_SLOT_COL0:
3275 case VARYING_SLOT_COL1:
3276 case VARYING_SLOT_BFC0:
3277 case VARYING_SLOT_BFC1: {
3278 /* These built-in varyings are only supported in compatibility mode,
3279 * and we only support GS in core profile. So, this must be a vertex
3280 * shader.
3281 */
3282 assert(stage == MESA_SHADER_VERTEX);
3283 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3284 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3285 inst->saturate = true;
3286 break;
3287 }
3288
3289 default:
3290 emit_generic_urb_slot(reg, varying);
3291 break;
3292 }
3293 }
3294
3295 static int
3296 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3297 {
3298 if (devinfo->gen >= 6) {
3299 /* URB data written (does not include the message header reg) must
3300 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3301 * section 5.4.3.2.2: URB_INTERLEAVED.
3302 *
3303 * URB entries are allocated on a multiple of 1024 bits, so an
3304 * extra 128 bits written here to make the end align to 256 is
3305 * no problem.
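 *
 * For example, an even mlen of 6 (header plus 5 data registers) is bumped
 * to 7 so that the data after the header stays a multiple of two registers.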
3306 */
3307 if ((mlen % 2) != 1)
3308 mlen++;
3309 }
3310
3311 return mlen;
3312 }
3313
3314
3315 /**
3316 * Generates the VUE payload plus the necessary URB write instructions to
3317 * output it.
3318 *
3319 * The VUE layout is documented in Volume 2a.
3320 */
3321 void
3322 vec4_visitor::emit_vertex()
3323 {
3324 /* MRF 0 is reserved for the debugger, so start with message header
3325 * in MRF 1.
3326 */
3327 int base_mrf = 1;
3328 int mrf = base_mrf;
3329 /* In the process of generating our URB write message contents, we
3330 * may need to unspill a register or load from an array. Those
3331 * reads would use MRFs 14-15.
3332 */
3333 int max_usable_mrf = 13;
3334
3335 /* The following assertion verifies that max_usable_mrf causes an
3336 * even-numbered amount of URB write data, which will meet gen6's
3337 * requirements for length alignment.
3338 */
3339 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3340
3341 /* First mrf is the g0-based message header containing URB handles and
3342 * such.
3343 */
3344 emit_urb_write_header(mrf++);
3345
3346 if (devinfo->gen < 6) {
3347 emit_ndc_computation();
3348 }
3349
3350 /* Lower legacy ff and ClipVertex clipping to clip distances */
3351 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3352 current_annotation = "user clip distances";
3353
3354 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3355 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3356
3357 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3358 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3359 }
3360
3361 /* We may need to split this up into several URB writes, so do them in a
3362 * loop.
3363 */
3364 int slot = 0;
3365 bool complete = false;
3366 do {
3367 /* URB offset is in URB row increments, and each of our MRFs is half of
3368 * one of those, since we're doing interleaved writes.
3369 */
3370 int offset = slot / 2;
3371
3372 mrf = base_mrf + 1;
3373 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3374 emit_urb_slot(dst_reg(MRF, mrf++),
3375 prog_data->vue_map.slot_to_varying[slot]);
3376
3377 /* If this was max_usable_mrf, we can't fit anything more into this
3378 * URB WRITE.
3379 */
3380 if (mrf > max_usable_mrf) {
3381 slot++;
3382 break;
3383 }
3384 }
3385
3386 complete = slot >= prog_data->vue_map.num_slots;
3387 current_annotation = "URB write";
3388 vec4_instruction *inst = emit_urb_write_opcode(complete);
3389 inst->base_mrf = base_mrf;
3390 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3391 inst->offset += offset;
3392 } while (!complete);
3393 }
3394
3395
3396 src_reg
3397 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3398 src_reg *reladdr, int reg_offset)
3399 {
3400 /* Because we store the values to scratch interleaved like our
3401 * vertex data, we need to scale the vec4 index by 2.
3402 */
3403 int message_header_scale = 2;
3404
3405 /* Pre-gen6, the message header uses byte offsets instead of vec4
3406 * (16-byte) offset units.
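 * For example, reg_offset 3 yields 6 on Gen6+ and 96 (bytes) on Gen4-5.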
3407 */
3408 if (devinfo->gen < 6)
3409 message_header_scale *= 16;
3410
3411 if (reladdr) {
3412 src_reg index = src_reg(this, glsl_type::int_type);
3413
3414 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3415 src_reg(reg_offset)));
3416 emit_before(block, inst, MUL(dst_reg(index), index,
3417 src_reg(message_header_scale)));
3418
3419 return index;
3420 } else {
3421 return src_reg(reg_offset * message_header_scale);
3422 }
3423 }
3424
3425 src_reg
3426 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3427 src_reg *reladdr, int reg_offset)
3428 {
3429 if (reladdr) {
3430 src_reg index = src_reg(this, glsl_type::int_type);
3431
3432 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3433 src_reg(reg_offset)));
3434
3435 /* Pre-gen6, the message header uses byte offsets instead of vec4
3436 * (16-byte) offset units.
3437 */
3438 if (devinfo->gen < 6) {
3439 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3440 }
3441
3442 return index;
3443 } else if (devinfo->gen >= 8) {
3444 /* Store the offset in a GRF so we can send-from-GRF. */
3445 src_reg offset = src_reg(this, glsl_type::int_type);
3446 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3447 return offset;
3448 } else {
3449 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3450 return src_reg(reg_offset * message_header_scale);
3451 }
3452 }
3453
3454 /**
3455 * Emits an instruction before @inst to load the value named by @orig_src
3456 * from scratch space at @base_offset to @temp.
3457 *
3458 * @base_offset is measured in 32-byte units (the size of a register).
3459 */
3460 void
3461 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3462 dst_reg temp, src_reg orig_src,
3463 int base_offset)
3464 {
3465 int reg_offset = base_offset + orig_src.reg_offset;
3466 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3467 reg_offset);
3468
3469 emit_before(block, inst, SCRATCH_READ(temp, index));
3470 }
3471
3472 /**
3473 * Emits an instruction after @inst to store the value to be written
3474 * to @orig_dst to scratch space at @base_offset, from @temp.
3475 *
3476 * @base_offset is measured in 32-byte units (the size of a register).
3477 */
3478 void
3479 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3480 int base_offset)
3481 {
3482 int reg_offset = base_offset + inst->dst.reg_offset;
3483 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3484 reg_offset);
3485
3486 /* Create a temporary register to store *inst's result in.
3487 *
3488 * We have to be careful in MOVing from our temporary result register in
3489 * the scratch write. If we swizzle from channels of the temporary that
3490 * weren't initialized, it will confuse live interval analysis, which will
3491 * make spilling fail to make progress.
3492 */
3493 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3494 inst->dst.type),
3495 brw_swizzle_for_mask(inst->dst.writemask));
3496 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3497 inst->dst.writemask));
3498 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3499 if (inst->opcode != BRW_OPCODE_SEL)
3500 write->predicate = inst->predicate;
3501 write->ir = inst->ir;
3502 write->annotation = inst->annotation;
3503 inst->insert_after(block, write);
3504
3505 inst->dst.file = temp.file;
3506 inst->dst.reg = temp.reg;
3507 inst->dst.reg_offset = temp.reg_offset;
3508 inst->dst.reladdr = NULL;
3509 }
3510
3511 /**
3512 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3513 * adds the scratch read(s) before \p inst. The function also checks for
3514 * recursive reladdr scratch accesses, issuing the corresponding scratch
3515 * loads and rewriting reladdr references accordingly.
3516 *
3517 * \return \p src if it did not require a scratch load, otherwise, the
3518 * register holding the result of the scratch load that the caller should
3519 * use to rewrite src.
3520 */
3521 src_reg
3522 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3523 vec4_instruction *inst, src_reg src)
3524 {
3525 /* Resolve recursive reladdr scratch access by calling ourselves
3526 * with src.reladdr
3527 */
3528 if (src.reladdr)
3529 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3530 *src.reladdr);
3531
3532 /* Now handle scratch access on src */
3533 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3534 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3535 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3536 src.reg = temp.reg;
3537 src.reg_offset = temp.reg_offset;
3538 src.reladdr = NULL;
3539 }
3540
3541 return src;
3542 }
3543
3544 /**
3545 * We can't generally support array access in GRF space, because a
3546 * single instruction's destination can only span 2 contiguous
3547 * registers. So, we send all GRF arrays that get variable index
3548 * access to scratch space.
3549 */
3550 void
3551 vec4_visitor::move_grf_array_access_to_scratch()
3552 {
3553 int scratch_loc[this->alloc.count];
3554 memset(scratch_loc, -1, sizeof(scratch_loc));
3555
3556 /* First, calculate the set of virtual GRFs that need to be punted
3557 * to scratch due to having any array access on them, and where in
3558 * scratch.
3559 */
3560 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3561 if (inst->dst.file == GRF && inst->dst.reladdr) {
3562 if (scratch_loc[inst->dst.reg] == -1) {
3563 scratch_loc[inst->dst.reg] = last_scratch;
3564 last_scratch += this->alloc.sizes[inst->dst.reg];
3565 }
3566
3567 for (src_reg *iter = inst->dst.reladdr;
3568 iter->reladdr;
3569 iter = iter->reladdr) {
3570 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3571 scratch_loc[iter->reg] = last_scratch;
3572 last_scratch += this->alloc.sizes[iter->reg];
3573 }
3574 }
3575 }
3576
3577 for (int i = 0; i < 3; i++) {
3578 for (src_reg *iter = &inst->src[i];
3579 iter->reladdr;
3580 iter = iter->reladdr) {
3581 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3582 scratch_loc[iter->reg] = last_scratch;
3583 last_scratch += this->alloc.sizes[iter->reg];
3584 }
3585 }
3586 }
3587 }
3588
3589 /* Now, for anything that will be accessed through scratch, rewrite
3590 * it to load/store. Note that this is a _safe list walk, because
3591 * we may generate a new scratch_write instruction after the one
3592 * we're processing.
3593 */
3594 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3595 /* Set up the annotation tracking for new generated instructions. */
3596 base_ir = inst->ir;
3597 current_annotation = inst->annotation;
3598
3599 /* First handle scratch access on the dst. Notice we have to handle
3600 * the case where the dst's reladdr also points to scratch space.
3601 */
3602 if (inst->dst.reladdr)
3603 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3604 *inst->dst.reladdr);
3605
3606 /* Now that we have handled any (possibly recursive) reladdr scratch
3607 * accesses for dst we can safely do the scratch write for dst itself
3608 */
3609 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3610 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3611
3612 /* Now handle scratch access on any src. In this case, since inst->src[i]
3613 * already is a src_reg, we can just call emit_resolve_reladdr with
3614 * inst->src[i] and it will take care of handling scratch loads for
3615 * both src and src.reladdr (recursively).
3616 */
3617 for (int i = 0; i < 3; i++) {
3618 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3619 inst->src[i]);
3620 }
3621 }
3622 }
3623
3624 /**
3625 * Emits an instruction before @inst to load the value named by @orig_src
3626 * from the pull constant buffer (surface) at @base_offset to @temp.
3627 */
3628 void
3629 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3630 dst_reg temp, src_reg orig_src,
3631 int base_offset)
3632 {
3633 int reg_offset = base_offset + orig_src.reg_offset;
3634 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3635 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3636 reg_offset);
3637
3638 emit_pull_constant_load_reg(temp,
3639 index,
3640 offset,
3641 block, inst);
3642 }
3643
3644 /**
3645 * Implements array access of uniforms by inserting a
3646 * PULL_CONSTANT_LOAD instruction.
3647 *
3648 * Unlike temporary GRF array access (which we don't support due to the
3649 * difficulty of doing relative addressing on instruction
3650 * destinations), we could potentially do array access of uniforms
3651 * that were loaded in GRF space as push constants. In real-world
3652 * usage we've seen, though, the arrays being used are always larger
3653 * than we could load as push constants, so just always move all
3654 * uniform array access out to a pull constant buffer.
3655 */
3656 void
3657 vec4_visitor::move_uniform_array_access_to_pull_constants()
3658 {
3659 int pull_constant_loc[this->uniforms];
3660 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3661 bool nested_reladdr;
3662
3663 /* Walk through and find array access of uniforms. Put a copy of that
3664 * uniform in the pull constant buffer.
3665 *
3666 * Note that we don't move constant-indexed array accesses; the
3667 * performance impact of this choice has not been measured.
3668 */
3669 do {
3670 nested_reladdr = false;
3671
3672 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3673 for (int i = 0; i < 3; i++) {
3674 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3675 continue;
3676
3677 int uniform = inst->src[i].reg;
3678
3679 if (inst->src[i].reladdr->reladdr)
3680 nested_reladdr = true; /* will need another pass */
3681
3682 /* If this array isn't already present in the pull constant buffer,
3683 * add it.
3684 */
3685 if (pull_constant_loc[uniform] == -1) {
3686 const gl_constant_value **values =
3687 &stage_prog_data->param[uniform * 4];
3688
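/* Editor's note (added comment): nr_pull_params counts scalar components,
 * so dividing by 4 gives the vec4 slot where this array's copy will start
 * in the pull constant buffer.
 */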
3689 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3690
3691 assert(uniform < uniform_array_size);
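/* Editor's note (added comment): append every scalar component of the
 * array (uniform_size is in vec4 units) to the pull parameter list.
 */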
3692 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3693 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3694 = values[j];
3695 }
3696 }
3697
3698 /* Set up the annotation tracking for newly generated instructions. */
3699 base_ir = inst->ir;
3700 current_annotation = inst->annotation;
3701
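/* Editor's note (added comment): load the indirectly addressed vec4 into a
 * fresh temporary, then rewrite this source to read the temporary and drop
 * the reladdr.
 */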
3702 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3703
3704 emit_pull_constant_load(block, inst, temp, inst->src[i],
3705 pull_constant_loc[uniform]);
3706
3707 inst->src[i].file = temp.file;
3708 inst->src[i].reg = temp.reg;
3709 inst->src[i].reg_offset = temp.reg_offset;
3710 inst->src[i].reladdr = NULL;
3711 }
3712 }
3713 } while (nested_reladdr);
3714
3715 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3716 * there is no need to track uniforms as larger-than-vec4 objects.
3717 * This will be relied on when cutting unused uniform vectors out of
3718 * the push constants.
3719 */
3720 split_uniform_registers();
3721 }
3722
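/* Editor's note (added comment): if @reg is an unsigned (UD) source with the
 * negate modifier set, replace it with a temporary holding the negated value.
 * The MOV applies the pending negation, so later uses don't carry a negate
 * modifier on a UD operand (which the hardware doesn't handle the way we
 * want here).
 */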
3723 void
3724 vec4_visitor::resolve_ud_negate(src_reg *reg)
3725 {
3726 if (reg->type != BRW_REGISTER_TYPE_UD ||
3727 !reg->negate)
3728 return;
3729
3730 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3731 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3732 *reg = temp;
3733 }
3734
3735 /**
3736 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3737 *
3738 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3739 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3740 */
3741 void
3742 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3743 {
3744 assert(devinfo->gen <= 5);
3745
3746 if (!rvalue->type->is_boolean())
3747 return;
3748
3749 src_reg and_result = src_reg(this, rvalue->type);
3750 src_reg neg_result = src_reg(this, rvalue->type);
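/* Editor's note (added comment): keep only the defined LSB, then
 * integer-negate it so 1 becomes ~0 (0xffffffff) and 0 stays 0, matching
 * the 0/~0 boolean convention.
 */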
3751 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3752 emit(MOV(dst_reg(neg_result), negate(and_result)));
3753 *reg = neg_result;
3754 }
3755
3756 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3757 void *log_data,
3758 struct gl_program *prog,
3759 const struct brw_vue_prog_key *key,
3760 struct brw_vue_prog_data *prog_data,
3761 struct gl_shader_program *shader_prog,
3762 gl_shader_stage stage,
3763 void *mem_ctx,
3764 bool no_spills,
3765 int shader_time_index)
3766 : backend_shader(compiler, log_data, mem_ctx,
3767 shader_prog, prog, &prog_data->base, stage),
3768 key(key),
3769 prog_data(prog_data),
3770 sanity_param_count(0),
3771 fail_msg(NULL),
3772 first_non_payload_grf(0),
3773 need_all_constants_in_pull_buffer(false),
3774 no_spills(no_spills),
3775 shader_time_index(shader_time_index),
3776 last_scratch(0)
3777 {
3778 this->failed = false;
3779
3780 this->base_ir = NULL;
3781 this->current_annotation = NULL;
3782 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3783
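/* Editor's note (added comment): maps ir_variable pointers to the registers
 * allocated for their values.
 */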
3784 this->variable_ht = hash_table_ctor(0,
3785 hash_table_pointer_hash,
3786 hash_table_pointer_compare);
3787
3788 this->virtual_grf_start = NULL;
3789 this->virtual_grf_end = NULL;
3790 this->live_intervals = NULL;
3791
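/* Editor's note (added comment): Gen7+ has no separate MRF file; message
 * payloads are built in the top GRFs starting at GEN7_MRF_HACK_START, so
 * don't let the compiler allocate GRFs at or above that point.
 */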
3792 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3793
3794 this->uniforms = 0;
3795
3796 /* Initialize uniform_array_size to at least 1 because the pre-gen6 VS
3797 * requires at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3798 */
3799 this->uniform_array_size = 1;
3800 if (prog_data) {
3801 this->uniform_array_size =
3802 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3803 }
3804
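/* Editor's note (added comment): per-uniform bookkeeping, sized for the
 * maximum number of uniform vec4s: uniform_size tracks each uniform's size
 * in vec4 slots and uniform_vector_size the number of components used.
 */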
3805 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3806 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3807 }
3808
3809 vec4_visitor::~vec4_visitor()
3810 {
3811 hash_table_dtor(this->variable_ht);
3812 }
3813
3814
3815 void
3816 vec4_visitor::fail(const char *format, ...)
3817 {
3818 va_list va;
3819 char *msg;
3820
3821 if (failed)
3822 return;
3823
3824 failed = true;
3825
3826 va_start(va, format);
3827 msg = ralloc_vasprintf(mem_ctx, format, va);
3828 va_end(va);
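/* Editor's note (added comment): prefix the message with the shader stage
 * abbreviation so it's clear which compile failed.
 */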
3829 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3830
3831 this->fail_msg = msg;
3832
3833 if (debug_enabled) {
3834 fprintf(stderr, "%s", msg);
3835 }
3836 }
3837
3838 } /* namespace brw */