src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
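/* The ALU1/ALU2/ALU2_ACC/ALU3 helpers below only construct an instruction;
 * the caller still has to hand it to emit() or emit_before() to append it to
 * the instruction stream, e.g. emit(MUL(dst, src0, src1)).
 */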
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
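   /* base_mrf/mlen describe the MRF payload of the gen4 scratch read
    * message: presumably a message header plus the per-slot offsets, which
    * matches mlen == 2.
    */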
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
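   /* elements must be 2, 3 or 4; it selects DP2, DP3 or DP4 respectively. */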
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(const src_reg &src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
317 {
318 if (!src.abs && !src.negate)
319 return src;
320
321 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
322 resolved.type = src.type;
323 emit(MOV(resolved, src));
324
325 return src_reg(resolved);
326 }
327
328 src_reg
329 vec4_visitor::fix_math_operand(const src_reg &src)
330 {
331 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
332 return src;
333
334 /* The gen6 math instruction ignores the source modifiers --
335 * swizzle, abs, negate, and at least some parts of the register
336 * region description.
337 *
338 * Rather than trying to enumerate all these cases, *always* expand the
339 * operand to a temp GRF for gen6.
340 *
341 * For gen7, keep the operand as-is, except if immediate, which gen7 still
342 * can't use.
343 */
344
345 if (devinfo->gen == 7 && src.file != IMM)
346 return src;
347
348 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
349 expanded.type = src.type;
350 emit(MOV(expanded, src));
351 return src_reg(expanded);
352 }
353
354 vec4_instruction *
355 vec4_visitor::emit_math(enum opcode opcode,
356 const dst_reg &dst,
357 const src_reg &src0, const src_reg &src1)
358 {
359 vec4_instruction *math =
360 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
361
362 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
363 /* MATH on Gen6 must be align1, so we can't do writemasks. */
364 math->dst = dst_reg(this, glsl_type::vec4_type);
365 math->dst.type = dst.type;
366 math = emit(MOV(dst, src_reg(math->dst)));
367 } else if (devinfo->gen < 6) {
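      /* Pre-gen6 math is a send to the shared math unit, so the operands are
       * passed through an MRF payload starting at MRF 1, one register per
       * operand that is present.
       */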
368 math->base_mrf = 1;
369 math->mlen = src1.file == BAD_FILE ? 1 : 2;
370 }
371
372 return math;
373 }
374
375 void
376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
377 {
378 if (devinfo->gen < 7) {
379 unreachable("ir_unop_pack_half_2x16 should be lowered");
380 }
381
382 assert(dst.type == BRW_REGISTER_TYPE_UD);
383 assert(src0.type == BRW_REGISTER_TYPE_F);
384
385 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
386 *
387 * Because this instruction does not have a 16-bit floating-point type,
388 * the destination data type must be Word (W).
389 *
390 * The destination must be DWord-aligned and specify a horizontal stride
391 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
392 * each destination channel and the upper word is not modified.
393 *
394 * The above restriction implies that the f32to16 instruction must use
395 * align1 mode, because only in align1 mode is it possible to specify
396 * horizontal stride. We choose here to defy the hardware docs and emit
397 * align16 instructions.
398 *
399 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
400 * instructions. I was partially successful in that the code passed all
401 * tests. However, the code was dubiously correct and fragile, and the
402 * tests were not harsh enough to probe that frailty. Not trusting the
403 * code, I chose instead to remain in align16 mode in defiance of the hw
404 * docs).
405 *
406 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
407 * simulator, emitting a f32to16 in align16 mode with UD as destination
408 * data type is safe. The behavior differs from that specified in the PRM
409 * in that the upper word of each destination channel is cleared to 0.
410 */
411
412 dst_reg tmp_dst(this, glsl_type::uvec2_type);
413 src_reg tmp_src(tmp_dst);
414
415 #if 0
416 /* Verify the undocumented behavior on which the following instructions
417 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
418 * then the result of the bit-or instruction below will be incorrect.
419 *
420 * You should inspect the disasm output in order to verify that the MOV is
421 * not optimized away.
422 */
423 emit(MOV(tmp_dst, src_reg(0x12345678u)));
424 #endif
425
426 /* Give tmp the form below, where "." means untouched.
427 *
428 * w z y x w z y x
429 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
430 *
431 * That the upper word of each write-channel be 0 is required for the
432 * following bit-shift and bit-or instructions to work. Note that this
433 * relies on the undocumented hardware behavior mentioned above.
434 */
435 tmp_dst.writemask = WRITEMASK_XY;
436 emit(F32TO16(tmp_dst, src0));
437
438 /* Give the write-channels of dst the form:
439 * 0xhhhh0000
440 */
441 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
442 emit(SHL(dst, tmp_src, src_reg(16u)));
443
444 /* Finally, give the write-channels of dst the form of packHalf2x16's
445 * output:
446 * 0xhhhhllll
447 */
448 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
449 emit(OR(dst, src_reg(dst), tmp_src));
450 }
451
452 void
453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
454 {
455 if (devinfo->gen < 7) {
456 unreachable("ir_unop_unpack_half_2x16 should be lowered");
457 }
458
459 assert(dst.type == BRW_REGISTER_TYPE_F);
460 assert(src0.type == BRW_REGISTER_TYPE_UD);
461
462 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
463 *
464 * Because this instruction does not have a 16-bit floating-point type,
465 * the source data type must be Word (W). The destination type must be
466 * F (Float).
467 *
468 * To use W as the source data type, we must adjust horizontal strides,
469 * which is only possible in align1 mode. All my [chadv] attempts at
470 * emitting align1 instructions for unpackHalf2x16 failed to pass the
471 * Piglit tests, so I gave up.
472 *
473 * I've verified that, on gen7 hardware and the simulator, it is safe to
474 * emit f16to32 in align16 mode with UD as source data type.
475 */
476
477 dst_reg tmp_dst(this, glsl_type::uvec2_type);
478 src_reg tmp_src(tmp_dst);
479
480 tmp_dst.writemask = WRITEMASK_X;
481 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
482
483 tmp_dst.writemask = WRITEMASK_Y;
484 emit(SHR(tmp_dst, src0, src_reg(16u)));
485
486 dst.writemask = WRITEMASK_XY;
487 emit(F16TO32(dst, tmp_src));
488 }
489
490 void
491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
492 {
493 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
494 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
495 * is not suitable to generate the shift values, but we can use the packed
496 * vector float and a type-converting MOV.
497 */
498 dst_reg shift(this, glsl_type::uvec4_type);
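   /* 0x00, 0x60, 0x70 and 0x78 are the VF encodings of 0.0, 8.0, 16.0 and
    * 24.0; the type-converting MOV into the UD register gives the integer
    * shift counts <0, 8, 16, 24>.
    */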
499 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
500
501 dst_reg shifted(this, glsl_type::uvec4_type);
502 src0.swizzle = BRW_SWIZZLE_XXXX;
503 emit(SHR(shifted, src0, src_reg(shift)));
504
505 shifted.type = BRW_REGISTER_TYPE_UB;
506 dst_reg f(this, glsl_type::vec4_type);
507 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
508
509 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
510 }
511
512 void
513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
514 {
515 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
516 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
517 * is not suitable to generate the shift values, but we can use the packed
518 * vector float and a type-converting MOV.
519 */
520 dst_reg shift(this, glsl_type::uvec4_type);
521 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
522
523 dst_reg shifted(this, glsl_type::uvec4_type);
524 src0.swizzle = BRW_SWIZZLE_XXXX;
525 emit(SHR(shifted, src0, src_reg(shift)));
526
527 shifted.type = BRW_REGISTER_TYPE_B;
528 dst_reg f(this, glsl_type::vec4_type);
529 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
533
534 dst_reg max(this, glsl_type::vec4_type);
535 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
536 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
537 }
538
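/* packUnorm4x8(): saturate each component to [0, 1], scale by 255, round to
 * nearest even and pack the four resulting bytes into a single UD.
 */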
539 void
540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
541 {
542 dst_reg saturated(this, glsl_type::vec4_type);
543 vec4_instruction *inst = emit(MOV(saturated, src0));
544 inst->saturate = true;
545
546 dst_reg scaled(this, glsl_type::vec4_type);
547 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
548
549 dst_reg rounded(this, glsl_type::vec4_type);
550 emit(RNDE(rounded, src_reg(scaled)));
551
552 dst_reg u(this, glsl_type::uvec4_type);
553 emit(MOV(u, src_reg(rounded)));
554
555 src_reg bytes(u);
556 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
557 }
558
559 void
560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
561 {
562 dst_reg max(this, glsl_type::vec4_type);
563 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
564
565 dst_reg min(this, glsl_type::vec4_type);
566 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
567
568 dst_reg scaled(this, glsl_type::vec4_type);
569 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
570
571 dst_reg rounded(this, glsl_type::vec4_type);
572 emit(RNDE(rounded, src_reg(scaled)));
573
574 dst_reg i(this, glsl_type::ivec4_type);
575 emit(MOV(i, src_reg(rounded)));
576
577 src_reg bytes(i);
578 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
579 }
580
581 void
582 vec4_visitor::visit_instructions(const exec_list *list)
583 {
584 foreach_in_list(ir_instruction, ir, list) {
585 base_ir = ir;
586 ir->accept(this);
587 }
588 }
589
590 /**
591 * Returns the minimum number of vec4 elements needed to pack a type.
592 *
593 * For simple types, it will return 1 (a single vec4); for matrices, the
594  * number of columns; for arrays and structs, the sum of the vec4 sizes of
595  * their elements; and for samplers and atomics, zero.
596 *
597 * This method is useful to calculate how much register space is needed to
598 * store a particular type.
599 */
600 int
601 vec4_visitor::type_size(const struct glsl_type *type)
602 {
603 unsigned int i;
604 int size;
605
606 switch (type->base_type) {
607 case GLSL_TYPE_UINT:
608 case GLSL_TYPE_INT:
609 case GLSL_TYPE_FLOAT:
610 case GLSL_TYPE_BOOL:
611 if (type->is_matrix()) {
612 return type->matrix_columns;
613 } else {
614          /* Regardless of the size of the vector, it gets a vec4. This is bad
615 * packing for things like floats, but otherwise arrays become a
616 * mess. Hopefully a later pass over the code can pack scalars
617 * down if appropriate.
618 */
619 return 1;
620 }
621 case GLSL_TYPE_ARRAY:
622 assert(type->length > 0);
623 return type_size(type->fields.array) * type->length;
624 case GLSL_TYPE_STRUCT:
625 size = 0;
626 for (i = 0; i < type->length; i++) {
627 size += type_size(type->fields.structure[i].type);
628 }
629 return size;
630 case GLSL_TYPE_SUBROUTINE:
631 return 1;
632
633 case GLSL_TYPE_SAMPLER:
634 /* Samplers take up no register space, since they're baked in at
635 * link time.
636 */
637 return 0;
638 case GLSL_TYPE_ATOMIC_UINT:
639 return 0;
640 case GLSL_TYPE_IMAGE:
641 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
642 case GLSL_TYPE_VOID:
643 case GLSL_TYPE_DOUBLE:
644 case GLSL_TYPE_ERROR:
645 case GLSL_TYPE_INTERFACE:
646 case GLSL_TYPE_FUNCTION:
647 unreachable("not reached");
648 }
649
650 return 0;
651 }
652
653 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
654 {
655 init();
656
657 this->file = GRF;
658 this->reg = v->alloc.allocate(v->type_size(type));
659
660 if (type->is_array() || type->is_record()) {
661 this->swizzle = BRW_SWIZZLE_NOOP;
662 } else {
663 this->swizzle = brw_swizzle_for_size(type->vector_elements);
664 }
665
666 this->type = brw_type_for_base_type(type);
667 }
668
669 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
670 {
671 assert(size > 0);
672
673 init();
674
675 this->file = GRF;
676 this->reg = v->alloc.allocate(v->type_size(type) * size);
677
678 this->swizzle = BRW_SWIZZLE_NOOP;
679
680 this->type = brw_type_for_base_type(type);
681 }
682
683 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
684 {
685 init();
686
687 this->file = GRF;
688 this->reg = v->alloc.allocate(v->type_size(type));
689
690 if (type->is_array() || type->is_record()) {
691 this->writemask = WRITEMASK_XYZW;
692 } else {
693 this->writemask = (1 << type->vector_elements) - 1;
694 }
695
696 this->type = brw_type_for_base_type(type);
697 }
698
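/* Append one vec4's worth of uniform slots: the first n components point at
 * the supplied constant values and the remaining components are padded with
 * zero.
 */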
699 void
700 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
701 unsigned n)
702 {
703 static const gl_constant_value zero = { 0 };
704
705 for (unsigned i = 0; i < n; ++i)
706 stage_prog_data->param[4 * uniforms + i] = &values[i];
707
708 for (unsigned i = n; i < 4; ++i)
709 stage_prog_data->param[4 * uniforms + i] = &zero;
710
711 uniform_vector_size[uniforms++] = n;
712 }
713
714 /* Our support for uniforms is piggy-backed on the struct
715 * gl_fragment_program, because that's where the values actually
716 * get stored, rather than in some global gl_shader_program uniform
717 * store.
718 */
719 void
720 vec4_visitor::setup_uniform_values(ir_variable *ir)
721 {
722 int namelen = strlen(ir->name);
723
724 /* The data for our (non-builtin) uniforms is stored in a series of
725 * gl_uniform_driver_storage structs for each subcomponent that
726 * glGetUniformLocation() could name. We know it's been set up in the same
727 * order we'd walk the type, so walk the list of storage and find anything
728 * with our name, or the prefix of a component that starts with our name.
729 */
730 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
731 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
732
733 if (storage->builtin)
734 continue;
735
736 if (strncmp(ir->name, storage->name, namelen) != 0 ||
737 (storage->name[namelen] != 0 &&
738 storage->name[namelen] != '.' &&
739 storage->name[namelen] != '[')) {
740 continue;
741 }
742
743 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
744 storage->type->matrix_columns);
745 const unsigned vector_size = storage->type->vector_elements;
746
747 for (unsigned s = 0; s < vector_count; s++)
748 setup_vector_uniform_values(&storage->storage[s * vector_size],
749 vector_size);
750 }
751 }
752
753 void
754 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
755 {
756 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
757 assert(this->uniforms < uniform_array_size);
758 this->uniform_vector_size[this->uniforms] = 4;
759 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
760 this->userplane[i].type = BRW_REGISTER_TYPE_F;
761 for (int j = 0; j < 4; ++j) {
762 stage_prog_data->param[this->uniforms * 4 + j] =
763 (gl_constant_value *) &clip_planes[i][j];
764 }
765 ++this->uniforms;
766 }
767 }
768
769 /* Our support for builtin uniforms is even scarier than non-builtin.
770 * It sits on top of the PROG_STATE_VAR parameters that are
771 * automatically updated from GL context state.
772 */
773 void
774 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
775 {
776 const ir_state_slot *const slots = ir->get_state_slots();
777 assert(slots != NULL);
778
779 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
780       /* This state reference has already been set up by ir_to_mesa,
781 * but we'll get the same index back here. We can reference
782 * ParameterValues directly, since unlike brw_fs.cpp, we never
783 * add new state references during compile.
784 */
785 int index = _mesa_add_state_reference(this->prog->Parameters,
786 (gl_state_index *)slots[i].tokens);
787 gl_constant_value *values =
788 &this->prog->Parameters->ParameterValues[index][0];
789
790 assert(this->uniforms < uniform_array_size);
791
792 for (unsigned j = 0; j < 4; j++)
793 stage_prog_data->param[this->uniforms * 4 + j] =
794 &values[GET_SWZ(slots[i].swizzle, j)];
795
796 this->uniform_vector_size[this->uniforms] =
797 (ir->type->is_scalar() || ir->type->is_vector() ||
798 ir->type->is_matrix() ? ir->type->vector_elements : 4);
799
800 this->uniforms++;
801 }
802 }
803
804 dst_reg *
805 vec4_visitor::variable_storage(ir_variable *var)
806 {
807 return (dst_reg *)hash_table_find(this->variable_ht, var);
808 }
809
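/* Emit the instruction(s) that load the flag register from a boolean rvalue,
 * and report through *predicate which predicate the caller should use when
 * consuming the result.
 */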
810 void
811 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
812 enum brw_predicate *predicate)
813 {
814 ir_expression *expr = ir->as_expression();
815
816 *predicate = BRW_PREDICATE_NORMAL;
817
818 if (expr && expr->operation != ir_binop_ubo_load) {
819 src_reg op[3];
820 vec4_instruction *inst;
821
822 assert(expr->get_num_operands() <= 3);
823 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
824 expr->operands[i]->accept(this);
825 op[i] = this->result;
826
827 resolve_ud_negate(&op[i]);
828 }
829
830 switch (expr->operation) {
831 case ir_unop_logic_not:
832 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
833 inst->conditional_mod = BRW_CONDITIONAL_Z;
834 break;
835
836 case ir_binop_logic_xor:
837 if (devinfo->gen <= 5) {
838 src_reg temp = src_reg(this, ir->type);
839 emit(XOR(dst_reg(temp), op[0], op[1]));
840 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
841 } else {
842 inst = emit(XOR(dst_null_d(), op[0], op[1]));
843 }
844 inst->conditional_mod = BRW_CONDITIONAL_NZ;
845 break;
846
847 case ir_binop_logic_or:
848 if (devinfo->gen <= 5) {
849 src_reg temp = src_reg(this, ir->type);
850 emit(OR(dst_reg(temp), op[0], op[1]));
851 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
852 } else {
853 inst = emit(OR(dst_null_d(), op[0], op[1]));
854 }
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 break;
857
858 case ir_binop_logic_and:
859 if (devinfo->gen <= 5) {
860 src_reg temp = src_reg(this, ir->type);
861 emit(AND(dst_reg(temp), op[0], op[1]));
862 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
863 } else {
864 inst = emit(AND(dst_null_d(), op[0], op[1]));
865 }
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 break;
868
869 case ir_unop_f2b:
870 if (devinfo->gen >= 6) {
871 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
872 } else {
873 inst = emit(MOV(dst_null_f(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875 }
876 break;
877
878 case ir_unop_i2b:
879 if (devinfo->gen >= 6) {
880 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
881 } else {
882 inst = emit(MOV(dst_null_d(), op[0]));
883 inst->conditional_mod = BRW_CONDITIONAL_NZ;
884 }
885 break;
886
887 case ir_binop_all_equal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
893 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
894 break;
895
896 case ir_binop_any_nequal:
897 if (devinfo->gen <= 5) {
898 resolve_bool_comparison(expr->operands[0], &op[0]);
899 resolve_bool_comparison(expr->operands[1], &op[1]);
900 }
901 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
902 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
903 break;
904
905 case ir_unop_any:
906 if (devinfo->gen <= 5) {
907 resolve_bool_comparison(expr->operands[0], &op[0]);
908 }
909 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
910 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
911 break;
912
913 case ir_binop_greater:
914 case ir_binop_gequal:
915 case ir_binop_less:
916 case ir_binop_lequal:
917 case ir_binop_equal:
918 case ir_binop_nequal:
919 if (devinfo->gen <= 5) {
920 resolve_bool_comparison(expr->operands[0], &op[0]);
921 resolve_bool_comparison(expr->operands[1], &op[1]);
922 }
923 emit(CMP(dst_null_d(), op[0], op[1],
924 brw_conditional_for_comparison(expr->operation)));
925 break;
926
927 case ir_triop_csel: {
928 /* Expand the boolean condition into the flag register. */
929 inst = emit(MOV(dst_null_d(), op[0]));
930 inst->conditional_mod = BRW_CONDITIONAL_NZ;
931
932 /* Select which boolean to return. */
933 dst_reg temp(this, expr->operands[1]->type);
934 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
935 inst->predicate = BRW_PREDICATE_NORMAL;
936
937 /* Expand the result to a condition code. */
938 inst = emit(MOV(dst_null_d(), src_reg(temp)));
939 inst->conditional_mod = BRW_CONDITIONAL_NZ;
940 break;
941 }
942
943 default:
944 unreachable("not reached");
945 }
946 return;
947 }
948
949 ir->accept(this);
950
951 resolve_ud_negate(&this->result);
952
953 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
954 inst->conditional_mod = BRW_CONDITIONAL_NZ;
955 }
956
957 /**
958 * Emit a gen6 IF statement with the comparison folded into the IF
959 * instruction.
960 */
961 void
962 vec4_visitor::emit_if_gen6(ir_if *ir)
963 {
964 ir_expression *expr = ir->condition->as_expression();
965
966 if (expr && expr->operation != ir_binop_ubo_load) {
967 src_reg op[3];
968 dst_reg temp;
969
970 assert(expr->get_num_operands() <= 3);
971 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
972 expr->operands[i]->accept(this);
973 op[i] = this->result;
974 }
975
976 switch (expr->operation) {
977 case ir_unop_logic_not:
978 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
979 return;
980
981 case ir_binop_logic_xor:
982 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
983 return;
984
985 case ir_binop_logic_or:
986 temp = dst_reg(this, glsl_type::bool_type);
987 emit(OR(temp, op[0], op[1]));
988 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
989 return;
990
991 case ir_binop_logic_and:
992 temp = dst_reg(this, glsl_type::bool_type);
993 emit(AND(temp, op[0], op[1]));
994 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
995 return;
996
997 case ir_unop_f2b:
998 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 return;
1000
1001 case ir_unop_i2b:
1002 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1003 return;
1004
1005 case ir_binop_greater:
1006 case ir_binop_gequal:
1007 case ir_binop_less:
1008 case ir_binop_lequal:
1009 case ir_binop_equal:
1010 case ir_binop_nequal:
1011 emit(IF(op[0], op[1],
1012 brw_conditional_for_comparison(expr->operation)));
1013 return;
1014
1015 case ir_binop_all_equal:
1016 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1017 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1018 return;
1019
1020 case ir_binop_any_nequal:
1021 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1022 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1023 return;
1024
1025 case ir_unop_any:
1026 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1027 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1028 return;
1029
1030 case ir_triop_csel: {
1031 /* Expand the boolean condition into the flag register. */
1032 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1033 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1034
1035 /* Select which boolean to return. */
1036 dst_reg temp(this, expr->operands[1]->type);
1037 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1038 inst->predicate = BRW_PREDICATE_NORMAL;
1039
1040 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1041 return;
1042 }
1043
1044 default:
1045 unreachable("not reached");
1046 }
1047 return;
1048 }
1049
1050 ir->condition->accept(this);
1051
1052 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1053 }
1054
1055 void
1056 vec4_visitor::visit(ir_variable *ir)
1057 {
1058 dst_reg *reg = NULL;
1059
1060 if (variable_storage(ir))
1061 return;
1062
1063 switch (ir->data.mode) {
1064 case ir_var_shader_in:
1065 assert(ir->data.location != -1);
1066 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1067 break;
1068
1069 case ir_var_shader_out:
1070 assert(ir->data.location != -1);
1071 reg = new(mem_ctx) dst_reg(this, ir->type);
1072
1073 for (int i = 0; i < type_size(ir->type); i++) {
1074 output_reg[ir->data.location + i] = *reg;
1075 output_reg[ir->data.location + i].reg_offset = i;
1076 output_reg_annotation[ir->data.location + i] = ir->name;
1077 }
1078 break;
1079
1080 case ir_var_auto:
1081 case ir_var_temporary:
1082 reg = new(mem_ctx) dst_reg(this, ir->type);
1083 break;
1084
1085 case ir_var_uniform:
1086 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1087
1088 /* Thanks to the lower_ubo_reference pass, we will see only
1089 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1090 * variables, so no need for them to be in variable_ht.
1091 *
1092 * Some uniforms, such as samplers and atomic counters, have no actual
1093 * storage, so we should ignore them.
1094 */
1095 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1096 return;
1097
1098 /* Track how big the whole uniform variable is, in case we need to put a
1099 * copy of its data into pull constants for array access.
1100 */
1101 assert(this->uniforms < uniform_array_size);
1102 this->uniform_size[this->uniforms] = type_size(ir->type);
1103
1104 if (!strncmp(ir->name, "gl_", 3)) {
1105 setup_builtin_uniform_values(ir);
1106 } else {
1107 setup_uniform_values(ir);
1108 }
1109 break;
1110
1111 case ir_var_system_value:
1112 reg = make_reg_for_system_value(ir->data.location, ir->type);
1113 break;
1114
1115 default:
1116 unreachable("not reached");
1117 }
1118
1119 reg->type = brw_type_for_base_type(ir->type);
1120 hash_table_insert(this->variable_ht, reg, ir);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop *ir)
1125 {
1126 /* We don't want debugging output to print the whole body of the
1127 * loop as the annotation.
1128 */
1129 this->base_ir = NULL;
1130
1131 emit(BRW_OPCODE_DO);
1132
1133 visit_instructions(&ir->body_instructions);
1134
1135 emit(BRW_OPCODE_WHILE);
1136 }
1137
1138 void
1139 vec4_visitor::visit(ir_loop_jump *ir)
1140 {
1141 switch (ir->mode) {
1142 case ir_loop_jump::jump_break:
1143 emit(BRW_OPCODE_BREAK);
1144 break;
1145 case ir_loop_jump::jump_continue:
1146 emit(BRW_OPCODE_CONTINUE);
1147 break;
1148 }
1149 }
1150
1151
1152 void
1153 vec4_visitor::visit(ir_function_signature *)
1154 {
1155 unreachable("not reached");
1156 }
1157
1158 void
1159 vec4_visitor::visit(ir_function *ir)
1160 {
1161 /* Ignore function bodies other than main() -- we shouldn't see calls to
1162 * them since they should all be inlined.
1163 */
1164 if (strcmp(ir->name, "main") == 0) {
1165 const ir_function_signature *sig;
1166 exec_list empty;
1167
1168 sig = ir->matching_signature(NULL, &empty, false);
1169
1170 assert(sig);
1171
1172 visit_instructions(&sig->body);
1173 }
1174 }
1175
1176 bool
1177 vec4_visitor::try_emit_mad(ir_expression *ir)
1178 {
1179 /* 3-src instructions were introduced in gen6. */
1180 if (devinfo->gen < 6)
1181 return false;
1182
1183 /* MAD can only handle floating-point data. */
1184 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1185 return false;
1186
1187 ir_rvalue *nonmul;
1188 ir_expression *mul;
1189 bool mul_negate, mul_abs;
1190
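   /* Look for an operand of the add that is a multiply (possibly wrapped in
    * a negate or abs) that we can fold into a single MAD.
    */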
1191 for (int i = 0; i < 2; i++) {
1192 mul_negate = false;
1193 mul_abs = false;
1194
1195 mul = ir->operands[i]->as_expression();
1196 nonmul = ir->operands[1 - i];
1197
1198 if (mul && mul->operation == ir_unop_abs) {
1199 mul = mul->operands[0]->as_expression();
1200 mul_abs = true;
1201 } else if (mul && mul->operation == ir_unop_neg) {
1202 mul = mul->operands[0]->as_expression();
1203 mul_negate = true;
1204 }
1205
1206 if (mul && mul->operation == ir_binop_mul)
1207 break;
1208 }
1209
1210 if (!mul || mul->operation != ir_binop_mul)
1211 return false;
1212
1213 nonmul->accept(this);
1214 src_reg src0 = fix_3src_operand(this->result);
1215
1216 mul->operands[0]->accept(this);
1217 src_reg src1 = fix_3src_operand(this->result);
1218 src1.negate ^= mul_negate;
1219 src1.abs = mul_abs;
1220 if (mul_abs)
1221 src1.negate = false;
1222
1223 mul->operands[1]->accept(this);
1224 src_reg src2 = fix_3src_operand(this->result);
1225 src2.abs = mul_abs;
1226 if (mul_abs)
1227 src2.negate = false;
1228
1229 this->result = src_reg(this, ir->type);
1230 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1231
1232 return true;
1233 }
1234
1235 bool
1236 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1237 {
1238 /* This optimization relies on CMP setting the destination to 0 when
1239 * false. Early hardware only sets the least significant bit, and
1240 * leaves the other bits undefined. So we can't use it.
1241 */
1242 if (devinfo->gen < 6)
1243 return false;
1244
1245 ir_expression *const cmp = ir->operands[0]->as_expression();
1246
1247 if (cmp == NULL)
1248 return false;
1249
1250 switch (cmp->operation) {
1251 case ir_binop_less:
1252 case ir_binop_greater:
1253 case ir_binop_lequal:
1254 case ir_binop_gequal:
1255 case ir_binop_equal:
1256 case ir_binop_nequal:
1257 break;
1258
1259 default:
1260 return false;
1261 }
1262
1263 cmp->operands[0]->accept(this);
1264 const src_reg cmp_src0 = this->result;
1265
1266 cmp->operands[1]->accept(this);
1267 const src_reg cmp_src1 = this->result;
1268
1269 this->result = src_reg(this, ir->type);
1270
1271 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1272 brw_conditional_for_comparison(cmp->operation)));
1273
1274 /* If the comparison is false, this->result will just happen to be zero.
1275 */
1276 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1277 this->result, src_reg(1.0f));
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 inst->predicate_inverse = true;
1280
1281 return true;
1282 }
1283
1284 vec4_instruction *
1285 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1286 src_reg src0, src_reg src1)
1287 {
1288 vec4_instruction *inst;
1289
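   /* On gen6+ a single SEL with a conditional modifier computes the min/max
    * directly; earlier hardware needs an explicit CMP followed by a
    * predicated SEL.
    */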
1290 if (devinfo->gen >= 6) {
1291 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1292 inst->conditional_mod = conditionalmod;
1293 } else {
1294 emit(CMP(dst, src0, src1, conditionalmod));
1295
1296 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1297 inst->predicate = BRW_PREDICATE_NORMAL;
1298 }
1299
1300 return inst;
1301 }
1302
1303 vec4_instruction *
1304 vec4_visitor::emit_lrp(const dst_reg &dst,
1305 const src_reg &x, const src_reg &y, const src_reg &a)
1306 {
1307 if (devinfo->gen >= 6) {
1308 /* Note that the instruction's argument order is reversed from GLSL
1309 * and the IR.
1310 */
1311 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1312 fix_3src_operand(x)));
1313 } else {
1314 /* Earlier generations don't support three source operations, so we
1315 * need to emit x*(1-a) + y*a.
1316 */
1317 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1318 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1319 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1320 y_times_a.writemask = dst.writemask;
1321 one_minus_a.writemask = dst.writemask;
1322 x_times_one_minus_a.writemask = dst.writemask;
1323
1324 emit(MUL(y_times_a, y, a));
1325 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1326 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1327 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1328 }
1329 }
1330
1331 /**
1332 * Emits the instructions needed to perform a pull constant load. before_block
1333  * and before_inst can be NULL, in which case the instructions will be appended
1334 * to the end of the instruction list.
1335 */
1336 void
1337 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1338 src_reg surf_index,
1339 src_reg offset_reg,
1340 bblock_t *before_block,
1341 vec4_instruction *before_inst)
1342 {
1343 assert((before_inst == NULL && before_block == NULL) ||
1344 (before_inst && before_block));
1345
1346 vec4_instruction *pull;
1347
1348 if (devinfo->gen >= 9) {
1349 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1350 src_reg header(this, glsl_type::uvec4_type, 2);
1351
1352 pull = new(mem_ctx)
1353 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1354 dst_reg(header));
1355
1356 if (before_inst)
1357 emit_before(before_block, before_inst, pull);
1358 else
1359 emit(pull);
1360
1361 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1362 offset_reg.type);
1363 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1364
1365 if (before_inst)
1366 emit_before(before_block, before_inst, pull);
1367 else
1368 emit(pull);
1369
1370 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1371 dst,
1372 surf_index,
1373 header);
1374 pull->mlen = 2;
1375 pull->header_size = 1;
1376 } else if (devinfo->gen >= 7) {
1377 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1378
1379 grf_offset.type = offset_reg.type;
1380
1381 pull = MOV(grf_offset, offset_reg);
1382
1383 if (before_inst)
1384 emit_before(before_block, before_inst, pull);
1385 else
1386 emit(pull);
1387
1388 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1389 dst,
1390 surf_index,
1391 src_reg(grf_offset));
1392 pull->mlen = 1;
1393 } else {
1394 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1395 dst,
1396 surf_index,
1397 offset_reg);
1398 pull->base_mrf = 14;
1399 pull->mlen = 1;
1400 }
1401
1402 if (before_inst)
1403 emit_before(before_block, before_inst, pull);
1404 else
1405 emit(pull);
1406 }
1407
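/* Copy the value of an arbitrary live channel of \p src into every channel of
 * the result, yielding a value that is uniform across the SIMD execution.
 */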
1408 src_reg
1409 vec4_visitor::emit_uniformize(const src_reg &src)
1410 {
1411 const src_reg chan_index(this, glsl_type::uint_type);
1412 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1413 src.type);
1414
1415 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1416 ->force_writemask_all = true;
1417 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1418 ->force_writemask_all = true;
1419
1420 return src_reg(dst);
1421 }
1422
1423 void
1424 vec4_visitor::visit(ir_expression *ir)
1425 {
1426 unsigned int operand;
1427 src_reg op[ARRAY_SIZE(ir->operands)];
1428 vec4_instruction *inst;
1429
1430 if (ir->operation == ir_binop_add) {
1431 if (try_emit_mad(ir))
1432 return;
1433 }
1434
1435 if (ir->operation == ir_unop_b2f) {
1436 if (try_emit_b2f_of_compare(ir))
1437 return;
1438 }
1439
1440 /* Storage for our result. Ideally for an assignment we'd be using
1441 * the actual storage for the result here, instead.
1442 */
1443 dst_reg result_dst(this, ir->type);
1444 src_reg result_src(result_dst);
1445
1446 if (ir->operation == ir_triop_csel) {
1447 ir->operands[1]->accept(this);
1448 op[1] = this->result;
1449 ir->operands[2]->accept(this);
1450 op[2] = this->result;
1451
1452 enum brw_predicate predicate;
1453 emit_bool_to_cond_code(ir->operands[0], &predicate);
1454 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1455 inst->predicate = predicate;
1456 this->result = result_src;
1457 return;
1458 }
1459
1460 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1461 this->result.file = BAD_FILE;
1462 ir->operands[operand]->accept(this);
1463 if (this->result.file == BAD_FILE) {
1464 fprintf(stderr, "Failed to get tree for expression operand:\n");
1465 ir->operands[operand]->fprint(stderr);
1466 exit(1);
1467 }
1468 op[operand] = this->result;
1469
1470 /* Matrix expression operands should have been broken down to vector
1471 * operations already.
1472 */
1473 assert(!ir->operands[operand]->type->is_matrix());
1474 }
1475
1476 /* If nothing special happens, this is the result. */
1477 this->result = result_src;
1478
1479 switch (ir->operation) {
1480 case ir_unop_logic_not:
1481 emit(NOT(result_dst, op[0]));
1482 break;
1483 case ir_unop_neg:
1484 op[0].negate = !op[0].negate;
1485 emit(MOV(result_dst, op[0]));
1486 break;
1487 case ir_unop_abs:
1488 op[0].abs = true;
1489 op[0].negate = false;
1490 emit(MOV(result_dst, op[0]));
1491 break;
1492
1493 case ir_unop_sign:
1494 if (ir->type->is_float()) {
1495 /* AND(val, 0x80000000) gives the sign bit.
1496 *
1497 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1498 * zero.
1499 */
1500 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1501
1502 op[0].type = BRW_REGISTER_TYPE_UD;
1503 result_dst.type = BRW_REGISTER_TYPE_UD;
1504 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1505
1506 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1507 inst->predicate = BRW_PREDICATE_NORMAL;
1508
1509 this->result.type = BRW_REGISTER_TYPE_F;
1510 } else {
1511 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1512 * -> non-negative val generates 0x00000000.
1513 * Predicated OR sets 1 if val is positive.
1514 */
1515 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1516
1517 emit(ASR(result_dst, op[0], src_reg(31)));
1518
1519 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1520 inst->predicate = BRW_PREDICATE_NORMAL;
1521 }
1522 break;
1523
1524 case ir_unop_rcp:
1525 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1526 break;
1527
1528 case ir_unop_exp2:
1529 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1530 break;
1531 case ir_unop_log2:
1532 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1533 break;
1534 case ir_unop_exp:
1535 case ir_unop_log:
1536 unreachable("not reached: should be handled by ir_explog_to_explog2");
1537 case ir_unop_sin:
1538 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1539 break;
1540 case ir_unop_cos:
1541 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1542 break;
1543
1544 case ir_unop_dFdx:
1545 case ir_unop_dFdx_coarse:
1546 case ir_unop_dFdx_fine:
1547 case ir_unop_dFdy:
1548 case ir_unop_dFdy_coarse:
1549 case ir_unop_dFdy_fine:
1550 unreachable("derivatives not valid in vertex shader");
1551
1552 case ir_unop_bitfield_reverse:
1553 emit(BFREV(result_dst, op[0]));
1554 break;
1555 case ir_unop_bit_count:
1556 emit(CBIT(result_dst, op[0]));
1557 break;
1558 case ir_unop_find_msb: {
1559 src_reg temp = src_reg(this, glsl_type::uint_type);
1560
1561 inst = emit(FBH(dst_reg(temp), op[0]));
1562 inst->dst.writemask = WRITEMASK_XYZW;
1563
1564 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1565 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1566 * subtract the result from 31 to convert the MSB count into an LSB count.
1567 */
1568
1569 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1570 temp.swizzle = BRW_SWIZZLE_NOOP;
1571 emit(MOV(result_dst, temp));
1572
1573 src_reg src_tmp = src_reg(result_dst);
1574 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1575
1576 src_tmp.negate = true;
1577 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1578 inst->predicate = BRW_PREDICATE_NORMAL;
1579 break;
1580 }
1581 case ir_unop_find_lsb:
1582 emit(FBL(result_dst, op[0]));
1583 break;
1584 case ir_unop_saturate:
1585 inst = emit(MOV(result_dst, op[0]));
1586 inst->saturate = true;
1587 break;
1588
1589 case ir_unop_noise:
1590 unreachable("not reached: should be handled by lower_noise");
1591
1592 case ir_unop_subroutine_to_int:
1593 emit(MOV(result_dst, op[0]));
1594 break;
1595
1596 case ir_binop_add:
1597 emit(ADD(result_dst, op[0], op[1]));
1598 break;
1599 case ir_binop_sub:
1600 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1601
1602 case ir_binop_mul:
1603 if (devinfo->gen < 8 && ir->type->is_integer()) {
1604 /* For integer multiplication, the MUL uses the low 16 bits of one of
1605 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1606          * accumulates the contribution of the upper 16 bits of that
1607 * operand. If we can determine that one of the args is in the low
1608 * 16 bits, though, we can just emit a single MUL.
1609 */
1610 if (ir->operands[0]->is_uint16_constant()) {
1611 if (devinfo->gen < 7)
1612 emit(MUL(result_dst, op[0], op[1]));
1613 else
1614 emit(MUL(result_dst, op[1], op[0]));
1615 } else if (ir->operands[1]->is_uint16_constant()) {
1616 if (devinfo->gen < 7)
1617 emit(MUL(result_dst, op[1], op[0]));
1618 else
1619 emit(MUL(result_dst, op[0], op[1]));
1620 } else {
1621 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1622
1623 emit(MUL(acc, op[0], op[1]));
1624 emit(MACH(dst_null_d(), op[0], op[1]));
1625 emit(MOV(result_dst, src_reg(acc)));
1626 }
1627 } else {
1628 emit(MUL(result_dst, op[0], op[1]));
1629 }
1630 break;
1631 case ir_binop_imul_high: {
1632 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1633
1634 emit(MUL(acc, op[0], op[1]));
1635 emit(MACH(result_dst, op[0], op[1]));
1636 break;
1637 }
1638 case ir_binop_div:
1639 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1640 assert(ir->type->is_integer());
1641 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1642 break;
1643
1644 case ir_binop_carry:
1645 unreachable("Should have been lowered by carry_to_arith().");
1646
1647 case ir_binop_borrow:
1648 unreachable("Should have been lowered by borrow_to_arith().");
1649
1650 case ir_binop_mod:
1651 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1652 assert(ir->type->is_integer());
1653 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1654 break;
1655
1656 case ir_binop_less:
1657 case ir_binop_greater:
1658 case ir_binop_lequal:
1659 case ir_binop_gequal:
1660 case ir_binop_equal:
1661 case ir_binop_nequal: {
1662 if (devinfo->gen <= 5) {
1663 resolve_bool_comparison(ir->operands[0], &op[0]);
1664 resolve_bool_comparison(ir->operands[1], &op[1]);
1665 }
1666 emit(CMP(result_dst, op[0], op[1],
1667 brw_conditional_for_comparison(ir->operation)));
1668 break;
1669 }
1670
1671 case ir_binop_all_equal:
1672 if (devinfo->gen <= 5) {
1673 resolve_bool_comparison(ir->operands[0], &op[0]);
1674 resolve_bool_comparison(ir->operands[1], &op[1]);
1675 }
1676
1677 /* "==" operator producing a scalar boolean. */
1678 if (ir->operands[0]->type->is_vector() ||
1679 ir->operands[1]->type->is_vector()) {
1680 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1681 emit(MOV(result_dst, src_reg(0)));
1682 inst = emit(MOV(result_dst, src_reg(~0)));
1683 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1684 } else {
1685 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1686 }
1687 break;
1688 case ir_binop_any_nequal:
1689 if (devinfo->gen <= 5) {
1690 resolve_bool_comparison(ir->operands[0], &op[0]);
1691 resolve_bool_comparison(ir->operands[1], &op[1]);
1692 }
1693
1694 /* "!=" operator producing a scalar boolean. */
1695 if (ir->operands[0]->type->is_vector() ||
1696 ir->operands[1]->type->is_vector()) {
1697 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1698
1699 emit(MOV(result_dst, src_reg(0)));
1700 inst = emit(MOV(result_dst, src_reg(~0)));
1701 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1702 } else {
1703 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1704 }
1705 break;
1706
1707 case ir_unop_any:
1708 if (devinfo->gen <= 5) {
1709 resolve_bool_comparison(ir->operands[0], &op[0]);
1710 }
1711 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1712 emit(MOV(result_dst, src_reg(0)));
1713
1714 inst = emit(MOV(result_dst, src_reg(~0)));
1715 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1716 break;
1717
1718 case ir_binop_logic_xor:
1719 emit(XOR(result_dst, op[0], op[1]));
1720 break;
1721
1722 case ir_binop_logic_or:
1723 emit(OR(result_dst, op[0], op[1]));
1724 break;
1725
1726 case ir_binop_logic_and:
1727 emit(AND(result_dst, op[0], op[1]));
1728 break;
1729
1730 case ir_binop_dot:
1731 assert(ir->operands[0]->type->is_vector());
1732 assert(ir->operands[0]->type == ir->operands[1]->type);
1733 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1734 break;
1735
1736 case ir_unop_sqrt:
1737 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1738 break;
1739 case ir_unop_rsq:
1740 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1741 break;
1742
1743 case ir_unop_bitcast_i2f:
1744 case ir_unop_bitcast_u2f:
1745 this->result = op[0];
1746 this->result.type = BRW_REGISTER_TYPE_F;
1747 break;
1748
1749 case ir_unop_bitcast_f2i:
1750 this->result = op[0];
1751 this->result.type = BRW_REGISTER_TYPE_D;
1752 break;
1753
1754 case ir_unop_bitcast_f2u:
1755 this->result = op[0];
1756 this->result.type = BRW_REGISTER_TYPE_UD;
1757 break;
1758
1759 case ir_unop_i2f:
1760 case ir_unop_i2u:
1761 case ir_unop_u2i:
1762 case ir_unop_u2f:
1763 case ir_unop_f2i:
1764 case ir_unop_f2u:
1765 emit(MOV(result_dst, op[0]));
1766 break;
1767 case ir_unop_b2i:
1768 case ir_unop_b2f:
1769 if (devinfo->gen <= 5) {
1770 resolve_bool_comparison(ir->operands[0], &op[0]);
1771 }
1772 emit(MOV(result_dst, negate(op[0])));
1773 break;
1774 case ir_unop_f2b:
1775 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1776 break;
1777 case ir_unop_i2b:
1778 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1779 break;
1780
1781 case ir_unop_trunc:
1782 emit(RNDZ(result_dst, op[0]));
1783 break;
1784 case ir_unop_ceil: {
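         /* Compute ceil(x) as -floor(-x), using RNDD (round toward -infinity). */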
1785 src_reg tmp = src_reg(this, ir->type);
1786 op[0].negate = !op[0].negate;
1787 emit(RNDD(dst_reg(tmp), op[0]));
1788 tmp.negate = true;
1789 emit(MOV(result_dst, tmp));
1790 }
1791 break;
1792 case ir_unop_floor:
1793 inst = emit(RNDD(result_dst, op[0]));
1794 break;
1795 case ir_unop_fract:
1796 inst = emit(FRC(result_dst, op[0]));
1797 break;
1798 case ir_unop_round_even:
1799 emit(RNDE(result_dst, op[0]));
1800 break;
1801
1802 case ir_binop_min:
1803 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1804 break;
1805 case ir_binop_max:
1806 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1807 break;
1808
1809 case ir_binop_pow:
1810 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1811 break;
1812
1813 case ir_unop_bit_not:
1814 inst = emit(NOT(result_dst, op[0]));
1815 break;
1816 case ir_binop_bit_and:
1817 inst = emit(AND(result_dst, op[0], op[1]));
1818 break;
1819 case ir_binop_bit_xor:
1820 inst = emit(XOR(result_dst, op[0], op[1]));
1821 break;
1822 case ir_binop_bit_or:
1823 inst = emit(OR(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_lshift:
1827 inst = emit(SHL(result_dst, op[0], op[1]));
1828 break;
1829
1830 case ir_binop_rshift:
1831 if (ir->type->base_type == GLSL_TYPE_INT)
1832 inst = emit(ASR(result_dst, op[0], op[1]));
1833 else
1834 inst = emit(SHR(result_dst, op[0], op[1]));
1835 break;
1836
1837 case ir_binop_bfm:
1838 emit(BFI1(result_dst, op[0], op[1]));
1839 break;
1840
1841 case ir_binop_ubo_load: {
1842 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1843 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1844 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1845 src_reg offset;
1846
1847 /* Now, load the vector from that offset. */
1848 assert(ir->type->is_vector() || ir->type->is_scalar());
1849
1850 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1851 packed_consts.type = result.type;
1852 src_reg surf_index;
1853
1854 if (const_uniform_block) {
1855 /* The block index is a constant, so just emit the binding table entry
1856 * as an immediate.
1857 */
1858 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1859 const_uniform_block->value.u[0]);
1860 } else {
1861 /* The block index is not a constant. Evaluate the index expression
1862 * per-channel and add the base UBO index; we have to select a value
1863 * from any live channel.
1864 */
1865 surf_index = src_reg(this, glsl_type::uint_type);
1866 emit(ADD(dst_reg(surf_index), op[0],
1867 src_reg(prog_data->base.binding_table.ubo_start)));
1868 surf_index = emit_uniformize(surf_index);
1869
1870 /* Assume this may touch any UBO. It would be nice to provide
1871 * a tighter bound, but the array information is already lowered away.
1872 */
1873 brw_mark_surface_used(&prog_data->base,
1874 prog_data->base.binding_table.ubo_start +
1875 shader_prog->NumUniformBlocks - 1);
1876 }
1877
1878 if (const_offset_ir) {
1879 if (devinfo->gen >= 8) {
1880 /* Store the offset in a GRF so we can send-from-GRF. */
1881 offset = src_reg(this, glsl_type::int_type);
1882 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1883 } else {
1884 /* Immediates are fine on older generations since they'll be moved
1885 * to a (potentially fake) MRF at the generator level.
1886 */
1887 offset = src_reg(const_offset / 16);
1888 }
1889 } else {
1890 offset = src_reg(this, glsl_type::uint_type);
1891 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1892 }
1893
1894 emit_pull_constant_load_reg(dst_reg(packed_consts),
1895 surf_index,
1896 offset,
1897 NULL, NULL /* before_block/inst */);
1898
1899 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1900 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1901 const_offset % 16 / 4,
1902 const_offset % 16 / 4,
1903 const_offset % 16 / 4);
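/* For example, a scalar float at byte offset 8 within its 16-byte slot gives
 * const_offset % 16 / 4 == 2, so 2 is added to each 2-bit swizzle component
 * and the .xxxx swizzle from brw_swizzle_for_size(1) becomes .zzzz, selecting
 * the third dword of the packed vec4.
 */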
1904
1905 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1906 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1907 emit(CMP(result_dst, packed_consts, src_reg(0u),
1908 BRW_CONDITIONAL_NZ));
1909 } else {
1910 emit(MOV(result_dst, packed_consts));
1911 }
1912 break;
1913 }
1914
1915 case ir_binop_vector_extract:
1916 unreachable("should have been lowered by vec_index_to_cond_assign");
1917
1918 case ir_triop_fma:
1919 op[0] = fix_3src_operand(op[0]);
1920 op[1] = fix_3src_operand(op[1]);
1921 op[2] = fix_3src_operand(op[2]);
1922 /* Note that the instruction's argument order is reversed from GLSL
1923 * and the IR.
1924 */
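/* (The MAD instruction computes src1 * src2 + src0, so fma(a, b, c),
 * i.e. a * b + c, becomes MAD(dst, c, b, a).)
 */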
1925 emit(MAD(result_dst, op[2], op[1], op[0]));
1926 break;
1927
1928 case ir_triop_lrp:
1929 emit_lrp(result_dst, op[0], op[1], op[2]);
1930 break;
1931
1932 case ir_triop_csel:
1933 unreachable("already handled above");
1934 break;
1935
1936 case ir_triop_bfi:
1937 op[0] = fix_3src_operand(op[0]);
1938 op[1] = fix_3src_operand(op[1]);
1939 op[2] = fix_3src_operand(op[2]);
1940 emit(BFI2(result_dst, op[0], op[1], op[2]));
1941 break;
1942
1943 case ir_triop_bitfield_extract:
1944 op[0] = fix_3src_operand(op[0]);
1945 op[1] = fix_3src_operand(op[1]);
1946 op[2] = fix_3src_operand(op[2]);
1947 /* Note that the instruction's argument order is reversed from GLSL
1948 * and the IR.
1949 */
1950 emit(BFE(result_dst, op[2], op[1], op[0]));
1951 break;
1952
1953 case ir_triop_vector_insert:
1954 unreachable("should have been lowered by lower_vector_insert");
1955
1956 case ir_quadop_bitfield_insert:
1957 unreachable("not reached: should be handled by "
1958 "bitfield_insert_to_bfm_bfi\n");
1959
1960 case ir_quadop_vector:
1961 unreachable("not reached: should be handled by lower_quadop_vector");
1962
1963 case ir_unop_pack_half_2x16:
1964 emit_pack_half_2x16(result_dst, op[0]);
1965 break;
1966 case ir_unop_unpack_half_2x16:
1967 emit_unpack_half_2x16(result_dst, op[0]);
1968 break;
1969 case ir_unop_unpack_unorm_4x8:
1970 emit_unpack_unorm_4x8(result_dst, op[0]);
1971 break;
1972 case ir_unop_unpack_snorm_4x8:
1973 emit_unpack_snorm_4x8(result_dst, op[0]);
1974 break;
1975 case ir_unop_pack_unorm_4x8:
1976 emit_pack_unorm_4x8(result_dst, op[0]);
1977 break;
1978 case ir_unop_pack_snorm_4x8:
1979 emit_pack_snorm_4x8(result_dst, op[0]);
1980 break;
1981 case ir_unop_pack_snorm_2x16:
1982 case ir_unop_pack_unorm_2x16:
1983 case ir_unop_unpack_snorm_2x16:
1984 case ir_unop_unpack_unorm_2x16:
1985 unreachable("not reached: should be handled by lower_packing_builtins");
1986 case ir_unop_unpack_half_2x16_split_x:
1987 case ir_unop_unpack_half_2x16_split_y:
1988 case ir_binop_pack_half_2x16_split:
1989 case ir_unop_interpolate_at_centroid:
1990 case ir_binop_interpolate_at_sample:
1991 case ir_binop_interpolate_at_offset:
1992 unreachable("not reached: should not occur in vertex shader");
1993 case ir_binop_ldexp:
1994 unreachable("not reached: should be handled by ldexp_to_arith()");
1995 case ir_unop_d2f:
1996 case ir_unop_f2d:
1997 case ir_unop_d2i:
1998 case ir_unop_i2d:
1999 case ir_unop_d2u:
2000 case ir_unop_u2d:
2001 case ir_unop_d2b:
2002 case ir_unop_pack_double_2x32:
2003 case ir_unop_unpack_double_2x32:
2004 case ir_unop_frexp_sig:
2005 case ir_unop_frexp_exp:
2006 unreachable("fp64 todo");
2007 }
2008 }
2009
2010
2011 void
2012 vec4_visitor::visit(ir_swizzle *ir)
2013 {
2014 /* Note that this is only swizzles in expressions, not those on the left
2015 * hand side of an assignment, which do write masking. See ir_assignment
2016 * for that.
2017 */
2018 const unsigned swz = brw_compose_swizzle(
2019 brw_swizzle_for_size(ir->type->vector_elements),
2020 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2021
2022 ir->val->accept(this);
2023 this->result = swizzle(this->result, swz);
2024 }
2025
2026 void
2027 vec4_visitor::visit(ir_dereference_variable *ir)
2028 {
2029 const struct glsl_type *type = ir->type;
2030 dst_reg *reg = variable_storage(ir->var);
2031
2032 if (!reg) {
2033 fail("Failed to find variable storage for %s\n", ir->var->name);
2034 this->result = src_reg(brw_null_reg());
2035 return;
2036 }
2037
2038 this->result = src_reg(*reg);
2039
2040 /* System values get their swizzle from the dst_reg writemask */
2041 if (ir->var->data.mode == ir_var_system_value)
2042 return;
2043
2044 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2045 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2046 }
2047
2048
2049 int
2050 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2051 {
2052 /* Under normal circumstances array elements are stored consecutively, so
2053 * the stride is equal to the size of the array element.
2054 */
2055 return type_size(ir->type);
2056 }
2057
2058
2059 void
2060 vec4_visitor::visit(ir_dereference_array *ir)
2061 {
2062 ir_constant *constant_index;
2063 src_reg src;
2064 int array_stride = compute_array_stride(ir);
2065
2066 constant_index = ir->array_index->constant_expression_value();
2067
2068 ir->array->accept(this);
2069 src = this->result;
2070
2071 if (constant_index) {
2072 src.reg_offset += constant_index->value.i[0] * array_stride;
2073 } else {
2074 /* Variable-index array dereference. It takes the vec4 register of
2075 * the base of the array plus an index register that offsets the
2076 * Mesa register index.
2077 */
2078 ir->array_index->accept(this);
2079
2080 src_reg index_reg;
2081
2082 if (array_stride == 1) {
2083 index_reg = this->result;
2084 } else {
2085 index_reg = src_reg(this, glsl_type::int_type);
2086
2087 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2088 }
2089
2090 if (src.reladdr) {
2091 src_reg temp = src_reg(this, glsl_type::int_type);
2092
2093 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2094
2095 index_reg = temp;
2096 }
2097
2098 src.reladdr = ralloc(mem_ctx, src_reg);
2099 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2100 }
2101
2102 /* If the type is smaller than a vec4, replicate the last channel out. */
2103 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2104 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2105 else
2106 src.swizzle = BRW_SWIZZLE_NOOP;
2107 src.type = brw_type_for_base_type(ir->type);
2108
2109 this->result = src;
2110 }
2111
2112 void
2113 vec4_visitor::visit(ir_dereference_record *ir)
2114 {
2115 unsigned int i;
2116 const glsl_type *struct_type = ir->record->type;
2117 int offset = 0;
2118
2119 ir->record->accept(this);
2120
2121 for (i = 0; i < struct_type->length; i++) {
2122 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2123 break;
2124 offset += type_size(struct_type->fields.structure[i].type);
2125 }
2126
2127 /* If the type is smaller than a vec4, replicate the last channel out. */
2128 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2129 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2130 else
2131 this->result.swizzle = BRW_SWIZZLE_NOOP;
2132 this->result.type = brw_type_for_base_type(ir->type);
2133
2134 this->result.reg_offset += offset;
2135 }
2136
2137 /**
2138 * We want to be careful in assignment setup to hit the actual storage
2139 * instead of potentially using a temporary like we might with the
2140 * ir_dereference handler.
2141 */
2142 static dst_reg
2143 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2144 {
2145 /* The LHS must be a dereference. If the LHS is a variable indexed array
2146 * access of a vector, it must be separated into a series of conditional moves
2147 * before reaching this point (see ir_vec_index_to_cond_assign).
2148 */
2149 assert(ir->as_dereference());
2150 ir_dereference_array *deref_array = ir->as_dereference_array();
2151 if (deref_array) {
2152 assert(!deref_array->array->type->is_vector());
2153 }
2154
2155 /* Use the rvalue deref handler for the most part. We'll ignore
2156 * swizzles in it and write swizzles using writemask, though.
2157 */
2158 ir->accept(v);
2159 return dst_reg(v->result);
2160 }
2161
2162 void
2163 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2164 const struct glsl_type *type,
2165 enum brw_predicate predicate)
2166 {
2167 if (type->base_type == GLSL_TYPE_STRUCT) {
2168 for (unsigned int i = 0; i < type->length; i++) {
2169 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2170 }
2171 return;
2172 }
2173
2174 if (type->is_array()) {
2175 for (unsigned int i = 0; i < type->length; i++) {
2176 emit_block_move(dst, src, type->fields.array, predicate);
2177 }
2178 return;
2179 }
2180
2181 if (type->is_matrix()) {
2182 const struct glsl_type *vec_type;
2183
2184 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2185 type->vector_elements, 1);
2186
2187 for (int i = 0; i < type->matrix_columns; i++) {
2188 emit_block_move(dst, src, vec_type, predicate);
2189 }
2190 return;
2191 }
2192
2193 assert(type->is_scalar() || type->is_vector());
2194
2195 dst->type = brw_type_for_base_type(type);
2196 src->type = dst->type;
2197
2198 dst->writemask = (1 << type->vector_elements) - 1;
2199
2200 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2201
2202 vec4_instruction *inst = emit(MOV(*dst, *src));
2203 inst->predicate = predicate;
2204
2205 dst->reg_offset++;
2206 src->reg_offset++;
2207 }
2208
2209
2210 /* If the RHS processing resulted in an instruction generating a
2211 * temporary value, and it would be easy to rewrite the instruction to
2212 * generate its result right into the LHS instead, do so. This ends
2213 * up reliably removing instructions where it can be tricky to do so
2214 * later without real UD chain information.
2215 */
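/* For example, for "b = a + c;" the RHS generates an ADD into a temporary
 * GRF and the assignment would then emit a MOV from that temporary into b;
 * rewriting the ADD's destination to b makes that MOV unnecessary.
 */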
2216 bool
2217 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2218 dst_reg dst,
2219 src_reg src,
2220 vec4_instruction *pre_rhs_inst,
2221 vec4_instruction *last_rhs_inst)
2222 {
2223 /* This could be supported, but it would take more smarts. */
2224 if (ir->condition)
2225 return false;
2226
2227 if (pre_rhs_inst == last_rhs_inst)
2228 return false; /* No instructions generated to work with. */
2229
2230 /* Make sure the last instruction generated our source reg. */
2231 if (src.file != GRF ||
2232 src.file != last_rhs_inst->dst.file ||
2233 src.reg != last_rhs_inst->dst.reg ||
2234 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2235 src.reladdr ||
2236 src.abs ||
2237 src.negate ||
2238 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2239 return false;
2240
2241 /* Check that the last instruction fully initialized the channels
2242 * we want to use, in the order we want to use them. We could
2243 * potentially reswizzle the operands of many instructions so that
2244 * we could handle out of order channels, but don't yet.
2245 */
2246
2247 for (unsigned i = 0; i < 4; i++) {
2248 if (dst.writemask & (1 << i)) {
2249 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2250 return false;
2251
2252 if (BRW_GET_SWZ(src.swizzle, i) != i)
2253 return false;
2254 }
2255 }
2256
2257 /* Success! Rewrite the instruction. */
2258 last_rhs_inst->dst.file = dst.file;
2259 last_rhs_inst->dst.reg = dst.reg;
2260 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2261 last_rhs_inst->dst.reladdr = dst.reladdr;
2262 last_rhs_inst->dst.writemask &= dst.writemask;
2263
2264 return true;
2265 }
2266
2267 void
2268 vec4_visitor::visit(ir_assignment *ir)
2269 {
2270 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2271 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2272
2273 if (!ir->lhs->type->is_scalar() &&
2274 !ir->lhs->type->is_vector()) {
2275 ir->rhs->accept(this);
2276 src_reg src = this->result;
2277
2278 if (ir->condition) {
2279 emit_bool_to_cond_code(ir->condition, &predicate);
2280 }
2281
2282 /* emit_block_move doesn't account for swizzles in the source register.
2283 * This should be ok, since the source register is a structure or an
2284 * array, and those can't be swizzled. But double-check to be sure.
2285 */
2286 assert(src.swizzle ==
2287 (ir->rhs->type->is_matrix()
2288 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2289 : BRW_SWIZZLE_NOOP));
2290
2291 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2292 return;
2293 }
2294
2295 /* Now we're down to just a scalar/vector with writemasks. */
2296 int i;
2297
2298 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2299 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2300
2301 ir->rhs->accept(this);
2302
2303 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2304
2305 int swizzles[4];
2306 int src_chan = 0;
2307
2308 assert(ir->lhs->type->is_vector() ||
2309 ir->lhs->type->is_scalar());
2310 dst.writemask = ir->write_mask;
2311
2312 /* Swizzle a small RHS vector into the channels being written.
2313 *
2314 * GLSL IR treats write_mask as dictating how many channels are
2315 * present on the RHS, while in our instructions we need to make
2316 * those channels appear in the slots of the vec4 they're written to.
2317 */
2318 for (int i = 0; i < 4; i++)
2319 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2320
2321 src_reg src = swizzle(this->result,
2322 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2323 swizzles[2], swizzles[3]));
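/* For example, writing "v.yz = e" has write_mask 0x6, so swizzles[] becomes
 * {0, 0, 1, 0} (.xxyx): dst.y reads RHS channel x and dst.z reads RHS
 * channel y, matching the two channels the RHS actually provides.
 */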
2324
2325 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2326 return;
2327 }
2328
2329 if (ir->condition) {
2330 emit_bool_to_cond_code(ir->condition, &predicate);
2331 }
2332
2333 for (i = 0; i < type_size(ir->lhs->type); i++) {
2334 vec4_instruction *inst = emit(MOV(dst, src));
2335 inst->predicate = predicate;
2336
2337 dst.reg_offset++;
2338 src.reg_offset++;
2339 }
2340 }
2341
2342 void
2343 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2344 {
2345 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2346 foreach_in_list(ir_constant, field_value, &ir->components) {
2347 emit_constant_values(dst, field_value);
2348 }
2349 return;
2350 }
2351
2352 if (ir->type->is_array()) {
2353 for (unsigned int i = 0; i < ir->type->length; i++) {
2354 emit_constant_values(dst, ir->array_elements[i]);
2355 }
2356 return;
2357 }
2358
2359 if (ir->type->is_matrix()) {
2360 for (int i = 0; i < ir->type->matrix_columns; i++) {
2361 float *vec = &ir->value.f[i * ir->type->vector_elements];
2362
2363 for (int j = 0; j < ir->type->vector_elements; j++) {
2364 dst->writemask = 1 << j;
2365 dst->type = BRW_REGISTER_TYPE_F;
2366
2367 emit(MOV(*dst, src_reg(vec[j])));
2368 }
2369 dst->reg_offset++;
2370 }
2371 return;
2372 }
2373
2374 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2375
2376 for (int i = 0; i < ir->type->vector_elements; i++) {
2377 if (!(remaining_writemask & (1 << i)))
2378 continue;
2379
2380 dst->writemask = 1 << i;
2381 dst->type = brw_type_for_base_type(ir->type);
2382
2383 /* Find other components that match the one we're about to
2384 * write. Emits fewer instructions for things like vec4(0.5,
2385 * 1.5, 1.5, 1.5).
2386 */
2387 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2388 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2389 if (ir->value.b[i] == ir->value.b[j])
2390 dst->writemask |= (1 << j);
2391 } else {
2392 /* u, i, and f storage all line up, so no need for a
2393 * switch case for comparing each type.
2394 */
2395 if (ir->value.u[i] == ir->value.u[j])
2396 dst->writemask |= (1 << j);
2397 }
2398 }
2399
2400 switch (ir->type->base_type) {
2401 case GLSL_TYPE_FLOAT:
2402 emit(MOV(*dst, src_reg(ir->value.f[i])));
2403 break;
2404 case GLSL_TYPE_INT:
2405 emit(MOV(*dst, src_reg(ir->value.i[i])));
2406 break;
2407 case GLSL_TYPE_UINT:
2408 emit(MOV(*dst, src_reg(ir->value.u[i])));
2409 break;
2410 case GLSL_TYPE_BOOL:
2411 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2412 break;
2413 default:
2414 unreachable("Non-float/uint/int/bool constant");
2415 }
2416
2417 remaining_writemask &= ~dst->writemask;
2418 }
2419 dst->reg_offset++;
2420 }
2421
2422 void
2423 vec4_visitor::visit(ir_constant *ir)
2424 {
2425 dst_reg dst = dst_reg(this, ir->type);
2426 this->result = src_reg(dst);
2427
2428 emit_constant_values(&dst, ir);
2429 }
2430
2431 void
2432 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2433 {
2434 ir_dereference *deref = static_cast<ir_dereference *>(
2435 ir->actual_parameters.get_head());
2436 ir_variable *location = deref->variable_referenced();
2437 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2438 location->data.binding);
2439
2440 /* Calculate the surface offset */
2441 src_reg offset(this, glsl_type::uint_type);
2442 ir_dereference_array *deref_array = deref->as_dereference_array();
2443 if (deref_array) {
2444 deref_array->array_index->accept(this);
2445
2446 src_reg tmp(this, glsl_type::uint_type);
2447 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2448 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2449 } else {
2450 offset = location->data.atomic.offset;
2451 }
2452
2453 /* Emit the appropriate machine instruction */
2454 const char *callee = ir->callee->function_name();
2455 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2456
2457 if (!strcmp("__intrinsic_atomic_read", callee)) {
2458 emit_untyped_surface_read(surf_index, dst, offset);
2459
2460 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2461 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2462 src_reg(), src_reg());
2463
2464 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2465 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2466 src_reg(), src_reg());
2467 }
2468
2469 brw_mark_surface_used(stage_prog_data, surf_index);
2470 }
2471
2472 void
2473 vec4_visitor::visit(ir_call *ir)
2474 {
2475 const char *callee = ir->callee->function_name();
2476
2477 if (!strcmp("__intrinsic_atomic_read", callee) ||
2478 !strcmp("__intrinsic_atomic_increment", callee) ||
2479 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2480 visit_atomic_counter_intrinsic(ir);
2481 } else {
2482 unreachable("Unsupported intrinsic.");
2483 }
2484 }
2485
2486 src_reg
2487 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2488 src_reg coordinate, src_reg sampler)
2489 {
2490 vec4_instruction *inst =
2491 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2492 dst_reg(this, glsl_type::uvec4_type));
2493 inst->base_mrf = 2;
2494 inst->src[1] = sampler;
2495
2496 int param_base;
2497
2498 if (devinfo->gen >= 9) {
2499 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2500 vec4_instruction *header_inst = new(mem_ctx)
2501 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2502 dst_reg(MRF, inst->base_mrf));
2503
2504 emit(header_inst);
2505
2506 inst->mlen = 2;
2507 inst->header_size = 1;
2508 param_base = inst->base_mrf + 1;
2509 } else {
2510 inst->mlen = 1;
2511 param_base = inst->base_mrf;
2512 }
2513
2514 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2515 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2516 int zero_mask = 0xf & ~coord_mask;
2517
2518 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2519 coordinate));
2520
2521 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2522 src_reg(0)));
2523
2524 emit(inst);
2525 return src_reg(inst->dst);
2526 }
2527
2528 bool
2529 vec4_visitor::is_high_sampler(src_reg sampler)
2530 {
2531 if (devinfo->gen < 8 && !devinfo->is_haswell)
2532 return false;
2533
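/* A non-immediate (dynamically indexed) sampler, or an immediate index of 16
 * or more, cannot be encoded in the 4-bit sampler field of the message
 * descriptor, so those need the message-header path.
 */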
2534 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2535 }
2536
2537 void
2538 vec4_visitor::emit_texture(ir_texture_opcode op,
2539 dst_reg dest,
2540 const glsl_type *dest_type,
2541 src_reg coordinate,
2542 int coord_components,
2543 src_reg shadow_comparitor,
2544 src_reg lod, src_reg lod2,
2545 src_reg sample_index,
2546 uint32_t constant_offset,
2547 src_reg offset_value,
2548 src_reg mcs,
2549 bool is_cube_array,
2550 uint32_t sampler,
2551 src_reg sampler_reg)
2552 {
2553 enum opcode opcode;
2554 switch (op) {
2555 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2556 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2557 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2558 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2559 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2560 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2561 case ir_tg4: opcode = offset_value.file != BAD_FILE
2562 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2563 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2564 case ir_txb:
2565 unreachable("TXB is not valid for vertex shaders.");
2566 case ir_lod:
2567 unreachable("LOD is not valid for vertex shaders.");
2568 default:
2569 unreachable("Unrecognized tex op");
2570 }
2571
2572 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2573 opcode, dst_reg(this, dest_type));
2574
2575 inst->offset = constant_offset;
2576
2577 /* The message header is necessary for:
2578 * - Gen4 (always)
2579 * - Gen9+ for selecting SIMD4x2
2580 * - Texel offsets
2581 * - Gather channel selection
2582 * - Sampler indices too large to fit in a 4-bit value.
2583 */
2584 inst->header_size =
2585 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2586 inst->offset != 0 || op == ir_tg4 ||
2587 is_high_sampler(sampler_reg)) ? 1 : 0;
2588 inst->base_mrf = 2;
2589 inst->mlen = inst->header_size + 1; /* always at least one */
2590 inst->dst.writemask = WRITEMASK_XYZW;
2591 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2592
2593 inst->src[1] = sampler_reg;
2594
2595 /* MRF for the first parameter */
2596 int param_base = inst->base_mrf + inst->header_size;
2597
2598 if (op == ir_txs || op == ir_query_levels) {
2599 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2600 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2601 } else {
2602 /* Load the coordinate */
2603 /* FINISHME: gl_clamp_mask and saturate */
2604 int coord_mask = (1 << coord_components) - 1;
2605 int zero_mask = 0xf & ~coord_mask;
2606
2607 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2608 coordinate));
2609
2610 if (zero_mask != 0) {
2611 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2612 src_reg(0)));
2613 }
2614 /* Load the shadow comparitor */
2615 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2616 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2617 WRITEMASK_X),
2618 shadow_comparitor));
2619 inst->mlen++;
2620 }
2621
2622 /* Load the LOD info */
2623 if (op == ir_tex || op == ir_txl) {
2624 int mrf, writemask;
2625 if (devinfo->gen >= 5) {
2626 mrf = param_base + 1;
2627 if (shadow_comparitor.file != BAD_FILE) {
2628 writemask = WRITEMASK_Y;
2629 /* mlen already incremented */
2630 } else {
2631 writemask = WRITEMASK_X;
2632 inst->mlen++;
2633 }
2634 } else /* devinfo->gen == 4 */ {
2635 mrf = param_base;
2636 writemask = WRITEMASK_W;
2637 }
2638 lod.swizzle = BRW_SWIZZLE_XXXX;
2639 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2640 } else if (op == ir_txf) {
2641 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2642 } else if (op == ir_txf_ms) {
2643 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2644 sample_index));
2645 if (devinfo->gen >= 7) {
2646 /* MCS data is in the first channel of `mcs`, but we need to get it into
2647 * the .y channel of the second vec4 of params, so replicate .x across
2648 * the whole vec4 and then mask off everything except .y
2649 */
2650 mcs.swizzle = BRW_SWIZZLE_XXXX;
2651 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2652 mcs));
2653 }
2654 inst->mlen++;
2655 } else if (op == ir_txd) {
2656 const brw_reg_type type = lod.type;
2657
2658 if (devinfo->gen >= 5) {
2659 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2660 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2661 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2663 inst->mlen++;
2664
2665 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2666 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2667 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2668 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2669 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2670 inst->mlen++;
2671
2672 if (shadow_comparitor.file != BAD_FILE) {
2673 emit(MOV(dst_reg(MRF, param_base + 2,
2674 shadow_comparitor.type, WRITEMASK_Z),
2675 shadow_comparitor));
2676 }
2677 }
2678 } else /* devinfo->gen == 4 */ {
2679 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2680 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2681 inst->mlen += 2;
2682 }
2683 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2684 if (shadow_comparitor.file != BAD_FILE) {
2685 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2686 shadow_comparitor));
2687 }
2688
2689 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2690 offset_value));
2691 inst->mlen++;
2692 }
2693 }
2694
2695 emit(inst);
2696
2697 /* Fix up the number of layers (.z) for cube arrays: the hardware
2698 * returns faces * layers, but the spec requires just layers.
2699 */
2700 if (op == ir_txs && is_cube_array) {
2701 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2702 writemask(inst->dst, WRITEMASK_Z),
2703 src_reg(inst->dst), src_reg(6));
2704 }
2705
2706 if (devinfo->gen == 6 && op == ir_tg4) {
2707 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2708 }
2709
2710 swizzle_result(op, dest,
2711 src_reg(inst->dst), sampler, dest_type);
2712 }
2713
2714 void
2715 vec4_visitor::visit(ir_texture *ir)
2716 {
2717 uint32_t sampler =
2718 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2719
2720 ir_rvalue *nonconst_sampler_index =
2721 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2722
2723 /* Handle non-constant sampler array indexing */
2724 src_reg sampler_reg;
2725 if (nonconst_sampler_index) {
2726 /* The highest sampler which may be used by this operation is
2727 * the last element of the array. Mark it here, because the generator
2728 * doesn't have enough information to determine the bound.
2729 */
2730 uint32_t array_size = ir->sampler->as_dereference_array()
2731 ->array->type->array_size();
2732
2733 uint32_t max_used = sampler + array_size - 1;
2734 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2735 max_used += prog_data->base.binding_table.gather_texture_start;
2736 } else {
2737 max_used += prog_data->base.binding_table.texture_start;
2738 }
2739
2740 brw_mark_surface_used(&prog_data->base, max_used);
2741
2742 /* Emit code to evaluate the actual indexing expression */
2743 nonconst_sampler_index->accept(this);
2744 src_reg temp(this, glsl_type::uint_type);
2745 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2746 sampler_reg = emit_uniformize(temp);
2747 } else {
2748 /* Single sampler, or constant array index; the indexing expression
2749 * is just an immediate.
2750 */
2751 sampler_reg = src_reg(sampler);
2752 }
2753
2754 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2755 * emitting anything other than setting up the constant result.
2756 */
2757 if (ir->op == ir_tg4) {
2758 ir_constant *chan = ir->lod_info.component->as_constant();
2759 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2760 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2761 dst_reg result(this, ir->type);
2762 this->result = src_reg(result);
2763 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2764 return;
2765 }
2766 }
2767
2768 /* Should be lowered by do_lower_texture_projection */
2769 assert(!ir->projector);
2770
2771 /* Should be lowered */
2772 assert(!ir->offset || !ir->offset->type->is_array());
2773
2774 /* Generate code to compute all the subexpression trees. This has to be
2775 * done before loading any values into MRFs for the sampler message since
2776 * generating these values may involve SEND messages that need the MRFs.
2777 */
2778 src_reg coordinate;
2779 int coord_components = 0;
2780 if (ir->coordinate) {
2781 coord_components = ir->coordinate->type->vector_elements;
2782 ir->coordinate->accept(this);
2783 coordinate = this->result;
2784 }
2785
2786 src_reg shadow_comparitor;
2787 if (ir->shadow_comparitor) {
2788 ir->shadow_comparitor->accept(this);
2789 shadow_comparitor = this->result;
2790 }
2791
2792 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2793 src_reg offset_value;
2794 if (has_nonconstant_offset) {
2795 ir->offset->accept(this);
2796 offset_value = src_reg(this->result);
2797 }
2798
2799 src_reg lod, lod2, sample_index, mcs;
2800 switch (ir->op) {
2801 case ir_tex:
2802 lod = src_reg(0.0f);
2803 break;
2804 case ir_txf:
2805 case ir_txl:
2806 case ir_txs:
2807 ir->lod_info.lod->accept(this);
2808 lod = this->result;
2809 break;
2810 case ir_query_levels:
2811 lod = src_reg(0);
2812 break;
2813 case ir_txf_ms:
2814 ir->lod_info.sample_index->accept(this);
2815 sample_index = this->result;
2816
2817 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2818 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2819 else
2820 mcs = src_reg(0u);
2821 break;
2822 case ir_txd:
2823 ir->lod_info.grad.dPdx->accept(this);
2824 lod = this->result;
2825
2826 ir->lod_info.grad.dPdy->accept(this);
2827 lod2 = this->result;
2828 break;
2829 case ir_txb:
2830 case ir_lod:
2831 case ir_tg4:
2832 break;
2833 }
2834
2835 uint32_t constant_offset = 0;
2836 if (ir->offset != NULL && !has_nonconstant_offset) {
2837 constant_offset =
2838 brw_texture_offset(ir->offset->as_constant()->value.i,
2839 ir->offset->type->vector_elements);
2840 }
2841
2842 /* Stuff the channel select bits in the top of the texture offset */
2843 if (ir->op == ir_tg4)
2844 constant_offset |=
2845 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2846 sampler) << 16;
2847
2848 glsl_type const *type = ir->sampler->type;
2849 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2850 type->sampler_array;
2851
2852 this->result = src_reg(this, ir->type);
2853 dst_reg dest = dst_reg(this->result);
2854
2855 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2856 shadow_comparitor,
2857 lod, lod2, sample_index,
2858 constant_offset, offset_value,
2859 mcs, is_cube_array, sampler, sampler_reg);
2860 }
2861
2862 /**
2863 * Apply workarounds for Gen6 gather with UINT/SINT
2864 */
2865 void
2866 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2867 {
2868 if (!wa)
2869 return;
2870
2871 int width = (wa & WA_8BIT) ? 8 : 16;
2872 dst_reg dst_f = dst;
2873 dst_f.type = BRW_REGISTER_TYPE_F;
2874
2875 /* Convert from UNORM to UINT */
2876 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2877 emit(MOV(dst, src_reg(dst_f)));
2878
2879 if (wa & WA_SIGN) {
2880 /* Reinterpret the UINT value as a signed INT value by
2881 * shifting the sign bit into place, then shifting back
2882 * preserving sign.
2883 */
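/* For example, with width == 8 a gathered byte of 0x80 becomes 0x80000000
 * after the SHL by 24 and 0xffffff80 (-128) after the ASR by 24, i.e. a
 * properly sign-extended integer.
 */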
2884 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2885 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2886 }
2887 }
2888
2889 /**
2890 * Set up the gather channel based on the swizzle, for gather4.
2891 */
2892 uint32_t
2893 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2894 {
2895 int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2896 switch (swiz) {
2897 case SWIZZLE_X: return 0;
2898 case SWIZZLE_Y:
2899 /* gather4 sampler is broken for green channel on RG32F --
2900 * we must ask for blue instead.
2901 */
2902 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2903 return 2;
2904 return 1;
2905 case SWIZZLE_Z: return 2;
2906 case SWIZZLE_W: return 3;
2907 default:
2908 unreachable("Not reached"); /* zero, one swizzles handled already */
2909 }
2910 }
2911
2912 void
2913 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2914 src_reg orig_val, uint32_t sampler,
2915 const glsl_type *dest_type)
2916 {
2917 int s = key->tex.swizzles[sampler];
2918
2919 dst_reg swizzled_result = dest;
2920
2921 if (op == ir_query_levels) {
2922 /* # levels is in .w */
2923 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2924 emit(MOV(swizzled_result, orig_val));
2925 return;
2926 }
2927
2928 if (op == ir_txs || dest_type == glsl_type::float_type
2929 || s == SWIZZLE_NOOP || op == ir_tg4) {
2930 emit(MOV(swizzled_result, orig_val));
2931 return;
2932 }
2933
2934
2935 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2936 int swizzle[4] = {0};
2937
2938 for (int i = 0; i < 4; i++) {
2939 switch (GET_SWZ(s, i)) {
2940 case SWIZZLE_ZERO:
2941 zero_mask |= (1 << i);
2942 break;
2943 case SWIZZLE_ONE:
2944 one_mask |= (1 << i);
2945 break;
2946 default:
2947 copy_mask |= (1 << i);
2948 swizzle[i] = GET_SWZ(s, i);
2949 break;
2950 }
2951 }
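/* For example, a swizzle of (R, R, R, ONE) yields copy_mask XYZ (each
 * reading channel x) and one_mask W, so the result is written with two
 * MOVs below.
 */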
2952
2953 if (copy_mask) {
2954 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2955 swizzled_result.writemask = copy_mask;
2956 emit(MOV(swizzled_result, orig_val));
2957 }
2958
2959 if (zero_mask) {
2960 swizzled_result.writemask = zero_mask;
2961 emit(MOV(swizzled_result, src_reg(0.0f)));
2962 }
2963
2964 if (one_mask) {
2965 swizzled_result.writemask = one_mask;
2966 emit(MOV(swizzled_result, src_reg(1.0f)));
2967 }
2968 }
2969
2970 void
2971 vec4_visitor::visit(ir_return *)
2972 {
2973 unreachable("not reached");
2974 }
2975
2976 void
2977 vec4_visitor::visit(ir_discard *)
2978 {
2979 unreachable("not reached");
2980 }
2981
2982 void
2983 vec4_visitor::visit(ir_if *ir)
2984 {
2985 /* Don't point the annotation at the if statement, because then it plus
2986 * the then and else blocks get printed.
2987 */
2988 this->base_ir = ir->condition;
2989
2990 if (devinfo->gen == 6) {
2991 emit_if_gen6(ir);
2992 } else {
2993 enum brw_predicate predicate;
2994 emit_bool_to_cond_code(ir->condition, &predicate);
2995 emit(IF(predicate));
2996 }
2997
2998 visit_instructions(&ir->then_instructions);
2999
3000 if (!ir->else_instructions.is_empty()) {
3001 this->base_ir = ir->condition;
3002 emit(BRW_OPCODE_ELSE);
3003
3004 visit_instructions(&ir->else_instructions);
3005 }
3006
3007 this->base_ir = ir->condition;
3008 emit(BRW_OPCODE_ENDIF);
3009 }
3010
3011 void
3012 vec4_visitor::gs_emit_vertex(int stream_id)
3013 {
3014 unreachable("not reached");
3015 }
3016
3017 void
3018 vec4_visitor::visit(ir_emit_vertex *)
3019 {
3020 unreachable("not reached");
3021 }
3022
3023 void
3024 vec4_visitor::gs_end_primitive()
3025 {
3026 unreachable("not reached");
3027 }
3028
3029
3030 void
3031 vec4_visitor::visit(ir_end_primitive *)
3032 {
3033 unreachable("not reached");
3034 }
3035
3036 void
3037 vec4_visitor::visit(ir_barrier *)
3038 {
3039 unreachable("not reached");
3040 }
3041
3042 void
3043 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3044 dst_reg dst, src_reg offset,
3045 src_reg src0, src_reg src1)
3046 {
3047 unsigned mlen = 0;
3048
3049 /* Set the atomic operation offset. */
3050 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3051 mlen++;
3052
3053 /* Set the atomic operation arguments. */
3054 if (src0.file != BAD_FILE) {
3055 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3056 mlen++;
3057 }
3058
3059 if (src1.file != BAD_FILE) {
3060 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3061 mlen++;
3062 }
3063
3064 /* Emit the instruction. Note that this maps to the normal SIMD8
3065 * untyped atomic message on Ivy Bridge, but that's OK because
3066 * unused channels will be masked out.
3067 */
3068 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3069 brw_message_reg(0),
3070 src_reg(surf_index), src_reg(atomic_op));
3071 inst->mlen = mlen;
3072 }
3073
3074 void
3075 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3076 src_reg offset)
3077 {
3078 /* Set the surface read offset. */
3079 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3080
3081 /* Emit the instruction. Note that this maps to the normal SIMD8
3082 * untyped surface read message, but that's OK because unused
3083 * channels will be masked out.
3084 */
3085 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3086 brw_message_reg(0),
3087 src_reg(surf_index), src_reg(1));
3088 inst->mlen = 1;
3089 }
3090
3091 void
3092 vec4_visitor::emit_ndc_computation()
3093 {
3094 /* Get the position */
3095 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3096
3097 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3098 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3099 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3100
3101 current_annotation = "NDC";
3102 dst_reg ndc_w = ndc;
3103 ndc_w.writemask = WRITEMASK_W;
3104 src_reg pos_w = pos;
3105 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3106 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3107
3108 dst_reg ndc_xyz = ndc;
3109 ndc_xyz.writemask = WRITEMASK_XYZ;
3110
3111 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3112 }
3113
3114 void
3115 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3116 {
3117 if (devinfo->gen < 6 &&
3118 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3119 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3120 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3121 dst_reg header1_w = header1;
3122 header1_w.writemask = WRITEMASK_W;
3123
3124 emit(MOV(header1, 0u));
3125
3126 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3127 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3128
3129 current_annotation = "Point size";
3130 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3131 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3132 }
3133
3134 if (key->userclip_active) {
3135 current_annotation = "Clipping flags";
3136 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3137 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3138
3139 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3140 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3141 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3142
3143 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3144 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3145 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3146 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3147 }
3148
3149 /* i965 clipping workaround:
3150 * 1) Test for negative rhw
3151 * 2) If set,
3152 * set ndc = (0,0,0,0)
3153 * set ucp[6] = 1
3154 *
3155 * Later, clipping will detect ucp[6] and ensure the primitive is
3156 * clipped against all fixed planes.
3157 */
3158 if (devinfo->has_negative_rhw_bug) {
3159 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3160 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3161 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3162 vec4_instruction *inst;
3163 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3164 inst->predicate = BRW_PREDICATE_NORMAL;
3165 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3166 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3167 inst->predicate = BRW_PREDICATE_NORMAL;
3168 }
3169
3170 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3171 } else if (devinfo->gen < 6) {
3172 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3173 } else {
3174 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3175 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3176 dst_reg reg_w = reg;
3177 reg_w.writemask = WRITEMASK_W;
3178 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3179 reg_as_src.type = reg_w.type;
3180 reg_as_src.swizzle = brw_swizzle_for_size(1);
3181 emit(MOV(reg_w, reg_as_src));
3182 }
3183 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3184 dst_reg reg_y = reg;
3185 reg_y.writemask = WRITEMASK_Y;
3186 reg_y.type = BRW_REGISTER_TYPE_D;
3187 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3188 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3189 }
3190 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3191 dst_reg reg_z = reg;
3192 reg_z.writemask = WRITEMASK_Z;
3193 reg_z.type = BRW_REGISTER_TYPE_D;
3194 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3195 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3196 }
3197 }
3198 }
3199
3200 void
3201 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3202 {
3203 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3204 *
3205 * "If a linked set of shaders forming the vertex stage contains no
3206 * static write to gl_ClipVertex or gl_ClipDistance, but the
3207 * application has requested clipping against user clip planes through
3208 * the API, then the coordinate written to gl_Position is used for
3209 * comparison against the user clip planes."
3210 *
3211 * This function is only called if the shader didn't write to
3212 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3213 * if the user wrote to it; otherwise we use gl_Position.
3214 */
3215 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3216 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3217 clip_vertex = VARYING_SLOT_POS;
3218 }
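/* Each iteration below computes one clip distance as a DP4 of the clip
 * vertex against a user plane; emit_vertex() calls this twice, with offset
 * 0 and 4, to cover up to eight user clip planes.
 */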
3219
3220 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3221 ++i) {
3222 reg.writemask = 1 << i;
3223 emit(DP4(reg,
3224 src_reg(output_reg[clip_vertex]),
3225 src_reg(this->userplane[i + offset])));
3226 }
3227 }
3228
3229 vec4_instruction *
3230 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3231 {
3232 assert(varying < VARYING_SLOT_MAX);
3233 assert(output_reg[varying].type == reg.type);
3234 current_annotation = output_reg_annotation[varying];
3235 /* Copy the register, saturating if necessary */
3236 return emit(MOV(reg, src_reg(output_reg[varying])));
3237 }
3238
3239 void
3240 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3241 {
3242 reg.type = BRW_REGISTER_TYPE_F;
3243 output_reg[varying].type = reg.type;
3244
3245 switch (varying) {
3246 case VARYING_SLOT_PSIZ:
3247 {
3248 /* PSIZ is always in slot 0, and is coupled with other flags. */
3249 current_annotation = "indices, point width, clip flags";
3250 emit_psiz_and_flags(reg);
3251 break;
3252 }
3253 case BRW_VARYING_SLOT_NDC:
3254 current_annotation = "NDC";
3255 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3256 break;
3257 case VARYING_SLOT_POS:
3258 current_annotation = "gl_Position";
3259 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3260 break;
3261 case VARYING_SLOT_EDGE:
3262 /* This is present when doing unfilled polygons. We're supposed to copy
3263 * the edge flag from the user-provided vertex array
3264 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3265 * of that attribute (starts as 1.0f). This is then used in clipping to
3266 * determine which edges should be drawn as wireframe.
3267 */
3268 current_annotation = "edge flag";
3269 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3270 glsl_type::float_type, WRITEMASK_XYZW))));
3271 break;
3272 case BRW_VARYING_SLOT_PAD:
3273 /* No need to write to this slot */
3274 break;
3275 case VARYING_SLOT_COL0:
3276 case VARYING_SLOT_COL1:
3277 case VARYING_SLOT_BFC0:
3278 case VARYING_SLOT_BFC1: {
3279 /* These built-in varyings are only supported in compatibility mode,
3280 * and we only support GS in core profile. So, this must be a vertex
3281 * shader.
3282 */
3283 assert(stage == MESA_SHADER_VERTEX);
3284 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3285 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3286 inst->saturate = true;
3287 break;
3288 }
3289
3290 default:
3291 emit_generic_urb_slot(reg, varying);
3292 break;
3293 }
3294 }
3295
3296 static int
3297 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3298 {
3299 if (devinfo->gen >= 6) {
3300 /* URB data written (does not include the message header reg) must
3301 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3302 * section 5.4.3.2.2: URB_INTERLEAVED.
3303 *
3304 * URB entries are allocated on a multiple of 1024 bits, so an
3305 * extra 128 bits written here to make the end align to 256 is
3306 * no problem.
3307 */
3308 if ((mlen % 2) != 1)
3309 mlen++;
3310 }
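/* For example, mlen == 4 (header plus three data registers) leaves an odd
 * number of data registers, so it is bumped to 5 and the write then covers
 * four data registers, i.e. two full 256-bit rows.
 */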
3311
3312 return mlen;
3313 }
3314
3315
3316 /**
3317 * Generates the VUE payload plus the necessary URB write instructions to
3318 * output it.
3319 *
3320 * The VUE layout is documented in Volume 2a.
3321 */
3322 void
3323 vec4_visitor::emit_vertex()
3324 {
3325 /* MRF 0 is reserved for the debugger, so start with message header
3326 * in MRF 1.
3327 */
3328 int base_mrf = 1;
3329 int mrf = base_mrf;
3330 /* In the process of generating our URB write message contents, we
3331 * may need to unspill a register or load from an array. Those
3332 * reads would use MRFs 14-15.
3333 */
3334 int max_usable_mrf = 13;
3335
3336 /* The following assertion verifies that max_usable_mrf causes an
3337 * even-numbered amount of URB write data, which will meet gen6's
3338 * requirements for length alignment.
3339 */
3340 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3341
3342 /* First mrf is the g0-based message header containing URB handles and
3343 * such.
3344 */
3345 emit_urb_write_header(mrf++);
3346
3347 if (devinfo->gen < 6) {
3348 emit_ndc_computation();
3349 }
3350
3351 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3352 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3353 current_annotation = "user clip distances";
3354
3355 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3356 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3357
3358 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3359 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3360 }
3361
3362 /* We may need to split this up into several URB writes, so do them in a
3363 * loop.
3364 */
3365 int slot = 0;
3366 bool complete = false;
3367 do {
3368 /* URB offset is in URB row increments, and each of our MRFs is half of
3369 * one of those, since we're doing interleaved writes.
3370 */
3371 int offset = slot / 2;
3372
3373 mrf = base_mrf + 1;
3374 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3375 emit_urb_slot(dst_reg(MRF, mrf++),
3376 prog_data->vue_map.slot_to_varying[slot]);
3377
3378 /* If this was max_usable_mrf, we can't fit anything more into this
3379 * URB WRITE.
3380 */
3381 if (mrf > max_usable_mrf) {
3382 slot++;
3383 break;
3384 }
3385 }
3386
3387 complete = slot >= prog_data->vue_map.num_slots;
3388 current_annotation = "URB write";
3389 vec4_instruction *inst = emit_urb_write_opcode(complete);
3390 inst->base_mrf = base_mrf;
3391 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3392 inst->offset += offset;
3393 } while (!complete);
3394 }
3395
3396
3397 src_reg
3398 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3399 src_reg *reladdr, int reg_offset)
3400 {
3401 /* Because we store the values to scratch interleaved like our
3402 * vertex data, we need to scale the vec4 index by 2.
3403 */
3404 int message_header_scale = 2;
3405
3406 /* Pre-gen6, the message header uses byte offsets instead of vec4
3407 * (16-byte) offset units.
3408 */
3409 if (devinfo->gen < 6)
3410 message_header_scale *= 16;
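/* So the non-reladdr path below returns reg_offset * 2 on Gen6+ and
 * reg_offset * 32 bytes on older parts, e.g. 6 or 96 for reg_offset == 3.
 */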
3411
3412 if (reladdr) {
3413 src_reg index = src_reg(this, glsl_type::int_type);
3414
3415 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3416 src_reg(reg_offset)));
3417 emit_before(block, inst, MUL(dst_reg(index), index,
3418 src_reg(message_header_scale)));
3419
3420 return index;
3421 } else {
3422 return src_reg(reg_offset * message_header_scale);
3423 }
3424 }
3425
3426 src_reg
3427 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3428 src_reg *reladdr, int reg_offset)
3429 {
3430 if (reladdr) {
3431 src_reg index = src_reg(this, glsl_type::int_type);
3432
3433 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3434 src_reg(reg_offset)));
3435
3436 /* Pre-gen6, the message header uses byte offsets instead of vec4
3437 * (16-byte) offset units.
3438 */
3439 if (devinfo->gen < 6) {
3440 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3441 }
3442
3443 return index;
3444 } else if (devinfo->gen >= 8) {
3445 /* Store the offset in a GRF so we can send-from-GRF. */
3446 src_reg offset = src_reg(this, glsl_type::int_type);
3447 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3448 return offset;
3449 } else {
3450 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3451 return src_reg(reg_offset * message_header_scale);
3452 }
3453 }
3454
3455 /**
3456 * Emits an instruction before @inst to load the value named by @orig_src
3457 * from scratch space at @base_offset to @temp.
3458 *
3459 * @base_offset is measured in 32-byte units (the size of a register).
3460 */
3461 void
3462 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3463 dst_reg temp, src_reg orig_src,
3464 int base_offset)
3465 {
3466 int reg_offset = base_offset + orig_src.reg_offset;
3467 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3468 reg_offset);
3469
3470 emit_before(block, inst, SCRATCH_READ(temp, index));
3471 }
3472
3473 /**
3474 * Emits an instruction after @inst to store the value to be written
3475 * to @orig_dst to scratch space at @base_offset, from @temp.
3476 *
3477 * @base_offset is measured in 32-byte units (the size of a register).
3478 */
3479 void
3480 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3481 int base_offset)
3482 {
3483 int reg_offset = base_offset + inst->dst.reg_offset;
3484 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3485 reg_offset);
3486
3487 /* Create a temporary register to store *inst's result in.
3488 *
3489 * We have to be careful in MOVing from our temporary result register in
3490 * the scratch write. If we swizzle from channels of the temporary that
3491 * weren't initialized, it will confuse live interval analysis, which will
3492 * make spilling fail to make progress.
3493 */
3494 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3495 inst->dst.type),
3496 brw_swizzle_for_mask(inst->dst.writemask));
3497 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3498 inst->dst.writemask));
3499 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3500 if (inst->opcode != BRW_OPCODE_SEL)
3501 write->predicate = inst->predicate;
3502 write->ir = inst->ir;
3503 write->annotation = inst->annotation;
3504 inst->insert_after(block, write);
3505
3506 inst->dst.file = temp.file;
3507 inst->dst.reg = temp.reg;
3508 inst->dst.reg_offset = temp.reg_offset;
3509 inst->dst.reladdr = NULL;
3510 }
3511
3512 /**
3513 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3514 * adds the scratch read(s) before \p inst. The function also checks for
3515 * recursive reladdr scratch accesses, issuing the corresponding scratch
3516 * loads and rewriting reladdr references accordingly.
3517 *
3518 * \return \p src if it did not require a scratch load, otherwise, the
3519 * register holding the result of the scratch load that the caller should
3520 * use to rewrite src.
3521 */
3522 src_reg
3523 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3524 vec4_instruction *inst, src_reg src)
3525 {
3526 /* Resolve recursive reladdr scratch access by calling ourselves
3527 * with src.reladdr
3528 */
3529 if (src.reladdr)
3530 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3531 *src.reladdr);
3532
3533 /* Now handle scratch access on src */
3534 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3535 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3536 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3537 src.reg = temp.reg;
3538 src.reg_offset = temp.reg_offset;
3539 src.reladdr = NULL;
3540 }
3541
3542 return src;
3543 }
3544
3545 /**
3546 * We can't generally support array access in GRF space, because a
3547 * single instruction's destination can only span 2 contiguous
3548 * registers. So, we send all GRF arrays that get variable index
3549 * access to scratch space.
3550 */
3551 void
3552 vec4_visitor::move_grf_array_access_to_scratch()
3553 {
3554 int scratch_loc[this->alloc.count];
3555 memset(scratch_loc, -1, sizeof(scratch_loc));
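/* scratch_loc[i] stays -1 for virtual GRFs that remain in registers;
 * otherwise it holds the base scratch location handed out from last_scratch
 * as array-accessed GRFs are discovered below.
 */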
3556
3557 /* First, calculate the set of virtual GRFs that need to be punted
3558 * to scratch due to having any array access on them, and where in
3559 * scratch.
3560 */
3561 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3562 if (inst->dst.file == GRF && inst->dst.reladdr) {
3563 if (scratch_loc[inst->dst.reg] == -1) {
3564 scratch_loc[inst->dst.reg] = last_scratch;
3565 last_scratch += this->alloc.sizes[inst->dst.reg];
3566 }
3567
3568 for (src_reg *iter = inst->dst.reladdr;
3569 iter->reladdr;
3570 iter = iter->reladdr) {
3571 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3572 scratch_loc[iter->reg] = last_scratch;
3573 last_scratch += this->alloc.sizes[iter->reg];
3574 }
3575 }
3576 }
3577
3578 for (int i = 0 ; i < 3; i++) {
3579 for (src_reg *iter = &inst->src[i];
3580 iter->reladdr;
3581 iter = iter->reladdr) {
3582 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3583 scratch_loc[iter->reg] = last_scratch;
3584 last_scratch += this->alloc.sizes[iter->reg];
3585 }
3586 }
3587 }
3588 }
3589
3590 /* Now, for anything that will be accessed through scratch, rewrite
3591 * it to load/store. Note that this is a _safe list walk, because
3592 * we may generate a new scratch_write instruction after the one
3593 * we're processing.
3594 */
3595 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3596 /* Set up the annotation tracking for new generated instructions. */
3597 base_ir = inst->ir;
3598 current_annotation = inst->annotation;
3599
3600 /* First handle scratch access on the dst. Notice we have to handle
3601 * the case where the dst's reladdr also points to scratch space.
3602 */
3603 if (inst->dst.reladdr)
3604 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3605 *inst->dst.reladdr);
3606
3607 /* Now that we have handled any (possibly recursive) reladdr scratch
3608 * accesses for dst we can safely do the scratch write for dst itself
3609 */
3610 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3611 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3612
3613 /* Now handle scratch access on any src. In this case, since inst->src[i]
3614 * already is a src_reg, we can just call emit_resolve_reladdr with
3615 * inst->src[i] and it will take care of handling scratch loads for
3616 * both src and src.reladdr (recursively).
3617 */
3618 for (int i = 0 ; i < 3; i++) {
3619 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3620 inst->src[i]);
3621 }
3622 }
3623 }
3624
3625 /**
3626 * Emits an instruction before @inst to load the value named by @orig_src
3627 * from the pull constant buffer (surface) at @base_offset to @temp.
3628 */
3629 void
3630 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3631 dst_reg temp, src_reg orig_src,
3632 int base_offset)
3633 {
3634 int reg_offset = base_offset + orig_src.reg_offset;
3635 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3636 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3637 reg_offset);
3638
3639 emit_pull_constant_load_reg(temp,
3640 index,
3641 offset,
3642 block, inst);
3643 }
3644
3645 /**
3646 * Implements array access of uniforms by inserting a
3647 * PULL_CONSTANT_LOAD instruction.
3648 *
3649 * Unlike temporary GRF array access (where we don't support it due to
3650 * the difficulty of doing relative addressing on instruction
3651 * destinations), we could potentially do array access of uniforms
3652 * that were loaded in GRF space as push constants. In real-world
3653 * usage we've seen, though, the arrays being used are always larger
3654 * than we could load as push constants, so just always move all
3655 * uniform array access out to a pull constant buffer.
3656 */
3657 void
3658 vec4_visitor::move_uniform_array_access_to_pull_constants()
3659 {
3660 int pull_constant_loc[this->uniforms];
3661 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3662 bool nested_reladdr;
3663
3664 /* Walk through and find array access of uniforms. Put a copy of that
3665 * uniform in the pull constant buffer.
3666 *
3667 * Note that we don't move constant-indexed accesses to arrays. No
3668 * testing has been done of the performance impact of this choice.
3669 */
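/* An array index can itself come from an indirectly addressed uniform
 * (src.reladdr->reladdr).  Rewriting the outer access leaves the inner one
 * to be resolved, so keep making passes until a full pass finds no nested
 * reladdr.
 */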
3670 do {
3671 nested_reladdr = false;
3672
3673 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3674 for (int i = 0; i < 3; i++) {
3675 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3676 continue;
3677
3678 int uniform = inst->src[i].reg;
3679
3680 if (inst->src[i].reladdr->reladdr)
3681 nested_reladdr = true; /* will need another pass */
3682
3683 /* If this array isn't already present in the pull constant buffer,
3684 * add it.
3685 */
3686 if (pull_constant_loc[uniform] == -1) {
3687 const gl_constant_value **values =
3688 &stage_prog_data->param[uniform * 4];
3689
3690 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3691
3692 assert(uniform < uniform_array_size);
3693 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3694 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3695 = values[j];
3696 }
3697 }
3698
3699 /* Set up the annotation tracking for newly generated instructions. */
3700 base_ir = inst->ir;
3701 current_annotation = inst->annotation;
3702
3703 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3704
3705 emit_pull_constant_load(block, inst, temp, inst->src[i],
3706 pull_constant_loc[uniform]);
3707
3708 inst->src[i].file = temp.file;
3709 inst->src[i].reg = temp.reg;
3710 inst->src[i].reg_offset = temp.reg_offset;
3711 inst->src[i].reladdr = NULL;
3712 }
3713 }
3714 } while (nested_reladdr);
3715
3716 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3717 * no need to track them as larger-than-vec4 objects. This will be
3718 * relied on in cutting out unused uniform vectors from push
3719 * constants.
3720 */
3721 split_uniform_registers();
3722 }
3723
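/**
 * If an unsigned-doubleword source has the negate modifier set, resolve the
 * negation by copying it through a MOV into a fresh UD temporary and rewrite
 * the source to use that temporary instead.
 */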
3724 void
3725 vec4_visitor::resolve_ud_negate(src_reg *reg)
3726 {
3727 if (reg->type != BRW_REGISTER_TYPE_UD ||
3728 !reg->negate)
3729 return;
3730
3731 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3732 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3733 *reg = temp;
3734 }
3735
3736 /**
3737 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3738 *
3739 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3740 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3741 */
3742 void
3743 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3744 {
3745 assert(devinfo->gen <= 5);
3746
3747 if (!rvalue->type->is_boolean())
3748 return;
3749
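/* Mask off the undefined upper bits, then negate so that 1 becomes ~0 and 0
 * stays 0, giving a proper boolean value.
 */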
3750 src_reg and_result = src_reg(this, rvalue->type);
3751 src_reg neg_result = src_reg(this, rvalue->type);
3752 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3753 emit(MOV(dst_reg(neg_result), negate(and_result)));
3754 *reg = neg_result;
3755 }
3756
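/* Set up a visitor for a single compile: clear the annotation and failure
 * state, create the ir_variable hash table, and size the uniform bookkeeping
 * arrays from the prog_data that will be filled out.
 */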
3757 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3758 void *log_data,
3759 struct gl_program *prog,
3760 const struct brw_vue_prog_key *key,
3761 struct brw_vue_prog_data *prog_data,
3762 struct gl_shader_program *shader_prog,
3763 gl_shader_stage stage,
3764 void *mem_ctx,
3765 bool no_spills,
3766 int shader_time_index)
3767 : backend_shader(compiler, log_data, mem_ctx,
3768 shader_prog, prog, &prog_data->base, stage),
3769 key(key),
3770 prog_data(prog_data),
3771 sanity_param_count(0),
3772 fail_msg(NULL),
3773 first_non_payload_grf(0),
3774 need_all_constants_in_pull_buffer(false),
3775 no_spills(no_spills),
3776 shader_time_index(shader_time_index),
3777 last_scratch(0)
3778 {
3779 this->failed = false;
3780
3781 this->base_ir = NULL;
3782 this->current_annotation = NULL;
3783 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3784
3785 this->variable_ht = hash_table_ctor(0,
3786 hash_table_pointer_hash,
3787 hash_table_pointer_compare);
3788
3789 this->virtual_grf_start = NULL;
3790 this->virtual_grf_end = NULL;
3791 this->live_intervals = NULL;
3792
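/* On Gen7+ there is no MRF file; message payloads are built in GRFs reserved
 * from GEN7_MRF_HACK_START upward, so only the registers below that point
 * are available to the allocator.
 */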
3793 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3794
3795 this->uniforms = 0;
3796
3797 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3798 * at least one uniform. See setup_uniforms() in brw_vec4.cpp.
3799 */
3800 this->uniform_array_size = 1;
3801 if (prog_data) {
3802 this->uniform_array_size =
3803 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3804 }
3805
3806 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3807 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3808 }
3809
3810 vec4_visitor::~vec4_visitor()
3811 {
3812 hash_table_dtor(this->variable_ht);
3813 }
3814
3815
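/**
 * Mark the compile as failed and record a formatted failure message.  Only
 * the first failure is kept; later calls return immediately.  The message is
 * also printed to stderr when debug output is enabled.
 */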
3816 void
3817 vec4_visitor::fail(const char *format, ...)
3818 {
3819 va_list va;
3820 char *msg;
3821
3822 if (failed)
3823 return;
3824
3825 failed = true;
3826
3827 va_start(va, format);
3828 msg = ralloc_vasprintf(mem_ctx, format, va);
3829 va_end(va);
3830 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3831
3832 this->fail_msg = msg;
3833
3834 if (debug_enabled) {
3835 fprintf(stderr, "%s", msg);
3836 }
3837 }
3838
3839 } /* namespace brw */