i965/vec4: do not predicate scratch writes for BRW_OPCODE_SEL instructions
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
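/* As an illustration of what the list below generates, ALU2(ADD) expands to a
 * convenience builder that creates (but does not emit) the instruction:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * Callers pass the result to emit(), e.g. emit(ADD(dst, a, b)).
 */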
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
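/* A typical use of CMP in this file is to set the flag register for a later
 * predicated instruction, e.g. the ir_binop_all_equal handling below:
 *
 *    emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 *    emit(MOV(result_dst, src_reg(0)));
 *    inst = emit(MOV(result_dst, src_reg(~0)));
 *    inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 *
 * where the second MOV only executes if all four channels compared equal.
 */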
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
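/* For example, a MAD whose operand is a vec4 uniform (or an immediate) u
 * cannot consume it directly, so the code above inserts
 *
 *    unpack_uniform tmp, u      (VEC4_OPCODE_UNPACK_UNIFORM)
 *
 * and the three-source instruction reads tmp instead.  A uniform that
 * already uses a single-value swizzle such as u.xxxx is returned unchanged,
 * since a scalar component can be replicated through the swizzle alone,
 * without the vertical stride of zero that three-source instructions cannot
 * encode.
 */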
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 vec4_instruction *
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 math = emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358
359 return math;
360 }
361
362 void
363 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
364 {
365 if (devinfo->gen < 7) {
366 unreachable("ir_unop_pack_half_2x16 should be lowered");
367 }
368
369 assert(dst.type == BRW_REGISTER_TYPE_UD);
370 assert(src0.type == BRW_REGISTER_TYPE_F);
371
372 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
373 *
374 * Because this instruction does not have a 16-bit floating-point type,
375 * the destination data type must be Word (W).
376 *
377 * The destination must be DWord-aligned and specify a horizontal stride
378 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
379 * each destination channel and the upper word is not modified.
380 *
381 * The above restriction implies that the f32to16 instruction must use
382 * align1 mode, because only in align1 mode is it possible to specify
383 * horizontal stride. We choose here to defy the hardware docs and emit
384 * align16 instructions.
385 *
386 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
387 * instructions. I was partially successful in that the code passed all
388 * tests. However, the code was dubiously correct and fragile, and the
389 * tests were not harsh enough to probe that frailty. Not trusting the
390 * code, I chose instead to remain in align16 mode in defiance of the hw
391 * docs).
392 *
393 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
394 * simulator, emitting a f32to16 in align16 mode with UD as destination
395 * data type is safe. The behavior differs from that specified in the PRM
396 * in that the upper word of each destination channel is cleared to 0.
397 */
398
399 dst_reg tmp_dst(this, glsl_type::uvec2_type);
400 src_reg tmp_src(tmp_dst);
401
402 #if 0
403 /* Verify the undocumented behavior on which the following instructions
404 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
405 * then the result of the bit-or instruction below will be incorrect.
406 *
407 * You should inspect the disasm output in order to verify that the MOV is
408 * not optimized away.
409 */
410 emit(MOV(tmp_dst, src_reg(0x12345678u)));
411 #endif
412
413 /* Give tmp the form below, where "." means untouched.
414 *
415 * w z y x w z y x
416 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
417 *
418 * That the upper word of each write-channel be 0 is required for the
419 * following bit-shift and bit-or instructions to work. Note that this
420 * relies on the undocumented hardware behavior mentioned above.
421 */
422 tmp_dst.writemask = WRITEMASK_XY;
423 emit(F32TO16(tmp_dst, src0));
424
425 /* Give the write-channels of dst the form:
426 * 0xhhhh0000
427 */
428 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
429 emit(SHL(dst, tmp_src, src_reg(16u)));
430
431 /* Finally, give the write-channels of dst the form of packHalf2x16's
432 * output:
433 * 0xhhhhllll
434 */
435 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
436 emit(OR(dst, src_reg(dst), tmp_src));
437 }
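/* Illustrative walk-through of the sequence above for one channel: if
 * f32to16 converts src0.x to the half-float bits 0xllll and src0.y to
 * 0xhhhh, then
 *
 *    tmp = |.|.|0x0000hhhh|0x0000llll|   (F32TO16, writemask .xy)
 *    dst = 0xhhhh0000                    (SHL of tmp.yyyy by 16)
 *    dst = 0xhhhhllll                    (OR with tmp.xxxx)
 *
 * which matches GLSL packHalf2x16(): the first component ends up in the
 * least significant half of the result.
 */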
438
439 void
440 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
441 {
442 if (devinfo->gen < 7) {
443 unreachable("ir_unop_unpack_half_2x16 should be lowered");
444 }
445
446 assert(dst.type == BRW_REGISTER_TYPE_F);
447 assert(src0.type == BRW_REGISTER_TYPE_UD);
448
449 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
450 *
451 * Because this instruction does not have a 16-bit floating-point type,
452 * the source data type must be Word (W). The destination type must be
453 * F (Float).
454 *
455 * To use W as the source data type, we must adjust horizontal strides,
456 * which is only possible in align1 mode. All my [chadv] attempts at
457 * emitting align1 instructions for unpackHalf2x16 failed to pass the
458 * Piglit tests, so I gave up.
459 *
460 * I've verified that, on gen7 hardware and the simulator, it is safe to
461 * emit f16to32 in align16 mode with UD as source data type.
462 */
463
464 dst_reg tmp_dst(this, glsl_type::uvec2_type);
465 src_reg tmp_src(tmp_dst);
466
467 tmp_dst.writemask = WRITEMASK_X;
468 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
469
470 tmp_dst.writemask = WRITEMASK_Y;
471 emit(SHR(tmp_dst, src0, src_reg(16u)));
472
473 dst.writemask = WRITEMASK_XY;
474 emit(F16TO32(dst, tmp_src));
475 }
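/* Illustrative walk-through: for src0.x = 0xhhhhllll the sequence above
 * produces
 *
 *    tmp.x  = 0x0000llll           (AND with 0xffff)
 *    tmp.y  = 0x0000hhhh           (SHR by 16)
 *    dst.xy = f16to32(tmp.xy)
 *
 * so dst.x receives the value packed in the low half and dst.y the value in
 * the high half, matching GLSL unpackHalf2x16().
 */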
476
477 void
478 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
479 {
480 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
481 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
482 * is not suitable to generate the shift values, but we can use the packed
483 * vector float and a type-converting MOV.
484 */
485 dst_reg shift(this, glsl_type::uvec4_type);
486 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
487
488 dst_reg shifted(this, glsl_type::uvec4_type);
489 src0.swizzle = BRW_SWIZZLE_XXXX;
490 emit(SHR(shifted, src0, src_reg(shift)));
491
492 shifted.type = BRW_REGISTER_TYPE_UB;
493 dst_reg f(this, glsl_type::vec4_type);
494 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
495
496 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
497 }
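/* A note on the magic immediate above: 0x00, 0x60, 0x70 and 0x78 are the
 * 8-bit restricted-float (VF) encodings of 0.0, 8.0, 16.0 and 24.0, so the
 * type-converting MOV leaves the shift counts <0, 8, 16, 24> in the uvec4.
 * For src0 = 0xwwzzyyxx the SHR then yields per-channel values whose low
 * byte is 0xxx, 0xyy, 0xzz and 0xww respectively; the UB-typed MOV_BYTES
 * keeps just that byte, and the final MUL scales it by 1/255.  The least
 * significant byte of the input therefore lands in .x, as unpackUnorm4x8()
 * requires.
 */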
498
499 void
500 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
501 {
502 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
503 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
504 * is not suitable to generate the shift values, but we can use the packed
505 * vector float and a type-converting MOV.
506 */
507 dst_reg shift(this, glsl_type::uvec4_type);
508 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
509
510 dst_reg shifted(this, glsl_type::uvec4_type);
511 src0.swizzle = BRW_SWIZZLE_XXXX;
512 emit(SHR(shifted, src0, src_reg(shift)));
513
514 shifted.type = BRW_REGISTER_TYPE_B;
515 dst_reg f(this, glsl_type::vec4_type);
516 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
517
518 dst_reg scaled(this, glsl_type::vec4_type);
519 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
520
521 dst_reg max(this, glsl_type::vec4_type);
522 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
523 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
524 }
525
526 void
527 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
528 {
529 dst_reg saturated(this, glsl_type::vec4_type);
530 vec4_instruction *inst = emit(MOV(saturated, src0));
531 inst->saturate = true;
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
535
536 dst_reg rounded(this, glsl_type::vec4_type);
537 emit(RNDE(rounded, src_reg(scaled)));
538
539 dst_reg u(this, glsl_type::uvec4_type);
540 emit(MOV(u, src_reg(rounded)));
541
542 src_reg bytes(u);
543 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
544 }
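/* Worked example (assuming VEC4_OPCODE_PACK_BYTES stores .x in the least
 * significant byte, as packUnorm4x8() requires): the input
 * vec4(1.0, 0.5, 0.0, 0.25) saturates to itself, scales to
 * (255.0, 127.5, 0.0, 63.75), rounds to even as (255, 128, 0, 64) and is
 * packed into 0x400080ff.
 */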
545
546 void
547 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
548 {
549 dst_reg max(this, glsl_type::vec4_type);
550 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
551
552 dst_reg min(this, glsl_type::vec4_type);
553 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
554
555 dst_reg scaled(this, glsl_type::vec4_type);
556 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
557
558 dst_reg rounded(this, glsl_type::vec4_type);
559 emit(RNDE(rounded, src_reg(scaled)));
560
561 dst_reg i(this, glsl_type::ivec4_type);
562 emit(MOV(i, src_reg(rounded)));
563
564 src_reg bytes(i);
565 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
566 }
567
568 void
569 vec4_visitor::visit_instructions(const exec_list *list)
570 {
571 foreach_in_list(ir_instruction, ir, list) {
572 base_ir = ir;
573 ir->accept(this);
574 }
575 }
576
577 /**
578 * Returns the minimum number of vec4 elements needed to pack a type.
579 *
580 * For simple types, it will return 1 (a single vec4); for matrices, the
581 * number of columns; for array and struct, the sum of the vec4_size of
582 * each of its elements; and for sampler and atomic, zero.
583 *
584 * This method is useful to calculate how much register space is needed to
585 * store a particular type.
586 */
587 int
588 vec4_visitor::type_size(const struct glsl_type *type)
589 {
590 unsigned int i;
591 int size;
592
593 switch (type->base_type) {
594 case GLSL_TYPE_UINT:
595 case GLSL_TYPE_INT:
596 case GLSL_TYPE_FLOAT:
597 case GLSL_TYPE_BOOL:
598 if (type->is_matrix()) {
599 return type->matrix_columns;
600 } else {
601 /* Regardless of size of vector, it gets a vec4. This is bad
602 * packing for things like floats, but otherwise arrays become a
603 * mess. Hopefully a later pass over the code can pack scalars
604 * down if appropriate.
605 */
606 return 1;
607 }
608 case GLSL_TYPE_ARRAY:
609 assert(type->length > 0);
610 return type_size(type->fields.array) * type->length;
611 case GLSL_TYPE_STRUCT:
612 size = 0;
613 for (i = 0; i < type->length; i++) {
614 size += type_size(type->fields.structure[i].type);
615 }
616 return size;
617 case GLSL_TYPE_SUBROUTINE:
618 return 1;
619
620 case GLSL_TYPE_SAMPLER:
621 /* Samplers take up no register space, since they're baked in at
622 * link time.
623 */
624 return 0;
625 case GLSL_TYPE_ATOMIC_UINT:
626 return 0;
627 case GLSL_TYPE_IMAGE:
628 case GLSL_TYPE_VOID:
629 case GLSL_TYPE_DOUBLE:
630 case GLSL_TYPE_ERROR:
631 case GLSL_TYPE_INTERFACE:
632 unreachable("not reached");
633 }
634
635 return 0;
636 }
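/* Some concrete values this returns: a float, bool or vec3 occupies one
 * vec4 slot; mat4 occupies 4 (one per column); float[10] occupies 10;
 * struct { vec3 a; mat3 b; } occupies 1 + 3 = 4; a sampler or atomic
 * counter occupies 0.
 */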
637
638 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
639 {
640 init();
641
642 this->file = GRF;
643 this->reg = v->alloc.allocate(v->type_size(type));
644
645 if (type->is_array() || type->is_record()) {
646 this->swizzle = BRW_SWIZZLE_NOOP;
647 } else {
648 this->swizzle = brw_swizzle_for_size(type->vector_elements);
649 }
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
655 {
656 assert(size > 0);
657
658 init();
659
660 this->file = GRF;
661 this->reg = v->alloc.allocate(v->type_size(type) * size);
662
663 this->swizzle = BRW_SWIZZLE_NOOP;
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
669 {
670 init();
671
672 this->file = GRF;
673 this->reg = v->alloc.allocate(v->type_size(type));
674
675 if (type->is_array() || type->is_record()) {
676 this->writemask = WRITEMASK_XYZW;
677 } else {
678 this->writemask = (1 << type->vector_elements) - 1;
679 }
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 void
685 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
686 unsigned n)
687 {
688 static const gl_constant_value zero = { 0 };
689
690 for (unsigned i = 0; i < n; ++i)
691 stage_prog_data->param[4 * uniforms + i] = &values[i];
692
693 for (unsigned i = n; i < 4; ++i)
694 stage_prog_data->param[4 * uniforms + i] = &zero;
695
696 uniform_vector_size[uniforms++] = n;
697 }
698
699 /* Our support for uniforms is piggy-backed on the struct
700 * gl_fragment_program, because that's where the values actually
701 * get stored, rather than in some global gl_shader_program uniform
702 * store.
703 */
704 void
705 vec4_visitor::setup_uniform_values(ir_variable *ir)
706 {
707 int namelen = strlen(ir->name);
708
709 /* The data for our (non-builtin) uniforms is stored in a series of
710 * gl_uniform_driver_storage structs for each subcomponent that
711 * glGetUniformLocation() could name. We know it's been set up in the same
712 * order we'd walk the type, so walk the list of storage and find anything
713 * with our name, or the prefix of a component that starts with our name.
714 */
715 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
716 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
717
718 if (storage->builtin)
719 continue;
720
721 if (strncmp(ir->name, storage->name, namelen) != 0 ||
722 (storage->name[namelen] != 0 &&
723 storage->name[namelen] != '.' &&
724 storage->name[namelen] != '[')) {
725 continue;
726 }
727
728 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
729 storage->type->matrix_columns);
730 const unsigned vector_size = storage->type->vector_elements;
731
732 for (unsigned s = 0; s < vector_count; s++)
733 setup_vector_uniform_values(&storage->storage[s * vector_size],
734 vector_size);
735 }
736 }
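/* For example, a uniform declared as "uniform mat3 m[2]" matches one
 * gl_uniform_storage entry with array_elements = 2, matrix_columns = 3 and
 * vector_elements = 3, so the loop above uploads vector_count = 2 * 3 = 6
 * vec4 slots of three components each (the unused fourth component of every
 * slot is padded with zero by setup_vector_uniform_values).
 */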
737
738 void
739 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
740 {
741 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
742 assert(this->uniforms < uniform_array_size);
743 this->uniform_vector_size[this->uniforms] = 4;
744 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
745 this->userplane[i].type = BRW_REGISTER_TYPE_F;
746 for (int j = 0; j < 4; ++j) {
747 stage_prog_data->param[this->uniforms * 4 + j] =
748 (gl_constant_value *) &clip_planes[i][j];
749 }
750 ++this->uniforms;
751 }
752 }
753
754 /* Our support for builtin uniforms is even scarier than non-builtin.
755 * It sits on top of the PROG_STATE_VAR parameters that are
756 * automatically updated from GL context state.
757 */
758 void
759 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
760 {
761 const ir_state_slot *const slots = ir->get_state_slots();
762 assert(slots != NULL);
763
764 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
765 /* This state reference has already been setup by ir_to_mesa,
766 * but we'll get the same index back here. We can reference
767 * ParameterValues directly, since unlike brw_fs.cpp, we never
768 * add new state references during compile.
769 */
770 int index = _mesa_add_state_reference(this->prog->Parameters,
771 (gl_state_index *)slots[i].tokens);
772 gl_constant_value *values =
773 &this->prog->Parameters->ParameterValues[index][0];
774
775 assert(this->uniforms < uniform_array_size);
776
777 for (unsigned j = 0; j < 4; j++)
778 stage_prog_data->param[this->uniforms * 4 + j] =
779 &values[GET_SWZ(slots[i].swizzle, j)];
780
781 this->uniform_vector_size[this->uniforms] =
782 (ir->type->is_scalar() || ir->type->is_vector() ||
783 ir->type->is_matrix() ? ir->type->vector_elements : 4);
784
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 if (devinfo->gen <= 5) {
823 src_reg temp = src_reg(this, ir->type);
824 emit(XOR(dst_reg(temp), op[0], op[1]));
825 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
826 } else {
827 inst = emit(XOR(dst_null_d(), op[0], op[1]));
828 }
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 break;
831
832 case ir_binop_logic_or:
833 if (devinfo->gen <= 5) {
834 src_reg temp = src_reg(this, ir->type);
835 emit(OR(dst_reg(temp), op[0], op[1]));
836 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
837 } else {
838 inst = emit(OR(dst_null_d(), op[0], op[1]));
839 }
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 break;
842
843 case ir_binop_logic_and:
844 if (devinfo->gen <= 5) {
845 src_reg temp = src_reg(this, ir->type);
846 emit(AND(dst_reg(temp), op[0], op[1]));
847 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
848 } else {
849 inst = emit(AND(dst_null_d(), op[0], op[1]));
850 }
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 break;
853
854 case ir_unop_f2b:
855 if (devinfo->gen >= 6) {
856 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
857 } else {
858 inst = emit(MOV(dst_null_f(), op[0]));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 }
861 break;
862
863 case ir_unop_i2b:
864 if (devinfo->gen >= 6) {
865 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 } else {
867 inst = emit(MOV(dst_null_d(), op[0]));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 }
870 break;
871
872 case ir_binop_all_equal:
873 if (devinfo->gen <= 5) {
874 resolve_bool_comparison(expr->operands[0], &op[0]);
875 resolve_bool_comparison(expr->operands[1], &op[1]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
878 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
879 break;
880
881 case ir_binop_any_nequal:
882 if (devinfo->gen <= 5) {
883 resolve_bool_comparison(expr->operands[0], &op[0]);
884 resolve_bool_comparison(expr->operands[1], &op[1]);
885 }
886 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
887 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
888 break;
889
890 case ir_unop_any:
891 if (devinfo->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 }
894 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
896 break;
897
898 case ir_binop_greater:
899 case ir_binop_gequal:
900 case ir_binop_less:
901 case ir_binop_lequal:
902 case ir_binop_equal:
903 case ir_binop_nequal:
904 if (devinfo->gen <= 5) {
905 resolve_bool_comparison(expr->operands[0], &op[0]);
906 resolve_bool_comparison(expr->operands[1], &op[1]);
907 }
908 emit(CMP(dst_null_d(), op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 break;
911
912 case ir_triop_csel: {
913 /* Expand the boolean condition into the flag register. */
914 inst = emit(MOV(dst_null_d(), op[0]));
915 inst->conditional_mod = BRW_CONDITIONAL_NZ;
916
917 /* Select which boolean to return. */
918 dst_reg temp(this, expr->operands[1]->type);
919 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
920 inst->predicate = BRW_PREDICATE_NORMAL;
921
922 /* Expand the result to a condition code. */
923 inst = emit(MOV(dst_null_d(), src_reg(temp)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 break;
926 }
927
928 default:
929 unreachable("not reached");
930 }
931 return;
932 }
933
934 ir->accept(this);
935
936 resolve_ud_negate(&this->result);
937
938 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
939 inst->conditional_mod = BRW_CONDITIONAL_NZ;
940 }
941
942 /**
943 * Emit a gen6 IF statement with the comparison folded into the IF
944 * instruction.
945 */
946 void
947 vec4_visitor::emit_if_gen6(ir_if *ir)
948 {
949 ir_expression *expr = ir->condition->as_expression();
950
951 if (expr && expr->operation != ir_binop_ubo_load) {
952 src_reg op[3];
953 dst_reg temp;
954
955 assert(expr->get_num_operands() <= 3);
956 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
957 expr->operands[i]->accept(this);
958 op[i] = this->result;
959 }
960
961 switch (expr->operation) {
962 case ir_unop_logic_not:
963 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
964 return;
965
966 case ir_binop_logic_xor:
967 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_binop_logic_or:
971 temp = dst_reg(this, glsl_type::bool_type);
972 emit(OR(temp, op[0], op[1]));
973 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_and:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(AND(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_unop_f2b:
983 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_i2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_binop_greater:
991 case ir_binop_gequal:
992 case ir_binop_less:
993 case ir_binop_lequal:
994 case ir_binop_equal:
995 case ir_binop_nequal:
996 emit(IF(op[0], op[1],
997 brw_conditional_for_comparison(expr->operation)));
998 return;
999
1000 case ir_binop_all_equal:
1001 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1002 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1003 return;
1004
1005 case ir_binop_any_nequal:
1006 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1007 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1008 return;
1009
1010 case ir_unop_any:
1011 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1012 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1013 return;
1014
1015 case ir_triop_csel: {
1016 /* Expand the boolean condition into the flag register. */
1017 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1018 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1019
1020 /* Select which boolean to return. */
1021 dst_reg temp(this, expr->operands[1]->type);
1022 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1023 inst->predicate = BRW_PREDICATE_NORMAL;
1024
1025 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1026 return;
1027 }
1028
1029 default:
1030 unreachable("not reached");
1031 }
1032 return;
1033 }
1034
1035 ir->condition->accept(this);
1036
1037 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_variable *ir)
1042 {
1043 dst_reg *reg = NULL;
1044
1045 if (variable_storage(ir))
1046 return;
1047
1048 switch (ir->data.mode) {
1049 case ir_var_shader_in:
1050 assert(ir->data.location != -1);
1051 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1052 break;
1053
1054 case ir_var_shader_out:
1055 assert(ir->data.location != -1);
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057
1058 for (int i = 0; i < type_size(ir->type); i++) {
1059 output_reg[ir->data.location + i] = *reg;
1060 output_reg[ir->data.location + i].reg_offset = i;
1061 output_reg_annotation[ir->data.location + i] = ir->name;
1062 }
1063 break;
1064
1065 case ir_var_auto:
1066 case ir_var_temporary:
1067 reg = new(mem_ctx) dst_reg(this, ir->type);
1068 break;
1069
1070 case ir_var_uniform:
1071 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1072
1073 /* Thanks to the lower_ubo_reference pass, we will see only
1074 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1075 * variables, so no need for them to be in variable_ht.
1076 *
1077 * Some uniforms, such as samplers and atomic counters, have no actual
1078 * storage, so we should ignore them.
1079 */
1080 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1081 return;
1082
1083 /* Track how big the whole uniform variable is, in case we need to put a
1084 * copy of its data into pull constants for array access.
1085 */
1086 assert(this->uniforms < uniform_array_size);
1087 this->uniform_size[this->uniforms] = type_size(ir->type);
1088
1089 if (!strncmp(ir->name, "gl_", 3)) {
1090 setup_builtin_uniform_values(ir);
1091 } else {
1092 setup_uniform_values(ir);
1093 }
1094 break;
1095
1096 case ir_var_system_value:
1097 reg = make_reg_for_system_value(ir->data.location, ir->type);
1098 break;
1099
1100 default:
1101 unreachable("not reached");
1102 }
1103
1104 reg->type = brw_type_for_base_type(ir->type);
1105 hash_table_insert(this->variable_ht, reg, ir);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop *ir)
1110 {
1111 /* We don't want debugging output to print the whole body of the
1112 * loop as the annotation.
1113 */
1114 this->base_ir = NULL;
1115
1116 emit(BRW_OPCODE_DO);
1117
1118 visit_instructions(&ir->body_instructions);
1119
1120 emit(BRW_OPCODE_WHILE);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop_jump *ir)
1125 {
1126 switch (ir->mode) {
1127 case ir_loop_jump::jump_break:
1128 emit(BRW_OPCODE_BREAK);
1129 break;
1130 case ir_loop_jump::jump_continue:
1131 emit(BRW_OPCODE_CONTINUE);
1132 break;
1133 }
1134 }
1135
1136
1137 void
1138 vec4_visitor::visit(ir_function_signature *)
1139 {
1140 unreachable("not reached");
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_function *ir)
1145 {
1146 /* Ignore function bodies other than main() -- we shouldn't see calls to
1147 * them since they should all be inlined.
1148 */
1149 if (strcmp(ir->name, "main") == 0) {
1150 const ir_function_signature *sig;
1151 exec_list empty;
1152
1153 sig = ir->matching_signature(NULL, &empty, false);
1154
1155 assert(sig);
1156
1157 visit_instructions(&sig->body);
1158 }
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_mad(ir_expression *ir)
1163 {
1164 /* 3-src instructions were introduced in gen6. */
1165 if (devinfo->gen < 6)
1166 return false;
1167
1168 /* MAD can only handle floating-point data. */
1169 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1170 return false;
1171
1172 ir_rvalue *nonmul;
1173 ir_expression *mul;
1174 bool mul_negate, mul_abs;
1175
1176 for (int i = 0; i < 2; i++) {
1177 mul_negate = false;
1178 mul_abs = false;
1179
1180 mul = ir->operands[i]->as_expression();
1181 nonmul = ir->operands[1 - i];
1182
1183 if (mul && mul->operation == ir_unop_abs) {
1184 mul = mul->operands[0]->as_expression();
1185 mul_abs = true;
1186 } else if (mul && mul->operation == ir_unop_neg) {
1187 mul = mul->operands[0]->as_expression();
1188 mul_negate = true;
1189 }
1190
1191 if (mul && mul->operation == ir_binop_mul)
1192 break;
1193 }
1194
1195 if (!mul || mul->operation != ir_binop_mul)
1196 return false;
1197
1198 nonmul->accept(this);
1199 src_reg src0 = fix_3src_operand(this->result);
1200
1201 mul->operands[0]->accept(this);
1202 src_reg src1 = fix_3src_operand(this->result);
1203 src1.negate ^= mul_negate;
1204 src1.abs = mul_abs;
1205 if (mul_abs)
1206 src1.negate = false;
1207
1208 mul->operands[1]->accept(this);
1209 src_reg src2 = fix_3src_operand(this->result);
1210 src2.abs = mul_abs;
1211 if (mul_abs)
1212 src2.negate = false;
1213
1214 this->result = src_reg(this, ir->type);
1215 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1216
1217 return true;
1218 }
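/* For example, the expression z + x * y (or x * y + z) is emitted as a
 * single MAD with z as the addend and x, y as the multiplication sources,
 * and z + -(x * y) folds the negation into the first multiplication source
 * instead of emitting a separate MUL and ADD.
 */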
1219
1220 bool
1221 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1222 {
1223 /* This optimization relies on CMP setting the destination to 0 when
1224 * false. Early hardware only sets the least significant bit, and
1225 * leaves the other bits undefined. So we can't use it.
1226 */
1227 if (devinfo->gen < 6)
1228 return false;
1229
1230 ir_expression *const cmp = ir->operands[0]->as_expression();
1231
1232 if (cmp == NULL)
1233 return false;
1234
1235 switch (cmp->operation) {
1236 case ir_binop_less:
1237 case ir_binop_greater:
1238 case ir_binop_lequal:
1239 case ir_binop_gequal:
1240 case ir_binop_equal:
1241 case ir_binop_nequal:
1242 break;
1243
1244 default:
1245 return false;
1246 }
1247
1248 cmp->operands[0]->accept(this);
1249 const src_reg cmp_src0 = this->result;
1250
1251 cmp->operands[1]->accept(this);
1252 const src_reg cmp_src1 = this->result;
1253
1254 this->result = src_reg(this, ir->type);
1255
1256 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1257 brw_conditional_for_comparison(cmp->operation)));
1258
1259 /* If the comparison is false, this->result will just happen to be zero.
1260 */
1261 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1262 this->result, src_reg(1.0f));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264 inst->predicate_inverse = true;
1265
1266 return true;
1267 }
1268
1269 vec4_instruction *
1270 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1271 src_reg src0, src_reg src1)
1272 {
1273 vec4_instruction *inst;
1274
1275 if (devinfo->gen >= 6) {
1276 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1277 inst->conditional_mod = conditionalmod;
1278 } else {
1279 emit(CMP(dst, src0, src1, conditionalmod));
1280
1281 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284
1285 return inst;
1286 }
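/* Callers use this with BRW_CONDITIONAL_L for min() and BRW_CONDITIONAL_GE
 * for max(); see ir_binop_min/ir_binop_max below and the [-1, 1] clamping in
 * emit_unpack_snorm_4x8() and emit_pack_snorm_4x8() above.
 */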
1287
1288 vec4_instruction *
1289 vec4_visitor::emit_lrp(const dst_reg &dst,
1290 const src_reg &x, const src_reg &y, const src_reg &a)
1291 {
1292 if (devinfo->gen >= 6) {
1293 /* Note that the instruction's argument order is reversed from GLSL
1294 * and the IR.
1295 */
1296 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1297 fix_3src_operand(x)));
1298 } else {
1299 /* Earlier generations don't support three source operations, so we
1300 * need to emit x*(1-a) + y*a.
1301 */
1302 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1303 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1304 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1305 y_times_a.writemask = dst.writemask;
1306 one_minus_a.writemask = dst.writemask;
1307 x_times_one_minus_a.writemask = dst.writemask;
1308
1309 emit(MUL(y_times_a, y, a));
1310 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1311 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1312 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1313 }
1314 }
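/* Numeric sanity check of the fallback path: lrp(x = 2.0, y = 10.0,
 * a = 0.25) computes y*a = 2.5, (1 - a) = 0.75, x*(1 - a) = 1.5 and finally
 * 1.5 + 2.5 = 4.0, the same value the single LRP instruction produces on
 * gen6+.
 */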
1315
1316 /**
1317 * Emits the instructions needed to perform a pull constant load. before_block
1318  * and before_inst can be NULL, in which case the instruction will be appended
1319 * to the end of the instruction list.
1320 */
1321 void
1322 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1323 src_reg surf_index,
1324 src_reg offset_reg,
1325 bblock_t *before_block,
1326 vec4_instruction *before_inst)
1327 {
1328 assert((before_inst == NULL && before_block == NULL) ||
1329 (before_inst && before_block));
1330
1331 vec4_instruction *pull;
1332
1333 if (devinfo->gen >= 9) {
1334 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1335 src_reg header(this, glsl_type::uvec4_type, 2);
1336
1337 pull = new(mem_ctx)
1338 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1339 dst_reg(header));
1340
1341 if (before_inst)
1342 emit_before(before_block, before_inst, pull);
1343 else
1344 emit(pull);
1345
1346 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1347 offset_reg.type);
1348 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1349
1350 if (before_inst)
1351 emit_before(before_block, before_inst, pull);
1352 else
1353 emit(pull);
1354
1355 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1356 dst,
1357 surf_index,
1358 header);
1359 pull->mlen = 2;
1360 pull->header_size = 1;
1361 } else if (devinfo->gen >= 7) {
1362 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1363
1364 grf_offset.type = offset_reg.type;
1365
1366 pull = MOV(grf_offset, offset_reg);
1367
1368 if (before_inst)
1369 emit_before(before_block, before_inst, pull);
1370 else
1371 emit(pull);
1372
1373 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1374 dst,
1375 surf_index,
1376 src_reg(grf_offset));
1377 pull->mlen = 1;
1378 } else {
1379 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1380 dst,
1381 surf_index,
1382 offset_reg);
1383 pull->base_mrf = 14;
1384 pull->mlen = 1;
1385 }
1386
1387 if (before_inst)
1388 emit_before(before_block, before_inst, pull);
1389 else
1390 emit(pull);
1391 }
1392
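/**
 * Emit FIND_LIVE_CHANNEL + BROADCAST to copy the value of an arbitrary live
 * channel of 'src' into all channels of the result, so it can be used where
 * the hardware requires a dynamically uniform value (e.g. the non-constant
 * UBO block index handled in visit(ir_expression) below).
 */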
1393 src_reg
1394 vec4_visitor::emit_uniformize(const src_reg &src)
1395 {
1396 const src_reg chan_index(this, glsl_type::uint_type);
1397 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1398 src.type);
1399
1400 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1401 ->force_writemask_all = true;
1402 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1403 ->force_writemask_all = true;
1404
1405 return src_reg(dst);
1406 }
1407
1408 void
1409 vec4_visitor::visit(ir_expression *ir)
1410 {
1411 unsigned int operand;
1412 src_reg op[ARRAY_SIZE(ir->operands)];
1413 vec4_instruction *inst;
1414
1415 if (ir->operation == ir_binop_add) {
1416 if (try_emit_mad(ir))
1417 return;
1418 }
1419
1420 if (ir->operation == ir_unop_b2f) {
1421 if (try_emit_b2f_of_compare(ir))
1422 return;
1423 }
1424
1425 /* Storage for our result. Ideally for an assignment we'd be using
1426 * the actual storage for the result here, instead.
1427 */
1428 dst_reg result_dst(this, ir->type);
1429 src_reg result_src(result_dst);
1430
1431 if (ir->operation == ir_triop_csel) {
1432 ir->operands[1]->accept(this);
1433 op[1] = this->result;
1434 ir->operands[2]->accept(this);
1435 op[2] = this->result;
1436
1437 enum brw_predicate predicate;
1438 emit_bool_to_cond_code(ir->operands[0], &predicate);
1439 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1440 inst->predicate = predicate;
1441 this->result = result_src;
1442 return;
1443 }
1444
1445 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1446 this->result.file = BAD_FILE;
1447 ir->operands[operand]->accept(this);
1448 if (this->result.file == BAD_FILE) {
1449 fprintf(stderr, "Failed to get tree for expression operand:\n");
1450 ir->operands[operand]->fprint(stderr);
1451 exit(1);
1452 }
1453 op[operand] = this->result;
1454
1455 /* Matrix expression operands should have been broken down to vector
1456 * operations already.
1457 */
1458 assert(!ir->operands[operand]->type->is_matrix());
1459 }
1460
1461 /* If nothing special happens, this is the result. */
1462 this->result = result_src;
1463
1464 switch (ir->operation) {
1465 case ir_unop_logic_not:
1466 emit(NOT(result_dst, op[0]));
1467 break;
1468 case ir_unop_neg:
1469 op[0].negate = !op[0].negate;
1470 emit(MOV(result_dst, op[0]));
1471 break;
1472 case ir_unop_abs:
1473 op[0].abs = true;
1474 op[0].negate = false;
1475 emit(MOV(result_dst, op[0]));
1476 break;
1477
1478 case ir_unop_sign:
1479 if (ir->type->is_float()) {
1480 /* AND(val, 0x80000000) gives the sign bit.
1481 *
1482 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1483 * zero.
1484 */
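/* For example, sign(-2.5): the CMP below sets the flag (the value is
 * nonzero), the AND keeps only the sign bit 0x80000000, and the predicated
 * OR produces 0x80000000 | 0x3f800000 = 0xbf800000, i.e. -1.0f.  For an
 * input of 0.0 the flag is not set, so the result stays at the 0x00000000
 * written by the AND, i.e. +0.0f.
 */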
1485 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486
1487 op[0].type = BRW_REGISTER_TYPE_UD;
1488 result_dst.type = BRW_REGISTER_TYPE_UD;
1489 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1490
1491 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1492 inst->predicate = BRW_PREDICATE_NORMAL;
1493
1494 this->result.type = BRW_REGISTER_TYPE_F;
1495 } else {
1496 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1497 * -> non-negative val generates 0x00000000.
1498 * Predicated OR sets 1 if val is positive.
1499 */
1500 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1501
1502 emit(ASR(result_dst, op[0], src_reg(31)));
1503
1504 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1505 inst->predicate = BRW_PREDICATE_NORMAL;
1506 }
1507 break;
1508
1509 case ir_unop_rcp:
1510 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1511 break;
1512
1513 case ir_unop_exp2:
1514 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1515 break;
1516 case ir_unop_log2:
1517 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1518 break;
1519 case ir_unop_exp:
1520 case ir_unop_log:
1521 unreachable("not reached: should be handled by ir_explog_to_explog2");
1522 case ir_unop_sin:
1523 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1524 break;
1525 case ir_unop_cos:
1526 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1527 break;
1528
1529 case ir_unop_dFdx:
1530 case ir_unop_dFdx_coarse:
1531 case ir_unop_dFdx_fine:
1532 case ir_unop_dFdy:
1533 case ir_unop_dFdy_coarse:
1534 case ir_unop_dFdy_fine:
1535 unreachable("derivatives not valid in vertex shader");
1536
1537 case ir_unop_bitfield_reverse:
1538 emit(BFREV(result_dst, op[0]));
1539 break;
1540 case ir_unop_bit_count:
1541 emit(CBIT(result_dst, op[0]));
1542 break;
1543 case ir_unop_find_msb: {
1544 src_reg temp = src_reg(this, glsl_type::uint_type);
1545
1546 inst = emit(FBH(dst_reg(temp), op[0]));
1547 inst->dst.writemask = WRITEMASK_XYZW;
1548
1549 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1550 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1551 * subtract the result from 31 to convert the MSB count into an LSB count.
1552 */
1553
1554 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1555 temp.swizzle = BRW_SWIZZLE_NOOP;
1556 emit(MOV(result_dst, temp));
1557
1558 src_reg src_tmp = src_reg(result_dst);
1559 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1560
1561 src_tmp.negate = true;
1562 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1563 inst->predicate = BRW_PREDICATE_NORMAL;
1564 break;
1565 }
1566 case ir_unop_find_lsb:
1567 emit(FBL(result_dst, op[0]));
1568 break;
1569 case ir_unop_saturate:
1570 inst = emit(MOV(result_dst, op[0]));
1571 inst->saturate = true;
1572 break;
1573
1574 case ir_unop_noise:
1575 unreachable("not reached: should be handled by lower_noise");
1576
1577 case ir_unop_subroutine_to_int:
1578 emit(MOV(result_dst, op[0]));
1579 break;
1580
1581 case ir_binop_add:
1582 emit(ADD(result_dst, op[0], op[1]));
1583 break;
1584 case ir_binop_sub:
1585 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1586
1587 case ir_binop_mul:
1588 if (devinfo->gen < 8 && ir->type->is_integer()) {
1589 /* For integer multiplication, the MUL uses the low 16 bits of one of
1590 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1591          * accumulates the contribution of the upper 16 bits of that
1592 * operand. If we can determine that one of the args is in the low
1593 * 16 bits, though, we can just emit a single MUL.
1594 */
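/* Concretely: for "a * 3" with 3 known to fit in 16 bits, a single MUL is
 * enough (the constant goes in src0 before IVB and in src1 afterwards, as
 * handled below).  Otherwise the generic path emits
 *
 *    mul  acc, a, b
 *    mach null, a, b
 *    mov  dst, acc
 *
 * so that, as the final MOV relies on, the accumulator ends up holding the
 * low 32 bits of the product.
 */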
1595 if (ir->operands[0]->is_uint16_constant()) {
1596 if (devinfo->gen < 7)
1597 emit(MUL(result_dst, op[0], op[1]));
1598 else
1599 emit(MUL(result_dst, op[1], op[0]));
1600 } else if (ir->operands[1]->is_uint16_constant()) {
1601 if (devinfo->gen < 7)
1602 emit(MUL(result_dst, op[1], op[0]));
1603 else
1604 emit(MUL(result_dst, op[0], op[1]));
1605 } else {
1606 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1607
1608 emit(MUL(acc, op[0], op[1]));
1609 emit(MACH(dst_null_d(), op[0], op[1]));
1610 emit(MOV(result_dst, src_reg(acc)));
1611 }
1612 } else {
1613 emit(MUL(result_dst, op[0], op[1]));
1614 }
1615 break;
1616 case ir_binop_imul_high: {
1617 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1618
1619 emit(MUL(acc, op[0], op[1]));
1620 emit(MACH(result_dst, op[0], op[1]));
1621 break;
1622 }
1623 case ir_binop_div:
1624 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1625 assert(ir->type->is_integer());
1626 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1627 break;
1628
1629 case ir_binop_carry:
1630 unreachable("Should have been lowered by carry_to_arith().");
1631
1632 case ir_binop_borrow:
1633 unreachable("Should have been lowered by borrow_to_arith().");
1634
1635 case ir_binop_mod:
1636 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1637 assert(ir->type->is_integer());
1638 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_binop_less:
1642 case ir_binop_greater:
1643 case ir_binop_lequal:
1644 case ir_binop_gequal:
1645 case ir_binop_equal:
1646 case ir_binop_nequal: {
1647 if (devinfo->gen <= 5) {
1648 resolve_bool_comparison(ir->operands[0], &op[0]);
1649 resolve_bool_comparison(ir->operands[1], &op[1]);
1650 }
1651 emit(CMP(result_dst, op[0], op[1],
1652 brw_conditional_for_comparison(ir->operation)));
1653 break;
1654 }
1655
1656 case ir_binop_all_equal:
1657 if (devinfo->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 resolve_bool_comparison(ir->operands[1], &op[1]);
1660 }
1661
1662 /* "==" operator producing a scalar boolean. */
1663 if (ir->operands[0]->type->is_vector() ||
1664 ir->operands[1]->type->is_vector()) {
1665 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1666 emit(MOV(result_dst, src_reg(0)));
1667 inst = emit(MOV(result_dst, src_reg(~0)));
1668 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1669 } else {
1670 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1671 }
1672 break;
1673 case ir_binop_any_nequal:
1674 if (devinfo->gen <= 5) {
1675 resolve_bool_comparison(ir->operands[0], &op[0]);
1676 resolve_bool_comparison(ir->operands[1], &op[1]);
1677 }
1678
1679 /* "!=" operator producing a scalar boolean. */
1680 if (ir->operands[0]->type->is_vector() ||
1681 ir->operands[1]->type->is_vector()) {
1682 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1683
1684 emit(MOV(result_dst, src_reg(0)));
1685 inst = emit(MOV(result_dst, src_reg(~0)));
1686 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1687 } else {
1688 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1689 }
1690 break;
1691
1692 case ir_unop_any:
1693 if (devinfo->gen <= 5) {
1694 resolve_bool_comparison(ir->operands[0], &op[0]);
1695 }
1696 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1697 emit(MOV(result_dst, src_reg(0)));
1698
1699 inst = emit(MOV(result_dst, src_reg(~0)));
1700 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1701 break;
1702
1703 case ir_binop_logic_xor:
1704 emit(XOR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_logic_or:
1708 emit(OR(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_logic_and:
1712 emit(AND(result_dst, op[0], op[1]));
1713 break;
1714
1715 case ir_binop_dot:
1716 assert(ir->operands[0]->type->is_vector());
1717 assert(ir->operands[0]->type == ir->operands[1]->type);
1718 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1719 break;
1720
1721 case ir_unop_sqrt:
1722 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1723 break;
1724 case ir_unop_rsq:
1725 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1726 break;
1727
1728 case ir_unop_bitcast_i2f:
1729 case ir_unop_bitcast_u2f:
1730 this->result = op[0];
1731 this->result.type = BRW_REGISTER_TYPE_F;
1732 break;
1733
1734 case ir_unop_bitcast_f2i:
1735 this->result = op[0];
1736 this->result.type = BRW_REGISTER_TYPE_D;
1737 break;
1738
1739 case ir_unop_bitcast_f2u:
1740 this->result = op[0];
1741 this->result.type = BRW_REGISTER_TYPE_UD;
1742 break;
1743
1744 case ir_unop_i2f:
1745 case ir_unop_i2u:
1746 case ir_unop_u2i:
1747 case ir_unop_u2f:
1748 case ir_unop_f2i:
1749 case ir_unop_f2u:
1750 emit(MOV(result_dst, op[0]));
1751 break;
1752 case ir_unop_b2i:
1753 case ir_unop_b2f:
1754 if (devinfo->gen <= 5) {
1755 resolve_bool_comparison(ir->operands[0], &op[0]);
1756 }
1757 emit(MOV(result_dst, negate(op[0])));
1758 break;
1759 case ir_unop_f2b:
1760 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1761 break;
1762 case ir_unop_i2b:
1763 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1764 break;
1765
1766 case ir_unop_trunc:
1767 emit(RNDZ(result_dst, op[0]));
1768 break;
1769 case ir_unop_ceil: {
1770 src_reg tmp = src_reg(this, ir->type);
1771 op[0].negate = !op[0].negate;
1772 emit(RNDD(dst_reg(tmp), op[0]));
1773 tmp.negate = true;
1774 emit(MOV(result_dst, tmp));
1775 }
1776 break;
1777 case ir_unop_floor:
1778 inst = emit(RNDD(result_dst, op[0]));
1779 break;
1780 case ir_unop_fract:
1781 inst = emit(FRC(result_dst, op[0]));
1782 break;
1783 case ir_unop_round_even:
1784 emit(RNDE(result_dst, op[0]));
1785 break;
1786
1787 case ir_binop_min:
1788 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1789 break;
1790 case ir_binop_max:
1791 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1792 break;
1793
1794 case ir_binop_pow:
1795 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1796 break;
1797
1798 case ir_unop_bit_not:
1799 inst = emit(NOT(result_dst, op[0]));
1800 break;
1801 case ir_binop_bit_and:
1802 inst = emit(AND(result_dst, op[0], op[1]));
1803 break;
1804 case ir_binop_bit_xor:
1805 inst = emit(XOR(result_dst, op[0], op[1]));
1806 break;
1807 case ir_binop_bit_or:
1808 inst = emit(OR(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_lshift:
1812 inst = emit(SHL(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_rshift:
1816 if (ir->type->base_type == GLSL_TYPE_INT)
1817 inst = emit(ASR(result_dst, op[0], op[1]));
1818 else
1819 inst = emit(SHR(result_dst, op[0], op[1]));
1820 break;
1821
1822 case ir_binop_bfm:
1823 emit(BFI1(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_ubo_load: {
1827 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1828 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1829 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1830 src_reg offset;
1831
1832 /* Now, load the vector from that offset. */
1833 assert(ir->type->is_vector() || ir->type->is_scalar());
1834
1835 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1836 packed_consts.type = result.type;
1837 src_reg surf_index;
1838
1839 if (const_uniform_block) {
1840 /* The block index is a constant, so just emit the binding table entry
1841 * as an immediate.
1842 */
1843 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1844 const_uniform_block->value.u[0]);
1845 } else {
1846 /* The block index is not a constant. Evaluate the index expression
1847 * per-channel and add the base UBO index; we have to select a value
1848 * from any live channel.
1849 */
1850 surf_index = src_reg(this, glsl_type::uint_type);
1851 emit(ADD(dst_reg(surf_index), op[0],
1852 src_reg(prog_data->base.binding_table.ubo_start)));
1853 surf_index = emit_uniformize(surf_index);
1854
1855 /* Assume this may touch any UBO. It would be nice to provide
1856 * a tighter bound, but the array information is already lowered away.
1857 */
1858 brw_mark_surface_used(&prog_data->base,
1859 prog_data->base.binding_table.ubo_start +
1860 shader_prog->NumUniformBlocks - 1);
1861 }
1862
1863 if (const_offset_ir) {
1864 if (devinfo->gen >= 8) {
1865 /* Store the offset in a GRF so we can send-from-GRF. */
1866 offset = src_reg(this, glsl_type::int_type);
1867 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1868 } else {
1869 /* Immediates are fine on older generations since they'll be moved
1870 * to a (potentially fake) MRF at the generator level.
1871 */
1872 offset = src_reg(const_offset / 16);
1873 }
1874 } else {
1875 offset = src_reg(this, glsl_type::uint_type);
1876 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1877 }
1878
1879 emit_pull_constant_load_reg(dst_reg(packed_consts),
1880 surf_index,
1881 offset,
1882 NULL, NULL /* before_block/inst */);
1883
1884 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1885 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1886 const_offset % 16 / 4,
1887 const_offset % 16 / 4,
1888 const_offset % 16 / 4);
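/* For example, a constant byte offset of 20 selects the 16-byte row at
 * offset 16 (20 / 16 = 1 in the offset register) and then shifts the
 * swizzle by 20 % 16 / 4 = 1 component, so a scalar load reads .yyyy of
 * the fetched row instead of .xxxx.
 */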
1889
1890 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1891 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1892 emit(CMP(result_dst, packed_consts, src_reg(0u),
1893 BRW_CONDITIONAL_NZ));
1894 } else {
1895 emit(MOV(result_dst, packed_consts));
1896 }
1897 break;
1898 }
1899
1900 case ir_binop_vector_extract:
1901 unreachable("should have been lowered by vec_index_to_cond_assign");
1902
1903 case ir_triop_fma:
1904 op[0] = fix_3src_operand(op[0]);
1905 op[1] = fix_3src_operand(op[1]);
1906 op[2] = fix_3src_operand(op[2]);
1907 /* Note that the instruction's argument order is reversed from GLSL
1908 * and the IR.
1909 */
1910 emit(MAD(result_dst, op[2], op[1], op[0]));
1911 break;
1912
1913 case ir_triop_lrp:
1914 emit_lrp(result_dst, op[0], op[1], op[2]);
1915 break;
1916
1917 case ir_triop_csel:
1918 unreachable("already handled above");
1919 break;
1920
1921 case ir_triop_bfi:
1922 op[0] = fix_3src_operand(op[0]);
1923 op[1] = fix_3src_operand(op[1]);
1924 op[2] = fix_3src_operand(op[2]);
1925 emit(BFI2(result_dst, op[0], op[1], op[2]));
1926 break;
1927
1928 case ir_triop_bitfield_extract:
1929 op[0] = fix_3src_operand(op[0]);
1930 op[1] = fix_3src_operand(op[1]);
1931 op[2] = fix_3src_operand(op[2]);
1932 /* Note that the instruction's argument order is reversed from GLSL
1933 * and the IR.
1934 */
1935 emit(BFE(result_dst, op[2], op[1], op[0]));
1936 break;
1937
1938 case ir_triop_vector_insert:
1939 unreachable("should have been lowered by lower_vector_insert");
1940
1941 case ir_quadop_bitfield_insert:
1942 unreachable("not reached: should be handled by "
1943 "bitfield_insert_to_bfm_bfi\n");
1944
1945 case ir_quadop_vector:
1946 unreachable("not reached: should be handled by lower_quadop_vector");
1947
1948 case ir_unop_pack_half_2x16:
1949 emit_pack_half_2x16(result_dst, op[0]);
1950 break;
1951 case ir_unop_unpack_half_2x16:
1952 emit_unpack_half_2x16(result_dst, op[0]);
1953 break;
1954 case ir_unop_unpack_unorm_4x8:
1955 emit_unpack_unorm_4x8(result_dst, op[0]);
1956 break;
1957 case ir_unop_unpack_snorm_4x8:
1958 emit_unpack_snorm_4x8(result_dst, op[0]);
1959 break;
1960 case ir_unop_pack_unorm_4x8:
1961 emit_pack_unorm_4x8(result_dst, op[0]);
1962 break;
1963 case ir_unop_pack_snorm_4x8:
1964 emit_pack_snorm_4x8(result_dst, op[0]);
1965 break;
1966 case ir_unop_pack_snorm_2x16:
1967 case ir_unop_pack_unorm_2x16:
1968 case ir_unop_unpack_snorm_2x16:
1969 case ir_unop_unpack_unorm_2x16:
1970 unreachable("not reached: should be handled by lower_packing_builtins");
1971 case ir_unop_unpack_half_2x16_split_x:
1972 case ir_unop_unpack_half_2x16_split_y:
1973 case ir_binop_pack_half_2x16_split:
1974 case ir_unop_interpolate_at_centroid:
1975 case ir_binop_interpolate_at_sample:
1976 case ir_binop_interpolate_at_offset:
1977 unreachable("not reached: should not occur in vertex shader");
1978 case ir_binop_ldexp:
1979 unreachable("not reached: should be handled by ldexp_to_arith()");
1980 case ir_unop_d2f:
1981 case ir_unop_f2d:
1982 case ir_unop_d2i:
1983 case ir_unop_i2d:
1984 case ir_unop_d2u:
1985 case ir_unop_u2d:
1986 case ir_unop_d2b:
1987 case ir_unop_pack_double_2x32:
1988 case ir_unop_unpack_double_2x32:
1989 case ir_unop_frexp_sig:
1990 case ir_unop_frexp_exp:
1991 unreachable("fp64 todo");
1992 }
1993 }
1994
1995
1996 void
1997 vec4_visitor::visit(ir_swizzle *ir)
1998 {
1999 /* Note that this is only swizzles in expressions, not those on the left
2000 * hand side of an assignment, which do write masking. See ir_assignment
2001 * for that.
2002 */
2003 const unsigned swz = brw_compose_swizzle(
2004 brw_swizzle_for_size(ir->type->vector_elements),
2005 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2006
2007 ir->val->accept(this);
2008 this->result = swizzle(this->result, swz);
2009 }
2010
2011 void
2012 vec4_visitor::visit(ir_dereference_variable *ir)
2013 {
2014 const struct glsl_type *type = ir->type;
2015 dst_reg *reg = variable_storage(ir->var);
2016
2017 if (!reg) {
2018 fail("Failed to find variable storage for %s\n", ir->var->name);
2019 this->result = src_reg(brw_null_reg());
2020 return;
2021 }
2022
2023 this->result = src_reg(*reg);
2024
2025 /* System values get their swizzle from the dst_reg writemask */
2026 if (ir->var->data.mode == ir_var_system_value)
2027 return;
2028
2029 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2030 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2031 }
2032
2033
2034 int
2035 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2036 {
2037 /* Under normal circumstances array elements are stored consecutively, so
2038 * the stride is equal to the size of the array element.
2039 */
2040 return type_size(ir->type);
2041 }
2042
2043
2044 void
2045 vec4_visitor::visit(ir_dereference_array *ir)
2046 {
2047 ir_constant *constant_index;
2048 src_reg src;
2049 int array_stride = compute_array_stride(ir);
2050
2051 constant_index = ir->array_index->constant_expression_value();
2052
2053 ir->array->accept(this);
2054 src = this->result;
2055
2056 if (constant_index) {
2057 src.reg_offset += constant_index->value.i[0] * array_stride;
2058 } else {
2059 /* Variable index array dereference. It eats the "vec4" of the
2060 * base of the array and an index that offsets the Mesa register
2061 * index.
2062 */
2063 ir->array_index->accept(this);
2064
2065 src_reg index_reg;
2066
2067 if (array_stride == 1) {
2068 index_reg = this->result;
2069 } else {
2070 index_reg = src_reg(this, glsl_type::int_type);
2071
2072 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2073 }
2074
2075 if (src.reladdr) {
2076 src_reg temp = src_reg(this, glsl_type::int_type);
2077
2078 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2079
2080 index_reg = temp;
2081 }
2082
2083 src.reladdr = ralloc(mem_ctx, src_reg);
2084 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2085 }
2086
2087 /* If the type is smaller than a vec4, replicate the last channel out. */
2088 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2089 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2090 else
2091 src.swizzle = BRW_SWIZZLE_NOOP;
2092 src.type = brw_type_for_base_type(ir->type);
2093
2094 this->result = src;
2095 }
2096
2097 void
2098 vec4_visitor::visit(ir_dereference_record *ir)
2099 {
2100 unsigned int i;
2101 const glsl_type *struct_type = ir->record->type;
2102 int offset = 0;
2103
2104 ir->record->accept(this);
2105
2106 for (i = 0; i < struct_type->length; i++) {
2107 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2108 break;
2109 offset += type_size(struct_type->fields.structure[i].type);
2110 }
2111
2112 /* If the type is smaller than a vec4, replicate the last channel out. */
2113 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2114 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2115 else
2116 this->result.swizzle = BRW_SWIZZLE_NOOP;
2117 this->result.type = brw_type_for_base_type(ir->type);
2118
2119 this->result.reg_offset += offset;
2120 }
2121
2122 /**
2123 * We want to be careful in assignment setup to hit the actual storage
2124 * instead of potentially using a temporary like we might with the
2125 * ir_dereference handler.
2126 */
2127 static dst_reg
2128 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2129 {
2130 /* The LHS must be a dereference. If the LHS is a variable indexed array
2131 * access of a vector, it must be separated into a series of conditional moves
2132 * before reaching this point (see ir_vec_index_to_cond_assign).
2133 */
2134 assert(ir->as_dereference());
2135 ir_dereference_array *deref_array = ir->as_dereference_array();
2136 if (deref_array) {
2137 assert(!deref_array->array->type->is_vector());
2138 }
2139
2140 /* Use the rvalue deref handler for the most part. We'll ignore
2141 * swizzles in it and write swizzles using writemask, though.
2142 */
2143 ir->accept(v);
2144 return dst_reg(v->result);
2145 }
2146
2147 void
2148 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2149 const struct glsl_type *type,
2150 enum brw_predicate predicate)
2151 {
2152 if (type->base_type == GLSL_TYPE_STRUCT) {
2153 for (unsigned int i = 0; i < type->length; i++) {
2154 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2155 }
2156 return;
2157 }
2158
2159 if (type->is_array()) {
2160 for (unsigned int i = 0; i < type->length; i++) {
2161 emit_block_move(dst, src, type->fields.array, predicate);
2162 }
2163 return;
2164 }
2165
2166 if (type->is_matrix()) {
2167 const struct glsl_type *vec_type;
2168
2169 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2170 type->vector_elements, 1);
2171
2172 for (int i = 0; i < type->matrix_columns; i++) {
2173 emit_block_move(dst, src, vec_type, predicate);
2174 }
2175 return;
2176 }
2177
2178 assert(type->is_scalar() || type->is_vector());
2179
2180 dst->type = brw_type_for_base_type(type);
2181 src->type = dst->type;
2182
2183 dst->writemask = (1 << type->vector_elements) - 1;
2184
2185 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2186
2187 vec4_instruction *inst = emit(MOV(*dst, *src));
2188 inst->predicate = predicate;
2189
2190 dst->reg_offset++;
2191 src->reg_offset++;
2192 }
2193
2194
2195 /* If the RHS processing resulted in an instruction generating a
2196 * temporary value, and it would be easy to rewrite the instruction to
2197 * generate its result right into the LHS instead, do so. This ends
2198 * up reliably removing instructions where it can be tricky to do so
2199 * later without real UD chain information.
2200 */
2201 bool
2202 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2203 dst_reg dst,
2204 src_reg src,
2205 vec4_instruction *pre_rhs_inst,
2206 vec4_instruction *last_rhs_inst)
2207 {
2208 /* This could be supported, but it would take more smarts. */
2209 if (ir->condition)
2210 return false;
2211
2212 if (pre_rhs_inst == last_rhs_inst)
2213 return false; /* No instructions generated to work with. */
2214
2215 /* Make sure the last instruction generated our source reg. */
2216 if (src.file != GRF ||
2217 src.file != last_rhs_inst->dst.file ||
2218 src.reg != last_rhs_inst->dst.reg ||
2219 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2220 src.reladdr ||
2221 src.abs ||
2222 src.negate ||
2223 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2224 return false;
2225
2226 /* Check that the last instruction fully initialized the channels
2227 * we want to use, in the order we want to use them. We could
2228 * potentially reswizzle the operands of many instructions so that
2229 * we could handle out of order channels, but don't yet.
2230 */
2231
2232 for (unsigned i = 0; i < 4; i++) {
2233 if (dst.writemask & (1 << i)) {
2234 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2235 return false;
2236
2237 if (BRW_GET_SWZ(src.swizzle, i) != i)
2238 return false;
2239 }
2240 }
2241
2242 /* Success! Rewrite the instruction. */
2243 last_rhs_inst->dst.file = dst.file;
2244 last_rhs_inst->dst.reg = dst.reg;
2245 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2246 last_rhs_inst->dst.reladdr = dst.reladdr;
2247 last_rhs_inst->dst.writemask &= dst.writemask;
2248
2249 return true;
2250 }
2251
2252 void
2253 vec4_visitor::visit(ir_assignment *ir)
2254 {
2255 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2256 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2257
2258 if (!ir->lhs->type->is_scalar() &&
2259 !ir->lhs->type->is_vector()) {
2260 ir->rhs->accept(this);
2261 src_reg src = this->result;
2262
2263 if (ir->condition) {
2264 emit_bool_to_cond_code(ir->condition, &predicate);
2265 }
2266
2267 /* emit_block_move doesn't account for swizzles in the source register.
2268 * This should be ok, since the source register is a structure or an
2269 * array, and those can't be swizzled. But double-check to be sure.
2270 */
2271 assert(src.swizzle ==
2272 (ir->rhs->type->is_matrix()
2273 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2274 : BRW_SWIZZLE_NOOP));
2275
2276 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2277 return;
2278 }
2279
2280 /* Now we're down to just a scalar/vector with writemasks. */
2281 int i;
2282
2283 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2284 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2285
2286 ir->rhs->accept(this);
2287
2288 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2289
2290 int swizzles[4];
2291 int src_chan = 0;
2292
2293 assert(ir->lhs->type->is_vector() ||
2294 ir->lhs->type->is_scalar());
2295 dst.writemask = ir->write_mask;
2296
2297 /* Swizzle a small RHS vector into the channels being written.
2298 *
2299 * GLSL IR treats write_mask as dictating how many channels are
2300 * present on the RHS while in our instructions we need to make
2301 * those channels appear in the slots of the vec4 they're written to.
2302 */
2303 for (int i = 0; i < 4; i++)
2304 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2305
2306 src_reg src = swizzle(this->result,
2307 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2308 swizzles[2], swizzles[3]));
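/* E.g. with write_mask .xz the RHS has two components, so swizzles[] is
 * {0, 0, 1, 0}: RHS.x lands in dst.x, RHS.y lands in dst.z, and the
 * disabled channels are don't-cares.
 */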
2309
2310 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2311 return;
2312 }
2313
2314 if (ir->condition) {
2315 emit_bool_to_cond_code(ir->condition, &predicate);
2316 }
2317
2318 for (i = 0; i < type_size(ir->lhs->type); i++) {
2319 vec4_instruction *inst = emit(MOV(dst, src));
2320 inst->predicate = predicate;
2321
2322 dst.reg_offset++;
2323 src.reg_offset++;
2324 }
2325 }
2326
2327 void
2328 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2329 {
2330 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2331 foreach_in_list(ir_constant, field_value, &ir->components) {
2332 emit_constant_values(dst, field_value);
2333 }
2334 return;
2335 }
2336
2337 if (ir->type->is_array()) {
2338 for (unsigned int i = 0; i < ir->type->length; i++) {
2339 emit_constant_values(dst, ir->array_elements[i]);
2340 }
2341 return;
2342 }
2343
2344 if (ir->type->is_matrix()) {
2345 for (int i = 0; i < ir->type->matrix_columns; i++) {
2346 float *vec = &ir->value.f[i * ir->type->vector_elements];
2347
2348 for (int j = 0; j < ir->type->vector_elements; j++) {
2349 dst->writemask = 1 << j;
2350 dst->type = BRW_REGISTER_TYPE_F;
2351
2352 emit(MOV(*dst, src_reg(vec[j])));
2353 }
2354 dst->reg_offset++;
2355 }
2356 return;
2357 }
2358
2359 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2360
2361 for (int i = 0; i < ir->type->vector_elements; i++) {
2362 if (!(remaining_writemask & (1 << i)))
2363 continue;
2364
2365 dst->writemask = 1 << i;
2366 dst->type = brw_type_for_base_type(ir->type);
2367
2368 /* Find other components that match the one we're about to
2369 * write. Emits fewer instructions for things like vec4(0.5,
2370 * 1.5, 1.5, 1.5).
2371 */
2372 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2373 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2374 if (ir->value.b[i] == ir->value.b[j])
2375 dst->writemask |= (1 << j);
2376 } else {
2377 /* u, i, and f storage all line up, so no need for a
2378 * switch case for comparing each type.
2379 */
2380 if (ir->value.u[i] == ir->value.u[j])
2381 dst->writemask |= (1 << j);
2382 }
2383 }
2384
2385 switch (ir->type->base_type) {
2386 case GLSL_TYPE_FLOAT:
2387 emit(MOV(*dst, src_reg(ir->value.f[i])));
2388 break;
2389 case GLSL_TYPE_INT:
2390 emit(MOV(*dst, src_reg(ir->value.i[i])));
2391 break;
2392 case GLSL_TYPE_UINT:
2393 emit(MOV(*dst, src_reg(ir->value.u[i])));
2394 break;
2395 case GLSL_TYPE_BOOL:
2396 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2397 break;
2398 default:
2399 unreachable("Non-float/uint/int/bool constant");
2400 }
2401
2402 remaining_writemask &= ~dst->writemask;
2403 }
2404 dst->reg_offset++;
2405 }
2406
2407 void
2408 vec4_visitor::visit(ir_constant *ir)
2409 {
2410 dst_reg dst = dst_reg(this, ir->type);
2411 this->result = src_reg(dst);
2412
2413 emit_constant_values(&dst, ir);
2414 }
2415
2416 void
2417 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2418 {
2419 ir_dereference *deref = static_cast<ir_dereference *>(
2420 ir->actual_parameters.get_head());
2421 ir_variable *location = deref->variable_referenced();
2422 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2423 location->data.binding);
2424
2425 /* Calculate the surface offset */
2426 src_reg offset(this, glsl_type::uint_type);
2427 ir_dereference_array *deref_array = deref->as_dereference_array();
2428 if (deref_array) {
2429 deref_array->array_index->accept(this);
2430
2431 src_reg tmp(this, glsl_type::uint_type);
2432 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2433 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2434 } else {
2435 offset = location->data.atomic.offset;
2436 }
2437
2438 /* Emit the appropriate machine instruction */
2439 const char *callee = ir->callee->function_name();
2440 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2441
2442 if (!strcmp("__intrinsic_atomic_read", callee)) {
2443 emit_untyped_surface_read(surf_index, dst, offset);
2444
2445 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2446 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2447 src_reg(), src_reg());
2448
2449 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2451 src_reg(), src_reg());
2452 }
2453
2454 brw_mark_surface_used(stage_prog_data, surf_index);
2455 }
2456
2457 void
2458 vec4_visitor::visit(ir_call *ir)
2459 {
2460 const char *callee = ir->callee->function_name();
2461
2462 if (!strcmp("__intrinsic_atomic_read", callee) ||
2463 !strcmp("__intrinsic_atomic_increment", callee) ||
2464 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2465 visit_atomic_counter_intrinsic(ir);
2466 } else {
2467 unreachable("Unsupported intrinsic.");
2468 }
2469 }
2470
2471 src_reg
2472 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2473 src_reg coordinate, src_reg sampler)
2474 {
2475 vec4_instruction *inst =
2476 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2477 dst_reg(this, glsl_type::uvec4_type));
2478 inst->base_mrf = 2;
2479 inst->src[1] = sampler;
2480
2481 int param_base;
2482
2483 if (devinfo->gen >= 9) {
2484 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2485 vec4_instruction *header_inst = new(mem_ctx)
2486 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2487 dst_reg(MRF, inst->base_mrf));
2488
2489 emit(header_inst);
2490
2491 inst->mlen = 2;
2492 inst->header_size = 1;
2493 param_base = inst->base_mrf + 1;
2494 } else {
2495 inst->mlen = 1;
2496 param_base = inst->base_mrf;
2497 }
2498
2499 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2500 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2501 int zero_mask = 0xf & ~coord_mask;
2502
2503 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2504 coordinate));
2505
2506 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2507 src_reg(0)));
2508
2509 emit(inst);
2510 return src_reg(inst->dst);
2511 }
2512
2513 bool
2514 vec4_visitor::is_high_sampler(src_reg sampler)
2515 {
2516 if (devinfo->gen < 8 && !devinfo->is_haswell)
2517 return false;
2518
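/* The sampler message descriptor only has a 4-bit sampler index, so sampler
 * units 16-31 (or any dynamically indexed sampler that might reach them)
 * need the sampler state pointer patched through the message header, which
 * we only handle on Haswell and Gen8+.
 */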
2519 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2520 }
2521
2522 void
2523 vec4_visitor::emit_texture(ir_texture_opcode op,
2524 dst_reg dest,
2525 const glsl_type *dest_type,
2526 src_reg coordinate,
2527 int coord_components,
2528 src_reg shadow_comparitor,
2529 src_reg lod, src_reg lod2,
2530 src_reg sample_index,
2531 uint32_t constant_offset,
2532 src_reg offset_value,
2533 src_reg mcs,
2534 bool is_cube_array,
2535 uint32_t sampler,
2536 src_reg sampler_reg)
2537 {
2538 enum opcode opcode;
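/* Vertex shaders have no implicit derivatives, so a plain texture() lookup
 * (ir_tex) is lowered to an explicit-LOD sample: TXL with lod forced to
 * zero by the caller.
 */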
2539 switch (op) {
2540 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2541 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2542 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2543 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2544 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2545 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2546 case ir_tg4: opcode = offset_value.file != BAD_FILE
2547 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2548 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2549 case ir_txb:
2550 unreachable("TXB is not valid for vertex shaders.");
2551 case ir_lod:
2552 unreachable("LOD is not valid for vertex shaders.");
2553 default:
2554 unreachable("Unrecognized tex op");
2555 }
2556
2557 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2558 opcode, dst_reg(this, dest_type));
2559
2560 inst->offset = constant_offset;
2561
2562 /* The message header is necessary for:
2563 * - Gen4 (always)
2564 * - Gen9+ for selecting SIMD4x2
2565 * - Texel offsets
2566 * - Gather channel selection
2567 * - Sampler indices too large to fit in a 4-bit value.
2568 */
2569 inst->header_size =
2570 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2571 inst->offset != 0 || op == ir_tg4 ||
2572 is_high_sampler(sampler_reg)) ? 1 : 0;
2573 inst->base_mrf = 2;
2574 inst->mlen = inst->header_size + 1; /* always at least one */
2575 inst->dst.writemask = WRITEMASK_XYZW;
2576 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2577
2578 inst->src[1] = sampler_reg;
2579
2580 /* MRF for the first parameter */
2581 int param_base = inst->base_mrf + inst->header_size;
2582
2583 if (op == ir_txs || op == ir_query_levels) {
2584 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2585 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2586 } else {
2587 /* Load the coordinate */
2588 /* FINISHME: gl_clamp_mask and saturate */
2589 int coord_mask = (1 << coord_components) - 1;
2590 int zero_mask = 0xf & ~coord_mask;
2591
2592 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2593 coordinate));
2594
2595 if (zero_mask != 0) {
2596 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2597 src_reg(0)));
2598 }
2599 /* Load the shadow comparitor */
2600 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2601 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2602 WRITEMASK_X),
2603 shadow_comparitor));
2604 inst->mlen++;
2605 }
2606
2607 /* Load the LOD info */
2608 if (op == ir_tex || op == ir_txl) {
2609 int mrf, writemask;
2610 if (devinfo->gen >= 5) {
2611 mrf = param_base + 1;
2612 if (shadow_comparitor.file != BAD_FILE) {
2613 writemask = WRITEMASK_Y;
2614 /* mlen already incremented */
2615 } else {
2616 writemask = WRITEMASK_X;
2617 inst->mlen++;
2618 }
2619 } else /* devinfo->gen == 4 */ {
2620 mrf = param_base;
2621 writemask = WRITEMASK_W;
2622 }
2623 lod.swizzle = BRW_SWIZZLE_XXXX;
2624 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2625 } else if (op == ir_txf) {
2626 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2627 } else if (op == ir_txf_ms) {
2628 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2629 sample_index));
2630 if (devinfo->gen >= 7) {
2631 /* MCS data is in the first channel of `mcs`, but we need to get it into
2632 * the .y channel of the second vec4 of params, so replicate .x across
2633 * the whole vec4 and then mask off everything except .y
2634 */
2635 mcs.swizzle = BRW_SWIZZLE_XXXX;
2636 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2637 mcs));
2638 }
2639 inst->mlen++;
2640 } else if (op == ir_txd) {
2641 const brw_reg_type type = lod.type;
2642
2643 if (devinfo->gen >= 5) {
2644 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2645 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2646 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2647 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2648 inst->mlen++;
2649
2650 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2651 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2652 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2653 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2654 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2655 inst->mlen++;
2656
2657 if (shadow_comparitor.file != BAD_FILE) {
2658 emit(MOV(dst_reg(MRF, param_base + 2,
2659 shadow_comparitor.type, WRITEMASK_Z),
2660 shadow_comparitor));
2661 }
2662 }
2663 } else /* devinfo->gen == 4 */ {
2664 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2665 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2666 inst->mlen += 2;
2667 }
2668 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2669 if (shadow_comparitor.file != BAD_FILE) {
2670 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2671 shadow_comparitor));
2672 }
2673
2674 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2675 offset_value));
2676 inst->mlen++;
2677 }
2678 }
2679
2680 emit(inst);
2681
2682 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2683 * spec requires layers.
2684 */
2685 if (op == ir_txs && is_cube_array) {
2686 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2687 writemask(inst->dst, WRITEMASK_Z),
2688 src_reg(inst->dst), src_reg(6));
2689 }
2690
2691 if (devinfo->gen == 6 && op == ir_tg4) {
2692 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2693 }
2694
2695 swizzle_result(op, dest,
2696 src_reg(inst->dst), sampler, dest_type);
2697 }
2698
2699 void
2700 vec4_visitor::visit(ir_texture *ir)
2701 {
2702 uint32_t sampler =
2703 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2704
2705 ir_rvalue *nonconst_sampler_index =
2706 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2707
2708 /* Handle non-constant sampler array indexing */
2709 src_reg sampler_reg;
2710 if (nonconst_sampler_index) {
2711 /* The highest sampler which may be used by this operation is
2712 * the last element of the array. Mark it here, because the generator
2713 * doesn't have enough information to determine the bound.
2714 */
2715 uint32_t array_size = ir->sampler->as_dereference_array()
2716 ->array->type->array_size();
2717
2718 uint32_t max_used = sampler + array_size - 1;
2719 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2720 max_used += prog_data->base.binding_table.gather_texture_start;
2721 } else {
2722 max_used += prog_data->base.binding_table.texture_start;
2723 }
2724
2725 brw_mark_surface_used(&prog_data->base, max_used);
2726
2727 /* Emit code to evaluate the actual indexing expression */
2728 nonconst_sampler_index->accept(this);
2729 src_reg temp(this, glsl_type::uint_type);
2730 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2731 sampler_reg = emit_uniformize(temp);
2732 } else {
2733 /* Single sampler, or constant array index; the indexing expression
2734 * is just an immediate.
2735 */
2736 sampler_reg = src_reg(sampler);
2737 }
2738
2739 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2740 * emitting anything other than setting up the constant result.
2741 */
2742 if (ir->op == ir_tg4) {
2743 ir_constant *chan = ir->lod_info.component->as_constant();
2744 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2745 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2746 dst_reg result(this, ir->type);
2747 this->result = src_reg(result);
2748 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2749 return;
2750 }
2751 }
2752
2753 /* Should be lowered by do_lower_texture_projection */
2754 assert(!ir->projector);
2755
2756 /* Should be lowered */
2757 assert(!ir->offset || !ir->offset->type->is_array());
2758
2759 /* Generate code to compute all the subexpression trees. This has to be
2760 * done before loading any values into MRFs for the sampler message since
2761 * generating these values may involve SEND messages that need the MRFs.
2762 */
2763 src_reg coordinate;
2764 int coord_components = 0;
2765 if (ir->coordinate) {
2766 coord_components = ir->coordinate->type->vector_elements;
2767 ir->coordinate->accept(this);
2768 coordinate = this->result;
2769 }
2770
2771 src_reg shadow_comparitor;
2772 if (ir->shadow_comparitor) {
2773 ir->shadow_comparitor->accept(this);
2774 shadow_comparitor = this->result;
2775 }
2776
2777 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2778 src_reg offset_value;
2779 if (has_nonconstant_offset) {
2780 ir->offset->accept(this);
2781 offset_value = src_reg(this->result);
2782 }
2783
2784 src_reg lod, lod2, sample_index, mcs;
2785 switch (ir->op) {
2786 case ir_tex:
2787 lod = src_reg(0.0f);
2788 break;
2789 case ir_txf:
2790 case ir_txl:
2791 case ir_txs:
2792 ir->lod_info.lod->accept(this);
2793 lod = this->result;
2794 break;
2795 case ir_query_levels:
2796 lod = src_reg(0);
2797 break;
2798 case ir_txf_ms:
2799 ir->lod_info.sample_index->accept(this);
2800 sample_index = this->result;
2801
2802 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2803 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2804 else
2805 mcs = src_reg(0u);
2806 break;
2807 case ir_txd:
2808 ir->lod_info.grad.dPdx->accept(this);
2809 lod = this->result;
2810
2811 ir->lod_info.grad.dPdy->accept(this);
2812 lod2 = this->result;
2813 break;
2814 case ir_txb:
2815 case ir_lod:
2816 case ir_tg4:
2817 break;
2818 }
2819
2820 uint32_t constant_offset = 0;
2821 if (ir->offset != NULL && !has_nonconstant_offset) {
2822 constant_offset =
2823 brw_texture_offset(ir->offset->as_constant()->value.i,
2824 ir->offset->type->vector_elements);
2825 }
2826
2827 /* Stuff the channel select bits in the top of the texture offset */
2828 if (ir->op == ir_tg4)
2829 constant_offset |=
2830 gather_channel(ir->lod_info.component->as_constant()->value.i[0],
2831 sampler) << 16;
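/* E.g. gathering the .z channel stores 2 in bits 17:16, above the 4-bit
 * texel offsets packed by brw_texture_offset(); the generator emits this
 * dword into the sampler message header.
 */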
2832
2833 glsl_type const *type = ir->sampler->type;
2834 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2835 type->sampler_array;
2836
2837 this->result = src_reg(this, ir->type);
2838 dst_reg dest = dst_reg(this->result);
2839
2840 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2841 shadow_comparitor,
2842 lod, lod2, sample_index,
2843 constant_offset, offset_value,
2844 mcs, is_cube_array, sampler, sampler_reg);
2845 }
2846
2847 /**
2848 * Apply workarounds for Gen6 gather with UINT/SINT
2849 */
2850 void
2851 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2852 {
2853 if (!wa)
2854 return;
2855
2856 int width = (wa & WA_8BIT) ? 8 : 16;
2857 dst_reg dst_f = dst;
2858 dst_f.type = BRW_REGISTER_TYPE_F;
2859
2860 /* Convert from UNORM to UINT: e.g. an 8-bit format comes back as value/255.0f, so scaling by (1 << width) - 1 recovers the integer texel. */
2861 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2862 emit(MOV(dst, src_reg(dst_f)));
2863
2864 if (wa & WA_SIGN) {
2865 /* Reinterpret the UINT value as a signed INT value by
2866 * shifting the sign bit into place, then shifting back
2867 * preserving sign.
2868 */
2869 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2870 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2871 }
2872 }
2873
2874 /**
2875 * Set up the gather channel based on the swizzle, for gather4.
2876 */
2877 uint32_t
2878 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2879 {
2880 int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2881 switch (swiz) {
2882 case SWIZZLE_X: return 0;
2883 case SWIZZLE_Y:
2884 /* gather4 sampler is broken for green channel on RG32F --
2885 * we must ask for blue instead.
2886 */
2887 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2888 return 2;
2889 return 1;
2890 case SWIZZLE_Z: return 2;
2891 case SWIZZLE_W: return 3;
2892 default:
2893 unreachable("Not reached"); /* zero, one swizzles handled already */
2894 }
2895 }
2896
2897 void
2898 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2899 src_reg orig_val, uint32_t sampler,
2900 const glsl_type *dest_type)
2901 {
2902 int s = key->tex.swizzles[sampler];
2903
2904 dst_reg swizzled_result = dest;
2905
2906 if (op == ir_query_levels) {
2907 /* # levels is in .w */
2908 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2909 emit(MOV(swizzled_result, orig_val));
2910 return;
2911 }
2912
2913 if (op == ir_txs || dest_type == glsl_type::float_type
2914 || s == SWIZZLE_NOOP || op == ir_tg4) {
2915 emit(MOV(swizzled_result, orig_val));
2916 return;
2917 }
2918
2919
2920 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2921 int swizzle[4] = {0};
2922
2923 for (int i = 0; i < 4; i++) {
2924 switch (GET_SWZ(s, i)) {
2925 case SWIZZLE_ZERO:
2926 zero_mask |= (1 << i);
2927 break;
2928 case SWIZZLE_ONE:
2929 one_mask |= (1 << i);
2930 break;
2931 default:
2932 copy_mask |= (1 << i);
2933 swizzle[i] = GET_SWZ(s, i);
2934 break;
2935 }
2936 }
2937
2938 if (copy_mask) {
2939 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2940 swizzled_result.writemask = copy_mask;
2941 emit(MOV(swizzled_result, orig_val));
2942 }
2943
2944 if (zero_mask) {
2945 swizzled_result.writemask = zero_mask;
2946 emit(MOV(swizzled_result, src_reg(0.0f)));
2947 }
2948
2949 if (one_mask) {
2950 swizzled_result.writemask = one_mask;
2951 emit(MOV(swizzled_result, src_reg(1.0f)));
2952 }
2953 }
2954
2955 void
2956 vec4_visitor::visit(ir_return *)
2957 {
2958 unreachable("not reached");
2959 }
2960
2961 void
2962 vec4_visitor::visit(ir_discard *)
2963 {
2964 unreachable("not reached");
2965 }
2966
2967 void
2968 vec4_visitor::visit(ir_if *ir)
2969 {
2970 /* Don't point the annotation at the if statement, because then it plus
2971 * the then and else blocks get printed.
2972 */
2973 this->base_ir = ir->condition;
2974
2975 if (devinfo->gen == 6) {
2976 emit_if_gen6(ir);
2977 } else {
2978 enum brw_predicate predicate;
2979 emit_bool_to_cond_code(ir->condition, &predicate);
2980 emit(IF(predicate));
2981 }
2982
2983 visit_instructions(&ir->then_instructions);
2984
2985 if (!ir->else_instructions.is_empty()) {
2986 this->base_ir = ir->condition;
2987 emit(BRW_OPCODE_ELSE);
2988
2989 visit_instructions(&ir->else_instructions);
2990 }
2991
2992 this->base_ir = ir->condition;
2993 emit(BRW_OPCODE_ENDIF);
2994 }
2995
2996 void
2997 vec4_visitor::gs_emit_vertex(int stream_id)
2998 {
2999 unreachable("not reached");
3000 }
3001
3002 void
3003 vec4_visitor::visit(ir_emit_vertex *)
3004 {
3005 unreachable("not reached");
3006 }
3007
3008 void
3009 vec4_visitor::gs_end_primitive()
3010 {
3011 unreachable("not reached");
3012 }
3013
3014
3015 void
3016 vec4_visitor::visit(ir_end_primitive *)
3017 {
3018 unreachable("not reached");
3019 }
3020
3021 void
3022 vec4_visitor::visit(ir_barrier *)
3023 {
3024 unreachable("not reached");
3025 }
3026
3027 void
3028 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3029 dst_reg dst, src_reg offset,
3030 src_reg src0, src_reg src1)
3031 {
3032 unsigned mlen = 0;
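/* Build the message payload one MRF at a time: the offset always goes
 * first, and src0/src1 follow only when present (they are BAD_FILE for the
 * increment/decrement intrinsics handled by our caller).
 */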
3033
3034 /* Set the atomic operation offset. */
3035 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3036 mlen++;
3037
3038 /* Set the atomic operation arguments. */
3039 if (src0.file != BAD_FILE) {
3040 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3041 mlen++;
3042 }
3043
3044 if (src1.file != BAD_FILE) {
3045 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3046 mlen++;
3047 }
3048
3049 /* Emit the instruction. Note that this maps to the normal SIMD8
3050 * untyped atomic message on Ivy Bridge, but that's OK because
3051 * unused channels will be masked out.
3052 */
3053 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3054 brw_message_reg(0),
3055 src_reg(surf_index), src_reg(atomic_op));
3056 inst->mlen = mlen;
3057 }
3058
3059 void
3060 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3061 src_reg offset)
3062 {
3063 /* Set the surface read offset. */
3064 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3065
3066 /* Emit the instruction. Note that this maps to the normal SIMD8
3067 * untyped surface read message, but that's OK because unused
3068 * channels will be masked out.
3069 */
3070 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3071 brw_message_reg(0),
3072 src_reg(surf_index), src_reg(1));
3073 inst->mlen = 1;
3074 }
3075
3076 void
3077 vec4_visitor::emit_ndc_computation()
3078 {
3079 /* Get the position */
3080 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3081
3082 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3083 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3084 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3085
3086 current_annotation = "NDC";
3087 dst_reg ndc_w = ndc;
3088 ndc_w.writemask = WRITEMASK_W;
3089 src_reg pos_w = pos;
3090 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3091 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3092
3093 dst_reg ndc_xyz = ndc;
3094 ndc_xyz.writemask = WRITEMASK_XYZ;
3095
3096 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3097 }
3098
3099 void
3100 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3101 {
3102 if (devinfo->gen < 6 &&
3103 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3104 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3105 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3106 dst_reg header1_w = header1;
3107 header1_w.writemask = WRITEMASK_W;
3108
3109 emit(MOV(header1, 0u));
3110
3111 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3112 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3113
3114 current_annotation = "Point size";
3115 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3116 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3117 }
3118
3119 if (key->userclip_active) {
3120 current_annotation = "Clipping flags";
3121 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3122 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3123
3124 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3125 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3126 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3127
3128 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3129 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3130 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3131 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3132 }
3133
3134 /* i965 clipping workaround:
3135 * 1) Test for -ve rhw
3136 * 2) If set,
3137 * set ndc = (0,0,0,0)
3138 * set ucp[6] = 1
3139 *
3140 * Later, clipping will detect ucp[6] and ensure the primitive is
3141 * clipped against all fixed planes.
3142 */
3143 if (devinfo->has_negative_rhw_bug) {
3144 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3145 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3146 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3147 vec4_instruction *inst;
3148 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3149 inst->predicate = BRW_PREDICATE_NORMAL;
3150 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3151 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3152 inst->predicate = BRW_PREDICATE_NORMAL;
3153 }
3154
3155 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3156 } else if (devinfo->gen < 6) {
3157 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3158 } else {
3159 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3160 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3161 dst_reg reg_w = reg;
3162 reg_w.writemask = WRITEMASK_W;
3163 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3164 reg_as_src.type = reg_w.type;
3165 reg_as_src.swizzle = brw_swizzle_for_size(1);
3166 emit(MOV(reg_w, reg_as_src));
3167 }
3168 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3169 dst_reg reg_y = reg;
3170 reg_y.writemask = WRITEMASK_Y;
3171 reg_y.type = BRW_REGISTER_TYPE_D;
3172 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3173 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3174 }
3175 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3176 dst_reg reg_z = reg;
3177 reg_z.writemask = WRITEMASK_Z;
3178 reg_z.type = BRW_REGISTER_TYPE_D;
3179 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3180 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3181 }
3182 }
3183 }
3184
3185 void
3186 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3187 {
3188 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3189 *
3190 * "If a linked set of shaders forming the vertex stage contains no
3191 * static write to gl_ClipVertex or gl_ClipDistance, but the
3192 * application has requested clipping against user clip planes through
3193 * the API, then the coordinate written to gl_Position is used for
3194 * comparison against the user clip planes."
3195 *
3196 * This function is only called if the shader didn't write to
3197 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3198 * if the user wrote to it; otherwise we use gl_Position.
3199 */
3200 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3201 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3202 clip_vertex = VARYING_SLOT_POS;
3203 }
3204
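/* Each DP4 produces one clip distance, so a vec4 output slot holds four of
 * them: this is called with offset 0 for user planes 0-3 and offset 4 for
 * planes 4-7.
 */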
3205 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3206 ++i) {
3207 reg.writemask = 1 << i;
3208 emit(DP4(reg,
3209 src_reg(output_reg[clip_vertex]),
3210 src_reg(this->userplane[i + offset])));
3211 }
3212 }
3213
3214 vec4_instruction *
3215 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3216 {
3217 assert(varying < VARYING_SLOT_MAX);
3218 assert(output_reg[varying].type == reg.type);
3219 current_annotation = output_reg_annotation[varying];
3220 /* Copy the register, saturating if necessary */
3221 return emit(MOV(reg, src_reg(output_reg[varying])));
3222 }
3223
3224 void
3225 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3226 {
3227 reg.type = BRW_REGISTER_TYPE_F;
3228 output_reg[varying].type = reg.type;
3229
3230 switch (varying) {
3231 case VARYING_SLOT_PSIZ:
3232 {
3233 /* PSIZ is always in slot 0, and is coupled with other flags. */
3234 current_annotation = "indices, point width, clip flags";
3235 emit_psiz_and_flags(reg);
3236 break;
3237 }
3238 case BRW_VARYING_SLOT_NDC:
3239 current_annotation = "NDC";
3240 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3241 break;
3242 case VARYING_SLOT_POS:
3243 current_annotation = "gl_Position";
3244 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3245 break;
3246 case VARYING_SLOT_EDGE:
3247 /* This is present when doing unfilled polygons. We're supposed to copy
3248 * the edge flag from the user-provided vertex array
3249 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3250 * of that attribute (starts as 1.0f). This is then used in clipping to
3251 * determine which edges should be drawn as wireframe.
3252 */
3253 current_annotation = "edge flag";
3254 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3255 glsl_type::float_type, WRITEMASK_XYZW))));
3256 break;
3257 case BRW_VARYING_SLOT_PAD:
3258 /* No need to write to this slot */
3259 break;
3260 case VARYING_SLOT_COL0:
3261 case VARYING_SLOT_COL1:
3262 case VARYING_SLOT_BFC0:
3263 case VARYING_SLOT_BFC1: {
3264 /* These built-in varyings are only supported in compatibility mode,
3265 * and we only support GS in core profile. So, this must be a vertex
3266 * shader.
3267 */
3268 assert(stage == MESA_SHADER_VERTEX);
3269 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3270 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3271 inst->saturate = true;
3272 break;
3273 }
3274
3275 default:
3276 emit_generic_urb_slot(reg, varying);
3277 break;
3278 }
3279 }
3280
3281 static int
3282 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3283 {
3284 if (devinfo->gen >= 6) {
3285 /* URB data written (does not include the message header reg) must
3286 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3287 * section 5.4.3.2.2: URB_INTERLEAVED.
3288 *
3289 * URB entries are allocated on a multiple of 1024 bits, so an
3290 * extra 128 bits written here to make the end align to 256 is
3291 * no problem.
3292 */
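/* mlen includes the message header register, so the URB data is mlen - 1
 * registers; rounding an even mlen up to odd keeps the data a multiple of
 * two registers.  E.g. header + 3 slots gives mlen 4, padded to 5.
 */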
3293 if ((mlen % 2) != 1)
3294 mlen++;
3295 }
3296
3297 return mlen;
3298 }
3299
3300
3301 /**
3302 * Generates the VUE payload plus the necessary URB write instructions to
3303 * output it.
3304 *
3305 * The VUE layout is documented in Volume 2a.
3306 */
3307 void
3308 vec4_visitor::emit_vertex()
3309 {
3310 /* MRF 0 is reserved for the debugger, so start with message header
3311 * in MRF 1.
3312 */
3313 int base_mrf = 1;
3314 int mrf = base_mrf;
3315 /* In the process of generating our URB write message contents, we
3316 * may need to unspill a register or load from an array. Those
3317 * reads would use MRFs 14-15.
3318 */
3319 int max_usable_mrf = 13;
3320
3321 /* The following assertion verifies that max_usable_mrf causes an
3322 * even-numbered amount of URB write data, which will meet gen6's
3323 * requirements for length alignment.
3324 */
3325 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3326
3327 /* First mrf is the g0-based message header containing URB handles and
3328 * such.
3329 */
3330 emit_urb_write_header(mrf++);
3331
3332 if (devinfo->gen < 6) {
3333 emit_ndc_computation();
3334 }
3335
3336 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3337 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3338 current_annotation = "user clip distances";
3339
3340 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3341 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3342
3343 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3344 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3345 }
3346
3347 /* We may need to split this up into several URB writes, so do them in a
3348 * loop.
3349 */
3350 int slot = 0;
3351 bool complete = false;
3352 do {
3353 /* URB offset is in URB row increments, and each of our MRFs is half of
3354 * one of those, since we're doing interleaved writes.
3355 */
3356 int offset = slot / 2;
3357
3358 mrf = base_mrf + 1;
3359 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3360 emit_urb_slot(dst_reg(MRF, mrf++),
3361 prog_data->vue_map.slot_to_varying[slot]);
3362
3363 /* If this was max_usable_mrf, we can't fit anything more into this
3364 * URB WRITE.
3365 */
3366 if (mrf > max_usable_mrf) {
3367 slot++;
3368 break;
3369 }
3370 }
3371
3372 complete = slot >= prog_data->vue_map.num_slots;
3373 current_annotation = "URB write";
3374 vec4_instruction *inst = emit_urb_write_opcode(complete);
3375 inst->base_mrf = base_mrf;
3376 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3377 inst->offset += offset;
3378 } while (!complete);
3379 }
3380
3381
3382 src_reg
3383 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3384 src_reg *reladdr, int reg_offset)
3385 {
3386 /* Because we store the values to scratch interleaved like our
3387 * vertex data, we need to scale the vec4 index by 2.
3388 */
3389 int message_header_scale = 2;
3390
3391 /* Pre-gen6, the message header uses byte offsets instead of vec4
3392 * (16-byte) offset units.
3393 */
3394 if (devinfo->gen < 6)
3395 message_header_scale *= 16;
3396
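/* E.g. reg_offset 3 becomes offset 6 on Gen6+ (scaled by 2 as explained
 * above) and 96 on Gen4-5, where the header wants a byte offset.
 */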
3397 if (reladdr) {
3398 src_reg index = src_reg(this, glsl_type::int_type);
3399
3400 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3401 src_reg(reg_offset)));
3402 emit_before(block, inst, MUL(dst_reg(index), index,
3403 src_reg(message_header_scale)));
3404
3405 return index;
3406 } else {
3407 return src_reg(reg_offset * message_header_scale);
3408 }
3409 }
3410
3411 src_reg
3412 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3413 src_reg *reladdr, int reg_offset)
3414 {
3415 if (reladdr) {
3416 src_reg index = src_reg(this, glsl_type::int_type);
3417
3418 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3419 src_reg(reg_offset)));
3420
3421 /* Pre-gen6, the message header uses byte offsets instead of vec4
3422 * (16-byte) offset units.
3423 */
3424 if (devinfo->gen < 6) {
3425 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3426 }
3427
3428 return index;
3429 } else if (devinfo->gen >= 8) {
3430 /* Store the offset in a GRF so we can send-from-GRF. */
3431 src_reg offset = src_reg(this, glsl_type::int_type);
3432 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3433 return offset;
3434 } else {
3435 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3436 return src_reg(reg_offset * message_header_scale);
3437 }
3438 }
3439
3440 /**
3441 * Emits an instruction before @inst to load the value named by @orig_src
3442 * from scratch space at @base_offset to @temp.
3443 *
3444 * @base_offset is measured in 32-byte units (the size of a register).
3445 */
3446 void
3447 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3448 dst_reg temp, src_reg orig_src,
3449 int base_offset)
3450 {
3451 int reg_offset = base_offset + orig_src.reg_offset;
3452 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3453 reg_offset);
3454
3455 emit_before(block, inst, SCRATCH_READ(temp, index));
3456 }
3457
3458 /**
3459 * Emits an instruction after @inst to store the value to be written
3460 * to @orig_dst to scratch space at @base_offset, from @temp.
3461 *
3462 * @base_offset is measured in 32-byte units (the size of a register).
3463 */
3464 void
3465 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3466 int base_offset)
3467 {
3468 int reg_offset = base_offset + inst->dst.reg_offset;
3469 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3470 reg_offset);
3471
3472 /* Create a temporary register to store *inst's result in.
3473 *
3474 * We have to be careful in MOVing from our temporary result register in
3475 * the scratch write. If we swizzle from channels of the temporary that
3476 * weren't initialized, it will confuse live interval analysis, which will
3477 * make spilling fail to make progress.
3478 */
3479 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3480 inst->dst.type),
3481 brw_swizzle_for_mask(inst->dst.writemask));
3482 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3483 inst->dst.writemask));
3484 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
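/* Propagate the predicate to the scratch write, except for SEL: a SEL
 * writes its destination on both predicate outcomes (it merely picks a
 * different source), so spilling its result must be unconditional or the
 * channels taken from src1 would never make it to scratch.
 */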
3485 if (inst->opcode != BRW_OPCODE_SEL)
3486 write->predicate = inst->predicate;
3487 write->ir = inst->ir;
3488 write->annotation = inst->annotation;
3489 inst->insert_after(block, write);
3490
3491 inst->dst.file = temp.file;
3492 inst->dst.reg = temp.reg;
3493 inst->dst.reg_offset = temp.reg_offset;
3494 inst->dst.reladdr = NULL;
3495 }
3496
3497 /**
3498 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3499 * adds the scratch read(s) before \p inst. The function also checks for
3500 * recursive reladdr scratch accesses, issuing the corresponding scratch
3501 * loads and rewriting reladdr references accordingly.
3502 *
3503 * \return \p src if it did not require a scratch load, otherwise, the
3504 * register holding the result of the scratch load that the caller should
3505 * use to rewrite src.
3506 */
3507 src_reg
3508 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3509 vec4_instruction *inst, src_reg src)
3510 {
3511 /* Resolve recursive reladdr scratch access by calling ourselves
3512 * with src.reladdr
3513 */
3514 if (src.reladdr)
3515 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3516 *src.reladdr);
3517
3518 /* Now handle scratch access on src */
3519 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3520 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3521 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3522 src.reg = temp.reg;
3523 src.reg_offset = temp.reg_offset;
3524 src.reladdr = NULL;
3525 }
3526
3527 return src;
3528 }
3529
3530 /**
3531 * We can't generally support array access in GRF space, because a
3532 * single instruction's destination can only span 2 contiguous
3533 * registers. So, we send all GRF arrays that get variable index
3534 * access to scratch space.
3535 */
3536 void
3537 vec4_visitor::move_grf_array_access_to_scratch()
3538 {
3539 int scratch_loc[this->alloc.count];
3540 memset(scratch_loc, -1, sizeof(scratch_loc));
3541
3542 /* First, calculate the set of virtual GRFs that need to be punted
3543 * to scratch due to having any array access on them, and where in
3544 * scratch.
3545 */
3546 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3547 if (inst->dst.file == GRF && inst->dst.reladdr) {
3548 if (scratch_loc[inst->dst.reg] == -1) {
3549 scratch_loc[inst->dst.reg] = last_scratch;
3550 last_scratch += this->alloc.sizes[inst->dst.reg];
3551 }
3552
3553 for (src_reg *iter = inst->dst.reladdr;
3554 iter->reladdr;
3555 iter = iter->reladdr) {
3556 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3557 scratch_loc[iter->reg] = last_scratch;
3558 last_scratch += this->alloc.sizes[iter->reg];
3559 }
3560 }
3561 }
3562
3563 for (int i = 0 ; i < 3; i++) {
3564 for (src_reg *iter = &inst->src[i];
3565 iter->reladdr;
3566 iter = iter->reladdr) {
3567 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3568 scratch_loc[iter->reg] = last_scratch;
3569 last_scratch += this->alloc.sizes[iter->reg];
3570 }
3571 }
3572 }
3573 }
3574
3575 /* Now, for anything that will be accessed through scratch, rewrite
3576 * it to load/store. Note that this is a _safe list walk, because
3577 * we may generate a new scratch_write instruction after the one
3578 * we're processing.
3579 */
3580 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3581 /* Set up the annotation tracking for new generated instructions. */
3582 base_ir = inst->ir;
3583 current_annotation = inst->annotation;
3584
3585 /* First handle scratch access on the dst. Notice we have to handle
3586 * the case where the dst's reladdr also points to scratch space.
3587 */
3588 if (inst->dst.reladdr)
3589 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3590 *inst->dst.reladdr);
3591
3592 /* Now that we have handled any (possibly recursive) reladdr scratch
3593 * accesses for dst we can safely do the scratch write for dst itself
3594 */
3595 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3596 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3597
3598 /* Now handle scratch access on any src. In this case, since inst->src[i]
3599 * already is a src_reg, we can just call emit_resolve_reladdr with
3600 * inst->src[i] and it will take care of handling scratch loads for
3601 * both src and src.reladdr (recursively).
3602 */
3603 for (int i = 0 ; i < 3; i++) {
3604 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3605 inst->src[i]);
3606 }
3607 }
3608 }
3609
3610 /**
3611 * Emits an instruction before @inst to load the value named by @orig_src
3612 * from the pull constant buffer (surface) at @base_offset to @temp.
3613 */
3614 void
3615 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3616 dst_reg temp, src_reg orig_src,
3617 int base_offset)
3618 {
3619 int reg_offset = base_offset + orig_src.reg_offset;
3620 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3621 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3622 reg_offset);
3623
3624 emit_pull_constant_load_reg(temp,
3625 index,
3626 offset,
3627 block, inst);
3628 }
3629
3630 /**
3631 * Implements array access of uniforms by inserting a
3632 * PULL_CONSTANT_LOAD instruction.
3633 *
3634 * Unlike temporary GRF array access (where we don't support it due to
3635 * the difficulty of doing relative addressing on instruction
3636 * destinations), we could potentially do array access of uniforms
3637 * that were loaded in GRF space as push constants. In real-world
3638 * usage we've seen, though, the arrays being used are always larger
3639 * than we could load as push constants, so just always move all
3640 * uniform array access out to a pull constant buffer.
3641 */
3642 void
3643 vec4_visitor::move_uniform_array_access_to_pull_constants()
3644 {
3645 int pull_constant_loc[this->uniforms];
3646 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3647 bool nested_reladdr;
3648
3649 /* Walk through and find array access of uniforms. Put a copy of that
3650 * uniform in the pull constant buffer.
3651 *
3652 * Note that we don't move constant-indexed accesses to arrays. No
3653 * testing has been done of the performance impact of this choice.
3654 */
3655 do {
3656 nested_reladdr = false;
3657
3658 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3659 for (int i = 0 ; i < 3; i++) {
3660 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3661 continue;
3662
3663 int uniform = inst->src[i].reg;
3664
3665 if (inst->src[i].reladdr->reladdr)
3666 nested_reladdr = true; /* will need another pass */
3667
3668 /* If this array isn't already present in the pull constant buffer,
3669 * add it.
3670 */
3671 if (pull_constant_loc[uniform] == -1) {
3672 const gl_constant_value **values =
3673 &stage_prog_data->param[uniform * 4];
3674
3675 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3676
3677 assert(uniform < uniform_array_size);
3678 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3679 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3680 = values[j];
3681 }
3682 }
3683
3684 /* Set up the annotation tracking for newly generated instructions. */
3685 base_ir = inst->ir;
3686 current_annotation = inst->annotation;
3687
3688 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3689
3690 emit_pull_constant_load(block, inst, temp, inst->src[i],
3691 pull_constant_loc[uniform]);
3692
3693 inst->src[i].file = temp.file;
3694 inst->src[i].reg = temp.reg;
3695 inst->src[i].reg_offset = temp.reg_offset;
3696 inst->src[i].reladdr = NULL;
3697 }
3698 }
3699 } while (nested_reladdr);
3700
3701 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3702 * no need to track them as larger-than-vec4 objects. This will be
3703 * relied on in cutting out unused uniform vectors from push
3704 * constants.
3705 */
3706 split_uniform_registers();
3707 }
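
/* Sketch of the transformation above (illustrative only): a uniform array
 * access with a variable index such as
 *
 *    mov dst, uniform_array[a0]
 *
 * has the array copied into the pull constant buffer (once per array), and
 * the access becomes roughly
 *
 *    pull_constant_load tmp, pull_constant_loc[uniform_array], a0
 *    mov dst, tmp
 *
 * so no UNIFORM-file source with a reladdr survives this pass.
 */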
3708
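/**
 * Rewrite a negated unsigned (UD) source so its consumer sees no modifier.
 *
 * If @reg is a UD register with the negate flag set, emit a MOV of the
 * negated value into a fresh temporary and repoint @reg at that temporary.
 */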
3709 void
3710 vec4_visitor::resolve_ud_negate(src_reg *reg)
3711 {
3712 if (reg->type != BRW_REGISTER_TYPE_UD ||
3713 !reg->negate)
3714 return;
3715
3716 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3717 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3718 *reg = temp;
3719 }
3720
3721 /**
3722 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3723 *
3724 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3725 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
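 *
 * For example, a "true" result may come back with only the low bit known to
 * be set; AND-ing with 1 leaves just that bit (1), and the negated MOV turns
 * it into ~0 (-1), while a "false" result stays 0 throughout.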
3726 */
3727 void
3728 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3729 {
3730 assert(devinfo->gen <= 5);
3731
3732 if (!rvalue->type->is_boolean())
3733 return;
3734
3735 src_reg and_result = src_reg(this, rvalue->type);
3736 src_reg neg_result = src_reg(this, rvalue->type);
3737 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3738 emit(MOV(dst_reg(neg_result), negate(and_result)));
3739 *reg = neg_result;
3740 }
3741
3742 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3743 void *log_data,
3744 struct gl_program *prog,
3745 const struct brw_vue_prog_key *key,
3746 struct brw_vue_prog_data *prog_data,
3747 struct gl_shader_program *shader_prog,
3748 gl_shader_stage stage,
3749 void *mem_ctx,
3750 bool no_spills,
3751 int shader_time_index)
3752 : backend_shader(compiler, log_data, mem_ctx,
3753 shader_prog, prog, &prog_data->base, stage),
3754 key(key),
3755 prog_data(prog_data),
3756 sanity_param_count(0),
3757 fail_msg(NULL),
3758 first_non_payload_grf(0),
3759 need_all_constants_in_pull_buffer(false),
3760 no_spills(no_spills),
3761 shader_time_index(shader_time_index),
3762 last_scratch(0)
3763 {
3764 this->failed = false;
3765
3766 this->base_ir = NULL;
3767 this->current_annotation = NULL;
3768 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3769
3770 this->variable_ht = hash_table_ctor(0,
3771 hash_table_pointer_hash,
3772 hash_table_pointer_compare);
3773
3774 this->virtual_grf_start = NULL;
3775 this->virtual_grf_end = NULL;
3776 this->live_intervals = NULL;
3777
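/* Gen7+ has no MRF registers; the upper GRFs stand in for them (starting at
 * GEN7_MRF_HACK_START), so cap the allocatable GRF range below that point.
 */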
3778 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3779
3780 this->uniforms = 0;
3781
3782 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3783 * at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3784 */
3785 this->uniform_array_size = 1;
3786 if (prog_data) {
3787 this->uniform_array_size =
3788 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3789 }
3790
3791 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3792 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3793 }
3794
3795 vec4_visitor::~vec4_visitor()
3796 {
3797 hash_table_dtor(this->variable_ht);
3798 }
3799
3800
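/**
 * Mark the compile as failed and record the failure message.
 *
 * Only the first failure is recorded; the message is prefixed with the
 * stage abbreviation and printed to stderr when debugging is enabled.
 */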
3801 void
3802 vec4_visitor::fail(const char *format, ...)
3803 {
3804 va_list va;
3805 char *msg;
3806
3807 if (failed)
3808 return;
3809
3810 failed = true;
3811
3812 va_start(va, format);
3813 msg = ralloc_vasprintf(mem_ctx, format, va);
3814 va_end(va);
3815 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3816
3817 this->fail_msg = msg;
3818
3819 if (debug_enabled) {
3820 fprintf(stderr, "%s", msg);
3821 }
3822 }
3823
3824 } /* namespace brw */