i965: Use a single index per shader for shader_time.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
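/* For illustration, ALU2(ADD) above expands to roughly the following builder.
 * Note that these builders only allocate the instruction; callers still hand
 * the result to emit(), e.g. emit(ADD(dst, a, b)).
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */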
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of each destination channel to the result of the
220 * comparison, leaves the upper bits undefined, and updates the flag
221 * register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
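/* A sketch of the effect: a MAD whose second source is a vec4 uniform would,
 * after this fixup, read a freshly-written temporary instead, roughly
 *
 *    unpack_uniform tmp, u0        (VEC4_OPCODE_UNPACK_UNIFORM)
 *    mad            dst, a, tmp, b
 *
 * so the three-source instruction only ever sees an ordinary GRF source.
 */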
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
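/* As a concrete example of the gen6 special case above: emitting EXP2 into
 * dst.xy becomes roughly
 *
 *    math.exp2 tmp.xyzw, src
 *    mov       dst.xy,   tmp
 *
 * so the align1 MATH never needs a partial writemask, while on gen4/5 the
 * operation goes out as a message and only base_mrf/mlen are filled in.
 */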
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * The upper word of each write-channel must be 0 for the following
417 * bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
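/* Worked example of the sequence above, assuming src0 = vec2(1.0, 2.0):
 * f32to16 leaves tmp.x = 0x00003c00 and tmp.y = 0x00004000 (the half-float
 * encodings, with the upper words cleared as described above). The SHL of
 * tmp.yyyy by 16 gives 0x40000000, and the final OR with tmp.xxxx yields
 * 0x40003c00 in each write-channel of dst, matching
 * packHalf2x16(vec2(1.0, 2.0)).
 */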
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
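/* The constant above is a packed vector-float (VF) immediate: each byte is a
 * restricted 8-bit float (1 sign bit, 3 exponent bits with a bias of 3, and
 * 4 mantissa bits), so 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0, 16.0
 * and 24.0. The type-converting MOV turns those into the per-channel shift
 * counts <0, 8, 16, 24> consumed by the SHR.
 */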
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
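/* Worked example for one channel of the snorm path above: an input of 0.5 is
 * clamped to [-1, 1], scaled to 63.5, rounded-to-even to 64, and packed as
 * the byte 0x40; an input of -2.0 clamps to -1.0 and packs as -127 (0x81).
 * The unorm variant behaves the same way with a [0, 1] saturate and a scale
 * of 255.
 */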
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
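/* A few examples of the unit returned here (one unit == one vec4 register
 * slot): float, vec2 and vec4 all return 1; mat3 returns 3 and mat4 returns 4
 * (one slot per column); vec4[8] returns 8; struct { vec3 a; float b; }
 * returns 2; samplers and atomic counters return 0 since they occupy no
 * register space.
 */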
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (storage->builtin)
690 continue;
691
692 if (strncmp(ir->name, storage->name, namelen) != 0 ||
693 (storage->name[namelen] != 0 &&
694 storage->name[namelen] != '.' &&
695 storage->name[namelen] != '[')) {
696 continue;
697 }
698
699 gl_constant_value *components = storage->storage;
700 unsigned vector_count = (MAX2(storage->array_elements, 1) *
701 storage->type->matrix_columns);
702
703 for (unsigned s = 0; s < vector_count; s++) {
704 assert(uniforms < uniform_array_size);
705 uniform_vector_size[uniforms] = storage->type->vector_elements;
706
707 int i;
708 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
709 stage_prog_data->param[uniforms * 4 + i] = components;
710 components++;
711 }
712 for (; i < 4; i++) {
713 static gl_constant_value zero = { 0.0 };
714 stage_prog_data->param[uniforms * 4 + i] = &zero;
715 }
716
717 uniforms++;
718 }
719 }
720 }
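/* For example, a "uniform vec3 u" consumes a single uniform vector slot here:
 * uniform_vector_size[] records 3, the first three param[] entries of the
 * slot point at the three stored components, and the fourth points at the
 * shared zero constant so every slot is fully populated.
 */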
721
722 void
723 vec4_visitor::setup_uniform_clipplane_values()
724 {
725 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
726
727 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
728 assert(this->uniforms < uniform_array_size);
729 this->uniform_vector_size[this->uniforms] = 4;
730 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
731 this->userplane[i].type = BRW_REGISTER_TYPE_F;
732 for (int j = 0; j < 4; ++j) {
733 stage_prog_data->param[this->uniforms * 4 + j] =
734 (gl_constant_value *) &clip_planes[i][j];
735 }
736 ++this->uniforms;
737 }
738 }
739
740 /* Our support for builtin uniforms is even scarier than non-builtin.
741 * It sits on top of the PROG_STATE_VAR parameters that are
742 * automatically updated from GL context state.
743 */
744 void
745 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
746 {
747 const ir_state_slot *const slots = ir->get_state_slots();
748 assert(slots != NULL);
749
750 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
751 /* This state reference has already been set up by ir_to_mesa,
752 * but we'll get the same index back here. We can reference
753 * ParameterValues directly, since unlike brw_fs.cpp, we never
754 * add new state references during compile.
755 */
756 int index = _mesa_add_state_reference(this->prog->Parameters,
757 (gl_state_index *)slots[i].tokens);
758 gl_constant_value *values =
759 &this->prog->Parameters->ParameterValues[index][0];
760
761 assert(this->uniforms < uniform_array_size);
762
763 for (unsigned j = 0; j < 4; j++)
764 stage_prog_data->param[this->uniforms * 4 + j] =
765 &values[GET_SWZ(slots[i].swizzle, j)];
766
767 this->uniform_vector_size[this->uniforms] =
768 (ir->type->is_scalar() || ir->type->is_vector() ||
769 ir->type->is_matrix() ? ir->type->vector_elements : 4);
770
771 this->uniforms++;
772 }
773 }
774
775 dst_reg *
776 vec4_visitor::variable_storage(ir_variable *var)
777 {
778 return (dst_reg *)hash_table_find(this->variable_ht, var);
779 }
780
781 void
782 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
783 enum brw_predicate *predicate)
784 {
785 ir_expression *expr = ir->as_expression();
786
787 *predicate = BRW_PREDICATE_NORMAL;
788
789 if (expr && expr->operation != ir_binop_ubo_load) {
790 src_reg op[3];
791 vec4_instruction *inst;
792
793 assert(expr->get_num_operands() <= 3);
794 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
795 expr->operands[i]->accept(this);
796 op[i] = this->result;
797
798 resolve_ud_negate(&op[i]);
799 }
800
801 switch (expr->operation) {
802 case ir_unop_logic_not:
803 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
804 inst->conditional_mod = BRW_CONDITIONAL_Z;
805 break;
806
807 case ir_binop_logic_xor:
808 if (devinfo->gen <= 5) {
809 src_reg temp = src_reg(this, ir->type);
810 emit(XOR(dst_reg(temp), op[0], op[1]));
811 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
812 } else {
813 inst = emit(XOR(dst_null_d(), op[0], op[1]));
814 }
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 break;
817
818 case ir_binop_logic_or:
819 if (devinfo->gen <= 5) {
820 src_reg temp = src_reg(this, ir->type);
821 emit(OR(dst_reg(temp), op[0], op[1]));
822 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
823 } else {
824 inst = emit(OR(dst_null_d(), op[0], op[1]));
825 }
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 break;
828
829 case ir_binop_logic_and:
830 if (devinfo->gen <= 5) {
831 src_reg temp = src_reg(this, ir->type);
832 emit(AND(dst_reg(temp), op[0], op[1]));
833 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
834 } else {
835 inst = emit(AND(dst_null_d(), op[0], op[1]));
836 }
837 inst->conditional_mod = BRW_CONDITIONAL_NZ;
838 break;
839
840 case ir_unop_f2b:
841 if (devinfo->gen >= 6) {
842 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
843 } else {
844 inst = emit(MOV(dst_null_f(), op[0]));
845 inst->conditional_mod = BRW_CONDITIONAL_NZ;
846 }
847 break;
848
849 case ir_unop_i2b:
850 if (devinfo->gen >= 6) {
851 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 } else {
853 inst = emit(MOV(dst_null_d(), op[0]));
854 inst->conditional_mod = BRW_CONDITIONAL_NZ;
855 }
856 break;
857
858 case ir_binop_all_equal:
859 if (devinfo->gen <= 5) {
860 resolve_bool_comparison(expr->operands[0], &op[0]);
861 resolve_bool_comparison(expr->operands[1], &op[1]);
862 }
863 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
864 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
865 break;
866
867 case ir_binop_any_nequal:
868 if (devinfo->gen <= 5) {
869 resolve_bool_comparison(expr->operands[0], &op[0]);
870 resolve_bool_comparison(expr->operands[1], &op[1]);
871 }
872 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
873 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
874 break;
875
876 case ir_unop_any:
877 if (devinfo->gen <= 5) {
878 resolve_bool_comparison(expr->operands[0], &op[0]);
879 }
880 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
881 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
882 break;
883
884 case ir_binop_greater:
885 case ir_binop_gequal:
886 case ir_binop_less:
887 case ir_binop_lequal:
888 case ir_binop_equal:
889 case ir_binop_nequal:
890 if (devinfo->gen <= 5) {
891 resolve_bool_comparison(expr->operands[0], &op[0]);
892 resolve_bool_comparison(expr->operands[1], &op[1]);
893 }
894 emit(CMP(dst_null_d(), op[0], op[1],
895 brw_conditional_for_comparison(expr->operation)));
896 break;
897
898 case ir_triop_csel: {
899 /* Expand the boolean condition into the flag register. */
900 inst = emit(MOV(dst_null_d(), op[0]));
901 inst->conditional_mod = BRW_CONDITIONAL_NZ;
902
903 /* Select which boolean to return. */
904 dst_reg temp(this, expr->operands[1]->type);
905 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
906 inst->predicate = BRW_PREDICATE_NORMAL;
907
908 /* Expand the result to a condition code. */
909 inst = emit(MOV(dst_null_d(), src_reg(temp)));
910 inst->conditional_mod = BRW_CONDITIONAL_NZ;
911 break;
912 }
913
914 default:
915 unreachable("not reached");
916 }
917 return;
918 }
919
920 ir->accept(this);
921
922 resolve_ud_negate(&this->result);
923
924 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
925 inst->conditional_mod = BRW_CONDITIONAL_NZ;
926 }
927
928 /**
929 * Emit a gen6 IF statement with the comparison folded into the IF
930 * instruction.
931 */
932 void
933 vec4_visitor::emit_if_gen6(ir_if *ir)
934 {
935 ir_expression *expr = ir->condition->as_expression();
936
937 if (expr && expr->operation != ir_binop_ubo_load) {
938 src_reg op[3];
939 dst_reg temp;
940
941 assert(expr->get_num_operands() <= 3);
942 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
943 expr->operands[i]->accept(this);
944 op[i] = this->result;
945 }
946
947 switch (expr->operation) {
948 case ir_unop_logic_not:
949 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
950 return;
951
952 case ir_binop_logic_xor:
953 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
954 return;
955
956 case ir_binop_logic_or:
957 temp = dst_reg(this, glsl_type::bool_type);
958 emit(OR(temp, op[0], op[1]));
959 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
960 return;
961
962 case ir_binop_logic_and:
963 temp = dst_reg(this, glsl_type::bool_type);
964 emit(AND(temp, op[0], op[1]));
965 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
966 return;
967
968 case ir_unop_f2b:
969 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
970 return;
971
972 case ir_unop_i2b:
973 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_greater:
977 case ir_binop_gequal:
978 case ir_binop_less:
979 case ir_binop_lequal:
980 case ir_binop_equal:
981 case ir_binop_nequal:
982 emit(IF(op[0], op[1],
983 brw_conditional_for_comparison(expr->operation)));
984 return;
985
986 case ir_binop_all_equal:
987 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
988 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
989 return;
990
991 case ir_binop_any_nequal:
992 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
993 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
994 return;
995
996 case ir_unop_any:
997 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
998 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
999 return;
1000
1001 case ir_triop_csel: {
1002 /* Expand the boolean condition into the flag register. */
1003 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1004 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1005
1006 /* Select which boolean to return. */
1007 dst_reg temp(this, expr->operands[1]->type);
1008 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1009 inst->predicate = BRW_PREDICATE_NORMAL;
1010
1011 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1012 return;
1013 }
1014
1015 default:
1016 unreachable("not reached");
1017 }
1018 return;
1019 }
1020
1021 ir->condition->accept(this);
1022
1023 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1024 }
1025
1026 void
1027 vec4_visitor::visit(ir_variable *ir)
1028 {
1029 dst_reg *reg = NULL;
1030
1031 if (variable_storage(ir))
1032 return;
1033
1034 switch (ir->data.mode) {
1035 case ir_var_shader_in:
1036 assert(ir->data.location != -1);
1037 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1038 break;
1039
1040 case ir_var_shader_out:
1041 assert(ir->data.location != -1);
1042 reg = new(mem_ctx) dst_reg(this, ir->type);
1043
1044 for (int i = 0; i < type_size(ir->type); i++) {
1045 output_reg[ir->data.location + i] = *reg;
1046 output_reg[ir->data.location + i].reg_offset = i;
1047 output_reg[ir->data.location + i].type =
1048 brw_type_for_base_type(ir->type->get_scalar_type());
1049 output_reg_annotation[ir->data.location + i] = ir->name;
1050 }
1051 break;
1052
1053 case ir_var_auto:
1054 case ir_var_temporary:
1055 reg = new(mem_ctx) dst_reg(this, ir->type);
1056 break;
1057
1058 case ir_var_uniform:
1059 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1060
1061 /* Thanks to the lower_ubo_reference pass, we will see only
1062 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1063 * variables, so no need for them to be in variable_ht.
1064 *
1065 * Some uniforms, such as samplers and atomic counters, have no actual
1066 * storage, so we should ignore them.
1067 */
1068 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1069 return;
1070
1071 /* Track how big the whole uniform variable is, in case we need to put a
1072 * copy of its data into pull constants for array access.
1073 */
1074 assert(this->uniforms < uniform_array_size);
1075 this->uniform_size[this->uniforms] = type_size(ir->type);
1076
1077 if (!strncmp(ir->name, "gl_", 3)) {
1078 setup_builtin_uniform_values(ir);
1079 } else {
1080 setup_uniform_values(ir);
1081 }
1082 break;
1083
1084 case ir_var_system_value:
1085 reg = make_reg_for_system_value(ir);
1086 break;
1087
1088 default:
1089 unreachable("not reached");
1090 }
1091
1092 reg->type = brw_type_for_base_type(ir->type);
1093 hash_table_insert(this->variable_ht, reg, ir);
1094 }
1095
1096 void
1097 vec4_visitor::visit(ir_loop *ir)
1098 {
1099 /* We don't want debugging output to print the whole body of the
1100 * loop as the annotation.
1101 */
1102 this->base_ir = NULL;
1103
1104 emit(BRW_OPCODE_DO);
1105
1106 visit_instructions(&ir->body_instructions);
1107
1108 emit(BRW_OPCODE_WHILE);
1109 }
1110
1111 void
1112 vec4_visitor::visit(ir_loop_jump *ir)
1113 {
1114 switch (ir->mode) {
1115 case ir_loop_jump::jump_break:
1116 emit(BRW_OPCODE_BREAK);
1117 break;
1118 case ir_loop_jump::jump_continue:
1119 emit(BRW_OPCODE_CONTINUE);
1120 break;
1121 }
1122 }
1123
1124
1125 void
1126 vec4_visitor::visit(ir_function_signature *)
1127 {
1128 unreachable("not reached");
1129 }
1130
1131 void
1132 vec4_visitor::visit(ir_function *ir)
1133 {
1134 /* Ignore function bodies other than main() -- we shouldn't see calls to
1135 * them since they should all be inlined.
1136 */
1137 if (strcmp(ir->name, "main") == 0) {
1138 const ir_function_signature *sig;
1139 exec_list empty;
1140
1141 sig = ir->matching_signature(NULL, &empty, false);
1142
1143 assert(sig);
1144
1145 visit_instructions(&sig->body);
1146 }
1147 }
1148
1149 bool
1150 vec4_visitor::try_emit_mad(ir_expression *ir)
1151 {
1152 /* 3-src instructions were introduced in gen6. */
1153 if (devinfo->gen < 6)
1154 return false;
1155
1156 /* MAD can only handle floating-point data. */
1157 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1158 return false;
1159
1160 ir_rvalue *nonmul;
1161 ir_expression *mul;
1162 bool mul_negate, mul_abs;
1163
1164 for (int i = 0; i < 2; i++) {
1165 mul_negate = false;
1166 mul_abs = false;
1167
1168 mul = ir->operands[i]->as_expression();
1169 nonmul = ir->operands[1 - i];
1170
1171 if (mul && mul->operation == ir_unop_abs) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_abs = true;
1174 } else if (mul && mul->operation == ir_unop_neg) {
1175 mul = mul->operands[0]->as_expression();
1176 mul_negate = true;
1177 }
1178
1179 if (mul && mul->operation == ir_binop_mul)
1180 break;
1181 }
1182
1183 if (!mul || mul->operation != ir_binop_mul)
1184 return false;
1185
1186 nonmul->accept(this);
1187 src_reg src0 = fix_3src_operand(this->result);
1188
1189 mul->operands[0]->accept(this);
1190 src_reg src1 = fix_3src_operand(this->result);
1191 src1.negate ^= mul_negate;
1192 src1.abs = mul_abs;
1193 if (mul_abs)
1194 src1.negate = false;
1195
1196 mul->operands[1]->accept(this);
1197 src_reg src2 = fix_3src_operand(this->result);
1198 src2.abs = mul_abs;
1199 if (mul_abs)
1200 src2.negate = false;
1201
1202 this->result = src_reg(this, ir->type);
1203 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1204
1205 return true;
1206 }
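/* In GLSL terms this matches float expressions of the form a * b + c (in
 * either operand order), including cases where the multiply is wrapped in a
 * negate or abs -- e.g. c - a * b reaches this point as add(c, neg(mul(a, b)))
 * and still becomes a single MAD with the negate folded into one source.
 */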
1207
1208 bool
1209 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1210 {
1211 /* This optimization relies on CMP setting the destination to 0 when
1212 * false. Early hardware only sets the least significant bit, and
1213 * leaves the other bits undefined. So we can't use it.
1214 */
1215 if (devinfo->gen < 6)
1216 return false;
1217
1218 ir_expression *const cmp = ir->operands[0]->as_expression();
1219
1220 if (cmp == NULL)
1221 return false;
1222
1223 switch (cmp->operation) {
1224 case ir_binop_less:
1225 case ir_binop_greater:
1226 case ir_binop_lequal:
1227 case ir_binop_gequal:
1228 case ir_binop_equal:
1229 case ir_binop_nequal:
1230 break;
1231
1232 default:
1233 return false;
1234 }
1235
1236 cmp->operands[0]->accept(this);
1237 const src_reg cmp_src0 = this->result;
1238
1239 cmp->operands[1]->accept(this);
1240 const src_reg cmp_src1 = this->result;
1241
1242 this->result = src_reg(this, ir->type);
1243
1244 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1245 brw_conditional_for_comparison(cmp->operation)));
1246
1247 /* If the comparison is false, this->result will just happen to be zero.
1248 */
1249 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1250 this->result, src_reg(1.0f));
1251 inst->predicate = BRW_PREDICATE_NORMAL;
1252 inst->predicate_inverse = true;
1253
1254 return true;
1255 }
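/* The emitted sequence for something like float(a < b) is therefore roughly
 *
 *    cmp.l.f0  result, a, b        (writes 0 / ~0 and sets the flag)
 *    (-f0) sel result, result, 1.0F
 *
 * Channels where the comparison failed keep the 0.0 the CMP wrote; channels
 * where it passed take the 1.0F source, so the boolean-to-float conversion
 * comes essentially for free.
 */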
1256
1257 void
1258 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1259 src_reg src0, src_reg src1)
1260 {
1261 vec4_instruction *inst;
1262
1263 if (devinfo->gen >= 6) {
1264 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1265 inst->conditional_mod = conditionalmod;
1266 } else {
1267 emit(CMP(dst, src0, src1, conditionalmod));
1268
1269 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1270 inst->predicate = BRW_PREDICATE_NORMAL;
1271 }
1272 }
1273
1274 void
1275 vec4_visitor::emit_lrp(const dst_reg &dst,
1276 const src_reg &x, const src_reg &y, const src_reg &a)
1277 {
1278 if (devinfo->gen >= 6) {
1279 /* Note that the instruction's argument order is reversed from GLSL
1280 * and the IR.
1281 */
1282 emit(LRP(dst,
1283 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1284 } else {
1285 /* Earlier generations don't support three source operations, so we
1286 * need to emit x*(1-a) + y*a.
1287 */
1288 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1289 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1290 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1291 y_times_a.writemask = dst.writemask;
1292 one_minus_a.writemask = dst.writemask;
1293 x_times_one_minus_a.writemask = dst.writemask;
1294
1295 emit(MUL(y_times_a, y, a));
1296 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1297 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1298 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1299 }
1300 }
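/* Numeric check of the pre-gen6 expansion: lrp(x = 2.0, y = 10.0, a = 0.25)
 * computes y*a = 2.5, 1-a = 0.75, x*(1-a) = 1.5, and the final ADD gives 4.0,
 * the same value the gen6+ LRP instruction (with its reversed argument order)
 * produces directly.
 */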
1301
1302 /**
1303 * Emits the instructions needed to perform a pull constant load. before_block
1304 * and before_inst can be NULL, in which case the instructions will be appended
1305 * to the end of the instruction list.
1306 */
1307 void
1308 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1309 src_reg surf_index,
1310 src_reg offset_reg,
1311 bblock_t *before_block,
1312 vec4_instruction *before_inst)
1313 {
1314 assert((before_inst == NULL && before_block == NULL) ||
1315 (before_inst && before_block));
1316
1317 vec4_instruction *pull;
1318
1319 if (devinfo->gen >= 9) {
1320 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1321 src_reg header(this, glsl_type::uvec4_type, 2);
1322
1323 pull = new(mem_ctx)
1324 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1325 dst_reg(header));
1326
1327 if (before_inst)
1328 emit_before(before_block, before_inst, pull);
1329 else
1330 emit(pull);
1331
1332 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1333 offset_reg.type);
1334 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1335
1336 if (before_inst)
1337 emit_before(before_block, before_inst, pull);
1338 else
1339 emit(pull);
1340
1341 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1342 dst,
1343 surf_index,
1344 header);
1345 pull->mlen = 2;
1346 pull->header_size = 1;
1347 } else if (devinfo->gen >= 7) {
1348 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1349
1350 grf_offset.type = offset_reg.type;
1351
1352 pull = MOV(grf_offset, offset_reg);
1353
1354 if (before_inst)
1355 emit_before(before_block, before_inst, pull);
1356 else
1357 emit(pull);
1358
1359 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1360 dst,
1361 surf_index,
1362 src_reg(grf_offset));
1363 pull->mlen = 1;
1364 } else {
1365 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1366 dst,
1367 surf_index,
1368 offset_reg);
1369 pull->base_mrf = 14;
1370 pull->mlen = 1;
1371 }
1372
1373 if (before_inst)
1374 emit_before(before_block, before_inst, pull);
1375 else
1376 emit(pull);
1377 }
1378
1379 void
1380 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1381 {
1382 const src_reg chan_index(this, glsl_type::uint_type);
1383
1384 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1385 ->force_writemask_all = true;
1386 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1387 ->force_writemask_all = true;
1388 }
1389
1390 void
1391 vec4_visitor::visit(ir_expression *ir)
1392 {
1393 unsigned int operand;
1394 src_reg op[ARRAY_SIZE(ir->operands)];
1395 vec4_instruction *inst;
1396
1397 if (ir->operation == ir_binop_add) {
1398 if (try_emit_mad(ir))
1399 return;
1400 }
1401
1402 if (ir->operation == ir_unop_b2f) {
1403 if (try_emit_b2f_of_compare(ir))
1404 return;
1405 }
1406
1407 /* Storage for our result. Ideally for an assignment we'd be using
1408 * the actual storage for the result here, instead.
1409 */
1410 dst_reg result_dst(this, ir->type);
1411 src_reg result_src(result_dst);
1412
1413 if (ir->operation == ir_triop_csel) {
1414 ir->operands[1]->accept(this);
1415 op[1] = this->result;
1416 ir->operands[2]->accept(this);
1417 op[2] = this->result;
1418
1419 enum brw_predicate predicate;
1420 emit_bool_to_cond_code(ir->operands[0], &predicate);
1421 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1422 inst->predicate = predicate;
1423 this->result = result_src;
1424 return;
1425 }
1426
1427 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1428 this->result.file = BAD_FILE;
1429 ir->operands[operand]->accept(this);
1430 if (this->result.file == BAD_FILE) {
1431 fprintf(stderr, "Failed to get tree for expression operand:\n");
1432 ir->operands[operand]->fprint(stderr);
1433 exit(1);
1434 }
1435 op[operand] = this->result;
1436
1437 /* Matrix expression operands should have been broken down to vector
1438 * operations already.
1439 */
1440 assert(!ir->operands[operand]->type->is_matrix());
1441 }
1442
1443 /* If nothing special happens, this is the result. */
1444 this->result = result_src;
1445
1446 switch (ir->operation) {
1447 case ir_unop_logic_not:
1448 emit(NOT(result_dst, op[0]));
1449 break;
1450 case ir_unop_neg:
1451 op[0].negate = !op[0].negate;
1452 emit(MOV(result_dst, op[0]));
1453 break;
1454 case ir_unop_abs:
1455 op[0].abs = true;
1456 op[0].negate = false;
1457 emit(MOV(result_dst, op[0]));
1458 break;
1459
1460 case ir_unop_sign:
1461 if (ir->type->is_float()) {
1462 /* AND(val, 0x80000000) gives the sign bit.
1463 *
1464 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1465 * zero.
1466 */
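         /* Walking one value through this: for op[0] = -3.5f the CMP sets the
          * flag (the value is nonzero), the AND leaves 0x80000000 in the
          * result, and the predicated OR with 0x3f800000 produces 0xbf800000,
          * i.e. -1.0f. For op[0] = 0.0f the flag stays clear and the result
          * remains 0.
          */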
1467 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1468
1469 op[0].type = BRW_REGISTER_TYPE_UD;
1470 result_dst.type = BRW_REGISTER_TYPE_UD;
1471 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1472
1473 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1474 inst->predicate = BRW_PREDICATE_NORMAL;
1475
1476 this->result.type = BRW_REGISTER_TYPE_F;
1477 } else {
1478 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1479 * -> non-negative val generates 0x00000000.
1480 * Predicated OR sets 1 if val is positive.
1481 */
1482 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1483
1484 emit(ASR(result_dst, op[0], src_reg(31)));
1485
1486 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1487 inst->predicate = BRW_PREDICATE_NORMAL;
1488 }
1489 break;
1490
1491 case ir_unop_rcp:
1492 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1493 break;
1494
1495 case ir_unop_exp2:
1496 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1497 break;
1498 case ir_unop_log2:
1499 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1500 break;
1501 case ir_unop_exp:
1502 case ir_unop_log:
1503 unreachable("not reached: should be handled by ir_explog_to_explog2");
1504 case ir_unop_sin:
1505 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1506 break;
1507 case ir_unop_cos:
1508 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1509 break;
1510
1511 case ir_unop_dFdx:
1512 case ir_unop_dFdx_coarse:
1513 case ir_unop_dFdx_fine:
1514 case ir_unop_dFdy:
1515 case ir_unop_dFdy_coarse:
1516 case ir_unop_dFdy_fine:
1517 unreachable("derivatives not valid in vertex shader");
1518
1519 case ir_unop_bitfield_reverse:
1520 emit(BFREV(result_dst, op[0]));
1521 break;
1522 case ir_unop_bit_count:
1523 emit(CBIT(result_dst, op[0]));
1524 break;
1525 case ir_unop_find_msb: {
1526 src_reg temp = src_reg(this, glsl_type::uint_type);
1527
1528 inst = emit(FBH(dst_reg(temp), op[0]));
1529 inst->dst.writemask = WRITEMASK_XYZW;
1530
1531 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1532 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1533 * subtract the result from 31 to convert the MSB count into an LSB count.
1534 */
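      /* E.g. findMSB(0x00000100): FBH returns 23 (counted down from bit 31),
       * the CMP below confirms that is not the 0xFFFFFFFF "no bit found"
       * value, and the predicated 31 - 23 gives the expected answer 8.
       */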
1535
1536 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1537 temp.swizzle = BRW_SWIZZLE_NOOP;
1538 emit(MOV(result_dst, temp));
1539
1540 src_reg src_tmp = src_reg(result_dst);
1541 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1542
1543 src_tmp.negate = true;
1544 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1545 inst->predicate = BRW_PREDICATE_NORMAL;
1546 break;
1547 }
1548 case ir_unop_find_lsb:
1549 emit(FBL(result_dst, op[0]));
1550 break;
1551 case ir_unop_saturate:
1552 inst = emit(MOV(result_dst, op[0]));
1553 inst->saturate = true;
1554 break;
1555
1556 case ir_unop_noise:
1557 unreachable("not reached: should be handled by lower_noise");
1558
1559 case ir_binop_add:
1560 emit(ADD(result_dst, op[0], op[1]));
1561 break;
1562 case ir_binop_sub:
1563 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1564
1565 case ir_binop_mul:
1566 if (devinfo->gen < 8 && ir->type->is_integer()) {
1567 /* For integer multiplication, the MUL uses the low 16 bits of one of
1568 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1569 * accumulates the contribution of the upper 16 bits of that
1570 * operand. If we can determine that one of the args is in the low
1571 * 16 bits, though, we can just emit a single MUL.
1572 */
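         /* For example, with "x * 42" the constant fits in 16 bits, so it is
          * placed in the partial-precision source slot (src0 before gen7,
          * src1 on gen7) and a single MUL produces the full 32-bit result;
          * the general case below needs the MUL/MACH/MOV trio through the
          * accumulator.
          */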
1573 if (ir->operands[0]->is_uint16_constant()) {
1574 if (devinfo->gen < 7)
1575 emit(MUL(result_dst, op[0], op[1]));
1576 else
1577 emit(MUL(result_dst, op[1], op[0]));
1578 } else if (ir->operands[1]->is_uint16_constant()) {
1579 if (devinfo->gen < 7)
1580 emit(MUL(result_dst, op[1], op[0]));
1581 else
1582 emit(MUL(result_dst, op[0], op[1]));
1583 } else {
1584 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1585
1586 emit(MUL(acc, op[0], op[1]));
1587 emit(MACH(dst_null_d(), op[0], op[1]));
1588 emit(MOV(result_dst, src_reg(acc)));
1589 }
1590 } else {
1591 emit(MUL(result_dst, op[0], op[1]));
1592 }
1593 break;
1594 case ir_binop_imul_high: {
1595 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1596
1597 emit(MUL(acc, op[0], op[1]));
1598 emit(MACH(result_dst, op[0], op[1]));
1599 break;
1600 }
1601 case ir_binop_div:
1602 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1603 assert(ir->type->is_integer());
1604 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1605 break;
1606 case ir_binop_carry: {
1607 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1608
1609 emit(ADDC(dst_null_ud(), op[0], op[1]));
1610 emit(MOV(result_dst, src_reg(acc)));
1611 break;
1612 }
1613 case ir_binop_borrow: {
1614 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1615
1616 emit(SUBB(dst_null_ud(), op[0], op[1]));
1617 emit(MOV(result_dst, src_reg(acc)));
1618 break;
1619 }
1620 case ir_binop_mod:
1621 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1622 assert(ir->type->is_integer());
1623 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1624 break;
1625
1626 case ir_binop_less:
1627 case ir_binop_greater:
1628 case ir_binop_lequal:
1629 case ir_binop_gequal:
1630 case ir_binop_equal:
1631 case ir_binop_nequal: {
1632 if (devinfo->gen <= 5) {
1633 resolve_bool_comparison(ir->operands[0], &op[0]);
1634 resolve_bool_comparison(ir->operands[1], &op[1]);
1635 }
1636 emit(CMP(result_dst, op[0], op[1],
1637 brw_conditional_for_comparison(ir->operation)));
1638 break;
1639 }
1640
1641 case ir_binop_all_equal:
1642 if (devinfo->gen <= 5) {
1643 resolve_bool_comparison(ir->operands[0], &op[0]);
1644 resolve_bool_comparison(ir->operands[1], &op[1]);
1645 }
1646
1647 /* "==" operator producing a scalar boolean. */
1648 if (ir->operands[0]->type->is_vector() ||
1649 ir->operands[1]->type->is_vector()) {
1650 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1651 emit(MOV(result_dst, src_reg(0)));
1652 inst = emit(MOV(result_dst, src_reg(~0)));
1653 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1654 } else {
1655 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1656 }
1657 break;
1658 case ir_binop_any_nequal:
1659 if (devinfo->gen <= 5) {
1660 resolve_bool_comparison(ir->operands[0], &op[0]);
1661 resolve_bool_comparison(ir->operands[1], &op[1]);
1662 }
1663
1664 /* "!=" operator producing a scalar boolean. */
1665 if (ir->operands[0]->type->is_vector() ||
1666 ir->operands[1]->type->is_vector()) {
1667 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1668
1669 emit(MOV(result_dst, src_reg(0)));
1670 inst = emit(MOV(result_dst, src_reg(~0)));
1671 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1672 } else {
1673 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1674 }
1675 break;
1676
1677 case ir_unop_any:
1678 if (devinfo->gen <= 5) {
1679 resolve_bool_comparison(ir->operands[0], &op[0]);
1680 }
1681 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1682 emit(MOV(result_dst, src_reg(0)));
1683
1684 inst = emit(MOV(result_dst, src_reg(~0)));
1685 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1686 break;
1687
1688 case ir_binop_logic_xor:
1689 emit(XOR(result_dst, op[0], op[1]));
1690 break;
1691
1692 case ir_binop_logic_or:
1693 emit(OR(result_dst, op[0], op[1]));
1694 break;
1695
1696 case ir_binop_logic_and:
1697 emit(AND(result_dst, op[0], op[1]));
1698 break;
1699
1700 case ir_binop_dot:
1701 assert(ir->operands[0]->type->is_vector());
1702 assert(ir->operands[0]->type == ir->operands[1]->type);
1703 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1704 break;
1705
1706 case ir_unop_sqrt:
1707 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1708 break;
1709 case ir_unop_rsq:
1710 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1711 break;
1712
1713 case ir_unop_bitcast_i2f:
1714 case ir_unop_bitcast_u2f:
1715 this->result = op[0];
1716 this->result.type = BRW_REGISTER_TYPE_F;
1717 break;
1718
1719 case ir_unop_bitcast_f2i:
1720 this->result = op[0];
1721 this->result.type = BRW_REGISTER_TYPE_D;
1722 break;
1723
1724 case ir_unop_bitcast_f2u:
1725 this->result = op[0];
1726 this->result.type = BRW_REGISTER_TYPE_UD;
1727 break;
1728
1729 case ir_unop_i2f:
1730 case ir_unop_i2u:
1731 case ir_unop_u2i:
1732 case ir_unop_u2f:
1733 case ir_unop_f2i:
1734 case ir_unop_f2u:
1735 emit(MOV(result_dst, op[0]));
1736 break;
1737 case ir_unop_b2i:
1738 emit(AND(result_dst, op[0], src_reg(1)));
1739 break;
1740 case ir_unop_b2f:
1741 if (devinfo->gen <= 5) {
1742 resolve_bool_comparison(ir->operands[0], &op[0]);
1743 }
1744 op[0].type = BRW_REGISTER_TYPE_D;
1745 result_dst.type = BRW_REGISTER_TYPE_D;
1746 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1747 result_dst.type = BRW_REGISTER_TYPE_F;
1748 break;
1749 case ir_unop_f2b:
1750 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1751 break;
1752 case ir_unop_i2b:
1753 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1754 break;
1755
1756 case ir_unop_trunc:
1757 emit(RNDZ(result_dst, op[0]));
1758 break;
1759 case ir_unop_ceil: {
1760 src_reg tmp = src_reg(this, ir->type);
1761 op[0].negate = !op[0].negate;
1762 emit(RNDD(dst_reg(tmp), op[0]));
1763 tmp.negate = true;
1764 emit(MOV(result_dst, tmp));
1765 }
1766 break;
1767 case ir_unop_floor:
1768 inst = emit(RNDD(result_dst, op[0]));
1769 break;
1770 case ir_unop_fract:
1771 inst = emit(FRC(result_dst, op[0]));
1772 break;
1773 case ir_unop_round_even:
1774 emit(RNDE(result_dst, op[0]));
1775 break;
1776
1777 case ir_binop_min:
1778 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1779 break;
1780 case ir_binop_max:
1781 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1782 break;
1783
1784 case ir_binop_pow:
1785 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1786 break;
1787
1788 case ir_unop_bit_not:
1789 inst = emit(NOT(result_dst, op[0]));
1790 break;
1791 case ir_binop_bit_and:
1792 inst = emit(AND(result_dst, op[0], op[1]));
1793 break;
1794 case ir_binop_bit_xor:
1795 inst = emit(XOR(result_dst, op[0], op[1]));
1796 break;
1797 case ir_binop_bit_or:
1798 inst = emit(OR(result_dst, op[0], op[1]));
1799 break;
1800
1801 case ir_binop_lshift:
1802 inst = emit(SHL(result_dst, op[0], op[1]));
1803 break;
1804
1805 case ir_binop_rshift:
1806 if (ir->type->base_type == GLSL_TYPE_INT)
1807 inst = emit(ASR(result_dst, op[0], op[1]));
1808 else
1809 inst = emit(SHR(result_dst, op[0], op[1]));
1810 break;
1811
1812 case ir_binop_bfm:
1813 emit(BFI1(result_dst, op[0], op[1]));
1814 break;
1815
1816 case ir_binop_ubo_load: {
1817 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1818 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1819 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1820 src_reg offset;
1821
1822 /* Now, load the vector from that offset. */
1823 assert(ir->type->is_vector() || ir->type->is_scalar());
1824
1825 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1826 packed_consts.type = result.type;
1827 src_reg surf_index;
1828
1829 if (const_uniform_block) {
1830 /* The block index is a constant, so just emit the binding table entry
1831 * as an immediate.
1832 */
1833 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1834 const_uniform_block->value.u[0]);
1835 } else {
1836 /* The block index is not a constant. Evaluate the index expression
1837 * per-channel and add the base UBO index; we have to select a value
1838 * from any live channel.
1839 */
1840 surf_index = src_reg(this, glsl_type::uint_type);
1841 emit(ADD(dst_reg(surf_index), op[0],
1842 src_reg(prog_data->base.binding_table.ubo_start)));
1843 emit_uniformize(dst_reg(surf_index), surf_index);
1844
1845 /* Assume this may touch any UBO. It would be nice to provide
1846 * a tighter bound, but the array information is already lowered away.
1847 */
1848 brw_mark_surface_used(&prog_data->base,
1849 prog_data->base.binding_table.ubo_start +
1850 shader_prog->NumUniformBlocks - 1);
1851 }
1852
1853 if (const_offset_ir) {
1854 if (devinfo->gen >= 8) {
1855 /* Store the offset in a GRF so we can send-from-GRF. */
1856 offset = src_reg(this, glsl_type::int_type);
1857 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1858 } else {
1859 /* Immediates are fine on older generations since they'll be moved
1860 * to a (potentially fake) MRF at the generator level.
1861 */
1862 offset = src_reg(const_offset / 16);
1863 }
1864 } else {
1865 offset = src_reg(this, glsl_type::uint_type);
1866 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1867 }
1868
1869 emit_pull_constant_load_reg(dst_reg(packed_consts),
1870 surf_index,
1871 offset,
1872 NULL, NULL /* before_block/inst */);
1873
1874 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1875 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1876 const_offset % 16 / 4,
1877 const_offset % 16 / 4,
1878 const_offset % 16 / 4);
1879
1880 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1881 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1882 emit(CMP(result_dst, packed_consts, src_reg(0u),
1883 BRW_CONDITIONAL_NZ));
1884 } else {
1885 emit(MOV(result_dst, packed_consts));
1886 }
1887 break;
1888 }
1889
1890 case ir_binop_vector_extract:
1891 unreachable("should have been lowered by vec_index_to_cond_assign");
1892
1893 case ir_triop_fma:
1894 op[0] = fix_3src_operand(op[0]);
1895 op[1] = fix_3src_operand(op[1]);
1896 op[2] = fix_3src_operand(op[2]);
1897 /* Note that the instruction's argument order is reversed from GLSL
1898 * and the IR.
1899 */
1900 emit(MAD(result_dst, op[2], op[1], op[0]));
1901 break;
1902
1903 case ir_triop_lrp:
1904 emit_lrp(result_dst, op[0], op[1], op[2]);
1905 break;
1906
1907 case ir_triop_csel:
1908 unreachable("already handled above");
1909 break;
1910
1911 case ir_triop_bfi:
1912 op[0] = fix_3src_operand(op[0]);
1913 op[1] = fix_3src_operand(op[1]);
1914 op[2] = fix_3src_operand(op[2]);
1915 emit(BFI2(result_dst, op[0], op[1], op[2]));
1916 break;
1917
1918 case ir_triop_bitfield_extract:
1919 op[0] = fix_3src_operand(op[0]);
1920 op[1] = fix_3src_operand(op[1]);
1921 op[2] = fix_3src_operand(op[2]);
1922 /* Note that the instruction's argument order is reversed from GLSL
1923 * and the IR.
1924 */
1925 emit(BFE(result_dst, op[2], op[1], op[0]));
1926 break;
1927
1928 case ir_triop_vector_insert:
1929 unreachable("should have been lowered by lower_vector_insert");
1930
1931 case ir_quadop_bitfield_insert:
1932 unreachable("not reached: should be handled by "
1933 "bitfield_insert_to_bfm_bfi\n");
1934
1935 case ir_quadop_vector:
1936 unreachable("not reached: should be handled by lower_quadop_vector");
1937
1938 case ir_unop_pack_half_2x16:
1939 emit_pack_half_2x16(result_dst, op[0]);
1940 break;
1941 case ir_unop_unpack_half_2x16:
1942 emit_unpack_half_2x16(result_dst, op[0]);
1943 break;
1944 case ir_unop_unpack_unorm_4x8:
1945 emit_unpack_unorm_4x8(result_dst, op[0]);
1946 break;
1947 case ir_unop_unpack_snorm_4x8:
1948 emit_unpack_snorm_4x8(result_dst, op[0]);
1949 break;
1950 case ir_unop_pack_unorm_4x8:
1951 emit_pack_unorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_pack_snorm_4x8:
1954 emit_pack_snorm_4x8(result_dst, op[0]);
1955 break;
1956 case ir_unop_pack_snorm_2x16:
1957 case ir_unop_pack_unorm_2x16:
1958 case ir_unop_unpack_snorm_2x16:
1959 case ir_unop_unpack_unorm_2x16:
1960 unreachable("not reached: should be handled by lower_packing_builtins");
1961 case ir_unop_unpack_half_2x16_split_x:
1962 case ir_unop_unpack_half_2x16_split_y:
1963 case ir_binop_pack_half_2x16_split:
1964 case ir_unop_interpolate_at_centroid:
1965 case ir_binop_interpolate_at_sample:
1966 case ir_binop_interpolate_at_offset:
1967 unreachable("not reached: should not occur in vertex shader");
1968 case ir_binop_ldexp:
1969 unreachable("not reached: should be handled by ldexp_to_arith()");
1970 case ir_unop_d2f:
1971 case ir_unop_f2d:
1972 case ir_unop_d2i:
1973 case ir_unop_i2d:
1974 case ir_unop_d2u:
1975 case ir_unop_u2d:
1976 case ir_unop_d2b:
1977 case ir_unop_pack_double_2x32:
1978 case ir_unop_unpack_double_2x32:
1979 case ir_unop_frexp_sig:
1980 case ir_unop_frexp_exp:
1981 unreachable("fp64 todo");
1982 }
1983 }
1984
1985
1986 void
1987 vec4_visitor::visit(ir_swizzle *ir)
1988 {
1989 /* Note that this is only swizzles in expressions, not those on the left
1990 * hand side of an assignment, which do write masking. See ir_assignment
1991 * for that.
1992 */
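/* Compose the IR's swizzle with the size-based swizzle so that channels
 * beyond the value's width replicate the last valid component, which is
 * the layout the rest of the backend expects for sub-vec4 values.
 */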
1993 const unsigned swz = brw_compose_swizzle(
1994 brw_swizzle_for_size(ir->type->vector_elements),
1995 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1996
1997 ir->val->accept(this);
1998 this->result = swizzle(this->result, swz);
1999 }
2000
2001 void
2002 vec4_visitor::visit(ir_dereference_variable *ir)
2003 {
2004 const struct glsl_type *type = ir->type;
2005 dst_reg *reg = variable_storage(ir->var);
2006
2007 if (!reg) {
2008 fail("Failed to find variable storage for %s\n", ir->var->name);
2009 this->result = src_reg(brw_null_reg());
2010 return;
2011 }
2012
2013 this->result = src_reg(*reg);
2014
2015 /* System values get their swizzle from the dst_reg writemask */
2016 if (ir->var->data.mode == ir_var_system_value)
2017 return;
2018
2019 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2020 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2021 }
2022
2023
2024 int
2025 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2026 {
2027 /* Under normal circumstances array elements are stored consecutively, so
2028 * the stride is equal to the size of the array element.
2029 */
2030 return type_size(ir->type);
2031 }
2032
2033
2034 void
2035 vec4_visitor::visit(ir_dereference_array *ir)
2036 {
2037 ir_constant *constant_index;
2038 src_reg src;
2039 int array_stride = compute_array_stride(ir);
2040
2041 constant_index = ir->array_index->constant_expression_value();
2042
2043 ir->array->accept(this);
2044 src = this->result;
2045
2046 if (constant_index) {
2047 src.reg_offset += constant_index->value.i[0] * array_stride;
2048 } else {
2049 /* Variable index array dereference. The access takes the vec4 at
2050 * the base of the array plus a computed index that offsets the
2051 * register number.
2052 */
2053 ir->array_index->accept(this);
2054
2055 src_reg index_reg;
2056
2057 if (array_stride == 1) {
2058 index_reg = this->result;
2059 } else {
2060 index_reg = src_reg(this, glsl_type::int_type);
2061
2062 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2063 }
2064
2065 if (src.reladdr) {
2066 src_reg temp = src_reg(this, glsl_type::int_type);
2067
2068 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2069
2070 index_reg = temp;
2071 }
2072
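/* Record the computed index as the register's reladdr so later passes
 * know this access uses relative addressing (and will move the array to
 * scratch or a pull buffer as needed).
 */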
2073 src.reladdr = ralloc(mem_ctx, src_reg);
2074 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2075 }
2076
2077 /* If the type is smaller than a vec4, replicate the last channel out. */
2078 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2079 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2080 else
2081 src.swizzle = BRW_SWIZZLE_NOOP;
2082 src.type = brw_type_for_base_type(ir->type);
2083
2084 this->result = src;
2085 }
2086
2087 void
2088 vec4_visitor::visit(ir_dereference_record *ir)
2089 {
2090 unsigned int i;
2091 const glsl_type *struct_type = ir->record->type;
2092 int offset = 0;
2093
2094 ir->record->accept(this);
2095
2096 for (i = 0; i < struct_type->length; i++) {
2097 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2098 break;
2099 offset += type_size(struct_type->fields.structure[i].type);
2100 }
2101
2102 /* If the type is smaller than a vec4, replicate the last channel out. */
2103 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2104 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2105 else
2106 this->result.swizzle = BRW_SWIZZLE_NOOP;
2107 this->result.type = brw_type_for_base_type(ir->type);
2108
2109 this->result.reg_offset += offset;
2110 }
2111
2112 /**
2113 * We want to be careful in assignment setup to hit the actual storage
2114 * instead of potentially using a temporary like we might with the
2115 * ir_dereference handler.
2116 */
2117 static dst_reg
2118 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2119 {
2120 /* The LHS must be a dereference. If the LHS is a variable indexed array
2121 * access of a vector, it must be separated into a series of conditional moves
2122 * before reaching this point (see ir_vec_index_to_cond_assign).
2123 */
2124 assert(ir->as_dereference());
2125 ir_dereference_array *deref_array = ir->as_dereference_array();
2126 if (deref_array) {
2127 assert(!deref_array->array->type->is_vector());
2128 }
2129
2130 /* Use the rvalue deref handler for the most part. We'll ignore
2131 * swizzles in it and write swizzles using writemask, though.
2132 */
2133 ir->accept(v);
2134 return dst_reg(v->result);
2135 }
2136
2137 void
2138 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2139 const struct glsl_type *type,
2140 enum brw_predicate predicate)
2141 {
2142 if (type->base_type == GLSL_TYPE_STRUCT) {
2143 for (unsigned int i = 0; i < type->length; i++) {
2144 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2145 }
2146 return;
2147 }
2148
2149 if (type->is_array()) {
2150 for (unsigned int i = 0; i < type->length; i++) {
2151 emit_block_move(dst, src, type->fields.array, predicate);
2152 }
2153 return;
2154 }
2155
2156 if (type->is_matrix()) {
2157 const struct glsl_type *vec_type;
2158
2159 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2160 type->vector_elements, 1);
2161
2162 for (int i = 0; i < type->matrix_columns; i++) {
2163 emit_block_move(dst, src, vec_type, predicate);
2164 }
2165 return;
2166 }
2167
2168 assert(type->is_scalar() || type->is_vector());
2169
2170 dst->type = brw_type_for_base_type(type);
2171 src->type = dst->type;
2172
2173 dst->writemask = (1 << type->vector_elements) - 1;
2174
2175 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2176
2177 vec4_instruction *inst = emit(MOV(*dst, *src));
2178 inst->predicate = predicate;
2179
2180 dst->reg_offset++;
2181 src->reg_offset++;
2182 }
2183
2184
2185 /* If the RHS processing resulted in an instruction generating a
2186 * temporary value, and it would be easy to rewrite the instruction to
2187 * generate its result right into the LHS instead, do so. This ends
2188 * up reliably removing instructions where it can be tricky to do so
2189 * later without real UD chain information.
2190 */
2191 bool
2192 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2193 dst_reg dst,
2194 src_reg src,
2195 vec4_instruction *pre_rhs_inst,
2196 vec4_instruction *last_rhs_inst)
2197 {
2198 /* This could be supported, but it would take more smarts. */
2199 if (ir->condition)
2200 return false;
2201
2202 if (pre_rhs_inst == last_rhs_inst)
2203 return false; /* No instructions generated to work with. */
2204
2205 /* Make sure the last instruction generated our source reg. */
2206 if (src.file != GRF ||
2207 src.file != last_rhs_inst->dst.file ||
2208 src.reg != last_rhs_inst->dst.reg ||
2209 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2210 src.reladdr ||
2211 src.abs ||
2212 src.negate ||
2213 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2214 return false;
2215
2216 /* Check that the last instruction fully initialized the channels
2217 * we want to use, in the order we want to use them. We could
2218 * potentially reswizzle the operands of many instructions so that
2219 * we could handle out of order channels, but don't yet.
2220 */
2221
2222 for (unsigned i = 0; i < 4; i++) {
2223 if (dst.writemask & (1 << i)) {
2224 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2225 return false;
2226
2227 if (BRW_GET_SWZ(src.swizzle, i) != i)
2228 return false;
2229 }
2230 }
2231
2232 /* Success! Rewrite the instruction. */
2233 last_rhs_inst->dst.file = dst.file;
2234 last_rhs_inst->dst.reg = dst.reg;
2235 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2236 last_rhs_inst->dst.reladdr = dst.reladdr;
2237 last_rhs_inst->dst.writemask &= dst.writemask;
2238
2239 return true;
2240 }
2241
2242 void
2243 vec4_visitor::visit(ir_assignment *ir)
2244 {
2245 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2246 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2247
2248 if (!ir->lhs->type->is_scalar() &&
2249 !ir->lhs->type->is_vector()) {
2250 ir->rhs->accept(this);
2251 src_reg src = this->result;
2252
2253 if (ir->condition) {
2254 emit_bool_to_cond_code(ir->condition, &predicate);
2255 }
2256
2257 /* emit_block_move doesn't account for swizzles in the source register.
2258 * This should be ok, since the source register is a structure or an
2259 * array, and those can't be swizzled. But double-check to be sure.
2260 */
2261 assert(src.swizzle ==
2262 (ir->rhs->type->is_matrix()
2263 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2264 : BRW_SWIZZLE_NOOP));
2265
2266 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2267 return;
2268 }
2269
2270 /* Now we're down to just a scalar/vector with writemasks. */
2271 int i;
2272
2273 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2274 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2275
2276 ir->rhs->accept(this);
2277
2278 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2279
2280 int swizzles[4];
2281 int src_chan = 0;
2282
2283 assert(ir->lhs->type->is_vector() ||
2284 ir->lhs->type->is_scalar());
2285 dst.writemask = ir->write_mask;
2286
2287 /* Swizzle a small RHS vector into the channels being written.
2288 *
2289 * GLSL IR treats write_mask as dictating how many channels are
2290 * present on the RHS, while in our instructions we need to make
2291 * those channels appear in the slots of the vec4 they're written to.
2292 */
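/* For example, assigning a vec2 RHS to the .yz channels of the LHS gives
 * write_mask = YZ, so swizzles[] becomes {0, 0, 1, 0} and the source is
 * read as .xxyx: RHS.x lands in dst.y and RHS.y lands in dst.z.
 */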
2293 for (int i = 0; i < 4; i++)
2294 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2295
2296 src_reg src = swizzle(this->result,
2297 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2298 swizzles[2], swizzles[3]));
2299
2300 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2301 return;
2302 }
2303
2304 if (ir->condition) {
2305 emit_bool_to_cond_code(ir->condition, &predicate);
2306 }
2307
2308 for (i = 0; i < type_size(ir->lhs->type); i++) {
2309 vec4_instruction *inst = emit(MOV(dst, src));
2310 inst->predicate = predicate;
2311
2312 dst.reg_offset++;
2313 src.reg_offset++;
2314 }
2315 }
2316
2317 void
2318 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2319 {
2320 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2321 foreach_in_list(ir_constant, field_value, &ir->components) {
2322 emit_constant_values(dst, field_value);
2323 }
2324 return;
2325 }
2326
2327 if (ir->type->is_array()) {
2328 for (unsigned int i = 0; i < ir->type->length; i++) {
2329 emit_constant_values(dst, ir->array_elements[i]);
2330 }
2331 return;
2332 }
2333
2334 if (ir->type->is_matrix()) {
2335 for (int i = 0; i < ir->type->matrix_columns; i++) {
2336 float *vec = &ir->value.f[i * ir->type->vector_elements];
2337
2338 for (int j = 0; j < ir->type->vector_elements; j++) {
2339 dst->writemask = 1 << j;
2340 dst->type = BRW_REGISTER_TYPE_F;
2341
2342 emit(MOV(*dst, src_reg(vec[j])));
2343 }
2344 dst->reg_offset++;
2345 }
2346 return;
2347 }
2348
2349 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2350
2351 for (int i = 0; i < ir->type->vector_elements; i++) {
2352 if (!(remaining_writemask & (1 << i)))
2353 continue;
2354
2355 dst->writemask = 1 << i;
2356 dst->type = brw_type_for_base_type(ir->type);
2357
2358 /* Find other components that match the one we're about to
2359 * write. Emits fewer instructions for things like vec4(0.5,
2360 * 1.5, 1.5, 1.5).
2361 */
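/* For that example the first pass emits MOV dst.x, 0.5f and the second
 * pass emits MOV dst.yzw, 1.5f, i.e. two MOVs instead of four.
 */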
2362 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2363 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2364 if (ir->value.b[i] == ir->value.b[j])
2365 dst->writemask |= (1 << j);
2366 } else {
2367 /* u, i, and f storage all line up, so no need for a
2368 * switch case for comparing each type.
2369 */
2370 if (ir->value.u[i] == ir->value.u[j])
2371 dst->writemask |= (1 << j);
2372 }
2373 }
2374
2375 switch (ir->type->base_type) {
2376 case GLSL_TYPE_FLOAT:
2377 emit(MOV(*dst, src_reg(ir->value.f[i])));
2378 break;
2379 case GLSL_TYPE_INT:
2380 emit(MOV(*dst, src_reg(ir->value.i[i])));
2381 break;
2382 case GLSL_TYPE_UINT:
2383 emit(MOV(*dst, src_reg(ir->value.u[i])));
2384 break;
2385 case GLSL_TYPE_BOOL:
2386 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2387 break;
2388 default:
2389 unreachable("Non-float/uint/int/bool constant");
2390 }
2391
2392 remaining_writemask &= ~dst->writemask;
2393 }
2394 dst->reg_offset++;
2395 }
2396
2397 void
2398 vec4_visitor::visit(ir_constant *ir)
2399 {
2400 dst_reg dst = dst_reg(this, ir->type);
2401 this->result = src_reg(dst);
2402
2403 emit_constant_values(&dst, ir);
2404 }
2405
2406 void
2407 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2408 {
2409 ir_dereference *deref = static_cast<ir_dereference *>(
2410 ir->actual_parameters.get_head());
2411 ir_variable *location = deref->variable_referenced();
2412 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2413 location->data.binding);
2414
2415 /* Calculate the surface offset */
2416 src_reg offset(this, glsl_type::uint_type);
2417 ir_dereference_array *deref_array = deref->as_dereference_array();
2418 if (deref_array) {
2419 deref_array->array_index->accept(this);
2420
2421 src_reg tmp(this, glsl_type::uint_type);
2422 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2423 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2424 } else {
2425 offset = location->data.atomic.offset;
2426 }
2427
2428 /* Emit the appropriate machine instruction */
2429 const char *callee = ir->callee->function_name();
2430 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2431
2432 if (!strcmp("__intrinsic_atomic_read", callee)) {
2433 emit_untyped_surface_read(surf_index, dst, offset);
2434
2435 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2436 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2437 src_reg(), src_reg());
2438
2439 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2440 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2441 src_reg(), src_reg());
2442 }
2443 }
2444
2445 void
2446 vec4_visitor::visit(ir_call *ir)
2447 {
2448 const char *callee = ir->callee->function_name();
2449
2450 if (!strcmp("__intrinsic_atomic_read", callee) ||
2451 !strcmp("__intrinsic_atomic_increment", callee) ||
2452 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2453 visit_atomic_counter_intrinsic(ir);
2454 } else {
2455 unreachable("Unsupported intrinsic.");
2456 }
2457 }
2458
2459 src_reg
2460 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2461 {
2462 vec4_instruction *inst =
2463 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2464 dst_reg(this, glsl_type::uvec4_type));
2465 inst->base_mrf = 2;
2466 inst->src[1] = sampler;
2467
2468 int param_base;
2469
2470 if (devinfo->gen >= 9) {
2471 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2472 vec4_instruction *header_inst = new(mem_ctx)
2473 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2474 dst_reg(MRF, inst->base_mrf));
2475
2476 emit(header_inst);
2477
2478 inst->mlen = 2;
2479 inst->header_size = 1;
2480 param_base = inst->base_mrf + 1;
2481 } else {
2482 inst->mlen = 1;
2483 param_base = inst->base_mrf;
2484 }
2485
2486 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2487 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2488 int zero_mask = 0xf & ~coord_mask;
2489
2490 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2491 coordinate));
2492
2493 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2494 src_reg(0)));
2495
2496 emit(inst);
2497 return src_reg(inst->dst);
2498 }
2499
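/* A "high" sampler is one whose index might not fit in the 4-bit sampler
 * field of the message descriptor, either because it is >= 16 or because
 * it is computed at run time. Such samplers need the message header so
 * the sampler state pointer can be offset, and that path only exists on
 * Haswell and later.
 */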
2500 static bool
2501 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2502 {
2503 if (devinfo->gen < 8 && !devinfo->is_haswell)
2504 return false;
2505
2506 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2507 }
2508
2509 void
2510 vec4_visitor::visit(ir_texture *ir)
2511 {
2512 uint32_t sampler =
2513 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2514
2515 ir_rvalue *nonconst_sampler_index =
2516 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2517
2518 /* Handle non-constant sampler array indexing */
2519 src_reg sampler_reg;
2520 if (nonconst_sampler_index) {
2521 /* The highest sampler which may be used by this operation is
2522 * the last element of the array. Mark it here, because the generator
2523 * doesn't have enough information to determine the bound.
2524 */
2525 uint32_t array_size = ir->sampler->as_dereference_array()
2526 ->array->type->array_size();
2527
2528 uint32_t max_used = sampler + array_size - 1;
2529 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2530 max_used += prog_data->base.binding_table.gather_texture_start;
2531 } else {
2532 max_used += prog_data->base.binding_table.texture_start;
2533 }
2534
2535 brw_mark_surface_used(&prog_data->base, max_used);
2536
2537 /* Emit code to evaluate the actual indexing expression */
2538 nonconst_sampler_index->accept(this);
2539 dst_reg temp(this, glsl_type::uint_type);
2540 emit(ADD(temp, this->result, src_reg(sampler)));
2541 emit_uniformize(temp, src_reg(temp));
2542
2543 sampler_reg = src_reg(temp);
2544 } else {
2545 /* Single sampler, or constant array index; the indexing expression
2546 * is just an immediate.
2547 */
2548 sampler_reg = src_reg(sampler);
2549 }
2550
2551 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2552 * emitting anything other than setting up the constant result.
2553 */
2554 if (ir->op == ir_tg4) {
2555 ir_constant *chan = ir->lod_info.component->as_constant();
2556 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2557 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2558 dst_reg result(this, ir->type);
2559 this->result = src_reg(result);
2560 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2561 return;
2562 }
2563 }
2564
2565 /* Should be lowered by do_lower_texture_projection */
2566 assert(!ir->projector);
2567
2568 /* Should be lowered */
2569 assert(!ir->offset || !ir->offset->type->is_array());
2570
2571 /* Generate code to compute all the subexpression trees. This has to be
2572 * done before loading any values into MRFs for the sampler message since
2573 * generating these values may involve SEND messages that need the MRFs.
2574 */
2575 src_reg coordinate;
2576 if (ir->coordinate) {
2577 ir->coordinate->accept(this);
2578 coordinate = this->result;
2579 }
2580
2581 src_reg shadow_comparitor;
2582 if (ir->shadow_comparitor) {
2583 ir->shadow_comparitor->accept(this);
2584 shadow_comparitor = this->result;
2585 }
2586
2587 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2588 src_reg offset_value;
2589 if (has_nonconstant_offset) {
2590 ir->offset->accept(this);
2591 offset_value = src_reg(this->result);
2592 }
2593
2594 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2595 src_reg lod, dPdx, dPdy, sample_index, mcs;
2596 switch (ir->op) {
2597 case ir_tex:
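/* Vertex shaders have no implicit derivatives, so a plain texture()
 * sample is implemented as an explicit-LOD sample (TXL) with lod = 0.
 */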
2598 lod = src_reg(0.0f);
2599 lod_type = glsl_type::float_type;
2600 break;
2601 case ir_txf:
2602 case ir_txl:
2603 case ir_txs:
2604 ir->lod_info.lod->accept(this);
2605 lod = this->result;
2606 lod_type = ir->lod_info.lod->type;
2607 break;
2608 case ir_query_levels:
2609 lod = src_reg(0);
2610 lod_type = glsl_type::int_type;
2611 break;
2612 case ir_txf_ms:
2613 ir->lod_info.sample_index->accept(this);
2614 sample_index = this->result;
2615 sample_index_type = ir->lod_info.sample_index->type;
2616
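/* For compressed multisample surfaces the sampler needs the MCS
 * (multisample control surface) value for the texel, so fetch it first
 * and pass it along with the texel fetch below.
 */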
2617 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2618 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2619 else
2620 mcs = src_reg(0u);
2621 break;
2622 case ir_txd:
2623 ir->lod_info.grad.dPdx->accept(this);
2624 dPdx = this->result;
2625
2626 ir->lod_info.grad.dPdy->accept(this);
2627 dPdy = this->result;
2628
2629 lod_type = ir->lod_info.grad.dPdx->type;
2630 break;
2631 case ir_txb:
2632 case ir_lod:
2633 case ir_tg4:
2634 break;
2635 }
2636
2637 enum opcode opcode;
2638 switch (ir->op) {
2639 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2640 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2641 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2642 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2643 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2644 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2645 case ir_tg4: opcode = has_nonconstant_offset
2646 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2647 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2648 case ir_txb:
2649 unreachable("TXB is not valid for vertex shaders.");
2650 case ir_lod:
2651 unreachable("LOD is not valid for vertex shaders.");
2652 default:
2653 unreachable("Unrecognized tex op");
2654 }
2655
2656 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2657 opcode, dst_reg(this, ir->type));
2658
2659 if (ir->offset != NULL && !has_nonconstant_offset) {
2660 inst->offset =
2661 brw_texture_offset(ir->offset->as_constant()->value.i,
2662 ir->offset->type->vector_elements);
2663 }
2664
2665 /* Stuff the channel select bits in the top of the texture offset */
2666 if (ir->op == ir_tg4)
2667 inst->offset |= gather_channel(ir, sampler) << 16;
2668
2669 /* The message header is necessary for:
2670 * - Gen4 (always)
2671 * - Gen9+ for selecting SIMD4x2
2672 * - Texel offsets
2673 * - Gather channel selection
2674 * - Sampler indices too large to fit in a 4-bit value.
2675 */
2676 inst->header_size =
2677 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2678 inst->offset != 0 || ir->op == ir_tg4 ||
2679 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2680 inst->base_mrf = 2;
2681 inst->mlen = inst->header_size + 1; /* always at least one */
2682 inst->dst.writemask = WRITEMASK_XYZW;
2683 inst->shadow_compare = ir->shadow_comparitor != NULL;
2684
2685 inst->src[1] = sampler_reg;
2686
2687 /* MRF for the first parameter */
2688 int param_base = inst->base_mrf + inst->header_size;
2689
2690 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2691 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2692 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2693 } else {
2694 /* Load the coordinate */
2695 /* FINISHME: gl_clamp_mask and saturate */
2696 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2697 int zero_mask = 0xf & ~coord_mask;
2698
2699 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2700 coordinate));
2701
2702 if (zero_mask != 0) {
2703 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2704 src_reg(0)));
2705 }
2706 /* Load the shadow comparitor */
2707 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2708 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2709 WRITEMASK_X),
2710 shadow_comparitor));
2711 inst->mlen++;
2712 }
2713
2714 /* Load the LOD info */
2715 if (ir->op == ir_tex || ir->op == ir_txl) {
2716 int mrf, writemask;
2717 if (devinfo->gen >= 5) {
2718 mrf = param_base + 1;
2719 if (ir->shadow_comparitor) {
2720 writemask = WRITEMASK_Y;
2721 /* mlen already incremented */
2722 } else {
2723 writemask = WRITEMASK_X;
2724 inst->mlen++;
2725 }
2726 } else /* devinfo->gen == 4 */ {
2727 mrf = param_base;
2728 writemask = WRITEMASK_W;
2729 }
2730 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2731 } else if (ir->op == ir_txf) {
2732 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2733 } else if (ir->op == ir_txf_ms) {
2734 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2735 sample_index));
2736 if (devinfo->gen >= 7) {
2737 /* MCS data is in the first channel of `mcs`, but we need to get it into
2738 * the .y channel of the second vec4 of params, so replicate .x across
2739 * the whole vec4 and then mask off everything except .y
2740 */
2741 mcs.swizzle = BRW_SWIZZLE_XXXX;
2742 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2743 mcs));
2744 }
2745 inst->mlen++;
2746 } else if (ir->op == ir_txd) {
2747 const glsl_type *type = lod_type;
2748
2749 if (devinfo->gen >= 5) {
2750 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2751 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2752 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2753 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2754 inst->mlen++;
2755
2756 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2757 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2758 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2759 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2760 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2761 inst->mlen++;
2762
2763 if (ir->shadow_comparitor) {
2764 emit(MOV(dst_reg(MRF, param_base + 2,
2765 ir->shadow_comparitor->type, WRITEMASK_Z),
2766 shadow_comparitor));
2767 }
2768 }
2769 } else /* devinfo->gen == 4 */ {
2770 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2771 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2772 inst->mlen += 2;
2773 }
2774 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2775 if (ir->shadow_comparitor) {
2776 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2777 shadow_comparitor));
2778 }
2779
2780 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2781 offset_value));
2782 inst->mlen++;
2783 }
2784 }
2785
2786 emit(inst);
2787
2788 /* Fix up the number of layers (.z) for cube map arrays: the hardware
2789 * returns faces * layers, but the spec requires just layers.
2790 */
2791 if (ir->op == ir_txs) {
2792 glsl_type const *type = ir->sampler->type;
2793 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2794 type->sampler_array) {
2795 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2796 writemask(inst->dst, WRITEMASK_Z),
2797 src_reg(inst->dst), src_reg(6));
2798 }
2799 }
2800
2801 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2802 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2803 }
2804
2805 swizzle_result(ir, src_reg(inst->dst), sampler);
2806 }
2807
2808 /**
2809 * Apply workarounds for Gen6 gather with UINT/SINT
2810 */
2811 void
2812 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2813 {
2814 if (!wa)
2815 return;
2816
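/* Gen6 gather4 returns UNORM-encoded data for the affected 8- and 16-bit
 * integer formats, so rescale by (2^width - 1) to recover the integer
 * value and, for signed formats, shift left then arithmetic-shift right
 * to sign-extend it.
 */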
2817 int width = (wa & WA_8BIT) ? 8 : 16;
2818 dst_reg dst_f = dst;
2819 dst_f.type = BRW_REGISTER_TYPE_F;
2820
2821 /* Convert from UNORM to UINT */
2822 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2823 emit(MOV(dst, src_reg(dst_f)));
2824
2825 if (wa & WA_SIGN) {
2826 /* Reinterpret the UINT value as a signed INT value by
2827 * shifting the sign bit into place, then shifting back
2828 * preserving sign.
2829 */
2830 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2831 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2832 }
2833 }
2834
2835 /**
2836 * Set up the gather channel based on the swizzle, for gather4.
2837 */
2838 uint32_t
2839 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2840 {
2841 ir_constant *chan = ir->lod_info.component->as_constant();
2842 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2843 switch (swiz) {
2844 case SWIZZLE_X: return 0;
2845 case SWIZZLE_Y:
2846 /* gather4 sampler is broken for green channel on RG32F --
2847 * we must ask for blue instead.
2848 */
2849 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2850 return 2;
2851 return 1;
2852 case SWIZZLE_Z: return 2;
2853 case SWIZZLE_W: return 3;
2854 default:
2855 unreachable("Not reached"); /* zero, one swizzles handled already */
2856 }
2857 }
2858
2859 void
2860 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2861 {
2862 int s = key->tex.swizzles[sampler];
2863
2864 this->result = src_reg(this, ir->type);
2865 dst_reg swizzled_result(this->result);
2866
2867 if (ir->op == ir_query_levels) {
2868 /* # levels is in .w */
2869 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2870 emit(MOV(swizzled_result, orig_val));
2871 return;
2872 }
2873
2874 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2875 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2876 emit(MOV(swizzled_result, orig_val));
2877 return;
2878 }
2879
2880
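/* Split the application-specified texture swizzle into channels copied
 * from the sampler result, channels forced to zero and channels forced
 * to one, then emit at most one MOV for each group.
 */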
2881 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2882 int swizzle[4] = {0};
2883
2884 for (int i = 0; i < 4; i++) {
2885 switch (GET_SWZ(s, i)) {
2886 case SWIZZLE_ZERO:
2887 zero_mask |= (1 << i);
2888 break;
2889 case SWIZZLE_ONE:
2890 one_mask |= (1 << i);
2891 break;
2892 default:
2893 copy_mask |= (1 << i);
2894 swizzle[i] = GET_SWZ(s, i);
2895 break;
2896 }
2897 }
2898
2899 if (copy_mask) {
2900 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2901 swizzled_result.writemask = copy_mask;
2902 emit(MOV(swizzled_result, orig_val));
2903 }
2904
2905 if (zero_mask) {
2906 swizzled_result.writemask = zero_mask;
2907 emit(MOV(swizzled_result, src_reg(0.0f)));
2908 }
2909
2910 if (one_mask) {
2911 swizzled_result.writemask = one_mask;
2912 emit(MOV(swizzled_result, src_reg(1.0f)));
2913 }
2914 }
2915
2916 void
2917 vec4_visitor::visit(ir_return *)
2918 {
2919 unreachable("not reached");
2920 }
2921
2922 void
2923 vec4_visitor::visit(ir_discard *)
2924 {
2925 unreachable("not reached");
2926 }
2927
2928 void
2929 vec4_visitor::visit(ir_if *ir)
2930 {
2931 /* Don't point the annotation at the if statement; otherwise it would
2932 * cover the if plus the then and else blocks when printed.
2933 */
2934 this->base_ir = ir->condition;
2935
2936 if (devinfo->gen == 6) {
2937 emit_if_gen6(ir);
2938 } else {
2939 enum brw_predicate predicate;
2940 emit_bool_to_cond_code(ir->condition, &predicate);
2941 emit(IF(predicate));
2942 }
2943
2944 visit_instructions(&ir->then_instructions);
2945
2946 if (!ir->else_instructions.is_empty()) {
2947 this->base_ir = ir->condition;
2948 emit(BRW_OPCODE_ELSE);
2949
2950 visit_instructions(&ir->else_instructions);
2951 }
2952
2953 this->base_ir = ir->condition;
2954 emit(BRW_OPCODE_ENDIF);
2955 }
2956
2957 void
2958 vec4_visitor::visit(ir_emit_vertex *)
2959 {
2960 unreachable("not reached");
2961 }
2962
2963 void
2964 vec4_visitor::visit(ir_end_primitive *)
2965 {
2966 unreachable("not reached");
2967 }
2968
2969 void
2970 vec4_visitor::visit(ir_barrier *)
2971 {
2972 unreachable("not reached");
2973 }
2974
2975 void
2976 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2977 dst_reg dst, src_reg offset,
2978 src_reg src0, src_reg src1)
2979 {
2980 unsigned mlen = 0;
2981
2982 /* Set the atomic operation offset. */
2983 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2984 mlen++;
2985
2986 /* Set the atomic operation arguments. */
2987 if (src0.file != BAD_FILE) {
2988 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2989 mlen++;
2990 }
2991
2992 if (src1.file != BAD_FILE) {
2993 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2994 mlen++;
2995 }
2996
2997 /* Emit the instruction. Note that this maps to the normal SIMD8
2998 * untyped atomic message on Ivy Bridge, but that's OK because
2999 * unused channels will be masked out.
3000 */
3001 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3002 brw_message_reg(0),
3003 src_reg(surf_index), src_reg(atomic_op));
3004 inst->mlen = mlen;
3005 }
3006
3007 void
3008 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3009 src_reg offset)
3010 {
3011 /* Set the surface read offset. */
3012 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3013
3014 /* Emit the instruction. Note that this maps to the normal SIMD8
3015 * untyped surface read message, but that's OK because unused
3016 * channels will be masked out.
3017 */
3018 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3019 brw_message_reg(0),
3020 src_reg(surf_index), src_reg(1));
3021 inst->mlen = 1;
3022 }
3023
3024 void
3025 vec4_visitor::emit_ndc_computation()
3026 {
3027 /* Get the position */
3028 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3029
3030 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3031 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3032 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3033
3034 current_annotation = "NDC";
3035 dst_reg ndc_w = ndc;
3036 ndc_w.writemask = WRITEMASK_W;
3037 src_reg pos_w = pos;
3038 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3039 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3040
3041 dst_reg ndc_xyz = ndc;
3042 ndc_xyz.writemask = WRITEMASK_XYZ;
3043
3044 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3045 }
3046
3047 void
3048 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3049 {
3050 if (devinfo->gen < 6 &&
3051 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3052 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3053 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3054 dst_reg header1_w = header1;
3055 header1_w.writemask = WRITEMASK_W;
3056
3057 emit(MOV(header1, 0u));
3058
3059 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3060 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3061
3062 current_annotation = "Point size";
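/* Pack the point size into the header's fixed-point Point Width field:
 * multiplying by 2^11 and keeping bits 18:8 leaves size * 8, i.e. a
 * value with 3 fractional bits, which appears to match the U8.3 encoding
 * the fixed-function units expect here.
 */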
3063 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3064 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3065 }
3066
3067 if (key->userclip_active) {
3068 current_annotation = "Clipping flags";
3069 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3070 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3071
3072 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3073 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3074 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3075
3076 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3077 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3078 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3079 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3080 }
3081
3082 /* i965 clipping workaround:
3083 * 1) Test for -ve rhw
3084 * 2) If set,
3085 * set ndc = (0,0,0,0)
3086 * set ucp[6] = 1
3087 *
3088 * Later, clipping will detect ucp[6] and ensure the primitive is
3089 * clipped against all fixed planes.
3090 */
3091 if (devinfo->has_negative_rhw_bug) {
3092 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3093 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3094 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3095 vec4_instruction *inst;
3096 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3097 inst->predicate = BRW_PREDICATE_NORMAL;
3098 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3099 inst->predicate = BRW_PREDICATE_NORMAL;
3100 }
3101
3102 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3103 } else if (devinfo->gen < 6) {
3104 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3105 } else {
3106 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3107 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3108 dst_reg reg_w = reg;
3109 reg_w.writemask = WRITEMASK_W;
3110 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3111 }
3112 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3113 dst_reg reg_y = reg;
3114 reg_y.writemask = WRITEMASK_Y;
3115 reg_y.type = BRW_REGISTER_TYPE_D;
3116 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3117 }
3118 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3119 dst_reg reg_z = reg;
3120 reg_z.writemask = WRITEMASK_Z;
3121 reg_z.type = BRW_REGISTER_TYPE_D;
3122 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3123 }
3124 }
3125 }
3126
3127 void
3128 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3129 {
3130 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3131 *
3132 * "If a linked set of shaders forming the vertex stage contains no
3133 * static write to gl_ClipVertex or gl_ClipDistance, but the
3134 * application has requested clipping against user clip planes through
3135 * the API, then the coordinate written to gl_Position is used for
3136 * comparison against the user clip planes."
3137 *
3138 * This function is only called if the shader didn't write to
3139 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3140 * if the user wrote to it; otherwise we use gl_Position.
3141 */
3142 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3143 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3144 clip_vertex = VARYING_SLOT_POS;
3145 }
3146
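/* Each clip distance is the dot product of the clip vertex (or position)
 * with the corresponding user clip plane; this call fills up to one vec4
 * worth of distances starting at plane 'offset'.
 */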
3147 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3148 ++i) {
3149 reg.writemask = 1 << i;
3150 emit(DP4(reg,
3151 src_reg(output_reg[clip_vertex]),
3152 src_reg(this->userplane[i + offset])));
3153 }
3154 }
3155
3156 vec4_instruction *
3157 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3158 {
3159 assert (varying < VARYING_SLOT_MAX);
3160 reg.type = output_reg[varying].type;
3161 current_annotation = output_reg_annotation[varying];
3162 /* Copy the register, saturating if necessary */
3163 return emit(MOV(reg, src_reg(output_reg[varying])));
3164 }
3165
3166 void
3167 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3168 {
3169 reg.type = BRW_REGISTER_TYPE_F;
3170
3171 switch (varying) {
3172 case VARYING_SLOT_PSIZ:
3173 {
3174 /* PSIZ is always in slot 0, and is coupled with other flags. */
3175 current_annotation = "indices, point width, clip flags";
3176 emit_psiz_and_flags(reg);
3177 break;
3178 }
3179 case BRW_VARYING_SLOT_NDC:
3180 current_annotation = "NDC";
3181 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3182 break;
3183 case VARYING_SLOT_POS:
3184 current_annotation = "gl_Position";
3185 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3186 break;
3187 case VARYING_SLOT_EDGE:
3188 /* This is present when doing unfilled polygons. We're supposed to copy
3189 * the edge flag from the user-provided vertex array
3190 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3191 * of that attribute (starts as 1.0f). This is then used in clipping to
3192 * determine which edges should be drawn as wireframe.
3193 */
3194 current_annotation = "edge flag";
3195 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3196 glsl_type::float_type, WRITEMASK_XYZW))));
3197 break;
3198 case BRW_VARYING_SLOT_PAD:
3199 /* No need to write to this slot */
3200 break;
3201 case VARYING_SLOT_COL0:
3202 case VARYING_SLOT_COL1:
3203 case VARYING_SLOT_BFC0:
3204 case VARYING_SLOT_BFC1: {
3205 /* These built-in varyings are only supported in compatibility mode,
3206 * and we only support GS in core profile. So, this must be a vertex
3207 * shader.
3208 */
3209 assert(stage == MESA_SHADER_VERTEX);
3210 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3211 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3212 inst->saturate = true;
3213 break;
3214 }
3215
3216 default:
3217 emit_generic_urb_slot(reg, varying);
3218 break;
3219 }
3220 }
3221
3222 static int
3223 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3224 {
3225 if (devinfo->gen >= 6) {
3226 /* URB data written (does not include the message header reg) must
3227 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3228 * section 5.4.3.2.2: URB_INTERLEAVED.
3229 *
3230 * URB entries are allocated on a multiple of 1024 bits, so an
3231 * extra 128 bits written here to make the end align to 256 is
3232 * no problem.
3233 */
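/* mlen here counts the message header register as well, so rounding the
 * total up to an odd value keeps the URB data portion an even number of
 * registers.
 */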
3234 if ((mlen % 2) != 1)
3235 mlen++;
3236 }
3237
3238 return mlen;
3239 }
3240
3241
3242 /**
3243 * Generates the VUE payload plus the necessary URB write instructions to
3244 * output it.
3245 *
3246 * The VUE layout is documented in Volume 2a.
3247 */
3248 void
3249 vec4_visitor::emit_vertex()
3250 {
3251 /* MRF 0 is reserved for the debugger, so start with message header
3252 * in MRF 1.
3253 */
3254 int base_mrf = 1;
3255 int mrf = base_mrf;
3256 /* In the process of generating our URB write message contents, we
3257 * may need to unspill a register or load from an array. Those
3258 * reads would use MRFs 14-15.
3259 */
3260 int max_usable_mrf = 13;
3261
3262 /* The following assertion verifies that max_usable_mrf causes an
3263 * even-numbered amount of URB write data, which will meet gen6's
3264 * requirements for length alignment.
3265 */
3266 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3267
3268 /* First mrf is the g0-based message header containing URB handles and
3269 * such.
3270 */
3271 emit_urb_write_header(mrf++);
3272
3273 if (devinfo->gen < 6) {
3274 emit_ndc_computation();
3275 }
3276
3277 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3278 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3279 current_annotation = "user clip distances";
3280
3281 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3282 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3283
3284 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3285 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3286 }
3287
3288 /* We may need to split this up into several URB writes, so do them in a
3289 * loop.
3290 */
3291 int slot = 0;
3292 bool complete = false;
3293 do {
3294 /* URB offset is in URB row increments, and each of our MRFs is half of
3295 * one of those, since we're doing interleaved writes.
3296 */
3297 int offset = slot / 2;
3298
3299 mrf = base_mrf + 1;
3300 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3301 emit_urb_slot(dst_reg(MRF, mrf++),
3302 prog_data->vue_map.slot_to_varying[slot]);
3303
3304 /* If this was max_usable_mrf, we can't fit anything more into this
3305 * URB WRITE.
3306 */
3307 if (mrf > max_usable_mrf) {
3308 slot++;
3309 break;
3310 }
3311 }
3312
3313 complete = slot >= prog_data->vue_map.num_slots;
3314 current_annotation = "URB write";
3315 vec4_instruction *inst = emit_urb_write_opcode(complete);
3316 inst->base_mrf = base_mrf;
3317 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3318 inst->offset += offset;
3319 } while(!complete);
3320 }
3321
3322
3323 src_reg
3324 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3325 src_reg *reladdr, int reg_offset)
3326 {
3327 /* Because we store the values to scratch interleaved like our
3328 * vertex data, we need to scale the vec4 index by 2.
3329 */
3330 int message_header_scale = 2;
3331
3332 /* Pre-gen6, the message header uses byte offsets instead of vec4
3333 * (16-byte) offset units.
3334 */
3335 if (devinfo->gen < 6)
3336 message_header_scale *= 16;
3337
3338 if (reladdr) {
3339 src_reg index = src_reg(this, glsl_type::int_type);
3340
3341 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3342 src_reg(reg_offset)));
3343 emit_before(block, inst, MUL(dst_reg(index), index,
3344 src_reg(message_header_scale)));
3345
3346 return index;
3347 } else {
3348 return src_reg(reg_offset * message_header_scale);
3349 }
3350 }
3351
3352 src_reg
3353 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3354 src_reg *reladdr, int reg_offset)
3355 {
3356 if (reladdr) {
3357 src_reg index = src_reg(this, glsl_type::int_type);
3358
3359 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3360 src_reg(reg_offset)));
3361
3362 /* Pre-gen6, the message header uses byte offsets instead of vec4
3363 * (16-byte) offset units.
3364 */
3365 if (devinfo->gen < 6) {
3366 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3367 }
3368
3369 return index;
3370 } else if (devinfo->gen >= 8) {
3371 /* Store the offset in a GRF so we can send-from-GRF. */
3372 src_reg offset = src_reg(this, glsl_type::int_type);
3373 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3374 return offset;
3375 } else {
3376 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3377 return src_reg(reg_offset * message_header_scale);
3378 }
3379 }
3380
3381 /**
3382 * Emits an instruction before @inst to load the value named by @orig_src
3383 * from scratch space at @base_offset to @temp.
3384 *
3385 * @base_offset is measured in 32-byte units (the size of a register).
3386 */
3387 void
3388 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3389 dst_reg temp, src_reg orig_src,
3390 int base_offset)
3391 {
3392 int reg_offset = base_offset + orig_src.reg_offset;
3393 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3394 reg_offset);
3395
3396 emit_before(block, inst, SCRATCH_READ(temp, index));
3397 }
3398
3399 /**
3400 * Emits an instruction after @inst to store the value to be written
3401 * to @orig_dst to scratch space at @base_offset, from @temp.
3402 *
3403 * @base_offset is measured in 32-byte units (the size of a register).
3404 */
3405 void
3406 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3407 int base_offset)
3408 {
3409 int reg_offset = base_offset + inst->dst.reg_offset;
3410 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3411 reg_offset);
3412
3413 /* Create a temporary register to store *inst's result in.
3414 *
3415 * We have to be careful in MOVing from our temporary result register in
3416 * the scratch write. If we swizzle from channels of the temporary that
3417 * weren't initialized, it will confuse live interval analysis, which will
3418 * make spilling fail to make progress.
3419 */
3420 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3421 inst->dst.type),
3422 brw_swizzle_for_mask(inst->dst.writemask));
3423 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3424 inst->dst.writemask));
3425 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3426 write->predicate = inst->predicate;
3427 write->ir = inst->ir;
3428 write->annotation = inst->annotation;
3429 inst->insert_after(block, write);
3430
3431 inst->dst.file = temp.file;
3432 inst->dst.reg = temp.reg;
3433 inst->dst.reg_offset = temp.reg_offset;
3434 inst->dst.reladdr = NULL;
3435 }
3436
3437 /**
3438 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3439 * adds the scratch read(s) before \p inst. The function also checks for
3440 * recursive reladdr scratch accesses, issuing the corresponding scratch
3441 * loads and rewriting reladdr references accordingly.
3442 *
3443 * \return \p src if it did not require a scratch load, otherwise, the
3444 * register holding the result of the scratch load that the caller should
3445 * use to rewrite src.
3446 */
3447 src_reg
3448 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3449 vec4_instruction *inst, src_reg src)
3450 {
3451 /* Resolve recursive reladdr scratch access by calling ourselves
3452 * with src.reladdr
3453 */
3454 if (src.reladdr)
3455 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3456 *src.reladdr);
3457
3458 /* Now handle scratch access on src */
3459 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3460 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3461 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3462 src.reg = temp.reg;
3463 src.reg_offset = temp.reg_offset;
3464 src.reladdr = NULL;
3465 }
3466
3467 return src;
3468 }
3469
3470 /**
3471 * We can't generally support array access in GRF space, because a
3472 * single instruction's destination can only span 2 contiguous
3473 * registers. So, we send all GRF arrays that get variable index
3474 * access to scratch space.
3475 */
3476 void
3477 vec4_visitor::move_grf_array_access_to_scratch()
3478 {
3479 int scratch_loc[this->alloc.count];
3480 memset(scratch_loc, -1, sizeof(scratch_loc));
3481
3482 /* First, calculate the set of virtual GRFs that need to be punted
3483 * to scratch due to having any array access on them, and where in
3484 * scratch.
3485 */
3486 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3487 if (inst->dst.file == GRF && inst->dst.reladdr) {
3488 if (scratch_loc[inst->dst.reg] == -1) {
3489 scratch_loc[inst->dst.reg] = c->last_scratch;
3490 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3491 }
3492
3493 for (src_reg *iter = inst->dst.reladdr;
3494 iter->reladdr;
3495 iter = iter->reladdr) {
3496 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3497 scratch_loc[iter->reg] = c->last_scratch;
3498 c->last_scratch += this->alloc.sizes[iter->reg];
3499 }
3500 }
3501 }
3502
3503 for (int i = 0 ; i < 3; i++) {
3504 for (src_reg *iter = &inst->src[i];
3505 iter->reladdr;
3506 iter = iter->reladdr) {
3507 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3508 scratch_loc[iter->reg] = c->last_scratch;
3509 c->last_scratch += this->alloc.sizes[iter->reg];
3510 }
3511 }
3512 }
3513 }
3514
3515 /* Now, for anything that will be accessed through scratch, rewrite
3516 * it to load/store. Note that this is a _safe list walk, because
3517 * we may generate a new scratch_write instruction after the one
3518 * we're processing.
3519 */
3520 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3521 /* Set up the annotation tracking for new generated instructions. */
3522 base_ir = inst->ir;
3523 current_annotation = inst->annotation;
3524
3525 /* First handle scratch access on the dst. Notice we have to handle
3526 * the case where the dst's reladdr also points to scratch space.
3527 */
3528 if (inst->dst.reladdr)
3529 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3530 *inst->dst.reladdr);
3531
3532 /* Now that we have handled any (possibly recursive) reladdr scratch
3533 * accesses for dst we can safely do the scratch write for dst itself
3534 */
3535 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3536 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3537
3538 /* Now handle scratch access on any src. In this case, since inst->src[i]
3539 * already is a src_reg, we can just call emit_resolve_reladdr with
3540 * inst->src[i] and it will take care of handling scratch loads for
3541 * both src and src.reladdr (recursively).
3542 */
3543 for (int i = 0 ; i < 3; i++) {
3544 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3545 inst->src[i]);
3546 }
3547 }
3548 }
3549
3550 /**
3551 * Emits an instruction before @inst to load the value named by @orig_src
3552 * from the pull constant buffer (surface) at @base_offset to @temp.
3553 */
3554 void
3555 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3556 dst_reg temp, src_reg orig_src,
3557 int base_offset)
3558 {
3559 int reg_offset = base_offset + orig_src.reg_offset;
3560 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3561 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3562 reg_offset);
3563
3564 emit_pull_constant_load_reg(temp,
3565 index,
3566 offset,
3567 block, inst);
3568 }
3569
3570 /**
3571 * Implements array access of uniforms by inserting a
3572 * PULL_CONSTANT_LOAD instruction.
3573 *
3574 * Unlike temporary GRF array access (where we don't support it due to
3575 * the difficulty of doing relative addressing on instruction
3576 * destinations), we could potentially do array access of uniforms
3577 * that were loaded in GRF space as push constants. In real-world
3578 * usage we've seen, though, the arrays being used are always larger
3579 * than we could load as push constants, so just always move all
3580 * uniform array access out to a pull constant buffer.
3581 */
3582 void
3583 vec4_visitor::move_uniform_array_access_to_pull_constants()
3584 {
3585 int pull_constant_loc[this->uniforms];
3586 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3587 bool nested_reladdr;
3588
3589 /* Walk through and find array access of uniforms. Put a copy of that
3590 * uniform in the pull constant buffer.
3591 *
3592 * Note that we don't move constant-indexed accesses to arrays. No
3593 * testing has been done of the performance impact of this choice.
3594 */
3595 do {
3596 nested_reladdr = false;
3597
3598 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3599 for (int i = 0 ; i < 3; i++) {
3600 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3601 continue;
3602
3603 int uniform = inst->src[i].reg;
3604
3605 if (inst->src[i].reladdr->reladdr)
3606 nested_reladdr = true; /* will need another pass */
3607
3608 /* If this array isn't already present in the pull constant buffer,
3609 * add it.
3610 */
3611 if (pull_constant_loc[uniform] == -1) {
3612 const gl_constant_value **values =
3613 &stage_prog_data->param[uniform * 4];
3614
3615 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3616
3617 assert(uniform < uniform_array_size);
3618 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3619 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3620 = values[j];
3621 }
3622 }
3623
3624 /* Set up the annotation tracking for new generated instructions. */
3625 base_ir = inst->ir;
3626 current_annotation = inst->annotation;
3627
3628 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3629
3630 emit_pull_constant_load(block, inst, temp, inst->src[i],
3631 pull_constant_loc[uniform]);
3632
3633 inst->src[i].file = temp.file;
3634 inst->src[i].reg = temp.reg;
3635 inst->src[i].reg_offset = temp.reg_offset;
3636 inst->src[i].reladdr = NULL;
3637 }
3638 }
3639 } while (nested_reladdr);
3640
3641 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3642 * no need to track them as larger-than-vec4 objects. This will be
3643 * relied on in cutting out unused uniform vectors from push
3644 * constants.
3645 */
3646 split_uniform_registers();
3647 }
3648
3649 void
3650 vec4_visitor::resolve_ud_negate(src_reg *reg)
3651 {
3652 if (reg->type != BRW_REGISTER_TYPE_UD ||
3653 !reg->negate)
3654 return;
3655
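/* Materialize the negated value with an explicit MOV into a fresh
 * register so the reference no longer carries a negate source modifier
 * on a UD operand.
 */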
3656 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3657 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3658 *reg = temp;
3659 }
3660
3661 /**
3662 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3663 *
3664 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3665 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3666 */
3667 void
3668 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3669 {
3670 assert(devinfo->gen <= 5);
3671
3672 if (!rvalue->type->is_boolean())
3673 return;
3674
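/* AND with 1 isolates the defined LSB; negating that integer result then
 * turns 1 into -1 (all bits set), giving the canonical ~0 "true" value.
 */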
3675 src_reg and_result = src_reg(this, rvalue->type);
3676 src_reg neg_result = src_reg(this, rvalue->type);
3677 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3678 emit(MOV(dst_reg(neg_result), negate(and_result)));
3679 *reg = neg_result;
3680 }
3681
3682 vec4_visitor::vec4_visitor(struct brw_context *brw,
3683 struct brw_vec4_compile *c,
3684 struct gl_program *prog,
3685 const struct brw_vue_prog_key *key,
3686 struct brw_vue_prog_data *prog_data,
3687 struct gl_shader_program *shader_prog,
3688 gl_shader_stage stage,
3689 void *mem_ctx,
3690 bool no_spills,
3691 shader_time_shader_type st_type)
3692 : backend_shader(brw, mem_ctx, shader_prog, prog, &prog_data->base, stage),
3693 c(c),
3694 key(key),
3695 prog_data(prog_data),
3696 sanity_param_count(0),
3697 fail_msg(NULL),
3698 first_non_payload_grf(0),
3699 need_all_constants_in_pull_buffer(false),
3700 no_spills(no_spills),
3701 st_type(st_type)
3702 {
3703 this->failed = false;
3704
3705 this->base_ir = NULL;
3706 this->current_annotation = NULL;
3707 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3708
3709 this->variable_ht = hash_table_ctor(0,
3710 hash_table_pointer_hash,
3711 hash_table_pointer_compare);
3712
3713 this->virtual_grf_start = NULL;
3714 this->virtual_grf_end = NULL;
3715 this->live_intervals = NULL;
3716
3717 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3718
3719 this->uniforms = 0;
3720
3721 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3722 * at least one. See setup_uniforms() in brw_vec4.cpp.
3723 */
3724 this->uniform_array_size = 1;
3725 if (prog_data) {
3726 this->uniform_array_size =
3727 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3728 }
3729
3730 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3731 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3732 }
3733
3734 vec4_visitor::~vec4_visitor()
3735 {
3736 hash_table_dtor(this->variable_ht);
3737 }
3738
3739
3740 void
3741 vec4_visitor::fail(const char *format, ...)
3742 {
3743 va_list va;
3744 char *msg;
3745
3746 if (failed)
3747 return;
3748
3749 failed = true;
3750
3751 va_start(va, format);
3752 msg = ralloc_vasprintf(mem_ctx, format, va);
3753 va_end(va);
3754 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3755
3756 this->fail_msg = msg;
3757
3758 if (debug_enabled) {
3759 fprintf(stderr, "%s", msg);
3760 }
3761 }
3762
3763 } /* namespace brw */